Compare commits
358 Commits
fj_test_20
...
1.7.0-0.7
| Author | SHA1 | Date | |
|---|---|---|---|
| 63ed4e7af0 | |||
| d7cf39883f | |||
| 40f8091fab | |||
| a20e1acf01 | |||
| b3d7bbda56 | |||
| 9a60997ea0 | |||
| 4b66373813 | |||
| b44b11ace7 | |||
| ebc91cea0e | |||
| 58106d791a | |||
| 56b51d4f97 | |||
| bafe540d86 | |||
| d78a0fd05d | |||
| 999bc91b4f | |||
| b3bd2ea9b3 | |||
| d3d9e2400d | |||
| 199407b2a1 | |||
| 5973d66e2d | |||
| d7ef74659b | |||
| ac86affecc | |||
| 2026cf8dad | |||
| 1d135492c3 | |||
| 1cfc5ca71f | |||
| 7ee533d620 | |||
| 28334c7a29 | |||
| 697e9386b3 | |||
| 0e787b731e | |||
| 612f364e6a | |||
| ceee4c379f | |||
| 36c981bc34 | |||
| fd941dad44 | |||
| 5f5b9f94d1 | |||
| 3f3c4acd71 | |||
| 00007dafaa | |||
| cbe2b2149d | |||
| 4cecde3fba | |||
| 8022a2a8c0 | |||
| 3328ce03d9 | |||
| 97b107f61c | |||
| 6f3be17c19 | |||
| dea7d00545 | |||
| 4512778569 | |||
| a7adb266ff | |||
| 2566f4f213 | |||
| ac0081eddd | |||
| d4056acfc3 | |||
| 1910543380 | |||
| 6332903f0d | |||
| 29d27b7c8d | |||
| 7136384384 | |||
| 2fe5c8de2e | |||
| e774e1b984 | |||
| 33b7414615 | |||
| 3c646e2485 | |||
| a5fcc91656 | |||
| d370e9241f | |||
| 3e254c06bf | |||
| 07537cd2e7 | |||
| a37f72da0e | |||
| ab11b168f0 | |||
| eac414d6d8 | |||
| bb725f5f50 | |||
| 5224551782 | |||
| 91146acfe5 | |||
| f64731ab34 | |||
| cd46cbd4b3 | |||
| 39780917af | |||
| 0f8f6d298e | |||
| f8e8b21f04 | |||
| 5c2f9b8239 | |||
| 1afc3d9b70 | |||
| 17a8f68d60 | |||
| 2b9a053504 | |||
| 6441aa1abb | |||
| 9b55b68934 | |||
| 83ef96a739 | |||
| b5337358cf | |||
| 2db3717e57 | |||
| 5395891966 | |||
| c32a5e261b | |||
| c0c80b71ca | |||
| d15a396d5a | |||
| e35ec09da1 | |||
| 5e44c9c9f9 | |||
| 0f6c36870c | |||
| 2ec2112cc5 | |||
| c86a38e18f | |||
| 6aa7b50e26 | |||
| c3c57940ba | |||
| 7aa2d64294 | |||
| 51fe77cdae | |||
| d5aafca1ae | |||
| 54b529c82d | |||
| 232bc9c44b | |||
| f34373d1c0 | |||
| 4698ae166c | |||
| db9ca358f9 | |||
| 16a6a1d08b | |||
| 2e2e973d78 | |||
| c3c0b7197f | |||
| d086100b35 | |||
| 8f74888f87 | |||
| 8e42c2a254 | |||
| caf0f5ef63 | |||
| 3d030391e8 | |||
| 0aeab6b840 | |||
| 367bbda713 | |||
| 0082447043 | |||
| 4f50c90f6e | |||
| 79950e045e | |||
| 6cf7cebb2d | |||
| c9f05f238d | |||
| f1caaa9b74 | |||
| 97cd379ee2 | |||
| 8ee1d61d0f | |||
| 04d17dd3e9 | |||
| 33eef71133 | |||
| c10b4a1c16 | |||
| 8cf70900e7 | |||
| b2618a98f5 | |||
| 01d06cb218 | |||
| c78803ac08 | |||
| 3300e65efc | |||
| d82ac31bc6 | |||
| 4946fbdd82 | |||
| 33cba1ad48 | |||
| 7c69cfaf67 | |||
| b3cbdeec84 | |||
| 1d1ec39a27 | |||
| 0a4e6b49b4 | |||
| bb7e140655 | |||
| 32b32f0c4a | |||
| bf7fd81c1b | |||
| 92d191de9e | |||
| baf68f7e71 | |||
| 26bebb2749 | |||
| 9e2196c9ce | |||
| 93581cb142 | |||
| 67f5a1d4e0 | |||
| edf7b36669 | |||
| 1a204b6674 | |||
| 305511b48f | |||
| 606db376fd | |||
| 5719b4c64a | |||
| 343121c3d0 | |||
| 86c45484e3 | |||
| 767792808a | |||
| 117f070fd6 | |||
| a27909be88 | |||
| cec6f24559 | |||
| b3b8283f87 | |||
| d62f80a7c0 | |||
| 6d584feaef | |||
| e2e015e120 | |||
| 5fb3abe87b | |||
| 37fd9e0cd2 | |||
| 7e748b4ecb | |||
| cafb46efc7 | |||
| 41ea9d16c4 | |||
| 4bbdee395e | |||
| 597baf8445 | |||
| 55faba77a5 | |||
| 6bef773741 | |||
| 7882110e9f | |||
| d1df17ffb7 | |||
| 72af689e69 | |||
| 153d0609de | |||
| 83bbb87a0f | |||
| f00d03445c | |||
| 911b07f507 | |||
| 5b26fe2956 | |||
| 1db00ebc04 | |||
| d5de68e97b | |||
| 1526237bc6 | |||
| b8d96a74ce | |||
| 3c256e1a6c | |||
| 7fc4272b89 | |||
| d052acab1d | |||
| 91ea69cf8f | |||
| 0c63a2a3cd | |||
| a8696d811d | |||
| 569dc33a9c | |||
| 4b252a990f | |||
| adb6cce3ce | |||
| ed21b6849d | |||
| 37605740a4 | |||
| e069694c12 | |||
| dca1cb2625 | |||
| caac060684 | |||
| d330721421 | |||
| 157eeca41a | |||
| 8ba725b225 | |||
| a563d780c1 | |||
| 621533bbd3 | |||
| 37ea770f8c | |||
| edd3ea0103 | |||
| 41d37bcd30 | |||
| 309145587f | |||
| bc06d68d84 | |||
| 18412616e1 | |||
| c371fbf13b | |||
| 1492f16d67 | |||
| fd38ab6fd0 | |||
| f115bae8a7 | |||
| ba80dd8650 | |||
| 06960a41d9 | |||
| 86a2aabb24 | |||
| b4101d9c36 | |||
| ec31d72483 | |||
| 83ade5cdcd | |||
| dec133c1dd | |||
| 04a528ab27 | |||
| 8e4073c2ca | |||
| ff982b8594 | |||
| 299d47abf5 | |||
| f2460695c4 | |||
| 6ce5c754f3 | |||
| e932f2e70c | |||
| bb08742467 | |||
| 3e9fdfc0f1 | |||
| 58f4593478 | |||
| de0e07f29e | |||
| a4b83dc6d4 | |||
| beac6c3e80 | |||
| 5d6715078f | |||
| 0615a0b00b | |||
| 51cd7cbb6c | |||
| 0c1cae45fe | |||
| 11ef2f8092 | |||
| 12aef0b578 | |||
| 9b3450ee7e | |||
| 0d3ef65092 | |||
| 258156b57e | |||
| 8efced7bf7 | |||
| 2dd8687974 | |||
| f0bc1a6b07 | |||
| c52370b959 | |||
| 9c78d4d249 | |||
| b6285c9aa9 | |||
| b945367c90 | |||
| 0f434288e1 | |||
| b5cd813229 | |||
| 7268942c35 | |||
| f8cad24a9a | |||
| 2b6b3f31e5 | |||
| ca19ee434a | |||
| bb2589bac4 | |||
| e1c6e17400 | |||
| 207eba93ea | |||
| 06af2d62c6 | |||
| 3e267e24cb | |||
| e58e1c6e33 | |||
| fb924ebb9d | |||
| ac61577414 | |||
| 4cee9b1a27 | |||
| b55e164669 | |||
| aa66fe2cb1 | |||
| 3b74b0a093 | |||
| 0267a0c8ea | |||
| b3b7801d51 | |||
| 10f1fe76db | |||
| 089b443aaf | |||
| e9955a4bba | |||
| dc52c8a11a | |||
| bc4629dfb0 | |||
| 99fba2df1c | |||
| 239c95449b | |||
| 9dfc139eae | |||
| bc81d362b4 | |||
| 90b6aec53d | |||
| 0887e0de6d | |||
| 2c5c47344d | |||
| b9f223ceca | |||
| 6297181dcd | |||
| 80f964e44f | |||
| cc07d6e017 | |||
| 07c517828d | |||
| 75e42badf4 | |||
| bdccbf7356 | |||
| ad3ee26d36 | |||
| 16f8ccb35b | |||
| 3fda54ece8 | |||
| 4d252c2bb2 | |||
| 0cf89c5682 | |||
| 0d902872a1 | |||
| 9b6a88eeeb | |||
| 96b4729cd5 | |||
| 3372bbfd23 | |||
| f17c30da07 | |||
| 9a0eb915fb | |||
| a5ded1fc06 | |||
| de042b2cb2 | |||
| 2cee82673b | |||
| dfb3bef96d | |||
| 2dc51530f3 | |||
| 13758417c5 | |||
| c32edff2bb | |||
| 8356ef6c96 | |||
| 63d500515a | |||
| 791e8c2114 | |||
| 0bb612caea | |||
| 5e992bc195 | |||
| 08f817a654 | |||
| b87ac8b8c0 | |||
| a48a2cd3e8 | |||
| 7c238c27c9 | |||
| de77d2b061 | |||
| 52f89cf8fa | |||
| c96dfb0c68 | |||
| 21c9e57646 | |||
| 312b6c171b | |||
| 2ce695b47b | |||
| e5c1fdf129 | |||
| 9e3dd53c58 | |||
| fe53c6e0a5 | |||
| e988bfaf50 | |||
| f6f48b1210 | |||
| 70b42fde5d | |||
| ccb36a5849 | |||
| ea7f517e3d | |||
| ac18a24a27 | |||
| 8880710fad | |||
| 03a85825ed | |||
| 940eeca6f5 | |||
| 19b02cf4ed | |||
| 76a0cc71fc | |||
| ab39798181 | |||
| 0cc3496747 | |||
| 10cca81401 | |||
| 0c79de67b4 | |||
| 3fbad79afb | |||
| 1b76aaa7e1 | |||
| aa3c5e91db | |||
| 20d5900c35 | |||
| 414cffd95b | |||
| 9ec0aeeab5 | |||
| 06e96005a6 | |||
| 4606714c07 | |||
| a5d5baf8a8 | |||
| 8074445d59 | |||
| 6a456f11aa | |||
| 81e665cb48 | |||
| e0b9c5deec | |||
| 62772c8a24 | |||
| 63d15f7dfc | |||
| fb3f1c58a8 | |||
| 69846345de | |||
| b8155cc618 | |||
| f07e20a381 | |||
| 764948b51f | |||
| 7da5fede8b | |||
| 6810506c3d | |||
| c82c2c1231 | |||
| 5bc54a3bbe | |||
| 07aa96ef95 | |||
| dac99f708c | |||
| f3c9fbf4ea | |||
| 54122360e8 |
9
.gitignore
vendored
9
.gitignore
vendored
@ -13,6 +13,10 @@ old_timestamp
|
||||
CMakeFiles
|
||||
CMakeCache.txt
|
||||
Makefile
|
||||
!test/*/*/Makefile
|
||||
!test/signalonfork+wait/Makefile
|
||||
!test/perf_overflow/Makefile
|
||||
!test/*/*/*.cmd
|
||||
Kbuild
|
||||
cmake_install.cmake
|
||||
config.h
|
||||
@ -33,3 +37,8 @@ executer/user/libmcexec.a
|
||||
executer/user/libldump2mcdump.so
|
||||
executer/user/eclair
|
||||
tools/mcstat/mcstat
|
||||
/_CPack_Packages
|
||||
/CPackSourceConfig.cmake
|
||||
CPackConfig.cmake
|
||||
/build
|
||||
mckernel-*.tar.gz
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +1,6 @@
|
||||
[submodule "ihk"]
|
||||
path = ihk
|
||||
url = https://github.com/RIKEN-SysSoft/ihk.git
|
||||
[submodule "executer/user/lib/libdwarf/libdwarf"]
|
||||
path = executer/user/lib/libdwarf/libdwarf
|
||||
url = https://github.com/bgerofi/libdwarf.git
|
||||
|
||||
182
CMakeLists.txt
182
CMakeLists.txt
@ -7,71 +7,157 @@ endif (NOT CMAKE_BUILD_TYPE)
|
||||
enable_language(C ASM)
|
||||
|
||||
project(mckernel C ASM)
|
||||
set(MCKERNEL_VERSION "1.6.0")
|
||||
set(MCKERNEL_VERSION "1.7.0")
|
||||
|
||||
# See "Fedora Packaging Guidlines -- Versioning"
|
||||
set(MCKERNEL_RELEASE "0.7")
|
||||
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
|
||||
# for rpmbuild
|
||||
if(DEFINED SYSCONF_INSTALL_DIR)
|
||||
set(CMAKE_INSTALL_SYSCONFDIR "${SYSCONF_INSTALL_DIR}")
|
||||
endif()
|
||||
include(GNUInstallDirs)
|
||||
include(CMakeParseArguments)
|
||||
include(Kbuild)
|
||||
include(Ksym)
|
||||
include(CheckCCompilerFlag)
|
||||
|
||||
set(CFLAGS_WARNINGS "-Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-unused-function")
|
||||
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
||||
if(IMPLICIT_FALLTHROUGH)
|
||||
set(CFLAGS_WARNINGS "${CFLAGS_WARNINGS} -Wno-implicit-fallthrough")
|
||||
endif(IMPLICIT_FALLTHROUGH)
|
||||
|
||||
# C flags need to be set before enabling language?
|
||||
set(CMAKE_C_FLAGS_DEBUG "-g ${CFLAGS_WARNINGS}" CACHE STRING "Debug compiler flags")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CFLAGS_WARNINGS}" CACHE STRING "Release compiler flags")
|
||||
|
||||
# build options
|
||||
option(ENABLE_WERROR "Enable -Werror" OFF)
|
||||
if (ENABLE_WERROR)
|
||||
add_compile_options("-Werror")
|
||||
endif(ENABLE_WERROR)
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(BUILD_TARGET "smp-x86" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
set(BUILD_TARGET "smp-arm64" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||
endif()
|
||||
|
||||
if (BUILD_TARGET STREQUAL "smp-x86")
|
||||
set(ARCH "x86_64")
|
||||
elseif (BUILD_TARGET STREQUAL "smp-arm64")
|
||||
set(ARCH "arm64")
|
||||
endif()
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakeParseArguments)
|
||||
include(Kbuild)
|
||||
include(CheckCCompilerFlag)
|
||||
include(AutoconfHelper)
|
||||
|
||||
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
||||
if(IMPLICIT_FALLTHROUGH)
|
||||
set(EXTRA_WARNINGS "-Wno-implicit-fallthrough")
|
||||
endif(IMPLICIT_FALLTHROUGH)
|
||||
|
||||
# build options
|
||||
set(CFLAGS_WARNING "-Wall" "-Wextra" "-Wno-unused-parameter" "-Wno-sign-compare" "-Wno-unused-function" ${EXTRA_WARNINGS} CACHE STRING "Warning flags")
|
||||
add_compile_options(${CFLAGS_WARNING})
|
||||
|
||||
option(ENABLE_WERROR "Enable -Werror" OFF)
|
||||
if (ENABLE_WERROR)
|
||||
add_compile_options("-Werror")
|
||||
endif(ENABLE_WERROR)
|
||||
|
||||
option(ENABLE_LINUX_WORK_IRQ_FOR_IKC "Use Linux work IRQ for IKC IPI" ON)
|
||||
if (ENABLE_LINUX_WORK_IRQ_FOR_IKC)
|
||||
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DIHK_IKC_USE_LINUX_WORK_IRQ")
|
||||
add_definitions(-DIHK_IKC_USE_LINUX_WORK_IRQ)
|
||||
endif()
|
||||
|
||||
if (BUILD_TARGET STREQUAL "smp-arm64")
|
||||
foreach(i RANGE 1 120)
|
||||
add_definitions(-DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i})
|
||||
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i}")
|
||||
endforeach()
|
||||
add_definitions(-DCONFIG_ARM64_64K_PAGES -DCONFIG_ARM64_VA_BITS=48)
|
||||
|
||||
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_64K_PAGES\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||
OUTPUT_VARIABLE CONFIG_ARM64_64K_PAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_VA_BITS\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||
OUTPUT_VARIABLE CONFIG_ARM64_VA_BITS OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
message("Host kernel CONFIG_ARM64_64K_PAGES=${CONFIG_ARM64_64K_PAGES}")
|
||||
message("Host kernel CONFIG_ARM64_VA_BITS=${CONFIG_ARM64_VA_BITS}")
|
||||
|
||||
if(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
if(CONFIG_ARM64_VA_BITS STREQUAL 42)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=2 -DCONFIG_ARM64_VA_BITS=42 -DCONFIG_ARM64_64K_PAGES)
|
||||
set(LINKER_SCRIPT "smp-arm64_type3.lds")
|
||||
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=48 -DCONFIG_ARM64_64K_PAGES)
|
||||
set(LINKER_SCRIPT "smp-arm64_type4.lds")
|
||||
endif()
|
||||
else(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
if(CONFIG_ARM64_VA_BITS STREQUAL 39)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=39)
|
||||
set(LINKER_SCRIPT "smp-arm64_type1.lds")
|
||||
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=4 -DCONFIG_ARM64_VA_BITS=48)
|
||||
set(LINKER_SCRIPT "smp-arm64_type2.lds")
|
||||
endif()
|
||||
endif(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
endif()
|
||||
set_property(CACHE BUILD_TARGET PROPERTY STRINGS smp-x86 smp-arm64)
|
||||
|
||||
# define MAP_KERNEL_START
|
||||
|
||||
set(tmpdir ${CMAKE_CURRENT_BINARY_DIR}/tmp.resolve_MODULES_END)
|
||||
file(REMOVE_RECURSE ${tmpdir})
|
||||
file(MAKE_DIRECTORY ${tmpdir})
|
||||
file(WRITE ${tmpdir}/driver.c "#include <linux/module.h>\n")
|
||||
file(APPEND ${tmpdir}/driver.c "unsigned long MAP_KERNEL_START = MODULES_END - (1UL << 23);\n")
|
||||
file(WRITE ${tmpdir}/Makefile "obj-m := driver.o\n")
|
||||
file(APPEND ${tmpdir}/Makefile "all:\n")
|
||||
file(APPEND ${tmpdir}/Makefile "\tmake ${KBUILD_MAKE_FLAGS_STR} -C ${KERNEL_DIR} M=${tmpdir} modules\n")
|
||||
|
||||
execute_process(COMMAND make -C ${tmpdir})
|
||||
execute_process(COMMAND bash -c "offset=`readelf -S ${tmpdir}/driver.ko | grep .data | sed 's/.* //g'`; echo $((0x$offset))"
|
||||
OUTPUT_VARIABLE MAP_KERNEL_START_OFFSET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(COMMAND bash -c "dd if=${tmpdir}/driver.ko bs=1 skip=${MAP_KERNEL_START_OFFSET} count=8 2>/dev/null | od -tx8 -Ax | head -1 | sed 's|.* |0x|g'"
|
||||
OUTPUT_VARIABLE MAP_KERNEL_START OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
|
||||
set(ENABLE_MEMDUMP ON)
|
||||
option(ENABLE_PERF "Enable perf support" ON)
|
||||
option(ENABLE_RUSAGE "Enable rusage support" ON)
|
||||
option(ENABLE_MCOVERLAYFS "Enable overlay filesystem" OFF)
|
||||
option(ENABLE_QLMPI "Enable qlmpi programs" OFF)
|
||||
option(ENABLE_UTI "Enable uti support" OFF)
|
||||
option(ENABLE_UBSAN "Enable undefined behaviour sanitizer on mckernel size" OFF)
|
||||
option(ENABLE_PER_CPU_ALLOC_CACHE "Enable per-CPU allocator cache (ThunderX2 workaround)" OFF)
|
||||
|
||||
find_package(PkgConfig REQUIRED)
|
||||
set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)
|
||||
|
||||
find_library(LIBRT rt)
|
||||
if (NOT LIBRT)
|
||||
message(FATAL_ERROR "error: couldn't find librt")
|
||||
endif()
|
||||
find_library(LIBNUMA numa)
|
||||
if (NOT LIBNUMA)
|
||||
message(FATAL_ERROR "error: couldn't find libnuma")
|
||||
endif()
|
||||
find_library(LIBBFD bfd)
|
||||
if (NOT LIBBFD)
|
||||
message(FATAL_ERROR "error: couldn't find libbfd")
|
||||
endif()
|
||||
find_library(LIBIBERTY iberty)
|
||||
if (NOT LIBIBERTY)
|
||||
message(FATAL_ERROR "error: couldn't find libiberty")
|
||||
endif()
|
||||
|
||||
find_library(LIBDWARF dwarf)
|
||||
|
||||
if (NOT LIBDWARF)
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
message(FATAL_ERROR "Could not find libdwarf.so, install libdwarf-devel to ${CMAKE_FIND_ROOT_PATH}")
|
||||
endif()
|
||||
message("WARNING: libdwarf will be compiled locally")
|
||||
enable_language(CXX)
|
||||
else()
|
||||
# Note that libdwarf-devel provides /usr/include/libdwarf/dwarf.h
|
||||
# but elfutils-devel provides /usr/include/dwarf.h
|
||||
# while mcinspect.c performs "#include <dwarf.h>"
|
||||
find_path(DWARF_H dwarf.h PATH_SUFFIXES libdwarf)
|
||||
endif()
|
||||
|
||||
if (ENABLE_QLMPI)
|
||||
find_package(MPI REQUIRED)
|
||||
endif()
|
||||
|
||||
if (ENABLE_UTI)
|
||||
find_library(LIBSYSCALL_INTERCEPT syscall_intercept)
|
||||
pkg_check_modules(LIBSYSCALL_INTERCEPT REQUIRED libsyscall_intercept)
|
||||
link_directories(${LIBSYSCALL_INTERCEPT_LIBRARY_DIRS})
|
||||
endif()
|
||||
|
||||
string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(-([0-9]+)(.*))?" "\\1;\\2;\\3;\\5;\\6" LINUX_VERSION ${UNAME_R})
|
||||
@ -81,29 +167,11 @@ list(GET LINUX_VERSION 2 LINUX_VERSION_PATCH)
|
||||
list(GET LINUX_VERSION 3 LINUX_VERSION_RELEASE)
|
||||
math(EXPR LINUX_VERSION_CODE "${LINUX_VERSION_MAJOR} * 65536 + ${LINUX_VERSION_MINOR} * 256 + ${LINUX_VERSION_PATCH}")
|
||||
|
||||
ksym(sys_mount PREFIX MCCTRL_)
|
||||
ksym(sys_umount PREFIX MCCTRL_)
|
||||
ksym(sys_unshare PREFIX MCCTRL_)
|
||||
ksym(zap_page_range PREFIX MCCTRL_)
|
||||
ksym(vdso_image_64 PREFIX MCCTRL_)
|
||||
ksym(vdso_start PREFIX MCCTRL_)
|
||||
ksym(vdso_end PREFIX MCCTRL_)
|
||||
ksym(vdso_pages PREFIX MCCTRL_)
|
||||
ksym(__vvar_page PREFIX MCCTRL_)
|
||||
ksym(hpet_address PREFIX MCCTRL_)
|
||||
# POSTK_DEBUG_ARCH_DEP_50, add:find kernel symbol.
|
||||
ksym(vdso_spec PREFIX MCCTRL_)
|
||||
ksym(hv_clock PREFIX MCCTRL_)
|
||||
ksym(sys_readlink PREFIX MCCTRL_)
|
||||
ksym(walk_page_range PREFIX MCCTRL_)
|
||||
|
||||
|
||||
# compat with various install paths
|
||||
set(MCKERNEL_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR})
|
||||
set(BINDIR ${CMAKE_INSTALL_FULL_BINDIR})
|
||||
set(SBINDIR ${CMAKE_INSTALL_FULL_SBINDIR})
|
||||
set(ETCDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR})
|
||||
set(ROOTFSDIR "${CMAKE_INSTALL_PREFIX}/rootfs")
|
||||
set(ETCDIR ${CMAKE_INSTALL_PREFIX}/etc)
|
||||
set(ROOTFSDIR "/rootfs")
|
||||
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
|
||||
set(KMODDIR "/lib/modules/${UNAME_R}/extra/mckernel")
|
||||
set(MCKERNELDIR "${CMAKE_INSTALL_FULL_DATADIR}/mckernel/${BUILD_TARGET}")
|
||||
@ -138,23 +206,23 @@ configure_file(config.h.in config.h)
|
||||
|
||||
# actual build section - just subdirs
|
||||
add_subdirectory(executer/kernel/mcctrl)
|
||||
if (ENABLE_MCOVERLAYFS)
|
||||
add_subdirectory(executer/kernel/mcoverlayfs)
|
||||
endif()
|
||||
add_subdirectory(executer/user)
|
||||
add_subdirectory(kernel)
|
||||
add_subdirectory(tools/mcstat)
|
||||
add_subdirectory(tools/crash)
|
||||
|
||||
configure_file(arch/x86_64/tools/mcreboot-smp-x86.sh.in mcreboot.sh @ONLY)
|
||||
configure_file(arch/x86_64/tools/mcstop+release-smp-x86.sh.in mcstop+release.sh @ONLY)
|
||||
configure_file(arch/x86_64/tools/mcreboot.1in mcreboot.1 @ONLY)
|
||||
configure_file(scripts/mcreboot-smp.sh.in mcreboot.sh @ONLY)
|
||||
configure_file(scripts/mcstop+release-smp.sh.in mcstop+release.sh @ONLY)
|
||||
configure_file(scripts/mcreboot.1in mcreboot.1 @ONLY)
|
||||
configure_file(scripts/eclair-dump-backtrace.exp.in eclair-dump-backtrace.exp @ONLY)
|
||||
install(PROGRAMS
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/mcreboot.sh"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/mcstop+release.sh"
|
||||
DESTINATION "${CMAKE_INSTALL_SBINDIR}")
|
||||
install(FILES
|
||||
"arch/x86_64/tools/irqbalance_mck.service"
|
||||
"arch/x86_64/tools/irqbalance_mck.in"
|
||||
install(PROGRAMS
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/eclair-dump-backtrace.exp"
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
||||
install(FILES "scripts/irqbalance_mck.in"
|
||||
DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}")
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
||||
DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
|
||||
@ -162,7 +230,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
||||
|
||||
configure_file(scripts/mckernel.spec.in scripts/mckernel.spec @ONLY)
|
||||
set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MCKERNEL_VERSION}")
|
||||
set(CPACK_SOURCE_IGNORE_FILES "/.git$")
|
||||
set(CPACK_SOURCE_IGNORE_FILES "/.git/;/build;/CMakeCache.txt$;/CMakeFiles$;/Makefile$")
|
||||
set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${IHK_FULL_SOURCE_DIR};/ihk;${CMAKE_BINARY_DIR}/scripts;/scripts")
|
||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||
include(CPack)
|
||||
@ -181,12 +249,14 @@ message("KERNEL_DIR: ${KERNEL_DIR}")
|
||||
message("SYSTEM_MAP: ${SYSTEM_MAP}")
|
||||
message("VMLINUX: ${VMLINUX}")
|
||||
message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
|
||||
message("MAP_KERNEL_START: ${MAP_KERNEL_START}")
|
||||
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
|
||||
message("ENABLE_PERF: ${ENABLE_PERF}")
|
||||
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
|
||||
message("ENABLE_MCOVERLAYFS: ${ENABLE_MCOVERLAYFS}")
|
||||
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
|
||||
message("ENABLE_UTI: ${ENABLE_UTI}")
|
||||
message("ENABLE_WERROR: ${ENABLE_WERROR}")
|
||||
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
|
||||
message("ENABLE_LINUX_WORK_IRQ_FOR_IKC: ${ENABLE_LINUX_WORK_IRQ_FOR_IKC}")
|
||||
message("ENABLE_PER_CPU_ALLOC_CACHE: ${ENABLE_PER_CPU_ALLOC_CACHE}")
|
||||
message("-------------------------------")
|
||||
|
||||
70
KNOWN_BUGS.md
Normal file
70
KNOWN_BUGS.md
Normal file
@ -0,0 +1,70 @@
|
||||
Linux crash when offlining CPU (el7, hardware-specific)
|
||||
=========================================================
|
||||
|
||||
On some hardware with el7 kernel, linux can crash due to a bug in the
|
||||
irq handling when offlining CPUs (reserve cpu part of mcreboot)
|
||||
|
||||
Example stack trace:
|
||||
```
|
||||
[ 4147.052753] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
|
||||
[ 4147.060677] IP: [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.068226] PGD 1057e44067 PUD 105f1e7067 PMD 0
|
||||
[ 4147.072935] Oops: 0000 [#1] SMP
|
||||
[ 4147.076230] Modules linked in: mcctrl(OE) ihk_smp_x86_64(OE) ihk(OE) xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core
|
||||
[ 4147.148619] dm_mirror dm_region_hash dm_log dm_mod sb_edac edac_core intel_powerclamp coretemp ext4 mbcache jbd2 intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul ipmi_ssif glue_helper ablk_helper joydev iTCO_wdt iTCO_vendor_support cryptd ipmi_si ipmi_devintf ipmi_msghandler pcspkr wmi mei_me mei lpc_ich i2c_i801 sg ioatdma shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm isci igb drm mlx4_core libsas ahci libahci scsi_transport_sas libata crct10dif_pclmul ptp crct10dif_common pps_core crc32c_intel dca i2c_algo_bit i2c_core devlink [last unloaded: ihk]
|
||||
[ 4147.215370] CPU: 6 PID: 38 Comm: migration/6 Tainted: G OE ------------ T 3.10.0-693.2.2.el7.x86_64 #1
|
||||
[ 4147.225672] Hardware name: SGI.COM C1104G-RP5/X9DRG-HF, BIOS 3.0 10/25/2013
|
||||
[ 4147.232747] task: ffff880174689fa0 ti: ffff8801746ac000 task.ti: ffff8801746ac000
|
||||
[ 4147.240278] RIP: 0010:[<ffffffff8102ce26>] [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.250275] RSP: 0018:ffff8801746afd30 EFLAGS: 00010046
|
||||
[ 4147.255608] RAX: 0000000000000000 RBX: 000000000000004e RCX: 0000000000000000
|
||||
[ 4147.262770] RDX: 0000000000000020 RSI: 000000000000005f RDI: 0000000000000023
|
||||
[ 4147.269936] RBP: ffff8801746afd58 R08: 0000000000000001 R09: ffff88017f800490
|
||||
[ 4147.277103] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000006
|
||||
[ 4147.284269] R13: 0000000000000000 R14: ffff88085ca82500 R15: 000000000000005f
|
||||
[ 4147.291429] FS: 0000000000000000(0000) GS:ffff88085fb80000(0000) knlGS:0000000000000000
|
||||
[ 4147.299556] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
||||
[ 4147.305326] CR2: 0000000000000040 CR3: 0000001059704000 CR4: 00000000001407e0
|
||||
[ 4147.312490] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
|
||||
[ 4147.319659] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
|
||||
[ 4147.326827] Stack:
|
||||
[ 4147.328857] ffff8808f43078c8 ffff8808f4307850 0000000000000286 ffff8808f4307701
|
||||
[ 4147.336384] 0000000000000000 ffff8801746afd70 ffffffff81052a82 0000000200000000
|
||||
[ 4147.343915] ffff8801746afd88 ffffffff81693ca3 0000000000000003 ffff8801746afdc0
|
||||
[ 4147.351447] Call Trace:
|
||||
[ 4147.353921] [<ffffffff81052a82>] native_cpu_disable+0x12/0x40
|
||||
[ 4147.359795] [<ffffffff81693ca3>] take_cpu_down+0x13/0x40
|
||||
[ 4147.365236] [<ffffffff81116899>] multi_cpu_stop+0xd9/0x100
|
||||
[ 4147.370850] [<ffffffff811167c0>] ? cpu_stop_should_run+0x50/0x50
|
||||
[ 4147.376983] [<ffffffff81116ab7>] cpu_stopper_thread+0x97/0x150
|
||||
[ 4147.382942] [<ffffffff816a8fad>] ? __schedule+0x39d/0x8b0
|
||||
[ 4147.388461] [<ffffffff810b909f>] smpboot_thread_fn+0x12f/0x180
|
||||
[ 4147.394406] [<ffffffff810b8f70>] ? lg_double_unlock+0x40/0x40
|
||||
[ 4147.400276] [<ffffffff810b098f>] kthread+0xcf/0xe0
|
||||
[ 4147.405182] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||
[ 4147.411319] [<ffffffff816b4f58>] ret_from_fork+0x58/0x90
|
||||
[ 4147.418893] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||
[ 4147.426524] Code: 81 fb 00 01 00 00 0f 84 8a 00 00 00 89 d8 65 44 8b 3c 85 20 c6 00 00 45 85 ff 78 e1 44 89 ff e8 91 31 10 00 48 63 15 7e 10 af 00 <48> 8b 70 40 48 c7 c7 80 71 cf 81 49 89 c6 48 83 c2 3f 48 c1 fa
|
||||
[ 4147.450352] RIP [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.460135] RSP <ffff8801746afd30>
|
||||
[ 4147.465154] CR2: 0000000000000040
|
||||
```
|
||||
|
||||
This bug has been fixed upstream, but redhat will not backport the fixes.
|
||||
You can work around the problem with a kpatch by backporting the three
|
||||
following commits:
|
||||
|
||||
x86: irq: Get correct available vectors for cpu disable
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ac2a55395eddccd6e3e39532df9869d61e97b2ee
|
||||
|
||||
x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable()
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1
|
||||
|
||||
x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable()
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cbb24dc761d95fe39a7a122bb1b298e9604cae15
|
||||
|
||||
|
||||
Alternatively, since it is related to the irq configuration, it might
|
||||
be possible to mitigate the issue by setting the irq affinities early
|
||||
on and making sure none of the cpus that will be offlined have any irq
|
||||
configured.
|
||||
540
NEWS.md
Normal file
540
NEWS.md
Normal file
@ -0,0 +1,540 @@
|
||||
=============================================
|
||||
What's new in version 1.7.0rc4 (Apr 15, 2020)
|
||||
=============================================
|
||||
|
||||
----------------------
|
||||
McKernel major updates
|
||||
----------------------
|
||||
1. arm64: Contiguous PTE support
|
||||
2. arm64: Scalable Vector Extension (SVE) support
|
||||
3. arm64: PMU overflow interrupt support
|
||||
4. xpmem: Support large page attachment
|
||||
5. arm64 port: Direct access to Mckernel memory from Linux
|
||||
6. arm64 port: utility thread offloading, which spawns thread onto
|
||||
Linux CPU
|
||||
7. eclair: support for live debug
|
||||
8. Crash utility extension
|
||||
9. Replace mcoverlayfs with a soft userspace overlay
|
||||
10. Build system is switched to cmake
|
||||
11. Core dump includes thread information
|
||||
|
||||
------------------------
|
||||
McKernel major bug fixes
|
||||
------------------------
|
||||
1. shmobj: Fix rusage counting for large page
|
||||
2. mcctrl control: task start_time changed to u64 nsec
|
||||
3. mcctrl: add handling for one more level of page tables
|
||||
4. Add kernel argument to turn on/off time sharing
|
||||
5. flatten_string/process env: realign env and clear trailing bits
|
||||
6. madvise: Add MADV_HUGEPAGE support
|
||||
8. mcctrl: remove in-kernel calls to syscalls
|
||||
9. arch_cpu_read_write_register: error return fix.
|
||||
10. set_cputime(): interrupt enable/disable fix.
|
||||
11. set_mempolicy(): Add mode check.
|
||||
12. mbind(): Fix memory_range_lock deadlock.
|
||||
13. ihk_ikc_recv: Record channel to packet for release
|
||||
14. Add set_cputime() kernel to kernel case and mode enum.
|
||||
15. execve: Call preempt_enable() before error-exit
|
||||
16. memory/x86_64: fix linux safe_kernel_map
|
||||
17. do_kill(): fix pids table when nr of threads is larger than num_processors
|
||||
18. shmget: Use transparent huge pages when page size isn't specified
|
||||
19. prctl: Add support for PR_SET_THP_DISABLE and PR_GET_THP_DISABLE
|
||||
20. monitor_init: fix undetected hang on highest numbered core
|
||||
21. init_process_stack: change premapped stack size based on arch
|
||||
22. x86 syscalls: add a bunch of XXat() delegated syscalls
|
||||
23. do_pageout: fix direct kernel-user access
|
||||
24. stack: add hwcap auxval
|
||||
25. perf counters: add arch-specific perf counters
|
||||
26. Added check of nohost to terminate_host().
|
||||
27. kmalloc: Fix address order in free list
|
||||
28. sysfs: use nr_cpu_ids for cpumasks (fixes libnuma parsing error on ARM)
|
||||
29. monitor_init: Use ihk_mc_cpu_info()
|
||||
30. Fix ThunderX2 write-combined PTE flag insanity
|
||||
31. ARM: eliminate zero page mapping (i.e, init_low_area())
|
||||
32. eliminate futex_cmpxchg_enabled check (not used and dereffed a NULL pointer)
|
||||
33. page_table: Fix return value of lookup_pte when ptl4 is blank
|
||||
34. sysfs: add missing symlinks for cpu/node
|
||||
35. Make Linux handler run when mmap to procfs.
|
||||
36. Separate mmap area from program loading (relocation) area
|
||||
37. move rusage into kernel ELF image (avoid dynamic alloc before NUMA init)
|
||||
38. arm: turn off cpu on panic
|
||||
39. page fault handler: protect thread accesses
|
||||
40. Register PPD and release_handler at the same time.
|
||||
41. fix to missing exclusive processing between terminate() and
|
||||
finalize_process().
|
||||
42. perfctr_stop: add flags to no 'disable_intens'
|
||||
43. fileobj, shmobj: free pages in object destructor (as opposed to page_unmap())
|
||||
44. clear_range_l1, clear_range_middle: Fix handling contiguous PTE
|
||||
45. do_mmap: don't pre-populate the whole file when asked for smaller segment
|
||||
46. invalidate_one_page: Support shmobj and contiguous PTE
|
||||
47. ubsan: fix undefined shifts
|
||||
48. x86: disable zero mapping and add a boot pt for ap trampoline
|
||||
49. rusage: Don't count PF_PATCH change
|
||||
50. Fixed time processing.
|
||||
51. copy_user_pte: vmap area not owned by McKernel
|
||||
52. gencore: Zero-clear ELF header and memory range table
|
||||
53. rpm: ignore CMakeCache.txt in dist and relax BuildRequires on cross build
|
||||
54. gencore: Allocate ELF header to heap instead of stack
|
||||
55. nanosleep: add cpu_pause() in spinwait loop
|
||||
56. init_process: add missing initializations to proc struct
|
||||
57. rus_vm_fault: always use a packet on the stack
|
||||
58. process stack: use PAGE_SIZE in aux vector
|
||||
59. copy_user_pte: base memobj copy on range & VR_PRIVATE
|
||||
60. arm64: ptrace: Fix overwriting 1st argument with return value
|
||||
61. page fault: use cow for private device mappings
|
||||
62. reproductible builds: remove most install paths in c code
|
||||
63. page fault: clear writable bit for non-dirtying access to shared ranges
|
||||
64. mcreboot/mcstop+release: support for regular user execution
|
||||
65. irqbalance_mck: replace extra service with service drop-in
|
||||
66. do_mmap: give addr argument a chance even if not MAP_FIXED
|
||||
67. x86: fix xchg() and cmpxchg() macros
|
||||
68. IHK: support for using Linux work IRQ as IKC interrupt (optional)
|
||||
69. MCS: fix ARM64 issue by using smp_XXX() functions (i.e., barrier()s)
|
||||
70. procfs: add number of threads to stat and status
|
||||
71. memory_range_lock: Fix deadlock in procfs/sysfs handler
|
||||
72. flush instruction cache at context switch time if necessary
|
||||
73. arm64: Fix PMU related functions
|
||||
74. page_fault_process_memory_range: Disable COW for VM region with zeroobj
|
||||
75. extend_process_region: Fall back to demand paging when not contiguous
|
||||
76. munmap: fix deadlock with remote pagefault on vm range lock
|
||||
77. procfs: if memory_range_lock fails, process later
|
||||
78. migrate-cpu: Prevent migration target from calling schedule() twice
|
||||
79. sched_request_migrate(): fix race condition between migration req and IRQs
|
||||
80. get_one_cpu_topology: Renumber core_id (physical core id)
|
||||
81. bb7e140 procfs cpuinfo: use sequence number as processor
|
||||
82. set_host_vma(): do NOT read protect Linux VMA
|
||||
|
||||
===========================================
|
||||
What's new in V1.6.0 (Nov 11, 2018)
|
||||
===========================================
|
||||
|
||||
-----------------------------------------------
|
||||
McKernel new features, improvements and changes
|
||||
-----------------------------------------------
|
||||
1. McKernel and Linux share one unified kernel virtual address space.
|
||||
That is, McKernel sections resides in Linux sections spared for
|
||||
modules. In this way, Linux can access the McKernel kernel memory
|
||||
area.
|
||||
2. hugetlbfs support
|
||||
3. IHK is now included as a git submodule
|
||||
4. Debug messages are turned on/off in per souce file basis at run-time.
|
||||
5. It's prohibited for McKernel to access physical memory ranges which
|
||||
Linux didn't give to McKernel.
|
||||
6. UTI (capability to spawn a thread on Linux CPU) improvement:
|
||||
* System calls issued from the thread are hooked by modifying
|
||||
binary in memory.
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
#<num> below corresponds to the redmine issue number
|
||||
(https://postpeta.pccluster.org/redmine/).
|
||||
|
||||
1. #926: shmget: Hide object with IPC_RMID from shmget
|
||||
2. #1028: init_process: Inherit parent cpu_set
|
||||
3. #995: Fix shebang recorded in argv[0]
|
||||
4. #1024: Fix VMAP virtual address leak
|
||||
5. #1109: init_process_stack: Support "ulimit -s unlimited"
|
||||
6. x86 mem init: do not map identity mapping
|
||||
7. mcexec_wait_syscall: requeue potential request on interrupted wait
|
||||
8. mcctrl_ikc_send_wait: fix interrupt with do_frees == NULL
|
||||
9. pager_req_read: handle short read
|
||||
10. kprintf: only call eventfd() if it is safe to interrupt
|
||||
11. process_procfs_request: Add Pid to /proc/<PID>/status
|
||||
12. terminate: fix oversubscribe hang when waiting for other threads on same CPU to die
|
||||
13. mcexec: Do not close fd returned to mckernel side
|
||||
14. #976: execve: Clear sigaltstack and fp_regs
|
||||
15. #1002: perf_event: Specify counter by bit_mask on start/stop
|
||||
16. #1027: schedule: Don't reschedule immediately when wake up on migrate
|
||||
17. #mcctrl: lookup unexported symbols at runtime
|
||||
18. __sched_wakeup_thread: Notify interrupt_exit() of re-schedule
|
||||
19. futex_wait_queue_me: Spin-sleep when timeout and idle_halt is specified
|
||||
20. #1167: ihk_os_getperfevent,setperfevent: Timeout IKC sent by mcctrl
|
||||
21. devobj: fix object size (POSTK_DEBUG_TEMP_FIX_36)
|
||||
22. mcctrl: remove rus page cache
|
||||
23. #1021: procfs: Support multiple reads of e.g. /proc/*/maps
|
||||
24. #1006: wait: Delay wake-up parent within switch context
|
||||
25. #1164: mem: Check if phys-mem is within the range of McKernel memory
|
||||
26. #1039: page_fault_process_memory_range: Remove ihk_mc_map_virtual for CoW of device map
|
||||
27. partitioned execution: pass process rank to LWK
|
||||
28. process/vm: implement access_ok()
|
||||
29. spinlock: rewrite spinlock to use Linux ticket head/tail format
|
||||
30. #986: Fix deadlock involving mmap_sem and memory_range_lock
|
||||
31. Prevent one CPU from getting chosen by concurrent forks
|
||||
32. #1009: check_signal: system call restart is done only once
|
||||
33. #1176: syscall: the signal received during system call processing is not processed.
|
||||
34. #1036 syscall_time: Handle by McKernel
|
||||
35. #1165 do_syscall: Delegate system calls to the mcexec with the same pid
|
||||
36. #1194 execve: Fix calling ptrace_report_signal after preemption is disabled
|
||||
37. #1005 coredump: Exclude special areas
|
||||
38. #1018 procfs: Fix pread/pwrite to procfs fail when specified size is bigger than 4MB
|
||||
39. #1180 sched_setaffinity: Check migration after decrementing in_interrupt
|
||||
40. #771, #1179, #1143 ptrace supports threads
|
||||
41. #1189 procfs/do_fork: wait until procfs entries are registered
|
||||
42. #1114 procfs: add '/proc/pid/stat' to mckernel side and fix its comm
|
||||
43. #1116 mcctrl procfs: check entry was returned before using it
|
||||
44. #1167 ihk_os_getperfevent,setperfevent: Return -ETIME when IKC timeouts
|
||||
45. mcexec/execve: fix shebangs handling
|
||||
46. procfs: handle 'comm' on mckernel side
|
||||
47. ihk_os_setperfevent: Return number of registered events
|
||||
48. mcexec: fix terminating zero after readlink()
|
||||
|
||||
===========================================
|
||||
What's new in V1.5.1 (July 9, 2018)
|
||||
===========================================
|
||||
|
||||
-----------------------------------------------
|
||||
McKernel new features, improvements and changes
|
||||
-----------------------------------------------
|
||||
1. Watchdog timer to detect hang of McKernel
|
||||
mcexec prints out the following line to its stderr when a hang of
|
||||
McKernel is detected.
|
||||
|
||||
mcexec detected hang of McKernel
|
||||
|
||||
The watchdog timer is enabled by passing -i <timeout_in_sec> option
|
||||
to mcreboot.sh. <timeout_in_sec> specifies the interval of checking
|
||||
if McKernel is alive.
|
||||
Example: mcreboot.sh -i 600: Detect the hang with 10 minutes interval
|
||||
|
||||
The detailed step of the hang detection is as follows.
|
||||
(1) mcexec acquires eventfd for notification from IHK and perform
|
||||
epoll() on it.
|
||||
(2) A daemon called ihkmond monitors the state of McKernel periodically
|
||||
with the interval specified by the -i option. It judges that
|
||||
McKernel is hanging and notifies mcexec by the eventfd if its
|
||||
state hasn't changed since the last check.
|
||||
|
||||
2. Documentation
|
||||
man page: Installed directory is changed to <install_dir>/share/man
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
1. #1146: pager_req_map(): do not take mmap_sem if not needed
|
||||
2. #1135: prepare_process_ranges_args_envs(): fix saving cmdline
|
||||
3. #1144: fileobj/devobj: record path name
|
||||
4. #1145: fileobj: use MCS locks for per-file page hash
|
||||
5. #1076: mcctrl: refactor prepare_image into new generic ikc send&wait
|
||||
6. #1072: execve: fix execve with oversubscribing
|
||||
7. #1132: execve: use thread variable instead of cpu_local_var(current)
|
||||
8. #1117: mprotect: do not set page table writable for cow pages
|
||||
9. #1143: syscall wait4: add _WALL (POSTK_DEBUG_ARCH_DEP_44)
|
||||
10. #1064: rusage: Fix initialization of rusage->num_processors
|
||||
11. #1133: pager_req_unmap: Put per-process data at exit
|
||||
12. #731: do_fork: Propagate error code returned by mcexec
|
||||
13. #1149: execve: Reinitialize vm_regions's map area on execve
|
||||
14. #1065: procfs: Show file names in /proc/<PID>/maps
|
||||
15. #1112: mremap: Fix type of size arguments (from ssize_t to size_t)
|
||||
16. #1121: sched_getaffinity: Check arguments in the same order as in Linux
|
||||
17. #1137: mmap, mremap: Check arguments in the same order as in Linux
|
||||
18. #1122: fix return value of sched_getaffinity
|
||||
19. #732: fix: /proc/<PID>/maps outputs a unnecessary NULL character
|
||||
|
||||
===================================
|
||||
What's new in V1.5.0 (Apr 5, 2018)
|
||||
===================================
|
||||
|
||||
--------------------------------------
|
||||
McKernel new features and improvements
|
||||
--------------------------------------
|
||||
1. Aid for Linux version migration: Detect /proc, /sys format change
|
||||
between two kernel verions
|
||||
2. Swap out
|
||||
* Only swap-out anonymous pages for now
|
||||
3. Improve support of /proc/maps
|
||||
4. mcstat: Linux tool to show resource usage
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
1. #727: execve: Fix memory leak when receiving SIGKILL
|
||||
2. #829: perf_event_open: Support PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
|
||||
3. #906: mcexec: Check return code of fork()
|
||||
4. #1038: mcexec: Timeout when incorrect value is given to -n option
|
||||
5. #943 #945 #946 #960 $961: mcexec: Support strace
|
||||
6. #1029: struct thread is not released with stress-test involving signal
|
||||
and futex
|
||||
7. #863 #870: Respond immediately to terminating signal when
|
||||
offloading system call
|
||||
8. #1119: translate_rva_to_rpa(): use 2MB blocks in 1GB pages on x86
|
||||
11. #898: Shutdown OS only after no in-flight IKC exist
|
||||
12. #882: release_handler: Destroy objects as the process which opened it
|
||||
13. #882: mcexec: Make child process exit if the parent is killed during
|
||||
fork()
|
||||
14. #925: XPMEM: Don't destroy per-process object of the parent
|
||||
15. #885: ptrace: Support the case where a process attaches its child
|
||||
16. #1031: sigaction: Support SA_RESETHAND
|
||||
17. #923: rus_vm_fault: Return error when a thread not performing
|
||||
system call offloading causes remote page fault
|
||||
18. #1032 #1033 #1034: getrusage: Fix ru_maxrss, RUSAGE_CHILDREN,
|
||||
ru_stime related bugs
|
||||
19. #1120: getrusage: Fix deadlock on thread->times_update
|
||||
20. #1123: Fix deadlock related to wait_queue_head_list_node
|
||||
21. #1124: Fix deadlock of calling terminate() from terminate()
|
||||
22. #1125: Fix deadlock related to thread status
|
||||
* Related functions are: hold_thread(), do_kill() and terminate()
|
||||
23. #1126: uti: Fix uti thread on the McKernel side blocks others in do_syscall()
|
||||
24. #1066: procfs: Show Linux /proc/self/cgroup
|
||||
25. #1127: prepare_process_ranges_args_envs(): fix generating saved_cmdline to
|
||||
avoid PF in strlen()
|
||||
26. #1128: ihk_mc_map/unmap_virtual(): do proper TLB invalidation
|
||||
27. #1043: terminate(): fix update_lock and threads_lock order to avoid deadlock
|
||||
28. #1129: mcreboot.sh: Save /proc/irq/*/smp_affinity to /tmp/mcreboot
|
||||
29. #1130: mcexec: drop READ_IMPLIES_EXEC from personality
|
||||
|
||||
--------------------
|
||||
McKernel workarounds
|
||||
--------------------
|
||||
1. Forbid CPU oversubscription
|
||||
* It can be turned on by mcreboot.sh -O option
|
||||
|
||||
|
||||
===================================
|
||||
What's new in V1.4.0 (Oct 30, 2017)
|
||||
===================================
|
||||
|
||||
-----------------------------------------------------------
|
||||
Feature: Abstracted event type support in perf_event_open()
|
||||
-----------------------------------------------------------
|
||||
PERF_TYPE_HARDWARE and PERF_TYPE_CACHE types are supported.
|
||||
|
||||
----------------------------------
|
||||
Clean-up: Direct user-space access
|
||||
----------------------------------
|
||||
Code lines using direct user-space access (e.g. passing user-space
|
||||
pointer to memcpy()) becomes more portable across processor
|
||||
architectures. The modification follows the following rules.
|
||||
|
||||
1. Move the code section as it is to the architecture dependent
|
||||
directory if it is a part of the critical-path.
|
||||
2. Otherwise, rewrite the code section by using the portable methods.
|
||||
The methods include copy_from_user(), copy_to_user(),
|
||||
pte_get_phys() and phys_to_virt().
|
||||
|
||||
--------------------------------
|
||||
Test: MPI and OpenMP micro-bench
|
||||
--------------------------------
|
||||
The performance figures of MPI and OpenMP primitives are compared with
|
||||
those of Linux by using Intel MPI Benchmarks and EPCC OpenMP Micro
|
||||
Benchmark.
|
||||
|
||||
|
||||
===================================
|
||||
What's new in V1.3.0 (Sep 30, 2017)
|
||||
===================================
|
||||
|
||||
--------------------
|
||||
Feature: Kernel dump
|
||||
--------------------
|
||||
1. A dump level of "only kernel memory" is added.
|
||||
|
||||
The following two levels are available now:
|
||||
0: Dump all
|
||||
24: Dump only kernel memory
|
||||
|
||||
The dump level can be set by -d option in ihkosctl or the argument
|
||||
for ihk_os_makedumpfile(), as shown in the following examples:
|
||||
|
||||
Command: ihkosctl 0 dump -d 24
|
||||
Function call: ihk_os_makedumpfile(0, NULL, 24, 0);
|
||||
|
||||
2. Dump file is created when Linux panics.
|
||||
|
||||
The dump level can be set by dump_level kernel argument, as shown in the
|
||||
following example:
|
||||
|
||||
ihkosctl 0 kargs "hidos dump_level=24"
|
||||
|
||||
The IHK dump function is registered to panic_notifier_list when creating
|
||||
/dev/mcdX and called when Linux panics.
|
||||
|
||||
-----------------------------
|
||||
Feature: Quick Process Launch
|
||||
-----------------------------
|
||||
|
||||
MPI process launch time and some of the initialization time can be
|
||||
reduced in application consisting of multiple MPI programs which are
|
||||
launched in turn in the job script.
|
||||
|
||||
The following two steps should be performed to use this feature:
|
||||
1. Replace mpiexec with ql_mpiexec_start and add some lines for
|
||||
ql_mpiexec_finalize in the job script
|
||||
2. Modify the app so that it can repeat calculations and wait for the
|
||||
instructions from ql_mpiexec_{start,finalize} at the end of the
|
||||
loop
|
||||
|
||||
The first step is explained using an example. Assume the original job
|
||||
script looks like this:
|
||||
|
||||
/* Execute ensamble simulation and then data assimilation, and repeat this
|
||||
ten times */
|
||||
for i in {1..10}; do
|
||||
|
||||
/* Each ensamble simulation execution uses 100 nodes, launch ten of them
|
||||
in parallel */
|
||||
for j in {1..10}; do
|
||||
mpiexec -n 100 -machinefile ./list1_$j p1.out a1 & pids[$i]=$!;
|
||||
done
|
||||
|
||||
/* Wait until the ten ensamble simulation programs finish */
|
||||
for j in {1..10}; do wait ${pids[$j]}; done
|
||||
|
||||
/* Launch one data assimilation program using 1000 nodes */
|
||||
mpiexec -n 1000 -machinefile ./list2 p2.out a2
|
||||
done
|
||||
|
||||
The job script should be modified like this:
|
||||
|
||||
for i in {1..10}; do
|
||||
for j in {1..10}; do
|
||||
/* Replace mpiexec with ql_mpiexec_start */
|
||||
ql_mpiexec_start -n 100 -machinefile ./list1_$j p1.out a1 & pids[$j]=$!;
|
||||
done
|
||||
|
||||
for j in {1..10}; do wait ${pids[$j]}; done
|
||||
|
||||
ql_mpiexec_start -n 1000 -machinefile ./list2 p2.out a2
|
||||
done
|
||||
|
||||
/* p1.out and p2.out don't exit but are waiting for the next calculation.
|
||||
So tell them to exit */
|
||||
for j in {1..10}; do
|
||||
ql_mpiexec_finalize -machinefile ./list1_$i p1.out a1;
|
||||
done
|
||||
ql_mpiexec_finalize -machinefile ./list2 p2.out a2;
|
||||
|
||||
|
||||
The second step is explained using a pseudo-code.
|
||||
|
||||
MPI_Init();
|
||||
Prepare data exchange with preceding / following MPI programs
|
||||
loop:
|
||||
foreach Fortran module
|
||||
Initialize data using command-line argments, parameter files,
|
||||
environment variables
|
||||
Input data from preceding MPI programs / Read snap-shot
|
||||
Perform main calculation
|
||||
Output data to following MPI programs / Write snap-shot
|
||||
/* ql_client() waits for command of ql_mpiexec_{start,finish} */
|
||||
if (ql_client() == QL_CONTINUE) { goto loop; }
|
||||
MPI_Finalize();
|
||||
|
||||
qlmpilib.h should be included in the code and libql{mpi,fort}.so
|
||||
should be linked to the executable file.
|
||||
|
||||
|
||||
========================
|
||||
Restrictions on McKernel
|
||||
========================
|
||||
|
||||
1. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
|
||||
correctly even if the mmap() returns a success. An access of their
|
||||
mapping receives the SIGSEGV signal.
|
||||
|
||||
2. clone() supports only the following flags. All the other flags
|
||||
cause clone() to return error or are simply ignored.
|
||||
|
||||
* CLONE_CHILD_CLEARTID
|
||||
* CLONE_CHILD_SETTID
|
||||
* CLONE_PARENT_SETTID
|
||||
* CLONE_SETTLS
|
||||
* CLONE_SIGHAND
|
||||
* CLONE_VM
|
||||
|
||||
3. PAPI has the following restriction.
|
||||
|
||||
* Number of counters a user can use at the same time is up to the
|
||||
number of the physical counters in the processor.
|
||||
|
||||
4. msync writes back only the modified pages mapped by the calling process.
|
||||
|
||||
5. The following syscalls always return the ENOSYS error.
|
||||
|
||||
* migrate_pages()
|
||||
* move_pages()
|
||||
* set_robust_list()
|
||||
|
||||
6. The following syscalls always return the EOPNOTSUPP error.
|
||||
|
||||
* arch_prctl(ARCH_SET_GS)
|
||||
* signalfd()
|
||||
|
||||
7. signalfd4() returns a fd, but signal is not notified through the
|
||||
fd.
|
||||
|
||||
8. set_rlimit sets the limit values but they are not enforced.
|
||||
|
||||
9. Address randomization is not supported.
|
||||
|
||||
10. brk() extends the heap more than requestd when -h
|
||||
(--extend-heap-by=)<step> option of mcexec is used with the value
|
||||
larger than 4 KiB. syscall_pwrite02 of LTP would fail for this
|
||||
reason. This is because the test expects that the end of the heap
|
||||
is set to the same address as the argument of sbrk() and expects a
|
||||
segmentation violation occurs when it tries to access the memory
|
||||
area right next to the boundary. However, the optimization sets
|
||||
the end to a value larger than the requested. Therefore, the
|
||||
expected segmentation violation doesn't occur.
|
||||
|
||||
11. setpriority()/getpriority() won't work. They might set/get the
|
||||
priority of a random mcexec thread. This is because there's no
|
||||
fixed correspondence between a McKernel thread which issues the
|
||||
system call and a mcexec thread which handles the offload request.
|
||||
|
||||
12. mbind() can set the policy but it is not used when allocating
|
||||
physical pages.
|
||||
|
||||
13. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
|
||||
set_mempolicy()/mbind() are not supported.
|
||||
|
||||
14. The MPOL_BIND policy for set_mempolicy()/mbind() works as the same
|
||||
as the MPOL_PREFERRED policy. That is, the physical page allocator
|
||||
doesn't give up the allocation when the specified nodes are
|
||||
running out of pages but continues to search pages in the other
|
||||
nodes.
|
||||
|
||||
15. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
|
||||
later. In addition, crash_kexec_post_notifiers kernel argument
|
||||
must be given to Linux kernel.
|
||||
|
||||
16. setfsuid()/setfsgid() cannot change the id of the calling thread.
|
||||
Instead, it changes that of the mcexec worker thread which takes
|
||||
the system-call offload request.
|
||||
|
||||
17. mmap (hugeTLBfs): The physical pages corresponding to a map are
|
||||
released when no McKernel process exist. The next map gets fresh
|
||||
physical pages.
|
||||
|
||||
18. Sticky bit on executable file has no effect.
|
||||
|
||||
19. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
|
||||
process of booting McKernel due to the Linux bug, found in
|
||||
Linux-3.10 and fixed in the later version. One way to circumvent
|
||||
this is to always assign the same CPU set to McKernel.
|
||||
|
||||
20. madvise:
|
||||
* MADV_HWPOISON and MADV_SOFT_OFFLINE always returns -EPERM.
|
||||
* MADV_MERGEABLE and MADV_UNMERGEABLE always returns -EINVAL.
|
||||
* MADV_HUGEPAGE and MADV_NOHUGEPAGE on file map returns -EINVAL
|
||||
(It succeeds on RHEL-8 for aarch64).
|
||||
|
||||
21. brk() and mmap() doesn't report out-of-memory through its return
|
||||
value. Instead, page-fault reports the error.
|
||||
|
||||
22. Anonymous mmap pre-maps requested number of pages when contiguous
|
||||
pages are available. Demand paging is used when not available.
|
||||
|
||||
23. Mixing page sizes in anonymous shared mapping is not allowed. mmap
|
||||
creates vm_range with one page size. And munmap or mremap that
|
||||
needs the reduced page size changes the sizes of all the pages of
|
||||
the vm_range.
|
||||
|
||||
24. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
|
||||
(job-scheduler).
|
||||
|
||||
25. The behaviors of madvise and mbind are changed to do nothing and
|
||||
report success as a workaround for Fugaku.
|
||||
|
||||
26. mmap() allows unlimited overcommit. Note that it corresponds to
|
||||
setting sysctl ``vm.overcommit_memory`` to 1.
|
||||
109
README.md
109
README.md
@ -10,7 +10,7 @@ IHK/McKernel is a light-weight multi-kernel operating system designed for high-e
|
||||
|
||||
## Contents
|
||||
|
||||
- [Background] (#background)
|
||||
- [Background](#background-and-motivation)
|
||||
- [Architectural Overview](#architectural-overview)
|
||||
- [Installation](#installation)
|
||||
- [The Team](#the-team)
|
||||
@ -85,7 +85,7 @@ sudo reboot
|
||||
You will need the following packages installed:
|
||||
|
||||
~~~~
|
||||
sudo yum install kernel-devel binutils-devel libnuma-devel
|
||||
sudo yum install cmake kernel-devel binutils-devel systemd-devel numactl-devel gcc make nasm git
|
||||
~~~~
|
||||
|
||||
Grant read permission to the System.map file of your kernel version:
|
||||
@ -96,24 +96,51 @@ sudo chmod a+r /boot/System.map-`uname -r`
|
||||
|
||||
##### 4. Obtain sources and compile the kernel
|
||||
|
||||
Clone the source code and set up ihk symlink (this is currently required):
|
||||
Clone the source code:
|
||||
|
||||
~~~~
|
||||
mkdir -p ~/src/ihk+mckernel/
|
||||
cd ~/src/ihk+mckernel/
|
||||
git clone -r git@github.com:RIKEN-SysSoft/mckernel.git
|
||||
git clone --recursive -b development https://github.com/RIKEN-SysSoft/mckernel.git
|
||||
~~~~
|
||||
|
||||
(Optional) Checkout to the specific branch or version:
|
||||
|
||||
~~~~
|
||||
cd mckernel
|
||||
git checkout <pathspec>
|
||||
git submodule update
|
||||
~~~~
|
||||
|
||||
Foe example, if you want to try the development branch, use "development" as the pathspec. If you want to try the prerelease version 1.7.0-0.2, use "1.7.0-0.2".
|
||||
|
||||
###### 4.1 Install with cmake
|
||||
|
||||
Configure and compile:
|
||||
|
||||
~~~~
|
||||
mkdir -p build && cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/mckernel
|
||||
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/ihk+mckernel/mckernel
|
||||
make -j install
|
||||
~~~~
|
||||
|
||||
The IHK kernel modules and McKernel kernel image should be installed under the **ihk+mckernel** folder in your home directory.
|
||||
|
||||
###### 4.2 Install with rpm
|
||||
|
||||
Configure, compile and build rpm:
|
||||
|
||||
~~~~
|
||||
mkdir -p build && cd build
|
||||
cmake $HOME/src/ihk+mckernel/mckernel
|
||||
make dist
|
||||
cp mckernel-<version>.tar.gz <rpmbuild>/SOURCES
|
||||
rpm -ba scripts/mckernel.spec
|
||||
sudo rpm -ivh <rpmbuild>/RPMS/<arch>/mckernel-<version>-<release>_<linux_kernel_ver>_<dist>.<arch>.rpm
|
||||
~~~~
|
||||
|
||||
The IHK kernel modules and McKernel kernel image are installed under the system directory.
|
||||
|
||||
##### 5. Boot McKernel
|
||||
|
||||
A boot script called mcreboot.sh is provided under sbin in the install folder. To boot on logical CPU 1 with 512MB of memory, use the following invocation:
@@ -170,6 +197,71 @@ Finally, to shutdown McKernel and release CPU/memory resources back to Linux use
sudo ./sbin/mcstop+release.sh
~~~~

##### 7. Advanced: Enable Utility Thread offloading Interface (UTI)

UTI enables a runtime (for example, an MPI runtime) to spawn utility threads, such as MPI asynchronous progress threads, onto Linux cores.

1. Install capstone

Install capstone-devel from EPEL:

~~~~
sudo yum install epel-release
sudo yum install capstone-devel
~~~~

2. Install syscall_intercept

~~~~
git clone https://github.com/RIKEN-SysSoft/syscall_intercept.git
cmake ../arch/aarch64 -DCMAKE_INSTALL_PREFIX=<syscall-intercept-install> -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DTREAT_WARNINGS_AS_ERRORS=OFF
~~~~
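
Note that the snippet above omits creating a build directory and the build/install step; a plausible completion (the directory names are assumptions) is:

~~~~
mkdir syscall_intercept/build && cd syscall_intercept/build
cmake ../arch/aarch64 -DCMAKE_INSTALL_PREFIX=<syscall-intercept-install> \
      -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DTREAT_WARNINGS_AS_ERRORS=OFF
make && make install
~~~~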

3. Install UTI for McKernel

Install:

~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<mckernel-install> --with-rm=mckernel
make && make install
~~~~

4. Install McKernel

~~~~
CMAKE_PREFIX_PATH=<syscall-intercept-install> cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel -DENABLE_UTI=ON $HOME/src/ihk+mckernel/mckernel
~~~~

5. Run an executable

~~~~
mcexec --enable-uti <command>
~~~~

6. Install UTI for Linux for performance comparison

Install with make:

~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<uti-install> --with-rm=linux
make && make install
~~~~

Install with rpm:

~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<uti-install> --with-rm=linux
rm -f ~/rpmbuild/SOURCES/<version>.tar.gz
rpmbuild -ba ./scripts/uti.spec
rpm -Uvh uti-<version>-<release>-<arch>.rpm
~~~~

## The Team

The McKernel project was started at The University of Tokyo and is currently developed mainly at RIKEN.
@@ -184,3 +276,10 @@ Some of our collaborators include:
## License

McKernel is GPL licensed, as found in the LICENSE file.

## Contact

Please send us feedback via one of the following mailing lists. Subscription via [www.pccluster.org](http://www.pccluster.org/mailman/listinfo/mckernel-users) is required.

* English: mckernel-users@pccluster.org
* Japanese: mckernel-users-jp@pccluster.org

assert.c

@@ -1,4 +1,4 @@
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2019 */

#include <process.h>
#include <list.h>
@@ -53,4 +53,4 @@ STATIC_ASSERT(SVE_PT_FPSIMD_OFFSET == sizeof(struct user_sve_header));
STATIC_ASSERT(SVE_PT_SVE_OFFSET == sizeof(struct user_sve_header));

/* assert for struct arm64_cpu_local_thread member offset define */
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 160);
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 168);

coredump.c

@@ -1,9 +1,15 @@
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2016 */
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <process.h>
#include <elfcore.h>
#include <string.h>
#include <ptrace.h>
#include <cls.h>
#include <hwcap.h>

void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
#define align32(x) ((((x) + 3) / 4) * 4)

void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
        struct thread *thread, void *regs0, int sig)
{
    struct pt_regs *regs = regs0;
    struct elf_prstatus64 tmp_prstatus;
@@ -14,8 +20,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
    short int pr_cursig;
    a8_uint64_t pr_sigpend;
    a8_uint64_t pr_sighold;
    pid_t pr_pid;
    pid_t pr_ppid;
    pid_t pr_pgrp;
    pid_t pr_sid;
    struct prstatus64_timeval pr_utime;
@@ -23,10 +27,66 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
    struct prstatus64_timeval pr_cutime;
    struct prstatus64_timeval pr_cstime;
    */

    /* copy x0-30, sp, pc, pstate */
    memcpy(&tmp_prstatus.pr_reg, &regs->user_regs, sizeof(tmp_prstatus.pr_reg));
    tmp_prstatus.pr_fpvalid = 0; /* We assume no fp */

    /* copy to the (possibly unaligned) prstatus address */
    memcpy(prstatus, &tmp_prstatus, sizeof(*prstatus));

    prstatus->pr_pid = thread->tid;
    if (thread->proc->parent) {
        prstatus->pr_ppid = thread->proc->parent->pid;
    }

    prstatus->pr_info.si_signo = sig;
    prstatus->pr_cursig = sig;
}

int arch_get_thread_core_info_size(void)
{
    const struct user_regset_view *view = current_user_regset_view();
    const struct user_regset *regset = find_regset(view, NT_ARM_SVE);

    if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
        return 0;
    }
    return sizeof(struct note) + align32(sizeof("LINUX"))
        + regset_size(cpu_local_var(current), regset);
}

void arch_fill_thread_core_info(struct note *head,
        struct thread *thread, void *regs)
{
    const struct user_regset_view *view = current_user_regset_view();
    const struct user_regset *regset = find_regset(view, NT_ARM_SVE);

    if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
        return;
    }

    /* save the registers beforehand */
    save_fp_regs(thread);

    if (regset->core_note_type && regset->get &&
        (!regset->active || regset->active(thread, regset))) {
        int ret;
        size_t size = regset_size(thread, regset);
        void *namep;
        void *descp;

        namep = (void *) (head + 1);
        descp = namep + align32(sizeof("LINUX"));

        ret = regset->get(thread, regset, 0, size, descp, NULL);
        if (ret) {
            return;
        }

        head->namesz = sizeof("LINUX");
        head->descsz = size;
        head->type = NT_ARM_SVE;
        memcpy(namep, "LINUX", sizeof("LINUX"));
    }
}

cpu.c

@@ -1,6 +1,5 @@
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <errno.h>
@@ -30,9 +29,11 @@
#include <debug-monitors.h>
#include <sysreg.h>
#include <cpufeature.h>
#include <debug.h>
#include <ihk/debug.h>
#include <hwcap.h>
#include <virt.h>
#include <init.h>
#include <bootparam.h>

//#define DEBUG_PRINT_CPU

@@ -67,6 +68,7 @@ void (*gic_dist_init)(unsigned long dist_base_pa, unsigned long size);
void (*gic_cpu_init)(unsigned long cpu_base_pa, unsigned long size);
void (*gic_enable)(void);
void (*arm64_issue_ipi)(unsigned int cpid, unsigned int vector);
void (*arm64_issue_host_ipi)(unsigned int cpid, unsigned int vector);
void (*handle_arch_irq)(struct pt_regs *);

static void gic_init(void)
@@ -77,14 +79,18 @@ static void gic_init(void)
        gic_cpu_init = gic_cpu_init_gicv3;
        gic_enable = gic_enable_gicv3;
        arm64_issue_ipi = arm64_issue_ipi_gicv3;
        arm64_issue_host_ipi = arm64_issue_host_ipi_gicv3;
        handle_arch_irq = handle_interrupt_gicv3;
        kprintf("%s: GICv3\n", __func__);
    } else {
        /* Setup functions for GICv2 */
        gic_dist_init = gic_dist_init_gicv2;
        gic_cpu_init = gic_cpu_init_gicv2;
        gic_enable = gic_enable_gicv2;
        arm64_issue_ipi = arm64_issue_ipi_gicv2;
        arm64_issue_host_ipi = arm64_issue_host_ipi_gicv2;
        handle_arch_irq = handle_interrupt_gicv2;
        kprintf("%s: GICv2\n", __func__);
    }

    gic_dist_init(ihk_param_gic_dist_base_pa, ihk_param_gic_dist_map_size);
@@ -114,42 +120,94 @@ static struct ihk_mc_interrupt_handler cpu_stop_handler = {
};

extern long freeze_thaw(void *nmi_ctx);
static void multi_nm_interrupt_handler(void *priv)
static void multi_interrupt_handler(void *priv)
{
    extern int nmi_mode;
    struct pt_regs *regs = (struct pt_regs *)priv;
    union arm64_cpu_local_variables *clv;

    switch (nmi_mode) {
    switch (multi_intr_mode) {
    case 1:
    case 2:
        /* mode == 1or2, for FREEZER NMI */
        dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
            __func__, nmi_mode);
    case 2: /* mode == 1 or 2, for FREEZER intr */
        dkprintf("%s: freeze mode intr catch. (multi_intr_mode=%d)\n",
            __func__, multi_intr_mode);
        freeze_thaw(NULL);
        break;
    default:
        ekprintf("%s: Unknown multi-intr-mode(%d) detected.\n",
            __func__, multi_intr_mode);
        break;
    }
}

void arch_save_panic_regs(void *irq_regs)
{
    struct pt_regs *regs = (struct pt_regs *)irq_regs;
    union arm64_cpu_local_variables *clv;

    clv = get_arm64_this_cpu_local();

    /* For user-space, use saved kernel context */
    if (regs->pc < USER_END) {
        memset(clv->arm64_cpu_local_thread.panic_regs,
            0, sizeof(clv->arm64_cpu_local_thread.panic_regs));
        clv->arm64_cpu_local_thread.panic_regs[29] =
            current_thread_info()->cpu_context.fp;
        clv->arm64_cpu_local_thread.panic_regs[31] =
            current_thread_info()->cpu_context.sp;
        clv->arm64_cpu_local_thread.panic_regs[32] =
            current_thread_info()->cpu_context.pc;
        clv->arm64_cpu_local_thread.panic_regs[33] =
            PSR_MODE_EL1h;
    }
    else {
        memcpy(clv->arm64_cpu_local_thread.panic_regs,
            regs->regs, sizeof(regs->regs));
        clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
        clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
        clv->arm64_cpu_local_thread.panic_regs[33] =
            regs->pstate;

    }

    clv->arm64_cpu_local_thread.paniced = 1;
}

void arch_clear_panic(void)
{
    union arm64_cpu_local_variables *clv;

    clv = get_arm64_this_cpu_local();
    clv->arm64_cpu_local_thread.paniced = 0;
}

static struct ihk_mc_interrupt_handler multi_intr_handler = {
    .func = multi_interrupt_handler,
    .priv = NULL,
};

static void multi_nm_interrupt_handler(void *irq_regs)
{
    extern int nmi_mode;

    dkprintf("%s: ...\n", __func__);
    switch (nmi_mode) {
    case 0:
        /* mode == 0, for MEMDUMP NMI */
        clv = get_arm64_this_cpu_local();

        if (regs) {
            memcpy(clv->arm64_cpu_local_thread.panic_regs,
                regs->regs, sizeof(regs->regs));
            clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
            clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
            clv->arm64_cpu_local_thread.panic_regs[33] =
                regs->pstate;
        }
        clv->arm64_cpu_local_thread.paniced = 1;
        /* mode == 0, for MEMDUMP NMI */
        arch_save_panic_regs(irq_regs);
        ihk_mc_query_mem_areas();
        /* memdump-nmi halts McKernel, so break is unnecessary. */
        /* fall through */
    case 3:
        /* mode == 3, for SHUTDOWN-WAIT NMI */
        while (1) {
        /* mode == 3, for SHUTDOWN-WAIT NMI */
        kprintf("%s: STOP\n", __func__);
        while (nmi_mode != 4)
            cpu_halt();
        break;

    case 4:
        /* mode == 4, continue NMI */
        arch_clear_panic();
        if (!ihk_mc_get_processor_id()) {
            ihk_mc_clear_dump_page_completion();
        }
        kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
        break;

    default:
@@ -423,6 +481,8 @@ void ihk_mc_init_ap(void)

    ihk_mc_register_interrupt_handler(INTRID_CPU_STOP, &cpu_stop_handler);
    ihk_mc_register_interrupt_handler(INTRID_MULTI_NMI, &multi_nmi_handler);
    ihk_mc_register_interrupt_handler(INTRID_MULTI_INTR,
                                      &multi_intr_handler);
    ihk_mc_register_interrupt_handler(
        ihk_mc_get_vector(IHK_TLB_FLUSH_IRQ_VECTOR_START),
        &remote_tlb_flush_handler);
@@ -776,6 +836,21 @@ unsigned long cpu_disable_interrupt_save(void)
    return flags;
}

/* save ICC_PMR_EL1 & enable interrupts (ICC_PMR_EL1 <= ICC_PMR_EL1_UNMASKED) */
unsigned long cpu_enable_interrupt_save(void)
{
    unsigned long flags;
    unsigned long masked = ICC_PMR_EL1_UNMASKED;

    asm volatile(
        "mrs_s %0, " __stringify(ICC_PMR_EL1) "\n"
        "msr_s " __stringify(ICC_PMR_EL1) ",%1"
        : "=&r" (flags)
        : "r" (masked)
        : "memory");
    return flags;
}

#else /* defined(CONFIG_HAS_NMI) */

/* @ref.impl arch/arm64/include/asm/irqflags.h::arch_local_irq_enable */
@@ -824,6 +899,20 @@ unsigned long cpu_disable_interrupt_save(void)
        : "memory");
    return flags;
}

/* save PSTATE.DAIF & enable interrupts (clear the PSTATE.DAIF I bit) */
unsigned long cpu_enable_interrupt_save(void)
{
    unsigned long flags;

    asm volatile(
        "mrs %0, daif // arch_local_irq_save\n"
        "msr daifclr, #2"
        : "=r" (flags)
        :
        : "memory");
    return flags;
}
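
Both variants return the previous interrupt state so callers can restore it later; a minimal usage sketch (the caller pattern is assumed, not part of this change):

~~~~
unsigned long flags = cpu_enable_interrupt_save();
/* ... interruptible work ... */
cpu_restore_interrupt(flags);
~~~~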
#endif /* defined(CONFIG_HAS_NMI) */

/* there is no "pause" instruction; use "yield" instead */
@@ -951,7 +1040,7 @@ void ihk_mc_boot_cpu(int cpuid, unsigned long pc)
        setup_cpu_features();
    }

    init_sve_vl();
    sve_setup();
}

/* for ihk_mc_init_context() */
@@ -986,6 +1075,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
    /* branch in ret_from_fork */
    new_ctx->thread->cpu_context.x19 = (unsigned long)next_function;

    sp -= 16;
    new_ctx->thread->cpu_context.fp = sp;

    /* set stack_pointer */
    new_ctx->thread->cpu_context.sp = sp - sizeof(ihk_mc_user_context_t);

@@ -1001,9 +1093,10 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
    const int lcpuid = ihk_mc_get_processor_id();
    const unsigned long syscallno = current_pt_regs()->syscallno;
#ifdef CONFIG_ARM64_SVE
    const uint16_t orig_sve_vl = current_thread_info()->sve_vl;
    const uint16_t orig_sve_vl_onexec = current_thread_info()->sve_vl_onexec;
    const uint16_t orig_sve_flags = current_thread_info()->sve_flags;
    struct thread_info *ti = current_thread_info();
    const unsigned int orig_sve_vl = ti->sve_vl;
    const unsigned int orig_sve_vl_onexec = ti->sve_vl_onexec;
    const unsigned long orig_sve_flags = ti->sve_flags;
#endif /* CONFIG_ARM64_SVE */

    /* get kernel stack address */
@@ -1023,6 +1116,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,

    /* set stack_pointer */
    new_ctx->thread->cpu_context.sp = sp;
    /* use the 16 bytes padding in ihk_mc_init_user_process()
     * as the closing frame in the frame chain */
    new_ctx->thread->cpu_context.fp = sp + sizeof(ihk_mc_user_context_t);

    /* clear pt_regs area */
    new_uctx = (ihk_mc_user_context_t *)new_ctx->thread->cpu_context.sp;
@@ -1183,7 +1279,7 @@ long ihk_mc_show_cpuinfo(char *buf, size_t buf_size, unsigned long read_off, int

    /* generate strings */
    loff += scnprintf(lbuf + loff, lbuf_size - loff,
        "processor\t: %d\n", cpuinfo->hwid);
        "processor\t: %d\n", i);
    loff += scnprintf(lbuf + loff, lbuf_size - loff, "Features\t:");

    for (j = 0; hwcap_str[j]; j++) {
@@ -1234,7 +1330,6 @@ err:
}

static int check_and_allocate_fp_regs(struct thread *thread);
void save_fp_regs(struct thread *thread);

void arch_clone_thread(struct thread *othread, unsigned long pc,
        unsigned long sp, struct thread *nthread)
@@ -1346,11 +1441,15 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
}

/*@
  @ requires \valid_apicid(cpu); // valid APIC ID or not
  @ requires \valid_cpuid(cpu); // valid CPU logical ID
  @ ensures \result == 0
  @*/
int ihk_mc_interrupt_cpu(int cpu, int vector)
{
    if (cpu < 0 || cpu >= num_processors) {
        kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
        return -1;
    }
    dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
    (*arm64_issue_ipi)(cpu, vector);
    return 0;
@@ -1398,6 +1497,19 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
        }
    }
#endif /*ENABLE_PERF*/

#ifdef PROFILE_ENABLE
    if (prev && prev->profile && prev->profile_start_ts != 0) {
        prev->profile_elapsed_ts +=
            (rdtsc() - prev->profile_start_ts);
        prev->profile_start_ts = 0;
    }

    if (next->profile && next->profile_start_ts == 0) {
        next->profile_start_ts = rdtsc();
    }
#endif

    if (likely(prev)) {
        tls_thread_switch(prev, next);

@@ -1471,8 +1583,7 @@ check_and_allocate_fp_regs(struct thread *thread)

    if (!thread->fp_regs) {
        kprintf("error: allocating fp_regs pages\n");
        result = 1;
        panic("panic: error allocating fp_regs pages");
        result = -ENOMEM;
        goto out;
    }

@@ -1481,37 +1592,51 @@ check_and_allocate_fp_regs(struct thread *thread)

#ifdef CONFIG_ARM64_SVE
    if (likely(elf_hwcap & HWCAP_SVE)) {
        sve_alloc(thread);
        result = sve_alloc(thread);
    }
#endif /* CONFIG_ARM64_SVE */
out:
    if (result) {
        release_fp_regs(thread);
    }
    return result;
}

/*@
  @ requires \valid(thread);
  @*/
void
int
save_fp_regs(struct thread *thread)
{
    int ret = 0;
    if (thread == &cpu_local_var(idle)) {
        return;
        goto out;
    }

    if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
        if (check_and_allocate_fp_regs(thread) != 0) {
            // alloc error.
            return;
        ret = check_and_allocate_fp_regs(thread);
        if (ret) {
            goto out;
        }
        thread_fpsimd_save(thread);
    }
out:
    return ret;
}

void copy_fp_regs(struct thread *from, struct thread *to)
int copy_fp_regs(struct thread *from, struct thread *to)
{
    if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
        memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
    int ret = 0;

    if (from->fp_regs != NULL) {
        ret = check_and_allocate_fp_regs(to);
        if (!ret) {
            memcpy(to->fp_regs,
                   from->fp_regs,
                   sizeof(fp_regs_struct));
        }
    }
    return ret;
}

void clear_fp_regs(void)
@@ -1626,6 +1751,7 @@ static inline int arch_cpu_mrs(uint32_t sys_reg, uint64_t *val)
    SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE5_EL0);
    SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE6_EL0);
    SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE7_EL0);
    SYSREG_READ_S(IMP_PF_PMUSERENR_EL0);
    SYSREG_READ_S(IMP_BARRIER_CTRL_EL1);
    SYSREG_READ_S(IMP_BARRIER_BST_BIT_EL1);
    SYSREG_READ_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
@@ -1696,6 +1822,7 @@ static inline int arch_cpu_msr(uint32_t sys_reg, uint64_t val)
    SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE5_EL0);
    SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE6_EL0);
    SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE7_EL0);
    SYSREG_WRITE_S(IMP_PF_PMUSERENR_EL0);
    SYSREG_WRITE_S(IMP_BARRIER_CTRL_EL1);
    SYSREG_WRITE_S(IMP_BARRIER_BST_BIT_EL1);
    SYSREG_WRITE_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
@@ -1753,6 +1880,11 @@ int arch_cpu_read_write_register(
        ret = -1;
    }

    dkprintf("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
         __FUNCTION__,
         (op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
         desc->addr, desc->val);

    return ret;
}

@@ -1762,4 +1894,9 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
    return -1;
}

void arch_flush_icache_all(void)
{
    asm("ic ialluis");
    dsb(ish);
}
/*** end of file ***/

@@ -970,7 +970,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
#ifdef CONFIG_ARM64_SVE
    HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SVE),
#endif
    {},
    { 0 },
};

/* @ref.impl arch/arm64/kernel/cpufeature.c */

@@ -10,5 +10,5 @@ struct cpu_info cpu_table[] = {
        .cpu_name = "AArch64 Processor",
        .cpu_setup = __cpu_setup,
    },
    { /* Empty */ },
    { 0 },
};

@@ -2,7 +2,6 @@
#include <cputype.h>
#include <irqflags.h>
#include <ihk/context.h>
#include <ihk/debug.h>
#include <signal.h>
#include <errno.h>
#include <debug-monitors.h>

fpsimd.c

@@ -1,4 +1,4 @@
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <thread_info.h>
#include <fpsimd.h>
#include <cpuinfo.h>
@@ -9,8 +9,9 @@
#include <prctl.h>
#include <cpufeature.h>
#include <kmalloc.h>
#include <debug.h>
#include <ihk/debug.h>
#include <process.h>
#include <bitmap.h>

//#define DEBUG_PRINT_FPSIMD

@@ -21,11 +22,87 @@

#ifdef CONFIG_ARM64_SVE

/* Set of available vector lengths, as vq_to_bit(vq): */
static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);

/* Maximum supported vector length across all CPUs (initially poisoned) */
int sve_max_vl = -1;

/* Default VL for tasks that don't set it explicitly: */
int sve_default_vl = -1;

/*
 * Helpers to translate bit indices in sve_vq_map to VQ values (and
 * vice versa).  This allows find_next_bit() to be used to find the
 * _maximum_ VQ not exceeding a certain value.
 */

static unsigned int vq_to_bit(unsigned int vq)
{
    return SVE_VQ_MAX - vq;
}

static unsigned int bit_to_vq(unsigned int bit)
{
    if (bit >= SVE_VQ_MAX) {
        bit = SVE_VQ_MAX - 1;
    }
    return SVE_VQ_MAX - bit;
}
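
A quick worked example of the reversed-bitmap trick (the numbers assume SVE_VQ_MAX == 512, the value defined later in this change set):

~~~~
/* vq_to_bit(512) == 0 and vq_to_bit(1) == 511, so larger VQs occupy
 * smaller bit indices.  find_next_bit(sve_vq_map, SVE_VQ_MAX,
 * vq_to_bit(vq)) therefore yields the largest supported VQ that does
 * not exceed vq, which is exactly what VL clamping needs. */
~~~~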

/*
 * All vector length selection from userspace comes through here.
 * We're on a slow path, so some sanity-checks are included.
 * If things go wrong there's a bug somewhere, but try to fall back to a
 * safe choice.
 */
static unsigned int find_supported_vector_length(unsigned int vl)
{
    int bit;
    int max_vl = sve_max_vl;

    if (!sve_vl_valid(vl)) {
        vl = SVE_VL_MIN;
    }

    if (!sve_vl_valid(max_vl)) {
        max_vl = SVE_VL_MIN;
    }

    if (vl > max_vl) {
        vl = max_vl;
    }

    bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
                vq_to_bit(sve_vq_from_vl(vl)));
    return sve_vl_from_vq(bit_to_vq(bit));
}

static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
{
    unsigned int vq, vl;
    unsigned long zcr;

    bitmap_zero(map, SVE_VQ_MAX);

    zcr = ZCR_EL1_LEN_MASK;
    zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr;

    for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
        /* self-syncing */
        write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1);
        vl = sve_get_vl();
        /* skip intervening lengths */
        vq = sve_vq_from_vl(vl);
        set_bit(vq_to_bit(vq), map);
    }
}

void sve_init_vq_map(void)
{
    sve_probe_vqs(sve_vq_map);
}

size_t sve_state_size(struct thread const *thread)
{
    unsigned int vl = thread->ctx.thread->sve_vl;
@@ -42,17 +119,19 @@ void sve_free(struct thread *thread)
    }
}

void sve_alloc(struct thread *thread)
int sve_alloc(struct thread *thread)
{
    if (thread->ctx.thread->sve_state) {
        return;
        return 0;
    }

    thread->ctx.thread->sve_state =
        kmalloc(sve_state_size(thread), IHK_MC_AP_NOWAIT);
    BUG_ON(!thread->ctx.thread->sve_state);

    if (thread->ctx.thread->sve_state == NULL) {
        return -ENOMEM;
    }
    memset(thread->ctx.thread->sve_state, 0, sve_state_size(thread));
    return 0;
}

static int get_nr_threads(struct process *proc)
@@ -75,19 +154,7 @@ int sve_set_vector_length(struct thread *thread,
{
    struct thread_info *ti = thread->ctx.thread;

    BUG_ON(thread == cpu_local_var(current) && cpu_local_var(no_preempt) == 0);

    /*
     * To avoid accidents, forbid setting for individual threads of a
     * multithreaded process.  User code that knows what it's doing can
     * pass PR_SVE_SET_VL_THREAD to override this restriction:
     */
    if (!(flags & PR_SVE_SET_VL_THREAD) && get_nr_threads(thread->proc) != 1) {
        return -EINVAL;
    }
    flags &= ~(unsigned long)PR_SVE_SET_VL_THREAD;

    if (flags & ~(unsigned long)(PR_SVE_SET_VL_INHERIT |
    if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
                                 PR_SVE_SET_VL_ONEXEC)) {
        return -EINVAL;
    }
@@ -96,13 +163,19 @@ int sve_set_vector_length(struct thread *thread,
        return -EINVAL;
    }

    if (vl > sve_max_vl) {
        BUG_ON(!sve_vl_valid(sve_max_vl));
        vl = sve_max_vl;
    /*
     * Clamp to the maximum vector length that VL-agnostic SVE code can
     * work with.  A flag may be assigned in the future to allow setting
     * of larger vector lengths without confusing older software.
     */
    if (vl > SVE_VL_ARCH_MAX) {
        vl = SVE_VL_ARCH_MAX;
    }

    if (flags & (PR_SVE_SET_VL_ONEXEC |
             PR_SVE_SET_VL_INHERIT)) {
    vl = find_supported_vector_length(vl);

    if (flags & (PR_SVE_VL_INHERIT |
             PR_SVE_SET_VL_ONEXEC)) {
        ti->sve_vl_onexec = vl;
    } else {
        /* Reset VL to system default on next exec: */
@@ -114,39 +187,42 @@ int sve_set_vector_length(struct thread *thread,
        goto out;
    }

    if (vl != ti->sve_vl) {
        if ((elf_hwcap & HWCAP_SVE)) {
            fp_regs_struct fp_regs;
            memset(&fp_regs, 0, sizeof(fp_regs));
    if (vl == ti->sve_vl) {
        goto out;
    }

            /* for self at prctl syscall */
            if (thread == cpu_local_var(current)) {
                save_fp_regs(thread);
                clear_fp_regs();
                thread_sve_to_fpsimd(thread, &fp_regs);
                sve_free(thread);
    if ((elf_hwcap & HWCAP_SVE)) {
        fp_regs_struct fp_regs;

                ti->sve_vl = vl;
        memset(&fp_regs, 0, sizeof(fp_regs));

                sve_alloc(thread);
                thread_fpsimd_to_sve(thread, &fp_regs);
                restore_fp_regs(thread);
            /* for target thread at ptrace */
            } else {
                thread_sve_to_fpsimd(thread, &fp_regs);
                sve_free(thread);
        /* for self at prctl syscall */
        if (thread == cpu_local_var(current)) {
            save_fp_regs(thread);
            clear_fp_regs();
            thread_sve_to_fpsimd(thread, &fp_regs);
            sve_free(thread);

                ti->sve_vl = vl;
            ti->sve_vl = vl;

                sve_alloc(thread);
                thread_fpsimd_to_sve(thread, &fp_regs);
            }
            sve_alloc(thread);
            thread_fpsimd_to_sve(thread, &fp_regs);
            restore_fp_regs(thread);
        /* for target thread at ptrace */
        } else {
            thread_sve_to_fpsimd(thread, &fp_regs);
            sve_free(thread);

            ti->sve_vl = vl;

            sve_alloc(thread);
            thread_fpsimd_to_sve(thread, &fp_regs);
        }
    }
    ti->sve_vl = vl;

out:
    ti->sve_flags = flags & PR_SVE_SET_VL_INHERIT;
    ti->sve_flags = flags & PR_SVE_VL_INHERIT;

    return 0;
}
@@ -156,44 +232,53 @@ out:
 * Encode the current vector length and flags for return.
 * This is only required for prctl(): ptrace has separate fields
 */
static int sve_prctl_status(const struct thread_info *ti)
static int sve_prctl_status(unsigned long flags)
{
    int ret = ti->sve_vl;
    int ret;
    struct thread_info *ti = cpu_local_var(current)->ctx.thread;

    ret |= ti->sve_flags << 16;
    if (flags & PR_SVE_SET_VL_ONEXEC) {
        ret = ti->sve_vl_onexec;
    }
    else {
        ret = ti->sve_vl;
    }

    if (ti->sve_flags & PR_SVE_VL_INHERIT) {
        ret |= PR_SVE_VL_INHERIT;
    }
    return ret;
}
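
Because sve_prctl_status() packs the vector length and the inherit flag into one integer, a prctl() caller would decode the result roughly as follows (a sketch; the constants are the ones introduced by this change):

~~~~
int ret = prctl(PR_SVE_GET_VL);
int vl = ret & PR_SVE_VL_LEN_MASK;          /* vector length in bytes */
int inherit = !!(ret & PR_SVE_VL_INHERIT);  /* kept across exec? */
~~~~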

/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_set_task_vl */
int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length,
              const unsigned long flags)
int sve_set_thread_vl(unsigned long arg)
{
    unsigned long vl, flags;
    int ret;

    if (!(elf_hwcap & HWCAP_SVE)) {
    vl = arg & PR_SVE_VL_LEN_MASK;
    flags = arg & ~vl;

    /* Instead of system_supports_sve() */
    if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
        return -EINVAL;
    }

    BUG_ON(thread != cpu_local_var(current));

    preempt_disable();
    ret = sve_set_vector_length(thread, vector_length, flags);
    preempt_enable();

    ret = sve_set_vector_length(cpu_local_var(current), vl, flags);
    if (ret) {
        return ret;
    }
    return sve_prctl_status(thread->ctx.thread);
    return sve_prctl_status(flags);
}

/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_get_ti_vl */
int sve_get_thread_vl(const struct thread *thread)
int sve_get_thread_vl(void)
{
    if (!(elf_hwcap & HWCAP_SVE)) {
    /* Instead of system_supports_sve() */
    if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
        return -EINVAL;
    }
    return sve_prctl_status(thread->ctx.thread);
    return sve_prctl_status(0);
}

void do_sve_acc(unsigned int esr, struct pt_regs *regs)
@@ -203,25 +288,48 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
    panic("");
}

void init_sve_vl(void)
void sve_setup(void)
{
    extern unsigned long ihk_param_default_vl;
    uint64_t zcr;

    /* Instead of system_supports_sve() */
    if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
        return;
    }

    zcr = read_system_reg(SYS_ZCR_EL1);
    BUG_ON(((zcr & ZCR_EL1_LEN_MASK) + 1) * 16 > sve_max_vl);
    /* init sve_vq_map bitmap */
    sve_init_vq_map();

    /*
     * The SVE architecture mandates support for 128-bit vectors,
     * so sve_vq_map must have at least SVE_VQ_MIN set.
     * If something went wrong, at least try to patch it up:
     */
    if (!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map)) {
        set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map);
    }

    zcr = read_system_reg(SYS_ZCR_EL1);
    sve_max_vl = sve_vl_from_vq((zcr & ZCR_EL1_LEN_MASK) + 1);

    /*
     * Sanity-check that the max VL we determined through CPU features
     * corresponds properly to sve_vq_map.  If not, do our best:
     */
    if (sve_max_vl != find_supported_vector_length(sve_max_vl)) {
        sve_max_vl = find_supported_vector_length(sve_max_vl);
    }

    sve_max_vl = ((zcr & ZCR_EL1_LEN_MASK) + 1) * 16;
    sve_default_vl = ihk_param_default_vl;

    if (sve_default_vl == 0) {
        kprintf("SVE: Getting default VL = 0 from HOST-Linux.\n");
        sve_default_vl = sve_max_vl > 64 ? 64 : sve_max_vl;
        kprintf("SVE: Using default vl(%d byte).\n", sve_default_vl);
    if (ihk_param_default_vl !=
        find_supported_vector_length(ihk_param_default_vl)) {
        kprintf("SVE: Getting unsupported default VL = %d "
            "from HOST-Linux.\n", sve_default_vl);
        sve_default_vl = find_supported_vector_length(64);
        kprintf("SVE: Using default vl(%d byte).\n",
            sve_default_vl);
    }

    kprintf("SVE: maximum available vector length %u bytes per vector\n",
@@ -232,7 +340,7 @@ void init_sve_vl(void)

#else /* CONFIG_ARM64_SVE */

void init_sve_vl(void)
void sve_setup(void)
{
    /* nothing to do. */
}

@@ -10,7 +10,7 @@
#include <smp.h>
#include <arm-gic-v3.h>

#define KERNEL_RAM_VADDR MAP_KERNEL_START
/* KERNEL_RAM_VADDR is defined by cmake */

//#ifndef CONFIG_SMP
//# define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF

@@ -255,90 +255,6 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
    cpu_restore_interrupt(flags);
}

/* An implementation of the Mellor-Crummey Scott (MCS) lock */
typedef struct mcs_lock_node {
    unsigned long locked;
    struct mcs_lock_node *next;
    unsigned long irqsave;
#ifndef ENABLE_UBSAN
} __aligned(64) mcs_lock_node_t;
#else
} mcs_lock_node_t;
#endif

typedef mcs_lock_node_t mcs_lock_t;

static void mcs_lock_init(struct mcs_lock_node *node)
{
    node->locked = 0;
    node->next = NULL;
}

static void __mcs_lock_lock(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    struct mcs_lock_node *pred;

    node->next = NULL;
    node->locked = 0;
    pred = xchg8(&(lock->next), node);

    if (pred) {
        node->locked = 1;
        pred->next = node;
        while (node->locked != 0) {
            cpu_pause();
        }
    }
}

static void __mcs_lock_unlock(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    if (node->next == NULL) {
        struct mcs_lock_node *old = atomic_cmpxchg8(&(lock->next), node, 0);

        if (old == node) {
            return;
        }

        while (node->next == NULL) {
            cpu_pause();
        }
    }

    node->next->locked = 0;
}

static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    preempt_disable();
    __mcs_lock_lock(lock, node);
}

static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    __mcs_lock_unlock(lock, node);
    preempt_enable();
}

static void mcs_lock_lock(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    node->irqsave = cpu_disable_interrupt_save();
    mcs_lock_lock_noirq(lock, node);
}

static void mcs_lock_unlock(struct mcs_lock_node *lock,
        struct mcs_lock_node *node)
{
    mcs_lock_unlock_noirq(lock, node);
    cpu_restore_interrupt(node->irqsave);
}
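
The removed MCS lock took a caller-provided queue node, so each waiter spins on its own cache line; its API was used roughly like this (the surrounding function is hypothetical):

~~~~
static mcs_lock_t list_lock;          /* shared lock word */

void list_insert(struct item *it)
{
    mcs_lock_node_t node;             /* one node per acquisition */

    mcs_lock_lock(&list_lock, &node);
    /* ... critical section ... */
    mcs_lock_unlock(&list_lock, &node);
}
~~~~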


#define SPINLOCK_IN_MCS_RWLOCK

// reader/writer lock
@@ -743,5 +659,102 @@ static inline int irqflags_can_interrupt(unsigned long flags)
}
#endif /* CONFIG_HAS_NMI */

struct ihk_rwlock {
    unsigned int lock;
};

static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
{
    rw->lock = 0;
}

static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
{
    unsigned int tmp, tmp2;

    asm volatile(
    "   sevl\n"
    "1: wfe\n"
    "2: ldaxr   %w0, %2\n"
    "   add     %w0, %w0, #1\n"
    "   tbnz    %w0, #31, 1b\n"
    "   stxr    %w1, %w0, %2\n"
    "   cbnz    %w1, 2b\n"
    : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
    :
    : "cc", "memory");
}

static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
{
    unsigned int tmp, tmp2 = 1;

    asm volatile(
    "   ldaxr   %w0, %2\n"
    "   add     %w0, %w0, #1\n"
    "   tbnz    %w0, #31, 1f\n"
    "   stxr    %w1, %w0, %2\n"
    "1:\n"
    : "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
    :
    : "cc", "memory");

    return !tmp2;
}

static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
{
    unsigned int tmp, tmp2;

    asm volatile(
    "1: ldxr    %w0, %2\n"
    "   sub     %w0, %w0, #1\n"
    "   stlxr   %w1, %w0, %2\n"
    "   cbnz    %w1, 1b\n"
    : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
    :
    : "cc", "memory");
}

static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
{
    unsigned int tmp;

    asm volatile(
    "   sevl\n"
    "1: wfe\n"
    "2: ldaxr   %w0, %1\n"
    "   cbnz    %w0, 1b\n"
    "   stxr    %w0, %w2, %1\n"
    "   cbnz    %w0, 2b\n"
    : "=&r" (tmp), "+Q" (rw->lock)
    : "r" (0x80000000)
    : "cc", "memory");
}

static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
{
    unsigned int tmp;

    asm volatile(
    "   ldaxr   %w0, %1\n"
    "   cbnz    %w0, 1f\n"
    "   stxr    %w0, %w2, %1\n"
    "1:\n"
    : "=&r" (tmp), "+Q" (rw->lock)
    : "r" (0x80000000)
    : "cc", "memory");

    return !tmp;
}

static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
{
    asm volatile(
    "   stlr    %w1, %0\n"
    : "=Q" (rw->lock) : "r" (0) : "memory");
}

#define ihk_mc_read_can_lock(rw)  ((rw)->lock < 0x80000000)
#define ihk_mc_write_can_lock(rw) ((rw)->lock == 0)
#endif /* !__HEADER_ARM64_COMMON_ARCH_LOCK_H */
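
The rwlock encoding keeps the reader count in the low bits and a writer flag in bit 31, which is what the two can_lock macros test; for example (illustrative values only):

~~~~
/* rw->lock == 0          -> unlocked: reads and writes may proceed
 * rw->lock == 3          -> three readers: more readers OK, writers wait
 * rw->lock == 0x80000000 -> a writer holds the lock: everyone waits */
~~~~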

@@ -34,7 +34,7 @@ void panic(const char *);
 */
/* early alloc area address */
/* START:_end, SIZE:512 pages */
#define MAP_EARLY_ALLOC_SHIFT   9
#define MAP_EARLY_ALLOC_SHIFT   5
#define MAP_EARLY_ALLOC_SIZE    (UL(1) << (PAGE_SHIFT + MAP_EARLY_ALLOC_SHIFT))

#ifndef __ASSEMBLY__
@@ -55,7 +55,11 @@ extern char _end[];
# define MAP_BOOT_PARAM_END     (MAP_BOOT_PARAM + MAP_BOOT_PARAM_SIZE)
#endif /* !__ASSEMBLY__ */

#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB)
/*
 * MAP_KERNEL_START is HOST MODULES_END - 8MiB.
 * It's defined by cmake.
 */
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=1 */
#
# define LD_TASK_UNMAPPED_BASE  UL(0x0000000400000000)
# define TASK_UNMAPPED_BASE     UL(0x0000000800000000)
@@ -64,9 +68,8 @@ extern char _end[];
# define MAP_VMAP_SIZE          UL(0x0000000100000000)
# define MAP_FIXED_START        UL(0xffffffbffbdfd000)
# define MAP_ST_START           UL(0xffffffc000000000)
# define MAP_KERNEL_START       UL(0xffffffffff800000)
#
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB)
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=3 */
#
# define LD_TASK_UNMAPPED_BASE  UL(0x0000002000000000)
# define TASK_UNMAPPED_BASE     UL(0x0000004000000000)
@@ -75,9 +78,8 @@ extern char _end[];
# define MAP_VMAP_SIZE          UL(0x0000000100000000)
# define MAP_FIXED_START        UL(0xfffffdfffbdd0000)
# define MAP_ST_START           UL(0xfffffe0000000000)
# define MAP_KERNEL_START       UL(0xffffffffe0000000)
#
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB)
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=2 */
#
# define LD_TASK_UNMAPPED_BASE  UL(0x0000080000000000)
# define TASK_UNMAPPED_BASE     UL(0x0000100000000000)
@@ -86,9 +88,8 @@ extern char _end[];
# define MAP_VMAP_SIZE          UL(0x0000000100000000)
# define MAP_FIXED_START        UL(0xffff7ffffbdfd000)
# define MAP_ST_START           UL(0xffff800000000000)
# define MAP_KERNEL_START       UL(0xffffffffff800000)
#
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB)
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=4 */
#
# define LD_TASK_UNMAPPED_BASE  UL(0x0000080000000000)
# define TASK_UNMAPPED_BASE     UL(0x0000100000000000)
@@ -97,7 +98,6 @@ extern char _end[];
# define MAP_VMAP_SIZE          UL(0x0000000100000000)
# define MAP_FIXED_START        UL(0xffff7ffffbdd0000)
# define MAP_ST_START           UL(0xffff800000000000)
# define MAP_KERNEL_START       UL(0xffffffffe0000000)
#
#else
# error address space is not defined.
@@ -583,6 +583,40 @@ static inline int pgsize_to_tbllv(size_t pgsize)
    return level;
}

static inline int pgsize_to_pgshift(size_t pgsize)
{
    /* We need to use if instead of switch because
     * sometimes PTLX_CONT_SIZE == PTLX_SIZE
     */
    if (pgsize == PTL4_CONT_SIZE) {
        if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
            return PTL4_CONT_SHIFT;
        }
    } else if (pgsize == PTL4_SIZE) {
        if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
            return PTL4_SHIFT;
        }
    } else if (pgsize == PTL3_CONT_SIZE) {
        if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
            return PTL3_CONT_SHIFT;
        }
    } else if (pgsize == PTL3_SIZE) {
        if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
            return PTL3_SHIFT;
        }
    } else if (pgsize == PTL2_CONT_SIZE) {
        return PTL2_CONT_SHIFT;
    } else if (pgsize == PTL2_SIZE) {
        return PTL2_SHIFT;
    } else if (pgsize == PTL1_CONT_SIZE) {
        return PTL1_CONT_SHIFT;
    } else if (pgsize == PTL1_SIZE) {
        return PTL1_SHIFT;
    }

    return -EINVAL;
}
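
For intuition, with a 4 KiB translation granule PTL1_SIZE is the 4 KiB base page and PTL2_SIZE the 2 MiB block, so (treating those constant values as an assumption about this configuration):

~~~~
/* pgsize_to_pgshift(PTL1_SIZE) == 12  (4 KiB)
 * pgsize_to_pgshift(PTL2_SIZE) == 21  (2 MiB)
 * any unknown size returns -EINVAL instead of a bogus shift */
~~~~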

static inline size_t tbllv_to_pgsize(int level)
{
    size_t pgsize = 0;

@@ -20,17 +20,21 @@ struct arm_pmu {
    void (*reset)(void *);
    int (*enable_pmu)(void);
    void (*disable_pmu)(void);
    int (*enable_counter)(int);
    int (*disable_counter)(int);
    int (*enable_intens)(int);
    int (*disable_intens)(int);
    int (*enable_counter)(unsigned long counter_mask);
    int (*disable_counter)(unsigned long counter_mask);
    int (*enable_intens)(unsigned long counter_mask);
    int (*disable_intens)(unsigned long counter_mask);
    int (*set_event_filter)(unsigned long *, int);
    void (*write_evtype)(int, uint32_t);
    int (*get_event_idx)(int num_events, unsigned long used_mask,
                 unsigned long config);
    int (*map_event)(uint32_t, uint64_t);
    int (*map_hw_event)(uint64_t config);
    int (*map_cache_event)(uint64_t config);
    int (*map_raw_event)(uint64_t config);
    void (*enable_user_access_pmu_regs)(void);
    void (*disable_user_access_pmu_regs)(void);
    int (*counter_mask_valid)(unsigned long counter_mask);
    struct per_cpu_arm_pmu *per_cpu;
};


@@ -102,4 +102,6 @@ static inline void cpu_disable_nmi(void)

#endif /* __ASSEMBLY__ */

void arch_flush_icache_all(void);

#endif /* !__HEADER_ARM64_ARCH_CPU_H */

arch-rusage.h

@@ -1,60 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED

#define DEBUG_RUSAGE

#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2

extern struct ihk_os_monitor *monitor;

extern int sprintf(char *buf, const char *fmt, ...);

#define DEBUG_ARCH_RUSAGE
#ifdef DEBUG_ARCH_RUSAGE
#define dprintf(...) \
    do { \
        char msg[1024]; \
        sprintf(msg, __VA_ARGS__); \
        kprintf("%s,%s", __FUNCTION__, msg); \
    } while (0);
#define eprintf(...) \
    do { \
        char msg[1024]; \
        sprintf(msg, __VA_ARGS__); \
        kprintf("%s,%s", __FUNCTION__, msg); \
    } while (0);
#else
#define dprintf(...) do { } while (0)
#define eprintf(...) \
    do { \
        char msg[1024]; \
        sprintf(msg, __VA_ARGS__); \
        kprintf("%s,%s", __FUNCTION__, msg); \
    } while (0);
#endif

static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
    int ret = IHK_OS_PGSIZE_4KB;
#if 0 /* postk-TODO */
    switch (pgsize) {
    case PTL1_SIZE:
        ret = IHK_OS_PGSIZE_4KB;
        break;
    case PTL2_SIZE:
        ret = IHK_OS_PGSIZE_2MB;
        break;
    case PTL3_SIZE:
        ret = IHK_OS_PGSIZE_1GB;
        break;
    default:
        eprintf("unknown pgsize=%ld\n", pgsize);
        break;
    }
#endif
    return ret;
}

#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */

arch-rusage.h

@@ -1,33 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED

#include <arch-memory.h>

#define DEBUG_RUSAGE

#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2

extern struct rusage_global rusage;

static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
    int ret = IHK_OS_PGSIZE_4KB;

    if (pgsize == PTL1_SIZE) {
        ret = IHK_OS_PGSIZE_4KB;
    }
    else if (pgsize == PTL2_SIZE) {
        ret = IHK_OS_PGSIZE_2MB;
    }
    else if (pgsize == PTL3_SIZE) {
        ret = IHK_OS_PGSIZE_1GB;
    }
    else {
        kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
    }
    return ret;
}

#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
@@ -8,6 +8,7 @@
#define SYSCALL_HANDLED(number, name) DECLARATOR(number, name)
#define SYSCALL_DELEGATED(number, name) DECLARATOR(number, name)

#include <config.h>
#include <syscall_list.h>

#undef DECLARATOR

@@ -67,21 +67,12 @@ struct arm64_cpu_capabilities {
    int def_scope;  /* default scope */
    int (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
    int (*enable)(void *);  /* Called on all active CPUs */
    union {
        struct {    /* To be used for erratum handling only */
            uint32_t midr_model;
            uint32_t midr_range_min, midr_range_max;
        };

        struct {    /* Feature register checking */
            uint32_t sys_reg;
            uint8_t field_pos;
            uint8_t min_field_value;
            uint8_t hwcap_type;
            int sign;
            unsigned long hwcap;
        };
    };
    uint32_t sys_reg;
    uint8_t field_pos;
    uint8_t min_field_value;
    uint8_t hwcap_type;
    int sign;
    unsigned long hwcap;
};

/* @ref.impl include/linux/bitops.h */

fpsimd.h

@@ -1,4 +1,4 @@
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2017 */
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2019 */
#ifndef __HEADER_ARM64_COMMON_FPSIMD_H
#define __HEADER_ARM64_COMMON_FPSIMD_H

@@ -42,16 +42,19 @@ extern void thread_sve_to_fpsimd(struct thread *thread, fp_regs_struct *fp_regs)

extern size_t sve_state_size(struct thread const *thread);
extern void sve_free(struct thread *thread);
extern void sve_alloc(struct thread *thread);
extern int sve_alloc(struct thread *thread);
extern void sve_save_state(void *state, unsigned int *pfpsr);
extern void sve_load_state(void const *state, unsigned int const *pfpsr, unsigned long vq_minus_1);
extern unsigned int sve_get_vl(void);
extern int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length, const unsigned long flags);
extern int sve_get_thread_vl(const struct thread *thread);
extern int sve_set_thread_vl(unsigned long arg);
extern int sve_get_thread_vl(void);
extern int sve_set_vector_length(struct thread *thread, unsigned long vl, unsigned long flags);

#define SVE_SET_VL(thread, vector_length, flags) sve_set_thread_vl(thread, vector_length, flags)
#define SVE_GET_VL(thread) sve_get_thread_vl(thread)
#define SVE_SET_VL(arg) sve_set_thread_vl(arg)
#define SVE_GET_VL() sve_get_thread_vl()

/* Maximum VL that SVE VL-agnostic software can transparently support */
#define SVE_VL_ARCH_MAX 0x100

#else /* CONFIG_ARM64_SVE */

@@ -80,12 +83,12 @@ static int sve_set_vector_length(struct thread *thread, unsigned long vl, unsign
}

/* for prctl syscall */
#define SVE_SET_VL(a,b,c) (-EINVAL)
#define SVE_GET_VL(a) (-EINVAL)
#define SVE_SET_VL(a) (-EINVAL)
#define SVE_GET_VL() (-EINVAL)

#endif /* CONFIG_ARM64_SVE */

extern void init_sve_vl(void);
extern void sve_setup(void);
extern void fpsimd_save_state(struct fpsimd_state *state);
extern void fpsimd_load_state(struct fpsimd_state *state);
extern void thread_fpsimd_save(struct thread *thread);

@@ -124,7 +124,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
    return *(volatile long *)&(v)->counter64;
}

static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
{
    v->counter64 = i;
}
@@ -147,6 +147,8 @@ static inline void ihk_atomic64_add(long i, ihk_atomic64_t *v)
/* @ref.impl arch/arm64/include/asm/atomic.h::atomic64_inc */
#define ihk_atomic64_inc(v) ihk_atomic64_add(1LL, (v))

#define ihk_atomic64_cmpxchg(p, o, n) cmpxchg(&((p)->counter64), o, n)

/***********************************************************************
 * others
 */

@@ -29,6 +29,7 @@
#define IMP_PF_INJECTION_DISTANCE5_EL0  sys_reg(3, 3, 11, 7, 5)
#define IMP_PF_INJECTION_DISTANCE6_EL0  sys_reg(3, 3, 11, 7, 6)
#define IMP_PF_INJECTION_DISTANCE7_EL0  sys_reg(3, 3, 11, 7, 7)
#define IMP_PF_PMUSERENR_EL0            sys_reg(3, 3, 9, 14, 0)
#define IMP_BARRIER_CTRL_EL1            sys_reg(3, 0, 11, 12, 0)
#define IMP_BARRIER_BST_BIT_EL1         sys_reg(3, 0, 11, 12, 4)
#define IMP_BARRIER_INIT_SYNC_BB0_EL1   sys_reg(3, 0, 15, 13, 0)

irq.h

@@ -1,4 +1,4 @@
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2019 */

#ifndef __HEADER_ARM64_IRQ_H
#define __HEADER_ARM64_IRQ_H
@@ -14,7 +14,8 @@
#define INTRID_QUERY_FREE_MEM   2
#define INTRID_CPU_STOP         3
#define INTRID_TLB_FLUSH        4
#define INTRID_STACK_TRACE      6
#define INTRID_STACK_TRACE      5
#define INTRID_MULTI_INTR       6
#define INTRID_MULTI_NMI        7

/* use PPI interrupt number */
@@ -29,6 +30,7 @@ extern void gic_dist_init_gicv2(unsigned long dist_base_pa, unsigned long size);
extern void gic_cpu_init_gicv2(unsigned long cpu_base_pa, unsigned long size);
extern void gic_enable_gicv2(void);
extern void arm64_issue_ipi_gicv2(unsigned int cpuid, unsigned int vector);
extern void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector);
extern void handle_interrupt_gicv2(struct pt_regs *regs);

/* Functions for GICv3 */
@@ -36,6 +38,7 @@ extern void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size);
extern void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size);
extern void gic_enable_gicv3(void);
extern void arm64_issue_ipi_gicv3(unsigned int cpuid, unsigned int vector);
extern void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector);
extern void handle_interrupt_gicv3(struct pt_regs *regs);

void handle_IPI(unsigned int vector, struct pt_regs *regs);

prctl.h

@@ -1,4 +1,4 @@
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017 */
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017-2019 */
#ifndef __HEADER_ARM64_COMMON_PRCTL_H
#define __HEADER_ARM64_COMMON_PRCTL_H

@@ -6,15 +6,12 @@
#define PR_GET_THP_DISABLE 42

/* arm64 Scalable Vector Extension controls */
#define PR_SVE_SET_VL           48      /* set task vector length */
#define PR_SVE_SET_VL_THREAD    (1 << 1) /* set just this thread */
#define PR_SVE_SET_VL_INHERIT   (1 << 2) /* inherit across exec */
#define PR_SVE_SET_VL_ONEXEC    (1 << 3) /* defer effect until exec */

#define PR_SVE_GET_VL           49      /* get task vector length */
/* Decode helpers for the return value from PR_SVE_GET_VL: */
#define PR_SVE_GET_VL_LEN(ret)  ((ret) & 0x3fff) /* vector length */
#define PR_SVE_GET_VL_INHERIT   (PR_SVE_SET_VL_INHERIT << 16)
/* For convenience, PR_SVE_SET_VL returns the result in the same encoding */
/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
#define PR_SVE_SET_VL           50      /* set task vector length */
# define PR_SVE_SET_VL_ONEXEC   (1 << 18) /* defer effect until exec */
#define PR_SVE_GET_VL           51      /* get task vector length */
/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
# define PR_SVE_VL_LEN_MASK     0xffff
# define PR_SVE_VL_INHERIT      (1 << 17) /* inherit across exec */

#endif /* !__HEADER_ARM64_COMMON_PRCTL_H */

ptrace.h

@@ -1,4 +1,4 @@
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2017 */
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_PTRACE_H
#define __HEADER_ARM64_COMMON_PTRACE_H

@@ -46,6 +46,7 @@

#ifndef __ASSEMBLY__

#include <lwk/compiler.h>
#include <ihk/types.h>

struct user_hwdebug_state {
@@ -78,6 +79,70 @@ struct user_sve_header {
    uint16_t __reserved;
};

enum aarch64_regset {
    REGSET_GPR,
    REGSET_FPR,
    REGSET_TLS,
    REGSET_HW_BREAK,
    REGSET_HW_WATCH,
    REGSET_SYSTEM_CALL,
#ifdef CONFIG_ARM64_SVE
    REGSET_SVE,
#endif /* CONFIG_ARM64_SVE */
};

struct thread;
struct user_regset;

typedef int user_regset_active_fn(struct thread *target,
                  const struct user_regset *regset);

typedef long user_regset_get_fn(struct thread *target,
                const struct user_regset *regset,
                unsigned int pos, unsigned int count,
                void *kbuf, void __user *ubuf);

typedef long user_regset_set_fn(struct thread *target,
                const struct user_regset *regset,
                unsigned int pos, unsigned int count,
                const void *kbuf, const void __user *ubuf);

typedef int user_regset_writeback_fn(struct thread *target,
                     const struct user_regset *regset,
                     int immediate);

typedef unsigned int user_regset_get_size_fn(struct thread *target,
                         const struct user_regset *regset);

struct user_regset {
    user_regset_get_fn *get;
    user_regset_set_fn *set;
    user_regset_active_fn *active;
    user_regset_writeback_fn *writeback;
    user_regset_get_size_fn *get_size;
    unsigned int n;
    unsigned int size;
    unsigned int align;
    unsigned int bias;
    unsigned int core_note_type;
};

struct user_regset_view {
    const char *name;
    const struct user_regset *regsets;
    unsigned int n;
    uint32_t e_flags;
    uint16_t e_machine;
    uint8_t ei_osabi;
};

extern const struct user_regset_view *current_user_regset_view(void);
extern const struct user_regset *find_regset(
    const struct user_regset_view *view,
    unsigned int type);
extern unsigned int regset_size(struct thread *target,
                const struct user_regset *regset);

/* Definitions for user_sve_header.flags: */
#define SVE_PT_REGS_MASK (1 << 0)

@@ -85,7 +150,7 @@ struct user_sve_header {
#define SVE_PT_REGS_SVE SVE_PT_REGS_MASK

#define SVE_PT_VL_THREAD PR_SVE_SET_VL_THREAD
#define SVE_PT_VL_INHERIT PR_SVE_SET_VL_INHERIT
#define SVE_PT_VL_INHERIT PR_SVE_VL_INHERIT
#define SVE_PT_VL_ONEXEC PR_SVE_SET_VL_ONEXEC

/*
@@ -99,7 +164,9 @@ struct user_sve_header {
 */

/* Offset from the start of struct user_sve_header to the register data */
#define SVE_PT_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
#define SVE_PT_REGS_OFFSET \
    ((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
        / SVE_VQ_BYTES * SVE_VQ_BYTES)
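
Both the old and the new form are the usual round-up-to-a-multiple idiom; since SVE_VQ_BYTES is 16, the value is unchanged and only the magic constant gains a name:

~~~~
/* (x + (N - 1)) / N * N rounds x up to the next multiple of N,
 * e.g. x == 24, N == 16  ->  (24 + 15) / 16 * 16 == 32 */
~~~~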
|
||||
|
||||
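The hunk above replaces the old hard-coded `+ 15) / 16 * 16` rounding with a single round-up-to-SVE_VQ_BYTES idiom. A minimal standalone sketch of that arithmetic (SVE_VQ_BYTES is restated here for illustration; in the tree it comes from signal.h):

```c
#include <assert.h>

#define SVE_VQ_BYTES 16 /* bytes per quadword, as defined in signal.h */

/* Round x up to the next multiple of SVE_VQ_BYTES. */
#define SVE_ROUND_UP(x) \
	(((x) + (SVE_VQ_BYTES - 1)) / SVE_VQ_BYTES * SVE_VQ_BYTES)

int main(void)
{
	assert(SVE_ROUND_UP(40) == 48); /* a 40-byte header pads to 48 */
	assert(SVE_ROUND_UP(48) == 48); /* exact multiples are unchanged */
	return 0;
}
```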
/*
* The register data content and layout depends on the value of the
@@ -174,8 +241,10 @@ struct user_sve_header {
#define SVE_PT_SVE_FFR_OFFSET(vq) \
__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))

#define SVE_PT_SVE_FPSR_OFFSET(vq) \
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + 15) / 16 * 16)
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + \
(SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
#define SVE_PT_SVE_FPCR_OFFSET(vq) \
(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)

@@ -184,9 +253,10 @@ struct user_sve_header {
* 128-bit boundary.
*/

#define SVE_PT_SVE_SIZE(vq, flags) \
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE - \
SVE_PT_SVE_OFFSET + 15) / 16 * 16)
#define SVE_PT_SVE_SIZE(vq, flags) \
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE \
- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)

#define SVE_PT_SIZE(vq, flags) \
(((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \

@@ -85,7 +85,11 @@ enum __rlimit_resource
__RLIMIT_RTPRIO = 14,
#define RLIMIT_RTPRIO __RLIMIT_RTPRIO

__RLIMIT_NLIMITS = 15,
/* timeout for RT tasks in us */
__RLIMIT_RTTIME = 15,
#define RLIMIT_RTTIME __RLIMIT_RTTIME

__RLIMIT_NLIMITS = 16,
__RLIM_NLIMITS = __RLIMIT_NLIMITS
#define RLIMIT_NLIMITS __RLIMIT_NLIMITS
#define RLIM_NLIMITS __RLIM_NLIMITS

@@ -1,4 +1,4 @@
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_SIGNAL_H
#define __HEADER_ARM64_COMMON_SIGNAL_H

@@ -298,6 +298,7 @@ struct extra_context {
struct _aarch64_ctx head;
void *data; /* 16-byte aligned pointer to the extra space */
uint32_t size; /* size in bytes of the extra space */
uint32_t __reserved[3];
};

#define SVE_MAGIC 0x53564501
@@ -318,19 +319,25 @@ struct sve_context {
* The SVE architecture leaves space for future expansion of the
* vector length beyond its initial architectural limit of 2048 bits
* (16 quadwords).
*
* See linux/Documentation/arm64/sve.txt for a description of the VL/VQ
* terminology.
*/
#define SVE_VQ_MIN 1
#define SVE_VQ_MAX 0x200
#define SVE_VQ_BYTES 16 /* number of bytes per quadword */

#define SVE_VL_MIN (SVE_VQ_MIN * 0x10)
#define SVE_VL_MAX (SVE_VQ_MAX * 0x10)
#define SVE_VQ_MIN 1
#define SVE_VQ_MAX 512

#define SVE_VL_MIN (SVE_VQ_MIN * SVE_VQ_BYTES)
#define SVE_VL_MAX (SVE_VQ_MAX * SVE_VQ_BYTES)

#define SVE_NUM_ZREGS 32
#define SVE_NUM_PREGS 16

#define sve_vl_valid(vl) \
((vl) % 0x10 == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
#define sve_vq_from_vl(vl) ((vl) / 0x10)
((vl) % SVE_VQ_BYTES == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)
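With the magic 0x10 constants replaced by SVE_VQ_BYTES, the VL/VQ relationship is explicit: VL is the vector length in bytes, VQ the length in 128-bit quadwords. A quick sanity check of the macros (a self-contained restatement for illustration, not the header itself):

```c
#include <assert.h>

#define SVE_VQ_BYTES 16
#define SVE_VL_MIN (1 * SVE_VQ_BYTES)
#define SVE_VL_MAX (512 * SVE_VQ_BYTES)
#define sve_vl_valid(vl) \
	((vl) % SVE_VQ_BYTES == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)

int main(void)
{
	/* A 512-bit vector is 64 bytes, i.e. 4 quadwords. */
	assert(sve_vl_valid(64));
	assert(sve_vq_from_vl(64) == 4);
	assert(sve_vl_from_vq(4) == 64);
	assert(!sve_vl_valid(68)); /* not a multiple of 16 */
	return 0;
}
```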
/*
* The total size of meaningful data in the SVE context in bytes,
@@ -365,11 +372,13 @@ struct sve_context {
* Additional data might be appended in the future.
*/

#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * 16)
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * 2)
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * SVE_VQ_BYTES)
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * (SVE_VQ_BYTES / 8))
#define SVE_SIG_FFR_SIZE(vq) SVE_SIG_PREG_SIZE(vq)

#define SVE_SIG_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
#define SVE_SIG_REGS_OFFSET \
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)

#define SVE_SIG_ZREGS_OFFSET SVE_SIG_REGS_OFFSET
#define SVE_SIG_ZREG_OFFSET(vq, n) \

@@ -2,7 +2,7 @@

SYSCALL_DELEGATED(4, io_getevents)
SYSCALL_DELEGATED(17, getcwd)
SYSCALL_DELEGATED(22, epoll_pwait)
SYSCALL_HANDLED(22, epoll_pwait)
SYSCALL_DELEGATED(25, fcntl)
SYSCALL_HANDLED(29, ioctl)
SYSCALL_DELEGATED(35, unlinkat)
@@ -17,8 +17,8 @@ SYSCALL_DELEGATED(64, write)
SYSCALL_DELEGATED(66, writev)
SYSCALL_DELEGATED(67, pread64)
SYSCALL_DELEGATED(68, pwrite64)
SYSCALL_DELEGATED(72, pselect6)
SYSCALL_DELEGATED(73, ppoll)
SYSCALL_HANDLED(72, pselect6)
SYSCALL_HANDLED(73, ppoll)
SYSCALL_HANDLED(74, signalfd4)
SYSCALL_DELEGATED(78, readlinkat)
SYSCALL_DELEGATED(80, fstat)
@@ -83,6 +83,7 @@ SYSCALL_HANDLED(175, geteuid)
SYSCALL_HANDLED(176, getgid)
SYSCALL_HANDLED(177, getegid)
SYSCALL_HANDLED(178, gettid)
SYSCALL_HANDLED(179, sysinfo)
SYSCALL_DELEGATED(188, msgrcv)
SYSCALL_DELEGATED(189, msgsnd)
SYSCALL_DELEGATED(192, semtimedop)
@@ -111,20 +112,16 @@ SYSCALL_HANDLED(236, get_mempolicy)
SYSCALL_HANDLED(237, set_mempolicy)
SYSCALL_HANDLED(238, migrate_pages)
SYSCALL_HANDLED(239, move_pages)
#ifdef PERF_ENABLE
#ifdef ENABLE_PERF
SYSCALL_HANDLED(241, perf_event_open)
#else // PERF_ENABLE
SYSCALL_DELEGATED(241, perf_event_open)
#endif // PERF_ENABLE
SYSCALL_HANDLED(260, wait4)
SYSCALL_HANDLED(261, prlimit64)
SYSCALL_HANDLED(270, process_vm_readv)
SYSCALL_HANDLED(271, process_vm_writev)
#ifdef PERF_ENABLE
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
#endif // PERF_ENABLE
SYSCALL_HANDLED(281, execveat)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef PROFILE_ENABLE
SYSCALL_HANDLED(__NR_profile, profile)
@@ -132,6 +129,7 @@ SYSCALL_HANDLED(__NR_profile, profile)
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
SYSCALL_HANDLED(731, util_indicate_clone)
SYSCALL_HANDLED(732, get_system)
SYSCALL_HANDLED(733, util_register_desc)

/* McKernel Specific */
SYSCALL_HANDLED(801, swapout)
@@ -146,3 +144,9 @@ SYSCALL_HANDLED(1045, signalfd)
SYSCALL_DELEGATED(1049, stat)
SYSCALL_DELEGATED(1060, getpgrp)
SYSCALL_HANDLED(1062, time)
SYSCALL_DELEGATED(1069, epoll_wait)

/* Do not edit the lines including this comment and
* EOF just after it because those are used as a
* robust marker for the autotest patch.
*/
@@ -1,4 +1,4 @@
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
#define __HEADER_ARM64_COMMON_THREAD_INFO_H

@@ -46,9 +46,9 @@ struct thread_info {
int cpu; /* cpu */
struct cpu_context cpu_context; /* kernel_context */
void *sve_state; /* SVE registers, if any */
uint16_t sve_vl; /* SVE vector length */
uint16_t sve_vl_onexec; /* SVE vl after next exec */
uint16_t sve_flags; /* SVE related flags */
unsigned int sve_vl; /* SVE vector length */
unsigned int sve_vl_onexec; /* SVE vl after next exec */
unsigned long sve_flags; /* SVE related flags */
unsigned long fault_address; /* fault info */
unsigned long fault_code; /* ESR_EL1 value */
};
@@ -56,7 +56,7 @@ struct thread_info {
/* Flags for sve_flags (intentionally defined to match the prctl flags) */

/* Inherit sve_vl and sve_flags across execve(): */
#define THREAD_VL_INHERIT PR_SVE_SET_VL_INHERIT
#define THREAD_VL_INHERIT PR_SVE_VL_INHERIT

struct arm64_cpu_local_thread {
struct thread_info thread_info;

@@ -4,6 +4,7 @@
#define __ASM_TRAP_H

#include <types.h>
#include <arch-lock.h>

struct pt_regs;

@@ -7,7 +7,7 @@
#include <memory.h>
#include <affinity.h>
#include <syscall.h>
#include <debug.h>
#include <ihk/debug.h>
#include <arch-timer.h>
#include <cls.h>

@@ -31,10 +31,9 @@ void *cpu_base;
* function, it is not necessary to perform the disable/enable
* interrupts in this function as gic_raise_softirq() .
*/
static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
static void __arm64_raise_sgi_gicv2(unsigned int hw_cpuid, unsigned int vector)
{
/* Build interrupt destination of the target cpu */
unsigned int hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
uint8_t cpu_target_list = gic_hwid_to_affinity(hw_cpuid);

/*
@@ -50,6 +49,23 @@ static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
);
}

static void arm64_raise_sgi_gicv2(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target CPU */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];

__arm64_raise_sgi_gicv2(hw_cpuid, vector);
}

static void arm64_raise_sgi_to_host_gicv2(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target Linux/host CPU */
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);

__arm64_raise_sgi_gicv2(hw_cpuid, vector);
}

/**
* arm64_raise_spi_gicv2
* @ref.impl nothing.
@@ -77,6 +93,11 @@ static void arm64_raise_spi_gicv2(unsigned int cpuid, unsigned int vector)
);
}

void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector)
{
arm64_raise_sgi_to_host_gicv2(cpuid, vector);
}

/**
* arm64_issue_ipi_gicv2
* @param cpuid : hardware cpu id

@@ -6,7 +6,7 @@
#include <cputype.h>
#include <process.h>
#include <syscall.h>
#include <debug.h>
#include <ihk/debug.h>
#include <arch-timer.h>
#include <cls.h>

@@ -195,15 +195,12 @@ static inline void gic_write_bpr1(uint32_t val)
}
#endif

static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
static void __arm64_raise_sgi_gicv3(uint32_t hw_cpuid, uint32_t vector)
{
uint64_t mpidr, cluster_id;
uint16_t tlist;
uint64_t val;

/* Build interrupt destination of the target cpu */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];

/*
* Ensure that stores to Normal memory are visible to the
* other CPUs before issuing the IPI.
@@ -239,6 +236,22 @@ static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
}
}

static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target CPU */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];

__arm64_raise_sgi_gicv3(hw_cpuid, vector);
}

static void arm64_raise_sgi_to_host_gicv3(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target Linux/host CPU */
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);

__arm64_raise_sgi_gicv3(hw_cpuid, vector);
}

static void arm64_raise_spi_gicv3(uint32_t cpuid, uint32_t vector)
{
uint64_t spi_reg_offset;
@@ -268,6 +281,11 @@ static void arm64_raise_lpi_gicv3(uint32_t cpuid, uint32_t vector)
ekprintf("%s called.\n", __func__);
}

void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector)
{
arm64_raise_sgi_to_host_gicv3(cpuid, vector);
}

void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
{
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
@@ -292,6 +310,9 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
{
uint64_t irqnr;
const int from_user = interrupt_from_user(regs);
struct cpu_local_var *v = get_this_cpu_local_var();
//unsigned long irqflags;
int do_check = 0;

irqnr = gic_read_iar();
cpu_enable_nmi();
@@ -305,10 +326,18 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
}
set_cputime(from_user ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT);

/* for migration by IPI */
if (get_this_cpu_local_var()->flags & CPU_FLAG_NEED_MIGRATE) {
schedule();
//irqflags = ihk_mc_spinlock_lock(&v->runq_lock);
/* For migration by IPI or by timesharing */
if (v->flags &
(CPU_FLAG_NEED_MIGRATE | CPU_FLAG_NEED_RESCHED)) {
v->flags &= ~CPU_FLAG_NEED_RESCHED;
do_check = 1;
}
//ihk_mc_spinlock_unlock(&v->runq_lock, irqflags);

if (do_check) {
check_signal(0, regs, 0);
schedule();
}
}
@@ -344,9 +373,11 @@ static void init_spi_routing(uint32_t irq, uint32_t linux_cpu)

void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
{
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
extern int spi_table[];
extern int nr_spi_table;
int i;
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ

dist_base = map_fixed_area(dist_base_pa, size, 1 /*non chachable*/);

@@ -357,6 +388,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
}
#endif

#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
/* initialize spi routing */
for (i = 0; i < nr_spi_table; i++) {
if (spi_table[i] == -1) {
@@ -364,6 +396,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
}
init_spi_routing(spi_table[i], i);
}
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
}

void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size)

@@ -1,6 +1,5 @@
/* memory.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <memory.h>
@@ -14,7 +13,7 @@
#include <context.h>
#include <kmalloc.h>
#include <vdso.h>
#include <debug.h>
#include <ihk/debug.h>
#include <rusage_private.h>
#include <cputype.h>

@@ -2672,17 +2671,28 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
}

phys = args->phys + (base - start);
if (__page_offset(base, PTL1_CONT_SIZE) == 0) { //check head pte

/* Check if we can begin / end a series of contiguous PTEs */
if (__page_offset(base, PTL1_CONT_SIZE) == 0) {
uintptr_t next_addr = base + PTL1_CONT_SIZE;

if (end < next_addr) {
next_addr = end;
}

// set contiguous bit until the next head pte
// if phys is aligned and range does not end early.
/* Begin the series if physical address is also aligned and
* the range covers the series. Don't start or end it if
* physical address is not aligned or the range ends early.
*/
if (__page_offset(phys | next_addr, PTL1_CONT_SIZE) == 0) {
args->attr[0] |= PTE_CONT;
if (rusage_memory_stat_add(args->range, phys,
PTL1_CONT_SIZE,
PTL1_CONT_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
PTL1_CONT_SIZE, PTL1_CONT_SIZE);
}
} else {
args->attr[0] &= ~PTE_CONT;
}
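The rewritten comment is worth unpacking: OR-ing phys with next_addr and testing the page offset once checks both that the physical address is aligned to the contiguous-block size and that the mapping reaches the end of the block, since any misaligned bit in either operand survives the OR. A hedged sketch with a hypothetical page_offset helper and an illustrative 64 KiB contiguous size:

```c
#include <assert.h>
#include <stdint.h>

#define CONT_SIZE 0x10000UL /* illustrative, e.g. 16 x 4 KiB pages */

/* Offset of addr within a block of size sz (sz a power of two). */
static uintptr_t page_offset(uintptr_t addr, uintptr_t sz)
{
	return addr & (sz - 1);
}

int main(void)
{
	/* Both operands aligned: one test of the OR suffices. */
	assert(page_offset(0x40000UL | 0x80000UL, CONT_SIZE) == 0);
	/* A misaligned phys (0x40800) is caught by the same test. */
	assert(page_offset(0x40800UL | 0x80000UL, CONT_SIZE) != 0);
	return 0;
}
```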
@@ -2692,12 +2702,13 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,

error = 0;
// call memory_stat_rss_add() here because pgshift is resolved here
if (rusage_memory_stat_add(args->range, phys, PTL1_SIZE, PTL1_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys, PTL1_SIZE, PTL1_SIZE);
} else {
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
__func__, base, phys, PTL1_SIZE, PTL1_SIZE);
if (!(args->attr[0] & PTE_CONT)) {
if (rusage_memory_stat_add(args->range, phys,
PTL1_SIZE, PTL1_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
PTL1_SIZE, PTL1_SIZE);
}
}

out:
@@ -2761,7 +2772,9 @@ retry:

phys = args->phys + (base - start);

//check head pte
/* Check if we can begin / end a series of
* contiguous PTEs
*/
if (__page_offset(base, tbl.cont_pgsize) == 0) {
uintptr_t next_addr = base +
tbl.cont_pgsize;
@@ -2770,11 +2783,24 @@ retry:
next_addr = end;
}

// set contiguous bit until the
// next head pte if phys is aligned
// and range does not end early.
/* Begin the series if physical address
* is also aligned and the range covers
* the series. Don't start or end it if
* physical address is not aligned or
* the range ends early.
*/
if (__page_offset(phys | next_addr, tbl.cont_pgsize) == 0) {
args->attr[level-1] |= PTE_CONT;
if (rusage_memory_stat_add(args->range,
phys,
tbl.cont_pgsize,
tbl.cont_pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__,
base, phys,
tbl.cont_pgsize,
tbl.cont_pgsize);
}
} else {
args->attr[level-1] &= ~PTE_CONT;
}
@@ -2782,21 +2808,23 @@ retry:

ptl_set(ptep, phys | args->attr[level-1],
level);

error = 0;
dkprintf("set_range_middle(%lx,%lx,%lx,%d):"
"large page. %d %lx\n",
base, start, end, level, error, *ptep);
// Call memory_stat_rss_add() here because pgshift is resolved here
if (rusage_memory_stat_add(args->range, phys,
tbl.pgsize,
tbl.pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
tbl.pgsize, tbl.pgsize);
} else {
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
__func__, base, phys,
tbl.pgsize, tbl.pgsize);
if (!(args->attr[level-1] & PTE_CONT)) {
if (rusage_memory_stat_add(args->range,
phys,
tbl.pgsize,
tbl.pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base,
phys,
tbl.pgsize,
tbl.pgsize);
}
}
goto out;
}
@@ -2848,7 +2876,7 @@ retry:
error = 0;
out:
if (tt_pa) {
ihk_mc_free_pages(tt_pa, 1);
ihk_mc_free_pages(phys_to_virt((unsigned long)tt_pa), 1);
}
dkprintf("set_range_middle(%lx,%lx,%lx,%d): %d %lx\n",
base, start, end, level, error, *ptep);
@@ -3200,6 +3228,7 @@ void load_page_table(struct page_table *pt)
{
if (pt == NULL) {
// load page table for idle(EL1) process.
switch_mm(init_pt);
return;
}
// load page table for user(EL0) thread.
@@ -3259,7 +3288,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
attr |= PTATTR_UNCACHABLE;
}

kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
paligned, v, npages);

pt = get_init_page_table();
@@ -3335,15 +3364,15 @@ unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;

if (MAP_KERNEL_START <= va) {
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
if (va >= MAP_ST_START) {
return va - MAP_ST_START + arm64_st_phys_base;
}
return va - MAP_ST_START;
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
}

void *phys_to_virt(unsigned long p)
{
return (void *)(p | MAP_ST_START);
return (void *)((p - arm64_st_phys_base) | MAP_ST_START);
}
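The change makes the straight-map conversion symmetric around arm64_st_phys_base instead of assuming physical memory starts at 0. A toy round trip under assumed values (MAP_ST_START and the base below are illustrative, not the real constants, and the real virt_to_phys also distinguishes the kernel image mapping):

```c
#include <assert.h>

#define MAP_ST_START 0xffff800000000000UL /* illustrative value */

static unsigned long arm64_st_phys_base = 0x80000000UL; /* assumed base */

static unsigned long toy_virt_to_phys(unsigned long va)
{
	return va - MAP_ST_START + arm64_st_phys_base;
}

static void *toy_phys_to_virt(unsigned long p)
{
	return (void *)((p - arm64_st_phys_base) | MAP_ST_START);
}

int main(void)
{
	unsigned long pa = 0x80042000UL;

	/* phys -> virt -> phys round-trips once the base is subtracted. */
	assert(toy_virt_to_phys((unsigned long)toy_phys_to_virt(pa)) == pa);
	return 0;
}
```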
int copy_from_user(void *dst, const void *src, size_t siz)
@@ -3716,44 +3745,6 @@ translation_table_t* get_translation_table_as_paddr(const struct page_table *pt)
return pt->tt_pa;
}

#ifdef POSTK_DEBUG_ARCH_DEP_8
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id)
{
unsigned long cpu;
cpu_set_t _cpu_set;
int flush_ind;

if (addr) {
flush_ind = (addr >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}
/* Zero address denotes full TLB flush */
else {
/* Random.. */
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}

/* Take a copy of the cpu set so that we don't hold the lock
* all the way while interrupting other cores */
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);

/* Loop through CPUs in this address space and interrupt them for
* TLB flush on the specified address */
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
if (ihk_mc_get_processor_id() == cpu)
continue;

dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
flush_ind, addr, cpu);

ihk_mc_interrupt_cpu(cpu,
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
}
}
#endif /* POSTK_DEBUG_ARCH_DEP_8 */

void arch_adjust_allocate_page_size(struct page_table *pt,
uintptr_t fault_addr,
pte_t *ptep,

@@ -19,7 +19,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,

memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));

mikc_queue_pages = ((4 * num_processors * MASTER_IKCQ_PKTSIZE)
mikc_queue_pages = ((8 * num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;

/* Place both sides in this side */

@@ -8,6 +8,7 @@
#include <string.h>
#include <ihk/mm.h>
#include <irq.h>
#include <process.h>

/*
* @ref.impl arch/arm64/kernel/perf_event.c
@@ -85,25 +86,17 @@ void arm64_disable_user_access_pmu_regs(void)
cpu_pmu.disable_user_access_pmu_regs();
}

extern unsigned int *arm64_march_perfmap;

static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, int mode)
{
int ret = -1;
unsigned long config_base = 0;
int mapping;

mapping = cpu_pmu.map_event(type, config);
if (mapping < 0) {
return mapping;
}

ret = cpu_pmu.disable_counter(counter);
ret = cpu_pmu.disable_counter(1UL << counter);
if (ret < 0) {
return ret;
}

ret = cpu_pmu.enable_intens(counter);
ret = cpu_pmu.enable_intens(1UL << counter);
if (ret < 0) {
return ret;
}
@@ -112,7 +105,7 @@ static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, in
if (ret) {
return ret;
}
config_base |= (unsigned long)mapping;
config_base |= config;
cpu_pmu.write_evtype(counter, config_base);
return ret;
}
@@ -124,68 +117,24 @@ int ihk_mc_perfctr_init_raw(int counter, uint64_t config, int mode)
return ret;
}

int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
{
int ret;
ret = __ihk_mc_perfctr_init(counter, PERF_TYPE_RAW, config, mode);
return ret;
}

int ihk_mc_perfctr_start(unsigned long counter_mask)
{
int ret = 0, i;

for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
if (counter_mask & (1UL << i)) {
ret = cpu_pmu.enable_counter(i);
if (ret < 0) {
kprintf("%s: enable failed(idx=%d)\n",
__func__, i);
break;
}
}
}
return ret;
return cpu_pmu.enable_counter(counter_mask);
}

int ihk_mc_perfctr_stop(unsigned long counter_mask, int flags)
{
int i = 0;

for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
if (!(counter_mask & (1UL << i)))
continue;

int ret = 0;

ret = cpu_pmu.disable_counter(i);
if (ret < 0) {
continue;
}

if (flags & IHK_MC_PERFCTR_DISABLE_INTERRUPT) {
// when ihk_mc_perfctr_start is called,
// ihk_mc_perfctr_init is also called so disable
// interrupt
ret = cpu_pmu.disable_intens(i);
if (ret < 0) {
continue;
}
}
}
return 0;
return cpu_pmu.disable_counter(counter_mask);
}
int ihk_mc_perfctr_reset(int counter)
{
// TODO[PMU]: As with ihk_mc_perfctr_set, implement this properly once the common code's handling of the sampling rate is settled.
cpu_pmu.write_counter(counter, 0);
return 0;
}

int ihk_mc_perfctr_set(int counter, long val)
{
// TODO[PMU]: The common code is expected to compute the sampling rate and pass the counter value to program in val. Implement this properly once the handling of the sampling rate is settled.
uint32_t v = val;
cpu_pmu.write_counter(counter, v);
return 0;
@@ -198,6 +147,15 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
return 0;
}

int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
{
const int counters = ihk_mc_perf_get_num_counters();

return cpu_pmu.get_event_idx(counters,
thread->pmc_alloc_map,
event->hw_config);
}

unsigned long ihk_mc_perfctr_read(int counter)
{
unsigned long count;
@@ -205,6 +163,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
return count;
}

unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
{
unsigned long count = ihk_mc_perfctr_read(counter) + correction;

count &= ((1UL << 32) - 1);
return count;
}

int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
unsigned long pmc_status)
{
@@ -234,12 +200,14 @@ int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,

int ihk_mc_perf_counter_mask_check(unsigned long counter_mask)
{
return 1;
return cpu_pmu.counter_mask_valid(counter_mask);
}

int ihk_mc_perf_get_num_counters(void)
{
return cpu_pmu.per_cpu[ihk_mc_get_processor_id()].num_events;
const struct per_cpu_arm_pmu *per_cpu_arm_pmu = get_per_cpu_pmu();

return per_cpu_arm_pmu->num_events;
}

int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
@@ -247,3 +215,83 @@ int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
/* Nothing to do. */
return 0;
}

static inline uint64_t arm_pmu_event_max_period(struct mc_perf_event *event)
{
return 0xFFFFFFFF;
}

int hw_perf_event_init(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;

if (!is_sampling_event(event)) {
hwc->sample_period = arm_pmu_event_max_period(event) >> 1;
hwc->last_period = hwc->sample_period;
ihk_atomic64_set(&hwc->period_left, hwc->sample_period);
}
return 0;
}

int ihk_mc_event_set_period(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int64_t left = ihk_atomic64_read(&hwc->period_left);
int64_t period = hwc->sample_period;
uint64_t max_period;
int ret = 0;

max_period = arm_pmu_event_max_period(event);
if (unlikely(left <= -period)) {
left = period;
ihk_atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}

if (unlikely(left <= 0)) {
left += period;
ihk_atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}

/*
* Limit the maximum period to prevent the counter value
* from overtaking the one we are about to program. In
* effect we are reducing max_period to account for
* interrupt latency (and we are being very conservative).
*/
if (left > (max_period >> 1))
left = (max_period >> 1);

ihk_atomic64_set(&hwc->prev_count, (uint64_t)-left);

cpu_pmu.write_counter(event->counter_id,
(uint64_t)(-left) & max_period);

return ret;
}

uint64_t ihk_mc_event_update(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int64_t delta;
uint64_t prev_raw_count, new_raw_count;
uint64_t max_period = arm_pmu_event_max_period(event);

again:
prev_raw_count = ihk_atomic64_read(&hwc->prev_count);
new_raw_count = cpu_pmu.read_counter(event->counter_id);

if (ihk_atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;

delta = (new_raw_count - prev_raw_count) & max_period;

ihk_atomic64_add(delta, &event->count);
ihk_atomic64_add(-delta, &hwc->period_left);

return new_raw_count;
}
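ihk_mc_event_update() relies on masking with max_period (0xFFFFFFFF here) so the delta stays correct even when the 32-bit hardware counter wraps between reads. A standalone illustration of that modular arithmetic, under the same 32-bit assumption:

```c
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t max_period = 0xFFFFFFFFULL;
	uint64_t prev = 0xFFFFFFF0ULL; /* counter just before wrapping */
	uint64_t new = 0x00000010ULL;  /* counter after wrapping past 0 */

	/* Masking recovers the 0x20 events counted across the wrap. */
	int64_t delta = (int64_t)((new - prev) & max_period);

	assert(delta == 0x20);
	return 0;
}
```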
@@ -4,7 +4,6 @@
#include <ihk/perfctr.h>
#include <errno.h>
#include <ihk/debug.h>
#include <debug.h>
#include <sysreg.h>
#include <virt.h>
#include <bitops.h>
@@ -21,29 +20,174 @@
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif

/*
* read pmevcntr<n>_el0 functions
*/
#define read_pmevcntrN_el0(N) \
static uint32_t read_pmevcntr##N##_el0(void) \
{ \
return read_sysreg(pmevcntr##N##_el0); \
}

read_pmevcntrN_el0(0)
read_pmevcntrN_el0(1)
read_pmevcntrN_el0(2)
read_pmevcntrN_el0(3)
read_pmevcntrN_el0(4)
read_pmevcntrN_el0(5)
read_pmevcntrN_el0(6)
read_pmevcntrN_el0(7)
read_pmevcntrN_el0(8)
read_pmevcntrN_el0(9)
read_pmevcntrN_el0(10)
read_pmevcntrN_el0(11)
read_pmevcntrN_el0(12)
read_pmevcntrN_el0(13)
read_pmevcntrN_el0(14)
read_pmevcntrN_el0(15)
read_pmevcntrN_el0(16)
read_pmevcntrN_el0(17)
read_pmevcntrN_el0(18)
read_pmevcntrN_el0(19)
read_pmevcntrN_el0(20)
read_pmevcntrN_el0(21)
read_pmevcntrN_el0(22)
read_pmevcntrN_el0(23)
read_pmevcntrN_el0(24)
read_pmevcntrN_el0(25)
read_pmevcntrN_el0(26)
read_pmevcntrN_el0(27)
read_pmevcntrN_el0(28)
read_pmevcntrN_el0(29)
read_pmevcntrN_el0(30)

static uint32_t (* const read_pmevcntr_el0[])(void) = {
read_pmevcntr0_el0, read_pmevcntr1_el0, read_pmevcntr2_el0,
read_pmevcntr3_el0, read_pmevcntr4_el0, read_pmevcntr5_el0,
read_pmevcntr6_el0, read_pmevcntr7_el0, read_pmevcntr8_el0,
read_pmevcntr9_el0, read_pmevcntr10_el0, read_pmevcntr11_el0,
read_pmevcntr12_el0, read_pmevcntr13_el0, read_pmevcntr14_el0,
read_pmevcntr15_el0, read_pmevcntr16_el0, read_pmevcntr17_el0,
read_pmevcntr18_el0, read_pmevcntr19_el0, read_pmevcntr20_el0,
read_pmevcntr21_el0, read_pmevcntr22_el0, read_pmevcntr23_el0,
read_pmevcntr24_el0, read_pmevcntr25_el0, read_pmevcntr26_el0,
read_pmevcntr27_el0, read_pmevcntr28_el0, read_pmevcntr29_el0,
read_pmevcntr30_el0,
};

/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* Perf Events' indices
* write pmevcntr<n>_el0 functions
*/
#define ARMV8_IDX_CYCLE_COUNTER 0
#define ARMV8_IDX_COUNTER0 1
#define ARMV8_IDX_COUNTER_LAST (ARMV8_IDX_CYCLE_COUNTER + get_per_cpu_pmu()->num_events - 1)
#define write_pmevcntrN_el0(N) \
static void write_pmevcntr##N##_el0(uint32_t v) \
{ \
write_sysreg(v, pmevcntr##N##_el0); \
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h */
#define ARMV8_PMU_MAX_COUNTERS 32
#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1)
write_pmevcntrN_el0(0)
write_pmevcntrN_el0(1)
write_pmevcntrN_el0(2)
write_pmevcntrN_el0(3)
write_pmevcntrN_el0(4)
write_pmevcntrN_el0(5)
write_pmevcntrN_el0(6)
write_pmevcntrN_el0(7)
write_pmevcntrN_el0(8)
write_pmevcntrN_el0(9)
write_pmevcntrN_el0(10)
write_pmevcntrN_el0(11)
write_pmevcntrN_el0(12)
write_pmevcntrN_el0(13)
write_pmevcntrN_el0(14)
write_pmevcntrN_el0(15)
write_pmevcntrN_el0(16)
write_pmevcntrN_el0(17)
write_pmevcntrN_el0(18)
write_pmevcntrN_el0(19)
write_pmevcntrN_el0(20)
write_pmevcntrN_el0(21)
write_pmevcntrN_el0(22)
write_pmevcntrN_el0(23)
write_pmevcntrN_el0(24)
write_pmevcntrN_el0(25)
write_pmevcntrN_el0(26)
write_pmevcntrN_el0(27)
write_pmevcntrN_el0(28)
write_pmevcntrN_el0(29)
write_pmevcntrN_el0(30)

static void (* const write_pmevcntr_el0[])(uint32_t) = {
write_pmevcntr0_el0, write_pmevcntr1_el0, write_pmevcntr2_el0,
write_pmevcntr3_el0, write_pmevcntr4_el0, write_pmevcntr5_el0,
write_pmevcntr6_el0, write_pmevcntr7_el0, write_pmevcntr8_el0,
write_pmevcntr9_el0, write_pmevcntr10_el0, write_pmevcntr11_el0,
write_pmevcntr12_el0, write_pmevcntr13_el0, write_pmevcntr14_el0,
write_pmevcntr15_el0, write_pmevcntr16_el0, write_pmevcntr17_el0,
write_pmevcntr18_el0, write_pmevcntr19_el0, write_pmevcntr20_el0,
write_pmevcntr21_el0, write_pmevcntr22_el0, write_pmevcntr23_el0,
write_pmevcntr24_el0, write_pmevcntr25_el0, write_pmevcntr26_el0,
write_pmevcntr27_el0, write_pmevcntr28_el0, write_pmevcntr29_el0,
write_pmevcntr30_el0,
};
/*
* ARMv8 low level PMU access
* write pmevtyper<n>_el0 functions
*/
#define write_pmevtyperN_el0(N) \
static void write_pmevtyper##N##_el0(uint32_t v) \
{ \
write_sysreg(v, pmevtyper##N##_el0); \
}

/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* Perf Event to low level counters mapping
*/
#define ARMV8_IDX_TO_COUNTER(x) \
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
write_pmevtyperN_el0(0)
write_pmevtyperN_el0(1)
write_pmevtyperN_el0(2)
write_pmevtyperN_el0(3)
write_pmevtyperN_el0(4)
write_pmevtyperN_el0(5)
write_pmevtyperN_el0(6)
write_pmevtyperN_el0(7)
write_pmevtyperN_el0(8)
write_pmevtyperN_el0(9)
write_pmevtyperN_el0(10)
write_pmevtyperN_el0(11)
write_pmevtyperN_el0(12)
write_pmevtyperN_el0(13)
write_pmevtyperN_el0(14)
write_pmevtyperN_el0(15)
write_pmevtyperN_el0(16)
write_pmevtyperN_el0(17)
write_pmevtyperN_el0(18)
write_pmevtyperN_el0(19)
write_pmevtyperN_el0(20)
write_pmevtyperN_el0(21)
write_pmevtyperN_el0(22)
write_pmevtyperN_el0(23)
write_pmevtyperN_el0(24)
write_pmevtyperN_el0(25)
write_pmevtyperN_el0(26)
write_pmevtyperN_el0(27)
write_pmevtyperN_el0(28)
write_pmevtyperN_el0(29)
write_pmevtyperN_el0(30)

static void (* const write_pmevtyper_el0[])(uint32_t) = {
write_pmevtyper0_el0, write_pmevtyper1_el0, write_pmevtyper2_el0,
write_pmevtyper3_el0, write_pmevtyper4_el0, write_pmevtyper5_el0,
write_pmevtyper6_el0, write_pmevtyper7_el0, write_pmevtyper8_el0,
write_pmevtyper9_el0, write_pmevtyper10_el0, write_pmevtyper11_el0,
write_pmevtyper12_el0, write_pmevtyper13_el0, write_pmevtyper14_el0,
write_pmevtyper15_el0, write_pmevtyper16_el0, write_pmevtyper17_el0,
write_pmevtyper18_el0, write_pmevtyper19_el0, write_pmevtyper20_el0,
write_pmevtyper21_el0, write_pmevtyper22_el0, write_pmevtyper23_el0,
write_pmevtyper24_el0, write_pmevtyper25_el0, write_pmevtyper26_el0,
write_pmevtyper27_el0, write_pmevtyper28_el0, write_pmevtyper29_el0,
write_pmevtyper30_el0,
};

#define ARMV8_IDX_CYCLE_COUNTER 31
#define ARMV8_IDX_COUNTER0 0

/*
* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h
@@ -175,6 +319,10 @@

/* PMUv3 HW events mapping. */

/* disable -Woverride-init for the following initializations */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverride-init"

/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* ARMv8 Architectural defined events, not all of these may
@@ -220,6 +368,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
};

/* restore warnings */
#pragma GCC diagnostic pop

/* @ref.impl linux-v4.15-rc3 drivers/perf/arm_pmu.c */
static int
armpmu_map_cache_event(const unsigned (*cache_map)
@@ -298,11 +449,25 @@ armpmu_map_event(uint32_t type, uint64_t config,
return -ENOENT;
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_counter_mask_valid(unsigned long counter_mask)
{
int num;
unsigned long event;
unsigned long cycle;
unsigned long invalid_mask;

num = get_per_cpu_pmu()->num_events;
num--; /* Sub the CPU cycles counter */
event = ((1UL << num) - 1) << ARMV8_IDX_COUNTER0;
cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
invalid_mask = ~(event | cycle);

return !(counter_mask & invalid_mask);
}
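The mask check treats the PMU as num_events counters: event counters starting at ARMV8_IDX_COUNTER0 (0) plus the cycle counter at bit 31. A sketch reproducing the arithmetic with num_events fixed at 7 for illustration (6 event counters plus the cycle counter):

```c
#include <assert.h>

#define ARMV8_IDX_COUNTER0 0
#define ARMV8_IDX_CYCLE_COUNTER 31

/* Same arithmetic as armv8pmu_counter_mask_valid(), num_events = 7. */
static int mask_valid(unsigned long counter_mask)
{
	int num = 7 - 1; /* subtract the cycle counter */
	unsigned long event = ((1UL << num) - 1) << ARMV8_IDX_COUNTER0;
	unsigned long cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;

	return !(counter_mask & ~(event | cycle));
}

int main(void)
{
	assert(mask_valid(0x3fUL));    /* all 6 event counters */
	assert(mask_valid(1UL << 31)); /* cycle counter alone */
	assert(!mask_valid(1UL << 6)); /* 7th event counter not present */
	return 0;
}
```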
static inline int armv8pmu_counter_valid(int idx)
{
return idx >= ARMV8_IDX_CYCLE_COUNTER &&
idx <= ARMV8_IDX_COUNTER_LAST;
return armv8pmu_counter_mask_valid(1UL << idx);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@@ -326,6 +491,11 @@ static inline int armv8pmu_has_overflowed(uint32_t pmovsr)
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
}

static inline int armv8pmu_counter_has_overflowed(uint32_t pmnc, int idx)
{
return pmnc & BIT(idx);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static int __armv8_pmuv3_map_event(uint32_t type, uint64_t config,
const unsigned int (*extra_event_map)
@@ -357,6 +527,23 @@ static int armv8_pmuv3_map_event(uint32_t type, uint64_t config)
return __armv8_pmuv3_map_event(type, config, NULL, NULL);
}

static int armv8_pmuv3_map_hw_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_HARDWARE, config, NULL, NULL);
}

static int armv8_pmuv3_map_cache_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_HW_CACHE, config, NULL, NULL);
}

static int armv8_pmuv3_map_raw_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_RAW, config, NULL, NULL);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline uint32_t armv8pmu_pmcr_read(void)
{
@@ -371,24 +558,6 @@ static inline void armv8pmu_pmcr_write(uint32_t val)
write_sysreg(val, pmcr_el0);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_select_counter(int idx)
{
uint32_t counter;

if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
return -EINVAL;
}

counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(counter, pmselr_el0);
isb();

return idx;
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline uint32_t armv8pmu_read_counter(int idx)
{
@@ -401,8 +570,8 @@ static inline uint32_t armv8pmu_read_counter(int idx)
else if (idx == ARMV8_IDX_CYCLE_COUNTER) {
value = read_sysreg(pmccntr_el0);
}
else if (armv8pmu_select_counter(idx) == idx) {
value = read_sysreg(pmxevcntr_el0);
else {
value = read_pmevcntr_el0[idx]();
}

return value;
@@ -421,43 +590,42 @@ static inline void armv8pmu_write_counter(int idx, uint32_t value)
* count using the lower 32bits and we want an interrupt when
* it overflows.
*/
uint64_t value64 = 0xffffffff00000000ULL | value;
uint64_t value64 = (int32_t)value;

write_sysreg(value64, pmccntr_el0);
}
else if (armv8pmu_select_counter(idx) == idx) {
write_sysreg(value, pmxevcntr_el0);
else {
write_pmevcntr_el0[idx](value);
}
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_enable_intens(int idx)
static inline int armv8pmu_enable_intens(unsigned long counter_mask)
{
uint32_t counter;

if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask(%#lx)\n",
__func__, counter_mask);
return -EINVAL;
}

counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmintenset_el1);
return idx;
write_sysreg(counter_mask, pmintenset_el1);
return 0;
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_disable_intens(int idx)
static inline int armv8pmu_disable_intens(unsigned long counter_mask)
{
uint32_t counter = ARMV8_IDX_TO_COUNTER(idx);

write_sysreg(BIT(counter), pmintenclr_el1);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask(%#lx)\n",
__func__, counter_mask);
return -EINVAL;
}
write_sysreg(counter_mask, pmintenclr_el1);
isb();
/* Clear the overflow flag in case an interrupt is pending. */
write_sysreg(BIT(counter), pmovsclr_el0);
write_sysreg(counter_mask, pmovsclr_el0);
isb();

return idx;
return 0;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@@ -492,42 +660,37 @@ static int armv8pmu_set_event_filter(unsigned long *config_base, int mode)
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline void armv8pmu_write_evtype(int idx, uint32_t val)
{
if (armv8pmu_select_counter(idx) == idx) {
val &= ARMV8_PMU_EVTYPE_MASK;
write_sysreg(val, pmxevtyper_el0);
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
return;
} else if (idx != ARMV8_IDX_CYCLE_COUNTER) {
write_pmevtyper_el0[idx](val);
}
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_enable_counter(int idx)
static inline int armv8pmu_enable_counter(unsigned long counter_mask)
{
uint32_t counter;

if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask 0x%lx.\n",
__func__, counter_mask);
return -EINVAL;
}

counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmcntenset_el0);
return idx;
write_sysreg(counter_mask, pmcntenset_el0);
return 0;
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_disable_counter(int idx)
static inline int armv8pmu_disable_counter(unsigned long counter_mask)
{
uint32_t counter;

if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask 0x%lx.\n",
__func__, counter_mask);
return -EINVAL;
}

counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmcntenclr_el0);
return idx;
write_sysreg(counter_mask, pmcntenclr_el0);
return 0;
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@@ -555,41 +718,20 @@ static void armv8pmu_stop(void)
ihk_mc_spinlock_unlock(&pmu_lock, flags);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static void armv8pmu_disable_event(int idx)
{
unsigned long flags;

/*
* Disable counter and interrupt
*/
flags = ihk_mc_spinlock_lock(&pmu_lock);

/*
* Disable counter
*/
armv8pmu_disable_counter(idx);

/*
* Disable interrupt for this counter
*/
armv8pmu_disable_intens(idx);

ihk_mc_spinlock_unlock(&pmu_lock, flags);
}

/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static void armv8pmu_reset(void *info)
{
struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
uint32_t idx, nb_cnt =
uint32_t nb_cnt =
cpu_pmu->per_cpu[ihk_mc_get_processor_id()].num_events;
nb_cnt--; /* Sub the CPU cycles counter */
unsigned long event = ((1UL << nb_cnt) - 1) << ARMV8_IDX_COUNTER0;
unsigned long cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
unsigned long valid_mask = event | cycle;

/* The counter and interrupt enable registers are unknown at reset. */
for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
armv8pmu_disable_counter(idx);
armv8pmu_disable_intens(idx);
}
armv8pmu_disable_counter(valid_mask);
armv8pmu_disable_intens(valid_mask);

/*
* Initialize & Reset PMNC. Request overflow interrupt for
@@ -603,7 +745,7 @@ static void armv8pmu_reset(void *info)
static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
unsigned long config)
{
int idx;
int idx, end;
unsigned long evtype = config & ARMV8_PMU_EVTYPE_EVENT;

/* Always prefer to place a cycle counter into the cycle counter. */
@@ -615,7 +757,9 @@ static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
/*
* Otherwise use events counters
*/
for (idx = ARMV8_IDX_COUNTER0; idx < num_events; ++idx) {
end = ARMV8_IDX_COUNTER0 + num_events;
end--; /* Sub the CPU cycles counter */
for (idx = ARMV8_IDX_COUNTER0; idx < end; ++idx) {
if (!(used_mask & (1UL << idx)))
return idx;
}
@@ -642,13 +786,11 @@ static uint32_t armv8pmu_read_num_pmnc_events(void)

static void armv8pmu_handle_irq(void *priv)
{
struct siginfo info;
uint32_t pmovsr;
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long irqstate;
struct mckfd *fdp;
struct pt_regs *regs = (struct pt_regs *)priv;
const struct per_cpu_arm_pmu *cpu_pmu = get_per_cpu_pmu();
int idx;

/*
* Get and reset the IRQ flags
@@ -661,27 +803,40 @@ static void armv8pmu_handle_irq(void *priv)
if (!armv8pmu_has_overflowed(pmovsr))
return;

if (!proc->monitoring_event) {
return;
}
/*
* Handle the counter(s) overflow(s)
*/
/* same as x86_64 mckernel */
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for (fdp = proc->mckfd; fdp; fdp = fdp->next) {
if (fdp->sig_no > 0)
break;
}
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
for (idx = 0; idx < cpu_pmu->num_events; idx++) {
struct mc_perf_event *event = NULL;
struct mc_perf_event *sub;

if (fdp) {
memset(&info, '\0', sizeof(info));
info.si_signo = fdp->sig_no;
info._sifields._sigfault.si_addr = (void *)regs->pc;
info._sifields._sigpoll.si_fd = fdp->fd;
set_signal(fdp->sig_no, regs, &info);
}
else {
set_signal(SIGIO, regs, NULL);
if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) {
continue;
}

if (proc->monitoring_event->counter_id == idx) {
event = proc->monitoring_event;
} else {
list_for_each_entry(sub,
&proc->monitoring_event->sibling_list,
group_entry) {
if (sub->counter_id == idx) {
event = sub;
break;
}
}
}

if (!event) {
continue;
}
ihk_mc_event_update(event);
ihk_mc_event_set_period(event);
}
return;
}

static void armv8pmu_enable_user_access_pmu_regs(void)
@@ -735,11 +890,15 @@ int armv8pmu_init(struct arm_pmu* cpu_pmu)
cpu_pmu->write_evtype = armv8pmu_write_evtype;
cpu_pmu->get_event_idx = armv8pmu_get_event_idx;
cpu_pmu->map_event = armv8_pmuv3_map_event;
cpu_pmu->map_hw_event = armv8_pmuv3_map_hw_event;
cpu_pmu->map_cache_event = armv8_pmuv3_map_cache_event;
cpu_pmu->map_raw_event = armv8_pmuv3_map_raw_event;
cpu_pmu->enable_user_access_pmu_regs =
armv8pmu_enable_user_access_pmu_regs;
cpu_pmu->disable_user_access_pmu_regs =
armv8pmu_disable_user_access_pmu_regs;
cpu_pmu->handler = &armv8pmu_handler;
cpu_pmu->counter_mask_valid = &armv8pmu_counter_mask_valid;
return 0;
}
@@ -18,10 +18,9 @@
#include <psci.h>
#include <errno.h>
#include <ihk/types.h>
#include <ihk/debug.h>
#include <compiler.h>
#include <lwk/compiler.h>
#include <debug.h>
#include <ihk/debug.h>

//#define DEBUG_PRINT_PSCI

@@ -1,4 +1,4 @@
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <errno.h>
#include <debug-monitors.h>
#include <hw_breakpoint.h>
@@ -11,7 +11,8 @@
#include <hwcap.h>
#include <string.h>
#include <thread_info.h>
#include <debug.h>
#include <ptrace.h>
#include <ihk/debug.h>

//#define DEBUG_PRINT_SC

@@ -25,37 +26,6 @@
extern void save_debugreg(unsigned long *debugreg);
extern int interrupt_from_user(void *);

enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
REGSET_TLS,
REGSET_HW_BREAK,
REGSET_HW_WATCH,
REGSET_SYSTEM_CALL,
#ifdef CONFIG_ARM64_SVE
REGSET_SVE,
#endif /* CONFIG_ARM64_SVE */
};

struct user_regset;
typedef long user_regset_get_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf);

typedef long user_regset_set_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf);

struct user_regset {
user_regset_get_fn *get;
user_regset_set_fn *set;
unsigned int n;
unsigned int size;
unsigned int core_note_type;
};

long ptrace_read_user(struct thread *thread, long addr, unsigned long *value)
{
return -EIO;
@@ -273,6 +243,17 @@ static inline long copy_regset_from_user(struct thread *target,
return regset->set(target, regset, offset, size, NULL, data);
}

unsigned int regset_size(struct thread *target,
const struct user_regset *regset)
{
if (!regset->get_size) {
return regset->n * regset->size;
}
else {
return regset->get_size(target, regset);
}
}
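regset_size() lets callers size a buffer without caring whether a regset is fixed-size (n * size) or dynamically sized through get_size, as the variable-length SVE regset is. A hypothetical caller, using only the interfaces declared in ptrace.h above:

```c
/* Hypothetical helper for illustration; struct thread and the regset
 * API are the ones declared in ptrace.h above. */
static unsigned int regset_buf_size(struct thread *thread, unsigned int type)
{
	const struct user_regset_view *view = current_user_regset_view();
	const struct user_regset *regset = find_regset(view, type);

	if (!regset)
		return 0;

	/* Fixed-size regsets report n * size; the SVE regset sizes
	 * itself per-thread through ->get_size (sve_get_size()). */
	return regset_size(thread, regset);
}
```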
/*
|
||||
* Bits which are always architecturally RES0 per ARM DDI 0487A.h
|
||||
* Userspace cannot use these until they have an architectural meaning.
|
||||
@ -624,6 +605,48 @@ out:
|
||||
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
|
||||
static void sve_init_header_from_thread(struct user_sve_header *header,
|
||||
struct thread *target)
|
||||
{
|
||||
unsigned int vq;
|
||||
|
||||
memset(header, 0, sizeof(*header));
|
||||
|
||||
/* McKernel processes always enable SVE. */
|
||||
header->flags = SVE_PT_REGS_SVE;
|
||||
|
||||
if (target->ctx.thread->sve_flags & SVE_PT_VL_INHERIT) {
|
||||
header->flags |= SVE_PT_VL_INHERIT;
|
||||
}
|
||||
|
||||
header->vl = target->ctx.thread->sve_vl;
|
||||
vq = sve_vq_from_vl(header->vl);
|
||||
|
||||
header->max_vl = sve_max_vl;
|
||||
header->size = SVE_PT_SIZE(vq, header->flags);
|
||||
header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
|
||||
SVE_PT_REGS_SVE);
|
||||
}
|
||||
|
||||
static unsigned int sve_size_from_header(struct user_sve_header const *header)
|
||||
{
|
||||
return ALIGN(header->size, SVE_VQ_BYTES);
|
||||
}
|
||||
|
||||
static unsigned int sve_get_size(struct thread *target,
|
||||
const struct user_regset *regset)
|
||||
{
|
||||
struct user_sve_header header;
|
||||
|
||||
/* Instead of system_supports_sve() */
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
sve_init_header_from_thread(&header, target);
|
||||
return sve_size_from_header(&header);
|
||||
}

/* read NT_ARM_SVE */
static long sve_get(struct thread *target,
const struct user_regset *regset,
@@ -646,23 +669,9 @@ static long sve_get(struct thread *target,
}

/* Header */
memset(&header, 0, sizeof(header));

header.vl = target->ctx.thread->sve_vl;

BUG_ON(!sve_vl_valid(header.vl));
sve_init_header_from_thread(&header, target);
vq = sve_vq_from_vl(header.vl);

BUG_ON(!sve_vl_valid(sve_max_vl));
header.max_vl = sve_max_vl;

/* McKernel processes always enable SVE. */
header.flags = SVE_PT_REGS_SVE;

header.size = SVE_PT_SIZE(vq, header.flags);
header.max_size = SVE_PT_SIZE(sve_vq_from_vl(header.max_vl),
SVE_PT_REGS_SVE);

ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
0, sizeof(header));
if (ret) {
@@ -676,11 +685,9 @@ static long sve_get(struct thread *target,
*/

/* Otherwise: full SVE case */

start = SVE_PT_SVE_OFFSET;
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);

BUG_ON(end < start);
BUG_ON(end - start > sve_state_size(target));
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
target->ctx.thread->sve_state,
start, end);
@@ -690,24 +697,18 @@ static long sve_get(struct thread *target,

start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);

BUG_ON(end < start);
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
start, end);
if (ret) {
goto out;
}

/*
* Copy fpsr, and fpcr which must follow contiguously in
* struct fpsimd_state:
*/
start = end;
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;

BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
(char *)&target->fp_regs->fpsr);
BUG_ON(end < start);
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
(char *)&target->fp_regs->fpsr !=
end - start);

ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
@@ -716,9 +717,7 @@ static long sve_get(struct thread *target,
}

start = end;
end = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16 * 16;

BUG_ON(end < start);
end = sve_size_from_header(&header);
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
start, end);
out:
@@ -762,13 +761,12 @@ static long sve_set(struct thread *target,
* sve_set_vector_length(), which will also validate them for us:
*/
ret = sve_set_vector_length(target, header.vl,
header.flags & ~SVE_PT_REGS_MASK);
((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
if (ret) {
goto out;
}

/* Actual VL set may be less than the user asked for: */
BUG_ON(!sve_vl_valid(target->ctx.thread->sve_vl));
vq = sve_vq_from_vl(target->ctx.thread->sve_vl);

/* Registers: FPSIMD-only case */
@@ -779,11 +777,19 @@ static long sve_set(struct thread *target,
}

/* Otherwise: full SVE case */

/*
* If setting a different VL from the requested VL and there is
* register data, the data layout will be wrong: don't even
* try to set the registers in this case.
*/
if (count && vq != sve_vq_from_vl(header.vl)) {
ret = -EIO;
goto out;
}

start = SVE_PT_SVE_OFFSET;
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);

BUG_ON(end < start);
BUG_ON(end - start > sve_state_size(target));
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
target->ctx.thread->sve_state,
start, end);
@@ -793,27 +799,21 @@ static long sve_set(struct thread *target,

start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);

BUG_ON(end < start);
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
start, end);
if (ret) {
goto out;
}

/*
* Copy fpsr, and fpcr which must follow contiguously in
* struct fpsimd_state:
*/
start = end;
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;

BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
(char *)&target->fp_regs->fpsr);
BUG_ON(end < start);
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
(char *)&target->fp_regs->fpsr !=
end - start);

user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
out:
return ret;
}
@@ -825,8 +825,9 @@ static const struct user_regset aarch64_regsets[] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_pt_regs) / sizeof(uint64_t),
.size = sizeof(uint64_t),
.align = sizeof(uint64_t),
.get = gpr_get,
.set = gpr_set
.set = gpr_set,
},
[REGSET_FPR] = {
.core_note_type = NT_PRFPREG,
@@ -836,56 +837,75 @@ static const struct user_regset aarch64_regsets[] = {
* fpcr are 32-bits wide.
*/
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = fpr_get,
.set = fpr_set
.set = fpr_set,
},
[REGSET_TLS] = {
.core_note_type = NT_ARM_TLS,
.n = 1,
.size = sizeof(void *),
.align = sizeof(void *),
.get = tls_get,
.set = tls_set
.set = tls_set,
},
[REGSET_HW_BREAK] = {
.core_note_type = NT_ARM_HW_BREAK,
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = hw_break_get,
.set = hw_break_set
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
.core_note_type = NT_ARM_HW_WATCH,
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = hw_break_get,
.set = hw_break_set
.set = hw_break_set,
},
[REGSET_SYSTEM_CALL] = {
.core_note_type = NT_ARM_SYSTEM_CALL,
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
.get = system_call_get,
.set = system_call_set
.set = system_call_set,
},
#ifdef CONFIG_ARM64_SVE
[REGSET_SVE] = { /* Scalable Vector Extension */
.core_note_type = NT_ARM_SVE,
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16,
.size = 16,
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) +
(SVE_VQ_BYTES - 1)) / SVE_VQ_BYTES,
.size = SVE_VQ_BYTES,
.align = SVE_VQ_BYTES,
.get = sve_get,
.set = sve_set
.set = sve_set,
.get_size = sve_get_size,
},
#endif /* CONFIG_ARM64_SVE */
};

static const struct user_regset *
find_regset(const struct user_regset *regset, unsigned int type, int n)
static const struct user_regset_view user_aarch64_view = {
.name = "aarch64", .e_machine = EM_AARCH64,
.regsets = aarch64_regsets,
.n = sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0])
};

const struct user_regset_view *current_user_regset_view(void)
{
return &user_aarch64_view;
}
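A hedged sketch of how the new view indirection is meant to be consumed; it is illustrative only, and the wrapper function name is hypothetical:

    /* Illustrative only: resolve the SVE regset through the view and
     * size its payload for a given target thread. */
    static unsigned int sve_payload_bytes(struct thread *target)
    {
        const struct user_regset_view *view = current_user_regset_view();
        const struct user_regset *rs = find_regset(view, NT_ARM_SVE);

        if (!rs)
            return 0;
        /* regset_size() falls back to n * size when no get_size hook. */
        return regset_size(target, rs);
    }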

const struct user_regset *find_regset(const struct user_regset_view *view,
unsigned int type)
{
int i = 0;

for (i = 0; i < n; i++) {
if (regset[i].core_note_type == type) {
return &regset[i];
for (i = 0; i < view->n; i++) {
if (view->regsets[i].core_note_type == type) {
return &view->regsets[i];
}
}
return NULL;
@@ -894,8 +914,8 @@ find_regset(const struct user_regset *regset, unsigned int type, int n)
static long ptrace_regset(struct thread *thread, int req, long type, struct iovec *iov)
{
long rc = -EINVAL;
const struct user_regset *regset = find_regset(aarch64_regsets, type,
sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0]));
const struct user_regset *regset =
find_regset(&user_aarch64_view, type);

if (!regset) {
kprintf("%s: not supported type 0x%x\n", __FUNCTION__, type);
@@ -944,6 +964,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
/* save thread_info, if called by ptrace_report_exec() */
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
memcpy(&tinfo, thread->ctx.thread, sizeof(struct thread_info));
thread->uctx->user_regs.regs[0] = 0;
}

mcs_rwlock_writer_lock(&proc->update_lock, &lock);
@@ -956,6 +977,13 @@ void ptrace_report_signal(struct thread *thread, int sig)
thread->exit_status = sig;
thread->status = PS_TRACED;
thread->ptrace &= ~PT_TRACE_SYSCALL;
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8))) &&
thread->ptrace & PTRACE_O_TRACEEXEC) {
/* PTRACE_O_TRACEEXEC: since Linux 3.0, the former
* thread ID can be retrieved with PTRACE_GETEVENTMSG.
* Report no change. */
thread->ptrace_eventmsg = thread->tid;
}
save_debugreg(thread->ptrace_debugreg);
if (sig == SIGSTOP || sig == SIGTSTP ||
sig == SIGTTIN || sig == SIGTTOU) {
@@ -991,6 +1019,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
memcpy(thread->ctx.thread, &tinfo, sizeof(struct thread_info));
}
arch_flush_icache_all();
}

long

@@ -1,4 +1,4 @@
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <cpulocal.h>
#include <string.h>
#include <kmalloc.h>
@@ -15,7 +15,8 @@
#include <limits.h>
#include <uio.h>
#include <syscall.h>
#include <debug.h>
#include <rusage_private.h>
#include <ihk/debug.h>

void terminate_mcexec(int, int);
extern void ptrace_report_signal(struct thread *thread, int sig);
@@ -42,7 +43,7 @@ uintptr_t debug_constants[] = {
offsetof(struct cpu_local_var, runq),
offsetof(struct cpu_local_var, status),
offsetof(struct cpu_local_var, idle),
offsetof(struct thread, ctx) + offsetof(struct thread_info, cpu_context),
offsetof(struct thread, ctx),
offsetof(struct thread, sched_list),
offsetof(struct thread, proc),
offsetof(struct thread, status),
@@ -56,13 +57,34 @@ extern int num_processors;
int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
{
int min_queue_len = -1;
int cpu, min_cpu = -1, uti_cpu = -1;
unsigned long irqstate;
int cpu, min_cpu = -1;
#if 0
int uti_cpu = -1;
#endif
unsigned long irqstate = 0;

irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
int start, end, step;

if (use_last) {
start = num_processors - 1;
end = -1;
step = -1;
}
else {
start = 0;
end = num_processors;
step = 1;
}

if (!cpu_local_var(current)->proc->nr_processes) {
irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
}
else {
irqstate = cpu_disable_interrupt_save();
}

/* Find the first allowed core with the shortest run queue */
for (cpu = 0; cpu < num_processors; ++cpu) {
for (cpu = start; cpu != end; cpu += step) {
struct cpu_local_var *v;

if (!CPU_ISSET(cpu, cpu_set))
@@ -73,11 +95,14 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d\n",
__func__, cpu, v->runq_len, v->runq_reserved);
if (min_queue_len == -1 ||
v->runq_len + v->runq_reserved < min_queue_len) {
min_queue_len = v->runq_len + v->runq_reserved;
//v->runq_len + v->runq_reserved < min_queue_len) {
v->runq_len < min_queue_len) {
//min_queue_len = v->runq_len + v->runq_reserved;
min_queue_len = v->runq_len;
min_cpu = cpu;
}

#if 0
/* Record the last tie CPU */
if (min_cpu != cpu &&
v->runq_len + v->runq_reserved == min_queue_len) {
@@ -86,14 +111,15 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d,min_cpu=%d,uti_cpu=%d\n",
__func__, cpu, v->runq_len, v->runq_reserved,
min_cpu, uti_cpu);
#else

ihk_mc_spinlock_unlock_noirq(&v->runq_lock);
#if 0
if (min_queue_len == 0)
break;
#endif
}

#if 0
min_cpu = use_last ? uti_cpu : min_cpu;
if (min_cpu != -1) {
if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
@@ -102,22 +128,20 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved,
1);
}
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
#else
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved, 1);
#endif

if (!cpu_local_var(current)->proc->nr_processes) {
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
}
else {
cpu_restore_interrupt(irqstate);
}

return min_cpu;
}
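The scan-direction change above reduces to a simple pattern; a hedged, self-contained condensation (not patch code, values illustrative) for reference:

    /* Condensed illustration of the use_last scan: visit CPU ids either
     * 0..n-1 or n-1..0 with a single loop. */
    #include <stdio.h>

    static void scan(int num_processors, int use_last)
    {
        int start = use_last ? num_processors - 1 : 0;
        int end   = use_last ? -1 : num_processors;
        int step  = use_last ? -1 : 1;
        int cpu;

        for (cpu = start; cpu != end; cpu += step)
            printf("visit cpu %d\n", cpu);
    }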

int
arch_clear_host_user_space()
{
struct thread *th = cpu_local_var(current);

/* XXX: might be unnecessary */
clear_host_pte(th->vm->region.user_start,
(th->vm->region.user_end - th->vm->region.user_start));
return 0;
}

/* archtecture-depended syscall handlers */
extern unsigned long do_fork(int clone_flags, unsigned long newsp,
unsigned long parent_tidptr, unsigned long child_tidptr,
@@ -126,10 +150,18 @@ extern unsigned long do_fork(int clone_flags, unsigned long newsp,

SYSCALL_DECLARE(clone)
{
struct process *proc = cpu_local_var(current)->proc;
struct mcs_rwlock_node_irqsave lock_dump;
unsigned long ret;

/* mutex coredump */
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);

if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
return do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0, ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
ret = do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0,
ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
} else {
return do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
ihk_mc_syscall_arg1(ctx), /* newsp */
ihk_mc_syscall_arg2(ctx), /* parent_tidptr */
ihk_mc_syscall_arg4(ctx), /* child_tidptr (swap arg3) */
@@ -137,33 +169,9 @@ SYSCALL_DECLARE(clone)
ihk_mc_syscall_pc(ctx), /* curpc */
ihk_mc_syscall_sp(ctx)); /* cursp */
}
}
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);

SYSCALL_DECLARE(rt_sigaction)
{
int sig = ihk_mc_syscall_arg0(ctx);
const struct sigaction *act = (const struct sigaction *)ihk_mc_syscall_arg1(ctx);
struct sigaction *oact = (struct sigaction *)ihk_mc_syscall_arg2(ctx);
size_t sigsetsize = ihk_mc_syscall_arg3(ctx);
struct k_sigaction new_sa, old_sa;
int rc;

if (sigsetsize != sizeof(sigset_t))
return -EINVAL;

if(act)
if(copy_from_user(&new_sa.sa, act, sizeof new_sa.sa)){
goto fault;
}
rc = do_sigaction(sig, act? &new_sa: NULL, oact? &old_sa: NULL);
if(rc == 0 && oact)
if(copy_to_user(oact, &old_sa.sa, sizeof old_sa.sa)){
goto fault;
}

return rc;
fault:
return -EFAULT;
return ret;
}

SYSCALL_DECLARE(prctl)
@@ -178,11 +186,10 @@ SYSCALL_DECLARE(prctl)

switch (option) {
case PR_SVE_SET_VL:
error = SVE_SET_VL(cpu_local_var(current),
ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx));
error = SVE_SET_VL(ihk_mc_syscall_arg1(ctx));
break;
case PR_SVE_GET_VL:
error = SVE_GET_VL(cpu_local_var(current));
error = SVE_GET_VL();
break;
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5) {
@@ -657,7 +664,7 @@ void set_single_step(struct thread *thread)
set_regs_spsr_ss(thread->uctx);
}

extern void coredump(struct thread *thread, void *regs);
extern int coredump(struct thread *thread, void *regs, int sig);

static int
isrestart(int syscallno, unsigned long rc, int sig, int restart)
@@ -1096,6 +1103,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
int restart = 0;
int ret;

for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
@@ -1270,15 +1278,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
dkprintf("SIGTRAP(): woken up\n");
break;
case SIGCONT:
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
proc->status = PS_RUNNING;
dkprintf("do_signal,SIGCONT,do nothing\n");
break;
case SIGQUIT:
case SIGILL:
@@ -1290,9 +1289,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
case SIGXCPU:
case SIGXFSZ:
core:
dkprintf("do_signal,default,core,sig=%d\n", sig);
coredump(thread, regs);
coredumped = 0x80;
thread->coredump_regs =
kmalloc(sizeof(struct pt_regs),
IHK_MC_AP_NOWAIT);
if (!thread->coredump_regs) {
kprintf("%s: Out of memory\n", __func__);
goto skip;
}
memcpy(thread->coredump_regs, regs,
sizeof(struct pt_regs));

ret = coredump(thread, regs, sig);
switch (ret) {
case -EBUSY:
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
__func__);
break;
case 0:
coredumped = 0x80;
break;
default:
kprintf("%s: ERROR: coredump failed (%d)\n",
__func__, ret);
break;
}
skip:
terminate(0, sig | coredumped);
break;
case SIGCHLD:
@@ -1309,70 +1330,6 @@ out:
return restart;
}

static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;

w = thread->sigmask.__val[0];

lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;) {
if (delflag) {
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
}

list_for_each_entry_safe(pending, next, head, list){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
list_del(&pending->list);

if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}
return pending;
}
}

if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}

if(lock == &thread->sigpendinglock)
return NULL;

lock = &thread->sigpendinglock;
head = &thread->sigpending;
}

return NULL;
}

struct sig_pending *
hassigpending(struct thread *thread)
{
if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
return NULL;
}

return getsigpending(thread, 0);
}

int
interrupt_from_user(void *regs0)
{
@@ -1396,185 +1353,6 @@ void save_syscall_return_value(int num, unsigned long rc)
}
}

void
check_signal(unsigned long rc, void *regs0, int num)
{
__check_signal(rc, regs0, num, 0);
}

void
check_signal_irq_disabled(unsigned long rc, void *regs0, int num)
{
__check_signal(rc, regs0, num, 1);
}

static void
__check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
{
ihk_mc_user_context_t *regs = regs0;
struct thread *thread;
struct sig_pending *pending;
int irqstate;

if(clv == NULL)
return;
thread = cpu_local_var(current);

if(thread == NULL || thread->proc->pid == 0){
struct thread *t;
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
if(t->proc->pid <= 0)
continue;
if(t->status == PS_INTERRUPTIBLE &&
hassigpending(t)){
t->status = PS_RUNNING;
break;
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
goto out;
}

if(regs != NULL && !interrupt_from_user(regs)) {
goto out;
}

if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
goto out;
}

for(;;){
/* When this function called from check_signal_irq_disabled,
* return with interrupt invalid.
* This is to eliminate signal loss.
*/
if (irq_disabled == 1) {
irqstate = cpu_disable_interrupt_save();
}
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
goto out;
}
if (irq_disabled == 1) {
cpu_restore_interrupt(irqstate);
}
if (do_signal(rc, regs, thread, pending, num)) {
num = -1;
}
}

out:
return;
}

static int
check_sig_pending_thread(struct thread *thread)
{
int found = 0;
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
__sigset_t x;
int sig = 0;
struct k_sigaction *k;
struct cpu_local_var *v;

v = get_this_cpu_local_var();
w = thread->sigmask.__val[0];

lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for (;;) {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);

list_for_each_entry_safe(pending, next, head, list) {
for (x = pending->sigmask.__val[0], sig = 0; x;
sig++, x >>= 1)
;
k = thread->sigcommon->action + sig - 1;
if ((sig != SIGCHLD && sig != SIGURG) ||
(k->sa.sa_handler != SIG_IGN &&
k->sa.sa_handler != NULL)) {
if (!(pending->sigmask.__val[0] & w)) {
if (pending->interrupted == 0) {
pending->interrupted = 1;
found = 1;
if (sig != SIGCHLD &&
sig != SIGURG &&
!k->sa.sa_handler) {
found = 2;
break;
}
}
}
}
}

mcs_rwlock_reader_unlock(lock, &mcs_rw_node);

if (found == 2) {
break;
}

if (lock == &thread->sigpendinglock) {
break;
}

lock = &thread->sigpendinglock;
head = &thread->sigpending;
}

if (found == 2) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
terminate_mcexec(0, sig);
return 1;
}
else if (found == 1) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
interrupt_syscall(thread, 0);
return 1;
}
return 0;
}

void
check_sig_pending(void)
{
struct thread *thread;
struct cpu_local_var *v;

if (clv == NULL)
return;

v = get_this_cpu_local_var();
repeat:
v->runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
list_for_each_entry(thread, &(v->runq), sched_list) {

if (thread == NULL || thread == &cpu_local_var(idle)) {
continue;
}

if (thread->in_syscall_offload == 0) {
continue;
}

if (thread->proc->group_exit_status & 0x0000000100000000L) {
continue;
}

if (check_sig_pending_thread(thread))
goto repeat;
}
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
}

unsigned long
do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int ptracecont)
{
@@ -1590,7 +1368,6 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
struct k_sigaction *k;
int doint;
int found = 0;
siginfo_t info0;
@@ -1600,6 +1377,7 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
struct process_hash *phash = rset->process_hash;
struct mcs_rwlock_node lock;
struct mcs_rwlock_node updatelock;
struct sig_pending *pending = NULL;

if(sig > SIGRTMAX || sig < 0)
return -EINVAL;
@@ -1786,47 +1564,61 @@ done:

mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);

/* Put signal event even when handler is SIG_IGN or SIG_DFL
because target ptraced thread must call ptrace_report_signal
in check_signal */
rc = 0;
k = tthread->sigcommon->action + sig - 1;
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
(k->sa.sa_handler != SIG_IGN &&
(k->sa.sa_handler != NULL ||
(sig != SIGCHLD && sig != SIGURG)))) {
struct sig_pending *pending = NULL;
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list){
if(pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(&pending->list == head)
pending = NULL;

if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list) {
if (pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(pending == NULL){
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if(!pending){
rc = -ENOMEM;
}
else{
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if(sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
if (&pending->list == head)
pending = NULL;
}
if (pending == NULL) {
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if (!pending) {
rc = -ENOMEM;
}
else {
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if (sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
}

mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);

if (sig == SIGCONT || ptracecont == 1) {
/* Wake up the target only when stopped by SIGSTOP */
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
struct siginfo info;

tthread->proc->main_thread->signal_flags =
SIGNAL_STOP_CONTINUED;
tthread->proc->status = PS_RUNNING;
memset(&info, '\0', sizeof(info));
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = tthread->proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(tthread, tthread->proc->parent->pid, -1,
SIGCHLD, &info, 0);
if (thread != tthread) {
ihk_mc_interrupt_cpu(tthread->cpu_id,
ihk_mc_get_vector(IHK_GV_IKC));
}
doint = 0;
}
}
if (doint && !(mask & tthread->sigmask.__val[0])) {
int status = tthread->status;

@@ -1841,11 +1633,6 @@ done:
/* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
}
else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */
sched_wakeup_thread(tthread, PS_STOPPED);
tthread->proc->status = PS_RUNNING;
}
else {
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
}
@@ -1870,7 +1657,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
}

if ((__sigmask(sig) & thread->sigmask.__val[0])) {
coredump(thread, regs0);
coredump(thread, regs0, sig);
terminate(0, sig | 0x80);
}
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
@@ -1900,7 +1687,7 @@ SYSCALL_DECLARE(mmap)
;

const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
const size_t len0 = ihk_mc_syscall_arg1(ctx);
size_t len0 = ihk_mc_syscall_arg1(ctx);
const int prot = ihk_mc_syscall_arg2(ctx);
const int flags0 = ihk_mc_syscall_arg3(ctx);
const int fd = ihk_mc_syscall_arg4(ctx);
@@ -1941,7 +1728,8 @@ SYSCALL_DECLARE(mmap)

if (hugeshift == 0) {
/* default hugepage size */
flags |= MAP_HUGE_SECOND_BLOCK;
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
} else if ((first_level_block_support &&
hugeshift == MAP_HUGE_FIRST_BLOCK) ||
(first_level_block_support &&
@@ -1958,6 +1746,14 @@ SYSCALL_DECLARE(mmap)
goto out;
}
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
/* Round-up map length by pagesize */
len0 = ALIGN(len0, pgsize);

if (rusage_check_overmap(len0,
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
error = -ENOMEM;
goto out;
}
}

#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
@@ -2018,7 +1814,8 @@ SYSCALL_DECLARE(shmget)

if (hugeshift == 0) {
/* default hugepage size */
shmflg |= SHM_HUGE_SECOND_BLOCK;
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
} else if ((first_level_block_support &&
hugeshift == SHM_HUGE_FIRST_BLOCK) ||
(first_level_block_support &&
@@ -2082,11 +1879,13 @@ int do_process_vm_read_writev(int pid,
struct process *rproc;
struct process *lproc = lthread->proc;
struct process_vm *rvm = NULL;
unsigned long rphys;
unsigned long rpage_left;
unsigned long psize;
void *rva;
unsigned long lphys, rphys;
unsigned long lpage_left, rpage_left;
unsigned long lpsize, rpsize;
void *rva, *lva;
#if 0
struct vm_range *range;
#endif
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node update_lock;

@@ -2099,8 +1898,9 @@ int do_process_vm_read_writev(int pid,
return -EINVAL;
}

#if 0
/* Check if parameters are okay */
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov,
@@ -2122,11 +1922,12 @@ int do_process_vm_read_writev(int pid,

ret = 0;
arg_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

if (ret != 0) {
goto out;
}
#endif

for (li = 0; li < liovcnt; ++li) {
llen += local_iov[li].iov_len;
@@ -2191,7 +1992,7 @@ arg_out:
if (pli != li) {
struct vm_range *range;

ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

/* Is base valid? */
range = lookup_process_memory_range(lthread->vm,
@@ -2221,7 +2022,7 @@ arg_out:

ret = 0;
pli_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

if (ret != 0) {
goto out;
@@ -2234,7 +2035,7 @@ pli_out:

if (pri != ri) {
struct vm_range *range;

ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);

/* Is base valid? */
range = lookup_process_memory_range(rvm,
@@ -2264,7 +2065,7 @@ pli_out:

ret = 0;
pri_out:
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);

if (ret != 0) {
goto out;
@@ -2279,10 +2080,53 @@ pri_out:
to_copy = remote_iov[ri].iov_len - roff;
}

retry_lookup:
retry_llookup:
/* Figure out local physical */
/* TODO: remember page and do this only if necessary */
ret = ihk_mc_pt_virt_to_phys_size(lthread->vm->address_space->page_table,
local_iov[li].iov_base + loff, &lphys, &lpsize);

if (ret) {
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
void *addr;

if (faulted) {
ret = -EFAULT;
goto out;
}

/* Fault in pages */
for (addr = (void *)
(((unsigned long)local_iov[li].iov_base + loff)
& PAGE_MASK);
addr < (local_iov[li].iov_base + loff + to_copy);
addr += PAGE_SIZE) {

ret = page_fault_process_vm(lthread->vm, addr, reason);
if (ret) {
ret = -EFAULT;
goto out;
}
}

faulted = 1;
goto retry_llookup;
}

lpage_left = ((((unsigned long)local_iov[li].iov_base + loff +
lpsize) & ~(lpsize - 1)) -
((unsigned long)local_iov[li].iov_base + loff));
if (lpage_left < to_copy) {
to_copy = lpage_left;
}

lva = phys_to_virt(lphys);

retry_rlookup:
/* Figure out remote physical */
/* TODO: remember page and do this only if necessary */
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
remote_iov[ri].iov_base + roff, &rphys, &psize);
remote_iov[ri].iov_base + roff, &rphys, &rpsize);

if (ret) {
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
@@ -2308,11 +2152,11 @@ retry_lookup:
}

faulted = 1;
goto retry_lookup;
goto retry_rlookup;
}

rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
psize) & ~(psize - 1)) -
rpsize) & ~(rpsize - 1)) -
((unsigned long)remote_iov[ri].iov_base + roff));
if (rpage_left < to_copy) {
to_copy = rpage_left;
@@ -2321,16 +2165,16 @@ retry_lookup:
rva = phys_to_virt(rphys);

fast_memcpy(
(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
(op == PROCESS_VM_READ) ? lva : rva,
(op == PROCESS_VM_READ) ? rva : lva,
to_copy);

copied += to_copy;
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, rpsize: %lu, rpage_left: %lu\n",
li, local_iov[li].iov_base + loff,
(op == PROCESS_VM_READ) ? "<-" : "->",
ri, remote_iov[ri].iov_base + roff, to_copy,
psize, rpage_left);
rpsize, rpage_left);

loff += to_copy;
roff += to_copy;
@@ -2700,4 +2544,48 @@ SYSCALL_DECLARE(time)
return time();
}

void calculate_time_from_tsc(struct timespec *ts)
{
long ver;
unsigned long current_tsc;
time_t sec_delta;
long ns_delta;

for (;;) {
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
/* settimeofday() is in progress */
cpu_pause();
}
rmb(); /* fetch version before time */
*ts = tod_data.origin;
rmb(); /* fetch time before checking version */
if (ver == ihk_atomic64_read(&tod_data.version)) {
break;
}

/* settimeofday() has intervened */
cpu_pause();
}

current_tsc = rdtsc();
sec_delta = current_tsc / tod_data.clocks_per_sec;
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */

ts->tv_sec += sec_delta;
ts->tv_nsec += ns_delta;
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
}
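The version loop above is a seqcount-style read. For orientation, a hedged sketch of the matching writer protocol; this is not taken from the tree, and ihk_atomic64_inc()/wmb() and the function name are assumptions inferred from the ihk_atomic64_read()/rmb() calls used by the reader:

    /* Illustrative writer side of the tod_data version protocol: make the
     * counter odd while updating, even when stable (assumed API names). */
    static void tod_data_update(struct timespec new_origin)
    {
        ihk_atomic64_inc(&tod_data.version); /* odd: update in progress */
        wmb();                               /* order version before data */
        tod_data.origin = new_origin;
        wmb();                               /* order data before version */
        ihk_atomic64_inc(&tod_data.version); /* even again: stable */
    }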

extern void ptrace_syscall_event(struct thread *thread);
long arch_ptrace_syscall_event(struct thread *thread,
ihk_mc_user_context_t *ctx, long setret)
{
ptrace_syscall_event(thread);
return setret;
}
/*** End of File ***/

@@ -8,7 +8,7 @@
#include <cputype.h>
#include <irq.h>
#include <arch-timer.h>
#include <debug.h>
#include <ihk/debug.h>

//#define DEBUG_PRINT_TIMER

@@ -111,6 +111,8 @@ static void timer_handler(void *priv)
/* set timer re-enable for periodic */
arch_timer_reg_write(ARCH_TIMER_REG_TVAL, clocks);
arch_timer_reg_write(ARCH_TIMER_REG_CTRL, ctrl);

do_backlog();
}
}

@@ -11,10 +11,9 @@
#include <process.h>
#include <string.h>
#include <syscall.h>
#include <ihk/debug.h>
#include <ikc/queue.h>
#include <vdso.h>
#include <debug.h>
#include <ihk/debug.h>

//#define DEBUG_PRINT_VDSO

@@ -23,7 +22,6 @@
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif

#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 1
struct vdso {
long busy;
@@ -34,7 +32,6 @@ struct vdso {
long lbase;
long offset_sigtramp;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/

extern char vdso_start, vdso_end;
static struct vdso vdso;
@@ -90,6 +87,7 @@ int arch_setup_vdso(void)
}

panic("Only support host mapping vDSO");
return -1;
}

static int get_free_area(struct process_vm *vm, size_t len, intptr_t hint,

@@ -18,7 +18,7 @@ extern char data_start[], data_end[];
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))

#define MAP_ST_START 0xffff800000000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
/* MAP_KERNEL_START is defined by cmake */

#define PTL4_SHIFT 39
#define PTL3_SHIFT 30

@@ -1,8 +1,9 @@
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018 */
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
#include <process.h>
#include <elfcore.h>

void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
struct thread *thread, void *regs0, int sig)
{
struct x86_user_context *uctx = regs0;
struct x86_basic_regs *regs = &uctx->gpr;
@@ -18,8 +19,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
short int pr_cursig;
a8_uint64_t pr_sigpend;
a8_uint64_t pr_sighold;
pid_t pr_pid;
pid_t pr_ppid;
pid_t pr_pgrp;
pid_t pr_sid;
struct prstatus64_timeval pr_utime;
@@ -28,6 +27,14 @@
struct prstatus64_timeval pr_cstime;
*/

prstatus->pr_pid = thread->tid;
if (thread->proc->parent) {
prstatus->pr_ppid = thread->proc->parent->pid;
}

prstatus->pr_info.si_signo = sig;
prstatus->pr_cursig = sig;

prstatus->pr_reg[0] = _r15;
prstatus->pr_reg[1] = _r14;
prstatus->pr_reg[2] = _r13;
@@ -55,3 +62,13 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,

prstatus->pr_fpvalid = 0; /* We assume no fp */
}

void arch_fill_thread_core_info(struct note *head,
struct thread *thread, void *regs)
{
}

int arch_get_thread_core_info_size(void)
{
return 0;
}

@@ -1,4 +1,4 @@
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018 */
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
/**
* \file cpu.c
* License details are found in the file LICENSE.
@@ -16,7 +16,6 @@
*/

#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <errno.h>
@@ -32,7 +31,7 @@
#include <prctl.h>
#include <page.h>
#include <kmalloc.h>
#include <debug.h>
#include <ihk/debug.h>

#define LAPIC_ID 0x020
#define LAPIC_TIMER 0x320
@@ -45,11 +44,9 @@
#define LAPIC_ICR0 0x300
#define LAPIC_ICR2 0x310
#define LAPIC_ESR 0x280
#ifdef POSTK_DEBUG_ARCH_DEP_75 /* x86 depend hide */
#define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
#endif /* POSTK_DEBUG_ARCH_DEP_75 */

#define APIC_INT_LEVELTRIG 0x08000
#define APIC_INT_ASSERT 0x04000
@@ -148,7 +145,7 @@ void reload_idt(void)
}

static struct list_head handlers[256 - 32];
extern char nmi[];
extern char nmi_handler[];
extern char page_fault[], general_protection_exception[];
extern char debug_exception[], int3_exception[];

@@ -175,7 +172,7 @@ static void init_idt(void)
set_idt_entry(i, generic_common_handlers[i]);
}

set_idt_entry(2, (uintptr_t)nmi);
set_idt_entry(2, (uintptr_t)nmi_handler);
set_idt_entry(13, (unsigned long)general_protection_exception);
set_idt_entry(14, (unsigned long)page_fault);

@@ -955,6 +952,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
v->flags |= CPU_FLAG_NEED_RESCHED;
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());

do_backlog();
}
else if (vector == LOCAL_PERF_VECTOR) {
struct siginfo info;
@@ -1206,6 +1205,15 @@ unsigned long cpu_disable_interrupt_save(void)
return flags;
}

unsigned long cpu_enable_interrupt_save(void)
{
unsigned long flags;

asm volatile("pushf; pop %0; sti" : "=r"(flags) : : "memory", "cc");

return flags;
}
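A hedged usage sketch for the new helper, pairing it with the cpu_restore_interrupt() already used throughout this tree; the wait condition and function name are hypothetical:

    /* Illustrative only: enable interrupts across a wait, then put the
     * saved flags back exactly as they were. */
    static void wait_with_irqs_enabled(void)
    {
        unsigned long flags = cpu_enable_interrupt_save();

        while (!condition_met())   /* condition_met() is hypothetical */
            cpu_pause();

        cpu_restore_interrupt(flags);
    }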

/*@
@ behavior valid_vector:
@ assumes 32 <= vector <= 255;
@@ -1602,14 +1610,18 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
}

/*@
@ requires \valid_apicid(cpu); // valid APIC ID or not
@ requires \valid_cpuid(cpu); // valid CPU logical ID
@ ensures \result == 0
@*/
int ihk_mc_interrupt_cpu(int cpu, int vector)
{
if (cpu < 0 || cpu >= num_processors) {
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
return -1;
}
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);

x86_issue_ipi(cpu, vector);
x86_issue_ipi(get_x86_cpu_local_variable(cpu)->apic_id, vector);
return 0;
}

@@ -1624,6 +1636,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
/* Set up new TLS.. */
ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base);

#ifdef ENABLE_PERF
/* Performance monitoring inherit */
if(next->proc->monitoring_event) {
if(next->proc->perf_status == PP_RESET)
@@ -1633,6 +1646,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
perf_start(next->proc->monitoring_event);
}
}
#endif

#ifdef PROFILE_ENABLE
if (prev && prev->profile && prev->profile_start_ts != 0) {
@@ -1708,7 +1722,7 @@ check_and_allocate_fp_regs(struct thread *thread)

if (!thread->fp_regs) {
kprintf("error: allocating fp_regs pages\n");
result = 1;
result = -ENOMEM;
goto out;
}

@@ -1721,12 +1735,14 @@ out:
/*@
@ requires \valid(thread);
@*/
void
int
save_fp_regs(struct thread *thread)
{
if (check_and_allocate_fp_regs(thread) != 0) {
// alloc error
return;
int ret = 0;

ret = check_and_allocate_fp_regs(thread);
if (ret) {
goto out;
}

if (xsave_available) {
@@ -1741,13 +1757,23 @@ save_fp_regs(struct thread *thread)

dkprintf("fp_regs for TID %d saved\n", thread->tid);
}
out:
return ret;
}

void copy_fp_regs(struct thread *from, struct thread *to)
int copy_fp_regs(struct thread *from, struct thread *to)
{
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
int ret = 0;

if (from->fp_regs != NULL) {
ret = check_and_allocate_fp_regs(to);
if (!ret) {
memcpy(to->fp_regs,
from->fp_regs,
sizeof(fp_regs_struct));
}
}
return ret;
}
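Since save_fp_regs() and copy_fp_regs() now report allocation failure, callers are expected to check the result. A hedged sketch of the pattern; the caller name is hypothetical:

    /* Illustrative caller: propagate -ENOMEM instead of silently losing
     * FP state. fork_copy_fp() is a hypothetical name. */
    static int fork_copy_fp(struct thread *parent, struct thread *child)
    {
        int ret;

        ret = save_fp_regs(parent);
        if (ret)
            return ret;
        return copy_fp_regs(parent, child);
    }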

/*@
@@ -1820,6 +1846,10 @@ ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
}

void arch_flush_icache_all(void)
{
return;
}

/*@
@ assigns \nothing;
@@ -1973,6 +2003,92 @@ mod_nmi_ctx(void *nmi_ctx, void (*func)())
l[i++] = 0x28; // KERNEL DS
}

void arch_save_panic_regs(void *irq_regs)
{
struct thread *current = cpu_local_var(current);
struct x86_user_context *regs =
(struct x86_user_context *)irq_regs;
struct x86_cpu_local_variables *x86v =
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
struct segment_regs {
uint32_t rflags;
uint32_t cs;
uint32_t ss;
uint32_t ds;
uint32_t es;
uint32_t fs;
uint32_t gs;
} *sregs;

/* Kernel space? */
if (regs->gpr.rip > USER_END) {
x86v->panic_regs[0] = regs->gpr.rax;
x86v->panic_regs[1] = regs->gpr.rbx;
x86v->panic_regs[2] = regs->gpr.rcx;
x86v->panic_regs[3] = regs->gpr.rdx;
x86v->panic_regs[4] = regs->gpr.rsi;
x86v->panic_regs[5] = regs->gpr.rdi;
x86v->panic_regs[6] = regs->gpr.rbp;
x86v->panic_regs[7] = regs->gpr.rsp;
x86v->panic_regs[8] = regs->gpr.r8;
x86v->panic_regs[9] = regs->gpr.r9;
x86v->panic_regs[10] = regs->gpr.r10;
x86v->panic_regs[11] = regs->gpr.r11;
x86v->panic_regs[12] = regs->gpr.r12;
x86v->panic_regs[13] = regs->gpr.r13;
x86v->panic_regs[14] = regs->gpr.r14;
x86v->panic_regs[15] = regs->gpr.r15;
x86v->panic_regs[16] = regs->gpr.rip;
sregs = (struct segment_regs *)&x86v->panic_regs[17];
sregs->rflags = regs->gpr.rflags;
sregs->cs = regs->gpr.cs;
sregs->ss = regs->gpr.ss;
sregs->ds = regs->sr.ds;
sregs->es = regs->sr.es;
sregs->fs = regs->sr.fs;
sregs->gs = regs->sr.gs;
}
/* User-space, show kernel context */
else {
kprintf("%s: in user-space: %p\n", __func__, regs->gpr.rip);
x86v->panic_regs[0] = 0;
x86v->panic_regs[1] = current->ctx.rbx;
x86v->panic_regs[2] = 0;
x86v->panic_regs[3] = 0;
x86v->panic_regs[4] = current->ctx.rsi;
x86v->panic_regs[5] = current->ctx.rdi;
x86v->panic_regs[6] = current->ctx.rbp;
x86v->panic_regs[7] = current->ctx.rsp;
x86v->panic_regs[8] = 0;
x86v->panic_regs[9] = 0;
x86v->panic_regs[10] = 0;
x86v->panic_regs[11] = 0;
x86v->panic_regs[12] = regs->gpr.r12;
x86v->panic_regs[13] = regs->gpr.r13;
x86v->panic_regs[14] = regs->gpr.r14;
x86v->panic_regs[15] = regs->gpr.r15;
x86v->panic_regs[16] = (unsigned long)enter_user_mode;
sregs = (struct segment_regs *)&x86v->panic_regs[17];
sregs->rflags = regs->gpr.rflags;
sregs->cs = regs->gpr.cs;
sregs->ss = regs->gpr.ss;
sregs->ds = regs->sr.ds;
sregs->es = regs->sr.es;
sregs->fs = regs->sr.fs;
sregs->gs = regs->sr.gs;
}

x86v->paniced = 1;
}

void arch_clear_panic(void)
{
struct x86_cpu_local_variables *x86v =
get_x86_cpu_local_variable(ihk_mc_get_processor_id());

x86v->paniced = 0;
}

int arch_cpu_read_write_register(
struct ihk_os_cpu_register *desc,
enum mcctrl_os_cpu_operation op)
@@ -2096,9 +2212,7 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
irq_flags);

ihk_mc_interrupt_cpu(
get_x86_cpu_local_variable(cpu)->apic_id,
LOCAL_SMP_FUNC_CALL_VECTOR);
ihk_mc_interrupt_cpu(cpu, LOCAL_SMP_FUNC_CALL_VECTOR);

++cpu_index;
}
@@ -2130,4 +2244,48 @@ free_out:
return ret;
}

extern int nmi_mode;
extern long freeze_thaw(void *nmi_ctx);

void multi_nm_interrupt_handler(void *irq_regs)
{
dkprintf("%s: ...\n", __func__);
switch (nmi_mode) {
case 1:
case 2:
/* mode == 1 or 2, for FREEZER NMI */
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
__func__, nmi_mode);
freeze_thaw(NULL);
break;

case 0:
/* mode == 0, for MEMDUMP NMI */
arch_save_panic_regs(irq_regs);
ihk_mc_query_mem_areas();
/* memdump-nmi is halted McKernel, break is unnecessary. */
/* fall through */
case 3:
/* mode == 3, for SHUTDOWN-WAIT NMI */
kprintf("%s: STOP\n", __func__);
while (nmi_mode != 4)
cpu_halt();
break;

case 4:
/* mode == 4, continue NMI */
arch_clear_panic();
if (!ihk_mc_get_processor_id()) {
ihk_mc_clear_dump_page_completion();
}
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
break;

default:
ekprintf("%s: Unknown nmi-mode(%d) detected.\n",
__func__, nmi_mode);
break;
}
}

/*** end of file ***/

@@ -33,7 +33,7 @@ extern void preempt_disable(void);

#define IHK_STATIC_SPINLOCK_FUNCS

static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
static inline void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
{
lock->head_tail = 0;
}
@@ -50,10 +50,13 @@ rc = __ihk_mc_spinlock_trylock_noirq(l); \
#define ihk_mc_spinlock_trylock_noirq __ihk_mc_spinlock_trylock_noirq
#endif

static int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
static inline int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
{
ihk_spinlock_t cur = { .head_tail = lock->head_tail };
ihk_spinlock_t next = { .tickets.head = cur.tickets.head, .tickets.tail = cur.tickets.tail + 2 };
ihk_spinlock_t next = { .tickets = {
.head = cur.tickets.head,
.tail = cur.tickets.tail + 2
} };
int success;

if (cur.tickets.head != cur.tickets.tail) {
@@ -80,7 +83,8 @@ __kprintf("[%d] ret ihk_mc_spinlock_trylock\n", ihk_mc_get_processor_id()); rc;\
#else
#define ihk_mc_spinlock_trylock __ihk_mc_spinlock_trylock
#endif
static unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock, int *result)
static inline unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock,
int *result)
{
unsigned long flags;

@@ -101,7 +105,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
#endif

static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
static inline void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
register struct __raw_tickets inc = { .tail = 0x0002 };

@@ -132,7 +136,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
#else
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
#endif
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
static inline unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
{
unsigned long flags;

@@ -152,7 +156,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
#else
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
static inline void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
__ticket_t inc = 0x0002;

@@ -171,100 +175,14 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
#else
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
#endif
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
static inline void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock,
unsigned long flags)
{
__ihk_mc_spinlock_unlock_noirq(lock);

cpu_restore_interrupt(flags);
}

/* An implementation of the Mellor-Crummey Scott (MCS) lock */
typedef struct mcs_lock_node {
unsigned long locked;
struct mcs_lock_node *next;
unsigned long irqsave;
#ifndef ENABLE_UBSAN
} __aligned(64) mcs_lock_node_t;
#else
} mcs_lock_node_t;
#endif

typedef mcs_lock_node_t mcs_lock_t;

static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;
node->next = NULL;
}

static void __mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;

node->next = NULL;
node->locked = 0;
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
(unsigned long)node);

if (pred) {
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}

static void __mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = (struct mcs_lock_node *)
atomic_cmpxchg8((unsigned long *)&lock->next,
(unsigned long)node, (unsigned long)0);

if (old == node) {
return;
}

while (node->next == NULL) {
cpu_pause();
}
}

node->next->locked = 0;
}

static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
preempt_disable();
__mcs_lock_lock(lock, node);
}

static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
__mcs_lock_unlock(lock, node);
preempt_enable();
}

static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
node->irqsave = cpu_disable_interrupt_save();
mcs_lock_lock_noirq(lock, node);
}

static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
mcs_lock_unlock_noirq(lock, node);
cpu_restore_interrupt(node->irqsave);
}
|
||||
|
||||
|
||||
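Usage note for the MCS lock above: every acquisition supplies its own queue node (typically on the caller's stack), so each waiter spins only on its private `locked` flag, which keeps the spinning cache-local. A hedged sketch, assuming the anchor was initialized once with mcs_lock_init(); `g_lock` and the function are illustrative, not from this diff:

static mcs_lock_t g_lock;    /* anchor node, init once with mcs_lock_init() */

void example_critical_section(void)
{
    struct mcs_lock_node node;    /* per-acquisition queue node */

    mcs_lock_lock(&g_lock, &node);    /* saves IRQ state, queues node */
    /* ... critical section ... */
    mcs_lock_unlock(&g_lock, &node);  /* hands off to node.next, restores IRQs */
}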
#define SPINLOCK_IN_MCS_RWLOCK

// reader/writer lock
@@ -310,7 +228,7 @@ typedef struct mcs_rwlock_lock {
} mcs_rwlock_lock_t;
#endif

static void
static inline void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -331,7 +249,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
#else
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
#endif
static void
static inline void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -358,7 +276,7 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
}

#ifndef SPINLOCK_IN_MCS_RWLOCK
static void
static inline void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
    struct mcs_rwlock_node *p;
@@ -425,7 +343,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
#else
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
#endif
static void
static inline void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -485,7 +403,7 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
    return old;
}

static void
static inline void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -551,7 +469,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
#else
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
#endif
static void
static inline void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -598,7 +516,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
#endif
static void
static inline void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -618,7 +536,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
#endif
static void
static inline void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -638,7 +556,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
#endif
static void
static inline void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -658,7 +576,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
#endif
static void
static inline void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@@ -674,4 +592,90 @@ static inline int irqflags_can_interrupt(unsigned long flags)
    return !!(flags & 0x200);
}

struct ihk_rwlock {
    union {
        long lock;
        struct {
            unsigned int read;
            int write;
        };
    } lock;
};

static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
{
    rw->lock.read = 0;
    rw->lock.write = 1;
}

static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
{
    asm volatile("1:\t"
                 "lock; decq %0\n\t"
                 "jns 3f\n\t"
                 "lock incq %0\n\t"
                 "2:\t"
                 "pause\n\t"
                 "cmpq $0x1, %0\n\t"
                 "jns 1b\n\t"
                 "jmp 2b\n\t"
                 "3:"
                 : "+m" (rw->lock.lock) : : "memory");
}

static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
{
    asm volatile("1:\t"
                 "lock; decl %0\n\t"
                 "je 3f\n\t"
                 "lock; incl %0\n\t"
                 "2:\t"
                 "pause\n\t"
                 "cmpl $0x1,%0\n\t"
                 "je 1b\n\t"
                 "jmp 2b\n\t"
                 "3:"
                 : "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
}

static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
{
    ihk_atomic64_t *count = (ihk_atomic64_t *)rw;

    if (ihk_atomic64_sub_return(1, count) >= 0)
        return 1;
    ihk_atomic64_inc(count);
    return 0;
}

static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
{
    ihk_atomic_t *count = (ihk_atomic_t *)&rw->lock.write;

    if (ihk_atomic_dec_and_test(count))
        return 1;
    ihk_atomic_inc(count);
    return 0;
}

static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
{
    asm volatile("lock; incq %0" : "+m" (rw->lock.lock) : : "memory");
}

static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
{
    asm volatile("lock; incl %0"
                 : "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
}

static inline int ihk_mc_write_can_lock(struct ihk_rwlock *rw)
{
    return rw->lock.write == 1;
}

static inline int ihk_mc_read_can_lock(struct ihk_rwlock *rw)
{
    return rw->lock.lock > 0;
}
#endif
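A note on the ihk_rwlock layout above (an inference from the code, assuming the usual little-endian x86-64 layout where `read` is the low and `write` the high 32 bits of `lock`):

/* Unlocked, after ihk_mc_rwlock_init():
 *     lock.lock == 1L << 32          (read == 0, write == 1)
 * One reader inside (ihk_mc_read_lock did "lock decq" on the quadword):
 *     lock.lock == (1L << 32) - 1    (write half borrowed down to 0)
 * so ihk_mc_write_can_lock() fails while any reader is inside, and a
 * writer's "lock decl" on write only reaches zero (je 3f) when write
 * was 1, i.e. when there were no readers. */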
@@ -17,6 +17,7 @@
#define __HEADER_X86_COMMON_ARCH_MEMORY_H

#include <ihk/types.h>
#include <errno.h>

#define KERNEL_CS_ENTRY 4
#define KERNEL_DS_ENTRY 5
@@ -66,8 +67,8 @@
 * Placing the LWK image in the virtual address space at the end of
 * the Linux modules section enables us to map the LWK TEXT in Linux
 * as well, so that Linux can also call into LWK text.
 * It's defined by cmake.
 */
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
#define STACK_TOP(region) ((region)->user_end)

#define MAP_VMAP_SIZE 0x0000000100000000UL
@@ -183,12 +184,10 @@ enum ihk_mc_pt_attribute {

enum ihk_mc_pt_attribute attr_mask;

#ifdef POSTK_DEBUG_ARCH_DEP_12
static inline int pfn_is_write_combined(uintptr_t pfn)
{
    return ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD));
}
#endif /* #ifdef POSTK_DEBUG_ARCH_DEP_12 */

static inline int pte_is_null(pte_t *ptep)
{
@@ -365,6 +364,17 @@ static inline int pgsize_to_tbllv(size_t pgsize)
    return 0;
}

static inline int pgsize_to_pgshift(size_t pgsize)
{
    switch (pgsize) {
    case PTL1_SIZE: return PTL1_SHIFT;
    case PTL2_SIZE: return PTL2_SHIFT;
    case PTL3_SIZE: return PTL3_SHIFT;
    case PTL4_SIZE: return PTL4_SHIFT;
    default: return -EINVAL;
    }
}

static inline size_t tbllv_to_pgsize(int level)
{
    switch (level) {
@@ -13,19 +13,17 @@
#ifndef ARCH_CPU_H
#define ARCH_CPU_H

#define mb() asm volatile("mfence":::"memory")
#define rmb() asm volatile("lfence":::"memory")
#define wmb() asm volatile("sfence" ::: "memory")

#define smp_mb() mb()
#define smp_rmb() rmb()
#define smp_wmb() barrier()

#define arch_barrier() asm volatile("" : : : "memory")

static inline void rmb(void)
{
    arch_barrier();
}

static inline void wmb(void)
{
    arch_barrier();
}

static unsigned long read_tsc(void)
static inline unsigned long read_tsc(void)
{
    unsigned int low, high;

@@ -34,4 +32,21 @@ static unsigned long read_tsc(void)
    return (low | ((unsigned long)high << 32));
}

#define smp_load_acquire(p)                             \
({                                                      \
    typeof(*p) ___p1 = ACCESS_ONCE(*p);                 \
    compiletime_assert_atomic_type(*p);                 \
    barrier();                                          \
    ___p1;                                              \
})

#define smp_store_release(p, v)                         \
({                                                      \
    compiletime_assert_atomic_type(*p);                 \
    barrier();                                          \
    WRITE_ONCE(*p, v);                                  \
})

void arch_flush_icache_all(void);

#endif /* ARCH_CPU_H */
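The acquire/release pair above is what makes the classic message-passing pattern safe; on x86 both reduce to compiler barriers because the hardware already orders ordinary loads and stores strongly enough. A hedged example with illustrative variable names (cpu_pause() is assumed available, as elsewhere in this tree):

static int data;
static int ready;

void producer(void)
{
    data = 42;                       /* plain payload store */
    smp_store_release(&ready, 1);    /* publish: payload ordered before flag */
}

int consumer(void)
{
    while (!smp_load_acquire(&ready))
        cpu_pause();                 /* spin until the flag is published */
    return data;                     /* guaranteed to observe 42 */
}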
@@ -1,32 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED

#define DEBUG_RUSAGE

#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2

extern struct rusage_global rusage;

static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
    int ret = IHK_OS_PGSIZE_4KB;
    switch (pgsize) {
    case PTL1_SIZE:
        ret = IHK_OS_PGSIZE_4KB;
        break;
    case PTL2_SIZE:
        ret = IHK_OS_PGSIZE_2MB;
        break;
    case PTL3_SIZE:
        ret = IHK_OS_PGSIZE_1GB;
        break;
    default:
        kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
        break;
    }
    return ret;
}

#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
@@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
#define HEADER_X86_COMMON_IHK_ATOMIC_H

#include <lwk/compiler.h>

/***********************************************************************
 * ihk_atomic_t
 */
@@ -114,7 +116,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
    return *(volatile long *)&(v)->counter64;
}

static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
{
    v->counter64 = i;
}
@@ -124,6 +126,22 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
    asm volatile ("lock incq %0" : "+m"(v->counter64));
}

static inline long ihk_atomic64_add_return(long i, ihk_atomic64_t *v)
{
    long __i;

    __i = i;
    asm volatile("lock xaddq %0, %1"
                 : "+r" (i), "+m" (v->counter64)
                 : : "memory");
    return i + __i;
}

static inline long ihk_atomic64_sub_return(long i, ihk_atomic64_t *v)
{
    return ihk_atomic64_add_return(-i, v);
}

/***********************************************************************
 * others
 */
@@ -156,43 +174,55 @@ static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
    return __x;
}

#define __xchg(x, ptr, size)                            \
({                                                      \
    __typeof(*(ptr)) __x = (x);                         \
    switch (size) {                                     \
    case 1:                                             \
        asm volatile("xchgb %b0,%1"                     \
                     : "=q" (__x)                       \
                     : "m" (*__xg(ptr)), "0" (__x)      \
                     : "memory");                       \
        break;                                          \
    case 2:                                             \
        asm volatile("xchgw %w0,%1"                     \
                     : "=r" (__x)                       \
                     : "m" (*__xg(ptr)), "0" (__x)      \
                     : "memory");                       \
        break;                                          \
    case 4:                                             \
        asm volatile("xchgl %k0,%1"                     \
                     : "=r" (__x)                       \
                     : "m" (*__xg(ptr)), "0" (__x)      \
                     : "memory");                       \
        break;                                          \
    case 8:                                             \
        asm volatile("xchgq %0,%1"                      \
                     : "=r" (__x)                       \
                     : "m" (*__xg(ptr)), "0" (__x)      \
                     : "memory");                       \
        break;                                          \
    default:                                            \
        panic("xchg for wrong size");                   \
    }                                                   \
    __x;                                                \
})
#define __X86_CASE_B 1
#define __X86_CASE_W 2
#define __X86_CASE_L 4
#define __X86_CASE_Q 8

extern void __xchg_wrong_size(void)
    __compiletime_error("Bad argument size for xchg");

#define xchg(ptr, v) \
    __xchg((v), (ptr), sizeof(*ptr))
/*
 * An exchange-type operation, which takes a value and a pointer, and
 * returns the old value.
 */
#define __xchg_op(ptr, arg, op, lock)                   \
({                                                      \
    __typeof__(*(ptr)) __ret = (arg);                   \
    switch (sizeof(*(ptr))) {                           \
    case __X86_CASE_B:                                  \
        asm volatile (lock #op "b %b0, %1\n"            \
                      : "+q" (__ret), "+m" (*(ptr))     \
                      : : "memory", "cc");              \
        break;                                          \
    case __X86_CASE_W:                                  \
        asm volatile (lock #op "w %w0, %1\n"            \
                      : "+r" (__ret), "+m" (*(ptr))     \
                      : : "memory", "cc");              \
        break;                                          \
    case __X86_CASE_L:                                  \
        asm volatile (lock #op "l %0, %1\n"             \
                      : "+r" (__ret), "+m" (*(ptr))     \
                      : : "memory", "cc");              \
        break;                                          \
    case __X86_CASE_Q:                                  \
        asm volatile (lock #op "q %q0, %1\n"            \
                      : "+r" (__ret), "+m" (*(ptr))     \
                      : : "memory", "cc");              \
        break;                                          \
    default:                                            \
        __xchg_wrong_size();                            \
    }                                                   \
    __ret;                                              \
})

/*
 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
 * Since this is generally used to protect other memory information, we
 * use "asm volatile" and "memory" clobbers to prevent gcc from moving
 * information around.
 */
#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")

static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
                                            unsigned long oldval,
@@ -241,4 +271,66 @@ static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
    return i + __i;
}

extern void __cmpxchg_wrong_size(void)
    __compiletime_error("Bad argument size for cmpxchg");

/*
 * Atomic compare and exchange. Compare OLD with MEM, if identical,
 * store NEW in MEM. Return the initial value in MEM. Success is
 * indicated by comparing RETURN with OLD.
 */
#define __raw_cmpxchg(ptr, old, new, size, lock)                \
({                                                              \
    __typeof__(*(ptr)) __ret;                                   \
    __typeof__(*(ptr)) __old = (old);                           \
    __typeof__(*(ptr)) __new = (new);                           \
    switch (size) {                                             \
    case __X86_CASE_B:                                          \
    {                                                           \
        volatile uint8_t *__ptr = (volatile uint8_t *)(ptr);    \
        asm volatile(lock "cmpxchgb %2,%1"                      \
                     : "=a" (__ret), "+m" (*__ptr)              \
                     : "q" (__new), "0" (__old)                 \
                     : "memory");                               \
        break;                                                  \
    }                                                           \
    case __X86_CASE_W:                                          \
    {                                                           \
        volatile uint16_t *__ptr = (volatile uint16_t *)(ptr);  \
        asm volatile(lock "cmpxchgw %2,%1"                      \
                     : "=a" (__ret), "+m" (*__ptr)              \
                     : "r" (__new), "0" (__old)                 \
                     : "memory");                               \
        break;                                                  \
    }                                                           \
    case __X86_CASE_L:                                          \
    {                                                           \
        volatile uint32_t *__ptr = (volatile uint32_t *)(ptr);  \
        asm volatile(lock "cmpxchgl %2,%1"                      \
                     : "=a" (__ret), "+m" (*__ptr)              \
                     : "r" (__new), "0" (__old)                 \
                     : "memory");                               \
        break;                                                  \
    }                                                           \
    case __X86_CASE_Q:                                          \
    {                                                           \
        volatile uint64_t *__ptr = (volatile uint64_t *)(ptr);  \
        asm volatile(lock "cmpxchgq %2,%1"                      \
                     : "=a" (__ret), "+m" (*__ptr)              \
                     : "r" (__new), "0" (__old)                 \
                     : "memory");                               \
        break;                                                  \
    }                                                           \
    default:                                                    \
        __cmpxchg_wrong_size();                                 \
    }                                                           \
    __ret;                                                      \
})

#define __cmpxchg(ptr, old, new, size) \
    __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")

#define cmpxchg(ptr, old, new) \
    __cmpxchg(ptr, old, new, sizeof(*(ptr)))

#endif
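A hedged usage sketch for the cmpxchg() defined above: a lock-free, saturating increment. cmpxchg() yields the value that was actually in memory, so success is detected by comparing that against the expected old value; `counter` and `CAP` are illustrative, not from this diff:

#define CAP 1024             /* illustrative bound */

static unsigned long counter;

int inc_unless_full(void)
{
    unsigned long old, seen;

    do {
        old = counter;
        if (old >= CAP)
            return 0;                           /* saturated, give up */
        seen = cmpxchg(&counter, old, old + 1);
    } while (seen != old);                      /* lost a race, retry */
    return 1;
}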
@@ -71,7 +71,7 @@
#define MSR_PERF_CTL_0 0xc0010000
#define MSR_PERF_CTR_0 0xc0010004

static unsigned long xgetbv(unsigned int index)
static inline unsigned long xgetbv(unsigned int index)
{
    unsigned int low, high;

@@ -80,7 +80,7 @@ static unsigned long xgetbv(unsigned int index)
    return low | ((unsigned long)high << 32);
}

static void xsetbv(unsigned int index, unsigned long val)
static inline void xsetbv(unsigned int index, unsigned long val)
{
    unsigned int low, high;

@@ -90,7 +90,8 @@ static void xsetbv(unsigned int index, unsigned long val)
    asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
}

static void wrmsr(unsigned int idx, unsigned long value){
static inline void wrmsr(unsigned int idx, unsigned long value)
{
    unsigned int high, low;

    high = value >> 32;
@@ -99,7 +100,7 @@ static void wrmsr(unsigned int idx, unsigned long value){
    asm volatile("wrmsr" : : "c" (idx), "a" (low), "d" (high) : "memory");
}

static unsigned long rdpmc(unsigned int counter)
static inline unsigned long rdpmc(unsigned int counter)
{
    unsigned int high, low;

@@ -108,7 +109,7 @@ static unsigned long rdpmc(unsigned int counter)
    return (unsigned long)high << 32 | low;
}

static unsigned long rdmsr(unsigned int index)
static inline unsigned long rdmsr(unsigned int index)
{
    unsigned int high, low;

@@ -117,7 +118,7 @@ static unsigned long rdmsr(unsigned int index)
    return (unsigned long)high << 32 | low;
}

static unsigned long rdtsc(void)
static inline unsigned long rdtsc(void)
{
    unsigned int high, low;

@@ -126,7 +127,7 @@ static unsigned long rdtsc(void)
    return (unsigned long)high << 32 | low;
}

static void set_perfctl(int counter, int event, int mask)
static inline void set_perfctl(int counter, int event, int mask)
{
    unsigned long value;

@@ -137,7 +138,7 @@ static void set_perfctl(int counter, int event, int mask)
    wrmsr(MSR_PERF_CTL_0 + counter, value);
}

static void start_perfctr(int counter)
static inline void start_perfctr(int counter)
{
    unsigned long value;

@@ -145,7 +146,7 @@ static void start_perfctr(int counter)
    value |= (1 << 22);
    wrmsr(MSR_PERF_CTL_0 + counter, value);
}
static void stop_perfctr(int counter)
static inline void stop_perfctr(int counter)
{
    unsigned long value;

@@ -154,17 +155,17 @@ static void stop_perfctr(int counter)
    wrmsr(MSR_PERF_CTL_0 + counter, value);
}

static void clear_perfctl(int counter)
static inline void clear_perfctl(int counter)
{
    wrmsr(MSR_PERF_CTL_0 + counter, 0);
}

static void set_perfctr(int counter, unsigned long value)
static inline void set_perfctr(int counter, unsigned long value)
{
    wrmsr(MSR_PERF_CTR_0 + counter, value);
}

static unsigned long read_perfctr(int counter)
static inline unsigned long read_perfctr(int counter)
{
    return rdpmc(counter);
}
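A hedged example of driving the perf-counter helpers above end to end; the event code and unit mask are placeholders, not values taken from this diff:

void count_event_example(void)
{
    set_perfctl(0, 0x76, 0x00);    /* program counter 0; event/mask illustrative */
    set_perfctr(0, 0);             /* zero the counter register */
    start_perfctr(0);              /* sets the enable bit (1 << 22) */
    /* ... workload under measurement ... */
    stop_perfctr(0);
    kprintf("events: %lu\n", read_perfctr(0));
    clear_perfctl(0);              /* disable and clear the control MSR */
}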
@@ -84,7 +84,11 @@ enum __rlimit_resource
    __RLIMIT_RTPRIO = 14,
#define RLIMIT_RTPRIO __RLIMIT_RTPRIO

    __RLIMIT_NLIMITS = 15,
    /* timeout for RT tasks in us */
    __RLIMIT_RTTIME = 15,
#define RLIMIT_RTTIME __RLIMIT_RTTIME

    __RLIMIT_NLIMITS = 16,
    __RLIM_NLIMITS = __RLIMIT_NLIMITS
#define RLIMIT_NLIMITS __RLIMIT_NLIMITS
#define RLIM_NLIMITS __RLIM_NLIMITS
@@ -74,6 +74,7 @@ SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(99, sysinfo)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_HANDLED(102, getuid)
@@ -147,24 +148,24 @@ SYSCALL_DELEGATED(266, symlinkat)
SYSCALL_DELEGATED(267, readlinkat)
SYSCALL_DELEGATED(268, fchmodat)
SYSCALL_DELEGATED(269, faccessat)
SYSCALL_DELEGATED(270, pselect6)
SYSCALL_DELEGATED(271, ppoll)
SYSCALL_HANDLED(270, pselect6)
SYSCALL_HANDLED(271, ppoll)
SYSCALL_HANDLED(273, set_robust_list)
SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
#ifdef ENABLE_PERF
SYSCALL_HANDLED(298, perf_event_open)
#endif
SYSCALL_HANDLED(302, prlimit64)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif
SYSCALL_HANDLED(309, getcpu)
SYSCALL_HANDLED(310, process_vm_readv)
SYSCALL_HANDLED(311, process_vm_writev)
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(322, execveat)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef PROFILE_ENABLE
SYSCALL_HANDLED(__NR_profile, profile)
@@ -180,4 +181,8 @@ SYSCALL_HANDLED(802, linux_mlock)
SYSCALL_HANDLED(803, suspend_threads)
SYSCALL_HANDLED(804, resume_threads)
SYSCALL_HANDLED(811, linux_spawn)
/**** End of File ****/

/* Do not edit the lines including this comment and
 * EOF just after it because those are used as a
 * robust marker for the autotest patch.
 */
@@ -1,3 +1,4 @@
/* interrupt.S COPYRIGHT FUJITSU LIMITED 2019 */
/**
 * \file interrupt.S
 * License details are found in the file LICENSE.
@@ -91,6 +92,9 @@ vector=vector+1
.endr

common_interrupt:
#define MULT_INTR_VECTOR 242
    cmp $(MULT_INTR_VECTOR),%rdi
    je 1f
    PUSH_ALL_REGS
    movq ERROR_OFFSET(%rsp), %rdi
    movq %rsp, %rsi
@@ -99,6 +103,19 @@ common_interrupt:
    addq $8, %rsp
    iretq


.globl nmi_handler
nmi_handler:
    cld
    pushq $0    /* error field of x86_basic_regs */
    PUSH_ALL_REGS
    movq %rsp, %rdi
    call multi_nm_interrupt_handler    /* Enter C code */
    POP_ALL_REGS
    addq $8, %rsp
    iretq


.globl __page_fault_handler_address
__page_fault_handler_address:
    .quad 0
@@ -137,74 +154,6 @@ __freeze:
    POP_ALL_REGS
    iretq

.globl nmi
nmi:
#define PANICED 232
#define PANIC_REGS 240
    movq %rax,%gs:PANIC_REGS+0x00
    movq %rsp,%gs:PANIC_REGS+0x08

    movl nmi_mode(%rip),%eax
    cmp $3,%rax
    je 4f
    cmp $1,%rax
    je 1f
    cmp $2,%rax
    jne 3f
1:
    cld
    movq %gs:PANIC_REGS+0x00,%rax
    PUSH_ALL_REGS
    subq $40, %rsp
    movq %rsp,%gs:PANIC_REGS+0x10
    movq %rsp, %rdi
    call freeze_thaw
    cmpq $0, %rax
    jnz 2f
    addq $40, %rsp
2:
    POP_ALL_REGS
    iretq
3:
    movq %rbx,%gs:PANIC_REGS+0x08
    movq %rcx,%gs:PANIC_REGS+0x10
    movq %rdx,%gs:PANIC_REGS+0x18
    movq %rsi,%gs:PANIC_REGS+0x20
    movq %rdi,%gs:PANIC_REGS+0x28
    movq %rbp,%gs:PANIC_REGS+0x30
    movq 0x18(%rsp),%rax    /* rsp */
    movq %rax,%gs:PANIC_REGS+0x38
    movq %r8, %gs:PANIC_REGS+0x40
    movq %r9, %gs:PANIC_REGS+0x48
    movq %r10,%gs:PANIC_REGS+0x50
    movq %r11,%gs:PANIC_REGS+0x58
    movq %r12,%gs:PANIC_REGS+0x60
    movq %r13,%gs:PANIC_REGS+0x68
    movq %r14,%gs:PANIC_REGS+0x70
    movq %r15,%gs:PANIC_REGS+0x78
    movq 0x00(%rsp),%rax    /* rip */
    movq %rax,%gs:PANIC_REGS+0x80
    movq 0x10(%rsp),%rax    /* rflags */
    movl %eax,%gs:PANIC_REGS+0x88
    movq 0x08(%rsp),%rax    /* cs */
    movl %eax,%gs:PANIC_REGS+0x8C
    movq 0x20(%rsp),%rax    /* ss */
    movl %eax,%gs:PANIC_REGS+0x90
    xorq %rax,%rax
    movw %ds,%ax
    movl %eax,%gs:PANIC_REGS+0x94
    movw %es,%ax
    movl %eax,%gs:PANIC_REGS+0x98
    movw %fs,%ax
    movl %eax,%gs:PANIC_REGS+0x9C
    movw %gs,%ax
    movl %eax,%gs:PANIC_REGS+0xA0
    movq $1,%gs:PANICED
    call ihk_mc_query_mem_areas
4:
    hlt
    jmp 4b

.globl x86_syscall
x86_syscall:
    cld
@@ -14,7 +14,6 @@
 */

#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <memory.h>
@@ -26,7 +25,7 @@
#include <cls.h>
#include <kmalloc.h>
#include <rusage_private.h>
#include <debug.h>
#include <ihk/debug.h>

//#define DEBUG

@@ -38,6 +37,7 @@
static char *last_page;
extern char _head[], _end[];

extern unsigned long linux_page_offset_base;
extern unsigned long x86_kernel_phys_base;

/* Arch specific early allocation routine */
@@ -1355,109 +1355,6 @@ struct clear_range_args {
    int max_nr_addr;
};

#ifdef POSTK_DEBUG_ARCH_DEP_8
void remote_flush_tlb_cpumask(struct process_vm *vm,
        unsigned long addr, int cpu_id)
{
    unsigned long __addr = addr;
    return remote_flush_tlb_array_cpumask(vm, &__addr, 1, cpu_id);
}

void remote_flush_tlb_array_cpumask(struct process_vm *vm,
        unsigned long *addr,
        int nr_addr,
        int cpu_id)
{
    unsigned long cpu;
    int flush_ind;
    struct tlb_flush_entry *flush_entry;
    cpu_set_t _cpu_set;

    if (addr[0]) {
        flush_ind = (addr[0] >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
    }
    /* Zero address denotes full TLB flush */
    else {
        /* Random.. */
        flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
    }

    flush_entry = &tlb_flush_vector[flush_ind];

    /* Take a copy of the cpu set so that we don't hold the lock
     * all the way while interrupting other cores */
    ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
    memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
    ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);

    dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);

    ihk_mc_spinlock_lock_noirq(&flush_entry->lock);

    flush_entry->vm = vm;
    flush_entry->addr = addr;
    flush_entry->nr_addr = nr_addr;
    ihk_atomic_set(&flush_entry->pending, 0);

    dkprintf("lock aquired, iterating cpu mask.. flush_ind: %d\n", flush_ind);

    /* Loop through CPUs in this address space and interrupt them for
     * TLB flush on the specified address */
    for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {

        if (ihk_mc_get_processor_id() == cpu)
            continue;

        ihk_atomic_inc(&flush_entry->pending);
        dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
            flush_ind, addr, cpu);

#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */
        /* TODO(pka_idke) Interim support */
        ihk_mc_interrupt_cpu(cpu,
            ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
#else /* POSTK_DEBUG_ARCH_DEP_8 */
        ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu)->apic_id,
            flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START);
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
    }

#ifdef DEBUG_IC_TLB
    {
        unsigned long tsc;
        tsc = rdtsc() + 12884901888;    /* 1.2GHz =>10 sec */
#endif
        if (flush_entry->addr[0]) {
            int i;

            for (i = 0; i < flush_entry->nr_addr; ++i) {
                flush_tlb_single(flush_entry->addr[i] & PAGE_MASK);
            }
        }
        /* Zero address denotes full TLB flush */
        else {
            flush_tlb();
        }

        /* Wait for all cores */
        while (ihk_atomic_read(&flush_entry->pending) != 0) {
            cpu_pause();

#ifdef DEBUG_IC_TLB
            if (rdtsc() > tsc) {
                kprintf("waited 10 secs for remote TLB!! -> panic_all()\n");
                panic_all_cores("waited 10 secs for remote TLB!!\n");
            }
#endif
        }
#ifdef DEBUG_IC_TLB
    }
#endif

    ihk_mc_spinlock_unlock_noirq(&flush_entry->lock);
}
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
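For orientation, a hedged sketch of the receiving side of the shootdown protocol used by the removed code above (the actual IPI handler is not part of this hunk, and ihk_atomic_dec() is assumed to exist alongside ihk_atomic_inc()): each interrupted core flushes what the entry describes, then decrements `pending`, which is exactly what the initiator spins on:

void tlb_flush_ipi_handler_sketch(int flush_ind)
{
    struct tlb_flush_entry *e = &tlb_flush_vector[flush_ind];
    int i;

    if (e->addr[0]) {
        for (i = 0; i < e->nr_addr; ++i)
            flush_tlb_single(e->addr[i] & PAGE_MASK);
    }
    else {
        flush_tlb();    /* zero address means full flush */
    }

    ihk_atomic_dec(&e->pending);    /* acknowledge to the initiator */
}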
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
        unsigned long addr)
{
@@ -1622,7 +1519,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
{
    struct clear_range_args *args = args0;
    int error;
    uint64_t phys;
    uint64_t phys = 0;
    pte_t old;
    struct page *page;
    struct page_table *pt;
@@ -2572,10 +2469,10 @@ static void init_linux_kernel_mapping(struct page_table *pt)
    map_start = 0;
    map_end = 0x20000000000;

    virt = (void *)LINUX_PAGE_OFFSET;
    virt = (void *)linux_page_offset_base;

    kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
        LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
        virt, virt + map_end, 0, map_end);

    for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
        if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
@@ -2599,9 +2496,11 @@ static void init_linux_kernel_mapping(struct page_table *pt)
    }

    dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
        LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
        linux_page_offset_base + map_start,
        linux_page_offset_base + map_end,
        map_start, map_end);

    virt = (void *)(LINUX_PAGE_OFFSET + map_start);
    virt = (void *)(linux_page_offset_base + map_start);
    for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
        if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
            kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
@@ -2652,7 +2551,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
        attr |= PTATTR_UNCACHABLE;
    }

    kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
    dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
        paligned, v, npages);

    for (i = 0; i < npages; i++) {
@@ -2745,12 +2644,12 @@ unsigned long virt_to_phys(void *v)
    unsigned long va = (unsigned long)v;

    if (va >= MAP_KERNEL_START) {
        dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
        dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= linux_page_offset_base\n",
            __FUNCTION__, va);
        return va - MAP_KERNEL_START + x86_kernel_phys_base;
    }
    else if (va >= LINUX_PAGE_OFFSET) {
        return va - LINUX_PAGE_OFFSET;
    else if (va >= linux_page_offset_base) {
        return va - linux_page_offset_base;
    }
    else if (va >= MAP_FIXED_START) {
        return va - MAP_FIXED_START;
@@ -2769,7 +2668,7 @@ void *phys_to_virt(unsigned long p)
    return (void *)(p + MAP_ST_START);
}

    return (void *)(p + LINUX_PAGE_OFFSET);
    return (void *)(p + linux_page_offset_base);
}

int copy_from_user(void *dst, const void *src, size_t siz)
@@ -12,12 +12,12 @@
#include <march.h>
#include <errno.h>
#include <cls.h>
#include <ihk/debug.h>
#include <ihk/cpu.h>
#include <registers.h>
#include <mc_perf_event.h>
#include <config.h>
#include <debug.h>
#include <ihk/debug.h>
#include <process.h>

extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
@@ -223,41 +223,6 @@ int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
}

#ifdef POSTK_DEBUG_TEMP_FIX_29
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
#else
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
{
#ifdef POSTK_DEBUG_TEMP_FIX_29
    enum ihk_perfctr_type type;

    switch (config) {
    case PERF_COUNT_HW_CPU_CYCLES :
        type = APT_TYPE_CYCLE;
        break;
    case PERF_COUNT_HW_INSTRUCTIONS :
        type = APT_TYPE_INSTRUCTIONS;
        break;
    default :
        // Not supported config.
        type = PERFCTR_MAX_TYPE;
    }
#endif /*POSTK_DEBUG_TEMP_FIX_29*/

    if (counter < 0 || counter >= NUM_PERF_COUNTERS) {
        return -EINVAL;
    }
    if (type < 0 || type >= PERFCTR_MAX_TYPE) {
        return -EINVAL;
    }
    if (!x86_march_perfmap[type]) {
        return -EINVAL;
    }

    return set_perfctr_x86_direct(counter, mode, x86_march_perfmap[type]);
}

int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
{
    struct thread *thread = cpu_local_var(current);
@@ -412,6 +377,23 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
    return 0;
}

int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
{
    int ret = -EINVAL;
    int i = 0;
    const int counters = ihk_mc_perf_get_num_counters();

    // find avail generic counter
    for (i = 0; i < counters; i++) {
        if (!(thread->pmc_alloc_map & (1 << i))) {
            ret = i;
            break;
        }
    }

    return ret;
}

unsigned long ihk_mc_perfctr_read(int counter)
{
    unsigned long retval = 0;
@@ -439,6 +421,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
    return retval;
}

unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
{
    unsigned long count = ihk_mc_perfctr_read(counter) + correction;

    count &= 0x000000ffffffffffL;
    return count;
}

// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
@@ -513,3 +503,18 @@ int ihk_mc_perf_get_num_counters(void)
{
    return NUM_PERF_COUNTERS;
}

int hw_perf_event_init(struct mc_perf_event *event)
{
    return 0;
}

int ihk_mc_event_set_period(struct mc_perf_event *event)
{
    return 0;
}

uint64_t ihk_mc_event_update(struct mc_perf_event *event)
{
    return 0;
}
@ -16,7 +16,6 @@
|
||||
*/
|
||||
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <cls.h>
|
||||
#include <cpulocal.h>
|
||||
#include <syscall.h>
|
||||
@ -32,7 +31,8 @@
|
||||
#include <page.h>
|
||||
#include <limits.h>
|
||||
#include <syscall.h>
|
||||
#include <debug.h>
|
||||
#include <rusage_private.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
void terminate_mcexec(int, int);
|
||||
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
|
||||
@ -64,7 +64,6 @@ uintptr_t debug_constants[] = {
|
||||
-1,
|
||||
};
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 2
|
||||
struct vdso {
|
||||
long busy;
|
||||
@ -80,8 +79,24 @@ struct vdso {
|
||||
long hpet_phys;
|
||||
void *pvti_virt;
|
||||
long pvti_phys;
|
||||
void *vgtod_virt;
|
||||
};
|
||||
|
||||
struct vsyscall_gtod_data {
|
||||
int seq;
|
||||
|
||||
struct {
|
||||
int vclock_mode;
|
||||
unsigned long cycle_last;
|
||||
unsigned long mask;
|
||||
unsigned int mult;
|
||||
unsigned int shift;
|
||||
} clock;
|
||||
|
||||
/* open coded 'struct timespec' */
|
||||
time_t wall_time_sec;
|
||||
unsigned long wall_time_snsec;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
static struct vdso vdso;
|
||||
static size_t container_size = 0;
|
||||
@ -132,44 +147,6 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last) {
|
||||
return min_cpu;
|
||||
}
|
||||
|
||||
int
|
||||
arch_clear_host_user_space()
|
||||
{
|
||||
struct thread *th = cpu_local_var(current);
|
||||
|
||||
/* XXX: might be unnecessary */
|
||||
clear_host_pte(th->vm->region.user_start,
|
||||
(th->vm->region.user_end - th->vm->region.user_start));
|
||||
return 0;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(rt_sigaction)
|
||||
{
|
||||
int sig = ihk_mc_syscall_arg0(ctx);
|
||||
const struct sigaction *act = (const struct sigaction *)ihk_mc_syscall_arg1(ctx);
|
||||
struct sigaction *oact = (struct sigaction *)ihk_mc_syscall_arg2(ctx);
|
||||
size_t sigsetsize = ihk_mc_syscall_arg3(ctx);
|
||||
struct k_sigaction new_sa, old_sa;
|
||||
int rc;
|
||||
|
||||
if (sigsetsize != sizeof(sigset_t))
|
||||
return -EINVAL;
|
||||
|
||||
if(act)
|
||||
if(copy_from_user(&new_sa.sa, act, sizeof new_sa.sa)){
|
||||
goto fault;
|
||||
}
|
||||
rc = do_sigaction(sig, act? &new_sa: NULL, oact? &old_sa: NULL);
|
||||
if(rc == 0 && oact)
|
||||
if(copy_to_user(oact, &old_sa.sa, sizeof old_sa.sa)){
|
||||
goto fault;
|
||||
}
|
||||
|
||||
return rc;
|
||||
fault:
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(prctl)
|
||||
{
|
||||
struct process *proc = cpu_local_var(current)->proc;
|
||||
@ -558,7 +535,7 @@ long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov)
|
||||
return rc;
|
||||
}
|
||||
|
||||
extern void coredump(struct thread *thread, void *regs);
|
||||
extern int coredump(struct thread *thread, void *regs, int sig);
|
||||
|
||||
void ptrace_report_signal(struct thread *thread, int sig)
|
||||
{
|
||||
@ -726,6 +703,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
int restart = 0;
|
||||
int ret;
|
||||
|
||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||
@ -971,15 +949,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
dkprintf("SIGTRAP(): woken up\n");
|
||||
break;
|
||||
case SIGCONT:
|
||||
memset(&info, '\0', sizeof info);
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
|
||||
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
|
||||
proc->status = PS_RUNNING;
|
||||
dkprintf("do_signal,SIGCONT,do nothing\n");
|
||||
break;
|
||||
case SIGQUIT:
|
||||
case SIGILL:
|
||||
@ -991,9 +960,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
case SIGXCPU:
|
||||
case SIGXFSZ:
|
||||
core:
|
||||
dkprintf("do_signal,default,core,sig=%d\n", sig);
|
||||
coredump(thread, regs);
|
||||
coredumped = 0x80;
|
||||
thread->coredump_regs =
|
||||
kmalloc(sizeof(struct x86_user_context),
|
||||
IHK_MC_AP_NOWAIT);
|
||||
if (!thread->coredump_regs) {
|
||||
kprintf("%s: Out of memory\n", __func__);
|
||||
goto skip;
|
||||
}
|
||||
memcpy(thread->coredump_regs, regs,
|
||||
sizeof(struct x86_user_context));
|
||||
|
||||
ret = coredump(thread, regs, sig);
|
||||
switch (ret) {
|
||||
case -EBUSY:
|
||||
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
|
||||
__func__);
|
||||
break;
|
||||
case 0:
|
||||
coredumped = 0x80;
|
||||
break;
|
||||
default:
|
||||
kprintf("%s: ERROR: coredump failed (%d)\n",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
terminate(0, sig | coredumped);
|
||||
break;
|
||||
case SIGCHLD:
|
||||
@ -1010,80 +1001,6 @@ out:
|
||||
return restart;
|
||||
}
|
||||
|
||||
static struct sig_pending *
|
||||
getsigpending(struct thread *thread, int delflag){
|
||||
struct list_head *head;
|
||||
mcs_rwlock_lock_t *lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
struct sig_pending *next;
|
||||
struct sig_pending *pending;
|
||||
__sigset_t w;
|
||||
__sigset_t x;
|
||||
int sig;
|
||||
struct k_sigaction *k;
|
||||
|
||||
w = thread->sigmask.__val[0];
|
||||
|
||||
lock = &thread->sigcommon->lock;
|
||||
head = &thread->sigcommon->sigpending;
|
||||
for(;;) {
|
||||
if (delflag) {
|
||||
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
||||
}
|
||||
else {
|
||||
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(pending, next, head, list){
|
||||
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
if(delflag ||
|
||||
(sig != SIGCHLD && sig != SIGURG) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
k->sa.sa_handler != NULL)){
|
||||
if(!(pending->sigmask.__val[0] & w)){
|
||||
if(delflag)
|
||||
list_del(&pending->list);
|
||||
|
||||
if (delflag) {
|
||||
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
||||
}
|
||||
else {
|
||||
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
|
||||
}
|
||||
return pending;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (delflag) {
|
||||
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
||||
}
|
||||
else {
|
||||
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
|
||||
}
|
||||
|
||||
if(lock == &thread->sigpendinglock)
|
||||
return NULL;
|
||||
|
||||
lock = &thread->sigpendinglock;
|
||||
head = &thread->sigpending;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct sig_pending *
|
||||
hassigpending(struct thread *thread)
|
||||
{
|
||||
if (list_empty(&thread->sigpending) &&
|
||||
list_empty(&thread->sigcommon->sigpending)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return getsigpending(thread, 0);
|
||||
}
|
||||
|
||||
int
|
||||
interrupt_from_user(void *regs0)
|
||||
{
|
||||
@ -1098,170 +1015,6 @@ void save_syscall_return_value(int num, unsigned long rc)
|
||||
return;
|
||||
}
|
||||
|
||||
/** \brief check arrived signals and processing
|
||||
*
|
||||
* @param rc return value of syscall
|
||||
* @param regs0 context
|
||||
* @param num syscall number (-1: Not called on exiting system call)
|
||||
*/
|
||||
void
|
||||
check_signal(unsigned long rc, void *regs0, int num)
|
||||
{
|
||||
struct x86_user_context *regs = regs0;
|
||||
struct thread *thread;
|
||||
struct sig_pending *pending;
|
||||
int irqstate;
|
||||
|
||||
if(clv == NULL)
|
||||
return;
|
||||
thread = cpu_local_var(current);
|
||||
|
||||
if(thread == NULL || thread == &cpu_local_var(idle)){
|
||||
struct thread *t;
|
||||
|
||||
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
|
||||
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
||||
if(t == &cpu_local_var(idle))
|
||||
continue;
|
||||
if(t->status == PS_INTERRUPTIBLE &&
|
||||
hassigpending(t)){
|
||||
t->status = PS_RUNNING;
|
||||
break;
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if(regs != NULL && !interrupt_from_user(regs)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (list_empty(&thread->sigpending) &&
|
||||
list_empty(&thread->sigcommon->sigpending)) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
for(;;){
|
||||
pending = getsigpending(thread, 1);
|
||||
if(!pending) {
|
||||
dkprintf("check_signal,queue is empty\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (do_signal(rc, regs, thread, pending, num)) {
|
||||
num = -1;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return;
|
||||
}
|
||||
|
||||
static int
|
||||
check_sig_pending_thread(struct thread *thread)
|
||||
{
|
||||
int found = 0;
|
||||
struct list_head *head;
|
||||
mcs_rwlock_lock_t *lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
struct sig_pending *next;
|
||||
struct sig_pending *pending;
|
||||
__sigset_t w;
|
||||
__sigset_t x;
|
||||
int sig = 0;
|
||||
struct k_sigaction *k;
|
||||
struct cpu_local_var *v;
|
||||
|
||||
v = get_this_cpu_local_var();
|
||||
w = thread->sigmask.__val[0];
|
||||
|
||||
lock = &thread->sigcommon->lock;
|
||||
head = &thread->sigcommon->sigpending;
|
||||
for (;;) {
|
||||
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
|
||||
|
||||
list_for_each_entry_safe(pending, next, head, list){
|
||||
for (x = pending->sigmask.__val[0], sig = 0; x;
|
||||
sig++, x >>= 1);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGCHLD && sig != SIGURG) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
k->sa.sa_handler != NULL)) {
|
||||
if (!(pending->sigmask.__val[0] & w)) {
|
||||
if (pending->interrupted == 0) {
|
||||
pending->interrupted = 1;
|
||||
found = 1;
|
||||
if (sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
!k->sa.sa_handler) {
|
||||
found = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
|
||||
|
||||
if (found == 2) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (lock == &thread->sigpendinglock) {
|
||||
break;
|
||||
}
|
||||
|
||||
lock = &thread->sigpendinglock;
|
||||
head = &thread->sigpending;
|
||||
}
|
||||
|
||||
if (found == 2) {
|
||||
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
|
||||
terminate_mcexec(0, sig);
|
||||
return 1;
|
||||
}
|
||||
else if (found == 1) {
|
||||
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
|
||||
interrupt_syscall(thread, 0);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void
|
||||
check_sig_pending(void)
|
||||
{
|
||||
struct thread *thread;
|
||||
struct cpu_local_var *v;
|
||||
|
||||
if (clv == NULL)
|
||||
return;
|
||||
|
||||
v = get_this_cpu_local_var();
|
||||
repeat:
|
||||
v->runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
|
||||
list_for_each_entry(thread, &(v->runq), sched_list) {
|
||||
|
||||
if (thread == NULL || thread == &cpu_local_var(idle)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (thread->in_syscall_offload == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (thread->proc->group_exit_status & 0x0000000100000000L) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (check_sig_pending_thread(thread))
|
||||
goto repeat;
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
|
||||
}
|
||||
|
||||
unsigned long
|
||||
do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
int ptracecont)
|
||||
@ -1278,7 +1031,6 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
struct list_head *head = NULL;
|
||||
int rc;
|
||||
unsigned long irqstate = 0;
|
||||
struct k_sigaction *k;
|
||||
int doint;
|
||||
int found = 0;
|
||||
siginfo_t info0;
|
||||
@ -1288,6 +1040,7 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
struct process_hash *phash = rset->process_hash;
|
||||
struct mcs_rwlock_node lock;
|
||||
struct mcs_rwlock_node updatelock;
|
||||
struct sig_pending *pending = NULL;
|
||||
|
||||
if(sig > 64 || sig < 0)
|
||||
return -EINVAL;
|
||||
@ -1509,54 +1262,70 @@ done:
|
||||
|
||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||
|
||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
||||
because target ptraced thread must call ptrace_report_signal
|
||||
in check_signal */
|
||||
rc = 0;
|
||||
k = tthread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
(k->sa.sa_handler != NULL ||
|
||||
(sig != SIGCHLD && sig != SIGURG)))) {
|
||||
struct sig_pending *pending = NULL;
|
||||
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list){
|
||||
if(pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(&pending->list == head)
|
||||
pending = NULL;
|
||||
|
||||
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list) {
|
||||
if (pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(pending == NULL){
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if(!pending){
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else{
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if(sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
if (&pending->list == head)
|
||||
pending = NULL;
|
||||
}
|
||||
if (pending == NULL) {
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if (!pending) {
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else {
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if (sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
}
|
||||
|
||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||
cpu_restore_interrupt(irqstate);
|
||||
|
||||
if (sig == SIGCONT || ptracecont == 1) {
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
|
||||
struct siginfo info;
|
||||
|
||||
tthread->proc->main_thread->signal_flags =
|
||||
SIGNAL_STOP_CONTINUED;
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
memset(&info, '\0', sizeof(info));
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = tthread->proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(tthread, tthread->proc->parent->pid, -1,
|
||||
SIGCHLD, &info, 0);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
if (thread != tthread) {
|
||||
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||
ihk_mc_get_vector(IHK_GV_IKC));
|
||||
}
|
||||
doint = 0;
|
||||
}
|
||||
}
|
||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||
int status = tthread->status;
|
||||
|
||||
if (thread != tthread) {
|
||||
dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
|
||||
tproc->pid, tthread->cpu_id);
|
||||
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(tthread->cpu_id)->apic_id, 0xd0);
|
||||
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||
ihk_mc_get_vector(IHK_GV_IKC));
|
||||
}
|
||||
|
||||
if (status != PS_RUNNING) {
|
||||
@ -1564,11 +1333,6 @@ done:
|
||||
/* Wake up the target only when stopped by ptrace-reporting */
|
||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||
}
|
||||
else if(sig == SIGCONT || ptracecont == 1){
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
}
|
||||
else {
|
||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||
}
|
||||
@ -1593,7 +1357,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
|
||||
}
|
||||
|
||||
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
||||
coredump(thread, regs0);
|
||||
coredump(thread, regs0, sig);
|
||||
terminate(0, sig | 0x80);
|
||||
}
|
||||
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
||||
@ -1629,7 +1393,7 @@ SYSCALL_DECLARE(mmap)
|
||||
;
|
||||
|
||||
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
||||
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
const int prot = ihk_mc_syscall_arg2(ctx);
|
||||
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
||||
const int fd = ihk_mc_syscall_arg4(ctx);
|
||||
@ -1668,7 +1432,9 @@ SYSCALL_DECLARE(mmap)
|
||||
if (flags & MAP_HUGETLB) {
|
||||
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
|
||||
case 0:
|
||||
flags |= MAP_HUGE_2MB; /* default hugepage size */
|
||||
/* default hugepage size */
|
||||
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
break;
|
||||
|
||||
case MAP_HUGE_2MB:
|
||||
@ -1684,16 +1450,29 @@ SYSCALL_DECLARE(mmap)
|
||||
}
|
||||
|
||||
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
||||
/* Round-up map length by pagesize */
|
||||
len0 = ALIGN(len0, pgsize);
|
||||
|
||||
if (rusage_check_overmap(len0,
|
||||
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
||||
addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
|
||||
addr = addr0;
|
||||
len = (len0 + pgsize - 1) & ~(pgsize - 1);
|
||||
recheck:
|
||||
if ((addr & (pgsize - 1))
|
||||
|| (len == 0)
|
||||
|| !(flags & (MAP_SHARED | MAP_PRIVATE))
|
||||
|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
|
||||
|| (off0 & (pgsize - 1))) {
|
||||
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||
addr = VALID_DUMMY_ADDR;
|
||||
goto recheck;
|
||||
}
|
||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
|
||||
addr0, len0, prot, flags0, fd, off0);
|
||||
error = -EINVAL;
|
||||
@ -1703,6 +1482,10 @@ SYSCALL_DECLARE(mmap)
|
||||
if (addr < region->user_start
|
||||
|| region->user_end <= addr
|
||||
|| len > (region->user_end - region->user_start)) {
|
||||
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||
addr = VALID_DUMMY_ADDR;
|
||||
goto recheck;
|
||||
}
|
||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
|
||||
addr0, len0, prot, flags0, fd, off0);
|
||||
error = -ENOMEM;
|
||||
@ -1730,10 +1513,20 @@ out:
|
||||
|
||||
SYSCALL_DECLARE(clone)
|
||||
{
|
||||
return do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
||||
struct process *proc = cpu_local_var(current)->proc;
|
||||
struct mcs_rwlock_node_irqsave lock_dump;
|
||||
unsigned long ret;
|
||||
|
||||
/* mutex coredump */
|
||||
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
|
||||
|
||||
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
||||
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
|
||||
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
|
||||
ihk_mc_syscall_sp(ctx));
|
||||
|
||||
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
|
||||
return ret;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(fork)
|
||||
@ -1761,7 +1554,9 @@ SYSCALL_DECLARE(shmget)
|
||||
int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);
|
||||
|
||||
if (hugeshift == 0) {
|
||||
shmflg |= SHM_HUGE_2MB; /* default hugepage size */
|
||||
/* default hugepage size */
|
||||
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
} else if (hugeshift == SHM_HUGE_2MB ||
|
||||
hugeshift == SHM_HUGE_1GB) {
|
||||
/*nop*/
|
||||
@@ -2210,7 +2005,7 @@ int do_process_vm_read_writev(int pid,
	}

	/* Check if parameters are okay */
	ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
	ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

	range = lookup_process_memory_range(lthread->vm,
					    (uintptr_t)local_iov,
@@ -2232,7 +2027,7 @@ int do_process_vm_read_writev(int pid,

	ret = 0;
arg_out:
	ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
	ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

	if (ret != 0) {
		goto out;
@@ -2301,7 +2096,7 @@ arg_out:
	if (pli != li) {
		struct vm_range *range;

		ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
		ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

		/* Is base valid? */
		range = lookup_process_memory_range(lthread->vm,
@@ -2331,7 +2126,7 @@ arg_out:

	ret = 0;
pli_out:
	ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
	ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

	if (ret != 0) {
		goto out;
@@ -2344,7 +2139,7 @@ pli_out:
	if (pri != ri) {
		struct vm_range *range;

		ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
		ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);

		/* Is base valid? */
		range = lookup_process_memory_range(rvm,
@@ -2374,7 +2169,7 @@ pli_out:

	ret = 0;
pri_out:
	ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
	ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);

	if (ret != 0) {
		goto out;
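All four hunks make the same substitution: the range lookups only read the memory-range tree, so the exclusive spinlock becomes the read side of an rwspinlock and concurrent lookups no longer serialize. This kernel path backs the process_vm_readv(2)/process_vm_writev(2) system calls; a self-contained user-space view of what the function ultimately implements (reading from our own pid keeps the example runnable):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char src[32] = "copied across address spaces";
	char dst[32] = "";
	struct iovec local = { .iov_base = dst, .iov_len = sizeof(dst) };
	struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

	/* one-copy transfer between two processes' memory */
	ssize_t n = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);

	printf("copied %zd bytes: %s\n", n, dst);
	return n < 0;
}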
@@ -2811,4 +2606,46 @@ time_t time(void) {
	return ret;
}

void calculate_time_from_tsc(struct timespec *ts)
{
	unsigned long seq;
	unsigned long seq2;
	unsigned long ns;
	unsigned long delta;
	struct vsyscall_gtod_data *gtod = vdso.vgtod_virt;

	do {
		for (;;) {
			seq = ACCESS_ONCE(gtod->seq);
			if (unlikely(seq & 1)) {
				cpu_pause();
				continue;
			}
			break;
		}
		rmb(); /* fetch sequence before time */
		ts->tv_sec = gtod->wall_time_sec;
		ns = gtod->wall_time_snsec;
		delta = rdtsc() - gtod->clock.cycle_last;
		ns += delta * gtod->clock.mult;
		ns >>= gtod->clock.shift;
		seq2 = ACCESS_ONCE(gtod->seq);
		rmb(); /* fetch time before checking sequence */
	} while (seq != seq2);
	ts->tv_nsec = ns;

	if (ts->tv_nsec >= NS_PER_SEC) {
		ts->tv_nsec -= NS_PER_SEC;
		++ts->tv_sec;
	}
}
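The loop above is a seqlock reader: an odd sequence value means the writer is mid-update, and a changed value after the data reads means the sample is torn and must be retried. A minimal C11 sketch of the same protocol (the fences stand in for the kernel's rmb(); field names are illustrative):

#include <stdatomic.h>

struct gtod_sample {
	_Atomic unsigned long seq;
	unsigned long sec, nsec;
};

void read_sample(struct gtod_sample *g,
		 unsigned long *sec, unsigned long *nsec)
{
	unsigned long s1, s2;

	do {
		/* spin while the writer holds an odd sequence */
		while ((s1 = atomic_load_explicit(&g->seq,
				memory_order_acquire)) & 1)
			;
		*sec = g->sec;
		*nsec = g->nsec;
		/* order the data reads before re-reading the sequence */
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&g->seq, memory_order_relaxed);
	} while (s1 != s2);	/* retry if the writer intervened */
}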
extern void ptrace_syscall_event(struct thread *thread);
long arch_ptrace_syscall_event(struct thread *thread,
			       ihk_mc_user_context_t *ctx, long setret)
{
	ihk_mc_syscall_ret(ctx) = setret;
	ptrace_syscall_event(thread);
	return ihk_mc_syscall_ret(ctx);
}
/*** End of File ***/
@@ -1,10 +0,0 @@
[Unit]
Description=irqbalance daemon
After=syslog.target

[Service]
EnvironmentFile=/tmp/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS

[Install]
WantedBy=multi-user.target
@@ -1,150 +0,0 @@
# mcoverlay-create-smp-x86.sh.in COPYRIGHT FUJITSU LIMITED 2018
# Overlay /proc, /sys with McKernel specific contents

#
# Revert any state that has been initialized before the error occurred.
#
if [ -z "$(declare -f error_exit)" ]; then
	error_exit() {
		local status=$1

		case $status in
		mcos_sys_mounted)
			if [ "$enable_mcoverlay" == "yes" ]; then
				umount /tmp/mcos/mcos0_sys
			fi
			;&
		mcos_proc_mounted)
			if [ "$enable_mcoverlay" == "yes" ]; then
				umount /tmp/mcos/mcos0_proc
			fi
			;&
		mcoverlayfs_loaded)
			if [ "$enable_mcoverlay" == "yes" ]; then
				rmmod mcoverlay 2>/dev/null
			fi
			;&
		linux_proc_bind_mounted)
			if [ "$enable_mcoverlay" == "yes" ]; then
				umount /tmp/mcos/linux_proc
			fi
			;&
		tmp_mcos_mounted)
			if [ "$enable_mcoverlay" == "yes" ]; then
				umount /tmp/mcos
			fi
			;&
		tmp_mcos_created)
			if [ "$enable_mcoverlay" == "yes" ]; then
				rm -rf /tmp/mcos
			fi
			;&
		initial)
			# Nothing more to revert
			;;
		esac

		# Return -EINVAL
		exit -22
	}
fi

if [ ! -e /tmp/mcos ]; then
	mkdir -p /tmp/mcos;
fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
	echo "error: mount /tmp/mcos" >&2
	error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then
	mkdir -p /tmp/mcos/linux_proc;
fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
	echo "error: mount /tmp/mcos/linux_proc" >&2
	error_exit "tmp_mcos_mounted"
fi
if ! taskset -c 0 insmod @KMODDIR@/mcoverlay.ko 2>/dev/null; then
	echo "error: inserting mcoverlay.ko" >&2
	error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
	sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then
	mkdir -p /tmp/mcos/mcos0_proc;
fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
	mkdir -p /tmp/mcos/mcos0_proc_upper;
fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
	mkdir -p /tmp/mcos/mcos0_proc_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
	echo "error: mounting /tmp/mcos/mcos0_proc" >&2
	error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /proc

while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
	sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then
	mkdir -p /tmp/mcos/mcos0_sys;
fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
	mkdir -p /tmp/mcos/mcos0_sys_upper;
fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
	mkdir -p /tmp/mcos/mcos0_sys_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
	echo "error: mount /tmp/mcos/mcos0_sys" >&2
	error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /sys

touch /tmp/mcos/mcos0_proc/mckernel

rm -rf /tmp/mcos/mcos0_sys/setup_complete

# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
	if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
		rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
		rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
		rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
	else
		for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
			if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
				rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
			fi
		done
	fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
	if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
		rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
		rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
	else
		# Delete non-existent symlinks
		for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
			if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
				rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
			fi
		done

		rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
	fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
	if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
		rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
	fi
done

exit 0
@@ -1,16 +0,0 @@
# Remove mcoverlay if loaded

if grep mcoverlay /proc/modules &>/dev/null; then
	if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
	if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
	if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
	if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
	if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
	if ! rmmod mcoverlay 2>/dev/null; then
		echo "error: removing mcoverlay" >&2
		# Return -EINVAL
		exit -22
	fi
fi

exit 0
cmake/modules/AutoconfHelper.cmake (new file, 383 lines)
@@ -0,0 +1,383 @@
# Helper functions for translating autoconf projects. Several functions
# are lifted from the Mono sources.

include (CheckCSourceCompiles)
include (CheckIncludeFile)
include (TestBigEndian)
include (CheckFunctionExists)
include (CheckTypeSize)
include (CheckCSourceRuns)


# Function to get the version information from the configure.ac file in the
# current directory. Its argument is the name of the library as passed to
# AC_INIT. It will set the variables ${LIBNAME}_VERSION and ${LIBNAME}_SOVERSION.
function (ac_get_version libname)
	string(TOUPPER "${libname}" libname_upper)

	# Read the relevant content from configure.ac
	file (STRINGS configure.ac tmp_configure_ac
		REGEX "${libname_upper}_[_A-Z]+=[ \\t]*[0-9]+")

	# Product version
	string (REGEX REPLACE ".+MAJOR[_A-Z]+=([0-9]+).+MINOR[_A-Z]+=([0-9]+).+MICRO[_A-Z]+=([0-9]+).*"
		"\\1.\\2.\\3" ${libname_upper}_VERSION "${tmp_configure_ac}")

	# Library version for libtool
	string (REGEX REPLACE ".+CURRENT=([0-9]+).+REVISION=([0-9]+).+AGE=([0-9]+).*"
		"\\1.\\2.\\3" ${libname_upper}_SOVERSION "${tmp_configure_ac}")

	# Check whether the version string needs to be displayed
	set (${libname_upper}_DISPLAYSTR_AUX
		"Found ${libname} version ${${libname_upper}_VERSION}, soversion ${${libname_upper}_SOVERSION} from configure.ac"
	)
	if ((NOT ${libname_upper}_DISPLAYSTR) OR (NOT ${libname_upper}_DISPLAYSTR STREQUAL ${libname_upper}_DISPLAYSTR_AUX))
		set (${libname_upper}_DISPLAYSTR ${${libname_upper}_DISPLAYSTR_AUX}
			CACHE INTERNAL "Version string from ${libname}" FORCE)
		message (STATUS ${${libname_upper}_DISPLAYSTR})
	endif ()

	# Export the result to the caller
	set(${libname_upper}_VERSION "${${libname_upper}_VERSION}" PARENT_SCOPE)
	set(${libname_upper}_SOVERSION "${${libname_upper}_SOVERSION}" PARENT_SCOPE)
endfunction()


# Also from mono's source code.
# Implementation of AC_CHECK_HEADERS.
# In addition, it also records the list of variables in the variable
# 'autoheader_vars', and for each variable, a documentation string in the
# variable ${var}_doc.
function(ac_check_headers)
	foreach (header ${ARGV})
		string(TOUPPER ${header} header_var)
		string(REPLACE "." "_" header_var ${header_var})
		string(REPLACE "/" "_" header_var ${header_var})
		set(header_var "HAVE_${header_var}")
		check_include_file (${header} ${header_var})
		set("${header_var}_doc" "Define to 1 if you have the <${header}> header file." PARENT_SCOPE)
		if (${header_var})
			set("${header_var}_defined" "1" PARENT_SCOPE)
		endif()
		set("${header_var}_val" "1" PARENT_SCOPE)
		set (autoheader_vars ${autoheader_vars} ${header_var})
	endforeach()
	set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
endfunction()

# Function taken from mono's source code
function (ac_check_funcs)
	foreach (func ${ARGV})
		string(TOUPPER ${func} var)
		set(var "HAVE_${var}")
		set(${var})
		check_function_exists (${func} ${var})
		set("${var}_doc" "Define to 1 if you have the '${func}' function." PARENT_SCOPE)
		if (${var})
			set("${var}_defined" "1" PARENT_SCOPE)
			set(${var} yes PARENT_SCOPE)
		endif()
		set("${var}_val" "1" PARENT_SCOPE)
		set (autoheader_vars ${autoheader_vars} ${var})
	endforeach()
	set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
endfunction()


# Implementation of AC_HEADER_STDC.
# Specifically, this macro checks for 'stdlib.h', 'stdarg.h',
# 'string.h', and 'float.h'; if the system has those, it probably
# has the rest of the ANSI C header files. This macro also checks
# whether 'string.h' declares 'memchr' (and thus presumably the
# other 'mem' functions), whether 'stdlib.h' declares 'free' (and
# thus presumably 'malloc' and other related functions), and whether
# the 'ctype.h' macros work on characters with the high bit set, as
# ANSI C requires.
function (ac_header_stdc)
	if (STDC_HEADERS)
		return()
	endif()
	message(STATUS "Looking for ANSI-C headers")
	set(code "
		#include <stdlib.h>
		#include <stdarg.h>
		#include <string.h>
		#include <float.h>

		int main(int argc, char **argv)
		{
			void *ptr;
			free((void*)1);
			ptr = memchr((void*)1, 0, 0);

			return (int)ptr;
		}
	")
	# FIXME Check the ctype.h high bit
	CHECK_C_SOURCE_COMPILES("${code}" STDC_HEADERS)
	if (STDC_HEADERS)
		set(STDC_HEADERS 1 PARENT_SCOPE)
		message(STATUS "Looking for ANSI-C headers - found")
	else()
		message(STATUS "Looking for ANSI-C headers - not found")
	endif()
endfunction()


# Also from the mono sources, kind of implements AC_SYS_LARGEFILE
function (ac_sys_largefile)
	CHECK_C_SOURCE_RUNS("
		#include <sys/types.h>
		#define BIG_OFF_T (((off_t)1<<62)-1+((off_t)1<<62))
		int main (int argc, char **argv) {
			int big_off_t=((BIG_OFF_T%2147483629==721) &&
				       (BIG_OFF_T%2147483647==1));
			return big_off_t ? 0 : 1;
		}
	" HAVE_LARGE_FILE_SUPPORT)

	# Check if it makes sense to define _LARGE_FILES or _FILE_OFFSET_BITS
	if (HAVE_LARGE_FILE_SUPPORT)
		return()
	endif()

	set (_LARGE_FILE_EXTRA_SRC "
		#include <sys/types.h>
		int main (int argc, char **argv) {
			return sizeof(off_t) == 8 ? 0 : 1;
		}
	")
	CHECK_C_SOURCE_RUNS ("#define _LARGE_FILES\n${_LARGE_FILE_EXTRA_SRC}"
		HAVE_USEFUL_D_LARGE_FILES)
	if (NOT HAVE_USEFUL_D_LARGE_FILES)
		if (NOT DEFINED HAVE_USEFUL_D_FILE_OFFSET_BITS)
			set (SHOW_LARGE_FILE_WARNING TRUE)
		endif ()
		CHECK_C_SOURCE_RUNS ("#define _FILE_OFFSET_BITS 64\n${_LARGE_FILE_EXTRA_SRC}"
			HAVE_USEFUL_D_FILE_OFFSET_BITS)
		if (HAVE_USEFUL_D_FILE_OFFSET_BITS)
			set (_FILE_OFFSET_BITS 64 PARENT_SCOPE)
		elseif (SHOW_LARGE_FILE_WARNING)
			message (WARNING "No 64 bit file support through off_t available.")
		endif ()
	else ()
		set (_LARGE_FILES 1 PARENT_SCOPE)
	endif ()
endfunction ()


# Quick way to set some basic variables.
# FIXME add support for variable number of arguments: only package and version are mandatory;
# arguments are package version bug_report tarname url
function (ac_init)
	set(package ${ARGV0})
	set(version ${ARGV1})
	set(bug_report ${ARGV2})
	set(tarname ${ARGV3})
	set(url ${ARGV4})
	set(PACKAGE_NAME "\"${package}\"" PARENT_SCOPE)
	set(PACKAGE_VERSION "\"${version}\"" PARENT_SCOPE)
	set(VERSION "\"${version}\"" PARENT_SCOPE)
	if(version)
		set(PACKAGE_STRING "\"${package} ${version}\"" PARENT_SCOPE)
	else()
		set(PACKAGE_STRING "\"${package}\"" PARENT_SCOPE)
	endif()

	set(PACKAGE_BUGREPORT "\"${bug_report}\"" PARENT_SCOPE)

	if(NOT tarname)
		string(REGEX REPLACE "[^a-zA-Z0-9_]" "-" tarname "${package}")
	endif()
	set(PACKAGE_TARNAME "\"${tarname}\"" PARENT_SCOPE)

	set(PACKAGE_URL "\"${url}\"" PARENT_SCOPE)
endfunction()


# Checks for the const keyword, defining "HAS_CONST_SUPPORT".
# If it does not have support, defines "const" to 0 in the parent scope.
function (ac_c_const)
	CHECK_C_SOURCE_COMPILES(
		"int main(int argc, char **argv){const int r = 0;return r;}"
		HAS_CONST_SUPPORT)
	if (NOT HAS_CONST_SUPPORT)
		set(const 0 PARENT_SCOPE)
	endif()
endfunction()


# Inline keyword support. Defines "inline" in the parent scope to the
# compiler's internal keyword for inline in C.
# TODO write a better test!
function (ac_c_inline)
	if (MSVC)
		set (inline __inline)
	elseif(CMAKE_COMPILER_IS_GNUCC)
		set (inline __inline__)
	endif()
	set(inline "${inline}" PARENT_SCOPE)
endfunction()


# Test if you can safely include both <sys/time.h> and <time.h>
function (ac_header_time)
	CHECK_C_SOURCE_COMPILES(
		"#include <sys/time.h>\n#include <time.h>\nint main(int argc, char **argv) { return 0; }"
		TIME_WITH_SYS_TIME)
	set(TIME_WITH_SYS_TIME ${TIME_WITH_SYS_TIME} PARENT_SCOPE)
endfunction()


# Native cpu byte order: 1 if big-endian (Motorola) or 0 if little-endian
# (Intel), setting "WORDS_BIGENDIAN" to 1 if big endian
function (ac_c_bigendian)
	TEST_BIG_ENDIAN(HOST_BIGENDIAN)
	if (HOST_BIGENDIAN)
		set(WORDS_BIGENDIAN 1 PARENT_SCOPE)
	endif()
endfunction()


# Check for off_t, setting "off_t" in the parent scope
function(ac_type_off_t)
	CHECK_TYPE_SIZE("off_t" SIZEOF_OFF_T)
	if (NOT SIZEOF_OFF_T)
		set(off_t "long int")
	endif()
	set(off_t ${off_t} PARENT_SCOPE)
endfunction()


# Check for size_t, setting "size_t" in the parent scope
function(ac_type_size_t)
	CHECK_TYPE_SIZE("size_t" SIZEOF_SIZE_T)
	if (NOT SIZEOF_SIZE_T)
		set(size_t "unsigned int")
	endif()
	set(size_t ${size_t} PARENT_SCOPE)
endfunction()


# Define "TM_IN_SYS_TIME" to 1 if <sys/time.h> declares "struct tm"
function(ac_struct_tm)
	CHECK_C_SOURCE_COMPILES(
		"#include <sys/time.h>\nint main(int argc, char **argv) { struct tm x; return 0; }"
		TM_IN_SYS_TIME
	)
	if (TM_IN_SYS_TIME)
		set (TM_IN_SYS_TIME 1 PARENT_SCOPE)
	endif()
endfunction()


# Obtain the size of a 'type' and define it as SIZEOF_TYPE
function (ac_check_sizeof typename)
	string(TOUPPER "SIZEOF_${typename}" varname)
	string(REPLACE " " "_" varname "${varname}")
	string(REPLACE "*" "p" varname "${varname}")
	CHECK_TYPE_SIZE("${typename}" ${varname} BUILTIN_TYPES_ONLY)
	if(NOT ${varname})
		set(${varname} 0 PARENT_SCOPE)
	endif()
endfunction()


# Check if the type exists, defines HAVE_<type>
function (ac_check_type typename)
	string(TOUPPER "${typename}" varname)
	string(REPLACE " " "_" varname "${varname}")
	string(REPLACE "*" "p" varname "${varname}")
	CHECK_TYPE_SIZE("${typename}" ${varname})
	if (NOT "${${varname}}" STREQUAL "")
		set("HAVE_${varname}" 1 PARENT_SCOPE)
		set("${varname}" "${typename}" PARENT_SCOPE)
	else()
		set("${varname}" "unknown" PARENT_SCOPE)
	endif()
endfunction()


# Verifies if each type on the list exists, using the given prelude
function (ac_check_types type_list prelude)
	foreach(typename ${type_list})
		string(TOUPPER "HAVE_${typename}" varname)
		string(REPLACE " " "_" varname "${varname}")
		string(REPLACE "*" "p" varname "${varname}")
		CHECK_C_SOURCE_COMPILES("${prelude}\n ${typename} foo;" ${varname})
	endforeach()
endfunction()

function(ac_path_prog variable prog_to_check_for value_if_not_found env_var)
	find_program(${variable} NAMES ${prog_to_check_for} PATHS ENV ${env_var} NO_DEFAULT_PATH)
	if(NOT ${variable})
		message(STATUS "Looking for ${prog_to_check_for} - not found")
		set(${variable} ${value_if_not_found} PARENT_SCOPE)
	else()
		message(STATUS "Looking for ${prog_to_check_for} - ${variable}")
		set(${variable} ${${variable}} PARENT_SCOPE)
	endif()
endfunction()

# Check if function 'func' exists in library 'lib'
function(ac_check_lib lib func)
	string(TOUPPER "HAVE_${func}" varname)
	set(CMAKE_REQUIRED_LIBRARIES ${lib})
	check_function_exists(${func} ${varname})
	set(CMAKE_REQUIRED_LIBRARIES)
endfunction()

# Check if source compiles without linking
function(ac_try_compile SOURCE VAR)
	set(CMAKE_TMP_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp)
	if(NOT DEFINED "${VAR}")
		file(WRITE
			"${CMAKE_TMP_DIR}/src.c"
			"${SOURCE}\n"
		)

		if(NOT CMAKE_REQUIRED_QUIET)
			message(STATUS "Performing Test ${VAR}")
		endif()
		# Set up CMakeLists.txt for static library:
		file(WRITE
			${CMAKE_TMP_DIR}/CMakeLists.txt
			"add_library(compile STATIC src.c)"
		)

		# Configure:
		execute_process(
			COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
			WORKING_DIRECTORY ${CMAKE_TMP_DIR}
		)

		# Build:
		execute_process(
			COMMAND ${CMAKE_COMMAND} --build ${CMAKE_TMP_DIR}
			RESULT_VARIABLE RESVAR
			OUTPUT_VARIABLE OUTPUT
			ERROR_VARIABLE OUTPUT
		)

		# Set up result:
		if(RESVAR EQUAL 0)
			set(${VAR} 1 CACHE INTERNAL "Test ${VAR}")
			if(NOT CMAKE_REQUIRED_QUIET)
				message(STATUS "Performing Test ${VAR} - Success")
			endif()

			file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
				"Performing C SOURCE FILE Test ${VAR} succeeded with the following output:\n"
				"${OUTPUT}\n"
				"Source file was:\n${SOURCE}\n")
		else()
			if(NOT CMAKE_REQUIRED_QUIET)
				message(STATUS "Performing Test ${VAR} - Failed")
			endif()
			set(${VAR} "" CACHE INTERNAL "Test ${VAR}")
			file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
				"Performing C SOURCE FILE Test ${VAR} failed with the following output:\n"
				"${OUTPUT}\n"
				"Source file was:\n${SOURCE}\n")
		endif()
	endif()
endfunction()
cmake/modules/FindLibElf.cmake (new file, 64 lines)
@@ -0,0 +1,64 @@
# - Try to find libelf
# Once done this will define
#
#  LIBELF_FOUND - system has libelf
#  LIBELF_INCLUDE_DIRS - the libelf include directory
#  LIBELF_LIBRARIES - Link these to use libelf
#  LIBELF_DEFINITIONS - Compiler switches required for using libelf
#
# This module reads hints about search locations from variables:
#
#  LIBELF_ROOT - Preferred installation prefix
#
# Copyright (c) 2008 Bernhard Walle <bernhard.walle@gmx.de>
#
# Redistribution and use is allowed according to the terms of the New
# BSD license.
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
#


if (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
	set (LibElf_FIND_QUIETLY TRUE)
endif (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)

find_path (LIBELF_INCLUDE_DIRS
	NAMES
		libelf/libelf.h libelf.h
	HINTS
		${LIBELF_ROOT}
	PATH_SUFFIXES
		include
		libelf/include
)

find_library (LIBELF_LIBRARIES
	NAMES
		elf libelf
	HINTS
		${LIBELF_ROOT}
	PATH_SUFFIXES
		lib
		libelf/lib
)

include (FindPackageHandleStandardArgs)


# Handle the QUIETLY and REQUIRED arguments and set LIBELF_FOUND to TRUE if all listed variables are TRUE
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibElf DEFAULT_MSG
	LIBELF_LIBRARIES
	LIBELF_INCLUDE_DIRS)

set(CMAKE_REQUIRED_LIBRARIES elf)
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("#include <libelf.h>
int main() {
	Elf *e = (Elf*)0;
	size_t sz;
	elf_getshdrstrndx(e, &sz);
	return 0;
}" ELF_GETSHDRSTRNDX)
unset(CMAKE_REQUIRED_LIBRARIES)

mark_as_advanced(LIBELF_INCLUDE_DIRS LIBELF_LIBRARIES ELF_GETSHDRSTRNDX)
@@ -14,6 +14,28 @@ mark_as_advanced(
	KBUILD_MAKE_FLAGS
)

if (${CMAKE_GENERATOR} STREQUAL Ninja)
	set(MAKE "make")
	list(APPEND KBUILD_MAKE_FLAGS "-j")
else ()
	set(MAKE "$(MAKE)")
endif ()

# Convert McKernel "arm64" into Linux "aarch64"
if ("${ARCH}" STREQUAL "arm64")
	set(LINUX_ARCH "aarch64")
else ()
	set(LINUX_ARCH "${ARCH}")
endif ()

if (NOT "${LINUX_ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
	string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
	list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH}")
	list(APPEND KBUILD_MAKE_FLAGS "CROSS_COMPILE=${CROSS_COMPILE}")
endif()

string(REPLACE ";" " " KBUILD_MAKE_FLAGS_STR "${KBUILD_MAKE_FLAGS}")

function(kmod MODULE_NAME)
	cmake_parse_arguments(KMOD "" "INSTALL_DEST" "C_FLAGS;SOURCES;EXTRA_SYMBOLS;DEPENDS" ${ARGN})

@@ -33,17 +55,6 @@ endif(ENABLE_WERROR)
	configure_file(${CMAKE_SOURCE_DIR}/cmake/modules/Kbuild.in
		${CMAKE_CURRENT_BINARY_DIR}/Kbuild)

if (${CMAKE_GENERATOR} STREQUAL Ninja)
	set(MAKE "make")
	list(APPEND KBUILD_MAKE_FLAGS "-j")
else ()
	set(MAKE "$(MAKE)")
endif ()
if (NOT "${ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
	string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
	list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH};CROSS_COMPILE=${CROSS_COMPILE}")
endif()

	string(REGEX REPLACE "\\.c(;|$)" ".o.cmd\\1" KMOD_O_CMD "${KMOD_SOURCES}")
	string(REGEX REPLACE "[^/;]+(;|$)" ".\\0" KMOD_O_CMD "${KMOD_O_CMD}")

@@ -78,6 +89,10 @@ endif(ENABLE_WERROR)
	# the native build system do these checks, if possible at all...
	add_custom_command(OUTPUT kmod_always_rebuild COMMAND touch kmod_always_rebuild)

	if (NOT EXISTS "${KERNEL_DIR}/Makefile")
		message(FATAL_ERROR "${KERNEL_DIR} does not contain a Makefile and is probably missing. Install a kernel development package or set the KERNEL_DIR variable.")
	endif()

	add_custom_command(
		OUTPUT "${MODULE_NAME}.ko"
			"Module.symvers"
config.h.in (17 changed lines)
@@ -6,8 +6,9 @@
/* version number */
#define MCKERNEL_VERSION "${MCKERNEL_VERSION}"

/* whether mcoverlayfs is enabled */
#cmakedefine ENABLE_MCOVERLAYFS 1
/* enable the required code for mcexec to be able to use bind mounts;
 * there is no config option as its use is discouraged */
// #define MCEXEC_BIND_MOUNT 1

/* whether memdump feature is enabled */
#cmakedefine ENABLE_MEMDUMP 1
@@ -27,18 +28,12 @@
/* whether undefined behaviour sanitizer is enabled */
#cmakedefine ENABLE_UBSAN 1

/* whether per-CPU allocator cache (ThunderX2 workaround) is enabled */
#cmakedefine ENABLE_PER_CPU_ALLOC_CACHE 1

/* Path of bind-mount source directory */
#cmakedefine ROOTFSDIR "${ROOTFSDIR}"

/* Path of install directory for libraries */
#cmakedefine MCKERNEL_LIBDIR "${MCKERNEL_LIBDIR}"

/* Path of install directory for binaries */
#cmakedefine BINDIR "${BINDIR}"

/* Path of install directory for system binaries */
#cmakedefine SBINDIR "${SBINDIR}"

/* for non-RHEL kernels */
#ifndef RHEL_RELEASE_VERSION
#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b))
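The fallback macro packs a (major, minor) pair into one comparable integer: major in the high byte, minor in the low byte. A one-line check of the arithmetic:

#define RHEL_RELEASE_VERSION(a, b) (((a) << 8) + (b))

/* (7 << 8) + 6 == 1798, so RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7,6)
 * compares release versions numerically */
_Static_assert(RHEL_RELEASE_VERSION(7, 6) == 1798, "packing check");

int main(void) { return 0; }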
@@ -1,27 +0,0 @@
#ifndef IHKLIB_RUSAGE_H_INCLUDED
#define IHKLIB_RUSAGE_H_INCLUDED

#define IHK_MAX_NUM_PGSIZES 4
#define IHK_MAX_NUM_NUMA_NODES 1024
#define IHK_MAX_NUM_CPUS 1024

#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2

struct mckernel_rusage {
	unsigned long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
	unsigned long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];
	unsigned long memory_max_usage;
	unsigned long memory_kmem_usage;
	unsigned long memory_kmem_max_usage;
	unsigned long memory_numa_stat[IHK_MAX_NUM_NUMA_NODES];
	unsigned long cpuacct_stat_system;
	unsigned long cpuacct_stat_user;
	unsigned long cpuacct_usage;
	unsigned long cpuacct_usage_percpu[IHK_MAX_NUM_CPUS];
	int num_threads;
	int max_num_threads;
};

#endif /* !defined(IHKLIB_RUSAGE_H_INCLUDED) */
@@ -55,7 +55,7 @@
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916

#define MCEXEC_UP_UTI_GET_CTX 0x30a02920
#define MCEXEC_UP_UTI_SAVE_FS 0x30a02921
#define MCEXEC_UP_UTI_SWITCH_CTX 0x30a02921
#define MCEXEC_UP_SIG_THREAD 0x30a02922
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
@@ -91,7 +91,10 @@ struct program_image_section {

struct get_cpu_set_arg {
	int nr_processes;
	char *req_cpu_list; // Requested by user-space
	int req_cpu_list_len; // Length of the request string
	int *process_rank;
	pid_t ppid;
	void *cpu_set;
	size_t cpu_set_size; // Size in bytes
	int *target_core;
@@ -193,7 +196,6 @@ struct syscall_response {
	unsigned long req_thread_status;
	long ret;
	unsigned long fault_address;
	unsigned long fault_reason;
};

struct syscall_ret_desc {
@@ -359,7 +361,7 @@ struct uti_get_ctx_desc {
	unsigned long key; /* OUT: struct task_struct* of mcexec thread, used to search struct host_thread */
};

struct uti_save_fs_desc {
struct uti_switch_ctx_desc {
	void *rctx; /* Remote context */
	void *lctx; /* Local context */
};
@@ -4,6 +4,8 @@ if(ARCH STREQUAL "x86_64")
	set(ARCH_C_FLAGS "-mno-red-zone -mcmodel=kernel")
endif()

set(MCEXEC_PATH "${CMAKE_INSTALL_FULL_BINDIR}/mcexec" CACHE STRING "mcexec path for binfmt")

kmod(mcctrl
	C_FLAGS
		-I${IHK_FULL_SOURCE_DIR}/linux/include
@@ -16,7 +18,7 @@ kmod(mcctrl
		-I${CMAKE_CURRENT_SOURCE_DIR}/arch/${ARCH}/include
		-I${PROJECT_BINARY_DIR}
		-I${PROJECT_SOURCE_DIR}/kernel/include
		-DMCEXEC_PATH=\\"${CMAKE_INSTALL_FULL_BINDIR}/mcexec\\"
		-DMCEXEC_PATH=\\"${MCEXEC_PATH}\\"
		${ARCH_C_FLAGS}
	SOURCES
		driver.c control.c ikc.c syscall.c procfs.c binfmt_mcexec.c
@@ -1,7 +1,12 @@
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016 */
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <linux/version.h>
#include <linux/mm_types.h>
#include <linux/kallsyms.h>
#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE
#include <linux/sched/task_stack.h>
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) */
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/vdso.h>
#include "config.h"
#include "../../mcctrl.h"
@@ -42,7 +47,6 @@ int arch_symbols_init(void)
}


#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 1
struct vdso {
	long busy;
@@ -53,7 +57,6 @@ struct vdso {
	long lbase;
	long offset_sigtramp;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/

unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
@@ -95,6 +98,74 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned
	return 0;
}

#if KERNEL_VERSION(4, 0, 0) <= LINUX_VERSION_CODE
static long elf_search_vdso_sigtramp(void)
{
	int i = 0;
	long ans = -1;
	char *shstr = NULL, *dynstr = NULL;
	Elf64_Ehdr *eh = NULL;
	Elf64_Shdr *tmp_sh = NULL, *sym_sh = NULL;
	Elf64_Sym *sym = NULL;

	/* ELF header */
	eh = (Elf64_Ehdr *)vdso_start;
	if (eh == NULL) {
		D("vdso_start is NULL.\n");
		goto out;
	}

	/* ELF magic check: reject if any identification byte differs */
	if (eh->e_ident[EI_MAG0] != ELFMAG0 ||
	    eh->e_ident[EI_MAG1] != ELFMAG1 ||
	    eh->e_ident[EI_MAG2] != ELFMAG2 ||
	    eh->e_ident[EI_MAG3] != ELFMAG3) {
		D("vdso_start ELF MAGIC Mismatch.\n"
		  "e_ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n",
		  eh->e_ident[EI_MAG0], eh->e_ident[EI_MAG1],
		  eh->e_ident[EI_MAG2], eh->e_ident[EI_MAG3]);
		goto out;
	}

	/* Search dynsym-table and dynstr-table offsets
	 * in the section header table
	 */
	tmp_sh = (Elf64_Shdr *)(vdso_start + eh->e_shoff);
	shstr = vdso_start + (tmp_sh + eh->e_shstrndx)->sh_offset;
	for (i = 0; i < eh->e_shnum; i++, tmp_sh++) {
		if (tmp_sh->sh_type == SHT_DYNSYM) {
			sym_sh = tmp_sh;
		}

		if (tmp_sh->sh_type == SHT_STRTAB &&
		    !strcmp(&shstr[tmp_sh->sh_name], ".dynstr")) {
			dynstr = vdso_start + tmp_sh->sh_offset;
		}
	}

	if (sym_sh == NULL) {
		D("dynsym-table not found.\n");
		goto out;
	}

	if (dynstr == NULL) {
		D("dynstr-table not found.\n");
		goto out;
	}

	/* Search the __kernel_rt_sigreturn offset in the dynsym-table */
	sym = (Elf64_Sym *)(vdso_start + sym_sh->sh_offset);
	for (i = 0; (i * sym_sh->sh_entsize) < sym_sh->sh_size; i++, sym++) {
		if (!strcmp(dynstr + sym->st_name, "__kernel_rt_sigreturn")) {
			ans = sym->st_value;
		}
	}

out:
	return ans;
}
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
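The magic test must fail when any one of the four identification bytes mismatches, which is why the condition above uses ||. The same check is usually written with memcmp against the ELFMAG constant; a runnable sketch:

#include <elf.h>
#include <stdio.h>
#include <string.h>

/* equivalent to the four-way || comparison above */
static int elf_magic_ok(const unsigned char *e_ident)
{
	return memcmp(e_ident, ELFMAG, SELFMAG) == 0;
}

int main(void)
{
	unsigned char good[SELFMAG] = { 0x7f, 'E', 'L', 'F' };
	unsigned char bad[SELFMAG]  = { 0x7f, 'E', 'L', 'Q' };

	printf("good=%d bad=%d\n", elf_magic_ok(good), elf_magic_ok(bad));
	return 0;
}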
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
	ihk_device_t dev = ihk_os_to_dev(os);
@@ -128,7 +199,12 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)

	/* offsets */
	vdso->lbase = VDSO_LBASE;
	vdso->offset_sigtramp = vdso_offset_sigtramp;
	vdso->offset_sigtramp = elf_search_vdso_sigtramp();

	if (unlikely(vdso->offset_sigtramp == -1)) {
		D("Use vdso_offset_sigtramp in header-file.\n");
		vdso->offset_sigtramp = vdso_offset_sigtramp;
	}
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
out:
	wmb();
@@ -142,59 +218,61 @@ out:
void *
get_user_sp(void)
{
	/* TODO; skeleton for UTI */
	return NULL;
	return (void *)current_pt_regs()->sp;
}

void
set_user_sp(void *usp)
{
	/* TODO; skeleton for UTI */
	current_pt_regs()->sp = (unsigned long)usp;
}

/* TODO; skeleton for UTI */
struct trans_uctx {
	volatile int cond;
	int fregsize;

	unsigned long rax;
	unsigned long rbx;
	unsigned long rcx;
	unsigned long rdx;
	unsigned long rsi;
	unsigned long rdi;
	unsigned long rbp;
	unsigned long r8;
	unsigned long r9;
	unsigned long r10;
	unsigned long r11;
	unsigned long r12;
	unsigned long r13;
	unsigned long r14;
	unsigned long r15;
	unsigned long rflags;
	unsigned long rip;
	unsigned long rsp;
	unsigned long fs;
	struct user_pt_regs regs;
	unsigned long tls_baseaddr;
};

void
restore_fs(unsigned long fs)
restore_tls(unsigned long addr)
{
	/* TODO; skeleton for UTI */
	const unsigned long tpidrro = 0;

	asm volatile(
	"	msr tpidr_el0, %0\n"
	"	msr tpidrro_el0, %1"
	: : "r" (addr), "r" (tpidrro));
}

void
save_fs_ctx(void *ctx)
save_tls_ctx(void __user *ctx)
{
	/* TODO; skeleton for UTI */
	struct trans_uctx __user *tctx = ctx;
	unsigned long baseaddr;

	asm volatile(
	"	mrs %0, tpidr_el0"
	: "=r" (baseaddr));

	if (copy_to_user(&tctx->tls_baseaddr, &baseaddr,
			 sizeof(tctx->tls_baseaddr))) {
		pr_err("%s: copy_to_user failed.\n", __func__);
		return;
	}
}

unsigned long
get_fs_ctx(void *ctx)
get_tls_ctx(void __user *ctx)
{
	/* TODO; skeleton for UTI */
	return 0;
	struct trans_uctx __user *tctx = ctx;
	struct trans_uctx kctx;

	if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_from_user failed.\n", __func__);
		return 0;
	}
	return kctx.tls_baseaddr;
}
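On AArch64 the user TLS pointer lives in the tpidr_el0 system register, so "saving TLS" is a single mrs and "restoring" a single msr, as above. A fragment showing the read side in isolation (AArch64-only, will not compile on other targets):

/* read the current thread's TLS root, the value save_tls_ctx() captures */
static inline unsigned long read_tls_base(void)
{
	unsigned long v;

	asm volatile("mrs %0, tpidr_el0" : "=r" (v));
	return v;
}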
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
@@ -304,3 +382,38 @@ out:
		error, rva, rpa, pgsize);
	return error;
}

/*
 * The assembler switch_ctx executes only the ioctl.
 * Context register save/load is done on Linux (taken from current_pt_regs).
 * Do the TLS save/load and register the host_thread with the ioctl.
 */
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
{
	int rc = 0;
	struct trans_uctx *__user rctx = NULL;
	struct trans_uctx *__user lctx = NULL;
	struct trans_uctx klctx = {
		.regs = current_pt_regs()->user_regs,
	};

	rctx = desc->rctx;
	lctx = desc->lctx;

	if (copy_to_user(lctx, &klctx, sizeof(klctx))) {
		pr_err("%s: Error: copy_to_user failed\n", __func__);
		rc = -EFAULT;
		goto out;
	}

	if (copy_from_user(&current_pt_regs()->user_regs,
			   &rctx->regs, sizeof(rctx->regs))) {
		pr_err("%s: Error: copy_from_user failed\n", __func__);
		rc = -EFAULT;
		goto out;
	}
	restore_tls(get_tls_ctx(rctx));

out:
	return rc;
}
@@ -9,8 +9,6 @@
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
				unsigned long *rpap, unsigned long *pgsizep);

#ifdef POSTK_DEBUG_ARCH_DEP_12

#define PFN_WRITE_COMBINED PTE_ATTRINDX(MT_NORMAL_NC)
static inline bool pte_is_write_combined(pte_t pte)
{
@@ -31,9 +29,8 @@ static inline bool pte_is_write_combined(pte_t pte)
#endif
	return ((pte_val(pte) & PTE_ATTRINDX_MASK) == PFN_WRITE_COMBINED);
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */

#define ARMV8_IDX_COUNTER0 1
#define ARMV8_IDX_COUNTER0 0
#define ARCH_PERF_COUNTER_START ARMV8_IDX_COUNTER0

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
@@ -2,9 +2,13 @@
#include <linux/version.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include "config.h"
#include "../../mcctrl.h"

#define gtod (&VVAR(vsyscall_gtod_data))

//#define SC_DEBUG

#ifdef SC_DEBUG
@@ -54,7 +58,6 @@ int arch_symbols_init(void)
}


#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 2
struct vdso {
	long busy;
@@ -70,8 +73,8 @@ struct vdso {
	long hpet_phys;
	void *pvti_virt;
	long pvti_phys;
	void *vgtod_virt;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/

unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
@@ -207,6 +210,7 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
#endif
}

	vdso->vgtod_virt = (void *)gtod;
out:
	wmb();
	vdso->busy = 0;
@@ -257,25 +261,35 @@ struct trans_uctx {
};

void
restore_fs(unsigned long fs)
restore_tls(unsigned long addr)
{
	wrmsrl(MSR_FS_BASE, fs);
	wrmsrl(MSR_FS_BASE, addr);
}

void
save_fs_ctx(void *ctx)
save_tls_ctx(void __user *ctx)
{
	struct trans_uctx *tctx = ctx;
	struct trans_uctx __user *tctx = ctx;
	struct trans_uctx kctx;

	rdmsrl(MSR_FS_BASE, tctx->fs);
	if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_from_user failed.\n", __func__);
		return;
	}
	rdmsrl(MSR_FS_BASE, kctx.fs);
	/* write the captured FS base back to the user context; without
	 * this the value read above would be discarded */
	if (copy_to_user(&tctx->fs, &kctx.fs, sizeof(kctx.fs))) {
		pr_err("%s: copy_to_user failed.\n", __func__);
	}
}

unsigned long
get_fs_ctx(void *ctx)
get_tls_ctx(void __user *ctx)
{
	struct trans_uctx *tctx = ctx;
	struct trans_uctx __user *tctx = ctx;
	struct trans_uctx kctx;

	return tctx->fs;
	if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
		pr_err("%s: copy_from_user failed.\n", __func__);
		return 0;
	}
	return kctx.fs;
}
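On x86-64 the FS base register is the TLS root, which is why "save TLS" here is an MSR read. User space can observe the same value through arch_prctl(2); a small x86-64-only analogy of what get_tls_ctx() returns:

#include <stdio.h>
#include <asm/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long fsbase = 0;

	/* user-space counterpart of rdmsrl(MSR_FS_BASE, ...) */
	syscall(SYS_arch_prctl, ARCH_GET_FS, &fsbase);
	printf("FS base (TLS root): %#lx\n", fsbase);
	return 0;
}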
unsigned long
@@ -356,11 +370,17 @@ out:
	return error;
}

#ifdef POSTK_DEBUG_ARCH_DEP_12
#define PFN_WRITE_COMBINED _PAGE_PWT
static inline bool pte_is_write_combined(pte_t pte)
{
	return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */

/*
 * The assembler switch_ctx saves/loads the registers in the context.
 * Do the TLS save/load and register the host_thread with the ioctl.
 */
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
{
	return 0;
}
@@ -9,14 +9,12 @@
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
				unsigned long *rpap, unsigned long *pgsizep);

#ifdef POSTK_DEBUG_ARCH_DEP_12
#define PFN_WRITE_COMBINED _PAGE_PWT

static inline bool pte_is_write_combined(pte_t pte)
{
	return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */

#define ARCH_PERF_COUNTER_START 0
@@ -44,7 +44,6 @@
#include <config.h>
#include "mcctrl.h"
#include <ihk/ihk_host_user.h>
#include <ihklib_rusage.h>
#include <rusage.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#include <uapi/linux/sched/types.h>
@@ -87,8 +86,19 @@ int syscall_backward(struct mcctrl_usrdata *, int, unsigned long, unsigned long,
			unsigned long, unsigned long, unsigned long,
			unsigned long, unsigned long *);

struct mcos_handler_info {
	int pid;
	int cpu;
	struct mcctrl_usrdata *ud;
	struct file *file;
	unsigned long user_start;
	unsigned long user_end;
	unsigned long prepare_thread;
};

static long mcexec_prepare_image(ihk_os_t os,
				 struct program_load_desc * __user udesc)
				 struct program_load_desc * __user udesc,
				 struct file *file)
{
	struct program_load_desc *desc = NULL;
	struct program_load_desc *pdesc = NULL;
@@ -100,6 +110,7 @@ static long mcexec_prepare_image(ihk_os_t os,
	struct mcctrl_per_proc_data *ppd = NULL;
	int num_sections;
	int free_ikc_pointers = 1;
	struct mcos_handler_info *info;

	if (!usrdata) {
		pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
@@ -122,6 +133,14 @@ static long mcexec_prepare_image(ihk_os_t os,
		goto free_out;
	}

	info = ihk_os_get_mcos_private_data(file);
	if (!info) {
		ret = -EFAULT;
		goto free_out;
	}
	/* To serialize SCD_MSG_SCHEDULE_PROCESS and SCD_MSG_CLEANUP_PROCESS */
	info->cpu = desc->cpu;

	ppd = mcctrl_get_per_proc_data(usrdata, desc->pid);
	if (!ppd) {
		printk("%s: ERROR: no per process data for PID %d\n",
@@ -193,6 +212,11 @@ static long mcexec_prepare_image(ihk_os_t os,
		/* either send or remote prepare_process failed */
		goto put_and_free_out;
	}
	/*
	 * Used as the SCD_MSG_CLEANUP_PROCESS target when it isn't scheduled
	 * with SCD_MSG_SCHEDULE_PROCESS
	 */
	info->prepare_thread = pdesc->rprocess;

	/* Update rpgtable */
	ppd->rpgtable = pdesc->rpgtable;
@@ -307,30 +331,10 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
#endif
}

struct mcos_handler_info {
	int pid;
	int cpu;
	struct mcctrl_usrdata *ud;
	struct file *file;
	unsigned long user_start;
	unsigned long user_end;
};

struct mcos_handler_info;
static LIST_HEAD(host_threads); /* Used for FS switch */
DEFINE_RWLOCK(host_thread_lock);

/* Info of the Linux counterpart of a migrated-to-Linux thread */
struct host_thread {
	struct list_head list;
	struct mcos_handler_info *handler;
	int pid;
	int tid;
	unsigned long usp;
	unsigned long lfs;
	unsigned long rfs;
};

struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
{
	struct mcos_handler_info *info;
@@ -391,6 +395,7 @@ static void release_handler(ihk_os_t os, void *param)
	memset(&isp, '\0', sizeof isp);
	isp.msg = SCD_MSG_CLEANUP_PROCESS;
	isp.pid = info->pid;
	isp.arg = info->prepare_thread;

	dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
		__FUNCTION__, info, info->cpu);
@@ -426,6 +431,7 @@ static long mcexec_start_image(ihk_os_t os,
	struct mcctrl_channel *c;
	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
	struct mcos_handler_info *info;
	struct mcos_handler_info *prev_info;
	int ret = 0;

	if (!usrdata) {
@@ -446,6 +452,7 @@ static long mcexec_start_image(ihk_os_t os,
		goto out;
	}

	prev_info = ihk_os_get_mcos_private_data(file);
	info = new_mcos_handler_info(os, file);
	if (info == NULL) {
		ret = -ENOMEM;
@@ -456,6 +463,7 @@ static long mcexec_start_image(ihk_os_t os,
	info->cpu = desc->cpu;
	info->user_start = desc->user_start;
	info->user_end = desc->user_end;
	info->prepare_thread = prev_info->prepare_thread;
	ihk_os_register_release_handler(file, release_handler, info);
	ihk_os_set_mcos_private_data(file, info);

@@ -472,8 +480,10 @@ static long mcexec_start_image(ihk_os_t os,
	ret = mcctrl_ikc_send(os, desc->cpu, &isp);
	if (ret < 0) {
		printk("%s: error: sending IKC msg\n", __FUNCTION__);
		goto out;
	}

	/* clear prepared thread struct */
	info->prepare_thread = 0;
out:
	kfree(desc);
	return ret;
@@ -577,17 +587,14 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
	struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
	struct mcctrl_part_exec *pe;
	struct mcctrl_part_exec *pe = NULL, *pe_itr;
	struct get_cpu_set_arg req;
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
	struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
	struct cpu_topology *cpu_top, *cpu_top_i;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
	struct cache_topology *cache_top;
	int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
	int ret = 0;
	int mcexec_linux_numa;
	int pe_list_len = 0;
	cpumask_t *mcexec_cpu_set = NULL;
	cpumask_t *cpus_used = NULL;
	cpumask_t *cpus_to_use = NULL;
@@ -607,24 +614,126 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
		return -EINVAL;
	}

	pe = &udp->part_exec;

	mutex_lock(&pe->lock);

	if (copy_from_user(&req, (void *)arg, sizeof(req))) {
		printk("%s: error copying user request\n", __FUNCTION__);
		pr_err("%s: error copying user request\n", __func__);
		ret = -EINVAL;
		goto put_and_unlock_out;
		goto put_out;
	}

	/* First process to enter CPU partitioning */
	if (pe->nr_processes == -1) {
	/* User requested CPU mask? */
	if (req.req_cpu_list && req.req_cpu_list_len) {
		char *cpu_list = NULL;

		cpu_list = kmalloc(req.req_cpu_list_len, GFP_KERNEL);
		if (!cpu_list) {
			printk("%s: error: allocating CPU list\n", __FUNCTION__);
			ret = -ENOMEM;
			goto put_out;
		}

		if (copy_from_user(cpu_list,
				   req.req_cpu_list, req.req_cpu_list_len)) {
			printk("%s: error copying CPU list request\n", __FUNCTION__);
			kfree(cpu_list);
			ret = -EINVAL;
			goto put_out;
		}

		cpus_used = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
		cpus_to_use = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
		if (!cpus_to_use || !cpus_used) {
			printk("%s: error: allocating CPU mask\n", __FUNCTION__);
			ret = -ENOMEM;
			kfree(cpu_list);
			goto put_out;
		}
		memset(cpus_used, 0, sizeof(cpumask_t));
		memset(cpus_to_use, 0, sizeof(cpumask_t));

		/* Parse CPU list */
		if (cpulist_parse(cpu_list, cpus_to_use) < 0) {
			printk("%s: invalid CPUs requested: %s\n",
			       __FUNCTION__, cpu_list);
			ret = -EINVAL;
			kfree(cpu_list);
			goto put_out;
		}

		memcpy(cpus_used, cpus_to_use, sizeof(cpumask_t));

		/* Copy mask to user-space */
		if (copy_to_user(req.cpu_set, cpus_used,
				 (req.cpu_set_size < sizeof(cpumask_t) ?
				  req.cpu_set_size : sizeof(cpumask_t)))) {
			printk("%s: error copying mask to user\n", __FUNCTION__);
			ret = -EINVAL;
			kfree(cpu_list);
			goto put_out;
		}

		/* Copy IKC target core */
		cpu = cpumask_next(-1, cpus_used);
		if (copy_to_user(req.target_core, &cpu, sizeof(cpu))) {
			printk("%s: error copying target core to user\n",
			       __FUNCTION__);
			ret = -EINVAL;
			kfree(cpu_list);
			goto put_out;
		}

		/* Save in per-process structure */
		memcpy(&ppd->cpu_set, cpus_used, sizeof(cpumask_t));
		ppd->ikc_target_cpu = cpu;
		printk("%s: %s -> target McKernel CPU: %d\n",
		       __func__, cpu_list, cpu);

		ret = 0;
		kfree(cpu_list);
		goto put_out;
	}
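cpulist_parse() above accepts the usual Linux CPU-list syntax ("0-3,8") and fills a cpumask. A self-contained user-space sketch of the same parsing, with a 64-bit mask standing in for cpumask_t:

#include <stdio.h>
#include <stdlib.h>

/* parse_cpulist is an illustrative helper, not the kernel function */
unsigned long parse_cpulist(const char *s)
{
	unsigned long mask = 0;

	while (*s) {
		char *end;
		long lo = strtol(s, &end, 10), hi = lo;

		if (*end == '-')		/* range "a-b" */
			hi = strtol(end + 1, &end, 10);
		for (long c = lo; c <= hi && c < 64; c++)
			mask |= 1UL << c;
		s = (*end == ',') ? end + 1 : end;
	}
	return mask;
}

int main(void)
{
	printf("%#lx\n", parse_cpulist("0-3,8"));	/* 0x10f */
	return 0;
}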
	mutex_lock(&udp->part_exec_lock);
	/* Find the part_exec having the same node_proxy */
	list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
		pe_list_len++;
		if (pe_itr->node_proxy_pid == req.ppid) {
			pe = pe_itr;
			break;
		}
	}

	if (!pe) {
		/* First process to enter CPU partitioning */
		pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
		if (pe_list_len >= PE_LIST_MAXLEN) {
			/* delete the head entry of pe_list */
			pe_itr = list_first_entry(&udp->part_exec_list,
						  struct mcctrl_part_exec, chain);
			list_del(&pe_itr->chain);
			kfree(pe_itr);
		}

		pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
		if (!pe) {
			mutex_unlock(&udp->part_exec_lock);
			ret = -ENOMEM;
			goto put_out;
		}
		/* Init part_exec */
		mutex_init(&pe->lock);
		INIT_LIST_HEAD(&pe->pli_list);
		pe->nr_processes = req.nr_processes;
		pe->nr_processes_left = req.nr_processes;
		pe->nr_processes_joined = 0;
		pe->node_proxy_pid = req.ppid;

		list_add_tail(&pe->chain, &udp->part_exec_list);
		dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
			__FUNCTION__,
			pe->nr_processes);
			__func__, pe->nr_processes);
	}
	mutex_unlock(&udp->part_exec_lock);

	mutex_lock(&pe->lock);

	if (pe->nr_processes != req.nr_processes) {
		printk("%s: error: requested number of processes"
@@ -634,7 +743,15 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
		goto put_and_unlock_out;
	}

	if (pe->nr_processes_joined >= pe->nr_processes) {
		printk("%s: too many processes have joined the group of %d\n",
		       __func__, req.ppid);
		ret = -EINVAL;
		goto put_and_unlock_out;
	}

	--pe->nr_processes_left;
	++pe->nr_processes_joined;
	dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
		__FUNCTION__,
		pe->nr_processes,
@@ -720,8 +837,6 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
		wake_up_interruptible(&pli_next->pli_wq);
	}

	/* Reset process counter to start state */
	pe->nr_processes = -1;
	ret = -ETIMEDOUT;
	goto put_and_unlock_out;
}
@@ -969,16 +1084,8 @@ next_cpu:
	/* Commit used cores to OS structure */
	memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));

	/* Reset if last process */
	if (pe->nr_processes_left == 0) {
		dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
			__FUNCTION__,
			pe->nr_processes);
		pe->nr_processes = -1;
		memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
	}
	/* Otherwise wake up next process in list */
	else {
	/* If not the last process, wake up the next process in the list */
	if (pe->nr_processes_left != 0) {
		++pe->process_rank;
		pli_next = list_first_entry(&pe->pli_list,
					    struct process_list_item, list);
@@ -991,11 +1098,14 @@ next_cpu:
	ret = 0;

put_and_unlock_out:
	mutex_unlock(&pe->lock);

put_out:
	mcctrl_put_per_proc_data(ppd);

	kfree(cpus_to_use);
	kfree(cpus_used);
	kfree(mcexec_cpu_set);
	mcctrl_put_per_proc_data(ppd);
	mutex_unlock(&pe->lock);

	return ret;
}
@@ -1199,7 +1309,7 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
	ppd = mcctrl_get_per_proc_data(ud, pid);

	if (unlikely(!ppd)) {
		kprintf("%s: ERROR: no per-process structure for PID %d, "
		dprintk("%s: ERROR: no per-process structure for PID %d, "
			"syscall nr: %lu\n",
			__FUNCTION__, pid, packet->req.number);

@@ -1414,7 +1524,7 @@ retry_alloc:
		__FUNCTION__, task_pid_vnr(current), packet->ref);

	mb();
	if (!packet->req.valid) {
	if (!smp_load_acquire(&packet->req.valid)) {
		printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
		       __FUNCTION__,
		       task_tgid_vnr(current),
@@ -1424,7 +1534,7 @@ retry_alloc:
		goto retry;
	}

	packet->req.valid = 0; /* ack */
	smp_store_release(&packet->req.valid, 0); /* ack */
	dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
		"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
		__FUNCTION__,
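The smp_load_acquire/smp_store_release pair above replaces a plain flag read with a message-passing protocol: the producer publishes the request payload before releasing valid, and the consumer's acquire load guarantees it then observes that payload. The same idea in portable C11, as a sketch with an illustrative struct:

#include <stdatomic.h>

struct req {
	unsigned long args[6];
	_Atomic int valid;
};

void produce(struct req *r, const unsigned long *a)
{
	for (int i = 0; i < 6; i++)
		r->args[i] = a[i];
	/* release: payload writes cannot be reordered after this store */
	atomic_store_explicit(&r->valid, 1, memory_order_release);
}

int try_consume(struct req *r, unsigned long *out)
{
	/* acquire: pairs with the release store above */
	if (!atomic_load_explicit(&r->valid, memory_order_acquire))
		return 0;	/* stray wakeup: caller retries */
	for (int i = 0; i < 6; i++)
		out[i] = r->args[i];
	atomic_store_explicit(&r->valid, 0, memory_order_release); /* ack */
	return 1;
}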
@ -2376,7 +2486,7 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
{
|
||||
struct mcctrl_ioctl_getrusage_desc desc;
|
||||
struct rusage_global *rusage_global = ihk_os_get_rusage(ihk_os);
|
||||
struct mckernel_rusage *rusage = NULL;
|
||||
struct ihk_os_rusage *rusage = NULL;
|
||||
int ret = 0;
|
||||
int i;
|
||||
unsigned long ut;
|
||||
@ -2388,13 +2498,13 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
goto out;
|
||||
}
|
||||
|
||||
rusage = kmalloc(sizeof(struct mckernel_rusage), GFP_KERNEL);
|
||||
rusage = kmalloc(sizeof(struct ihk_os_rusage), GFP_KERNEL);
|
||||
if (!rusage) {
|
||||
printk("%s: kmalloc failed\n", __FUNCTION__);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
memset(rusage, 0, sizeof(struct mckernel_rusage));
|
||||
memset(rusage, 0, sizeof(struct ihk_os_rusage));
|
||||
|
||||
/* Compile statistics */
|
||||
for (i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
|
||||
@ -2415,15 +2525,17 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
st += rusage_global->cpu[i].system_tsc * rusage_global->ns_per_tsc / 1000;
|
||||
rusage->cpuacct_usage_percpu[i] = wt;
|
||||
}
|
||||
rusage->cpuacct_stat_system = st / 10000000;
|
||||
rusage->cpuacct_stat_user = ut / 10000000;
|
||||
rusage->cpuacct_stat_system = (st + 10000000 - 1) / 10000000;
|
||||
rusage->cpuacct_stat_user = (ut + 10000000 - 1) / 10000000;
|
||||
rusage->cpuacct_usage = ut;
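/*
 * [Editor's note: illustration only, not part of the diff. The two changed
 * lines above replace truncating division by 10000000 with round-up
 * (ceiling) division, so a partially consumed tick is still counted.
 * Assuming the totals are in nanoseconds, 10000000 ns is 10 ms, i.e. one
 * tick at USER_HZ = 100. The idiom generalizes as below.]
 */
#define TICK_NS 10000000UL

/* Round-up division: (a + b - 1) / b, valid for b > 0. */
static inline unsigned long ns_to_ticks_ceil(unsigned long ns)
{
	return (ns + TICK_NS - 1) / TICK_NS;
}

/* e.g. ns_to_ticks_ceil(1) == 1 and ns_to_ticks_ceil(20000000) == 2,
 * whereas truncating division would report 0 and 2 respectively. */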

rusage->num_threads = rusage_global->num_threads;
rusage->max_num_threads = rusage_global->max_num_threads;

if (desc.size_rusage > sizeof(struct mckernel_rusage)) {
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n", __FUNCTION__, desc.size_rusage, sizeof(struct mckernel_rusage));
if (desc.size_rusage > sizeof(struct ihk_os_rusage)) {
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n",
__func__, desc.size_rusage,
sizeof(struct ihk_os_rusage));
ret = -EINVAL;
goto out;
}
@@ -2444,10 +2556,10 @@

extern void *get_user_sp(void);
extern void set_user_sp(unsigned long);
extern void restore_fs(unsigned long fs);
extern void save_fs_ctx(void *);
extern unsigned long get_fs_ctx(void *);
extern unsigned long get_rsp_ctx(void *);
extern void restore_tls(unsigned long addr);
extern void save_tls_ctx(void __user *ctx);
extern unsigned long get_tls_ctx(void __user *ctx);
extern unsigned long get_rsp_ctx(void *ctx);

long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
{
@@ -2491,14 +2603,15 @@ long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
return rc;
}

long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, struct file *file)
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *udesc,
struct file *file)
{
int rc = 0;
void *usp = get_user_sp();
struct mcos_handler_info *info;
struct host_thread *thread;
unsigned long flags;
struct uti_save_fs_desc desc;
struct uti_switch_ctx_desc desc;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd;

@@ -2508,21 +2621,26 @@ long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, stru
goto out;
}

if(copy_from_user(&desc, udesc, sizeof(struct uti_save_fs_desc))) {
if (copy_from_user(&desc, udesc, sizeof(struct uti_switch_ctx_desc))) {
printk("%s: Error: copy_from_user failed\n", __FUNCTION__);
rc = -EFAULT;
goto out;
}

save_fs_ctx(desc.lctx);
rc = arch_switch_ctx(&desc);
if (rc < 0) {
goto out;
}

save_tls_ctx(desc.lctx);
info = ihk_os_get_mcos_private_data(file);
thread = kmalloc(sizeof(struct host_thread), GFP_KERNEL);
memset(thread, '\0', sizeof(struct host_thread));
thread->pid = task_tgid_vnr(current);
thread->tid = task_pid_vnr(current);
thread->usp = (unsigned long)usp;
thread->lfs = get_fs_ctx(desc.lctx);
thread->rfs = get_fs_ctx(desc.rctx);
thread->ltls = get_tls_ctx(desc.lctx);
thread->rtls = get_tls_ctx(desc.rctx);
thread->handler = info;

write_lock_irqsave(&host_thread_lock, flags);
@@ -2568,9 +2686,9 @@ mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file)
read_unlock_irqrestore(&host_thread_lock, flags);
if (thread) {
if (arg)
restore_fs(thread->lfs);
restore_tls(thread->ltls);
else
restore_fs(thread->rfs);
restore_tls(thread->rtls);
goto out;
}
ret = -EINVAL;
@@ -2774,8 +2892,7 @@ long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
return -EFAULT;
}

/* debug */
if (0 && param.number == __NR_futex) {
if (param.number == __NR_futex) {
struct uti_futex_resp resp = {
.done = 0
};
@@ -2971,13 +3088,8 @@ mcexec_uti_attr(ihk_os_t os, struct uti_attr_desc __user *_desc)
cpumask_t *cpuset = NULL, *env_cpuset = NULL;
struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
ihk_device_t dev = ihk_os_to_dev(os);
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *cpu_topo;
struct mcctrl_cpu_topology *target_cpu = NULL;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *cpu_topo;
struct cpu_topology *target_cpu = NULL;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
struct node_topology *node_topo;
struct ihk_cache_topology *lcache_topo;
struct ihk_node_topology *lnode_topo;
@@ -3200,13 +3312,51 @@ out:
return rc;
}

static int __mcctrl_control_perm(unsigned int request)
{
int ret = 0;
kuid_t euid;

/* black list */
switch (request) {
case IHK_OS_AUX_PERF_NUM:
case IHK_OS_AUX_PERF_SET:
case IHK_OS_AUX_PERF_GET:
case IHK_OS_AUX_PERF_ENABLE:
case IHK_OS_AUX_PERF_DISABLE:
case IHK_OS_AUX_PERF_DESTROY:
euid = current_euid();
pr_debug("%s: request=0x%x, euid=%u\n",
__func__, request, euid.val);
if (euid.val) {
ret = -EPERM;
}
break;
default:
break;
}
pr_debug("%s: request=0x%x, ret=%d\n", __func__, request, ret);

return ret;
}
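/*
 * [Editor's note: illustration only, not part of the diff. The new
 * __mcctrl_control_perm() above is a deny-list check: only the listed PERF
 * requests require euid 0 (euid.val != 0 means non-root), everything else
 * passes. A minimal user-space analogue of the same shape, with hypothetical
 * request ids:]
 */
#include <unistd.h>
#include <errno.h>

enum { REQ_PERF_SET = 1, REQ_PERF_GET, REQ_OTHER };	/* hypothetical */

static int check_perm(unsigned int request)
{
	switch (request) {
	case REQ_PERF_SET:
	case REQ_PERF_GET:
		/* privileged requests: effective uid must be root */
		if (geteuid() != 0)
			return -EPERM;
		break;
	default:
		break;		/* everything else is unrestricted */
	}
	return 0;
}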

long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
struct file *file)
{
int ret;

ret = __mcctrl_control_perm(req);
if (ret) {
pr_err("%s: error: permission denied, req: %x\n",
__func__, req);
return ret;
}

switch (req) {
case MCEXEC_UP_PREPARE_IMAGE:
return mcexec_prepare_image(os,
(struct program_load_desc *)arg);
(struct program_load_desc *)arg,
file);
case MCEXEC_UP_TRANSFER:
return mcexec_transfer_image(os, (struct remote_transfer *)arg);

@@ -3272,8 +3422,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_UTI_GET_CTX:
return mcexec_uti_get_ctx(os, (struct uti_get_ctx_desc *)arg);

case MCEXEC_UP_UTI_SAVE_FS:
return mcexec_uti_save_fs(os, (struct uti_save_fs_desc *)arg, file);
case MCEXEC_UP_UTI_SWITCH_CTX:
return mcctrl_switch_ctx(os, (struct uti_switch_ctx_desc *)arg,
file);

case MCEXEC_UP_SIG_THREAD:
return mcexec_sig_thread(os, arg, file);
@@ -3320,14 +3471,6 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
return -EINVAL;
}

/* Per-CPU register manipulation functions */
struct mcctrl_os_cpu_response {
int done;
unsigned long val;
int err;
wait_queue_head_t wq;
};

int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
{
struct mcctrl_usrdata *usrdata;
@@ -3379,7 +3522,8 @@ int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
*ret_cpu = ch->send.queue->read_cpu;
ret = 0;

printk("%s: OS: %p, CPU: %d\n", __FUNCTION__, os, *ret_cpu);
pr_info("%s: OS: %lx, CPU: %d\n",
__func__, (unsigned long)os, *ret_cpu);

out_put_ppd:
mcctrl_put_per_thread_data(ptd);
@@ -3390,73 +3534,67 @@ out_put_ppd:
return ret;
}

void mcctrl_os_read_write_cpu_response(ihk_os_t os,
struct ikc_scd_packet *pisp)
{
struct mcctrl_os_cpu_response *resp;

/* XXX: What if caller thread is unblocked by a signal
* before this message arrives? */
resp = pisp->resp;
if (!resp) {
return;
}

resp->val = pisp->desc.val;
resp->done = 1;
resp->err = pisp->err;
wake_up_interruptible(&resp->wq);
}

int __mcctrl_os_read_write_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc,
enum mcctrl_os_cpu_operation op)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet isp;
struct mcctrl_os_cpu_response resp;
struct ihk_os_cpu_register *ldesc = NULL;
int do_free = 0;
int ret = -EINVAL;

if (!udp) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
ret = -EINVAL;
goto out;
}

if (cpu < 0 || cpu >= udp->cpu_info->n_cpus) {
pr_err("%s: error: cpu (%d) is out of range\n",
__func__, cpu);
ret = -EINVAL;
goto out;

}

/* Keep a dynamic structure around that can
* survive an early return due to a signal */
ldesc = kmalloc(sizeof(*ldesc), GFP_KERNEL);
if (!ldesc) {
printk("%s: ERROR: allocating cpu register desc\n", __FUNCTION__);
return -ENOMEM;
}
*ldesc = *desc;

memset(&isp, '\0', sizeof(struct ikc_scd_packet));
isp.msg = SCD_MSG_CPU_RW_REG;
isp.op = op;
isp.desc = *desc;
isp.resp = &resp;
isp.pdesc = virt_to_phys(ldesc);

resp.done = 0;
resp.err = 0;
init_waitqueue_head(&resp.wq);

mb();
ret = mcctrl_ikc_send(os, cpu, &isp);
if (ret < 0) {
ret = mcctrl_ikc_send_wait(os, cpu, &isp, 0, NULL, &do_free, 1, ldesc);
if (ret != 0) {
printk("%s: ERROR sending IKC msg: %d\n", __FUNCTION__, ret);
goto out;
}

/* Wait for response */
ret = wait_event_interruptible(resp.wq, resp.done);
if (ret < 0) {
printk("%s: ERROR after wait: %d\n", __FUNCTION__, ret);
goto out;
}

ret = resp.err;
if (ret != 0) {
printk("%s: ERROR receive: %d\n", __FUNCTION__, resp.err);
goto out;
}

/* Update if read */
if (ret == 0 && op == MCCTRL_OS_CPU_READ_REGISTER) {
desc->val = resp.val;
if (op == MCCTRL_OS_CPU_READ_REGISTER) {
desc->val = ldesc->val;
}

dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
/* Notify caller (for future async implementation) */
atomic_set(&desc->sync, 1);

dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: CPU: %d, addr_ext: 0x%lx, val: 0x%lx\n",
__FUNCTION__,
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
desc->addr, desc->val);
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"), cpu,
desc->addr_ext, desc->val);

out:
if (do_free) {
kfree(ldesc);
}
return ret;
}
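/*
 * [Editor's note: illustration only, not part of the diff. The rewritten
 * register read/write above heap-allocates the descriptor (ldesc) instead of
 * letting the remote side write into a stack frame, and a do_free flag
 * appears to record whether the caller still owns the buffer when the wait
 * is cut short by a signal. A generic user-space sketch of that ownership
 * hand-off; all names and the send_and_wait() stub are hypothetical.]
 */
#include <stdlib.h>

struct reply { long val; };

/* Stub standing in for an IKC send-and-wait: on an early (interrupted)
 * return a real implementation would keep the buffer and clear *do_free so
 * the caller must NOT free it; the late reply path frees it instead. */
static int send_and_wait(struct reply *buf, int *do_free)
{
	buf->val = 42;		/* pretend the reply arrived synchronously */
	(void)do_free;
	return 0;
}

static long read_reg_safe(void)
{
	int do_free = 1;
	long val = -1;
	struct reply *buf = malloc(sizeof(*buf));

	if (!buf)
		return -1;
	if (send_and_wait(buf, &do_free) == 0)
		val = buf->val;	/* reply arrived; safe to read */
	if (do_free)
		free(buf);	/* otherwise the reply path owns it */
	return val;
}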


@@ -21,8 +21,10 @@
* 2013/08/19 shirasawa mcexec forward signal to MIC process
*/

#include <linux/sched.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
@@ -80,11 +82,13 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
#ifdef MCEXEC_BIND_MOUNT
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
#endif // MCEXEC_BIND_MOUNT
{ .request = MCEXEC_UP_UTI_GET_CTX, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTI_SAVE_FS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTI_SWITCH_CTX, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
@@ -170,14 +174,6 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
/* Wait for os running */
if (ihk_os_wait_for_status(os[os_index], IHK_OS_STATUS_RUNNING, 0, 200) != 0) {
printk("IHK: OS does not become RUNNING in shutdown. Force shutdown.\n");
/* send nmi to force shutdown */
ihk_os_send_nmi(os[os_index], 3);
mdelay(200);
}

pager_cleanup();
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);

@@ -208,6 +208,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_PERF_ACK:
case SCD_MSG_SEND_SIGNAL_ACK:
case SCD_MSG_PROCFS_ANSWER:
case SCD_MSG_REMOTE_PAGE_FAULT_ANSWER:
case SCD_MSG_CPU_RW_REG_RESP:
mcctrl_wakeup_cb(__os, pisp);
break;

@@ -238,10 +240,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
get_vdso_info(__os, pisp->arg);
break;

case SCD_MSG_CPU_RW_REG_RESP:
mcctrl_os_read_write_cpu_response(__os, pisp);
break;

case SCD_MSG_EVENTFD:
dkprintf("%s: SCD_MSG_EVENTFD,pisp->eventfd_type=%d\n", __FUNCTION__, pisp->eventfd_type);
mcctrl_eventfd(__os, pisp);
@@ -464,7 +462,7 @@ int prepare_ikc_channels(ihk_os_t os)
int i;
int ret = 0;

usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_ATOMIC);
if (!usrdata) {
printk("%s: error: allocating mcctrl_usrdata\n", __FUNCTION__);
ret = -ENOMEM;
@@ -490,7 +488,7 @@ int prepare_ikc_channels(ihk_os_t os)
usrdata->num_channels = usrdata->cpu_info->n_cpus;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
GFP_ATOMIC);

if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
@@ -499,7 +497,7 @@ int prepare_ikc_channels(ihk_os_t os)
}

usrdata->ikc2linux = kzalloc(sizeof(struct ihk_ikc_channel_desc *) *
nr_cpu_ids, GFP_KERNEL);
nr_cpu_ids, GFP_ATOMIC);

if (!usrdata->ikc2linux) {
printk("Error: cannot allocate ikc2linux channels.\n");
@@ -515,6 +513,7 @@ int prepare_ikc_channels(ihk_os_t os)

init_waitqueue_head(&usrdata->wq_procfs);
mutex_init(&usrdata->reserve_lock);
mutex_init(&usrdata->part_exec_lock);

for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
@@ -523,10 +522,8 @@ int prepare_ikc_channels(ihk_os_t os)

INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
INIT_LIST_HEAD(&usrdata->part_exec_list);

mutex_init(&usrdata->part_exec.lock);
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
usrdata->part_exec.nr_processes = -1;
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
spin_lock_init(&usrdata->wakeup_descs_lock);

@@ -582,6 +579,18 @@ void destroy_ikc_channels(ihk_os_t os)

kfree(usrdata->channels);
kfree(usrdata->ikc2linux);

mutex_lock(&usrdata->part_exec_lock);
while (!list_empty(&usrdata->part_exec_list)) {
struct mcctrl_part_exec *pe;

pe = list_first_entry(&usrdata->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe->chain);
kfree(pe);
}
mutex_unlock(&usrdata->part_exec_lock);

kfree(usrdata);
}


@@ -69,6 +69,9 @@
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_PROCFS_RELEASE 0x15

#define SCD_MSG_REMOTE_PAGE_FAULT 0x18
#define SCD_MSG_REMOTE_PAGE_FAULT_ANSWER 0x19

#define SCD_MSG_DEBUG_LOG 0x20

#define SCD_MSG_SYSFS_REQ_CREATE 0x30
@@ -121,12 +124,6 @@ enum mcctrl_os_cpu_operation {
MCCTRL_OS_CPU_MAX_OP
};

/* Used to wake-up a Linux thread futex_wait()-ing */
struct uti_futex_resp {
int done;
wait_queue_head_t wq;
};

struct ikc_scd_packet {
struct ihk_ikc_packet_header header;
int msg;
@@ -157,7 +154,7 @@ struct ikc_scd_packet {

/* SCD_MSG_CPU_RW_REG */
struct {
struct ihk_os_cpu_register desc;
unsigned long pdesc; /* Physical addr of the descriptor */
enum mcctrl_os_cpu_operation op;
void *resp;
};
@@ -172,6 +169,14 @@ struct ikc_scd_packet {
void *resp;
int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */
} futex;

/* SCD_MSG_REMOTE_PAGE_FAULT */
struct {
int target_cpu;
int fault_tid;
unsigned long fault_address;
unsigned long fault_reason;
};
};
/* char padding[8]; */ /* We want the size to be 128 bytes */
};
@@ -289,11 +294,7 @@ struct cache_topology {
struct list_head chain;
};

#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology {
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology {
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
@@ -323,13 +324,20 @@ struct process_list_item {
wait_queue_head_t pli_wq;
};

#define PE_LIST_MAXLEN 5

struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
/* number of processes to let in / out the synchronization point */
int nr_processes_left;
/* number of processes which have joined the partition */
int nr_processes_joined;
int process_rank;
pid_t node_proxy_pid;
cpumask_t cpus_used;
struct list_head pli_list;
struct list_head chain;
};

#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
@@ -352,6 +360,7 @@ struct mcctrl_usrdata {
int job_pos;
int mcctrl_dma_abort;
struct mutex reserve_lock;
struct mutex part_exec_lock;
unsigned long last_thread_exec;
wait_queue_head_t wq_procfs;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
@@ -367,7 +376,7 @@ struct mcctrl_usrdata {
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
struct mcctrl_part_exec part_exec;
struct list_head part_exec_list;
int perf_event_num;
};

@@ -448,40 +457,8 @@ void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data);
void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd);
void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* ptd);
#ifdef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
static inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
unsigned long flags;

/* Check if data for this thread exists */
write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);

list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
if (ptd_iter->task == task) {
ptd = ptd_iter;
break;
}
}

if (ptd) {
if (atomic_read(&ptd->refcount) <= 0) {
printk("%s: ERROR: use-after-free detected (%d)", __FUNCTION__, atomic_read(&ptd->refcount));
ptd = NULL;
goto out;
}
atomic_inc(&ptd->refcount);
}

out:
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ptd;
}
#else /* POSTK_DEBUG_ARCH_DEP_56 */
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task);
#endif /* POSTK_DEBUG_ARCH_DEP_56 */
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task);
int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len);

void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
@@ -526,24 +503,6 @@ void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);

/* archdep.c */
#ifndef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/

int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
@@ -573,8 +532,28 @@ struct ihk_perf_event_attr{
};

struct mcctrl_ioctl_getrusage_desc {
void* rusage;
struct ihk_os_rusage *rusage;
size_t size_rusage;
};

/* uti */
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *desc,
struct file *file);
long arch_switch_ctx(struct uti_switch_ctx_desc *desc);

struct host_thread {
struct list_head list;
struct mcos_handler_info *handler;
int pid;
int tid;
unsigned long usp;
unsigned long ltls;
unsigned long rtls;
};

/* Used to wake-up a Linux thread futex_wait()-ing */
struct uti_futex_resp {
int done;
wait_queue_head_t wq;
};
#endif

@@ -1110,10 +1110,10 @@ static const struct procfs_entry pid_entry_stuff[] = {
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", 0444, &mckernel_buff_io),
PROC_REG("mem", 0400, NULL),
PROC_REG("mem", 0600, NULL),
PROC_REG("pagemap", 0444, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", 0444, &mckernel_buff_io),
PROC_REG("stat", 0444, &mckernel_buff_io),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", 0444, &mckernel_buff_io),
// PROC_REG("syscall", S_IRUGO, NULL),

@@ -43,6 +8 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/mount.h>
#include <linux/kdev_t.h>
#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@@ -178,8 +180,8 @@ int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data)
return ret;
}

#ifndef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task)
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
@@ -208,7 +210,6 @@ struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ptd;
}
#endif /* !POSTK_DEBUG_ARCH_DEP_56 */
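/*
 * [Editor's note: illustration only, not part of the diff. The per-thread
 * lookup above hashes a task_struct pointer by discarding the low four
 * alignment bits, which carry no entropy, and masking with a power-of-two
 * table size. Standalone sketch:]
 */
#include <stdint.h>

#define HASH_SIZE 32				/* must be a power of two */
#define HASH_MASK (HASH_SIZE - 1)

static inline unsigned int ptr_hash(const void *p)
{
	/* the low bits of an aligned pointer are always zero; shift them
	 * out before masking so entries spread across all buckets */
	return (unsigned int)(((uintptr_t)p >> 4) & HASH_MASK);
}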

static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
struct syscall_response *res)
@@ -227,19 +228,19 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
c = (usrdata->channels + packet->ref)->c;

/* If spinning, no need for IKC message */
if (__sync_bool_compare_and_swap(&res->req_thread_status,
if (cmpxchg(&res->req_thread_status,
IHK_SCD_REQ_THREAD_SPINNING,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) ==
IHK_SCD_REQ_THREAD_SPINNING) {
dprintk("%s: no need to send IKC message for PID %d\n",
__FUNCTION__, packet->pid);
__FUNCTION__, packet->pid);
return ret;
}
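/*
 * [Editor's note: illustration only, not part of the diff. The hunk above
 * moves from the GCC builtin, which returns a boolean, to the kernel's
 * cmpxchg(), which returns the value previously stored at the location;
 * success therefore becomes "returned old value == expected". The two
 * equivalent forms, expressed in user space with GCC builtins:]
 */
static long status;	/* stand-in for res->req_thread_status */

#define SPINNING	0L
#define TO_BE_WOKEN	1L

static int try_claim_bool(void)
{
	/* boolean form: non-zero iff the swap actually happened */
	return __sync_bool_compare_and_swap(&status, SPINNING, TO_BE_WOKEN);
}

static int try_claim_val(void)
{
	/* value form (mirrors kernel cmpxchg): compare the old value */
	return __sync_val_compare_and_swap(&status, SPINNING, TO_BE_WOKEN)
		== SPINNING;
}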

/* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing.
Note that mcexec_terminate_thread() and remote page fault and
returning EINTR would compete. */
if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
Note that mcexec_terminate_thread() and returning EINTR would compete. */
if (smp_load_acquire(&res->req_thread_status) == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
printk("%s: INFO: someone else is waking up the McKernel thread, "
"pid: %d, req status: %lu, syscall nr: %lu\n",
__FUNCTION__, packet->pid,
@@ -247,9 +248,10 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
}

/* The thread is not spinning any more, make sure it's descheduled */
if (!__sync_bool_compare_and_swap(&res->req_thread_status,
if (cmpxchg(&res->req_thread_status,
IHK_SCD_REQ_THREAD_DESCHEDULED,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) !=
IHK_SCD_REQ_THREAD_DESCHEDULED) {
printk("%s: WARNING: inconsistent requester status, "
"pid: %d, req status: %lu, syscall nr: %lu\n",
__FUNCTION__, packet->pid,
@@ -273,7 +275,7 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
unsigned long *ret)
{
struct ikc_scd_packet *packet;
struct ikc_scd_packet *free_packet = NULL;
struct ikc_scd_packet *free_packet = NULL;
struct syscall_request *req;
struct syscall_response *resp;
unsigned long syscall_ret;
@@ -282,15 +284,16 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
unsigned long phys;
struct syscall_request _request[2];
struct syscall_request *request;
struct syscall_request *request = NULL;
int retry;

if (((unsigned long)_request ^ (unsigned long)(_request + 1)) &
~(PAGE_SIZE -1))
request = _request + 1;
else
request = _request;
request = kmalloc(sizeof(struct syscall_request), GFP_ATOMIC);
if (!request) {
printk("%s: ERROR: allocating request\n", __func__);
syscall_ret = -ENOMEM;
goto no_ppd;
}
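/*
 * [Editor's note: illustration only, not part of the diff. The removed
 * lines above relied on an XOR trick: two addresses lie on the same page
 * exactly when their page-number bits agree, i.e.
 * (a ^ b) & ~(PAGE_SIZE - 1) == 0. The old code kept two adjacent copies of
 * the request on the stack and picked whichever did not straddle a page
 * boundary; the new code sidesteps the issue with kmalloc(). Sketch, under
 * the assumption that the struct fits within one page:]
 */
#include <stdint.h>

#define PAGE_SIZE 4096UL

static inline int same_page(const void *a, const void *b)
{
	return (((uintptr_t)a ^ (uintptr_t)b) & ~(PAGE_SIZE - 1)) == 0;
}

/* copy1 is assumed to start right after copy0: if the two starts fall on
 * different pages, copy0 crosses the boundary and copy1 begins on the
 * fresh page, so copy1 is the non-straddling choice. */
static inline void *pick_request_copy(void *copy0, void *copy1)
{
	return same_page(copy0, copy1) ? copy0 : copy1;
}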

request->number = num;
request->args[0] = arg1;
request->args[1] = arg2;
@@ -305,8 +308,9 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,

if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
__func__, task_tgid_vnr(current));
syscall_ret = -EINVAL;
goto no_ppd;
}

ptd = mcctrl_get_per_thread_data(ppd, current);
@@ -454,11 +458,13 @@ out:
out_put_ppd:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
no_ptd:
dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n",
__FUNCTION__, task_pid_vnr(current), num, syscall_ret);

mcctrl_put_per_proc_data(ppd);
no_ppd:
kfree(request);
return syscall_ret;
}

@@ -479,214 +485,43 @@ extern struct host_thread *host_threads;
extern rwlock_t host_thread_lock;
#endif

int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
uint64_t reason, struct mcctrl_per_proc_data *ppd,
struct ikc_scd_packet *packet)
{
struct ikc_scd_packet *packet;
struct ikc_scd_packet *free_packet = NULL;
struct syscall_request *req;
struct syscall_response *resp;
int error;
struct wait_queue_head_list_node *wqhln;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
unsigned long phys;
int retry;
struct mcctrl_wakeup_desc *desc;
int do_frees = 1;

dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);

/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));

if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}

ptd = mcctrl_get_per_thread_data(ppd, current);
if (!ptd) {
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
error = -ENOENT;
goto no_ptd;
}
pr_ptd("get", task_pid_vnr(current), ptd);
packet = (struct ikc_scd_packet *)ptd->data;
if (!packet) {
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
error = -ENOENT;
goto out_put_ppd;
}

req = &packet->req;

/* Map response structure */
phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys, sizeof(*resp), NULL, 0);
if (!resp) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}

retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln) {
printk("WARNING: couldn't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
memset(wqhln, 0, sizeof(struct wait_queue_head_list_node));

/* Prepare per-thread wait queue head */
wqhln->task = current;
/* Save the TID explicitly, because mcexec_syscall(), where the request
* will be matched, is in IRQ context and can't call task_pid_vnr() */
wqhln->rtid = task_pid_vnr(current);
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);

irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Add to exact list */
list_add_tail(&wqhln->list, &ppd->wq_list_exact);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

/* Request page fault */
resp->ret = -EFAULT;
resp->fault_address = (unsigned long)fault_addr;
resp->fault_reason = reason;
resp->stid = task_pid_vnr(current);
packet->msg = SCD_MSG_REMOTE_PAGE_FAULT;
packet->fault_address = (unsigned long)fault_addr;
packet->fault_reason = reason;

#define STATUS_PAGER_COMPLETED 1
#define STATUS_PAGE_FAULT 3
req->valid = 0;

if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
/* we need to alloc desc ourselves because GFP_ATOMIC */
retry_alloc:
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
if (!desc) {
pr_warn("WARNING: couldn't alloc remote page fault wait desc, retrying..\n");
goto retry_alloc;
}

mb();
resp->status = STATUS_PAGE_FAULT;

retry = 0;
for (;;) {
dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n",
__FUNCTION__, task_pid_vnr(current), fault_addr);
/* wait for response */
error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);

/* Delay signal handling */
if (error == -ERESTARTSYS) {
printk("%s: INFO: interrupted by signal\n", __FUNCTION__);
retry++;
if (retry < 5) { /* mcexec is alive */
printk("%s: INFO: retry=%d\n", __FUNCTION__, retry);
continue;
}
}

/* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n",
__FUNCTION__, task_pid_vnr(current), fault_addr);

if (retry >= 5) {
kfree(wqhln);
kprintf("%s: INFO: mcexec is gone or retry count exceeded,pid=%d,retry=%d\n", __FUNCTION__, task_tgid_vnr(current), retry);
error = -EINVAL;
goto out;
}

if (error) {
kfree(wqhln);
printk("remote_page_fault:interrupted. %d\n", error);
goto out;
}
else {
/* Update packet reference */
packet = wqhln->packet;
free_packet = packet;
req = &packet->req;
{
unsigned long phys2;
struct syscall_response *resp2;
phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys2, sizeof(*resp), NULL, 0);

if (resp != resp2) {
resp = resp2;
phys = phys2;
printk("%s: updated new remote PA for resp\n", __FUNCTION__);
}
}
}

if (!req->valid) {
printk("remote_page_fault:not valid\n");
}
req->valid = 0;

/* check result */
if (req->number != __NR_mmap) {
printk("remote_page_fault:unexpected response. %lx %lx\n",
req->number, req->args[0]);
error = -EIO;
goto out;
}
#define PAGER_REQ_RESUME 0x0101
else if (req->args[0] != PAGER_REQ_RESUME) {
resp->ret = pager_call(usrdata->os, (void *)req);

if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
}

mb();
resp->status = STATUS_PAGER_COMPLETED;
break;
//continue;
}
else {
error = req->args[1];
if (error) {
printk("remote_page_fault:response %d\n", error);
kfree(wqhln);
goto out;
}
}
break;
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet,
0, desc, &do_frees, 0);
if (do_frees)
kfree(desc);
if (error < 0) {
pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n",
__func__, packet->pid, error);
}

kfree(wqhln);
error = 0;
out:
/* Release remote page-fault response packet */
if (free_packet) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet);
}

ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));

out_put_ppd:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason, error);

mcctrl_put_per_proc_data(ppd);
__func__, task_pid_vnr(current), fault_addr,
(unsigned long)reason, error);
return error;
}

@@ -704,7 +539,11 @@ out_put_ppd:
#define USE_VM_INSERT_PFN 1

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#if defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)
static vm_fault_t rus_vm_fault(struct vm_fault *vmf)
#else
static int rus_vm_fault(struct vm_fault *vmf)
#endif
{
struct vm_area_struct *vma = vmf->vma;
#else
@@ -726,70 +565,70 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#endif
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
struct ikc_scd_packet *packet;
struct task_struct *task = current;
struct ikc_scd_packet packet = { };
unsigned long rsysnum = 0;
int ret = 0;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %#lx page %p\n",
vmf->flags, vmf->pgoff, vmf->address, vmf->page);
unsigned long addr = vmf->address;
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
void __user *addr = vmf->virtual_address;
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */

/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
if (!ppd) {
kprintf("%s: INFO: no per-process structure for pid %d (tid %d), try to use pid %d\n",
__FUNCTION__, task_tgid_vnr(current), task_pid_vnr(current), vma->vm_mm->owner->pid);
ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
pr_err("%s: INFO: no per-process structure for "
"pid %d (tid %d), trying to use pid %d\n",
__func__,
task_tgid_vnr(task), task_pid_vnr(task),
vma->vm_mm->owner->pid);
task = vma->vm_mm->owner;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
}

if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
pr_err("%s: ERROR: no per-process structure for PID %d??\n",
__func__, task_tgid_vnr(task));
ret = VM_FAULT_SIGBUS;
goto no_ppd;
}
packet.fault_tid = ppd->pid;

ptd = mcctrl_get_per_thread_data(ppd, current);
if (!ptd) {
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
ret = VM_FAULT_SIGBUS;
goto no_ptd;
ptd = mcctrl_get_per_thread_data(ppd, task);
if (ptd) {
struct ikc_scd_packet *ptd_packet;

pr_ptd("get", task_pid_vnr(task), ptd);
ptd_packet = (struct ikc_scd_packet *)ptd->data;
if (ptd_packet) {
packet.target_cpu = ptd_packet->ref;
packet.fault_tid = ptd_packet->req.rtid;
rsysnum = ptd_packet->req.number;
}
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(task), ptd);
}
pr_ptd("get", task_pid_vnr(current), ptd);
packet = (struct ikc_scd_packet *)ptd->data;
if (!packet) {

/* Don't even bother looking up NULL */
if (!addr) {
pr_warn("%s: WARNING: attempted NULL pointer access\n",
__func__);
ret = VM_FAULT_SIGBUS;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto put_and_out;
}

for (try = 1; ; ++try) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
vmf->address, &rpa, &pgsize);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
(unsigned long)vmf->virtual_address,
&rpa, &pgsize);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
(unsigned long)addr, &rpa, &pgsize);
#define NTRIES 2
if (!error || (try >= NTRIES)) {
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error translating 0x%#lx "
"(req: TID: %u, syscall: %lu)\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error translating 0x%p "
"(req: TID: %u, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error translating 0x%#lx "
"(req: TID: %u, syscall: %lu)\n",
__func__,
(unsigned long)addr,
packet.fault_tid, rsysnum);
}

break;
@@ -800,23 +639,14 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#define PF_WRITE 0x02
reason |= PF_WRITE;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
error = remote_page_fault(usrdata, (void *)vmf->address, reason);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = remote_page_fault(usrdata, vmf->virtual_address, reason);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = remote_page_fault(usrdata, (void *)addr,
reason, ppd, &packet);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error forwarding PF for 0x%#lx "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error forwarding PF for 0x%p "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error forwarding PF for 0x%#lx "
"(req: TID: %d, syscall: %lu)\n",
__func__,
(unsigned long)addr,
packet.fault_tid, rsysnum);
break;
}
}
@@ -825,11 +655,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto put_and_out;
}

#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
rva = vmf->address & ~(pgsize - 1);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
rva = (unsigned long)vmf->virtual_address & ~(pgsize - 1);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
rva = (unsigned long)addr & ~(pgsize - 1);
rpa = rpa & ~(pgsize - 1);
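/*
 * [Editor's note: illustration only, not part of the diff. The two lines
 * above round the faulting virtual and physical addresses down to the start
 * of the enclosing page. The mask trick requires pgsize to be a power of
 * two:]
 */
#include <stdint.h>

static inline uintptr_t align_down(uintptr_t addr, uintptr_t pgsize)
{
	/* pgsize must be a power of two; ~(pgsize - 1) clears the offset
	 * bits, leaving the page base address */
	return addr & ~(pgsize - 1);
}

/* e.g. align_down(0x12345, 0x1000) == 0x12000, and the in-page offset is
 * addr - align_down(addr, pgsize). */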

phys = ihk_device_map_memory(dev, rpa, pgsize);
@@ -849,21 +675,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)

error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error inserting mapping for 0x%#lx "
"(req: TID: %d, syscall: %lu) error: %d, "
"vm_start: 0x%lx, vm_end: 0x%lx\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number, error,
vma->vm_start, vma->vm_end);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error inserting mapping for 0x%p "
"(req: TID: %d, syscall: %lu) error: %d, "
"vm_start: 0x%lx, vm_end: 0x%lx\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number, error,
vma->vm_start, vma->vm_end);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error inserting mapping for 0x%#lx "
"(req: TID: %d, syscall: %lu) error: %d,"
" vm_start: 0x%lx, vm_end: 0x%lx\n",
__func__,
(unsigned long)addr, packet.fault_tid,
rsysnum, error,
vma->vm_start, vma->vm_end);
}
}
else
@@ -875,16 +693,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
pfn+pix);
#endif
if (error) {
#if 1 /* POSTK_DEBUG_TEMP_FIX_11 */ /* rus_vm_fault() multi-thread fix */
printk("%s: vm_insert_pfn returned %d\n", __FUNCTION__, error);
pr_err("%s: vm_insert_pfn returned %d\n",
__func__, error);
if (error == -EBUSY) {
error = 0;
} else {
break;
}
#else /* POSTK_DEBUG_TEMP_FIX_11 */
break;
#endif /* POSTK_DEBUG_TEMP_FIX_11 */
}
}
#else
@@ -892,17 +707,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#endif
ihk_device_unmap_memory(dev, phys, pgsize);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: remote PF failed for 0x%#lx, pgoff: %lu "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->address, vmf->pgoff,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: remote PF failed for 0x%p, pgoff: %lu "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address, vmf->pgoff,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: remote PF failed for 0x%#lx, pgoff: %lu"
" (req: TID: %d, syscall: %lu)\n",
__func__,
(unsigned long)addr, vmf->pgoff,
packet.fault_tid, rsysnum);
ret = VM_FAULT_SIGBUS;
goto put_and_out;
}
@@ -910,9 +719,6 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;

put_and_out:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
mcctrl_put_per_proc_data(ppd);
no_ppd:
return ret;
@@ -1075,6 +881,7 @@ struct pager_create_result {
int maxprot;
uint32_t flags;
size_t size;
int pgshift;
char path[PATH_MAX];
};

@@ -1136,6 +943,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
struct kstat st;
int mf_flags = 0;
unsigned long irqflags;
int pgshift = 0;

dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);

@@ -1144,8 +952,16 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
goto out;
}
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
/* treat memory devices as regular files */
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1) &&
(MINOR(st.rdev) == 1 || // /dev/mem
MINOR(st.rdev) == 5)) { // /dev/zero
/* treat memory devices and zero devices as regular files */
}
else if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
error = -ENODEV;
dprintk("%s(%d,%lx):unmappable device %x\n",
__func__, fd, (long)result_pa, st.mode);
goto out;
}
else if (!S_ISREG(st.mode)) {
error = -ESRCH;
@@ -1160,6 +976,30 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
goto out;
}

/* Shared memory hack */
{
char *pathbuf, *fullpath;
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
!strncmp("/dev/shm/", fullpath, 9) ||
(!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
fullpath, 30) && !strstr(fullpath, "dstore_sm.lock"))) {
printk("%s: treating %s as a device file..\n",
__func__, fullpath);
kfree(pathbuf);

error = -ESRCH;
goto out;
}

kfree(pathbuf);
}
}
}

inode = file->f_path.dentry->d_inode;
if (!inode) {
error = -EBADF;
@@ -1167,6 +1007,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
goto out;
}

if (!strcmp(inode->i_sb->s_type->name, "tmpfs")) {
mf_flags = MF_IS_REMOVABLE;
}

if (!strcmp(inode->i_sb->s_type->name, "proc")) {
error = -ESRCH;
goto out;
@@ -1188,13 +1032,14 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
}

if (inode->i_op == mcctrl_hugetlbfs_inode_operations) {
struct hstate *h = hstate_file(file);

pgshift = PAGE_SHIFT + huge_page_order(h);
mf_flags = MF_HUGETLBFS;
/* pager is used as handle id on mckernel side, use inode */
pager = (void *)st.ino;
/* retrofit blksize in resp as well through st.size field;
* the actual file size is not used
*/
st.size = st.blksize;
/* file size is not used */
st.size = 0;
goto out_reply;
}

@@ -1214,7 +1059,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
pager = newpager;
newpager = NULL;

/* Intel MPI library and shared memory "prefetch" */
/* Shared libraries prefetch */
{
char *pathbuf, *fullpath;

@@ -1222,15 +1067,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/dev/shm/Intel_MPI", fullpath, 18)) {
mf_flags = (MF_PREMAP | MF_ZEROFILL);
dprintk("%s: filename: %s, premap & zerofill\n",
__FUNCTION__, fullpath);
}
else if (strstr(fullpath, "libmpi") ||
strstr(fullpath, "libiomp") ||
strstr(fullpath, "libpthread") ||
strstr(fullpath, "libc.so")) {
if (strstr(fullpath, ".so")) {
mf_flags = MF_PREFETCH;
dprintk("%s: filename: %s, prefetch\n",
__FUNCTION__, fullpath);
@@ -1281,6 +1118,7 @@ out_reply:
resp->maxprot = maxprot;
resp->flags = mf_flags;
resp->size = st.size;
resp->pgshift = pgshift;

error = pager_get_path(file, resp->path);

@@ -1355,8 +1193,9 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
uintptr_t phys = -1;
ihk_device_t dev = ihk_os_to_dev(os);
void *buf = NULL;
loff_t pos;
loff_t pos, fsize;
unsigned long flags;
unsigned int major, minor;

dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa);

@@ -1378,6 +1217,21 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
goto out;
}

major = MAJOR(file->f_mapping->host->i_rdev);
minor = MINOR(file->f_mapping->host->i_rdev);
if ((major == 1 && minor == 1) || // /dev/mem
(major == 1 && minor == 5)) { // /dev/zero
/* Nothing to check */
}
else {
/* Check if the target page fits in the file */
fsize = i_size_read(file->f_mapping->host);
if (off > fsize) {
ss = 0;
goto out;
}
}
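/*
 * [Editor's note: illustration only, not part of the diff. The hunk above
 * skips the size check for the memory character devices (major 1, minors 1
 * and 5, i.e. /dev/mem and /dev/zero), whose inode size is not meaningful,
 * and clamps reads starting past EOF to zero bytes for everything else. A
 * user-space analogue using struct stat:]
 */
#include <sys/stat.h>
#include <sys/sysmacros.h>

/* Returns how many bytes may be requested at off: 0 if off is past EOF,
 * len unchanged for the sizeless memory devices. */
static size_t readable_at(const struct stat *st, off_t off, size_t len)
{
	if (S_ISCHR(st->st_mode) && major(st->st_rdev) == 1 &&
	    (minor(st->st_rdev) == 1 || minor(st->st_rdev) == 5))
		return len;		/* /dev/mem, /dev/zero: no EOF */
	if (off > st->st_size)
		return 0;		/* read starts past end of file */
	return len;
}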

phys = ihk_device_map_memory(dev, rpa, size);
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
if (!buf) {
@@ -1585,6 +1439,26 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
#define ANY_WHERE 0
if (prot_and_flags & MAP_LOCKED) prot_and_flags |= MAP_POPULATE;

/* Shared memory hack */
{
char *pathbuf, *fullpath;
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
!strncmp("/dev/shm/", fullpath, 9) ||
!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
fullpath, 30)) {
dprintk("%s: pre-populating %s..\n",
__func__, fullpath);
prot_and_flags |= MAP_POPULATE;
}
kfree(pathbuf);
}
}
}

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
down_write(&current->mm->mmap_sem);

@@ -1598,7 +1472,12 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
#endif

if (IS_ERR_VALUE(va)) {
printk("pager_req_map(%p,%d,%lx,%lx,%lx):do_mmap_pgoff failed. %d\n", os, fd, len, off, result_rpa, (int)va);
if ((int)va != -ENOTSUPP) {
pr_err("%s(%p,%d,%lx,%lx,%lx): "
"do_mmap_pgoff failed. %d\n",
__func__, os, fd, len, off, result_rpa,
(int)va);
}
error = va;
goto out;
}
@@ -1712,16 +1591,9 @@ retry:
pfn |= PFN_VALID | PFN_PRESENT;

/* Check if mapping is write-combined */
#ifdef POSTK_DEBUG_ARCH_DEP_12
if (pte_is_write_combined(*pte)) {
pfn |= PFN_WRITE_COMBINED;
}
#else /* POSTK_DEBUG_ARCH_DEP_12 */
if ((pte_flags(*pte) & _PAGE_PWT) &&
!(pte_flags(*pte) & _PAGE_PCD)) {
pfn |= _PAGE_PWT;
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
}
pte_unmap(pte);
}
@@ -1754,10 +1626,27 @@ retry:
#else
fault = handle_mm_fault(current->mm, vma, va, flags);
#endif
#ifdef SC_DEBUG
if (fault != 0) {
printk("%s: error: faulting %lx at off: %lu\n",
__FUNCTION__, va, off);
char *pathbuf = NULL;
char *fullpath;

if (vma->vm_file) {
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&vma->vm_file->f_path,
pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
printk("%s: WARNING: couldn't fault 0x%lx"
" at off: %lu in %s\n",
__FUNCTION__, va, off, fullpath);
}

kfree(pathbuf);
}
}
}
#endif

page_fault_attempted = 1;
goto retry;
@@ -2052,6 +1941,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
}
if (addr < end) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
/* Revert permission */
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
error = zap_vma_ptes(vma, addr, end-addr);
if (error) {
mcctrl_zap_page_range(vma, addr, end-addr,
@@ -2069,6 +1960,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
NULL);
}
else {
/* Revert permission */
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
zap_vma_ptes(vma, addr, end-addr);
}
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */
@@ -2124,7 +2017,10 @@ int release_user_space(uintptr_t start, uintptr_t len)
* \param chunks The number of chunks which make a core file image in the whole.
*/

static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks,
unsigned long cmdline_rphys, unsigned long cmdline_len)
{
char *fn = NULL;
struct file *file;
struct coretable *coretable;
int i, tablesize, error = 0;
@@ -2133,22 +2029,43 @@ static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
unsigned long phys, tablephys, rphys;
ihk_device_t dev = ihk_os_to_dev(os);
char *pt;
unsigned long cmdline_phys;
char *cmdline;

dprintk("coredump called as a pseudo syscall\n");

fn = kmalloc(PATH_MAX, GFP_ATOMIC);
if (!fn) {
dprintk("%s: ERROR: allocating file name\n", __func__);
error = -ENOMEM;
goto fail;
}

if (chunks <= 0) {
dprintk("no core data found!(%d)\n", chunks);
error = -EINVAL;
goto fail;
}

cmdline_phys = ihk_device_map_memory(dev, cmdline_rphys, cmdline_len);
cmdline = ihk_device_map_virtual(dev, cmdline_phys, cmdline_len, NULL,
0);
sprintf(fn, "mccore-%s.%d",
strrchr(cmdline, '/') ?
strrchr(cmdline, '/') + 1 : cmdline,
task_tgid_vnr(current));
pr_info("%s: fn=%s\n", __func__, fn);

ihk_device_unmap_virtual(dev, cmdline, cmdline_len);
ihk_device_unmap_memory(dev, cmdline_phys, cmdline_len);
|
||||
|
||||
/* Every Linux documentation insists we should not
|
||||
* open a file in the kernel module, but our karma
|
||||
* leads us here. Precisely, Here we emulate the core
|
||||
* dump routine of the Linux kernel in linux/fs/exec.c.
|
||||
* So we have a legitimate reason to do this.
|
||||
*/
|
||||
file = filp_open("core", O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
||||
file = filp_open(fn, O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
||||
if (IS_ERR(file) || !file->f_op) {
|
||||
dprintk("cannot open core file\n");
|
||||
error = PTR_ERR(file);
|
||||
@ -2218,6 +2135,7 @@ fail:
|
||||
/* make sure we do not travel to user land */
|
||||
error = -EINVAL;
|
||||
}
|
||||
kfree(fn);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -2273,7 +2191,8 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
|
||||
}
|
||||
|
||||
case __NR_coredump:
|
||||
ret = writecore(os, sc->args[1], sc->args[0]);
|
||||
ret = writecore(os, sc->args[1], sc->args[0], sc->args[2],
|
||||
sc->args[3]);
|
||||
break;
|
||||
|
||||
case __NR_sched_setparam: {
|
||||
|
||||
@ -157,13 +157,8 @@ static void free_node_topology(struct mcctrl_usrdata *udp)
|
||||
return;
|
||||
} /* free_node_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
struct cache_topology *cache;
|
||||
struct cache_topology *next;
|
||||
@ -179,13 +174,8 @@ static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
|
||||
static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu;
|
||||
struct mcctrl_cpu_topology *next;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu;
|
||||
struct cpu_topology *next;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
list_for_each_entry_safe(cpu, next, &udp->cpu_topology_list, chain) {
|
||||
list_del(&cpu->chain);
|
||||
@ -315,13 +305,8 @@ static int translate_cpumap(struct mcctrl_usrdata *udp,
|
||||
return error;
|
||||
} /* translate_cpumap() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
int error;
|
||||
struct cache_topology *topo = NULL;
|
||||
@ -355,21 +340,12 @@ out:
|
||||
return (error)? ERR_PTR(error): topo;
|
||||
} /* get_cache_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static struct mcctrl_cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
int index)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
int index)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
int error;
|
||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *topology = NULL;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *topology = NULL;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cache_topology *cache;
|
||||
struct ihk_cache_topology *saved_cache;
|
||||
|
||||
@ -387,12 +363,8 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
topology->saved = ihk_device_get_cpu_topology(dev,
|
||||
mckernel_cpu_2_hw_id(udp, index));
|
||||
|
||||
#ifdef POSTK_DEBUG_TEMP_FIX_21 /* IS_ERR() through return NULL */
|
||||
if (!topology->saved) {
|
||||
#else /* POSTK_DEBUG_TEMP_FIX_21 */
|
||||
if (IS_ERR(topology->saved)) {
|
||||
#endif /* POSTK_DEBUG_TEMP_FIX_21 */
|
||||
error = PTR_ERR(topology->saved);
|
||||
error = -ENOENT;
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"ihk_device_get_cpu_topology failed. %d\n",
|
||||
error);
|
||||
@ -428,6 +400,9 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
"get_cache_topology failed. %d\n",
|
||||
error);
|
||||
goto out;
|
||||
} else if (!cache) {
|
||||
error = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_add(&cache->chain, &topology->cache_list);
|
||||
@ -447,11 +422,7 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
int index;
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *topology;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *topology;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
dprintk("get_cpu_topology(%p)\n", udp);
|
||||
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
|
||||
@ -473,13 +444,8 @@ out:
|
||||
return error;
|
||||
} /* get_cpu_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu, struct cache_topology *cache)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu, struct cache_topology *cache)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
@ -531,13 +497,8 @@ static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
return;
|
||||
} /* setup_cpu_sysfs_cache_files() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
@ -586,11 +547,7 @@ static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
error = get_cpu_topology(udp);
|
||||
if (error) {
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
# LESS/GREATER_EQUAL appears somewhere in 3.7... meh compat until we stop caring about 2.x
|
||||
# ...apparently can't define macros ot use inside if, so unfold manually
|
||||
|
||||
if(NOT (LINUX_VERSION_CODE LESS 262144) AND NOT (LINUX_VERSION_CODE GREATER 262400))
|
||||
add_subdirectory("linux-4.0.9")
|
||||
elseif(NOT (LINUX_VERSION_CODE LESS 263680) AND NOT (LINUX_VERSION_CODE GREATER 263936))
|
||||
add_subdirectory("linux-4.6.7")
|
||||
elseif(LINUX_VERSION_CODE EQUAL 199168)
|
||||
add_subdirectory("linux-3.10.0-327.36.1.el7")
|
||||
else()
|
||||
#add_subdirectory("linux-3.10.0-327.36.1.el7")
|
||||
add_subdirectory("linux-4.18.14")
|
||||
#message(FATAL_ERROR "mcoverlayfs enabled but kernel version not compatible")
|
||||
endif()
|
||||
@ -1,7 +0,0 @@
|
||||
kmod(mcoverlay
|
||||
SOURCES
|
||||
copy_up.c dir.c inode.c readdir.c super.c
|
||||
INSTALL_DEST
|
||||
${KMODDIR}
|
||||
)
|
||||
|
||||
@ -1,461 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
static unsigned ovl_check_copy_up = 1;
|
||||
module_param_named(check_copy_up, ovl_check_copy_up, uint,
|
||||
S_IWUSR | S_IRUGO);
|
||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
||||
"Warn on copy-up when causing process also has a R/O fd open");
|
||||
|
||||
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
|
||||
{
|
||||
const struct dentry *dentry = data;
|
||||
|
||||
if (f->f_path.dentry == dentry)
|
||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
||||
f, fd, current->pid, current->comm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the fds open by this process and warn if something like the following
|
||||
* scenario is about to occur:
|
||||
*
|
||||
* fd1 = open("foo", O_RDONLY);
|
||||
* fd2 = open("foo", O_RDWR);
|
||||
*/
|
||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
||||
{
|
||||
if (ovl_check_copy_up)
|
||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
||||
}
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
||||
{
|
||||
ssize_t list_size, size, value_size = 0;
|
||||
char *buf, *name, *value = NULL;
|
||||
int uninitialized_var(error);
|
||||
|
||||
if (!old->d_inode->i_op->getxattr ||
|
||||
!new->d_inode->i_op->getxattr)
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
||||
retry:
|
||||
size = vfs_getxattr(old, name, value, value_size);
|
||||
if (size == -ERANGE)
|
||||
size = vfs_getxattr(old, name, NULL, 0);
|
||||
|
||||
if (size < 0) {
|
||||
error = size;
|
||||
break;
|
||||
}
|
||||
|
||||
if (size > value_size) {
|
||||
void *new;
|
||||
|
||||
new = krealloc(value, size, GFP_KERNEL);
|
||||
if (!new) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
value = new;
|
||||
value_size = size;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
|
||||
{
|
||||
struct file *old_file;
|
||||
struct file *new_file;
|
||||
loff_t old_pos = 0;
|
||||
loff_t new_pos = 0;
|
||||
int error = 0;
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
old_file = ovl_path_open(old, O_RDONLY);
|
||||
if (IS_ERR(old_file))
|
||||
return PTR_ERR(old_file);
|
||||
|
||||
new_file = ovl_path_open(new, O_WRONLY);
|
||||
if (IS_ERR(new_file)) {
|
||||
error = PTR_ERR(new_file);
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
/* FIXME: copy up sparse files efficiently */
|
||||
while (len) {
|
||||
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
|
||||
long bytes;
|
||||
|
||||
if (len < this_len)
|
||||
this_len = len;
|
||||
|
||||
if (signal_pending_state(TASK_KILLABLE, current)) {
|
||||
error = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
bytes = do_splice_direct(old_file, &old_pos,
|
||||
new_file, &new_pos,
|
||||
this_len, SPLICE_F_MOVE);
|
||||
if (bytes <= 0) {
|
||||
error = bytes;
|
||||
break;
|
||||
}
|
||||
WARN_ON(old_pos != new_pos);
|
||||
|
||||
len -= bytes;
|
||||
}
|
||||
|
||||
fput(new_file);
|
||||
out_fput:
|
||||
fput(old_file);
|
||||
return error;
|
||||
}
|
||||
|
||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
||||
{
|
||||
int res;
|
||||
char *buf;
|
||||
struct inode *inode = realdentry->d_inode;
|
||||
mm_segment_t old_fs;
|
||||
|
||||
res = -EINVAL;
|
||||
if (!inode->i_op->readlink)
|
||||
goto err;
|
||||
|
||||
res = -ENOMEM;
|
||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto err;
|
||||
|
||||
old_fs = get_fs();
|
||||
set_fs(get_ds());
|
||||
/* The cast to a user pointer is valid due to the set_fs() */
|
||||
res = inode->i_op->readlink(realdentry,
|
||||
(char __user *)buf, PAGE_SIZE - 1);
|
||||
set_fs(old_fs);
|
||||
if (res < 0) {
|
||||
free_page((unsigned long) buf);
|
||||
goto err;
|
||||
}
|
||||
buf[res] = '\0';
|
||||
|
||||
return buf;
|
||||
|
||||
err:
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
||||
struct dentry *dentry, struct path *lowerpath,
|
||||
struct kstat *stat, struct iattr *attr,
|
||||
const char *link)
|
||||
{
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *upper = NULL;
|
||||
umode_t mode = stat->mode;
|
||||
int err;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out1;
|
||||
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
stat->mode &= S_IFMT;
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
||||
stat->mode = mode;
|
||||
if (err)
|
||||
goto out2;
|
||||
|
||||
if (S_ISREG(stat->mode)) {
|
||||
struct path upperpath;
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = newdentry;
|
||||
|
||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&newdentry->d_inode->i_mutex);
|
||||
err = ovl_set_attr(newdentry, stat);
|
||||
if (!err && attr)
|
||||
err = notify_change(newdentry, attr, NULL);
|
||||
mutex_unlock(&newdentry->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
newdentry = NULL;
|
||||
|
||||
/*
|
||||
* Non-directores become opaque when copied up.
|
||||
*/
|
||||
if (!S_ISDIR(stat->mode))
|
||||
ovl_dentry_set_opaque(dentry, true);
|
||||
out2:
|
||||
dput(upper);
|
||||
out1:
|
||||
dput(newdentry);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* Directory renames only allowed on "pure upper" (already created on
|
||||
* upper filesystem, never copied up). Directories which are on lower or
|
||||
* are merged may not be renamed. For these -EXDEV is returned and
|
||||
* userspace has to deal with it. This means, when copying up a
|
||||
* directory we can rely on it and ancestors being stable.
|
||||
*
|
||||
* Non-directory renames start with copy up of source if necessary. The
|
||||
* actual rename will only proceed once the copy up was successful. Copy
|
||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
||||
* d_parent it is possible that the copy up will lock the old parent. At
|
||||
* that point the file will have already been copied up anyway.
|
||||
*/
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
int err;
|
||||
struct kstat pstat;
|
||||
struct path parentpath;
|
||||
struct dentry *upperdir;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
char *link = NULL;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_do_check_copy_up(lowerpath->dentry);
|
||||
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
upperdir = parentpath.dentry;
|
||||
|
||||
err = vfs_getattr(&parentpath, &pstat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (S_ISLNK(stat->mode)) {
|
||||
link = ovl_read_symlink(lowerpath->dentry);
|
||||
if (IS_ERR(link))
|
||||
return PTR_ERR(link);
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_free_link;
|
||||
|
||||
override_cred->fsuid = stat->uid;
|
||||
override_cred->fsgid = stat->gid;
|
||||
/*
|
||||
* CAP_SYS_ADMIN for copying up extended attributes
|
||||
* CAP_DAC_OVERRIDE for create
|
||||
* CAP_FOWNER for chmod, timestamp update
|
||||
* CAP_FSETID for chmod
|
||||
* CAP_CHOWN for chown
|
||||
* CAP_MKNOD for mknod
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = -EIO;
|
||||
if (lock_rename(workdir, upperdir) != NULL) {
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
goto out_unlock;
|
||||
}
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
unlock_rename(workdir, upperdir);
|
||||
err = 0;
|
||||
/* Raced with another copy-up? Do the setattr here */
|
||||
if (attr) {
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
goto out_put_cred;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
||||
stat, attr, link);
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &pstat);
|
||||
}
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_put_cred:
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
out_free_link:
|
||||
if (link)
|
||||
free_page((unsigned long) link);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = 0;
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
type = ovl_path_type(parent);
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
ovl_path_lower(next, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (!err)
|
||||
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -1,972 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
dget(wdentry);
|
||||
if (S_ISDIR(wdentry->d_inode->i_mode))
|
||||
err = ovl_do_rmdir(wdir, wdentry);
|
||||
else
|
||||
err = ovl_do_unlink(wdir, wdentry);
|
||||
dput(wdentry);
|
||||
|
||||
if (err) {
|
||||
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
|
||||
wdentry, err);
|
||||
}
|
||||
}
|
||||
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
|
||||
{
|
||||
struct dentry *temp;
|
||||
char name[20];
|
||||
|
||||
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
|
||||
|
||||
temp = lookup_one_len(name, workdir, strlen(name));
|
||||
if (!IS_ERR(temp) && temp->d_inode) {
|
||||
pr_err("overlayfs: workdir/%s already exists\n", name);
|
||||
dput(temp);
|
||||
temp = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/* caller holds i_mutex on workdir */
|
||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *whiteout;
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
|
||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
||||
if (IS_ERR(whiteout))
|
||||
return whiteout;
|
||||
|
||||
err = ovl_do_whiteout(wdir, whiteout);
|
||||
if (err) {
|
||||
dput(whiteout);
|
||||
whiteout = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return whiteout;
|
||||
}
|
||||
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (newdentry->d_inode)
|
||||
return -ESTALE;
|
||||
|
||||
if (hardlink) {
|
||||
err = ovl_do_link(hardlink, dir, newdentry, debug);
|
||||
} else {
|
||||
switch (stat->mode & S_IFMT) {
|
||||
case S_IFREG:
|
||||
err = ovl_do_create(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFDIR:
|
||||
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFCHR:
|
||||
case S_IFBLK:
|
||||
case S_IFIFO:
|
||||
case S_IFSOCK:
|
||||
err = ovl_do_mknod(dir, newdentry,
|
||||
stat->mode, stat->rdev, debug);
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
err = ovl_do_symlink(dir, newdentry, link, debug);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = -EPERM;
|
||||
}
|
||||
}
|
||||
if (!err && WARN_ON(!newdentry->d_inode)) {
|
||||
/*
|
||||
* Not quite sure if non-instantiated dentry is legal or not.
|
||||
* VFS doesn't seem to care so check and warn here.
|
||||
*/
|
||||
err = -ENOENT;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_set_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
|
||||
}
|
||||
|
||||
static void ovl_remove_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
|
||||
if (err) {
|
||||
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
|
||||
upperdentry->d_name.name, err);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, stat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
|
||||
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&udir->i_mutex);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_lock_rename_workdir(struct dentry *workdir,
|
||||
struct dentry *upperdir)
|
||||
{
|
||||
/* Workdir should not be the same as upperdir */
|
||||
if (workdir == upperdir)
|
||||
goto err;
|
||||
|
||||
/* Workdir should not be subdir of upperdir and vice versa */
|
||||
if (lock_rename(workdir, upperdir) != NULL)
|
||||
goto err_unlock;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
err:
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_clear_empty(struct dentry *dentry,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct path upperpath;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir;
|
||||
struct kstat stat;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return ERR_PTR(-EROFS);
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
err = vfs_getattr(&upperpath, &stat);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (!S_ISDIR(stat.mode))
|
||||
goto out_unlock;
|
||||
upper = upperpath.dentry;
|
||||
if (upper->d_parent->d_inode != udir)
|
||||
goto out_unlock;
|
||||
|
||||
opaquedir = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out_unlock;
|
||||
|
||||
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_copy_xattr(upper, opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_set_opaque(opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&opaquedir->d_inode->i_mutex);
|
||||
err = ovl_set_attr(opaquedir, &stat);
|
||||
mutex_unlock(&opaquedir->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup_whiteouts(upper, list);
|
||||
ovl_cleanup(wdir, upper);
|
||||
unlock_rename(workdir, upperdir);
|
||||
|
||||
/* dentry's upper doesn't match now, get rid of it */
|
||||
d_drop(dentry);
|
||||
|
||||
return opaquedir;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, opaquedir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *ret = NULL;
|
||||
LIST_HEAD(list);
|
||||
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
if (err)
|
||||
ret = ERR_PTR(err);
|
||||
else {
|
||||
/*
|
||||
* If no upperdentry then skip clearing whiteouts.
|
||||
*
|
||||
* Can race with copy-up, since we don't hold the upperdir
|
||||
* mutex. Doesn't matter, since copy-up can't create a
|
||||
* non-empty directory from an empty one.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry))
|
||||
ret = ovl_clear_empty(dentry, &list);
|
||||
}
|
||||
|
||||
ovl_cache_free(&list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
||||
if (err)
|
||||
goto out_dput2;
|
||||
|
||||
if (S_ISDIR(stat->mode)) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
||||
RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup(wdir, upper);
|
||||
} else {
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput2:
|
||||
dput(upper);
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out_dput2;
|
||||
}
|
||||
|
||||
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link, struct dentry *hardlink)
|
||||
{
|
||||
int err;
|
||||
struct inode *inode;
|
||||
struct kstat stat = {
|
||||
.mode = mode,
|
||||
.rdev = rdev,
|
||||
};
|
||||
|
||||
err = -ENOMEM;
|
||||
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
|
||||
if (!inode)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_iput;
|
||||
|
||||
if (!ovl_dentry_is_opaque(dentry)) {
|
||||
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_iput;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting opaque xattr
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
|
||||
hardlink);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
|
||||
if (!err)
|
||||
inode = NULL;
|
||||
out_iput:
|
||||
iput(inode);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
bool excl)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
/* Don't allow creation of "whiteout" on overlay */
|
||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
||||
return -EPERM;
|
||||
|
||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
||||
}
|
||||
|
||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *link)
|
||||
{
|
||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
||||
}
|
||||
|
||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
||||
struct dentry *new)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upper = ovl_dentry_upper(old);
|
||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *whiteout;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir = NULL;
|
||||
int err;
|
||||
int flags = 0;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
if (is_dir) {
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
|
||||
opaquedir = ovl_check_empty_and_clear(dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out;
|
||||
} else {
|
||||
LIST_HEAD(list);
|
||||
|
||||
/*
|
||||
* When removing an empty opaque directory, then it
|
||||
* makes no sense to replace it with an exact replica of
|
||||
* itself. But emptiness still needs to be checked.
|
||||
*/
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
ovl_cache_free(&list);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if ((opaquedir && upper != opaquedir) ||
|
||||
(!opaquedir && ovl_dentry_upper(dentry) &&
|
||||
upper != ovl_dentry_upper(dentry))) {
|
||||
goto out_dput_upper;
|
||||
}
|
||||
|
||||
whiteout = ovl_whiteout(workdir, dentry);
|
||||
err = PTR_ERR(whiteout);
|
||||
if (IS_ERR(whiteout))
|
||||
goto out_dput_upper;
|
||||
|
||||
if (d_is_dir(upper))
|
||||
flags = RENAME_EXCHANGE;
|
||||
|
||||
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
|
||||
if (err)
|
||||
goto kill_whiteout;
|
||||
if (flags)
|
||||
ovl_cleanup(wdir, upper);
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
out_d_drop:
|
||||
d_drop(dentry);
|
||||
dput(whiteout);
|
||||
out_dput_upper:
|
||||
dput(upper);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
kill_whiteout:
|
||||
ovl_cleanup(wdir, whiteout);
|
||||
goto out_d_drop;
|
||||
}
|
||||
|
||||
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *dir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (upper == ovl_dentry_upper(dentry)) {
|
||||
if (is_dir)
|
||||
err = vfs_rmdir(dir, upper);
|
||||
else
|
||||
err = vfs_unlink(dir, upper, NULL);
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
}
|
||||
dput(upper);
|
||||
|
||||
/*
|
||||
* Keeping this dentry hashed would mean having to release
|
||||
* upperpath/lowerpath, which could only be done if we are the
|
||||
* sole user of this dentry. Too tricky... Just unhash for
|
||||
* now.
|
||||
*/
|
||||
if (!err)
|
||||
d_drop(dentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&dir->i_mutex);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
||||
{
|
||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
||||
|
||||
if (check_sticky(dir, inode))
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
enum ovl_path_type type;
|
||||
int err;
|
||||
|
||||
err = ovl_check_sticky(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
type = ovl_path_type(dentry);
|
||||
if (OVL_TYPE_PURE_UPPER(type)) {
|
||||
err = ovl_remove_upper(dentry, is_dir);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_remove_and_whiteout(dentry, is_dir);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, false);
|
||||
}
|
||||
|
||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, true);
|
||||
}
|
||||
|
||||
static int ovl_rename2(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type old_type;
|
||||
enum ovl_path_type new_type;
|
||||
struct dentry *old_upperdir;
|
||||
struct dentry *new_upperdir;
|
||||
struct dentry *olddentry;
|
||||
struct dentry *newdentry;
|
||||
struct dentry *trap;
|
||||
bool old_opaque;
|
||||
bool new_opaque;
|
||||
bool new_create = false;
|
||||
bool cleanup_whiteout = false;
|
||||
bool overwrite = !(flags & RENAME_EXCHANGE);
|
||||
bool is_dir = S_ISDIR(old->d_inode->i_mode);
|
||||
bool new_is_dir = false;
|
||||
struct dentry *opaquedir = NULL;
|
||||
const struct cred *old_cred = NULL;
|
||||
struct cred *override_cred = NULL;
|
||||
|
||||
err = -EINVAL;
|
||||
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
|
||||
goto out;
|
||||
|
||||
flags &= ~RENAME_NOREPLACE;
|
||||
|
||||
err = ovl_check_sticky(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* Don't copy up directory trees */
|
||||
old_type = ovl_path_type(old);
|
||||
err = -EXDEV;
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
|
||||
goto out;
|
||||
|
||||
if (new->d_inode) {
|
||||
err = ovl_check_sticky(new);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (S_ISDIR(new->d_inode->i_mode))
|
||||
new_is_dir = true;
|
||||
|
||||
new_type = ovl_path_type(new);
|
||||
err = -EXDEV;
|
||||
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
|
||||
goto out;
|
||||
|
||||
err = 0;
|
||||
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_lower(old)->d_inode ==
|
||||
ovl_dentry_lower(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_upper(old)->d_inode ==
|
||||
ovl_dentry_upper(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (ovl_dentry_is_opaque(new))
|
||||
new_type = __OVL_PATH_UPPER;
|
||||
else
|
||||
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
|
||||
}
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(new->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
if (!overwrite) {
|
||||
err = ovl_copy_up(new);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
|
||||
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
}
|
||||
|
||||
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
|
||||
opaquedir = ovl_check_empty_and_clear(new);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir)) {
|
||||
opaquedir = NULL;
|
||||
goto out_revert_creds;
|
||||
}
|
||||
}
|
||||
|
||||
if (overwrite) {
|
||||
if (old_opaque) {
|
||||
if (new->d_inode || !new_opaque) {
|
||||
/* Whiteout source */
|
||||
flags |= RENAME_WHITEOUT;
|
||||
} else {
|
||||
/* Switch whiteouts */
|
||||
flags |= RENAME_EXCHANGE;
|
||||
}
|
||||
} else if (is_dir && !new->d_inode && new_opaque) {
|
||||
flags |= RENAME_EXCHANGE;
|
||||
cleanup_whiteout = true;
|
||||
}
|
||||
}
|
||||
|
||||
old_upperdir = ovl_dentry_upper(old->d_parent);
|
||||
new_upperdir = ovl_dentry_upper(new->d_parent);
|
||||
|
||||
trap = lock_rename(new_upperdir, old_upperdir);
|
||||
|
||||
|
||||
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
|
||||
old->d_name.len);
|
||||
err = PTR_ERR(olddentry);
|
||||
if (IS_ERR(olddentry))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (olddentry != ovl_dentry_upper(old))
|
||||
goto out_dput_old;
|
||||
|
||||
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
|
||||
new->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_dput_old;
|
||||
|
||||
err = -ESTALE;
|
||||
if (ovl_dentry_upper(new)) {
|
||||
if (opaquedir) {
|
||||
if (newdentry != opaquedir)
|
||||
goto out_dput;
|
||||
} else {
|
||||
if (newdentry != ovl_dentry_upper(new))
|
||||
goto out_dput;
|
||||
}
|
||||
} else {
|
||||
new_create = true;
|
||||
if (!d_is_negative(newdentry) &&
|
||||
(!new_opaque || !ovl_is_whiteout(newdentry)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (olddentry == trap)
|
||||
goto out_dput;
|
||||
if (newdentry == trap)
|
||||
goto out_dput;
|
||||
|
||||
if (is_dir && !old_opaque && new_opaque) {
|
||||
err = ovl_set_opaque(olddentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
flags);
|
||||
} else {
|
||||
/* No debug for the plain case */
|
||||
BUG_ON(flags & ~RENAME_EXCHANGE);
|
||||
err = vfs_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
NULL, flags);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
if (is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
|
||||
if (old_opaque != new_opaque) {
|
||||
ovl_dentry_set_opaque(old, new_opaque);
|
||||
if (!overwrite)
|
||||
ovl_dentry_set_opaque(new, old_opaque);
|
||||
}
|
||||
|
||||
if (cleanup_whiteout)
|
||||
ovl_cleanup(old_upperdir->d_inode, newdentry);
|
||||
|
||||
ovl_dentry_version_inc(old->d_parent);
|
||||
ovl_dentry_version_inc(new->d_parent);
|
||||
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_dput_old:
|
||||
dput(olddentry);
|
||||
out_unlock:
|
||||
unlock_rename(new_upperdir, old_upperdir);
|
||||
out_revert_creds:
|
||||
if (old_opaque || new_opaque) {
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
dput(opaquedir);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_rename(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new)
|
||||
{
|
||||
return ovl_rename2(olddir, old, newdir, new, 0);
|
||||
}
|
||||
|
||||
const struct inode_operations_wrapper ovl_dir_inode_operations = {
|
||||
.ops = {
|
||||
.lookup = ovl_lookup,
|
||||
.mkdir = ovl_mkdir,
|
||||
.symlink = ovl_symlink,
|
||||
.unlink = ovl_unlink,
|
||||
.rmdir = ovl_rmdir,
|
||||
.rename = ovl_rename,
|
||||
.link = ovl_link,
|
||||
.setattr = ovl_setattr,
|
||||
.create = ovl_create,
|
||||
.mknod = ovl_mknod,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_dir_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
},
|
||||
.rename2 = ovl_rename2,
|
||||
};
|
||||
@ -1,442 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
|
||||
bool no_data)
|
||||
{
|
||||
int err;
|
||||
struct dentry *parent;
|
||||
struct kstat stat;
|
||||
struct path lowerpath;
|
||||
|
||||
parent = dget_parent(dentry);
|
||||
err = ovl_copy_up(parent);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
ovl_path_lower(dentry, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
if (no_data)
|
||||
stat.size = 0;
|
||||
|
||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
|
||||
|
||||
out_dput_parent:
|
||||
dput(parent);
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct path realpath;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
return vfs_getattr(&realpath, stat);
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct ovl_entry *oe;
|
||||
struct dentry *alias = NULL;
|
||||
struct inode *realinode;
|
||||
struct dentry *realdentry;
|
||||
bool is_upper;
|
||||
int err;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
oe = inode->i_private;
|
||||
} else if (mask & MAY_NOT_BLOCK) {
|
||||
return -ECHILD;
|
||||
} else {
|
||||
/*
|
||||
* For non-directories find an alias and get the info
|
||||
* from there.
|
||||
*/
|
||||
alias = d_find_any_alias(inode);
|
||||
if (WARN_ON(!alias))
|
||||
return -ENOENT;
|
||||
|
||||
oe = alias->d_fsdata;
|
||||
}
|
||||
|
||||
realdentry = ovl_entry_real(oe, &is_upper);
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
realinode = ACCESS_ONCE(realdentry->d_inode);
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
err = -ENOENT;
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (mask & MAY_WRITE) {
|
||||
umode_t mode = realinode->i_mode;
|
||||
|
||||
/*
|
||||
* Writes will always be redirected to upper layer, so
|
||||
* ignore lower layer being read-only.
|
||||
*
|
||||
* If the overlay itself is read-only then proceed
|
||||
* with the permission check, don't return EROFS.
|
||||
* This will only happen if this is the lower layer of
|
||||
* another overlayfs.
|
||||
*
|
||||
* If upper fs becomes read-only after the overlay was
|
||||
* constructed return EROFS to prevent modification of
|
||||
* upper layer.
|
||||
*/
|
||||
err = -EROFS;
|
||||
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
|
||||
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
err = __inode_permission(realinode, mask);
|
||||
out_dput:
|
||||
dput(alias);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
struct ovl_link_data {
|
||||
struct dentry *realdentry;
|
||||
void *cookie;
|
||||
};
|
||||
|
||||
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
void *ret;
|
||||
struct dentry *realdentry;
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = NULL;
|
||||
|
||||
realdentry = ovl_dentry_real(dentry);
|
||||
realinode = realdentry->d_inode;
|
||||
|
||||
if (WARN_ON(!realinode->i_op->follow_link))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
if (realinode->i_op->put_link) {
|
||||
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
|
||||
if (!data)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
data->realdentry = realdentry;
|
||||
}
|
||||
|
||||
ret = realinode->i_op->follow_link(realdentry, nd);
|
||||
if (IS_ERR(ret)) {
|
||||
kfree(data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (data)
|
||||
data->cookie = ret;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
|
||||
{
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = c;
|
||||
|
||||
if (!data)
|
||||
return;
|
||||
|
||||
realinode = data->realdentry->d_inode;
|
||||
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
|
||||
kfree(data);
|
||||
}

static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path realpath;
	struct inode *realinode;

	ovl_path_real(dentry, &realpath);
	realinode = realpath.dentry->d_inode;

	if (!realinode->i_op->readlink)
		return -EINVAL;

	touch_atime(&realpath);

	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}


static bool ovl_is_private_xattr(const char *name)
{
	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}

int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -EPERM;
	if (ovl_is_private_xattr(name))
		goto out_drop_write;

	err = ovl_copy_up(dentry);
	if (err)
		goto out_drop_write;

	upperdentry = ovl_dentry_upper(dentry);
	err = vfs_setxattr(upperdentry, name, value, size, flags);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
		return S_ISDIR(dentry->d_inode->i_mode);
	else
		return false;
}

ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}

ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
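
/*
 * The filtering above hides the "trusted.overlay." namespace from userspace
 * on upper directories that are not pure upper (i.e. copied-up or merged
 * directories): a raw upper list such as { "user.foo",
 * "trusted.overlay.opaque" } is compacted in place and returned as just
 * { "user.foo" }.
 */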

int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (!OVL_TYPE_UPPER(type)) {
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	if (OVL_TYPE_UPPER(type))
		return false;

	if (special_file(realdentry->d_inode->i_mode))
		return false;

	if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
		return false;

	return true;
}

static int ovl_dentry_open(struct dentry *dentry, struct file *file,
			   const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (!ovl_is_nocopyupw(dentry)) {
		if (ovl_open_need_copy_up(file->f_flags, type,
					  realpath.dentry)) {
			want_write = true;
			err = ovl_want_write(dentry);
			if (err)
				goto out;

			if (file->f_flags & O_TRUNC)
				err = ovl_copy_up_last(dentry, NULL, true);
			else
				err = ovl_copy_up(dentry);
			if (err)
				goto out_drop_write;

			ovl_path_upper(dentry, &realpath);
		}
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}
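
/*
 * Copy-up on open happens only when the file may be modified: a lower file
 * opened for write (or with O_TRUNC) is first copied to the upper layer,
 * unless the dentry is marked nocopyupw. For O_TRUNC the
 * ovl_copy_up_last(dentry, NULL, true) variant is used, presumably so the
 * data copy can be skipped for contents that are about to be discarded.
 */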

static const struct inode_operations_wrapper ovl_file_inode_operations = {
	.ops = {
		.setattr = ovl_setattr,
		.permission = ovl_permission,
		.getattr = ovl_getattr,
		.setxattr = ovl_setxattr,
		.getxattr = ovl_getxattr,
		.listxattr = ovl_listxattr,
		.removexattr = ovl_removexattr,
	},
	.dentry_open = ovl_dentry_open,
};

static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr = ovl_setattr,
	.follow_link = ovl_follow_link,
	.put_link = ovl_put_link,
	.readlink = ovl_readlink,
	.getattr = ovl_getattr,
	.setxattr = ovl_setxattr,
	.getxattr = ovl_getxattr,
	.listxattr = ovl_listxattr,
	.removexattr = ovl_removexattr,
};

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations.ops;
		inode->i_fop = &ovl_dir_operations;
		inode->i_flags |= S_IOPS_WRAPPER;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations.ops;
		inode->i_flags |= S_IOPS_WRAPPER;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
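
/*
 * inode_operations_wrapper extends inode_operations with extra ops such as
 * .dentry_open; the S_IOPS_WRAPPER inode flag apparently tells a VFS that
 * supports the wrapper that i_op really points at the larger structure.
 * Symlinks use a plain inode_operations, so the flag is not set for them.
 */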

@@ -1,200 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/kernel.h>

struct ovl_entry;

enum ovl_path_type {
	__OVL_PATH_PURE = (1 << 0),
	__OVL_PATH_UPPER = (1 << 1),
	__OVL_PATH_MERGE = (1 << 2),
};

#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))

#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN  16
#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"
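/* OVL_XATTR_PRE_LEN is strlen(OVL_XATTR_PRE_NAME): "trusted.overlay." is 16 bytes */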

static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, err);
	}
	return err;
}

static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int err = vfs_create(dir, dentry, mode, true);
	if (debug)
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int err = vfs_mkdir(dir, dentry, mode);
	if (debug)
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int err = vfs_mknod(dir, dentry, mode, dev);
	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, err);
	}
	return err;
}

static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int err = vfs_symlink(dir, dentry, oldname);
	if (debug)
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}

static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}

static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}

static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);

	if (err) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}

static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}

bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);

struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
				struct kstat *stat, const char *link);

/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);

/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}

/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);

/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

@@ -1,626 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/version.h>
#include "overlayfs.h"

struct ovl_cache_entry {
	unsigned int len;
	unsigned int type;
	u64 ino;
	struct list_head l_node;
	struct rb_node node;
	struct ovl_cache_entry *next_maybe_whiteout;
	bool is_whiteout;
	char name[];
};

struct ovl_dir_cache {
	long refcount;
	u64 version;
	struct list_head entries;
};

/* vfs_readdir vs. iterate_dir compat */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0) || \
	(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5))
#define USE_ITERATE_DIR 1
#endif

#ifndef USE_ITERATE_DIR
struct dir_context {
	const filldir_t actor;
	//loff_t pos;
};
#endif

struct ovl_readdir_data {
	struct dir_context ctx;
	bool is_merge;
	struct rb_root root;
	struct list_head *list;
	struct list_head middle;
	struct ovl_cache_entry *first_maybe_whiteout;
	int count;
	int err;
};

struct ovl_dir_file {
	bool is_real;
	bool is_upper;
	struct ovl_dir_cache *cache;
	struct list_head *cursor;
	struct file *realfile;
	struct file *upperfile;
};

static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}

static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}

static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->ino = ino;
	p->is_whiteout = false;

	if (d_type == DT_CHR) {
		p->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = p;
	}
	return p;
}
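
/*
 * Whiteouts are 0/0 character devices, so any DT_CHR entry seen during
 * readdir is only a candidate whiteout. Such entries are chained via
 * next_maybe_whiteout and verified afterwards by ovl_check_whiteouts()
 * with a real lookup.
 */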

static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}

static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}

void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *p;
	struct ovl_cache_entry *n;

	list_for_each_entry_safe(p, n, list, l_node)
		kfree(p);

	INIT_LIST_HEAD(list);
}

static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}

static int ovl_fill_merge(void *buf, const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct dir_context *ctx = buf;
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;
	if (!rdd->is_merge)
		return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
	else
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
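
/*
 * ovl_fill_merge() serves both passes of the directory merge: with
 * is_merge false (all layers but the last) every name is inserted into the
 * rb-tree; with is_merge true (the lowest layer) only names not already
 * shadowed by an upper entry are added, onto the temporary "middle" list.
 */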

static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
	int err;
	struct ovl_cache_entry *p;
	struct dentry *dentry;
	const struct cred *old_cred;
	struct cred *override_cred;

	override_cred = prepare_creds();
	if (!override_cred)
		return -ENOMEM;

	/*
	 * CAP_DAC_OVERRIDE for lookup
	 */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	err = mutex_lock_killable(&dir->d_inode->i_mutex);
	if (!err) {
		while (rdd->first_maybe_whiteout) {
			p = rdd->first_maybe_whiteout;
			rdd->first_maybe_whiteout = p->next_maybe_whiteout;
			dentry = lookup_one_len(p->name, dir, p->len);
			if (!IS_ERR(dentry)) {
				p->is_whiteout = ovl_is_whiteout(dentry);
				dput(dentry);
			}
		}
		mutex_unlock(&dir->d_inode->i_mutex);
	}
	revert_creds(old_cred);
	put_cred(override_cred);

	return err;
}

static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->first_maybe_whiteout = NULL;
	//rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
#ifdef USE_ITERATE_DIR
		err = iterate_dir(realfile, &rdd->ctx);
#else
		err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
#endif
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);

	if (!err && rdd->first_maybe_whiteout)
		err = ovl_check_whiteouts(realpath->dentry, rdd);

	fput(realfile);

	return err;
}

static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}
	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}

static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx, next;

	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath);

		if (next != -1) {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}
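
/*
 * ovl_dir_read() above keeps calling the directory iterator until a pass
 * produces no new entries (rdd->count == 0) or an error occurs, since a
 * single iterate_dir()/vfs_readdir() call need not report the whole
 * directory.
 */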

static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *p;
	loff_t off = 0;

	list_for_each(p, &od->cache->entries) {
		if (off >= pos)
			break;
		off++;
	}
	/* Cursor is safe since the cache is stable */
	od->cursor = p;
}

static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}

#ifdef USE_ITERATE_DIR
struct iterate_wrapper {
	struct dir_context ctx;
	filldir_t actor;
	void *buf;
};

static int ovl_wrap_readdir(void *ctx, const char *name, int namelen,
			    loff_t offset, u64 ino, unsigned int d_type)
{
	struct iterate_wrapper *w = ctx;

	return w->actor(w->buf, name, namelen, offset, ino, d_type);
}
#endif

static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;
	int res;

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
#ifdef USE_ITERATE_DIR
		struct iterate_wrapper w = {
			.ctx.actor = ovl_wrap_readdir,
			.actor = filler,
			.buf = buf,
		};
		res = iterate_dir(od->realfile, &w.ctx);
#else
		res = vfs_readdir(od->realfile, filler, buf);
#endif
		file->f_pos = od->realfile->f_pos;

		return res;
	}

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, file->f_pos);
	}

	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout)
			if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
				break;
		od->cursor = p->l_node.next;
		file->f_pos++;
	}
	return 0;
}
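
/*
 * For merged directories f_pos is simply an index into the cached entry
 * list (whiteout entries consume an offset even though they are never
 * returned), and ovl_seek_cursor() resolves it with a linear walk. Real
 * (non-merged) directories delegate position handling to the underlying
 * filesystem entirely.
 */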

static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}

static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
		struct inode *inode = file_inode(file);

		realfile = lockless_dereference(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			smp_mb__before_spinlock();
			mutex_lock(&inode->i_mutex);
			if (!od->upperfile) {
				if (IS_ERR(realfile)) {
					mutex_unlock(&inode->i_mutex);
					return PTR_ERR(realfile);
				}
				od->upperfile = realfile;
			} else {
				/* somebody has beaten us to it */
				if (!IS_ERR(realfile))
					fput(realfile);
				realfile = od->upperfile;
			}
			mutex_unlock(&inode->i_mutex);
		}
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}
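
/*
 * od->upperfile is published once under i_mutex and read with
 * lockless_dereference(); if a racing fsync already installed an upper
 * file, the newly opened one is dropped. This keeps fsync working on a
 * directory that was copied up after it had been opened.
 */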

static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}

static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = !OVL_TYPE_MERGE(type);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}

const struct file_operations ovl_dir_operations = {
	.read = generic_read_dir,
	.open = ovl_dir_open,
	.readdir = ovl_readdir,
	.llseek = ovl_dir_llseek,
	.fsync = ovl_dir_fsync,
	.release = ovl_dir_release,
};

int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct ovl_cache_entry *p;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	err = 0;

	list_for_each_entry(p, list, l_node) {
		if (p->is_whiteout)
			continue;

		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		}
		err = -ENOTEMPTY;
		break;
	}

	return err;
}

void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		ovl_cleanup(upper->d_inode, dentry);
		dput(dentry);
	}
	mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large

@@ -1,416 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"

#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)

int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size;
	char *buf, *name, *value;
	int error;

	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	error = -ENOMEM;
	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
	if (!value)
		goto out;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out_free_value;
	}

	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
		if (size <= 0) {
			error = size;
			goto out_free_value;
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			goto out_free_value;
	}

out_free_value:
	kfree(value);
out:
	kfree(buf);
	return error;
}

static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		if (bytes <= 0) {
			error = bytes;
			break;
		}
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}

	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}
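
/*
 * The data copy above proceeds in OVL_COPY_UP_CHUNK_SIZE (1 MiB) pieces
 * through do_splice_direct(), checking for fatal signals between chunks so
 * that a large copy-up can still be interrupted by SIGKILL.
 */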

static char *ovl_read_symlink(struct dentry *realdentry)
{
	int res;
	char *buf;
	struct inode *inode = realdentry->d_inode;
	mm_segment_t old_fs;

	res = -EINVAL;
	if (!inode->i_op->readlink)
		goto err;

	res = -ENOMEM;
	buf = (char *) __get_free_page(GFP_KERNEL);
	if (!buf)
		goto err;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = inode->i_op->readlink(realdentry,
				    (char __user *)buf, PAGE_SIZE - 1);
	set_fs(old_fs);
	if (res < 0) {
		free_page((unsigned long) buf);
		goto err;
	}
	buf[res] = '\0';

	return buf;

err:
	return ERR_PTR(res);
}

static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
	struct iattr attr = {
		.ia_valid =
		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
		.ia_atime = stat->atime,
		.ia_mtime = stat->mtime,
	};

	return notify_change(upperdentry, &attr, NULL);
}

int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
	int err = 0;

	if (!S_ISLNK(stat->mode)) {
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode = stat->mode,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err) {
		struct iattr attr = {
			.ia_valid = ATTR_UID | ATTR_GID,
			.ia_uid = stat->uid,
			.ia_gid = stat->gid,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err)
		ovl_set_timestamps(upperdentry, stat);

	return err;
}
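
/*
 * Attributes are restored in the order mode, owner, timestamps: the
 * earlier steps (and the preceding data copy) dirty the upper inode's
 * times, so atime/mtime are presumably restored last on purpose. The
 * return value of ovl_set_timestamps() is not propagated, making the
 * timestamp restore best effort.
 */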

static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;
		ovl_path_upper(dentry, &upperpath);
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out;
}
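
/*
 * The copy target is staged as a temporary file in the workdir (data,
 * xattrs and attributes are all set there) and only then renamed into the
 * upper directory, so other processes never observe a partially copied-up
 * file.
 */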

/*
 * Copy up a single dentry
 *
 * Directory renames are only allowed on "pure upper" directories (already
 * created on the upper filesystem, never copied up). Directories which are
 * on the lower layer or are merged may not be renamed. For these -EXDEV is
 * returned and userspace has to deal with it. This means that when copying
 * up a directory we can rely on it and its ancestors being stable.
 *
 * Non-directory renames start with a copy up of the source if necessary.
 * The actual rename will only proceed once the copy up has succeeded. Copy
 * up uses the upper parent's i_mutex for exclusion. Since rename can change
 * d_parent it is possible that the copy up will lock the old parent. At
 * that point the file will have already been copied up anyway.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr)
{
	struct dentry *workdir = ovl_workdir(dentry);
	int err;
	struct kstat pstat;
	struct path parentpath;
	struct dentry *upperdir;
	struct dentry *upperdentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	char *link = NULL;

	if (WARN_ON(!workdir))
		return -EROFS;

	ovl_path_upper(parent, &parentpath);
	upperdir = parentpath.dentry;

	err = vfs_getattr(&parentpath, &pstat);
	if (err)
		return err;

	if (S_ISLNK(stat->mode)) {
		link = ovl_read_symlink(lowerpath->dentry);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}

	err = -ENOMEM;
	override_cred = prepare_creds();
	if (!override_cred)
		goto out_free_link;

	override_cred->fsuid = stat->uid;
	override_cred->fsgid = stat->gid;
	/*
	 * CAP_SYS_ADMIN for copying up extended attributes
	 * CAP_DAC_OVERRIDE for create
	 * CAP_FOWNER for chmod, timestamp update
	 * CAP_FSETID for chmod
	 * CAP_CHOWN for chown
	 * CAP_MKNOD for mknod
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	cap_raise(override_cred->cap_effective, CAP_MKNOD);
	old_cred = override_creds(override_cred);

	err = -EIO;
	if (lock_rename(workdir, upperdir) != NULL) {
		pr_err("overlayfs: failed to lock workdir+upperdir\n");
		goto out_unlock;
	}
	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		unlock_rename(workdir, upperdir);
		err = 0;
		/* Raced with another copy-up? Do the setattr here */
		if (attr) {
			mutex_lock(&upperdentry->d_inode->i_mutex);
			err = notify_change(upperdentry, attr, NULL);
			mutex_unlock(&upperdentry->d_inode->i_mutex);
		}
		goto out_put_cred;
	}

	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
				 stat, attr, link);
	if (!err) {
		/* Restore timestamps on parent (best effort) */
		ovl_set_timestamps(upperdir, &pstat);
	}
out_unlock:
	unlock_rename(workdir, upperdir);
out_put_cred:
	revert_creds(old_cred);
	put_cred(override_cred);

out_free_link:
	if (link)
		free_page((unsigned long) link);

	return err;
}

int ovl_copy_up(struct dentry *dentry)
{
	int err;

	err = 0;
	while (!err) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		if (OVL_TYPE_UPPER(type))
			break;

		next = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			parent = dget_parent(next);

			type = ovl_path_type(parent);
			if (OVL_TYPE_UPPER(type))
				break;

			dput(next);
			next = parent;
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);

		dput(parent);
		dput(next);
	}

	return err;
}
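
/*
 * ovl_copy_up() works top down: each iteration walks up to the highest
 * ancestor that is not yet present on the upper layer and copies that one
 * dentry, so a parent directory is always copied up before anything inside
 * it.
 */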

@@ -1,951 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"

void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	dget(wdentry);
	if (d_is_dir(wdentry))
		err = ovl_do_rmdir(wdir, wdentry);
	else
		err = ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (err) {
		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
		       wdentry, err);
	}
}

struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	struct dentry *temp;
	char name[20];

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (!IS_ERR(temp) && temp->d_inode) {
		pr_err("overlayfs: workdir/%s already exists\n", name);
		dput(temp);
		temp = ERR_PTR(-EIO);
	}

	return temp;
}
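
/*
 * Workdir temporaries are named "#%lx" after the dentry pointer, which is
 * unique among operations in flight; finding a leftover entry with that
 * name means the workdir is in an unexpected state, reported as -EIO.
 */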

/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	int err;
	struct dentry *whiteout;
	struct inode *wdir = workdir->d_inode;

	whiteout = ovl_lookup_temp(workdir, dentry);
	if (IS_ERR(whiteout))
		return whiteout;

	err = ovl_do_whiteout(wdir, whiteout);
	if (err) {
		dput(whiteout);
		whiteout = ERR_PTR(err);
	}

	return whiteout;
}

int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		switch (stat->mode & S_IFMT) {
		case S_IFREG:
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
			break;

		case S_IFDIR:
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
			break;

		case S_IFCHR:
		case S_IFBLK:
		case S_IFIFO:
		case S_IFSOCK:
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
			break;

		case S_IFLNK:
			err = ovl_do_symlink(dir, newdentry, link, debug);
			break;

		default:
			err = -EPERM;
		}
	}
	if (!err && WARN_ON(!newdentry->d_inode)) {
		/*
		 * Not quite sure if non-instantiated dentry is legal or not.
		 * VFS doesn't seem to care so check and warn here.
		 */
		err = -ENOENT;
	}
	return err;
}

static int ovl_set_opaque(struct dentry *upperdentry)
{
	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}

static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
	if (err) {
		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
			upperdentry->d_name.name, err);
	}
}

static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
			   struct kstat *stat)
{
	int err;
	enum ovl_path_type type;
	struct path realpath;

	type = ovl_path_real(dentry, &realpath);
	err = vfs_getattr(&realpath, stat);
	if (err)
		return err;

	stat->dev = dentry->d_sb->s_dev;
	stat->ino = dentry->d_inode->i_ino;

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count. nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	return 0;
}

static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;
	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err)
		goto out_dput;

	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput:
	dput(newdentry);
out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}

static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir == upperdir)
		goto err;

	/* Workdir should not be subdir of upperdir and vice versa */
	if (lock_rename(workdir, upperdir) != NULL)
		goto err_unlock;

	return 0;

err_unlock:
	unlock_rename(workdir, upperdir);
err:
	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
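
/*
 * lock_rename() returns the common ancestor when one of its arguments is a
 * descendant of the other. A non-NULL result therefore means workdir and
 * upperdir are nested, a layout the overlay cannot operate on, hence the
 * -EIO above.
 */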

static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}

static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	int err;
	struct dentry *ret = NULL;
	LIST_HEAD(list);

	err = ovl_check_empty_dir(dentry, &list);
	if (err)
		ret = ERR_PTR(err);
	else {
		/*
		 * If no upperdentry then skip clearing whiteouts.
		 *
		 * Can race with copy-up, since we don't hold the upperdir
		 * mutex. Doesn't matter, since copy-up can't create a
		 * non-empty directory from an empty one.
		 */
		if (ovl_dentry_upper(dentry))
			ret = ovl_clear_empty(dentry, &list);
	}

	ovl_cache_free(&list);

	return ret;
}

static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}

static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	int err;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};

	err = -ENOMEM;
	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (!ovl_dentry_is_opaque(dentry)) {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_iput;

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	}

	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
out:
	return err;
}

static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err;

	err = ovl_want_write(dentry);
	if (!err) {
		err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
		ovl_drop_write(dentry);
	}

	return err;
}

static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}

static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}

static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}

static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	return ovl_create_object(dentry, S_IFLNK, 0, link);
}

static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	int err;
	struct dentry *upper;

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	upper = ovl_dentry_upper(old);
	err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);

out_drop_write:
	ovl_drop_write(old);
out:
	return err;
}

static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself. But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		if (is_dir)
			flags |= RENAME_EXCHANGE;

		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
|
||||
|
||||
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper = ovl_dentry_upper(dentry);
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	err = -ESTALE;
	if (upper->d_parent == upperdir) {
		/* Don't let d_delete() think it can reset d_inode */
		dget(upper);
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		dput(upper);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	d_drop(dentry);
	mutex_unlock(&dir->i_mutex);

	return err;
}

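/*
 * Mirror the VFS sticky-bit rule on the underlying layer: removal from
 * a sticky real directory is refused unless the caller owns the victim
 * or the directory (the usual check_sticky() rule).
 */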
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *inode = ovl_dentry_real(dentry)->d_inode;

	if (check_sticky(dir, inode))
		return -EPERM;

	return 0;
}

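/*
 * Common unlink/rmdir implementation: a pure upper entry can simply be
 * removed from upperdir, while anything with lower state must be
 * covered by a whiteout, which needs temporarily raised capabilities
 * (see the list below).
 */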
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		goto out;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (OVL_TYPE_PURE_UPPER(type)) {
		err = ovl_remove_upper(dentry, is_dir);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, false);
}

static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, true);
}

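/*
 * Rename is the most involved operation: both ends are copied up first,
 * renames that would require copying up a whole lower directory tree
 * are rejected with -EXDEV, and whiteouts are created, exchanged or
 * cleaned up as dictated by the opaqueness of source and destination.
 */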
static int ovl_rename2(struct inode *olddir, struct dentry *old,
		       struct inode *newdir, struct dentry *new,
		       unsigned int flags)
{
	int err;
	enum ovl_path_type old_type;
	enum ovl_path_type new_type;
	struct dentry *old_upperdir;
	struct dentry *new_upperdir;
	struct dentry *olddentry;
	struct dentry *newdentry;
	struct dentry *trap;
	bool old_opaque;
	bool new_opaque;
	bool new_create = false;
	bool cleanup_whiteout = false;
	bool overwrite = !(flags & RENAME_EXCHANGE);
	bool is_dir = d_is_dir(old);
	bool new_is_dir = false;
	struct dentry *opaquedir = NULL;
	const struct cred *old_cred = NULL;
	struct cred *override_cred = NULL;

	err = -EINVAL;
	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
		goto out;

	flags &= ~RENAME_NOREPLACE;

	err = ovl_check_sticky(old);
	if (err)
		goto out;

	/* Don't copy up directory trees */
	old_type = ovl_path_type(old);
	err = -EXDEV;
	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
		goto out;

	if (new->d_inode) {
		err = ovl_check_sticky(new);
		if (err)
			goto out;

		if (d_is_dir(new))
			new_is_dir = true;

		new_type = ovl_path_type(new);
		err = -EXDEV;
		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
			goto out;

		err = 0;
		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_lower(old)->d_inode ==
			    ovl_dentry_lower(new)->d_inode)
				goto out;
		}
		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_upper(old)->d_inode ==
			    ovl_dentry_upper(new)->d_inode)
				goto out;
		}
	} else {
		if (ovl_dentry_is_opaque(new))
			new_type = __OVL_PATH_UPPER;
		else
			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
	}

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
	if (!overwrite) {
		err = ovl_copy_up(new);
		if (err)
			goto out_drop_write;
	}

	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);

	if (old_opaque || new_opaque) {
		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);
	}

	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
		opaquedir = ovl_check_empty_and_clear(new);
		err = PTR_ERR(opaquedir);
		if (IS_ERR(opaquedir)) {
			opaquedir = NULL;
			goto out_revert_creds;
		}
	}

	if (overwrite) {
		if (old_opaque) {
			if (new->d_inode || !new_opaque) {
				/* Whiteout source */
				flags |= RENAME_WHITEOUT;
			} else {
				/* Switch whiteouts */
				flags |= RENAME_EXCHANGE;
			}
		} else if (is_dir && !new->d_inode && new_opaque) {
			flags |= RENAME_EXCHANGE;
			cleanup_whiteout = true;
		}
	}

	old_upperdir = ovl_dentry_upper(old->d_parent);
	new_upperdir = ovl_dentry_upper(new->d_parent);

	trap = lock_rename(new_upperdir, old_upperdir);

	olddentry = ovl_dentry_upper(old);
	newdentry = ovl_dentry_upper(new);
	if (newdentry) {
		if (opaquedir) {
			newdentry = opaquedir;
			opaquedir = NULL;
		} else {
			dget(newdentry);
		}
	} else {
		new_create = true;
		newdentry = lookup_one_len(new->d_name.name, new_upperdir,
					   new->d_name.len);
		err = PTR_ERR(newdentry);
		if (IS_ERR(newdentry))
			goto out_unlock;
	}

	err = -ESTALE;
	if (olddentry->d_parent != old_upperdir)
		goto out_dput;
	if (newdentry->d_parent != new_upperdir)
		goto out_dput;
	if (olddentry == trap)
		goto out_dput;
	if (newdentry == trap)
		goto out_dput;

	if (is_dir && !old_opaque && new_opaque) {
		err = ovl_set_opaque(olddentry);
		if (err)
			goto out_dput;
	}
	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_dput;
	}

	if (old_opaque || new_opaque) {
		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
				    new_upperdir->d_inode, newdentry,
				    flags);
	} else {
		/* No debug for the plain case */
		BUG_ON(flags & ~RENAME_EXCHANGE);
		err = vfs_rename(old_upperdir->d_inode, olddentry,
				 new_upperdir->d_inode, newdentry,
				 NULL, flags);
	}

	if (err) {
		if (is_dir && !old_opaque && new_opaque)
			ovl_remove_opaque(olddentry);
		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
			ovl_remove_opaque(newdentry);
		goto out_dput;
	}

	if (is_dir && old_opaque && !new_opaque)
		ovl_remove_opaque(olddentry);
	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
		ovl_remove_opaque(newdentry);

	if (old_opaque != new_opaque) {
		ovl_dentry_set_opaque(old, new_opaque);
		if (!overwrite)
			ovl_dentry_set_opaque(new, old_opaque);
	}

	if (cleanup_whiteout)
		ovl_cleanup(old_upperdir->d_inode, newdentry);

	ovl_dentry_version_inc(old->d_parent);
	ovl_dentry_version_inc(new->d_parent);

out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
	if (old_opaque || new_opaque) {
		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(old);
out:
	dput(opaquedir);
	return err;
}

const struct inode_operations ovl_dir_inode_operations = {
	.lookup		= ovl_lookup,
	.mkdir		= ovl_mkdir,
	.symlink	= ovl_symlink,
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,
	.link		= ovl_link,
	.setattr	= ovl_setattr,
	.create		= ovl_create,
	.mknod		= ovl_mknod,
	.permission	= ovl_permission,
	.getattr	= ovl_dir_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
@@ -1,438 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"

static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	int err;
	struct dentry *parent;
	struct kstat stat;
	struct path lowerpath;

	parent = dget_parent(dentry);
	err = ovl_copy_up(parent);
	if (err)
		goto out_dput_parent;

	ovl_path_lower(dentry, &lowerpath);
	err = vfs_getattr(&lowerpath, &stat);
	if (err)
		goto out_dput_parent;

	if (no_data)
		stat.size = 0;

	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);

out_dput_parent:
	dput(parent);
	return err;
}

int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		mutex_lock(&upperdentry->d_inode->i_mutex);
		err = notify_change(upperdentry, attr, NULL);
		mutex_unlock(&upperdentry->d_inode->i_mutex);
	} else {
		err = ovl_copy_up_last(dentry, attr, false);
	}
	ovl_drop_write(dentry);
out:
	return err;
}

static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
		       struct kstat *stat)
{
	struct path realpath;

	ovl_path_real(dentry, &realpath);
	return vfs_getattr(&realpath, stat);
}

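/*
 * Permission checking is forwarded to the real inode of whichever
 * layer currently backs the dentry.  In RCU-walk mode only directories
 * can be handled (their ovl_entry hangs off i_private); anything else
 * returns -ECHILD so the VFS retries in ref-walk mode.
 */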
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	dput(alias);
	return err;
}


struct ovl_link_data {
	struct dentry *realdentry;
	void *cookie;
};

static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	void *ret;
	struct dentry *realdentry;
	struct inode *realinode;

	realdentry = ovl_dentry_real(dentry);
	realinode = realdentry->d_inode;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	ret = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(ret))
		return ret;

	if (realinode->i_op->put_link) {
		struct ovl_link_data *data;

		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!data) {
			realinode->i_op->put_link(realdentry, nd, ret);
			return ERR_PTR(-ENOMEM);
		}
		data->realdentry = realdentry;
		data->cookie = ret;

		return data;
	} else {
		return NULL;
	}
}

static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct inode *realinode;
	struct ovl_link_data *data = c;

	if (!data)
		return;

	realinode = data->realdentry->d_inode;
	realinode->i_op->put_link(data->realdentry, nd, data->cookie);
	kfree(data);
}

static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path realpath;
	struct inode *realinode;

	ovl_path_real(dentry, &realpath);
	realinode = realpath.dentry->d_inode;

	if (!realinode->i_op->readlink)
		return -EINVAL;

	touch_atime(&realpath);

	return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}


static bool ovl_is_private_xattr(const char *name)
{
	return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}

int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -EPERM;
	if (ovl_is_private_xattr(name))
		goto out_drop_write;

	err = ovl_copy_up(dentry);
	if (err)
		goto out_drop_write;

	upperdentry = ovl_dentry_upper(dentry);
	err = vfs_setxattr(upperdentry, name, value, size, flags);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
		return S_ISDIR(dentry->d_inode->i_mode);
	else
		return false;
}

ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}

ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}

int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (!OVL_TYPE_UPPER(type)) {
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}

static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	if (OVL_TYPE_UPPER(type))
		return false;

	if (special_file(realdentry->d_inode->i_mode))
		return false;

	if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
		return false;

	return true;
}

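/*
 * Opening a lower file for write (or with O_TRUNC) triggers copy-up
 * first.  With O_TRUNC the file contents are not copied (no_data is
 * true), since they would be thrown away immediately anyway; only the
 * metadata is replicated.  Dentries marked "nocopyupw" skip the
 * copy-up entirely.
 */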
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
			   const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (!ovl_is_nocopyupw(dentry)) {
		if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
			want_write = true;
			err = ovl_want_write(dentry);
			if (err)
				goto out;

			if (file->f_flags & O_TRUNC)
				err = ovl_copy_up_last(dentry, NULL, true);
			else
				err = ovl_copy_up(dentry);
			if (err)
				goto out_drop_write;

			ovl_path_upper(dentry, &realpath);
		}
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}

static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
	.dentry_open	= ovl_dentry_open,
};

static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}
@@ -1,200 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/kernel.h>

struct ovl_entry;

enum ovl_path_type {
	__OVL_PATH_PURE		= (1 << 0),
	__OVL_PATH_UPPER	= (1 << 1),
	__OVL_PATH_MERGE	= (1 << 2),
};

#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
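
/*
 * Illustrative example (not part of the original header): a directory
 * that exists in the upper layer and is merged with a lower one has
 *
 *	type = __OVL_PATH_UPPER | __OVL_PATH_MERGE;
 *
 * so OVL_TYPE_UPPER(type) and OVL_TYPE_MERGE_OR_LOWER(type) are both
 * true, while OVL_TYPE_PURE_UPPER(type) is false.
 */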

#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN  16
#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"

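/*
 * The ovl_do_*() helpers below wrap the corresponding vfs_*() calls,
 * adding a pr_debug() trace of the operation and its result.
 */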
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, err);
	}
	return err;
}

static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int err = vfs_create(dir, dentry, mode, true);
	if (debug)
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int err = vfs_mkdir(dir, dentry, mode);
	if (debug)
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int err = vfs_mknod(dir, dentry, mode, dev);
	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, err);
	}
	return err;
}

static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int err = vfs_symlink(dir, dentry, oldname);
	if (debug)
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}

static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);
	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}

static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}

static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);

	if (err) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}

static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}

bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);

struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
				struct kstat *stat, const char *link);

/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);

/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}

/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);

/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
@@ -1,557 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"

struct ovl_cache_entry {
	unsigned int len;
	unsigned int type;
	u64 ino;
	struct list_head l_node;
	struct rb_node node;
	bool is_whiteout;
	char name[];
};

struct ovl_dir_cache {
	long refcount;
	u64 version;
	struct list_head entries;
};

struct ovl_readdir_data {
	struct dir_context ctx;
	bool is_merge;
	struct rb_root root;
	struct list_head *list;
	struct list_head middle;
	struct dentry *dir;
	int count;
	int err;
};

struct ovl_dir_file {
	bool is_real;
	bool is_upper;
	struct ovl_dir_cache *cache;
	struct list_head *cursor;
	struct file *realfile;
	struct file *upperfile;
};

static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}

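/*
 * Look up a name in the rbtree of cached entries.  Names are compared
 * as a (prefix, length) pair: strncmp() over the search length first,
 * then the entry length breaks ties, so "foo" and "foobar" land in
 * distinct slots.
 */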
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}

static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->ino = ino;
	p->is_whiteout = false;

	if (d_type == DT_CHR) {
		struct dentry *dentry;
		const struct cred *old_cred;
		struct cred *override_cred;

		override_cred = prepare_creds();
		if (!override_cred) {
			kfree(p);
			return NULL;
		}

		/*
		 * CAP_DAC_OVERRIDE for lookup
		 */
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		old_cred = override_creds(override_cred);

		dentry = lookup_one_len(name, dir, len);
		if (!IS_ERR(dentry)) {
			p->is_whiteout = ovl_is_whiteout(dentry);
			dput(dentry);
		}
		revert_creds(old_cred);
		put_cred(override_cred);
	}
	return p;
}

static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}

static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}

void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *p;
	struct ovl_cache_entry *n;

	list_for_each_entry_safe(p, n, list, l_node)
		kfree(p);

	INIT_LIST_HEAD(list);
}

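/*
 * Drop one reference on the directory cache.  The last put detaches
 * the cache from the dentry and frees all cached entries.
 */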
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}

static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;
	if (!rdd->is_merge)
		return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
	else
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}

static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->dir = realpath->dentry;
	rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);
	fput(realfile);

	return err;
}

static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}
	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}

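/*
 * Build the merged directory listing: every layer except the lowest is
 * read into the rbtree-indexed list, then the lowest layer is merged
 * in via the "middle" list so that lower-only entries end up in front
 * of the upper ones, keeping offsets reasonably stable across reads.
 */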
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx, next;

	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath);

		if (next != -1) {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}

static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *p;
	loff_t off = 0;

	list_for_each(p, &od->cache->entries) {
		if (off >= pos)
			break;
		off++;
	}
	/* Cursor is safe since the cache is stable */
	od->cursor = p;
}

static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}

static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout)
			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
				break;
		od->cursor = p->l_node.next;
		ctx->pos++;
	}
	return 0;
}

static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}

static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct file *realfile = od->realfile;

	/*
	 * Need to check if we started out being a lower dir, but got copied up
	 */
	if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
		struct inode *inode = file_inode(file);

		realfile = lockless_dereference(od->upperfile);
		if (!realfile) {
			struct path upperpath;

			ovl_path_upper(dentry, &upperpath);
			realfile = ovl_path_open(&upperpath, O_RDONLY);
			smp_mb__before_spinlock();
			mutex_lock(&inode->i_mutex);
			if (!od->upperfile) {
				if (IS_ERR(realfile)) {
					mutex_unlock(&inode->i_mutex);
					return PTR_ERR(realfile);
				}
				od->upperfile = realfile;
			} else {
				/* somebody has beaten us to it */
				if (!IS_ERR(realfile))
					fput(realfile);
				realfile = od->upperfile;
			}
			mutex_unlock(&inode->i_mutex);
		}
	}

	return vfs_fsync_range(realfile, start, end, datasync);
}

static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;

	if (od->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}
	fput(od->realfile);
	if (od->upperfile)
		fput(od->upperfile);
	kfree(od);

	return 0;
}

static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *realfile;
	struct ovl_dir_file *od;
	enum ovl_path_type type;

	od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	realfile = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(realfile)) {
		kfree(od);
		return PTR_ERR(realfile);
	}
	od->realfile = realfile;
	od->is_real = !OVL_TYPE_MERGE(type);
	od->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = od;

	return 0;
}

const struct file_operations ovl_dir_operations = {
	.read		= generic_read_dir,
	.open		= ovl_dir_open,
	.iterate	= ovl_iterate,
	.llseek		= ovl_dir_llseek,
	.fsync		= ovl_dir_fsync,
	.release	= ovl_dir_release,
};

int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct ovl_cache_entry *p;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	err = 0;

	list_for_each_entry(p, list, l_node) {
		if (p->is_whiteout)
			continue;

		if (p->name[0] == '.') {
			if (p->len == 1)
				continue;
			if (p->len == 2 && p->name[1] == '.')
				continue;
		}
		err = -ENOTEMPTY;
		break;
	}

	return err;
}

void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *p;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
	list_for_each_entry(p, list, l_node) {
		struct dentry *dentry;

		if (!p->is_whiteout)
			continue;

		dentry = lookup_one_len(p->name, upper, p->len);
		if (IS_ERR(dentry)) {
			pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
			       upper->d_name.name, p->len, p->name,
			       (int) PTR_ERR(dentry));
			continue;
		}
		ovl_cleanup(upper->d_inode, dentry);
		dput(dentry);
	}
	mutex_unlock(&upper->d_inode->i_mutex);
}
File diff suppressed because it is too large
@@ -1,7 +0,0 @@
kmod(mcoverlay
	SOURCES
		copy_up.c dir.c inode.c readdir.c super.c namei.c util.c export.c
	INSTALL_DEST
		${KMODDIR}
)

@@ -1,804 +0,0 @@
/*
 *
 * Copyright (C) 2011 Novell Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"

#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)

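/*
 * Data is copied up in 1 MiB chunks (OVL_COPY_UP_CHUNK_SIZE) so that a
 * fatal signal can interrupt a large copy between chunks; see the
 * TASK_KILLABLE check in ovl_copy_up_data() below.
 */
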
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
		 "Warn on copy-up when causing process also has a R/O fd open");

static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
	const struct dentry *dentry = data;

	if (file_inode(f) == d_inode(dentry))
		pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
				    f, fd, current->pid, current->comm);
	return 0;
}

/*
 * Check the fds open by this process and warn if something like the following
 * scenario is about to occur:
 *
 *	fd1 = open("foo", O_RDONLY);
 *	fd2 = open("foo", O_RDWR);
 */
static void ovl_do_check_copy_up(struct dentry *dentry)
{
	if (ovl_check_copy_up)
		iterate_fd(current->files, 0, ovl_check_fd, dentry);
}

int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size, value_size = 0;
	char *buf, *name, *value = NULL;
	int uninitialized_var(error);
	size_t slen;

	if (!(old->d_inode->i_opflags & IOP_XATTR) ||
	    !(new->d_inode->i_opflags & IOP_XATTR))
		return 0;

	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out;
	}

	for (name = buf; list_size; name += slen) {
		slen = strnlen(name, list_size) + 1;

		/* underlying fs providing us with a broken xattr list? */
		if (WARN_ON(slen > list_size)) {
			error = -EIO;
			break;
		}
		list_size -= slen;

		if (ovl_is_private_xattr(name))
			continue;
retry:
		size = vfs_getxattr(old, name, value, value_size);
		if (size == -ERANGE)
			size = vfs_getxattr(old, name, NULL, 0);

		if (size < 0) {
			error = size;
			break;
		}

		if (size > value_size) {
			void *new;

			new = krealloc(value, size, GFP_KERNEL);
			if (!new) {
				error = -ENOMEM;
				break;
			}
			value = new;
			value_size = size;
			goto retry;
		}

		error = security_inode_copy_up_xattr(name);
		if (error < 0 && error != -EOPNOTSUPP)
			break;
		if (error == 1) {
			error = 0;
			continue; /* Discard */
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			break;
	}
	kfree(value);
out:
	kfree(buf);
	return error;
}

static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* Try to use clone_file_range to clone up within the same fs */
	error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
	if (!error)
		goto out;
	/* Couldn't clone, so now we try to copy the data */
	error = 0;

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
					 new_file, &new_pos,
					 this_len, SPLICE_F_MOVE);
		if (bytes <= 0) {
			error = bytes;
			break;
		}
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}
out:
	if (!error)
		error = vfs_fsync(new_file, 0);
	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}

static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
	struct iattr attr = {
		.ia_valid =
		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
		.ia_atime = stat->atime,
		.ia_mtime = stat->mtime,
	};

	return notify_change(upperdentry, &attr, NULL);
}

int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
	int err = 0;

	if (!S_ISLNK(stat->mode)) {
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode = stat->mode,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err) {
		struct iattr attr = {
			.ia_valid = ATTR_UID | ATTR_GID,
			.ia_uid = stat->uid,
			.ia_gid = stat->gid,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err)
		ovl_set_timestamps(upperdentry, stat);

	return err;
}

struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
{
	struct ovl_fh *fh;
	int fh_type, fh_len, dwords;
	void *buf;
	int buflen = MAX_HANDLE_SZ;
	uuid_t *uuid = &real->d_sb->s_uuid;

	buf = kmalloc(buflen, GFP_KERNEL);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	/*
	 * We encode a non-connectable file handle for non-dir, because we
	 * only need to find the lower inode number and we don't want to pay
	 * the price of reconnecting the dentry.
	 */
	dwords = buflen >> 2;
	fh_type = exportfs_encode_fh(real, buf, &dwords, 0);
	buflen = (dwords << 2);

	fh = ERR_PTR(-EIO);
	if (WARN_ON(fh_type < 0) ||
	    WARN_ON(buflen > MAX_HANDLE_SZ) ||
	    WARN_ON(fh_type == FILEID_INVALID))
		goto out;

	BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
	fh_len = offsetof(struct ovl_fh, fid) + buflen;
	fh = kmalloc(fh_len, GFP_KERNEL);
	if (!fh) {
		fh = ERR_PTR(-ENOMEM);
		goto out;
	}

	fh->version = OVL_FH_VERSION;
	fh->magic = OVL_FH_MAGIC;
	fh->type = fh_type;
	fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
	/*
	 * When we will want to decode an overlay dentry from this handle
	 * and all layers are on the same fs, if we get a disconnected real
	 * dentry when we decode fid, the only way to tell if we should assign
	 * it to upperdentry or to lowerstack is by checking this flag.
	 */
	if (is_upper)
		fh->flags |= OVL_FH_FLAG_PATH_UPPER;
	fh->len = fh_len;
	fh->uuid = *uuid;
	memcpy(fh->fid, buf, buflen);

out:
	kfree(buf);
	return fh;
}

int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
		   struct dentry *upper)
{
	const struct ovl_fh *fh = NULL;
	int err;

	/*
	 * When lower layer doesn't support export operations store a 'null' fh,
	 * so we can use the overlay.origin xattr to distinguish between a copy
	 * up and a pure upper inode.
	 */
	if (ovl_can_decode_fh(lower->d_sb)) {
		fh = ovl_encode_real_fh(lower, false);
		if (IS_ERR(fh))
			return PTR_ERR(fh);
	}

	/*
	 * Do not fail when upper doesn't support xattrs.
	 */
	err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
				 fh ? fh->len : 0, 0);
	kfree(fh);

	return err;
}

/* Store file handle of @upper dir in @index dir entry */
static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
{
	const struct ovl_fh *fh;
	int err;

	fh = ovl_encode_real_fh(upper, true);
	if (IS_ERR(fh))
		return PTR_ERR(fh);

	err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0);

	kfree(fh);
	return err;
}

/*
 * Create and install index entry.
 *
 * Caller must hold i_mutex on indexdir.
 */
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
			    struct dentry *upper)
{
	struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
	struct inode *dir = d_inode(indexdir);
	struct dentry *index = NULL;
	struct dentry *temp = NULL;
	struct qstr name = { };
	int err;

	/*
	 * For now this is only used for creating index entry for directories,
	 * because non-dir are copied up directly to index and then hardlinked
	 * to upper dir.
	 *
	 * TODO: implement create index for non-dir, so we can call it when
	 * encoding file handle for non-dir in case index does not exist.
	 */
	if (WARN_ON(!d_is_dir(dentry)))
		return -EIO;

	/* Directory not expected to be indexed before copy up */
	if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
		return -EIO;

	err = ovl_get_index_name(origin, &name);
	if (err)
		return err;

	temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
	err = PTR_ERR(temp);
	if (IS_ERR(temp))
		goto free_name;

	err = ovl_set_upper_fh(upper, temp);
	if (err)
		goto out;

	index = lookup_one_len(name.name, indexdir, name.len);
	if (IS_ERR(index)) {
		err = PTR_ERR(index);
	} else {
		err = ovl_do_rename(dir, temp, dir, index, 0);
		dput(index);
	}
out:
	if (err)
		ovl_cleanup(dir, temp);
	dput(temp);
free_name:
	kfree(name.name);
	return err;
}

struct ovl_copy_up_ctx {
	struct dentry *parent;
	struct dentry *dentry;
	struct path lowerpath;
	struct kstat stat;
	struct kstat pstat;
	const char *link;
	struct dentry *destdir;
	struct qstr destname;
	struct dentry *workdir;
	bool tmpfile;
	bool origin;
	bool indexed;
};

static int ovl_link_up(struct ovl_copy_up_ctx *c)
{
	int err;
	struct dentry *upper;
	struct dentry *upperdir = ovl_dentry_upper(c->parent);
	struct inode *udir = d_inode(upperdir);

	/* Mark parent "impure" because it may now contain non-pure upper */
	err = ovl_set_impure(c->parent, upperdir);
	if (err)
		return err;

	err = ovl_set_nlink_lower(c->dentry);
	if (err)
		return err;

	inode_lock_nested(udir, I_MUTEX_PARENT);
	upper = lookup_one_len(c->dentry->d_name.name, upperdir,
			       c->dentry->d_name.len);
	err = PTR_ERR(upper);
	if (!IS_ERR(upper)) {
		err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
		dput(upper);

		if (!err) {
			/* Restore timestamps on parent (best effort) */
			ovl_set_timestamps(upperdir, &c->pstat);
			ovl_dentry_set_upper_alias(c->dentry);
		}
	}
	inode_unlock(udir);
	if (err)
		return err;

	err = ovl_set_nlink_upper(c->dentry);

	return err;
}

static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
			    struct dentry **newdentry)
{
	int err;
	struct dentry *upper;
	struct inode *udir = d_inode(c->destdir);

	upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
	if (IS_ERR(upper))
		return PTR_ERR(upper);

	if (c->tmpfile)
		err = ovl_do_link(temp, udir, upper);
	else
		err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);

	if (!err)
		*newdentry = dget(c->tmpfile ? upper : temp);
	dput(upper);

	return err;
}

static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
{
	int err;
	struct dentry *temp;
	const struct cred *old_creds = NULL;
	struct cred *new_creds = NULL;
	struct ovl_cattr cattr = {
		/* Can't properly set mode on creation because of the umask */
|
||||
.mode = c->stat.mode & S_IFMT,
|
||||
.rdev = c->stat.rdev,
|
||||
.link = c->link
|
||||
};
|
||||
|
||||
err = security_inode_copy_up(c->dentry, &new_creds);
|
||||
temp = ERR_PTR(err);
|
||||
if (err < 0)
|
||||
goto out;
|
||||
|
||||
if (new_creds)
|
||||
old_creds = override_creds(new_creds);
|
||||
|
||||
if (c->tmpfile)
|
||||
temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
|
||||
else
|
||||
temp = ovl_create_temp(c->workdir, &cattr);
|
||||
out:
|
||||
if (new_creds) {
|
||||
revert_creds(old_creds);
|
||||
put_cred(new_creds);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (S_ISREG(c->stat.mode)) {
|
||||
struct path upperpath;
|
||||
|
||||
ovl_path_upper(c->dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = temp;
|
||||
|
||||
err = ovl_copy_up_data(&c->lowerpath, &upperpath, c->stat.size);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
inode_lock(temp->d_inode);
|
||||
err = ovl_set_attr(temp, &c->stat);
|
||||
inode_unlock(temp->d_inode);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/*
|
||||
* Store identifier of lower inode in upper inode xattr to
|
||||
* allow lookup of the copy up origin inode.
|
||||
*
|
||||
* Don't set origin when we are breaking the association with a lower
|
||||
* hard link.
|
||||
*/
|
||||
if (c->origin) {
|
||||
err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
struct inode *udir = c->destdir->d_inode;
|
||||
struct inode *inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *temp;
|
||||
int err;
|
||||
|
||||
temp = ovl_get_tmpfile(c);
|
||||
if (IS_ERR(temp))
|
||||
return PTR_ERR(temp);
|
||||
|
||||
err = ovl_copy_up_inode(c, temp);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (S_ISDIR(c->stat.mode) && c->indexed) {
|
||||
err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (c->tmpfile) {
|
||||
inode_lock_nested(udir, I_MUTEX_PARENT);
|
||||
err = ovl_install_temp(c, temp, &newdentry);
|
||||
inode_unlock(udir);
|
||||
} else {
|
||||
err = ovl_install_temp(c, temp, &newdentry);
|
||||
}
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
inode = d_inode(c->dentry);
|
||||
ovl_inode_update(inode, newdentry);
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
ovl_set_flag(OVL_WHITEOUTS, inode);
|
||||
|
||||
out:
|
||||
if (err && !c->tmpfile)
|
||||
ovl_cleanup(d_inode(c->workdir), temp);
|
||||
dput(temp);
|
||||
return err;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* All renames start with copy up of source if necessary. The actual
|
||||
* rename will only proceed once the copy up was successful. Copy up uses
|
||||
* upper parent i_mutex for exclusion. Since rename can change d_parent it
|
||||
* is possible that the copy up will lock the old parent. At that point
|
||||
* the file will have already been copied up anyway.
|
||||
*/
|
||||
static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
int err;
|
||||
struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info;
|
||||
bool to_index = false;
|
||||
|
||||
/*
|
||||
* Indexed non-dir is copied up directly to the index entry and then
|
||||
* hardlinked to upper dir. Indexed dir is copied up to indexdir,
|
||||
* then index entry is created and then copied up dir installed.
|
||||
* Copying dir up to indexdir instead of workdir simplifies locking.
|
||||
*/
|
||||
if (ovl_need_index(c->dentry)) {
|
||||
c->indexed = true;
|
||||
if (S_ISDIR(c->stat.mode))
|
||||
c->workdir = ovl_indexdir(c->dentry->d_sb);
|
||||
else
|
||||
to_index = true;
|
||||
}
|
||||
|
||||
if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
|
||||
c->origin = true;
|
||||
|
||||
if (to_index) {
|
||||
c->destdir = ovl_indexdir(c->dentry->d_sb);
|
||||
err = ovl_get_index_name(c->lowerpath.dentry, &c->destname);
|
||||
if (err)
|
||||
return err;
|
||||
} else if (WARN_ON(!c->parent)) {
|
||||
/* Disconnected dentry must be copied up to index dir */
|
||||
return -EIO;
|
||||
} else {
|
||||
/*
|
||||
* Mark parent "impure" because it may now contain non-pure
|
||||
* upper
|
||||
*/
|
||||
err = ovl_set_impure(c->parent, c->destdir);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Should we copyup with O_TMPFILE or with workdir? */
|
||||
if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
|
||||
c->tmpfile = true;
|
||||
err = ovl_copy_up_locked(c);
|
||||
} else {
|
||||
err = ovl_lock_rename_workdir(c->workdir, c->destdir);
|
||||
if (!err) {
|
||||
err = ovl_copy_up_locked(c);
|
||||
unlock_rename(c->workdir, c->destdir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (c->indexed)
|
||||
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
|
||||
|
||||
if (to_index) {
|
||||
/* Initialize nlink for copy up of disconnected dentry */
|
||||
err = ovl_set_nlink_upper(c->dentry);
|
||||
} else {
|
||||
struct inode *udir = d_inode(c->destdir);
|
||||
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
inode_lock(udir);
|
||||
ovl_set_timestamps(c->destdir, &c->pstat);
|
||||
inode_unlock(udir);
|
||||
|
||||
ovl_dentry_set_upper_alias(c->dentry);
|
||||
}
|
||||
|
||||
out:
|
||||
if (to_index)
|
||||
kfree(c->destname.name);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
int flags)
|
||||
{
|
||||
int err;
|
||||
DEFINE_DELAYED_CALL(done);
|
||||
struct path parentpath;
|
||||
struct ovl_copy_up_ctx ctx = {
|
||||
.parent = parent,
|
||||
.dentry = dentry,
|
||||
.workdir = ovl_workdir(dentry),
|
||||
};
|
||||
|
||||
if (WARN_ON(!ctx.workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_path_lower(dentry, &ctx.lowerpath);
|
||||
err = vfs_getattr(&ctx.lowerpath, &ctx.stat,
|
||||
STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (parent) {
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
ctx.destdir = parentpath.dentry;
|
||||
ctx.destname = dentry->d_name;
|
||||
|
||||
err = vfs_getattr(&parentpath, &ctx.pstat,
|
||||
STATX_ATIME | STATX_MTIME,
|
||||
AT_STATX_SYNC_AS_STAT);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* maybe truncate regular file. this has no effect on dirs */
|
||||
if (flags & O_TRUNC)
|
||||
ctx.stat.size = 0;
|
||||
|
||||
if (S_ISLNK(ctx.stat.mode)) {
|
||||
ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done);
|
||||
if (IS_ERR(ctx.link))
|
||||
return PTR_ERR(ctx.link);
|
||||
}
|
||||
ovl_do_check_copy_up(ctx.lowerpath.dentry);
|
||||
|
||||
err = ovl_copy_up_start(dentry);
|
||||
/* err < 0: interrupted, err > 0: raced with another copy-up */
|
||||
if (unlikely(err)) {
|
||||
if (err > 0)
|
||||
err = 0;
|
||||
} else {
|
||||
if (!ovl_dentry_upper(dentry))
|
||||
err = ovl_do_copy_up(&ctx);
|
||||
if (!err && parent && !ovl_dentry_has_upper_alias(dentry))
|
||||
err = ovl_link_up(&ctx);
|
||||
ovl_copy_up_end(dentry);
|
||||
}
|
||||
do_delayed_call(&done);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up_flags(struct dentry *dentry, int flags)
|
||||
{
|
||||
int err = 0;
|
||||
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
|
||||
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
|
||||
|
||||
/*
|
||||
* With NFS export, copy up can get called for a disconnected non-dir.
|
||||
* In this case, we will copy up lower inode to index dir without
|
||||
* linking it to upper dir.
|
||||
*/
|
||||
if (WARN_ON(disconnected && d_is_dir(dentry)))
|
||||
return -EIO;
|
||||
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent = NULL;
|
||||
|
||||
/*
|
||||
* Check if copy-up has happened as well as for upper alias (in
|
||||
* case of hard links) is there.
|
||||
*
|
||||
* Both checks are lockless:
|
||||
* - false negatives: will recheck under oi->lock
|
||||
* - false positives:
|
||||
* + ovl_dentry_upper() uses memory barriers to ensure the
|
||||
* upper dentry is up-to-date
|
||||
* + ovl_dentry_has_upper_alias() relies on locking of
|
||||
* upper parent i_rwsem to prevent reordering copy-up
|
||||
* with rename.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
(ovl_dentry_has_upper_alias(dentry) || disconnected))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (; !disconnected;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
if (ovl_dentry_upper(parent))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_one(parent, next, flags);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
return ovl_copy_up_flags(dentry, 0);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,853 +0,0 @@
|
||||
/*
|
||||
* Overlayfs NFS export support.
|
||||
*
|
||||
* Amir Goldstein <amir73il@gmail.com>
|
||||
*
|
||||
* Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/exportfs.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/version.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_encode_maybe_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (ovl_dentry_upper(dentry))
|
||||
return 0;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_copy_up(dentry);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
|
||||
dentry, err);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Before encoding a non-upper directory file handle from real layer N, we need
|
||||
* to check if it will be possible to reconnect an overlay dentry from the real
|
||||
* lower decoded dentry. This is done by following the overlay ancestry up to a
|
||||
* "layer N connected" ancestor and verifying that all parents along the way are
|
||||
* "layer N connectable". If an ancestor that is NOT "layer N connectable" is
|
||||
* found, we need to copy up an ancestor, which is "layer N connectable", thus
|
||||
* making that ancestor "layer N connected". For example:
|
||||
*
|
||||
* layer 1: /a
|
||||
* layer 2: /a/b/c
|
||||
*
|
||||
* The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
|
||||
* copied up and renamed, upper dir /a will be indexed by lower dir /a from
|
||||
* layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
|
||||
* in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
|
||||
* dentry from the connected lower dentry /a/b/c.
|
||||
*
|
||||
* To avoid this problem on decode time, we need to copy up an ancestor of
|
||||
* /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
|
||||
* /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
|
||||
* and when the time comes to decode the file handle from lower dentry /a/b/c,
|
||||
* ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
|
||||
* a connected overlay dentry will be accomplished.
|
||||
*
|
||||
* (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
|
||||
* entry /a in the lower layers above layer N and find the indexed dir /a from
|
||||
* layer 1. If that improvement is made, then the check for "layer N connected"
|
||||
* will need to verify there are no redirects in lower layers above N. In the
|
||||
* example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
|
||||
* is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
|
||||
*
|
||||
* layer 1: /A (redirect = /a)
|
||||
* layer 2: /a/b/c
|
||||
*/
|
||||
|
||||
/* Return the lowest layer for encoding a connectable file handle */
|
||||
static int ovl_connectable_layer(struct dentry *dentry)
|
||||
{
|
||||
struct ovl_entry *oe = OVL_E(dentry);
|
||||
|
||||
/* We can get overlay root from root of any layer */
|
||||
if (dentry == dentry->d_sb->s_root)
|
||||
return oe->numlower;
|
||||
|
||||
/*
|
||||
* If it's an unindexed merge dir, then it's not connectable with any
|
||||
* lower layer
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
return 0;
|
||||
|
||||
/* We can get upper/overlay path from indexed/lower dentry */
|
||||
return oe->lowerstack[0].layer->idx;
|
||||
}
|
||||
|
||||
/*
|
||||
* @dentry is "connected" if all ancestors up to root or a "connected" ancestor
|
||||
* have the same uppermost lower layer as the origin's layer. We may need to
|
||||
* copy up a "connectable" ancestor to make it "connected". A "connected" dentry
|
||||
* cannot become non "connected", so cache positive result in dentry flags.
|
||||
*
|
||||
* Return the connected origin layer or < 0 on error.
|
||||
*/
|
||||
static int ovl_connect_layer(struct dentry *dentry)
|
||||
{
|
||||
struct dentry *next, *parent = NULL;
|
||||
int origin_layer;
|
||||
int err = 0;
|
||||
|
||||
if (WARN_ON(dentry == dentry->d_sb->s_root) ||
|
||||
WARN_ON(!ovl_dentry_lower(dentry)))
|
||||
return -EIO;
|
||||
|
||||
origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
|
||||
if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
|
||||
return origin_layer;
|
||||
|
||||
/* Find the topmost origin layer connectable ancestor of @dentry */
|
||||
next = dget(dentry);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
if (WARN_ON(parent == next)) {
|
||||
err = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If @parent is not origin layer connectable, then copy up
|
||||
* @next which is origin layer connectable and we are done.
|
||||
*/
|
||||
if (ovl_connectable_layer(parent) < origin_layer) {
|
||||
err = ovl_encode_maybe_copy_up(next);
|
||||
break;
|
||||
}
|
||||
|
||||
/* If @parent is connected or indexed we are done */
|
||||
if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
|
||||
ovl_test_flag(OVL_INDEX, d_inode(parent)))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
|
||||
if (!err)
|
||||
ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
|
||||
|
||||
return err ?: origin_layer;
|
||||
}
|
||||
|
||||
/*
|
||||
* We only need to encode origin if there is a chance that the same object was
|
||||
* encoded pre copy up and then we need to stay consistent with the same
|
||||
* encoding also after copy up. If non-pure upper is not indexed, then it was
|
||||
* copied up before NFS export was enabled. In that case we don't need to worry
|
||||
* about staying consistent with pre copy up encoding and we encode an upper
|
||||
* file handle. Overlay root dentry is a private case of non-indexed upper.
|
||||
*
|
||||
* The following table summarizes the different file handle encodings used for
|
||||
* different overlay object types:
|
||||
*
|
||||
* Object type | Encoding
|
||||
* --------------------------------
|
||||
* Pure upper | U
|
||||
* Non-indexed upper | U
|
||||
* Indexed upper | L (*)
|
||||
* Non-upper | L (*)
|
||||
*
|
||||
* U = upper file handle
|
||||
* L = lower file handle
|
||||
*
|
||||
* (*) Connecting an overlay dir from real lower dentry is not always
|
||||
* possible when there are redirects in lower layers and non-indexed merge dirs.
|
||||
* To mitigate those case, we may copy up the lower dir ancestor before encode
|
||||
* a lower dir file handle.
|
||||
*
|
||||
* Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
|
||||
*/
|
||||
static int ovl_check_encode_origin(struct dentry *dentry)
|
||||
{
|
||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
||||
|
||||
/* Upper file handle for pure upper */
|
||||
if (!ovl_dentry_lower(dentry))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Upper file handle for non-indexed upper.
|
||||
*
|
||||
* Root is never indexed, so if there's an upper layer, encode upper for
|
||||
* root.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Decoding a merge dir, whose origin's ancestor is under a redirected
|
||||
* lower dir or under a non-indexed upper is not always possible.
|
||||
* ovl_connect_layer() will try to make origin's layer "connected" by
|
||||
* copying up a "connectable" ancestor.
|
||||
*/
|
||||
if (d_is_dir(dentry) && ofs->upper_mnt)
|
||||
return ovl_connect_layer(dentry);
|
||||
|
||||
/* Lower file handle for indexed and non-upper dir/non-dir */
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
|
||||
{
|
||||
struct ovl_fh *fh = NULL;
|
||||
int err, enc_lower;
|
||||
|
||||
/*
|
||||
* Check if we should encode a lower or upper file handle and maybe
|
||||
* copy up an ancestor to make lower file handle connectable.
|
||||
*/
|
||||
err = enc_lower = ovl_check_encode_origin(dentry);
|
||||
if (enc_lower < 0)
|
||||
goto fail;
|
||||
|
||||
/* Encode an upper or lower file handle */
|
||||
fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
|
||||
ovl_dentry_upper(dentry), !enc_lower);
|
||||
err = PTR_ERR(fh);
|
||||
if (IS_ERR(fh))
|
||||
goto fail;
|
||||
|
||||
err = -EOVERFLOW;
|
||||
if (fh->len > buflen)
|
||||
goto fail;
|
||||
|
||||
memcpy(buf, (char *)fh, fh->len);
|
||||
err = fh->len;
|
||||
|
||||
out:
|
||||
kfree(fh);
|
||||
return err;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
|
||||
dentry, err, buflen, fh ? (int)fh->len : 0,
|
||||
fh ? fh->type : 0);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
|
||||
{
|
||||
int res, len = *max_len << 2;
|
||||
|
||||
res = ovl_d_to_fh(dentry, (char *)fid, len);
|
||||
if (res <= 0)
|
||||
return FILEID_INVALID;
|
||||
|
||||
len = res;
|
||||
|
||||
/* Round up to dwords */
|
||||
*max_len = (len + 3) >> 2;
|
||||
return OVL_FILEID;
|
||||
}
|
||||
|
||||
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
|
||||
struct inode *parent)
|
||||
{
|
||||
struct dentry *dentry;
|
||||
int type;
|
||||
|
||||
/* TODO: encode connectable file handles */
|
||||
if (parent)
|
||||
return FILEID_INVALID;
|
||||
|
||||
dentry = d_find_any_alias(inode);
|
||||
if (WARN_ON(!dentry))
|
||||
return FILEID_INVALID;
|
||||
|
||||
type = ovl_dentry_to_fh(dentry, fid, max_len);
|
||||
|
||||
dput(dentry);
|
||||
return type;
|
||||
}
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
|
||||
/*
|
||||
* Find or instantiate an overlay dentry from real dentries and index.
|
||||
*/
|
||||
static struct dentry *ovl_obtain_alias(struct super_block *sb,
|
||||
struct dentry *upper_alias,
|
||||
struct ovl_path *lowerpath,
|
||||
struct dentry *index)
|
||||
{
|
||||
struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
|
||||
struct dentry *upper = upper_alias ?: index;
|
||||
struct dentry *dentry;
|
||||
struct inode *inode;
|
||||
struct ovl_entry *oe;
|
||||
struct ovl_inode_params oip = {
|
||||
.lowerpath = lowerpath,
|
||||
.index = index,
|
||||
.numlower = !!lower
|
||||
};
|
||||
|
||||
/* We get overlay directory dentries with ovl_lookup_real() */
|
||||
if (d_is_dir(upper ?: lower))
|
||||
return ERR_PTR(-EIO);
|
||||
|
||||
oip.upperdentry = dget(upper);
|
||||
inode = ovl_get_inode(sb, &oip);
|
||||
if (IS_ERR(inode)) {
|
||||
dput(upper);
|
||||
return ERR_CAST(inode);
|
||||
}
|
||||
|
||||
dentry = d_find_any_alias(inode);
|
||||
if (!dentry) {
|
||||
dentry = d_alloc_anon(inode->i_sb);
|
||||
if (!dentry)
|
||||
goto nomem;
|
||||
oe = ovl_alloc_entry(lower ? 1 : 0);
|
||||
if (!oe)
|
||||
goto nomem;
|
||||
|
||||
if (lower) {
|
||||
oe->lowerstack->dentry = dget(lower);
|
||||
oe->lowerstack->layer = lowerpath->layer;
|
||||
}
|
||||
dentry->d_fsdata = oe;
|
||||
if (upper_alias)
|
||||
ovl_dentry_set_upper_alias(dentry);
|
||||
}
|
||||
|
||||
return d_instantiate_anon(dentry, inode);
|
||||
|
||||
nomem:
|
||||
iput(inode);
|
||||
dput(dentry);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* Get the upper or lower dentry in stach whose on layer @idx */
|
||||
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
|
||||
{
|
||||
struct ovl_entry *oe = dentry->d_fsdata;
|
||||
int i;
|
||||
|
||||
if (!idx)
|
||||
return ovl_dentry_upper(dentry);
|
||||
|
||||
for (i = 0; i < oe->numlower; i++) {
|
||||
if (oe->lowerstack[i].layer->idx == idx)
|
||||
return oe->lowerstack[i].dentry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup a child overlay dentry to get a connected overlay dentry whose real
|
||||
* dentry is @real. If @real is on upper layer, we lookup a child overlay
|
||||
* dentry with the same name as the real dentry. Otherwise, we need to consult
|
||||
* index for lookup.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_one(struct dentry *connected,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct inode *dir = d_inode(connected);
|
||||
struct dentry *this, *parent = NULL;
|
||||
struct name_snapshot name;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* Lookup child overlay dentry by real name. The dir mutex protects us
|
||||
* from racing with overlay rename. If the overlay dentry that is above
|
||||
* real has already been moved to a parent that is not under the
|
||||
* connected overlay dir, we return -ECHILD and restart the lookup of
|
||||
* connected real path from the top.
|
||||
*/
|
||||
inode_lock_nested(dir, I_MUTEX_PARENT);
|
||||
err = -ECHILD;
|
||||
parent = dget_parent(real);
|
||||
if (ovl_dentry_real_at(connected, layer->idx) != parent)
|
||||
goto fail;
|
||||
|
||||
/*
|
||||
* We also need to take a snapshot of real dentry name to protect us
|
||||
* from racing with underlying layer rename. In this case, we don't
|
||||
* care about returning ESTALE, only from dereferencing a free name
|
||||
* pointer because we hold no lock on the real dentry.
|
||||
*/
|
||||
take_dentry_name_snapshot(&name, real);
|
||||
this = lookup_one_len(name.name, connected, strlen(name.name));
|
||||
err = PTR_ERR(this);
|
||||
if (IS_ERR(this)) {
|
||||
goto fail;
|
||||
} else if (!this || !this->d_inode) {
|
||||
dput(this);
|
||||
err = -ENOENT;
|
||||
goto fail;
|
||||
} else if (ovl_dentry_real_at(this, layer->idx) != real) {
|
||||
dput(this);
|
||||
err = -ESTALE;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
out:
|
||||
release_dentry_name_snapshot(&name);
|
||||
dput(parent);
|
||||
inode_unlock(dir);
|
||||
return this;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
|
||||
real, layer->idx, connected, err);
|
||||
this = ERR_PTR(err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_lookup_real(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer);
|
||||
|
||||
/*
|
||||
* Lookup an indexed or hashed overlay dentry by real inode.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
|
||||
struct dentry *index = NULL;
|
||||
struct dentry *this = NULL;
|
||||
struct inode *inode;
|
||||
|
||||
/*
|
||||
* Decoding upper dir from index is expensive, so first try to lookup
|
||||
* overlay dentry in inode/dcache.
|
||||
*/
|
||||
inode = ovl_lookup_inode(sb, real, !layer->idx);
|
||||
if (IS_ERR(inode))
|
||||
return ERR_CAST(inode);
|
||||
if (inode) {
|
||||
this = d_find_any_alias(inode);
|
||||
iput(inode);
|
||||
}
|
||||
|
||||
/*
|
||||
* For decoded lower dir file handle, lookup index by origin to check
|
||||
* if lower dir was copied up and and/or removed.
|
||||
*/
|
||||
if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
|
||||
index = ovl_lookup_index(ofs, NULL, real, false);
|
||||
if (IS_ERR(index))
|
||||
return index;
|
||||
}
|
||||
|
||||
/* Get connected upper overlay dir from index */
|
||||
if (index) {
|
||||
struct dentry *upper = ovl_index_upper(ofs, index);
|
||||
|
||||
dput(index);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
return upper;
|
||||
|
||||
/*
|
||||
* ovl_lookup_real() in lower layer may call recursively once to
|
||||
* ovl_lookup_real() in upper layer. The first level call walks
|
||||
* back lower parents to the topmost indexed parent. The second
|
||||
* recursive call walks back from indexed upper to the topmost
|
||||
* connected/hashed upper parent (or up to root).
|
||||
*/
|
||||
this = ovl_lookup_real(sb, upper, &upper_layer);
|
||||
dput(upper);
|
||||
}
|
||||
|
||||
if (IS_ERR_OR_NULL(this))
|
||||
return this;
|
||||
|
||||
if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
|
||||
dput(this);
|
||||
this = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup an indexed or hashed overlay dentry, whose real dentry is an
|
||||
* ancestor of @real.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct dentry *next, *parent = NULL;
|
||||
struct dentry *ancestor = ERR_PTR(-EIO);
|
||||
|
||||
if (real == layer->mnt->mnt_root)
|
||||
return dget(sb->s_root);
|
||||
|
||||
/* Find the topmost indexed or hashed ancestor */
|
||||
next = dget(real);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
/*
|
||||
* Lookup a matching overlay dentry in inode/dentry
|
||||
* cache or in index by real inode.
|
||||
*/
|
||||
ancestor = ovl_lookup_real_inode(sb, next, layer);
|
||||
if (ancestor)
|
||||
break;
|
||||
|
||||
if (parent == layer->mnt->mnt_root) {
|
||||
ancestor = dget(sb->s_root);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If @real has been moved out of the layer root directory,
|
||||
* we will eventully hit the real fs root. This cannot happen
|
||||
* by legit overlay rename, so we return error in that case.
|
||||
*/
|
||||
if (parent == next) {
|
||||
ancestor = ERR_PTR(-EXDEV);
|
||||
break;
|
||||
}
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
|
||||
return ancestor;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup a connected overlay dentry whose real dentry is @real.
|
||||
* If @real is on upper layer, we lookup a child overlay dentry with the same
|
||||
* path the real dentry. Otherwise, we need to consult index for lookup.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct dentry *connected;
|
||||
int err = 0;
|
||||
|
||||
connected = ovl_lookup_real_ancestor(sb, real, layer);
|
||||
if (IS_ERR(connected))
|
||||
return connected;
|
||||
|
||||
while (!err) {
|
||||
struct dentry *next, *this;
|
||||
struct dentry *parent = NULL;
|
||||
struct dentry *real_connected = ovl_dentry_real_at(connected,
|
||||
layer->idx);
|
||||
|
||||
if (real_connected == real)
|
||||
break;
|
||||
|
||||
/* Find the topmost dentry not yet connected */
|
||||
next = dget(real);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
if (parent == real_connected)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If real has been moved out of 'real_connected',
|
||||
* we will not find 'real_connected' and hit the layer
|
||||
* root. In that case, we need to restart connecting.
|
||||
* This game can go on forever in the worst case. We
|
||||
* may want to consider taking s_vfs_rename_mutex if
|
||||
* this happens more than once.
|
||||
*/
|
||||
if (parent == layer->mnt->mnt_root) {
|
||||
dput(connected);
|
||||
connected = dget(sb->s_root);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If real file has been moved out of the layer root
|
||||
* directory, we will eventully hit the real fs root.
|
||||
* This cannot happen by legit overlay rename, so we
|
||||
* return error in that case.
|
||||
*/
|
||||
if (parent == next) {
|
||||
err = -EXDEV;
|
||||
break;
|
||||
}
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
if (!err) {
|
||||
this = ovl_lookup_real_one(connected, next, layer);
|
||||
if (IS_ERR(this))
|
||||
err = PTR_ERR(this);
|
||||
|
||||
/*
|
||||
* Lookup of child in overlay can fail when racing with
|
||||
* overlay rename of child away from 'connected' parent.
|
||||
* In this case, we need to restart the lookup from the
|
||||
* top, because we cannot trust that 'real_connected' is
|
||||
* still an ancestor of 'real'. There is a good chance
|
||||
* that the renamed overlay ancestor is now in cache, so
|
||||
* ovl_lookup_real_ancestor() will find it and we can
|
||||
* continue to connect exactly from where lookup failed.
|
||||
*/
|
||||
if (err == -ECHILD) {
|
||||
this = ovl_lookup_real_ancestor(sb, real,
|
||||
layer);
|
||||
err = PTR_ERR_OR_ZERO(this);
|
||||
}
|
||||
if (!err) {
|
||||
dput(connected);
|
||||
connected = this;
|
||||
}
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
if (err)
|
||||
goto fail;
|
||||
|
||||
return connected;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
|
||||
real, layer->idx, connected, err);
|
||||
dput(connected);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get an overlay dentry from upper/lower real dentries and index.
|
||||
*/
|
||||
static struct dentry *ovl_get_dentry(struct super_block *sb,
|
||||
struct dentry *upper,
|
||||
struct ovl_path *lowerpath,
|
||||
struct dentry *index)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
|
||||
struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer;
|
||||
struct dentry *real = upper ?: (index ?: lowerpath->dentry);
|
||||
|
||||
/*
|
||||
* Obtain a disconnected overlay dentry from a non-dir real dentry
|
||||
* and index.
|
||||
*/
|
||||
if (!d_is_dir(real))
|
||||
return ovl_obtain_alias(sb, upper, lowerpath, index);
|
||||
|
||||
/* Removed empty directory? */
|
||||
if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real))
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
/*
|
||||
* If real dentry is connected and hashed, get a connected overlay
|
||||
* dentry whose real dentry is @real.
|
||||
*/
|
||||
return ovl_lookup_real(sb, real, layer);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
|
||||
struct ovl_fh *fh)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct dentry *dentry;
|
||||
struct dentry *upper;
|
||||
|
||||
if (!ofs->upper_mnt)
|
||||
return ERR_PTR(-EACCES);
|
||||
|
||||
upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
return upper;
|
||||
|
||||
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
|
||||
dput(upper);
|
||||
|
||||
return dentry;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
|
||||
struct ovl_fh *fh)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_path origin = { };
|
||||
struct ovl_path *stack = &origin;
|
||||
struct dentry *dentry = NULL;
|
||||
struct dentry *index = NULL;
|
||||
struct inode *inode;
|
||||
int err;
|
||||
|
||||
/* First lookup overlay inode in inode cache by origin fh */
|
||||
err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
if (!d_is_dir(origin.dentry) ||
|
||||
!(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
|
||||
inode = ovl_lookup_inode(sb, origin.dentry, false);
|
||||
err = PTR_ERR(inode);
|
||||
if (IS_ERR(inode))
|
||||
goto out_err;
|
||||
if (inode) {
|
||||
dentry = d_find_any_alias(inode);
|
||||
iput(inode);
|
||||
if (dentry)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then lookup indexed upper/whiteout by origin fh */
|
||||
if (ofs->indexdir) {
|
||||
index = ovl_get_index_fh(ofs, fh);
|
||||
err = PTR_ERR(index);
|
||||
if (IS_ERR(index)) {
|
||||
index = NULL;
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then try to get a connected upper dir by index */
|
||||
if (index && d_is_dir(index)) {
|
||||
struct dentry *upper = ovl_index_upper(ofs, index);
|
||||
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
goto out_err;
|
||||
|
||||
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
|
||||
dput(upper);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Otherwise, get a connected non-upper dir or disconnected non-dir */
|
||||
if (d_is_dir(origin.dentry) &&
|
||||
(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
|
||||
dput(origin.dentry);
|
||||
origin.dentry = NULL;
|
||||
err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
|
||||
if (err)
|
||||
goto out_err;
|
||||
}
|
||||
if (index) {
|
||||
err = ovl_verify_origin(index, origin.dentry, false);
|
||||
if (err)
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
dentry = ovl_get_dentry(sb, NULL, &origin, index);
|
||||
|
||||
out:
|
||||
dput(origin.dentry);
|
||||
dput(index);
|
||||
return dentry;
|
||||
|
||||
out_err:
|
||||
dentry = ERR_PTR(err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
struct dentry *dentry = NULL;
|
||||
struct ovl_fh *fh = (struct ovl_fh *) fid;
|
||||
int len = fh_len << 2;
|
||||
unsigned int flags = 0;
|
||||
int err;
|
||||
|
||||
err = -EINVAL;
|
||||
if (fh_type != OVL_FILEID)
|
||||
goto out_err;
|
||||
|
||||
err = ovl_check_fh_len(fh, len);
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
flags = fh->flags;
|
||||
dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
|
||||
ovl_upper_fh_to_d(sb, fh) :
|
||||
ovl_lower_fh_to_d(sb, fh);
|
||||
err = PTR_ERR(dentry);
|
||||
if (IS_ERR(dentry) && err != -ESTALE)
|
||||
goto out_err;
|
||||
|
||||
return dentry;
|
||||
|
||||
out_err:
|
||||
pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
|
||||
len, fh_type, flags, err);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
|
||||
return ERR_PTR(-EACCES);
|
||||
}
|
||||
|
||||
static int ovl_get_name(struct dentry *parent, char *name,
|
||||
struct dentry *child)
|
||||
{
|
||||
/*
|
||||
* ovl_fh_to_dentry() returns connected dir overlay dentries and
|
||||
* ovl_fh_to_parent() is not implemented, so we should not get here.
|
||||
*/
|
||||
WARN_ON_ONCE(1);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_get_parent(struct dentry *dentry)
|
||||
{
|
||||
/*
|
||||
* ovl_fh_to_dentry() returns connected dir overlay dentries, so we
|
||||
* should not get here.
|
||||
*/
|
||||
WARN_ON_ONCE(1);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
#endif
|
||||
|
||||
const struct export_operations ovl_export_operations = {
|
||||
.encode_fh = ovl_encode_fh,
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
|
||||
.fh_to_dentry = ovl_fh_to_dentry,
|
||||
.fh_to_parent = ovl_fh_to_parent,
|
||||
.get_name = ovl_get_name,
|
||||
.get_parent = ovl_get_parent,
|
||||
#endif
|
||||
};
|
||||
@ -1,874 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/posix_acl.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/version.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Check for permissions before trying to copy-up. This is redundant
|
||||
* since it will be rechecked later by ->setattr() on upper dentry. But
|
||||
* without this, copy-up can be triggered by just about anybody.
|
||||
*
|
||||
* We don't initialize inode->size, which just means that
|
||||
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
|
||||
* check for a swapfile (which this won't be anyway).
|
||||
*/
|
||||
err = setattr_prepare(dentry, attr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
|
||||
attr->ia_valid &= ~ATTR_MODE;
|
||||
|
||||
inode_lock(upperdentry->d_inode);
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
revert_creds(old_cred);
|
||||
if (!err)
|
||||
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
|
||||
inode_unlock(upperdentry->d_inode);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
|
||||
struct ovl_layer *lower_layer)
|
||||
{
|
||||
bool samefs = ovl_same_sb(dentry->d_sb);
|
||||
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
|
||||
|
||||
if (samefs) {
|
||||
/*
|
||||
* When all layers are on the same fs, all real inode
|
||||
* number are unique, so we use the overlay st_dev,
|
||||
* which is friendly to du -x.
|
||||
*/
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
return 0;
|
||||
} else if (xinobits) {
|
||||
unsigned int shift = 64 - xinobits;
|
||||
/*
|
||||
* All inode numbers of underlying fs should not be using the
|
||||
* high xinobits, so we use high xinobits to partition the
|
||||
* overlay st_ino address space. The high bits holds the fsid
|
||||
* (upper fsid is 0). This way overlay inode numbers are unique
|
||||
* and all inodes use overlay st_dev. Inode numbers are also
|
||||
* persistent for a given layer configuration.
|
||||
*/
|
||||
if (stat->ino >> shift) {
|
||||
pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
|
||||
dentry, stat->ino, xinobits);
|
||||
} else {
|
||||
if (lower_layer)
|
||||
stat->ino |= ((u64)lower_layer->fsid) << shift;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* The inode could not be mapped to a unified st_ino address space */
|
||||
if (S_ISDIR(dentry->d_inode->i_mode)) {
|
||||
/*
|
||||
* Always use the overlay st_dev for directories, so 'find
|
||||
* -xdev' will scan the entire overlay mount and won't cross the
|
||||
* overlay mount boundaries.
|
||||
*
|
||||
* If not all layers are on the same fs the pair {real st_ino;
|
||||
* overlay st_dev} is not unique, so use the non persistent
|
||||
* overlay st_ino for directories.
|
||||
*/
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
} else if (lower_layer && lower_layer->fsid) {
|
||||
/*
|
||||
* For non-samefs setup, if we cannot map all layers st_ino
|
||||
* to a unified address space, we need to make sure that st_dev
|
||||
* is unique per lower fs. Upper layer uses real st_dev and
|
||||
* lower layers use the unique anonymous bdev assigned to the
|
||||
* lower fs.
|
||||
*/
|
||||
stat->dev = lower_layer->fs->pseudo_dev;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ovl_getattr(const struct path *path, struct kstat *stat,
|
||||
u32 request_mask, unsigned int flags)
|
||||
{
|
||||
struct dentry *dentry = path->dentry;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
const struct cred *old_cred;
|
||||
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
|
||||
bool samefs = ovl_same_sb(dentry->d_sb);
|
||||
struct ovl_layer *lower_layer = NULL;
|
||||
int err;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
err = vfs_getattr(&realpath, stat, request_mask, flags);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* For non-dir or same fs, we use st_ino of the copy up origin.
|
||||
* This guaranties constant st_dev/st_ino across copy up.
|
||||
* With xino feature and non-samefs, we use st_ino of the copy up
|
||||
* origin masked with high bits that represent the layer id.
|
||||
*
|
||||
* If lower filesystem supports NFS file handles, this also guaranties
|
||||
* persistent st_ino across mount cycle.
|
||||
*/
|
||||
if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
lower_layer = ovl_layer_lower(dentry);
|
||||
} else if (OVL_TYPE_ORIGIN(type)) {
|
||||
struct kstat lowerstat;
|
||||
u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
|
||||
|
||||
ovl_path_lower(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, &lowerstat,
|
||||
lowermask, flags);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Lower hardlinks may be broken on copy up to different
|
||||
* upper files, so we cannot use the lower origin st_ino
|
||||
* for those different files, even for the same fs case.
|
||||
*
|
||||
* Similarly, several redirected dirs can point to the
|
||||
* same dir on a lower layer. With the "verify_lower"
|
||||
* feature, we do not use the lower origin st_ino, if
|
||||
* we haven't verified that this redirect is unique.
|
||||
*
|
||||
* With inodes index enabled, it is safe to use st_ino
|
||||
* of an indexed origin. The index validates that the
|
||||
* upper hardlink is not broken and that a redirected
|
||||
* dir is the only redirect to that origin.
|
||||
*/
|
||||
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
|
||||
(!ovl_verify_lower(dentry->d_sb) &&
|
||||
(is_dir || lowerstat.nlink == 1))) {
|
||||
stat->ino = lowerstat.ino;
|
||||
lower_layer = ovl_layer_lower(dentry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err = ovl_map_dev_ino(dentry, stat, lower_layer);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (is_dir && OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
/*
|
||||
* Return the overlay inode nlinks for indexed upper inodes.
|
||||
* Overlay inode nlink counts the union of the upper hardlinks
|
||||
* and non-covered lower hardlinks. It does not include the upper
|
||||
* index hardlink.
|
||||
*/
|
||||
if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
stat->nlink = dentry->d_inode->i_nlink;
|
||||
|
||||
out:
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct inode *upperinode = ovl_inode_upper(inode);
|
||||
struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
|
||||
const struct cred *old_cred;
|
||||
int err;
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
return -ECHILD;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check overlay inode with the creds of task and underlying inode
|
||||
* with creds of mounter
|
||||
*/
|
||||
err = generic_permission(inode, mask);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
old_cred = ovl_override_creds(inode->i_sb);
|
||||
if (!upperinode &&
|
||||
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
|
||||
mask &= ~(MAY_WRITE | MAY_APPEND);
|
||||
/* Make sure mounter can read file for copy up later */
|
||||
mask |= MAY_READ;
|
||||
}
|
||||
err = inode_permission(realinode, mask);
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
|
||||
{
|
||||
struct path realpath;
|
||||
struct inode *realinode;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
realinode = realpath.dentry->d_inode;
|
||||
|
||||
if (!realinode->i_op->readlink)
|
||||
return -EINVAL;
|
||||
|
||||
touch_atime(&realpath);
|
||||
|
||||
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
|
||||
}
|
||||
|
||||
static const char *ovl_get_link(struct dentry *dentry,
|
||||
struct inode *inode,
|
||||
struct delayed_call *done)
|
||||
{
|
||||
const struct cred *old_cred;
|
||||
const char *p;
|
||||
|
||||
if (!dentry)
|
||||
return ERR_PTR(-ECHILD);
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
p = vfs_get_link(ovl_dentry_real(dentry), done);
|
||||
revert_creds(old_cred);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PREFIX,
|
||||
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
|
||||
}
|
||||
|
||||
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
|
||||
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
|
||||
const struct cred *old_cred;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return 0;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (!value && !upperdentry) {
|
||||
err = vfs_getxattr(realdentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
if (!upperdentry) {
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
realdentry = ovl_dentry_upper(dentry);
|
||||
}
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
if (value)
|
||||
err = vfs_setxattr(realdentry, name, value, size, flags);
|
||||
else {
|
||||
WARN_ON(flags != XATTR_REPLACE);
|
||||
err = vfs_removexattr(realdentry, name);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
void *value, size_t size)
|
||||
{
|
||||
ssize_t res;
|
||||
const struct cred *old_cred;
|
||||
struct dentry *realdentry =
|
||||
ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
res = vfs_getxattr(realdentry, name, value, size);
|
||||
revert_creds(old_cred);
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool ovl_can_list(const char *s)
|
||||
{
|
||||
/* List all non-trusted xatts */
|
||||
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
|
||||
return true;
|
||||
|
||||
/* Never list trusted.overlay, list other trusted for superuser only */
|
||||
return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
|
||||
}
|
||||
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
|
||||
{
|
||||
struct dentry *realdentry = ovl_dentry_real(dentry);
|
||||
ssize_t res;
|
||||
size_t len;
|
||||
char *s;
|
||||
const struct cred *old_cred;
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
res = vfs_listxattr(realdentry, list, size);
|
||||
revert_creds(old_cred);
|
||||
if (res <= 0 || size == 0)
|
||||
return res;
|
||||
|
||||
/* filter out private xattrs */
|
||||
for (s = list, len = res; len;) {
|
||||
size_t slen = strnlen(s, len) + 1;
|
||||
|
||||
/* underlying fs providing us with an broken xattr list? */
|
||||
if (WARN_ON(slen > len))
|
||||
return -EIO;
|
||||
|
||||
len -= slen;
|
||||
if (!ovl_can_list(s)) {
|
||||
res -= slen;
|
||||
memmove(s, s + slen, len);
|
||||
} else {
|
||||
s += slen;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
struct posix_acl *ovl_get_acl(struct inode *inode, int type)
|
||||
{
|
||||
struct inode *realinode = ovl_inode_real(inode);
|
||||
const struct cred *old_cred;
|
||||
struct posix_acl *acl;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
|
||||
return NULL;
|
||||
|
||||
old_cred = ovl_override_creds(inode->i_sb);
|
||||
acl = get_acl(realinode, type);
|
||||
revert_creds(old_cred);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
|
||||
{
|
||||
/* Copy up of disconnected dentry does not set upper alias */
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
(ovl_dentry_has_upper_alias(dentry) ||
|
||||
(dentry->d_flags & DCACHE_DISCONNECTED)))
|
||||
return false;
|
||||
|
||||
if (special_file(d_inode(dentry)->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return err;
|
||||
|
||||
if (ovl_open_need_copy_up(dentry, file_flags)) {
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_copy_up_flags(dentry, file_flags);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
|
||||
{
|
||||
if (flags & S_ATIME) {
|
||||
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
|
||||
struct path upperpath = {
|
||||
.mnt = ofs->upper_mnt,
|
||||
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
|
||||
};
|
||||
|
||||
if (upperpath.dentry) {
|
||||
touch_atime(&upperpath);
|
||||
inode->i_atime = d_inode(upperpath.dentry)->i_atime;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct inode_operations ovl_file_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.get_acl = ovl_get_acl,
|
||||
.update_time = ovl_update_time,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.get_link = ovl_get_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.update_time = ovl_update_time,
|
||||
};
|
||||
|
||||
/*
|
||||
* It is possible to stack overlayfs instance on top of another
|
||||
* overlayfs instance as lower layer. We need to annonate the
|
||||
* stackable i_mutex locks according to stack level of the super
|
||||
* block instance. An overlayfs instance can never be in stack
|
||||
* depth 0 (there is always a real fs below it). An overlayfs
|
||||
* inode lock will use the lockdep annotaion ovl_i_mutex_key[depth].
|
||||
*
|
||||
* For example, here is a snip from /proc/lockdep_chains after
|
||||
* dir_iterate of nested overlayfs:
|
||||
*
|
||||
* [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
|
||||
* [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
|
||||
* [...] &type->i_mutex_dir_key (stack_depth=0)
|
||||
*/
|
||||
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
|
||||
|
||||
static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
|
||||
{
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
|
||||
static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
|
||||
static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
|
||||
|
||||
int depth = inode->i_sb->s_stack_depth - 1;
|
||||
|
||||
if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
|
||||
depth = 0;
|
||||
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
|
||||
else
|
||||
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
|
||||
|
||||
lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
|
||||
#endif
|
||||
}
|

static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
			   unsigned long ino, int fsid)
{
	int xinobits = ovl_xino_bits(inode->i_sb);

	/*
	 * When NFS export is enabled and d_ino is consistent with st_ino
	 * (samefs or i_ino has enough bits to encode layer), set the same
	 * value used for d_ino to i_ino, because nfsd readdirplus compares
	 * d_ino values to i_ino values of child entries. When called from
	 * ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
	 * upper inode i_ino on ovl_inode_init() or ovl_inode_update().
	 */
	if (inode->i_sb->s_export_op &&
	    (ovl_same_sb(inode->i_sb) || xinobits)) {
		inode->i_ino = ino;
		if (xinobits && fsid && !(ino >> (64 - xinobits)))
			inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
	} else {
		inode->i_ino = get_next_ino();
	}
	inode->i_mode = mode;
	inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
	inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif

	ovl_lockdep_annotate_inode_mutex_key(inode);

	switch (mode & S_IFMT) {
	case S_IFREG:
		inode->i_op = &ovl_file_inode_operations;
		break;

	case S_IFDIR:
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	default:
		inode->i_op = &ovl_file_inode_operations;
		init_special_inode(inode, mode, rdev);
		break;
	}
}
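
A worked example of the xino packing above (illustrative numbers; assumes a 64-bit unsigned long, as the kernel code does): with xinobits == 2 the top 2 bits of i_ino carry the fsid, and a real inode number is only encoded if it fits in the remaining 62 bits.

#include <stdio.h>

int main(void)
{
	int xinobits = 2;		/* illustrative value */
	unsigned long ino = 100;	/* real inode number */
	int fsid = 1;			/* second layer */

	/* Same check as in ovl_fill_inode(): ino must fit below the fsid bits. */
	if (fsid && !(ino >> (64 - xinobits)))
		ino |= (unsigned long)fsid << (64 - xinobits);

	printf("%#lx\n", ino);		/* prints 0x4000000000000064 */
	return 0;
}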

/*
 * With inodes index enabled, an overlay inode nlink counts the union of upper
 * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
 * upper inode, the following nlink modifying operations can happen:
 *
 * 1. Lower hardlink copy up
 * 2. Upper hardlink created, unlinked or renamed over
 * 3. Lower hardlink whiteout or renamed over
 *
 * For the first, copy up case, the union nlink does not change, whether the
 * operation succeeds or fails, but the upper inode nlink may change.
 * Therefore, before copy up, we store the union nlink value relative to the
 * lower inode nlink in the index inode xattr trusted.overlay.nlink.
 *
 * For the second, upper hardlink case, the union nlink should be incremented
 * or decremented IFF the operation succeeds, aligned with nlink change of the
 * upper inode. Therefore, before link/unlink/rename, we store the union nlink
 * value relative to the upper inode nlink in the index inode.
 *
 * For the last, lower cover up case, we simplify things by preceding the
 * whiteout or cover up with copy up. This makes sure that there is an index
 * upper inode where the nlink xattr can be stored before the copied up upper
 * entry is unlinked.
 */
#define OVL_NLINK_ADD_UPPER	(1 << 0)

/*
 * On-disk format for indexed nlink:
 *
 * nlink relative to the upper inode - "U[+-]NUM"
 * nlink relative to the lower inode - "L[+-]NUM"
 */

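A worked example of this encoding (hypothetical numbers): a lower inode with i_nlink == 3 has one alias copied up, so the upper inode starts with i_nlink == 1 while the union nlink is still 3. Before an upper hardlink is created, the difference is stored relative to the upper inode; after the link succeeds, decoding yields the new union nlink:

#include <stdio.h>

int main(void)
{
	unsigned int union_nlink = 3;	/* hypothetical union nlink */
	unsigned int upper_nlink = 1;	/* upper inode nlink after copy up */
	char buf[13];
	int diff;

	/* Encode, as ovl_set_nlink_upper() does: "U" + signed difference. */
	snprintf(buf, sizeof(buf), "U%+i", (int)(union_nlink - upper_nlink));
	printf("%s\n", buf);			/* prints "U+2" */

	/* Decode after the upper link succeeded (upper nlink is now 2),
	 * mirroring ovl_get_nlink(): base nlink + stored difference. */
	upper_nlink = 2;
	sscanf(buf + 1, "%d", &diff);
	printf("%u\n", upper_nlink + diff);	/* prints "4", the new union */
	return 0;
}
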
static int ovl_set_nlink_common(struct dentry *dentry,
				struct dentry *realdentry, const char *format)
{
	struct inode *inode = d_inode(dentry);
	struct inode *realinode = d_inode(realdentry);
	char buf[13];	/* prefix + sign + up to 10 digits + NUL */
	int len;

	len = snprintf(buf, sizeof(buf), format,
		       (int) (inode->i_nlink - realinode->i_nlink));

	if (WARN_ON(len >= sizeof(buf)))
		return -EIO;

	return ovl_do_setxattr(ovl_dentry_upper(dentry),
			       OVL_XATTR_NLINK, buf, len, 0);
}

int ovl_set_nlink_upper(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
}

int ovl_set_nlink_lower(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}

unsigned int ovl_get_nlink(struct dentry *lowerdentry,
			   struct dentry *upperdentry,
			   unsigned int fallback)
{
	int nlink_diff;
	int nlink;
	char buf[13];
	int err;

	if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
		return fallback;

	err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
	if (err < 0)
		goto fail;

	buf[err] = '\0';
	if ((buf[0] != 'L' && buf[0] != 'U') ||
	    (buf[1] != '+' && buf[1] != '-'))
		goto fail;

	err = kstrtoint(buf + 1, 10, &nlink_diff);
	if (err < 0)
		goto fail;

	nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
	nlink += nlink_diff;

	if (nlink <= 0)
		goto fail;

	return nlink;

fail:
	pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
			    upperdentry, err);
	return fallback;
}
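
The union accounting above is visible from userspace. A minimal sketch, assuming /merged is an overlay mounted with index=on whose lower layer contains hardlinks a and b of the same inode (hypothetical paths):

#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	struct stat st;

	/* Unlinking one alias copies up the other and records the nlink
	 * xattr, so the surviving link reports the union count. */
	unlink("/merged/b");
	if (stat("/merged/a", &st) == 0)
		/* Expected: 1, even though the lower inode underneath
		 * still has i_nlink == 2. */
		printf("nlink=%ju\n", (uintmax_t)st.st_nlink);
	return 0;
}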

struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode)
		ovl_fill_inode(inode, mode, rdev, 0, 0);

	return inode;
}

static int ovl_inode_test(struct inode *inode, void *data)
{
	return inode->i_private == data;
}

static int ovl_inode_set(struct inode *inode, void *data)
{
	inode->i_private = data;
	return 0;
}

static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
			     struct dentry *upperdentry, bool strict)
{
	/*
	 * For directories, @strict verify from lookup path performs consistency
	 * checks, so NULL lower/upper in dentry must match NULL lower/upper in
	 * inode. Non @strict verify from NFS handle decode path passes NULL for
	 * 'unknown' lower/upper.
	 */
	if (S_ISDIR(inode->i_mode) && strict) {
		/* Real lower dir moved to upper layer under us? */
		if (!lowerdentry && ovl_inode_lower(inode))
			return false;

		/* Lookup of an uncovered redirect origin? */
		if (!upperdentry && ovl_inode_upper(inode))
			return false;
	}

	/*
	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
	 * This happens when finding a copied up overlay inode for a renamed
	 * or hardlinked overlay dentry and lower dentry cannot be followed
	 * by origin because lower fs does not support file handles.
	 */
	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
		return false;

	/*
	 * Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
	 * This happens when finding a lower alias for a copied up hard link.
	 */
	if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
		return false;

	return true;
}

struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
			       bool is_upper)
{
	struct inode *inode, *key = d_inode(real);

	inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
	if (!inode)
		return NULL;

	if (!ovl_verify_inode(inode, is_upper ? NULL : real,
			      is_upper ? real : NULL, false)) {
		iput(inode);
		return ERR_PTR(-ESTALE);
	}

	return inode;
}

/*
 * Does overlay inode need to be hashed by lower inode?
 */
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
			     struct dentry *lower, struct dentry *index)
{
	struct ovl_fs *ofs = sb->s_fs_info;

	/* No, if pure upper */
	if (!lower)
		return false;

	/* Yes, if already indexed */
	if (index)
		return true;

	/* Yes, if won't be copied up */
	if (!ofs->upper_mnt)
		return true;

	/* No, if lower hardlink is or will be broken on copy up */
	if ((upper || !ovl_indexdir(sb)) &&
	    !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
		return false;

	/* No, if non-indexed upper with NFS export */
	if (sb->s_export_op && upper)
		return false;

	/* Otherwise, hash by lower inode for fsnotify */
	return true;
}
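
A few illustrative outcomes of this decision (hypothetical arguments, not from the source):

/*
 *   upper  lower  index  fs state            result
 *   -----  -----  -----  ------------------  ------
 *   set    NULL   NULL   -                   false  (pure upper)
 *   any    set    set    -                   true   (already indexed)
 *   NULL   set    NULL   no upper_mnt        true   (never copied up)
 *   set    set    NULL   indexdir off,       false  (hardlink is or will be
 *                        lower nlink > 1             broken on copy up)
 */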

/*
 * On v4.18+ a preallocated inode (newinode) can be inserted under the key
 * with inode_insert5(); older kernels always allocate inside iget5_locked().
 * Note that the #if splices both cases into a single return statement.
 */
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
			       struct inode *key)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0)
	return newinode ? inode_insert5(newinode, (unsigned long) key,
					ovl_inode_test, ovl_inode_set, key) :
#else
	return
#endif
		iget5_locked(sb, (unsigned long) key,
			     ovl_inode_test, ovl_inode_set, key);
}

struct inode *ovl_get_inode(struct super_block *sb,
			    struct ovl_inode_params *oip)
{
	struct dentry *upperdentry = oip->upperdentry;
	struct ovl_path *lowerpath = oip->lowerpath;
	struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
	struct inode *inode;
	struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
	bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
					oip->index);
	int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
	bool is_dir;
	unsigned long ino = 0;

	if (!realinode)
		realinode = d_inode(lowerdentry);

	/*
	 * Copy up origin (lower) may exist for non-indexed upper, but we must
	 * not use lower as hash key if this is a broken hardlink.
	 */
	is_dir = S_ISDIR(realinode->i_mode);
	if (upperdentry || bylower) {
		struct inode *key = d_inode(bylower ? lowerdentry :
					    upperdentry);
		unsigned int nlink = is_dir ? 1 : realinode->i_nlink;

		inode = ovl_iget5(sb, oip->newinode, key);
		if (!inode)
			goto out_nomem;
		if (!(inode->i_state & I_NEW)) {
			/*
			 * Verify that the underlying files stored in the inode
			 * match those in the dentry.
			 */
			if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
					      true)) {
				iput(inode);
				inode = ERR_PTR(-ESTALE);
				goto out;
			}

			dput(upperdentry);
			goto out;
		}

		/* Recalculate nlink for non-dir due to indexing */
		if (!is_dir)
			nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
		set_nlink(inode, nlink);
		ino = key->i_ino;
	} else {
		/* Lower hardlink that will be broken on copy up */
		inode = new_inode(sb);
		if (!inode)
			goto out_nomem;
	}
	ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
	ovl_inode_init(inode, upperdentry, lowerdentry);

	if (upperdentry && ovl_is_impuredir(upperdentry))
		ovl_set_flag(OVL_IMPURE, inode);

	if (oip->index)
		ovl_set_flag(OVL_INDEX, inode);

	/* Check for non-merge dir that may have whiteouts */
	if (is_dir) {
		if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
		    ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
			ovl_set_flag(OVL_WHITEOUTS, inode);
		}
	}

	if (inode->i_state & I_NEW)
		unlock_new_inode(inode);
out:
	return inode;

out_nomem:
	inode = ERR_PTR(-ENOMEM);
	goto out;
}