Compare commits
336 Commits
fj_test_20
...
1.7.0-0.4
| Author | SHA1 | Date | |
|---|---|---|---|
| 1cfc5ca71f | |||
| 7ee533d620 | |||
| 28334c7a29 | |||
| 697e9386b3 | |||
| 0e787b731e | |||
| 612f364e6a | |||
| ceee4c379f | |||
| 36c981bc34 | |||
| fd941dad44 | |||
| 5f5b9f94d1 | |||
| 3f3c4acd71 | |||
| 00007dafaa | |||
| cbe2b2149d | |||
| 4cecde3fba | |||
| 8022a2a8c0 | |||
| 3328ce03d9 | |||
| 97b107f61c | |||
| 6f3be17c19 | |||
| dea7d00545 | |||
| 4512778569 | |||
| a7adb266ff | |||
| 2566f4f213 | |||
| ac0081eddd | |||
| d4056acfc3 | |||
| 1910543380 | |||
| 6332903f0d | |||
| 29d27b7c8d | |||
| 7136384384 | |||
| 2fe5c8de2e | |||
| e774e1b984 | |||
| 33b7414615 | |||
| 3c646e2485 | |||
| a5fcc91656 | |||
| d370e9241f | |||
| 3e254c06bf | |||
| 07537cd2e7 | |||
| a37f72da0e | |||
| ab11b168f0 | |||
| eac414d6d8 | |||
| bb725f5f50 | |||
| 5224551782 | |||
| 91146acfe5 | |||
| f64731ab34 | |||
| cd46cbd4b3 | |||
| 39780917af | |||
| 0f8f6d298e | |||
| f8e8b21f04 | |||
| 5c2f9b8239 | |||
| 1afc3d9b70 | |||
| 17a8f68d60 | |||
| 2b9a053504 | |||
| 6441aa1abb | |||
| 9b55b68934 | |||
| 83ef96a739 | |||
| b5337358cf | |||
| 2db3717e57 | |||
| 5395891966 | |||
| c32a5e261b | |||
| c0c80b71ca | |||
| d15a396d5a | |||
| e35ec09da1 | |||
| 5e44c9c9f9 | |||
| 0f6c36870c | |||
| 2ec2112cc5 | |||
| c86a38e18f | |||
| 6aa7b50e26 | |||
| c3c57940ba | |||
| 7aa2d64294 | |||
| 51fe77cdae | |||
| d5aafca1ae | |||
| 54b529c82d | |||
| 232bc9c44b | |||
| f34373d1c0 | |||
| 4698ae166c | |||
| db9ca358f9 | |||
| 16a6a1d08b | |||
| 2e2e973d78 | |||
| c3c0b7197f | |||
| d086100b35 | |||
| 8f74888f87 | |||
| 8e42c2a254 | |||
| caf0f5ef63 | |||
| 3d030391e8 | |||
| 0aeab6b840 | |||
| 367bbda713 | |||
| 0082447043 | |||
| 4f50c90f6e | |||
| 79950e045e | |||
| 6cf7cebb2d | |||
| c9f05f238d | |||
| f1caaa9b74 | |||
| 97cd379ee2 | |||
| 8ee1d61d0f | |||
| 04d17dd3e9 | |||
| 33eef71133 | |||
| c10b4a1c16 | |||
| 8cf70900e7 | |||
| b2618a98f5 | |||
| 01d06cb218 | |||
| c78803ac08 | |||
| 3300e65efc | |||
| d82ac31bc6 | |||
| 4946fbdd82 | |||
| 33cba1ad48 | |||
| 7c69cfaf67 | |||
| b3cbdeec84 | |||
| 1d1ec39a27 | |||
| 0a4e6b49b4 | |||
| bb7e140655 | |||
| 32b32f0c4a | |||
| bf7fd81c1b | |||
| 92d191de9e | |||
| baf68f7e71 | |||
| 26bebb2749 | |||
| 9e2196c9ce | |||
| 93581cb142 | |||
| 67f5a1d4e0 | |||
| edf7b36669 | |||
| 1a204b6674 | |||
| 305511b48f | |||
| 606db376fd | |||
| 5719b4c64a | |||
| 343121c3d0 | |||
| 86c45484e3 | |||
| 767792808a | |||
| 117f070fd6 | |||
| a27909be88 | |||
| cec6f24559 | |||
| b3b8283f87 | |||
| d62f80a7c0 | |||
| 6d584feaef | |||
| e2e015e120 | |||
| 5fb3abe87b | |||
| 37fd9e0cd2 | |||
| 7e748b4ecb | |||
| cafb46efc7 | |||
| 41ea9d16c4 | |||
| 4bbdee395e | |||
| 597baf8445 | |||
| 55faba77a5 | |||
| 6bef773741 | |||
| 7882110e9f | |||
| d1df17ffb7 | |||
| 72af689e69 | |||
| 153d0609de | |||
| 83bbb87a0f | |||
| f00d03445c | |||
| 911b07f507 | |||
| 5b26fe2956 | |||
| 1db00ebc04 | |||
| d5de68e97b | |||
| 1526237bc6 | |||
| b8d96a74ce | |||
| 3c256e1a6c | |||
| 7fc4272b89 | |||
| d052acab1d | |||
| 91ea69cf8f | |||
| 0c63a2a3cd | |||
| a8696d811d | |||
| 569dc33a9c | |||
| 4b252a990f | |||
| adb6cce3ce | |||
| ed21b6849d | |||
| 37605740a4 | |||
| e069694c12 | |||
| dca1cb2625 | |||
| caac060684 | |||
| d330721421 | |||
| 157eeca41a | |||
| 8ba725b225 | |||
| a563d780c1 | |||
| 621533bbd3 | |||
| 37ea770f8c | |||
| edd3ea0103 | |||
| 41d37bcd30 | |||
| 309145587f | |||
| bc06d68d84 | |||
| 18412616e1 | |||
| c371fbf13b | |||
| 1492f16d67 | |||
| fd38ab6fd0 | |||
| f115bae8a7 | |||
| ba80dd8650 | |||
| 06960a41d9 | |||
| 86a2aabb24 | |||
| b4101d9c36 | |||
| ec31d72483 | |||
| 83ade5cdcd | |||
| dec133c1dd | |||
| 04a528ab27 | |||
| 8e4073c2ca | |||
| ff982b8594 | |||
| 299d47abf5 | |||
| f2460695c4 | |||
| 6ce5c754f3 | |||
| e932f2e70c | |||
| bb08742467 | |||
| 3e9fdfc0f1 | |||
| 58f4593478 | |||
| de0e07f29e | |||
| a4b83dc6d4 | |||
| beac6c3e80 | |||
| 5d6715078f | |||
| 0615a0b00b | |||
| 51cd7cbb6c | |||
| 0c1cae45fe | |||
| 11ef2f8092 | |||
| 12aef0b578 | |||
| 9b3450ee7e | |||
| 0d3ef65092 | |||
| 258156b57e | |||
| 8efced7bf7 | |||
| 2dd8687974 | |||
| f0bc1a6b07 | |||
| c52370b959 | |||
| 9c78d4d249 | |||
| b6285c9aa9 | |||
| b945367c90 | |||
| 0f434288e1 | |||
| b5cd813229 | |||
| 7268942c35 | |||
| f8cad24a9a | |||
| 2b6b3f31e5 | |||
| ca19ee434a | |||
| bb2589bac4 | |||
| e1c6e17400 | |||
| 207eba93ea | |||
| 06af2d62c6 | |||
| 3e267e24cb | |||
| e58e1c6e33 | |||
| fb924ebb9d | |||
| ac61577414 | |||
| 4cee9b1a27 | |||
| b55e164669 | |||
| aa66fe2cb1 | |||
| 3b74b0a093 | |||
| 0267a0c8ea | |||
| b3b7801d51 | |||
| 10f1fe76db | |||
| 089b443aaf | |||
| e9955a4bba | |||
| dc52c8a11a | |||
| bc4629dfb0 | |||
| 99fba2df1c | |||
| 239c95449b | |||
| 9dfc139eae | |||
| bc81d362b4 | |||
| 90b6aec53d | |||
| 0887e0de6d | |||
| 2c5c47344d | |||
| b9f223ceca | |||
| 6297181dcd | |||
| 80f964e44f | |||
| cc07d6e017 | |||
| 07c517828d | |||
| 75e42badf4 | |||
| bdccbf7356 | |||
| ad3ee26d36 | |||
| 16f8ccb35b | |||
| 3fda54ece8 | |||
| 4d252c2bb2 | |||
| 0cf89c5682 | |||
| 0d902872a1 | |||
| 9b6a88eeeb | |||
| 96b4729cd5 | |||
| 3372bbfd23 | |||
| f17c30da07 | |||
| 9a0eb915fb | |||
| a5ded1fc06 | |||
| de042b2cb2 | |||
| 2cee82673b | |||
| dfb3bef96d | |||
| 2dc51530f3 | |||
| 13758417c5 | |||
| c32edff2bb | |||
| 8356ef6c96 | |||
| 63d500515a | |||
| 791e8c2114 | |||
| 0bb612caea | |||
| 5e992bc195 | |||
| 08f817a654 | |||
| b87ac8b8c0 | |||
| a48a2cd3e8 | |||
| 7c238c27c9 | |||
| de77d2b061 | |||
| 52f89cf8fa | |||
| c96dfb0c68 | |||
| 21c9e57646 | |||
| 312b6c171b | |||
| 2ce695b47b | |||
| e5c1fdf129 | |||
| 9e3dd53c58 | |||
| fe53c6e0a5 | |||
| e988bfaf50 | |||
| f6f48b1210 | |||
| 70b42fde5d | |||
| ccb36a5849 | |||
| ea7f517e3d | |||
| ac18a24a27 | |||
| 8880710fad | |||
| 03a85825ed | |||
| 940eeca6f5 | |||
| 19b02cf4ed | |||
| 76a0cc71fc | |||
| ab39798181 | |||
| 0cc3496747 | |||
| 10cca81401 | |||
| 0c79de67b4 | |||
| 3fbad79afb | |||
| 1b76aaa7e1 | |||
| aa3c5e91db | |||
| 20d5900c35 | |||
| 414cffd95b | |||
| 9ec0aeeab5 | |||
| 06e96005a6 | |||
| 4606714c07 | |||
| a5d5baf8a8 | |||
| 8074445d59 | |||
| 6a456f11aa | |||
| 81e665cb48 | |||
| e0b9c5deec | |||
| 62772c8a24 | |||
| 63d15f7dfc | |||
| fb3f1c58a8 | |||
| 69846345de | |||
| b8155cc618 | |||
| f07e20a381 | |||
| 764948b51f | |||
| 7da5fede8b | |||
| 6810506c3d | |||
| c82c2c1231 | |||
| 5bc54a3bbe | |||
| 07aa96ef95 | |||
| dac99f708c | |||
| f3c9fbf4ea | |||
| 54122360e8 |
9
.gitignore
vendored
9
.gitignore
vendored
@ -13,6 +13,10 @@ old_timestamp
|
||||
CMakeFiles
|
||||
CMakeCache.txt
|
||||
Makefile
|
||||
!test/*/*/Makefile
|
||||
!test/signalonfork+wait/Makefile
|
||||
!test/perf_overflow/Makefile
|
||||
!test/*/*/*.cmd
|
||||
Kbuild
|
||||
cmake_install.cmake
|
||||
config.h
|
||||
@ -33,3 +37,8 @@ executer/user/libmcexec.a
|
||||
executer/user/libldump2mcdump.so
|
||||
executer/user/eclair
|
||||
tools/mcstat/mcstat
|
||||
/_CPack_Packages
|
||||
/CPackSourceConfig.cmake
|
||||
CPackConfig.cmake
|
||||
/build
|
||||
mckernel-*.tar.gz
|
||||
|
||||
3
.gitmodules
vendored
3
.gitmodules
vendored
@ -1,3 +1,6 @@
|
||||
[submodule "ihk"]
|
||||
path = ihk
|
||||
url = https://github.com/RIKEN-SysSoft/ihk.git
|
||||
[submodule "executer/user/lib/libdwarf/libdwarf"]
|
||||
path = executer/user/lib/libdwarf/libdwarf
|
||||
url = https://github.com/bgerofi/libdwarf.git
|
||||
|
||||
182
CMakeLists.txt
182
CMakeLists.txt
@ -7,71 +7,157 @@ endif (NOT CMAKE_BUILD_TYPE)
|
||||
enable_language(C ASM)
|
||||
|
||||
project(mckernel C ASM)
|
||||
set(MCKERNEL_VERSION "1.6.0")
|
||||
set(MCKERNEL_VERSION "1.7.0")
|
||||
|
||||
# See "Fedora Packaging Guidlines -- Versioning"
|
||||
set(MCKERNEL_RELEASE "0.4")
|
||||
|
||||
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
|
||||
# for rpmbuild
|
||||
if(DEFINED SYSCONF_INSTALL_DIR)
|
||||
set(CMAKE_INSTALL_SYSCONFDIR "${SYSCONF_INSTALL_DIR}")
|
||||
endif()
|
||||
include(GNUInstallDirs)
|
||||
include(CMakeParseArguments)
|
||||
include(Kbuild)
|
||||
include(Ksym)
|
||||
include(CheckCCompilerFlag)
|
||||
|
||||
set(CFLAGS_WARNINGS "-Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-unused-function")
|
||||
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
||||
if(IMPLICIT_FALLTHROUGH)
|
||||
set(CFLAGS_WARNINGS "${CFLAGS_WARNINGS} -Wno-implicit-fallthrough")
|
||||
endif(IMPLICIT_FALLTHROUGH)
|
||||
|
||||
# C flags need to be set before enabling language?
|
||||
set(CMAKE_C_FLAGS_DEBUG "-g ${CFLAGS_WARNINGS}" CACHE STRING "Debug compiler flags")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CFLAGS_WARNINGS}" CACHE STRING "Release compiler flags")
|
||||
|
||||
# build options
|
||||
option(ENABLE_WERROR "Enable -Werror" OFF)
|
||||
if (ENABLE_WERROR)
|
||||
add_compile_options("-Werror")
|
||||
endif(ENABLE_WERROR)
|
||||
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
|
||||
set(BUILD_TARGET "smp-x86" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
set(BUILD_TARGET "smp-arm64" CACHE STRING "Build target: smp-x86 | smp-arm64")
|
||||
endif()
|
||||
|
||||
if (BUILD_TARGET STREQUAL "smp-x86")
|
||||
set(ARCH "x86_64")
|
||||
elseif (BUILD_TARGET STREQUAL "smp-arm64")
|
||||
set(ARCH "arm64")
|
||||
endif()
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakeParseArguments)
|
||||
include(Kbuild)
|
||||
include(CheckCCompilerFlag)
|
||||
include(AutoconfHelper)
|
||||
|
||||
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
|
||||
if(IMPLICIT_FALLTHROUGH)
|
||||
set(EXTRA_WARNINGS "-Wno-implicit-fallthrough")
|
||||
endif(IMPLICIT_FALLTHROUGH)
|
||||
|
||||
# build options
|
||||
set(CFLAGS_WARNING "-Wall" "-Wextra" "-Wno-unused-parameter" "-Wno-sign-compare" "-Wno-unused-function" ${EXTRA_WARNINGS} CACHE STRING "Warning flags")
|
||||
add_compile_options(${CFLAGS_WARNING})
|
||||
|
||||
option(ENABLE_WERROR "Enable -Werror" OFF)
|
||||
if (ENABLE_WERROR)
|
||||
add_compile_options("-Werror")
|
||||
endif(ENABLE_WERROR)
|
||||
|
||||
option(ENABLE_LINUX_WORK_IRQ_FOR_IKC "Use Linux work IRQ for IKC IPI" ON)
|
||||
if (ENABLE_LINUX_WORK_IRQ_FOR_IKC)
|
||||
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DIHK_IKC_USE_LINUX_WORK_IRQ")
|
||||
add_definitions(-DIHK_IKC_USE_LINUX_WORK_IRQ)
|
||||
endif()
|
||||
|
||||
if (BUILD_TARGET STREQUAL "smp-arm64")
|
||||
foreach(i RANGE 1 120)
|
||||
add_definitions(-DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i})
|
||||
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i}")
|
||||
endforeach()
|
||||
add_definitions(-DCONFIG_ARM64_64K_PAGES -DCONFIG_ARM64_VA_BITS=48)
|
||||
|
||||
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_64K_PAGES\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||
OUTPUT_VARIABLE CONFIG_ARM64_64K_PAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_VA_BITS\" { print $2; exit; }" "${KERNEL_DIR}/.config"
|
||||
OUTPUT_VARIABLE CONFIG_ARM64_VA_BITS OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
message("Host kernel CONFIG_ARM64_64K_PAGES=${CONFIG_ARM64_64K_PAGES}")
|
||||
message("Host kernel CONFIG_ARM64_VA_BITS=${CONFIG_ARM64_VA_BITS}")
|
||||
|
||||
if(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
if(CONFIG_ARM64_VA_BITS STREQUAL 42)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=2 -DCONFIG_ARM64_VA_BITS=42 -DCONFIG_ARM64_64K_PAGES)
|
||||
set(LINKER_SCRIPT "smp-arm64_type3.lds")
|
||||
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=48 -DCONFIG_ARM64_64K_PAGES)
|
||||
set(LINKER_SCRIPT "smp-arm64_type4.lds")
|
||||
endif()
|
||||
else(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
if(CONFIG_ARM64_VA_BITS STREQUAL 39)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=39)
|
||||
set(LINKER_SCRIPT "smp-arm64_type1.lds")
|
||||
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
|
||||
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=4 -DCONFIG_ARM64_VA_BITS=48)
|
||||
set(LINKER_SCRIPT "smp-arm64_type2.lds")
|
||||
endif()
|
||||
endif(CONFIG_ARM64_64K_PAGES STREQUAL "y")
|
||||
endif()
|
||||
set_property(CACHE BUILD_TARGET PROPERTY STRINGS smp-x86 smp-arm64)
|
||||
|
||||
# define MAP_KERNEL_START
|
||||
|
||||
set(tmpdir ${CMAKE_CURRENT_BINARY_DIR}/tmp.resolve_MODULES_END)
|
||||
file(REMOVE_RECURSE ${tmpdir})
|
||||
file(MAKE_DIRECTORY ${tmpdir})
|
||||
file(WRITE ${tmpdir}/driver.c "#include <linux/module.h>\n")
|
||||
file(APPEND ${tmpdir}/driver.c "unsigned long MAP_KERNEL_START = MODULES_END - (1UL << 23);\n")
|
||||
file(WRITE ${tmpdir}/Makefile "obj-m := driver.o\n")
|
||||
file(APPEND ${tmpdir}/Makefile "all:\n")
|
||||
file(APPEND ${tmpdir}/Makefile "\tmake ${KBUILD_MAKE_FLAGS_STR} -C ${KERNEL_DIR} M=${tmpdir} modules\n")
|
||||
|
||||
execute_process(COMMAND make -C ${tmpdir})
|
||||
execute_process(COMMAND bash -c "offset=`readelf -S ${tmpdir}/driver.ko | grep .data | sed 's/.* //g'`; echo $((0x$offset))"
|
||||
OUTPUT_VARIABLE MAP_KERNEL_START_OFFSET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
execute_process(COMMAND bash -c "dd if=${tmpdir}/driver.ko bs=1 skip=${MAP_KERNEL_START_OFFSET} count=8 2>/dev/null | od -tx8 -Ax | head -1 | sed 's|.* |0x|g'"
|
||||
OUTPUT_VARIABLE MAP_KERNEL_START OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
|
||||
set(ENABLE_MEMDUMP ON)
|
||||
option(ENABLE_PERF "Enable perf support" ON)
|
||||
option(ENABLE_RUSAGE "Enable rusage support" ON)
|
||||
option(ENABLE_MCOVERLAYFS "Enable overlay filesystem" OFF)
|
||||
option(ENABLE_QLMPI "Enable qlmpi programs" OFF)
|
||||
option(ENABLE_UTI "Enable uti support" OFF)
|
||||
option(ENABLE_UBSAN "Enable undefined behaviour sanitizer on mckernel size" OFF)
|
||||
option(ENABLE_PER_CPU_ALLOC_CACHE "Enable per-CPU allocator cache (ThunderX2 workaround)" OFF)
|
||||
|
||||
find_package(PkgConfig REQUIRED)
|
||||
set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)
|
||||
|
||||
find_library(LIBRT rt)
|
||||
if (NOT LIBRT)
|
||||
message(FATAL_ERROR "error: couldn't find librt")
|
||||
endif()
|
||||
find_library(LIBNUMA numa)
|
||||
if (NOT LIBNUMA)
|
||||
message(FATAL_ERROR "error: couldn't find libnuma")
|
||||
endif()
|
||||
find_library(LIBBFD bfd)
|
||||
if (NOT LIBBFD)
|
||||
message(FATAL_ERROR "error: couldn't find libbfd")
|
||||
endif()
|
||||
find_library(LIBIBERTY iberty)
|
||||
if (NOT LIBIBERTY)
|
||||
message(FATAL_ERROR "error: couldn't find libiberty")
|
||||
endif()
|
||||
|
||||
find_library(LIBDWARF dwarf)
|
||||
|
||||
if (NOT LIBDWARF)
|
||||
if (CMAKE_CROSSCOMPILING)
|
||||
message(FATAL_ERROR "Could not find libdwarf.so, install libdwarf-devel to ${CMAKE_FIND_ROOT_PATH}")
|
||||
endif()
|
||||
message("WARNING: libdwarf will be compiled locally")
|
||||
enable_language(CXX)
|
||||
else()
|
||||
# Note that libdwarf-devel provides /usr/include/libdwarf/dwarf.h
|
||||
# but elfutils-devel provides /usr/include/dwarf.h
|
||||
# while mcinspect.c performs "#include <dwarf.h>"
|
||||
find_path(DWARF_H dwarf.h PATH_SUFFIXES libdwarf)
|
||||
endif()
|
||||
|
||||
if (ENABLE_QLMPI)
|
||||
find_package(MPI REQUIRED)
|
||||
endif()
|
||||
|
||||
if (ENABLE_UTI)
|
||||
find_library(LIBSYSCALL_INTERCEPT syscall_intercept)
|
||||
pkg_check_modules(LIBSYSCALL_INTERCEPT REQUIRED libsyscall_intercept)
|
||||
link_directories(${LIBSYSCALL_INTERCEPT_LIBRARY_DIRS})
|
||||
endif()
|
||||
|
||||
string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(-([0-9]+)(.*))?" "\\1;\\2;\\3;\\5;\\6" LINUX_VERSION ${UNAME_R})
|
||||
@ -81,29 +167,11 @@ list(GET LINUX_VERSION 2 LINUX_VERSION_PATCH)
|
||||
list(GET LINUX_VERSION 3 LINUX_VERSION_RELEASE)
|
||||
math(EXPR LINUX_VERSION_CODE "${LINUX_VERSION_MAJOR} * 65536 + ${LINUX_VERSION_MINOR} * 256 + ${LINUX_VERSION_PATCH}")
|
||||
|
||||
ksym(sys_mount PREFIX MCCTRL_)
|
||||
ksym(sys_umount PREFIX MCCTRL_)
|
||||
ksym(sys_unshare PREFIX MCCTRL_)
|
||||
ksym(zap_page_range PREFIX MCCTRL_)
|
||||
ksym(vdso_image_64 PREFIX MCCTRL_)
|
||||
ksym(vdso_start PREFIX MCCTRL_)
|
||||
ksym(vdso_end PREFIX MCCTRL_)
|
||||
ksym(vdso_pages PREFIX MCCTRL_)
|
||||
ksym(__vvar_page PREFIX MCCTRL_)
|
||||
ksym(hpet_address PREFIX MCCTRL_)
|
||||
# POSTK_DEBUG_ARCH_DEP_50, add:find kernel symbol.
|
||||
ksym(vdso_spec PREFIX MCCTRL_)
|
||||
ksym(hv_clock PREFIX MCCTRL_)
|
||||
ksym(sys_readlink PREFIX MCCTRL_)
|
||||
ksym(walk_page_range PREFIX MCCTRL_)
|
||||
|
||||
|
||||
# compat with various install paths
|
||||
set(MCKERNEL_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR})
|
||||
set(BINDIR ${CMAKE_INSTALL_FULL_BINDIR})
|
||||
set(SBINDIR ${CMAKE_INSTALL_FULL_SBINDIR})
|
||||
set(ETCDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR})
|
||||
set(ROOTFSDIR "${CMAKE_INSTALL_PREFIX}/rootfs")
|
||||
set(ETCDIR ${CMAKE_INSTALL_PREFIX}/etc)
|
||||
set(ROOTFSDIR "/rootfs")
|
||||
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
|
||||
set(KMODDIR "/lib/modules/${UNAME_R}/extra/mckernel")
|
||||
set(MCKERNELDIR "${CMAKE_INSTALL_FULL_DATADIR}/mckernel/${BUILD_TARGET}")
|
||||
@ -138,23 +206,23 @@ configure_file(config.h.in config.h)
|
||||
|
||||
# actual build section - just subdirs
|
||||
add_subdirectory(executer/kernel/mcctrl)
|
||||
if (ENABLE_MCOVERLAYFS)
|
||||
add_subdirectory(executer/kernel/mcoverlayfs)
|
||||
endif()
|
||||
add_subdirectory(executer/user)
|
||||
add_subdirectory(kernel)
|
||||
add_subdirectory(tools/mcstat)
|
||||
add_subdirectory(tools/crash)
|
||||
|
||||
configure_file(arch/x86_64/tools/mcreboot-smp-x86.sh.in mcreboot.sh @ONLY)
|
||||
configure_file(arch/x86_64/tools/mcstop+release-smp-x86.sh.in mcstop+release.sh @ONLY)
|
||||
configure_file(arch/x86_64/tools/mcreboot.1in mcreboot.1 @ONLY)
|
||||
configure_file(scripts/mcreboot-smp.sh.in mcreboot.sh @ONLY)
|
||||
configure_file(scripts/mcstop+release-smp.sh.in mcstop+release.sh @ONLY)
|
||||
configure_file(scripts/mcreboot.1in mcreboot.1 @ONLY)
|
||||
configure_file(scripts/eclair-dump-backtrace.exp.in eclair-dump-backtrace.exp @ONLY)
|
||||
install(PROGRAMS
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/mcreboot.sh"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/mcstop+release.sh"
|
||||
DESTINATION "${CMAKE_INSTALL_SBINDIR}")
|
||||
install(FILES
|
||||
"arch/x86_64/tools/irqbalance_mck.service"
|
||||
"arch/x86_64/tools/irqbalance_mck.in"
|
||||
install(PROGRAMS
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/eclair-dump-backtrace.exp"
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}")
|
||||
install(FILES "scripts/irqbalance_mck.in"
|
||||
DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}")
|
||||
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
||||
DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
|
||||
@ -162,7 +230,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
|
||||
|
||||
configure_file(scripts/mckernel.spec.in scripts/mckernel.spec @ONLY)
|
||||
set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MCKERNEL_VERSION}")
|
||||
set(CPACK_SOURCE_IGNORE_FILES "/.git$")
|
||||
set(CPACK_SOURCE_IGNORE_FILES "/.git/;/build;/CMakeCache.txt$;/CMakeFiles$;/Makefile$")
|
||||
set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${IHK_FULL_SOURCE_DIR};/ihk;${CMAKE_BINARY_DIR}/scripts;/scripts")
|
||||
set(CPACK_SOURCE_GENERATOR "TGZ")
|
||||
include(CPack)
|
||||
@ -181,12 +249,14 @@ message("KERNEL_DIR: ${KERNEL_DIR}")
|
||||
message("SYSTEM_MAP: ${SYSTEM_MAP}")
|
||||
message("VMLINUX: ${VMLINUX}")
|
||||
message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
|
||||
message("MAP_KERNEL_START: ${MAP_KERNEL_START}")
|
||||
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
|
||||
message("ENABLE_PERF: ${ENABLE_PERF}")
|
||||
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
|
||||
message("ENABLE_MCOVERLAYFS: ${ENABLE_MCOVERLAYFS}")
|
||||
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
|
||||
message("ENABLE_UTI: ${ENABLE_UTI}")
|
||||
message("ENABLE_WERROR: ${ENABLE_WERROR}")
|
||||
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
|
||||
message("ENABLE_LINUX_WORK_IRQ_FOR_IKC: ${ENABLE_LINUX_WORK_IRQ_FOR_IKC}")
|
||||
message("ENABLE_PER_CPU_ALLOC_CACHE: ${ENABLE_PER_CPU_ALLOC_CACHE}")
|
||||
message("-------------------------------")
|
||||
|
||||
70
KNOWN_BUGS.md
Normal file
70
KNOWN_BUGS.md
Normal file
@ -0,0 +1,70 @@
|
||||
Linux crash when offlining CPU (el7, hardware-specific)
|
||||
=========================================================
|
||||
|
||||
On some hardware with el7 kernel, linux can crash due to a bug in the
|
||||
irq handling when offlining CPUs (reserve cpu part of mcreboot)
|
||||
|
||||
Example stack trace:
|
||||
```
|
||||
[ 4147.052753] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
|
||||
[ 4147.060677] IP: [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.068226] PGD 1057e44067 PUD 105f1e7067 PMD 0
|
||||
[ 4147.072935] Oops: 0000 [#1] SMP
|
||||
[ 4147.076230] Modules linked in: mcctrl(OE) ihk_smp_x86_64(OE) ihk(OE) xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core
|
||||
[ 4147.148619] dm_mirror dm_region_hash dm_log dm_mod sb_edac edac_core intel_powerclamp coretemp ext4 mbcache jbd2 intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul ipmi_ssif glue_helper ablk_helper joydev iTCO_wdt iTCO_vendor_support cryptd ipmi_si ipmi_devintf ipmi_msghandler pcspkr wmi mei_me mei lpc_ich i2c_i801 sg ioatdma shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm isci igb drm mlx4_core libsas ahci libahci scsi_transport_sas libata crct10dif_pclmul ptp crct10dif_common pps_core crc32c_intel dca i2c_algo_bit i2c_core devlink [last unloaded: ihk]
|
||||
[ 4147.215370] CPU: 6 PID: 38 Comm: migration/6 Tainted: G OE ------------ T 3.10.0-693.2.2.el7.x86_64 #1
|
||||
[ 4147.225672] Hardware name: SGI.COM C1104G-RP5/X9DRG-HF, BIOS 3.0 10/25/2013
|
||||
[ 4147.232747] task: ffff880174689fa0 ti: ffff8801746ac000 task.ti: ffff8801746ac000
|
||||
[ 4147.240278] RIP: 0010:[<ffffffff8102ce26>] [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.250275] RSP: 0018:ffff8801746afd30 EFLAGS: 00010046
|
||||
[ 4147.255608] RAX: 0000000000000000 RBX: 000000000000004e RCX: 0000000000000000
|
||||
[ 4147.262770] RDX: 0000000000000020 RSI: 000000000000005f RDI: 0000000000000023
|
||||
[ 4147.269936] RBP: ffff8801746afd58 R08: 0000000000000001 R09: ffff88017f800490
|
||||
[ 4147.277103] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000006
|
||||
[ 4147.284269] R13: 0000000000000000 R14: ffff88085ca82500 R15: 000000000000005f
|
||||
[ 4147.291429] FS: 0000000000000000(0000) GS:ffff88085fb80000(0000) knlGS:0000000000000000
|
||||
[ 4147.299556] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
|
||||
[ 4147.305326] CR2: 0000000000000040 CR3: 0000001059704000 CR4: 00000000001407e0
|
||||
[ 4147.312490] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
|
||||
[ 4147.319659] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
|
||||
[ 4147.326827] Stack:
|
||||
[ 4147.328857] ffff8808f43078c8 ffff8808f4307850 0000000000000286 ffff8808f4307701
|
||||
[ 4147.336384] 0000000000000000 ffff8801746afd70 ffffffff81052a82 0000000200000000
|
||||
[ 4147.343915] ffff8801746afd88 ffffffff81693ca3 0000000000000003 ffff8801746afdc0
|
||||
[ 4147.351447] Call Trace:
|
||||
[ 4147.353921] [<ffffffff81052a82>] native_cpu_disable+0x12/0x40
|
||||
[ 4147.359795] [<ffffffff81693ca3>] take_cpu_down+0x13/0x40
|
||||
[ 4147.365236] [<ffffffff81116899>] multi_cpu_stop+0xd9/0x100
|
||||
[ 4147.370850] [<ffffffff811167c0>] ? cpu_stop_should_run+0x50/0x50
|
||||
[ 4147.376983] [<ffffffff81116ab7>] cpu_stopper_thread+0x97/0x150
|
||||
[ 4147.382942] [<ffffffff816a8fad>] ? __schedule+0x39d/0x8b0
|
||||
[ 4147.388461] [<ffffffff810b909f>] smpboot_thread_fn+0x12f/0x180
|
||||
[ 4147.394406] [<ffffffff810b8f70>] ? lg_double_unlock+0x40/0x40
|
||||
[ 4147.400276] [<ffffffff810b098f>] kthread+0xcf/0xe0
|
||||
[ 4147.405182] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||
[ 4147.411319] [<ffffffff816b4f58>] ret_from_fork+0x58/0x90
|
||||
[ 4147.418893] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
|
||||
[ 4147.426524] Code: 81 fb 00 01 00 00 0f 84 8a 00 00 00 89 d8 65 44 8b 3c 85 20 c6 00 00 45 85 ff 78 e1 44 89 ff e8 91 31 10 00 48 63 15 7e 10 af 00 <48> 8b 70 40 48 c7 c7 80 71 cf 81 49 89 c6 48 83 c2 3f 48 c1 fa
|
||||
[ 4147.450352] RIP [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
|
||||
[ 4147.460135] RSP <ffff8801746afd30>
|
||||
[ 4147.465154] CR2: 0000000000000040
|
||||
```
|
||||
|
||||
This bug has been fixed upstream, but redhat will not backport the fixes.
|
||||
You can work around the problem with a kpatch by backporting the three
|
||||
following commits:
|
||||
|
||||
x86: irq: Get correct available vectors for cpu disable
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ac2a55395eddccd6e3e39532df9869d61e97b2ee
|
||||
|
||||
x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable()
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1
|
||||
|
||||
x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable()
|
||||
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cbb24dc761d95fe39a7a122bb1b298e9604cae15
|
||||
|
||||
|
||||
Alternatively, since it is related to the irq configuration, it might
|
||||
be possible to mitigate the issue by setting the irq affinities early
|
||||
on and making sure none of the cpus that will be offlined have any irq
|
||||
configured.
|
||||
537
NEWS.md
Normal file
537
NEWS.md
Normal file
@ -0,0 +1,537 @@
|
||||
=============================================
|
||||
What's new in version 1.7.0rc4 (Apr 15, 2020)
|
||||
=============================================
|
||||
|
||||
----------------------
|
||||
McKernel major updates
|
||||
----------------------
|
||||
1. arm64: Contiguous PTE support
|
||||
2. arm64: Scalable Vector Extension (SVE) support
|
||||
3. arm64: PMU overflow interrupt support
|
||||
4. xpmem: Support large page attachment
|
||||
5. arm64 port: Direct access to Mckernel memory from Linux
|
||||
6. arm64 port: utility thread offloading, which spawns thread onto
|
||||
Linux CPU
|
||||
7. eclair: support for live debug
|
||||
8. Crash utility extension
|
||||
9. Replace mcoverlayfs with a soft userspace overlay
|
||||
10. Build system is switched to cmake
|
||||
11. Core dump includes thread information
|
||||
|
||||
------------------------
|
||||
McKernel major bug fixes
|
||||
------------------------
|
||||
1. shmobj: Fix rusage counting for large page
|
||||
2. mcctrl control: task start_time changed to u64 nsec
|
||||
3. mcctrl: add handling for one more level of page tables
|
||||
4. Add kernel argument to turn on/off time sharing
|
||||
5. flatten_string/process env: realign env and clear trailing bits
|
||||
6. madvise: Add MADV_HUGEPAGE support
|
||||
8. mcctrl: remove in-kernel calls to syscalls
|
||||
9. arch_cpu_read_write_register: error return fix.
|
||||
10. set_cputime(): interrupt enable/disable fix.
|
||||
11. set_mempolicy(): Add mode check.
|
||||
12. mbind(): Fix memory_range_lock deadlock.
|
||||
13. ihk_ikc_recv: Record channel to packet for release
|
||||
14. Add set_cputime() kernel to kernel case and mode enum.
|
||||
15. execve: Call preempt_enable() before error-exit
|
||||
16. memory/x86_64: fix linux safe_kernel_map
|
||||
17. do_kill(): fix pids table when nr of threads is larger than num_processors
|
||||
18. shmget: Use transparent huge pages when page size isn't specified
|
||||
19. prctl: Add support for PR_SET_THP_DISABLE and PR_GET_THP_DISABLE
|
||||
20. monitor_init: fix undetected hang on highest numbered core
|
||||
21. init_process_stack: change premapped stack size based on arch
|
||||
22. x86 syscalls: add a bunch of XXat() delegated syscalls
|
||||
23. do_pageout: fix direct kernel-user access
|
||||
24. stack: add hwcap auxval
|
||||
25. perf counters: add arch-specific perf counters
|
||||
26. Added check of nohost to terminate_host().
|
||||
27. kmalloc: Fix address order in free list
|
||||
28. sysfs: use nr_cpu_ids for cpumasks (fixes libnuma parsing error on ARM)
|
||||
29. monitor_init: Use ihk_mc_cpu_info()
|
||||
30. Fix ThunderX2 write-combined PTE flag insanity
|
||||
31. ARM: eliminate zero page mapping (i.e, init_low_area())
|
||||
32. eliminate futex_cmpxchg_enabled check (not used and dereffed a NULL pointer)
|
||||
33. page_table: Fix return value of lookup_pte when ptl4 is blank
|
||||
34. sysfs: add missing symlinks for cpu/node
|
||||
35. Make Linux handler run when mmap to procfs.
|
||||
36. Separate mmap area from program loading (relocation) area
|
||||
37. move rusage into kernel ELF image (avoid dynamic alloc before NUMA init)
|
||||
38. arm: turn off cpu on panic
|
||||
39. page fault handler: protect thread accesses
|
||||
40. Register PPD and release_handler at the same time.
|
||||
41. fix to missing exclusive processing between terminate() and
|
||||
finalize_process().
|
||||
42. perfctr_stop: add flags to no 'disable_intens'
|
||||
43. fileobj, shmobj: free pages in object destructor (as opposed to page_unmap())
|
||||
44. clear_range_l1, clear_range_middle: Fix handling contiguous PTE
|
||||
45. do_mmap: don't pre-populate the whole file when asked for smaller segment
|
||||
46. invalidate_one_page: Support shmobj and contiguous PTE
|
||||
47. ubsan: fix undefined shifts
|
||||
48. x86: disable zero mapping and add a boot pt for ap trampoline
|
||||
49. rusage: Don't count PF_PATCH change
|
||||
50. Fixed time processing.
|
||||
51. copy_user_pte: vmap area not owned by McKernel
|
||||
52. gencore: Zero-clear ELF header and memory range table
|
||||
53. rpm: ignore CMakeCache.txt in dist and relax BuildRequires on cross build
|
||||
54. gencore: Allocate ELF header to heap instead of stack
|
||||
55. nanosleep: add cpu_pause() in spinwait loop
|
||||
56. init_process: add missing initializations to proc struct
|
||||
57. rus_vm_fault: always use a packet on the stack
|
||||
58. process stack: use PAGE_SIZE in aux vector
|
||||
59. copy_user_pte: base memobj copy on range & VR_PRIVATE
|
||||
60. arm64: ptrace: Fix overwriting 1st argument with return value
|
||||
61. page fault: use cow for private device mappings
|
||||
62. reproductible builds: remove most install paths in c code
|
||||
63. page fault: clear writable bit for non-dirtying access to shared ranges
|
||||
64. mcreboot/mcstop+release: support for regular user execution
|
||||
65. irqbalance_mck: replace extra service with service drop-in
|
||||
66. do_mmap: give addr argument a chance even if not MAP_FIXED
|
||||
67. x86: fix xchg() and cmpxchg() macros
|
||||
68. IHK: support for using Linux work IRQ as IKC interrupt (optional)
|
||||
69. MCS: fix ARM64 issue by using smp_XXX() functions (i.e., barrier()s)
|
||||
70. procfs: add number of threads to stat and status
|
||||
71. memory_range_lock: Fix deadlock in procfs/sysfs handler
|
||||
72. flush instruction cache at context switch time if necessary
|
||||
73. arm64: Fix PMU related functions
|
||||
74. page_fault_process_memory_range: Disable COW for VM region with zeroobj
|
||||
75. extend_process_region: Fall back to demand paging when not contiguous
|
||||
76. munmap: fix deadlock with remote pagefault on vm range lock
|
||||
77. procfs: if memory_range_lock fails, process later
|
||||
78. migrate-cpu: Prevent migration target from calling schedule() twice
|
||||
79. sched_request_migrate(): fix race condition between migration req and IRQs
|
||||
80. get_one_cpu_topology: Renumber core_id (physical core id)
|
||||
81. bb7e140 procfs cpuinfo: use sequence number as processor
|
||||
82. set_host_vma(): do NOT read protect Linux VMA
|
||||
|
||||
===========================================
|
||||
What's new in V1.6.0 (Nov 11, 2018)
|
||||
===========================================
|
||||
|
||||
-----------------------------------------------
|
||||
McKernel new features, improvements and changes
|
||||
-----------------------------------------------
|
||||
1. McKernel and Linux share one unified kernel virtual address space.
|
||||
That is, McKernel sections resides in Linux sections spared for
|
||||
modules. In this way, Linux can access the McKernel kernel memory
|
||||
area.
|
||||
2. hugetlbfs support
|
||||
3. IHK is now included as a git submodule
|
||||
4. Debug messages are turned on/off in per souce file basis at run-time.
|
||||
5. It's prohibited for McKernel to access physical memory ranges which
|
||||
Linux didn't give to McKernel.
|
||||
6. UTI (capability to spawn a thread on Linux CPU) improvement:
|
||||
* System calls issued from the thread are hooked by modifying
|
||||
binary in memory.
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
#<num> below corresponds to the redmine issue number
|
||||
(https://postpeta.pccluster.org/redmine/).
|
||||
|
||||
1. #926: shmget: Hide object with IPC_RMID from shmget
|
||||
2. #1028: init_process: Inherit parent cpu_set
|
||||
3. #995: Fix shebang recorded in argv[0]
|
||||
4. #1024: Fix VMAP virtual address leak
|
||||
5. #1109: init_process_stack: Support "ulimit -s unlimited"
|
||||
6. x86 mem init: do not map identity mapping
|
||||
7. mcexec_wait_syscall: requeue potential request on interrupted wait
|
||||
8. mcctrl_ikc_send_wait: fix interrupt with do_frees == NULL
|
||||
9. pager_req_read: handle short read
|
||||
10. kprintf: only call eventfd() if it is safe to interrupt
|
||||
11. process_procfs_request: Add Pid to /proc/<PID>/status
|
||||
12. terminate: fix oversubscribe hang when waiting for other threads on same CPU to die
|
||||
13. mcexec: Do not close fd returned to mckernel side
|
||||
14. #976: execve: Clear sigaltstack and fp_regs
|
||||
15. #1002: perf_event: Specify counter by bit_mask on start/stop
|
||||
16. #1027: schedule: Don't reschedule immediately when wake up on migrate
|
||||
17. #mcctrl: lookup unexported symbols at runtime
|
||||
18. __sched_wakeup_thread: Notify interrupt_exit() of re-schedule
|
||||
19. futex_wait_queue_me: Spin-sleep when timeout and idle_halt is specified
|
||||
20. #1167: ihk_os_getperfevent,setperfevent: Timeout IKC sent by mcctrl
|
||||
21. devobj: fix object size (POSTK_DEBUG_TEMP_FIX_36)
|
||||
22. mcctrl: remove rus page cache
|
||||
23. #1021: procfs: Support multiple reads of e.g. /proc/*/maps
|
||||
24. #1006: wait: Delay wake-up parent within switch context
|
||||
25. #1164: mem: Check if phys-mem is within the range of McKernel memory
|
||||
26. #1039: page_fault_process_memory_range: Remove ihk_mc_map_virtual for CoW of device map
|
||||
27. partitioned execution: pass process rank to LWK
|
||||
28. process/vm: implement access_ok()
|
||||
29. spinlock: rewrite spinlock to use Linux ticket head/tail format
|
||||
30. #986: Fix deadlock involving mmap_sem and memory_range_lock
|
||||
31. Prevent one CPU from getting chosen by concurrent forks
|
||||
32. #1009: check_signal: system call restart is done only once
|
||||
33. #1176: syscall: the signal received during system call processing is not processed.
|
||||
34. #1036 syscall_time: Handle by McKernel
|
||||
35. #1165 do_syscall: Delegate system calls to the mcexec with the same pid
|
||||
36. #1194 execve: Fix calling ptrace_report_signal after preemption is disabled
|
||||
37. #1005 coredump: Exclude special areas
|
||||
38. #1018 procfs: Fix pread/pwrite to procfs fail when specified size is bigger than 4MB
|
||||
39. #1180 sched_setaffinity: Check migration after decrementing in_interrupt
|
||||
40. #771, #1179, #1143 ptrace supports threads
|
||||
41. #1189 procfs/do_fork: wait until procfs entries are registered
|
||||
42. #1114 procfs: add '/proc/pid/stat' to mckernel side and fix its comm
|
||||
43. #1116 mcctrl procfs: check entry was returned before using it
|
||||
44. #1167 ihk_os_getperfevent,setperfevent: Return -ETIME when IKC timeouts
|
||||
45. mcexec/execve: fix shebangs handling
|
||||
46. procfs: handle 'comm' on mckernel side
|
||||
47. ihk_os_setperfevent: Return number of registered events
|
||||
48. mcexec: fix terminating zero after readlink()
|
||||
|
||||
===========================================
|
||||
What's new in V1.5.1 (July 9, 2018)
|
||||
===========================================
|
||||
|
||||
-----------------------------------------------
|
||||
McKernel new features, improvements and changes
|
||||
-----------------------------------------------
|
||||
1. Watchdog timer to detect hang of McKernel
|
||||
mcexec prints out the following line to its stderr when a hang of
|
||||
McKernel is detected.
|
||||
|
||||
mcexec detected hang of McKernel
|
||||
|
||||
The watchdog timer is enabled by passing -i <timeout_in_sec> option
|
||||
to mcreboot.sh. <timeout_in_sec> specifies the interval of checking
|
||||
if McKernel is alive.
|
||||
Example: mcreboot.sh -i 600: Detect the hang with 10 minutes interval
|
||||
|
||||
The detailed step of the hang detection is as follows.
|
||||
(1) mcexec acquires eventfd for notification from IHK and perform
|
||||
epoll() on it.
|
||||
(2) A daemon called ihkmond monitors the state of McKernel periodically
|
||||
with the interval specified by the -i option. It judges that
|
||||
McKernel is hanging and notifies mcexec by the eventfd if its
|
||||
state hasn't changed since the last check.
|
||||
|
||||
2. Documentation
|
||||
man page: Installed directory is changed to <install_dir>/share/man
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
1. #1146: pager_req_map(): do not take mmap_sem if not needed
|
||||
2. #1135: prepare_process_ranges_args_envs(): fix saving cmdline
|
||||
3. #1144: fileobj/devobj: record path name
|
||||
4. #1145: fileobj: use MCS locks for per-file page hash
|
||||
5. #1076: mcctrl: refactor prepare_image into new generic ikc send&wait
|
||||
6. #1072: execve: fix execve with oversubscribing
|
||||
7. #1132: execve: use thread variable instead of cpu_local_var(current)
|
||||
8. #1117: mprotect: do not set page table writable for cow pages
|
||||
9. #1143: syscall wait4: add _WALL (POSTK_DEBUG_ARCH_DEP_44)
|
||||
10. #1064: rusage: Fix initialization of rusage->num_processors
|
||||
11. #1133: pager_req_unmap: Put per-process data at exit
|
||||
12. #731: do_fork: Propagate error code returned by mcexec
|
||||
13. #1149: execve: Reinitialize vm_regions's map area on execve
|
||||
14. #1065: procfs: Show file names in /proc/<PID>/maps
|
||||
15. #1112: mremap: Fix type of size arguments (from ssize_t to size_t)
|
||||
16. #1121: sched_getaffinity: Check arguments in the same order as in Linux
|
||||
17. #1137: mmap, mremap: Check arguments in the same order as in Linux
|
||||
18. #1122: fix return value of sched_getaffinity
|
||||
19. #732: fix: /proc/<PID>/maps outputs a unnecessary NULL character
|
||||
|
||||
===================================
|
||||
What's new in V1.5.0 (Apr 5, 2018)
|
||||
===================================
|
||||
|
||||
--------------------------------------
|
||||
McKernel new features and improvements
|
||||
--------------------------------------
|
||||
1. Aid for Linux version migration: Detect /proc, /sys format change
|
||||
between two kernel verions
|
||||
2. Swap out
|
||||
* Only swap-out anonymous pages for now
|
||||
3. Improve support of /proc/maps
|
||||
4. mcstat: Linux tool to show resource usage
|
||||
|
||||
---------------------------
|
||||
McKernel bug fixes (digest)
|
||||
---------------------------
|
||||
1. #727: execve: Fix memory leak when receiving SIGKILL
|
||||
2. #829: perf_event_open: Support PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
|
||||
3. #906: mcexec: Check return code of fork()
|
||||
4. #1038: mcexec: Timeout when incorrect value is given to -n option
|
||||
5. #943 #945 #946 #960 $961: mcexec: Support strace
|
||||
6. #1029: struct thread is not released with stress-test involving signal
|
||||
and futex
|
||||
7. #863 #870: Respond immediately to terminating signal when
|
||||
offloading system call
|
||||
8. #1119: translate_rva_to_rpa(): use 2MB blocks in 1GB pages on x86
|
||||
11. #898: Shutdown OS only after no in-flight IKC exist
|
||||
12. #882: release_handler: Destroy objects as the process which opened it
|
||||
13. #882: mcexec: Make child process exit if the parent is killed during
|
||||
fork()
|
||||
14. #925: XPMEM: Don't destroy per-process object of the parent
|
||||
15. #885: ptrace: Support the case where a process attaches its child
|
||||
16. #1031: sigaction: Support SA_RESETHAND
|
||||
17. #923: rus_vm_fault: Return error when a thread not performing
|
||||
system call offloading causes remote page fault
|
||||
18. #1032 #1033 #1034: getrusage: Fix ru_maxrss, RUSAGE_CHILDREN,
|
||||
ru_stime related bugs
|
||||
19. #1120: getrusage: Fix deadlock on thread->times_update
|
||||
20. #1123: Fix deadlock related to wait_queue_head_list_node
|
||||
21. #1124: Fix deadlock of calling terminate() from terminate()
|
||||
22. #1125: Fix deadlock related to thread status
|
||||
* Related functions are: hold_thread(), do_kill() and terminate()
|
||||
23. #1126: uti: Fix uti thread on the McKernel side blocks others in do_syscall()
|
||||
24. #1066: procfs: Show Linux /proc/self/cgroup
|
||||
25. #1127: prepare_process_ranges_args_envs(): fix generating saved_cmdline to
|
||||
avoid PF in strlen()
|
||||
26. #1128: ihk_mc_map/unmap_virtual(): do proper TLB invalidation
|
||||
27. #1043: terminate(): fix update_lock and threads_lock order to avoid deadlock
|
||||
28. #1129: mcreboot.sh: Save /proc/irq/*/smp_affinity to /tmp/mcreboot
|
||||
29. #1130: mcexec: drop READ_IMPLIES_EXEC from personality
|
||||
|
||||
--------------------
|
||||
McKernel workarounds
|
||||
--------------------
|
||||
1. Forbid CPU oversubscription
|
||||
* It can be turned on by mcreboot.sh -O option
|
||||
|
||||
|
||||
===================================
|
||||
What's new in V1.4.0 (Oct 30, 2017)
|
||||
===================================
|
||||
|
||||
-----------------------------------------------------------
|
||||
Feature: Abstracted event type support in perf_event_open()
|
||||
-----------------------------------------------------------
|
||||
PERF_TYPE_HARDWARE and PERF_TYPE_CACHE types are supported.
|
||||
|
||||
----------------------------------
|
||||
Clean-up: Direct user-space access
|
||||
----------------------------------
|
||||
Code lines using direct user-space access (e.g. passing user-space
|
||||
pointer to memcpy()) becomes more portable across processor
|
||||
architectures. The modification follows the following rules.
|
||||
|
||||
1. Move the code section as it is to the architecture dependent
|
||||
directory if it is a part of the critical-path.
|
||||
2. Otherwise, rewrite the code section by using the portable methods.
|
||||
The methods include copy_from_user(), copy_to_user(),
|
||||
pte_get_phys() and phys_to_virt().
|
||||
|
||||
--------------------------------
|
||||
Test: MPI and OpenMP micro-bench
|
||||
--------------------------------
|
||||
The performance figures of MPI and OpenMP primitives are compared with
|
||||
those of Linux by using Intel MPI Benchmarks and EPCC OpenMP Micro
|
||||
Benchmark.
|
||||
|
||||
|
||||
===================================
|
||||
What's new in V1.3.0 (Sep 30, 2017)
|
||||
===================================
|
||||
|
||||
--------------------
|
||||
Feature: Kernel dump
|
||||
--------------------
|
||||
1. A dump level of "only kernel memory" is added.
|
||||
|
||||
The following two levels are available now:
|
||||
0: Dump all
|
||||
24: Dump only kernel memory
|
||||
|
||||
The dump level can be set by -d option in ihkosctl or the argument
|
||||
for ihk_os_makedumpfile(), as shown in the following examples:
|
||||
|
||||
Command: ihkosctl 0 dump -d 24
|
||||
Function call: ihk_os_makedumpfile(0, NULL, 24, 0);
|
||||
|
||||
2. Dump file is created when Linux panics.
|
||||
|
||||
The dump level can be set by dump_level kernel argument, as shown in the
|
||||
following example:
|
||||
|
||||
ihkosctl 0 kargs "hidos dump_level=24"
|
||||
|
||||
The IHK dump function is registered to panic_notifier_list when creating
|
||||
/dev/mcdX and called when Linux panics.
|
||||
|
||||
-----------------------------
|
||||
Feature: Quick Process Launch
|
||||
-----------------------------
|
||||
|
||||
MPI process launch time and some of the initialization time can be
|
||||
reduced in application consisting of multiple MPI programs which are
|
||||
launched in turn in the job script.
|
||||
|
||||
The following two steps should be performed to use this feature:
|
||||
1. Replace mpiexec with ql_mpiexec_start and add some lines for
|
||||
ql_mpiexec_finalize in the job script
|
||||
2. Modify the app so that it can repeat calculations and wait for the
|
||||
instructions from ql_mpiexec_{start,finalize} at the end of the
|
||||
loop
|
||||
|
||||
The first step is explained using an example. Assume the original job
|
||||
script looks like this:
|
||||
|
||||
/* Execute ensamble simulation and then data assimilation, and repeat this
|
||||
ten times */
|
||||
for i in {1..10}; do
|
||||
|
||||
/* Each ensamble simulation execution uses 100 nodes, launch ten of them
|
||||
in parallel */
|
||||
for j in {1..10}; do
|
||||
mpiexec -n 100 -machinefile ./list1_$j p1.out a1 & pids[$i]=$!;
|
||||
done
|
||||
|
||||
/* Wait until the ten ensamble simulation programs finish */
|
||||
for j in {1..10}; do wait ${pids[$j]}; done
|
||||
|
||||
/* Launch one data assimilation program using 1000 nodes */
|
||||
mpiexec -n 1000 -machinefile ./list2 p2.out a2
|
||||
done
|
||||
|
||||
The job script should be modified like this:
|
||||
|
||||
for i in {1..10}; do
|
||||
for j in {1..10}; do
|
||||
/* Replace mpiexec with ql_mpiexec_start */
|
||||
ql_mpiexec_start -n 100 -machinefile ./list1_$j p1.out a1 & pids[$j]=$!;
|
||||
done
|
||||
|
||||
for j in {1..10}; do wait ${pids[$j]}; done
|
||||
|
||||
ql_mpiexec_start -n 1000 -machinefile ./list2 p2.out a2
|
||||
done
|
||||
|
||||
/* p1.out and p2.out don't exit but are waiting for the next calculation.
|
||||
So tell them to exit */
|
||||
for j in {1..10}; do
|
||||
ql_mpiexec_finalize -machinefile ./list1_$i p1.out a1;
|
||||
done
|
||||
ql_mpiexec_finalize -machinefile ./list2 p2.out a2;
|
||||
|
||||
|
||||
The second step is explained using a pseudo-code.
|
||||
|
||||
MPI_Init();
|
||||
Prepare data exchange with preceding / following MPI programs
|
||||
loop:
|
||||
foreach Fortran module
|
||||
Initialize data using command-line argments, parameter files,
|
||||
environment variables
|
||||
Input data from preceding MPI programs / Read snap-shot
|
||||
Perform main calculation
|
||||
Output data to following MPI programs / Write snap-shot
|
||||
/* ql_client() waits for command of ql_mpiexec_{start,finish} */
|
||||
if (ql_client() == QL_CONTINUE) { goto loop; }
|
||||
MPI_Finalize();
|
||||
|
||||
qlmpilib.h should be included in the code and libql{mpi,fort}.so
|
||||
should be linked to the executable file.
|
||||
|
||||
|
||||
========================
|
||||
Restrictions on McKernel
|
||||
========================
|
||||
|
||||
1. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
|
||||
correctly even if the mmap() returns a success. An access of their
|
||||
mapping receives the SIGSEGV signal.
|
||||
|
||||
2. clone() supports only the following flags. All the other flags
|
||||
cause clone() to return error or are simply ignored.
|
||||
|
||||
* CLONE_CHILD_CLEARTID
|
||||
* CLONE_CHILD_SETTID
|
||||
* CLONE_PARENT_SETTID
|
||||
* CLONE_SETTLS
|
||||
* CLONE_SIGHAND
|
||||
* CLONE_VM
|
||||
|
||||
3. PAPI has the following restriction.
|
||||
|
||||
* Number of counters a user can use at the same time is up to the
|
||||
number of the physical counters in the processor.
|
||||
|
||||
4. msync writes back only the modified pages mapped by the calling process.
|
||||
|
||||
5. The following syscalls always return the ENOSYS error.
|
||||
|
||||
* migrate_pages()
|
||||
* move_pages()
|
||||
* set_robust_list()
|
||||
|
||||
6. The following syscalls always return the EOPNOTSUPP error.
|
||||
|
||||
* arch_prctl(ARCH_SET_GS)
|
||||
* signalfd()
|
||||
|
||||
7. signalfd4() returns a fd, but signal is not notified through the
|
||||
fd.
|
||||
|
||||
8. set_rlimit sets the limit values but they are not enforced.
|
||||
|
||||
9. Address randomization is not supported.
|
||||
|
||||
10. brk() extends the heap more than requestd when -h
|
||||
(--extend-heap-by=)<step> option of mcexec is used with the value
|
||||
larger than 4 KiB. syscall_pwrite02 of LTP would fail for this
|
||||
reason. This is because the test expects that the end of the heap
|
||||
is set to the same address as the argument of sbrk() and expects a
|
||||
segmentation violation occurs when it tries to access the memory
|
||||
area right next to the boundary. However, the optimization sets
|
||||
the end to a value larger than the requested. Therefore, the
|
||||
expected segmentation violation doesn't occur.
|
||||
|
||||
11. setpriority()/getpriority() won't work. They might set/get the
|
||||
priority of a random mcexec thread. This is because there's no
|
||||
fixed correspondence between a McKernel thread which issues the
|
||||
system call and a mcexec thread which handles the offload request.
|
||||
|
||||
12. mbind() can set the policy but it is not used when allocating
|
||||
physical pages.
|
||||
|
||||
13. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
|
||||
set_mempolicy()/mbind() are not supported.
|
||||
|
||||
14. The MPOL_BIND policy for set_mempolicy()/mbind() works as the same
|
||||
as the MPOL_PREFERRED policy. That is, the physical page allocator
|
||||
doesn't give up the allocation when the specified nodes are
|
||||
running out of pages but continues to search pages in the other
|
||||
nodes.
|
||||
|
||||
15. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
|
||||
later. In addition, crash_kexec_post_notifiers kernel argument
|
||||
must be given to Linux kernel.
|
||||
|
||||
16. setfsuid()/setfsgid() cannot change the id of the calling thread.
|
||||
Instead, it changes that of the mcexec worker thread which takes
|
||||
the system-call offload request.
|
||||
|
||||
17. mmap (hugeTLBfs): The physical pages corresponding to a map are
|
||||
released when no McKernel process exist. The next map gets fresh
|
||||
physical pages.
|
||||
|
||||
18. Sticky bit on executable file has no effect.
|
||||
|
||||
19. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
|
||||
process of booting McKernel due to the Linux bug, found in
|
||||
Linux-3.10 and fixed in the later version. One way to circumvent
|
||||
this is to always assign the same CPU set to McKernel.
|
||||
|
||||
20. madvise:
|
||||
* MADV_HWPOISON and MADV_SOFT_OFFLINE always returns -EPERM.
|
||||
* MADV_MERGEABLE and MADV_UNMERGEABLE always returns -EINVAL.
|
||||
* MADV_HUGEPAGE and MADV_NOHUGEPAGE on file map returns -EINVAL
|
||||
(It succeeds on RHEL-8 for aarch64).
|
||||
|
||||
21. brk() and mmap() doesn't report out-of-memory through its return
|
||||
value. Instead, page-fault reports the error.
|
||||
|
||||
22. Anonymous mmap pre-maps requested number of pages when contiguous
|
||||
pages are available. Demand paging is used when not available.
|
||||
|
||||
23. Mixing page sizes in anonymous shared mapping is not allowed. mmap
|
||||
creates vm_range with one page size. And munmap or mremap that
|
||||
needs the reduced page size changes the sizes of all the pages of
|
||||
the vm_range.
|
||||
|
||||
24. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
|
||||
(job-scheduler).
|
||||
|
||||
25. The behaviors of madvise and mbind are changed to do nothing and
|
||||
report success as a workaround for Fugaku.
|
||||
109
README.md
109
README.md
@ -10,7 +10,7 @@ IHK/McKernel is a light-weight multi-kernel operating system designed for high-e
|
||||
|
||||
## Contents
|
||||
|
||||
- [Background] (#background)
|
||||
- [Background](#background-and-motivation)
|
||||
- [Architectural Overview](#architectural-overview)
|
||||
- [Installation](#installation)
|
||||
- [The Team](#the-team)
|
||||
@ -85,7 +85,7 @@ sudo reboot
|
||||
You will need the following packages installed:
|
||||
|
||||
~~~~
|
||||
sudo yum install kernel-devel binutils-devel libnuma-devel
|
||||
sudo yum install cmake kernel-devel binutils-devel systemd-devel numactl-devel gcc make nasm git
|
||||
~~~~
|
||||
|
||||
Grant read permission to the System.map file of your kernel version:
|
||||
@ -96,24 +96,51 @@ sudo chmod a+r /boot/System.map-`uname -r`
|
||||
|
||||
##### 4. Obtain sources and compile the kernel
|
||||
|
||||
Clone the source code and set up ihk symlink (this is currently required):
|
||||
Clone the source code:
|
||||
|
||||
~~~~
|
||||
mkdir -p ~/src/ihk+mckernel/
|
||||
cd ~/src/ihk+mckernel/
|
||||
git clone -r git@github.com:RIKEN-SysSoft/mckernel.git
|
||||
git clone --recursive -b development https://github.com/RIKEN-SysSoft/mckernel.git
|
||||
~~~~
|
||||
|
||||
(Optional) Checkout to the specific branch or version:
|
||||
|
||||
~~~~
|
||||
cd mckernel
|
||||
git checkout <pathspec>
|
||||
git submodule update
|
||||
~~~~
|
||||
|
||||
Foe example, if you want to try the development branch, use "development" as the pathspec. If you want to try the prerelease version 1.7.0-0.2, use "1.7.0-0.2".
|
||||
|
||||
###### 4.1 Install with cmake
|
||||
|
||||
Configure and compile:
|
||||
|
||||
~~~~
|
||||
mkdir -p build && cd build
|
||||
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/mckernel
|
||||
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/ihk+mckernel/mckernel
|
||||
make -j install
|
||||
~~~~
|
||||
|
||||
The IHK kernel modules and McKernel kernel image should be installed under the **ihk+mckernel** folder in your home directory.
|
||||
|
||||
###### 4.2 Install with rpm
|
||||
|
||||
Configure, compile and build rpm:
|
||||
|
||||
~~~~
|
||||
mkdir -p build && cd build
|
||||
cmake $HOME/src/ihk+mckernel/mckernel
|
||||
make dist
|
||||
cp mckernel-<version>.tar.gz <rpmbuild>/SOURCES
|
||||
rpm -ba scripts/mckernel.spec
|
||||
sudo rpm -ivh <rpmbuild>/RPMS/<arch>/mckernel-<version>-<release>_<linux_kernel_ver>_<dist>.<arch>.rpm
|
||||
~~~~
|
||||
|
||||
The IHK kernel modules and McKernel kernel image are installed under the system directory.
|
||||
|
||||
##### 5. Boot McKernel
|
||||
|
||||
A boot script called mcreboot.sh is provided under sbin in the install folder. To boot on logical CPU 1 with 512MB of memory, use the following invocation:
|
||||
@ -170,6 +197,71 @@ Finally, to shutdown McKernel and release CPU/memory resources back to Linux use
|
||||
sudo ./sbin/mcstop+release.sh
|
||||
~~~~
|
||||
|
||||
##### 7. Advanced: Enable Utility Thread offloading Interface (UTI)
|
||||
|
||||
UTI enables a runtime such as MPI runtime to spawn utility threads such as MPI asynchronous progress threads to Linux cores.
|
||||
|
||||
1. Install capstone
|
||||
|
||||
Install EPEL capstone-devel:
|
||||
|
||||
~~~~
|
||||
sudo yum install epel-release
|
||||
sudo yum install capstone-devel
|
||||
~~~~
|
||||
|
||||
2. Install syscall_intercept
|
||||
|
||||
~~~~
|
||||
git clone https://github.com/RIKEN-SysSoft/syscall_intercept.git
|
||||
cmake ../arch/aarch64 -DCMAKE_INSTALL_PREFIX=<syscall-intercept-install> -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DTREAT_WARNINGS_AS_ERRORS=OFF
|
||||
~~~~
|
||||
|
||||
3. Install UTI for McKernel
|
||||
|
||||
Install:
|
||||
|
||||
~~~~
|
||||
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||
mkdir build && cd build
|
||||
../uti/configure --prefix=<mckernel-install> --with-rm=mckernel
|
||||
make && make install
|
||||
~~~~
|
||||
|
||||
4. Install McKernel
|
||||
|
||||
~~~~
|
||||
CMAKE_PREFIX_PATH=<syscall-intercept-install> cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel -DENABLE_UTI=ON $HOME/src/ihk+mckernel/mckernel
|
||||
~~~~
|
||||
|
||||
5. Run executable
|
||||
|
||||
~~~~
|
||||
mcexec --enable-uti <command>
|
||||
~~~~
|
||||
|
||||
6. Install UTI for Linux for performance comparison
|
||||
|
||||
Install by make:
|
||||
|
||||
~~~~
|
||||
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||
mkdir build && cd build
|
||||
../uti/configure --prefix=<uti-install> --with-rm=linux
|
||||
make && make install
|
||||
~~~~
|
||||
|
||||
Install by rpm:
|
||||
|
||||
~~~~
|
||||
git clone https://github.com/RIKEN-SysSoft/uti.git
|
||||
mkdir build && cd build
|
||||
../uti/configure --prefix=<uti-install> --with-rm=linux
|
||||
rm -f ~/rpmbuild/SOURCES/<version>.tar.gz
|
||||
rpmbuild -ba ./scripts/uti.spec
|
||||
rpm -Uvh uti-<version>-<release>-<arch>.rpm
|
||||
~~~~
|
||||
|
||||
## The Team
|
||||
|
||||
The McKernel project was started at The University of Tokyo and currently it is mainly developed at RIKEN.
|
||||
@ -184,3 +276,10 @@ Some of our collaborators include:
|
||||
## License
|
||||
|
||||
McKernel is GPL licensed, as found in the LICENSE file.
|
||||
|
||||
## Contact
|
||||
|
||||
Please give your feedback to us via one of the following mailing lists. Subscription via [www.pccluster.org](http://www.pccluster.org/mailman/listinfo/mckernel-users) is needed.
|
||||
|
||||
* English: mckernel-users@pccluster.org
|
||||
* Japanese: mckernel-users-jp@pccluster.org
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
|
||||
#include <process.h>
|
||||
#include <list.h>
|
||||
@ -53,4 +53,4 @@ STATIC_ASSERT(SVE_PT_FPSIMD_OFFSET == sizeof(struct user_sve_header));
|
||||
STATIC_ASSERT(SVE_PT_SVE_OFFSET == sizeof(struct user_sve_header));
|
||||
|
||||
/* assert for struct arm64_cpu_local_thread member offset define */
|
||||
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 160);
|
||||
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 168);
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2016 */
|
||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#include <process.h>
|
||||
#include <elfcore.h>
|
||||
#include <string.h>
|
||||
#include <ptrace.h>
|
||||
#include <cls.h>
|
||||
#include <hwcap.h>
|
||||
|
||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
|
||||
#define align32(x) ((((x) + 3) / 4) * 4)
|
||||
|
||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
|
||||
struct thread *thread, void *regs0, int sig)
|
||||
{
|
||||
struct pt_regs *regs = regs0;
|
||||
struct elf_prstatus64 tmp_prstatus;
|
||||
@ -14,8 +20,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
||||
short int pr_cursig;
|
||||
a8_uint64_t pr_sigpend;
|
||||
a8_uint64_t pr_sighold;
|
||||
pid_t pr_pid;
|
||||
pid_t pr_ppid;
|
||||
pid_t pr_pgrp;
|
||||
pid_t pr_sid;
|
||||
struct prstatus64_timeval pr_utime;
|
||||
@ -23,10 +27,66 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
||||
struct prstatus64_timeval pr_cutime;
|
||||
struct prstatus64_timeval pr_cstime;
|
||||
*/
|
||||
|
||||
/* copy x0-30, sp, pc, pstate */
|
||||
memcpy(&tmp_prstatus.pr_reg, ®s->user_regs, sizeof(tmp_prstatus.pr_reg));
|
||||
tmp_prstatus.pr_fpvalid = 0; /* We assume no fp */
|
||||
|
||||
/* copy unaligned prstatus addr */
|
||||
memcpy(prstatus, &tmp_prstatus, sizeof(*prstatus));
|
||||
|
||||
prstatus->pr_pid = thread->tid;
|
||||
if (thread->proc->parent) {
|
||||
prstatus->pr_ppid = thread->proc->parent->pid;
|
||||
}
|
||||
|
||||
prstatus->pr_info.si_signo = sig;
|
||||
prstatus->pr_cursig = sig;
|
||||
}
|
||||
|
||||
int arch_get_thread_core_info_size(void)
|
||||
{
|
||||
const struct user_regset_view *view = current_user_regset_view();
|
||||
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
|
||||
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return 0;
|
||||
}
|
||||
return sizeof(struct note) + align32(sizeof("LINUX"))
|
||||
+ regset_size(cpu_local_var(current), regset);
|
||||
}
|
||||
|
||||
void arch_fill_thread_core_info(struct note *head,
|
||||
struct thread *thread, void *regs)
|
||||
{
|
||||
const struct user_regset_view *view = current_user_regset_view();
|
||||
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
|
||||
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* pre saved registers */
|
||||
save_fp_regs(thread);
|
||||
|
||||
if (regset->core_note_type && regset->get &&
|
||||
(!regset->active || regset->active(thread, regset))) {
|
||||
int ret;
|
||||
size_t size = regset_size(thread, regset);
|
||||
void *namep;
|
||||
void *descp;
|
||||
|
||||
namep = (void *) (head + 1);
|
||||
descp = namep + align32(sizeof("LINUX"));
|
||||
|
||||
ret = regset->get(thread, regset, 0, size, descp, NULL);
|
||||
if (ret) {
|
||||
return;
|
||||
}
|
||||
|
||||
head->namesz = sizeof("LINUX");
|
||||
head->descsz = size;
|
||||
head->type = NT_ARM_SVE;
|
||||
memcpy(namep, "LINUX", sizeof("LINUX"));
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <types.h>
|
||||
#include <errno.h>
|
||||
@ -30,9 +29,11 @@
|
||||
#include <debug-monitors.h>
|
||||
#include <sysreg.h>
|
||||
#include <cpufeature.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <hwcap.h>
|
||||
#include <virt.h>
|
||||
#include <init.h>
|
||||
#include <bootparam.h>
|
||||
|
||||
//#define DEBUG_PRINT_CPU
|
||||
|
||||
@ -67,6 +68,7 @@ void (*gic_dist_init)(unsigned long dist_base_pa, unsigned long size);
|
||||
void (*gic_cpu_init)(unsigned long cpu_base_pa, unsigned long size);
|
||||
void (*gic_enable)(void);
|
||||
void (*arm64_issue_ipi)(unsigned int cpid, unsigned int vector);
|
||||
void (*arm64_issue_host_ipi)(unsigned int cpid, unsigned int vector);
|
||||
void (*handle_arch_irq)(struct pt_regs *);
|
||||
|
||||
static void gic_init(void)
|
||||
@ -77,14 +79,18 @@ static void gic_init(void)
|
||||
gic_cpu_init = gic_cpu_init_gicv3;
|
||||
gic_enable = gic_enable_gicv3;
|
||||
arm64_issue_ipi = arm64_issue_ipi_gicv3;
|
||||
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv3;
|
||||
handle_arch_irq = handle_interrupt_gicv3;
|
||||
kprintf("%: GICv3\n", __func__);
|
||||
} else {
|
||||
/* Setup functions for GICv2 */
|
||||
gic_dist_init = gic_dist_init_gicv2;
|
||||
gic_cpu_init = gic_cpu_init_gicv2;
|
||||
gic_enable = gic_enable_gicv2;
|
||||
arm64_issue_ipi = arm64_issue_ipi_gicv2;
|
||||
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv2;
|
||||
handle_arch_irq = handle_interrupt_gicv2;
|
||||
kprintf("%: GICv2\n", __func__);
|
||||
}
|
||||
|
||||
gic_dist_init(ihk_param_gic_dist_base_pa, ihk_param_gic_dist_map_size);
|
||||
@ -114,42 +120,94 @@ static struct ihk_mc_interrupt_handler cpu_stop_handler = {
|
||||
};
|
||||
|
||||
extern long freeze_thaw(void *nmi_ctx);
|
||||
static void multi_nm_interrupt_handler(void *priv)
|
||||
static void multi_interrupt_handler(void *priv)
|
||||
{
|
||||
extern int nmi_mode;
|
||||
struct pt_regs *regs = (struct pt_regs *)priv;
|
||||
union arm64_cpu_local_variables *clv;
|
||||
|
||||
switch (nmi_mode) {
|
||||
switch (multi_intr_mode) {
|
||||
case 1:
|
||||
case 2:
|
||||
/* mode == 1or2, for FREEZER NMI */
|
||||
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
|
||||
__func__, nmi_mode);
|
||||
case 2: /* mode == 1or2, for FREEZER intr */
|
||||
dkprintf("%s: freeze mode intr catch. (multi_intr_mode=%d)\n",
|
||||
__func__, multi_intr_mode);
|
||||
freeze_thaw(NULL);
|
||||
break;
|
||||
default:
|
||||
ekprintf("%s: Unknown multi-intr-mode(%d) detected.\n",
|
||||
__func__, multi_intr_mode);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void arch_save_panic_regs(void *irq_regs)
|
||||
{
|
||||
struct pt_regs *regs = (struct pt_regs *)irq_regs;
|
||||
union arm64_cpu_local_variables *clv;
|
||||
|
||||
clv = get_arm64_this_cpu_local();
|
||||
|
||||
/* For user-space, use saved kernel context */
|
||||
if (regs->pc < USER_END) {
|
||||
memset(clv->arm64_cpu_local_thread.panic_regs,
|
||||
0, sizeof(clv->arm64_cpu_local_thread.panic_regs));
|
||||
clv->arm64_cpu_local_thread.panic_regs[29] =
|
||||
current_thread_info()->cpu_context.fp;
|
||||
clv->arm64_cpu_local_thread.panic_regs[31] =
|
||||
current_thread_info()->cpu_context.sp;
|
||||
clv->arm64_cpu_local_thread.panic_regs[32] =
|
||||
current_thread_info()->cpu_context.pc;
|
||||
clv->arm64_cpu_local_thread.panic_regs[33] =
|
||||
PSR_MODE_EL1h;
|
||||
}
|
||||
else {
|
||||
memcpy(clv->arm64_cpu_local_thread.panic_regs,
|
||||
regs->regs, sizeof(regs->regs));
|
||||
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
|
||||
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
|
||||
clv->arm64_cpu_local_thread.panic_regs[33] =
|
||||
regs->pstate;
|
||||
|
||||
}
|
||||
|
||||
clv->arm64_cpu_local_thread.paniced = 1;
|
||||
}
|
||||
|
||||
void arch_clear_panic(void)
|
||||
{
|
||||
union arm64_cpu_local_variables *clv;
|
||||
|
||||
clv = get_arm64_this_cpu_local();
|
||||
clv->arm64_cpu_local_thread.paniced = 0;
|
||||
}
|
||||
|
||||
static struct ihk_mc_interrupt_handler multi_intr_handler = {
|
||||
.func = multi_interrupt_handler,
|
||||
.priv = NULL,
|
||||
};
|
||||
|
||||
static void multi_nm_interrupt_handler(void *irq_regs)
|
||||
{
|
||||
extern int nmi_mode;
|
||||
|
||||
dkprintf("%s: ...\n", __func__);
|
||||
switch (nmi_mode) {
|
||||
case 0:
|
||||
/* mode == 0, for MEMDUMP NMI */
|
||||
clv = get_arm64_this_cpu_local();
|
||||
|
||||
if (regs) {
|
||||
memcpy(clv->arm64_cpu_local_thread.panic_regs,
|
||||
regs->regs, sizeof(regs->regs));
|
||||
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
|
||||
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
|
||||
clv->arm64_cpu_local_thread.panic_regs[33] =
|
||||
regs->pstate;
|
||||
}
|
||||
clv->arm64_cpu_local_thread.paniced = 1;
|
||||
/* mode == 0, for MEMDUMP NMI */
|
||||
arch_save_panic_regs(irq_regs);
|
||||
ihk_mc_query_mem_areas();
|
||||
/* memdump-nmi is halted McKernel, break is unnecessary. */
|
||||
/* fall through */
|
||||
case 3:
|
||||
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
||||
while (1) {
|
||||
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
||||
kprintf("%s: STOP\n", __func__);
|
||||
while (nmi_mode != 4)
|
||||
cpu_halt();
|
||||
break;
|
||||
|
||||
case 4:
|
||||
/* mode == 4, continue NMI */
|
||||
arch_clear_panic();
|
||||
if (!ihk_mc_get_processor_id()) {
|
||||
ihk_mc_clear_dump_page_completion();
|
||||
}
|
||||
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -423,6 +481,8 @@ void ihk_mc_init_ap(void)
|
||||
|
||||
ihk_mc_register_interrupt_handler(INTRID_CPU_STOP, &cpu_stop_handler);
|
||||
ihk_mc_register_interrupt_handler(INTRID_MULTI_NMI, &multi_nmi_handler);
|
||||
ihk_mc_register_interrupt_handler(INTRID_MULTI_INTR,
|
||||
&multi_intr_handler);
|
||||
ihk_mc_register_interrupt_handler(
|
||||
ihk_mc_get_vector(IHK_TLB_FLUSH_IRQ_VECTOR_START),
|
||||
&remote_tlb_flush_handler);
|
||||
@ -776,6 +836,21 @@ unsigned long cpu_disable_interrupt_save(void)
|
||||
return flags;
|
||||
}
|
||||
|
||||
/* save ICC_PMR_EL1 & enable interrupt (ICC_PMR_EL1 <= ICC_PMR_EL1_UNMASKED) */
|
||||
unsigned long cpu_enable_interrupt_save(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
unsigned long masked = ICC_PMR_EL1_UNMASKED;
|
||||
|
||||
asm volatile(
|
||||
"mrs_s %0, " __stringify(ICC_PMR_EL1) "\n"
|
||||
"msr_s " __stringify(ICC_PMR_EL1) ",%1"
|
||||
: "=&r" (flags)
|
||||
: "r" (masked)
|
||||
: "memory");
|
||||
return flags;
|
||||
}
|
||||
|
||||
#else /* defined(CONFIG_HAS_NMI) */
|
||||
|
||||
/* @ref.impl arch/arm64/include/asm/irqflags.h::arch_local_irq_enable */
|
||||
@ -824,6 +899,20 @@ unsigned long cpu_disable_interrupt_save(void)
|
||||
: "memory");
|
||||
return flags;
|
||||
}
|
||||
|
||||
/* save PSTATE.DAIF & enable interrupt (PSTATE.DAIF I bit set) */
|
||||
unsigned long cpu_enable_interrupt_save(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
asm volatile(
|
||||
"mrs %0, daif // arch_local_irq_save\n"
|
||||
"msr daifclr, #2"
|
||||
: "=r" (flags)
|
||||
:
|
||||
: "memory");
|
||||
return flags;
|
||||
}
|
||||
#endif /* defined(CONFIG_HAS_NMI) */
|
||||
|
||||
/* we not have "pause" instruction, instead "yield" instruction */
|
||||
@ -951,7 +1040,7 @@ void ihk_mc_boot_cpu(int cpuid, unsigned long pc)
|
||||
setup_cpu_features();
|
||||
}
|
||||
|
||||
init_sve_vl();
|
||||
sve_setup();
|
||||
}
|
||||
|
||||
/* for ihk_mc_init_context() */
|
||||
@ -986,6 +1075,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
||||
/* branch in ret_from_fork */
|
||||
new_ctx->thread->cpu_context.x19 = (unsigned long)next_function;
|
||||
|
||||
sp -= 16;
|
||||
new_ctx->thread->cpu_context.fp = sp;
|
||||
|
||||
/* set stack_pointer */
|
||||
new_ctx->thread->cpu_context.sp = sp - sizeof(ihk_mc_user_context_t);
|
||||
|
||||
@ -1001,9 +1093,10 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
||||
const int lcpuid = ihk_mc_get_processor_id();
|
||||
const unsigned long syscallno = current_pt_regs()->syscallno;
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
const uint16_t orig_sve_vl = current_thread_info()->sve_vl;
|
||||
const uint16_t orig_sve_vl_onexec = current_thread_info()->sve_vl_onexec;
|
||||
const uint16_t orig_sve_flags = current_thread_info()->sve_flags;
|
||||
struct thread_info *ti = current_thread_info();
|
||||
const unsigned int orig_sve_vl = ti->sve_vl;
|
||||
const unsigned int orig_sve_vl_onexec = ti->sve_vl_onexec;
|
||||
const unsigned long orig_sve_flags = ti->sve_flags;
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
|
||||
/* get kernel stack address */
|
||||
@ -1023,6 +1116,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
|
||||
|
||||
/* set stack_pointer */
|
||||
new_ctx->thread->cpu_context.sp = sp;
|
||||
/* use the 16 bytes padding in ihk_mc_init_user_process()
|
||||
* as closing frame in the frame chain */
|
||||
new_ctx->thread->cpu_context.fp = sp + sizeof(ihk_mc_user_context_t);
|
||||
|
||||
/* clear pt_regs area */
|
||||
new_uctx = (ihk_mc_user_context_t *)new_ctx->thread->cpu_context.sp;
|
||||
@ -1234,7 +1330,6 @@ err:
|
||||
}
|
||||
|
||||
static int check_and_allocate_fp_regs(struct thread *thread);
|
||||
void save_fp_regs(struct thread *thread);
|
||||
|
||||
void arch_clone_thread(struct thread *othread, unsigned long pc,
|
||||
unsigned long sp, struct thread *nthread)
|
||||
@ -1346,11 +1441,15 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
|
||||
}
|
||||
|
||||
/*@
|
||||
@ requires \valid_apicid(cpu); // valid APIC ID or not
|
||||
@ requires \valid_cpuid(cpu); // valid CPU logical ID
|
||||
@ ensures \result == 0
|
||||
@*/
|
||||
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
||||
{
|
||||
if (cpu < 0 || cpu >= num_processors) {
|
||||
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
|
||||
return -1;
|
||||
}
|
||||
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
||||
(*arm64_issue_ipi)(cpu, vector);
|
||||
return 0;
|
||||
@ -1398,6 +1497,19 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
|
||||
}
|
||||
}
|
||||
#endif /*ENABLE_PERF*/
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
if (prev && prev->profile && prev->profile_start_ts != 0) {
|
||||
prev->profile_elapsed_ts +=
|
||||
(rdtsc() - prev->profile_start_ts);
|
||||
prev->profile_start_ts = 0;
|
||||
}
|
||||
|
||||
if (next->profile && next->profile_start_ts == 0) {
|
||||
next->profile_start_ts = rdtsc();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (likely(prev)) {
|
||||
tls_thread_switch(prev, next);
|
||||
|
||||
@ -1471,8 +1583,7 @@ check_and_allocate_fp_regs(struct thread *thread)
|
||||
|
||||
if (!thread->fp_regs) {
|
||||
kprintf("error: allocating fp_regs pages\n");
|
||||
result = 1;
|
||||
panic("panic: error allocating fp_regs pages");
|
||||
result = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1481,37 +1592,51 @@ check_and_allocate_fp_regs(struct thread *thread)
|
||||
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
if (likely(elf_hwcap & HWCAP_SVE)) {
|
||||
sve_alloc(thread);
|
||||
result = sve_alloc(thread);
|
||||
}
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
out:
|
||||
if (result) {
|
||||
release_fp_regs(thread);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/*@
|
||||
@ requires \valid(thread);
|
||||
@*/
|
||||
void
|
||||
int
|
||||
save_fp_regs(struct thread *thread)
|
||||
{
|
||||
int ret = 0;
|
||||
if (thread == &cpu_local_var(idle)) {
|
||||
return;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
|
||||
if (check_and_allocate_fp_regs(thread) != 0) {
|
||||
// alloc error.
|
||||
return;
|
||||
ret = check_and_allocate_fp_regs(thread);
|
||||
if (ret) {
|
||||
goto out;
|
||||
}
|
||||
thread_fpsimd_save(thread);
|
||||
}
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void copy_fp_regs(struct thread *from, struct thread *to)
|
||||
int copy_fp_regs(struct thread *from, struct thread *to)
|
||||
{
|
||||
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
|
||||
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
|
||||
int ret = 0;
|
||||
|
||||
if (from->fp_regs != NULL) {
|
||||
ret = check_and_allocate_fp_regs(to);
|
||||
if (!ret) {
|
||||
memcpy(to->fp_regs,
|
||||
from->fp_regs,
|
||||
sizeof(fp_regs_struct));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
void clear_fp_regs(void)
|
||||
@ -1626,6 +1751,7 @@ static inline int arch_cpu_mrs(uint32_t sys_reg, uint64_t *val)
|
||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
||||
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
||||
SYSREG_READ_S(IMP_PF_PMUSERENR_EL0);
|
||||
SYSREG_READ_S(IMP_BARRIER_CTRL_EL1);
|
||||
SYSREG_READ_S(IMP_BARRIER_BST_BIT_EL1);
|
||||
SYSREG_READ_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
||||
@ -1696,6 +1822,7 @@ static inline int arch_cpu_msr(uint32_t sys_reg, uint64_t val)
|
||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE5_EL0);
|
||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE6_EL0);
|
||||
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE7_EL0);
|
||||
SYSREG_WRITE_S(IMP_PF_PMUSERENR_EL0);
|
||||
SYSREG_WRITE_S(IMP_BARRIER_CTRL_EL1);
|
||||
SYSREG_WRITE_S(IMP_BARRIER_BST_BIT_EL1);
|
||||
SYSREG_WRITE_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
|
||||
@ -1753,6 +1880,11 @@ int arch_cpu_read_write_register(
|
||||
ret = -1;
|
||||
}
|
||||
|
||||
dkprintf("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
|
||||
__FUNCTION__,
|
||||
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
|
||||
desc->addr, desc->val);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1762,4 +1894,9 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
|
||||
return -1;
|
||||
}
|
||||
|
||||
void arch_flush_icache_all(void)
|
||||
{
|
||||
asm("ic ialluis");
|
||||
dsb(ish);
|
||||
}
|
||||
/*** end of file ***/
|
||||
|
||||
@ -970,7 +970,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SVE),
|
||||
#endif
|
||||
{},
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
/* @ref.impl arch/arm64/kernel/cpufeature.c */
|
||||
|
||||
@ -10,5 +10,5 @@ struct cpu_info cpu_table[] = {
|
||||
.cpu_name = "AArch64 Processor",
|
||||
.cpu_setup = __cpu_setup,
|
||||
},
|
||||
{ /* Empty */ },
|
||||
{ 0 },
|
||||
};
|
||||
|
||||
@ -2,7 +2,6 @@
|
||||
#include <cputype.h>
|
||||
#include <irqflags.h>
|
||||
#include <ihk/context.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <signal.h>
|
||||
#include <errno.h>
|
||||
#include <debug-monitors.h>
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
|
||||
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||
#include <thread_info.h>
|
||||
#include <fpsimd.h>
|
||||
#include <cpuinfo.h>
|
||||
@ -9,8 +9,9 @@
|
||||
#include <prctl.h>
|
||||
#include <cpufeature.h>
|
||||
#include <kmalloc.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <process.h>
|
||||
#include <bitmap.h>
|
||||
|
||||
//#define DEBUG_PRINT_FPSIMD
|
||||
|
||||
@ -21,11 +22,87 @@
|
||||
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
|
||||
/* Set of available vector lengths, as vq_to_bit(vq): */
|
||||
static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
|
||||
|
||||
/* Maximum supported vector length across all CPUs (initially poisoned) */
|
||||
int sve_max_vl = -1;
|
||||
|
||||
/* Default VL for tasks that don't set it explicitly: */
|
||||
int sve_default_vl = -1;
|
||||
|
||||
/*
|
||||
* Helpers to translate bit indices in sve_vq_map to VQ values (and
|
||||
* vice versa). This allows find_next_bit() to be used to find the
|
||||
* _maximum_ VQ not exceeding a certain value.
|
||||
*/
|
||||
|
||||
static unsigned int vq_to_bit(unsigned int vq)
|
||||
{
|
||||
return SVE_VQ_MAX - vq;
|
||||
}
|
||||
|
||||
static unsigned int bit_to_vq(unsigned int bit)
|
||||
{
|
||||
if (bit >= SVE_VQ_MAX) {
|
||||
bit = SVE_VQ_MAX - 1;
|
||||
}
|
||||
return SVE_VQ_MAX - bit;
|
||||
}
|
||||
|
||||
/*
|
||||
* All vector length selection from userspace comes through here.
|
||||
* We're on a slow path, so some sanity-checks are included.
|
||||
* If things go wrong there's a bug somewhere, but try to fall back to a
|
||||
* safe choice.
|
||||
*/
|
||||
static unsigned int find_supported_vector_length(unsigned int vl)
|
||||
{
|
||||
int bit;
|
||||
int max_vl = sve_max_vl;
|
||||
|
||||
if (!sve_vl_valid(vl)) {
|
||||
vl = SVE_VL_MIN;
|
||||
}
|
||||
|
||||
if (!sve_vl_valid(max_vl)) {
|
||||
max_vl = SVE_VL_MIN;
|
||||
}
|
||||
|
||||
if (vl > max_vl) {
|
||||
vl = max_vl;
|
||||
}
|
||||
|
||||
bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
|
||||
vq_to_bit(sve_vq_from_vl(vl)));
|
||||
return sve_vl_from_vq(bit_to_vq(bit));
|
||||
}
|
||||
|
||||
static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
|
||||
{
|
||||
unsigned int vq, vl;
|
||||
unsigned long zcr;
|
||||
|
||||
bitmap_zero(map, SVE_VQ_MAX);
|
||||
|
||||
zcr = ZCR_EL1_LEN_MASK;
|
||||
zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr;
|
||||
|
||||
for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
|
||||
/* self-syncing */
|
||||
write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1);
|
||||
vl = sve_get_vl();
|
||||
/* skip intervening lengths */
|
||||
vq = sve_vq_from_vl(vl);
|
||||
set_bit(vq_to_bit(vq), map);
|
||||
}
|
||||
}
|
||||
|
||||
void sve_init_vq_map(void)
|
||||
{
|
||||
sve_probe_vqs(sve_vq_map);
|
||||
}
|
||||
|
||||
size_t sve_state_size(struct thread const *thread)
|
||||
{
|
||||
unsigned int vl = thread->ctx.thread->sve_vl;
|
||||
@ -42,17 +119,19 @@ void sve_free(struct thread *thread)
|
||||
}
|
||||
}
|
||||
|
||||
void sve_alloc(struct thread *thread)
|
||||
int sve_alloc(struct thread *thread)
|
||||
{
|
||||
if (thread->ctx.thread->sve_state) {
|
||||
return;
|
||||
return 0;
|
||||
}
|
||||
|
||||
thread->ctx.thread->sve_state =
|
||||
kmalloc(sve_state_size(thread), IHK_MC_AP_NOWAIT);
|
||||
BUG_ON(!thread->ctx.thread->sve_state);
|
||||
|
||||
if (thread->ctx.thread->sve_state == NULL) {
|
||||
return -ENOMEM;
|
||||
}
|
||||
memset(thread->ctx.thread->sve_state, 0, sve_state_size(thread));
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int get_nr_threads(struct process *proc)
|
||||
@ -75,19 +154,7 @@ int sve_set_vector_length(struct thread *thread,
|
||||
{
|
||||
struct thread_info *ti = thread->ctx.thread;
|
||||
|
||||
BUG_ON(thread == cpu_local_var(current) && cpu_local_var(no_preempt) == 0);
|
||||
|
||||
/*
|
||||
* To avoid accidents, forbid setting for individual threads of a
|
||||
* multithreaded process. User code that knows what it's doing can
|
||||
* pass PR_SVE_SET_VL_THREAD to override this restriction:
|
||||
*/
|
||||
if (!(flags & PR_SVE_SET_VL_THREAD) && get_nr_threads(thread->proc) != 1) {
|
||||
return -EINVAL;
|
||||
}
|
||||
flags &= ~(unsigned long)PR_SVE_SET_VL_THREAD;
|
||||
|
||||
if (flags & ~(unsigned long)(PR_SVE_SET_VL_INHERIT |
|
||||
if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
|
||||
PR_SVE_SET_VL_ONEXEC)) {
|
||||
return -EINVAL;
|
||||
}
|
||||
@ -96,13 +163,19 @@ int sve_set_vector_length(struct thread *thread,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (vl > sve_max_vl) {
|
||||
BUG_ON(!sve_vl_valid(sve_max_vl));
|
||||
vl = sve_max_vl;
|
||||
/*
|
||||
* Clamp to the maximum vector length that VL-agnostic SVE code can
|
||||
* work with. A flag may be assigned in the future to allow setting
|
||||
* of larger vector lengths without confusing older software.
|
||||
*/
|
||||
if (vl > SVE_VL_ARCH_MAX) {
|
||||
vl = SVE_VL_ARCH_MAX;
|
||||
}
|
||||
|
||||
if (flags & (PR_SVE_SET_VL_ONEXEC |
|
||||
PR_SVE_SET_VL_INHERIT)) {
|
||||
vl = find_supported_vector_length(vl);
|
||||
|
||||
if (flags & (PR_SVE_VL_INHERIT |
|
||||
PR_SVE_SET_VL_ONEXEC)) {
|
||||
ti->sve_vl_onexec = vl;
|
||||
} else {
|
||||
/* Reset VL to system default on next exec: */
|
||||
@ -114,39 +187,42 @@ int sve_set_vector_length(struct thread *thread,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (vl != ti->sve_vl) {
|
||||
if ((elf_hwcap & HWCAP_SVE)) {
|
||||
fp_regs_struct fp_regs;
|
||||
memset(&fp_regs, 0, sizeof(fp_regs));
|
||||
if (vl == ti->sve_vl) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* for self at prctl syscall */
|
||||
if (thread == cpu_local_var(current)) {
|
||||
save_fp_regs(thread);
|
||||
clear_fp_regs();
|
||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||
sve_free(thread);
|
||||
if ((elf_hwcap & HWCAP_SVE)) {
|
||||
fp_regs_struct fp_regs;
|
||||
|
||||
ti->sve_vl = vl;
|
||||
memset(&fp_regs, 0, sizeof(fp_regs));
|
||||
|
||||
sve_alloc(thread);
|
||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||
restore_fp_regs(thread);
|
||||
/* for target thread at ptrace */
|
||||
} else {
|
||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||
sve_free(thread);
|
||||
/* for self at prctl syscall */
|
||||
if (thread == cpu_local_var(current)) {
|
||||
save_fp_regs(thread);
|
||||
clear_fp_regs();
|
||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||
sve_free(thread);
|
||||
|
||||
ti->sve_vl = vl;
|
||||
ti->sve_vl = vl;
|
||||
|
||||
sve_alloc(thread);
|
||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||
}
|
||||
sve_alloc(thread);
|
||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||
restore_fp_regs(thread);
|
||||
/* for target thread at ptrace */
|
||||
} else {
|
||||
thread_sve_to_fpsimd(thread, &fp_regs);
|
||||
sve_free(thread);
|
||||
|
||||
ti->sve_vl = vl;
|
||||
|
||||
sve_alloc(thread);
|
||||
thread_fpsimd_to_sve(thread, &fp_regs);
|
||||
}
|
||||
}
|
||||
ti->sve_vl = vl;
|
||||
|
||||
out:
|
||||
ti->sve_flags = flags & PR_SVE_SET_VL_INHERIT;
|
||||
ti->sve_flags = flags & PR_SVE_VL_INHERIT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -156,44 +232,53 @@ out:
|
||||
* Encode the current vector length and flags for return.
|
||||
* This is only required for prctl(): ptrace has separate fields
|
||||
*/
|
||||
static int sve_prctl_status(const struct thread_info *ti)
|
||||
static int sve_prctl_status(unsigned long flags)
|
||||
{
|
||||
int ret = ti->sve_vl;
|
||||
int ret;
|
||||
struct thread_info *ti = cpu_local_var(current)->ctx.thread;
|
||||
|
||||
ret |= ti->sve_flags << 16;
|
||||
if (flags & PR_SVE_SET_VL_ONEXEC) {
|
||||
ret = ti->sve_vl_onexec;
|
||||
}
|
||||
else {
|
||||
ret = ti->sve_vl;
|
||||
}
|
||||
|
||||
if (ti->sve_flags & PR_SVE_VL_INHERIT) {
|
||||
ret |= PR_SVE_VL_INHERIT;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_set_task_vl */
|
||||
int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length,
|
||||
const unsigned long flags)
|
||||
int sve_set_thread_vl(unsigned long arg)
|
||||
{
|
||||
unsigned long vl, flags;
|
||||
int ret;
|
||||
|
||||
if (!(elf_hwcap & HWCAP_SVE)) {
|
||||
vl = arg & PR_SVE_VL_LEN_MASK;
|
||||
flags = arg & ~vl;
|
||||
|
||||
/* Instead of system_supports_sve() */
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
BUG_ON(thread != cpu_local_var(current));
|
||||
|
||||
preempt_disable();
|
||||
ret = sve_set_vector_length(thread, vector_length, flags);
|
||||
preempt_enable();
|
||||
|
||||
ret = sve_set_vector_length(cpu_local_var(current), vl, flags);
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
return sve_prctl_status(thread->ctx.thread);
|
||||
return sve_prctl_status(flags);
|
||||
}
|
||||
|
||||
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_get_ti_vl */
|
||||
int sve_get_thread_vl(const struct thread *thread)
|
||||
int sve_get_thread_vl(void)
|
||||
{
|
||||
if (!(elf_hwcap & HWCAP_SVE)) {
|
||||
/* Instead of system_supports_sve() */
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return -EINVAL;
|
||||
}
|
||||
return sve_prctl_status(thread->ctx.thread);
|
||||
return sve_prctl_status(0);
|
||||
}
|
||||
|
||||
void do_sve_acc(unsigned int esr, struct pt_regs *regs)
|
||||
@ -203,25 +288,48 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
|
||||
panic("");
|
||||
}
|
||||
|
||||
void init_sve_vl(void)
|
||||
void sve_setup(void)
|
||||
{
|
||||
extern unsigned long ihk_param_default_vl;
|
||||
uint64_t zcr;
|
||||
|
||||
/* Instead of system_supports_sve() */
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return;
|
||||
}
|
||||
|
||||
zcr = read_system_reg(SYS_ZCR_EL1);
|
||||
BUG_ON(((zcr & ZCR_EL1_LEN_MASK) + 1) * 16 > sve_max_vl);
|
||||
/* init sve_vq_map bitmap */
|
||||
sve_init_vq_map();
|
||||
|
||||
/*
|
||||
* The SVE architecture mandates support for 128-bit vectors,
|
||||
* so sve_vq_map must have at least SVE_VQ_MIN set.
|
||||
* If something went wrong, at least try to patch it up:
|
||||
*/
|
||||
if (!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map)) {
|
||||
set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map);
|
||||
}
|
||||
|
||||
zcr = read_system_reg(SYS_ZCR_EL1);
|
||||
sve_max_vl = sve_vl_from_vq((zcr & ZCR_EL1_LEN_MASK) + 1);
|
||||
|
||||
/*
|
||||
* Sanity-check that the max VL we determined through CPU features
|
||||
* corresponds properly to sve_vq_map. If not, do our best:
|
||||
*/
|
||||
if (sve_max_vl != find_supported_vector_length(sve_max_vl)) {
|
||||
sve_max_vl = find_supported_vector_length(sve_max_vl);
|
||||
}
|
||||
|
||||
sve_max_vl = ((zcr & ZCR_EL1_LEN_MASK) + 1) * 16;
|
||||
sve_default_vl = ihk_param_default_vl;
|
||||
|
||||
if (sve_default_vl == 0) {
|
||||
kprintf("SVE: Getting default VL = 0 from HOST-Linux.\n");
|
||||
sve_default_vl = sve_max_vl > 64 ? 64 : sve_max_vl;
|
||||
kprintf("SVE: Using default vl(%d byte).\n", sve_default_vl);
|
||||
if (ihk_param_default_vl !=
|
||||
find_supported_vector_length(ihk_param_default_vl)) {
|
||||
kprintf("SVE: Getting unsupported default VL = %d "
|
||||
"from HOST-Linux.\n", sve_default_vl);
|
||||
sve_default_vl = find_supported_vector_length(64);
|
||||
kprintf("SVE: Using default vl(%d byte).\n",
|
||||
sve_default_vl);
|
||||
}
|
||||
|
||||
kprintf("SVE: maximum available vector length %u bytes per vector\n",
|
||||
@ -232,7 +340,7 @@ void init_sve_vl(void)
|
||||
|
||||
#else /* CONFIG_ARM64_SVE */
|
||||
|
||||
void init_sve_vl(void)
|
||||
void sve_setup(void)
|
||||
{
|
||||
/* nothing to do. */
|
||||
}
|
||||
|
||||
@ -10,7 +10,7 @@
|
||||
#include <smp.h>
|
||||
#include <arm-gic-v3.h>
|
||||
|
||||
#define KERNEL_RAM_VADDR MAP_KERNEL_START
|
||||
/* KERNEL_RAM_VADDR is defined by cmake */
|
||||
|
||||
//#ifndef CONFIG_SMP
|
||||
//# define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF
|
||||
|
||||
@ -255,90 +255,6 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
|
||||
cpu_restore_interrupt(flags);
|
||||
}
|
||||
|
||||
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
|
||||
typedef struct mcs_lock_node {
|
||||
unsigned long locked;
|
||||
struct mcs_lock_node *next;
|
||||
unsigned long irqsave;
|
||||
#ifndef ENABLE_UBSAN
|
||||
} __aligned(64) mcs_lock_node_t;
|
||||
#else
|
||||
} mcs_lock_node_t;
|
||||
#endif
|
||||
|
||||
typedef mcs_lock_node_t mcs_lock_t;
|
||||
|
||||
static void mcs_lock_init(struct mcs_lock_node *node)
|
||||
{
|
||||
node->locked = 0;
|
||||
node->next = NULL;
|
||||
}
|
||||
|
||||
static void __mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
struct mcs_lock_node *pred;
|
||||
|
||||
node->next = NULL;
|
||||
node->locked = 0;
|
||||
pred = xchg8(&(lock->next), node);
|
||||
|
||||
if (pred) {
|
||||
node->locked = 1;
|
||||
pred->next = node;
|
||||
while (node->locked != 0) {
|
||||
cpu_pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
if (node->next == NULL) {
|
||||
struct mcs_lock_node *old = atomic_cmpxchg8(&(lock->next), node, 0);
|
||||
|
||||
if (old == node) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (node->next == NULL) {
|
||||
cpu_pause();
|
||||
}
|
||||
}
|
||||
|
||||
node->next->locked = 0;
|
||||
}
|
||||
|
||||
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
preempt_disable();
|
||||
__mcs_lock_lock(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
__mcs_lock_unlock(lock, node);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
node->irqsave = cpu_disable_interrupt_save();
|
||||
mcs_lock_lock_noirq(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
mcs_lock_unlock_noirq(lock, node);
|
||||
cpu_restore_interrupt(node->irqsave);
|
||||
}
|
||||
|
||||
|
||||
#define SPINLOCK_IN_MCS_RWLOCK
|
||||
|
||||
// reader/writer lock
|
||||
@ -743,5 +659,102 @@ static inline int irqflags_can_interrupt(unsigned long flags)
|
||||
}
|
||||
#endif /* CONFIG_HAS_NMI */
|
||||
|
||||
struct ihk_rwlock {
|
||||
unsigned int lock;
|
||||
};
|
||||
|
||||
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
|
||||
{
|
||||
rw->lock = 0;
|
||||
}
|
||||
|
||||
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
unsigned int tmp, tmp2;
|
||||
|
||||
asm volatile(
|
||||
" sevl\n"
|
||||
"1: wfe\n"
|
||||
"2: ldaxr %w0, %2\n"
|
||||
" add %w0, %w0, #1\n"
|
||||
" tbnz %w0, #31, 1b\n"
|
||||
" stxr %w1, %w0, %2\n"
|
||||
" cbnz %w1, 2b\n"
|
||||
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
|
||||
:
|
||||
: "cc", "memory");
|
||||
}
|
||||
|
||||
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
|
||||
{
|
||||
unsigned int tmp, tmp2 = 1;
|
||||
|
||||
asm volatile(
|
||||
" ldaxr %w0, %2\n"
|
||||
" add %w0, %w0, #1\n"
|
||||
" tbnz %w0, #31, 1f\n"
|
||||
" stxr %w1, %w0, %2\n"
|
||||
"1:\n"
|
||||
: "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
|
||||
:
|
||||
: "cc", "memory");
|
||||
|
||||
return !tmp2;
|
||||
}
|
||||
|
||||
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
|
||||
{
|
||||
unsigned int tmp, tmp2;
|
||||
|
||||
asm volatile(
|
||||
"1: ldxr %w0, %2\n"
|
||||
" sub %w0, %w0, #1\n"
|
||||
" stlxr %w1, %w0, %2\n"
|
||||
" cbnz %w1, 1b\n"
|
||||
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
|
||||
:
|
||||
: "cc", "memory");
|
||||
}
|
||||
|
||||
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
unsigned int tmp;
|
||||
|
||||
asm volatile(
|
||||
" sevl\n"
|
||||
"1: wfe\n"
|
||||
"2: ldaxr %w0, %1\n"
|
||||
" cbnz %w0, 1b\n"
|
||||
" stxr %w0, %w2, %1\n"
|
||||
" cbnz %w0, 2b\n"
|
||||
: "=&r" (tmp), "+Q" (rw->lock)
|
||||
: "r" (0x80000000)
|
||||
: "cc", "memory");
|
||||
}
|
||||
|
||||
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
|
||||
{
|
||||
unsigned int tmp;
|
||||
|
||||
asm volatile(
|
||||
" ldaxr %w0, %1\n"
|
||||
" cbnz %w0, 1f\n"
|
||||
" stxr %w0, %w2, %1\n"
|
||||
"1:\n"
|
||||
: "=&r" (tmp), "+Q" (rw->lock)
|
||||
: "r" (0x80000000)
|
||||
: "cc", "memory");
|
||||
|
||||
return !tmp;
|
||||
}
|
||||
|
||||
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
|
||||
{
|
||||
asm volatile(
|
||||
" stlr %w1, %0\n"
|
||||
: "=Q" (rw->lock) : "r" (0) : "memory");
|
||||
}
|
||||
|
||||
#define ihk_mc_read_can_lock(rw) ((rw)->lock < 0x80000000)
|
||||
#define ihk_mc_write_can_lock(rw) ((rw)->lock == 0)
|
||||
#endif /* !__HEADER_ARM64_COMMON_ARCH_LOCK_H */
|
||||
|
||||
@ -34,7 +34,7 @@ void panic(const char *);
|
||||
*/
|
||||
/* early alloc area address */
|
||||
/* START:_end, SIZE:512 pages */
|
||||
#define MAP_EARLY_ALLOC_SHIFT 9
|
||||
#define MAP_EARLY_ALLOC_SHIFT 5
|
||||
#define MAP_EARLY_ALLOC_SIZE (UL(1) << (PAGE_SHIFT + MAP_EARLY_ALLOC_SHIFT))
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
@ -55,7 +55,11 @@ extern char _end[];
|
||||
# define MAP_BOOT_PARAM_END (MAP_BOOT_PARAM + MAP_BOOT_PARAM_SIZE)
|
||||
#endif /* !__ASSEMBLY__ */
|
||||
|
||||
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB)
|
||||
/*
|
||||
* MAP_KERNEL_START is HOST MODULES_END - 8MiB.
|
||||
* It's defined by cmake.
|
||||
*/
|
||||
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=1 */
|
||||
#
|
||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000000400000000)
|
||||
# define TASK_UNMAPPED_BASE UL(0x0000000800000000)
|
||||
@ -64,9 +68,8 @@ extern char _end[];
|
||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||
# define MAP_FIXED_START UL(0xffffffbffbdfd000)
|
||||
# define MAP_ST_START UL(0xffffffc000000000)
|
||||
# define MAP_KERNEL_START UL(0xffffffffff800000)
|
||||
#
|
||||
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB)
|
||||
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=3 */
|
||||
#
|
||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000002000000000)
|
||||
# define TASK_UNMAPPED_BASE UL(0x0000004000000000)
|
||||
@ -75,9 +78,8 @@ extern char _end[];
|
||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||
# define MAP_FIXED_START UL(0xfffffdfffbdd0000)
|
||||
# define MAP_ST_START UL(0xfffffe0000000000)
|
||||
# define MAP_KERNEL_START UL(0xffffffffe0000000)
|
||||
#
|
||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB)
|
||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=2 */
|
||||
#
|
||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
||||
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
||||
@ -86,9 +88,8 @@ extern char _end[];
|
||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||
# define MAP_FIXED_START UL(0xffff7ffffbdfd000)
|
||||
# define MAP_ST_START UL(0xffff800000000000)
|
||||
# define MAP_KERNEL_START UL(0xffffffffff800000)
|
||||
#
|
||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB)
|
||||
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=4 */
|
||||
#
|
||||
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
|
||||
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
|
||||
@ -97,7 +98,6 @@ extern char _end[];
|
||||
# define MAP_VMAP_SIZE UL(0x0000000100000000)
|
||||
# define MAP_FIXED_START UL(0xffff7ffffbdd0000)
|
||||
# define MAP_ST_START UL(0xffff800000000000)
|
||||
# define MAP_KERNEL_START UL(0xffffffffe0000000)
|
||||
#
|
||||
#else
|
||||
# error address space is not defined.
|
||||
@ -583,6 +583,40 @@ static inline int pgsize_to_tbllv(size_t pgsize)
|
||||
return level;
|
||||
}
|
||||
|
||||
static inline int pgsize_to_pgshift(size_t pgsize)
|
||||
{
|
||||
/* We need to use if instead of switch because
|
||||
* sometimes PTLX_CONT_SIZE == PTLX_SIZE
|
||||
*/
|
||||
if (pgsize == PTL4_CONT_SIZE) {
|
||||
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
|
||||
return PTL4_CONT_SHIFT;
|
||||
}
|
||||
} else if (pgsize == PTL4_SIZE) {
|
||||
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
|
||||
return PTL4_SHIFT;
|
||||
}
|
||||
} else if (pgsize == PTL3_CONT_SIZE) {
|
||||
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
|
||||
return PTL3_CONT_SHIFT;
|
||||
}
|
||||
} else if (pgsize == PTL3_SIZE) {
|
||||
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
|
||||
return PTL3_SHIFT;
|
||||
}
|
||||
} else if (pgsize == PTL2_CONT_SIZE) {
|
||||
return PTL2_CONT_SHIFT;
|
||||
} else if (pgsize == PTL2_SIZE) {
|
||||
return PTL2_SHIFT;
|
||||
} else if (pgsize == PTL1_CONT_SIZE) {
|
||||
return PTL1_CONT_SHIFT;
|
||||
} else if (pgsize == PTL1_SIZE) {
|
||||
return PTL1_SHIFT;
|
||||
}
|
||||
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static inline size_t tbllv_to_pgsize(int level)
|
||||
{
|
||||
size_t pgsize = 0;
|
||||
|
||||
@ -20,17 +20,21 @@ struct arm_pmu {
|
||||
void (*reset)(void*);
|
||||
int (*enable_pmu)(void);
|
||||
void (*disable_pmu)(void);
|
||||
int (*enable_counter)(int);
|
||||
int (*disable_counter)(int);
|
||||
int (*enable_intens)(int);
|
||||
int (*disable_intens)(int);
|
||||
int (*enable_counter)(unsigned long counter_mask);
|
||||
int (*disable_counter)(unsigned long counter_mask);
|
||||
int (*enable_intens)(unsigned long counter_mask);
|
||||
int (*disable_intens)(unsigned long counter_mask);
|
||||
int (*set_event_filter)(unsigned long*, int);
|
||||
void (*write_evtype)(int, uint32_t);
|
||||
int (*get_event_idx)(int num_events, unsigned long used_mask,
|
||||
unsigned long config);
|
||||
int (*map_event)(uint32_t, uint64_t);
|
||||
int (*map_hw_event)(uint64_t config);
|
||||
int (*map_cache_event)(uint64_t config);
|
||||
int (*map_raw_event)(uint64_t config);
|
||||
void (*enable_user_access_pmu_regs)(void);
|
||||
void (*disable_user_access_pmu_regs)(void);
|
||||
int (*counter_mask_valid)(unsigned long counter_mask);
|
||||
struct per_cpu_arm_pmu *per_cpu;
|
||||
};
|
||||
|
||||
|
||||
@ -102,4 +102,6 @@ static inline void cpu_disable_nmi(void)
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
void arch_flush_icache_all(void);
|
||||
|
||||
#endif /* !__HEADER_ARM64_ARCH_CPU_H */
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
||||
#define ARCH_RUSAGE_H_INCLUDED
|
||||
|
||||
#define DEBUG_RUSAGE
|
||||
|
||||
#define IHK_OS_PGSIZE_4KB 0
|
||||
#define IHK_OS_PGSIZE_2MB 1
|
||||
#define IHK_OS_PGSIZE_1GB 2
|
||||
|
||||
extern struct ihk_os_monitor *monitor;
|
||||
|
||||
extern int sprintf(char * buf, const char *fmt, ...);
|
||||
|
||||
#define DEBUG_ARCH_RUSAGE
|
||||
#ifdef DEBUG_ARCH_RUSAGE
|
||||
#define dprintf(...) \
|
||||
do { \
|
||||
char msg[1024]; \
|
||||
sprintf(msg, __VA_ARGS__); \
|
||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
||||
} while (0);
|
||||
#define eprintf(...) \
|
||||
do { \
|
||||
char msg[1024]; \
|
||||
sprintf(msg, __VA_ARGS__); \
|
||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
||||
} while (0);
|
||||
#else
|
||||
#define dprintf(...) do { } while (0)
|
||||
#define eprintf(...) \
|
||||
do { \
|
||||
char msg[1024]; \
|
||||
sprintf(msg, __VA_ARGS__); \
|
||||
kprintf("%s,%s", __FUNCTION__, msg); \
|
||||
} while (0);
|
||||
#endif
|
||||
|
||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
||||
{
|
||||
int ret = IHK_OS_PGSIZE_4KB;
|
||||
#if 0 /* postk-TODO */
|
||||
switch (pgsize) {
|
||||
case PTL1_SIZE:
|
||||
ret = IHK_OS_PGSIZE_4KB;
|
||||
break;
|
||||
case PTL2_SIZE:
|
||||
ret = IHK_OS_PGSIZE_2MB;
|
||||
break;
|
||||
case PTL3_SIZE:
|
||||
ret = IHK_OS_PGSIZE_1GB;
|
||||
break;
|
||||
default:
|
||||
eprintf("unknown pgsize=%ld\n", pgsize);
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
||||
@ -1,33 +0,0 @@
|
||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
||||
#define ARCH_RUSAGE_H_INCLUDED
|
||||
|
||||
#include <arch-memory.h>
|
||||
|
||||
#define DEBUG_RUSAGE
|
||||
|
||||
#define IHK_OS_PGSIZE_4KB 0
|
||||
#define IHK_OS_PGSIZE_2MB 1
|
||||
#define IHK_OS_PGSIZE_1GB 2
|
||||
|
||||
extern struct rusage_global rusage;
|
||||
|
||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
||||
{
|
||||
int ret = IHK_OS_PGSIZE_4KB;
|
||||
|
||||
if (pgsize == PTL1_SIZE) {
|
||||
ret = IHK_OS_PGSIZE_4KB;
|
||||
}
|
||||
else if (pgsize == PTL2_SIZE) {
|
||||
ret = IHK_OS_PGSIZE_2MB;
|
||||
}
|
||||
else if (pgsize == PTL3_SIZE) {
|
||||
ret = IHK_OS_PGSIZE_1GB;
|
||||
}
|
||||
else {
|
||||
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
||||
@ -8,6 +8,7 @@
|
||||
#define SYSCALL_HANDLED(number, name) DECLARATOR(number, name)
|
||||
#define SYSCALL_DELEGATED(number, name) DECLARATOR(number, name)
|
||||
|
||||
#include <config.h>
|
||||
#include <syscall_list.h>
|
||||
|
||||
#undef DECLARATOR
|
||||
|
||||
@ -67,21 +67,12 @@ struct arm64_cpu_capabilities {
|
||||
int def_scope;/* default scope */
|
||||
int (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
|
||||
int (*enable)(void *);/* Called on all active CPUs */
|
||||
union {
|
||||
struct {/* To be used for erratum handling only */
|
||||
uint32_t midr_model;
|
||||
uint32_t midr_range_min, midr_range_max;
|
||||
};
|
||||
|
||||
struct {/* Feature register checking */
|
||||
uint32_t sys_reg;
|
||||
uint8_t field_pos;
|
||||
uint8_t min_field_value;
|
||||
uint8_t hwcap_type;
|
||||
int sign;
|
||||
unsigned long hwcap;
|
||||
};
|
||||
};
|
||||
uint32_t sys_reg;
|
||||
uint8_t field_pos;
|
||||
uint8_t min_field_value;
|
||||
uint8_t hwcap_type;
|
||||
int sign;
|
||||
unsigned long hwcap;
|
||||
};
|
||||
|
||||
/* @ref.impl include/linux/bitops.h */
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2017 */
|
||||
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||
#ifndef __HEADER_ARM64_COMMON_FPSIMD_H
|
||||
#define __HEADER_ARM64_COMMON_FPSIMD_H
|
||||
|
||||
@ -42,16 +42,19 @@ extern void thread_sve_to_fpsimd(struct thread *thread, fp_regs_struct *fp_regs)
|
||||
|
||||
extern size_t sve_state_size(struct thread const *thread);
|
||||
extern void sve_free(struct thread *thread);
|
||||
extern void sve_alloc(struct thread *thread);
|
||||
extern int sve_alloc(struct thread *thread);
|
||||
extern void sve_save_state(void *state, unsigned int *pfpsr);
|
||||
extern void sve_load_state(void const *state, unsigned int const *pfpsr, unsigned long vq_minus_1);
|
||||
extern unsigned int sve_get_vl(void);
|
||||
extern int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length, const unsigned long flags);
|
||||
extern int sve_get_thread_vl(const struct thread *thread);
|
||||
extern int sve_set_thread_vl(unsigned long arg);
|
||||
extern int sve_get_thread_vl(void);
|
||||
extern int sve_set_vector_length(struct thread *thread, unsigned long vl, unsigned long flags);
|
||||
|
||||
#define SVE_SET_VL(thread, vector_length, flags) sve_set_thread_vl(thread, vector_length, flags)
|
||||
#define SVE_GET_VL(thread) sve_get_thread_vl(thread)
|
||||
#define SVE_SET_VL(arg) sve_set_thread_vl(arg)
|
||||
#define SVE_GET_VL() sve_get_thread_vl()
|
||||
|
||||
/* Maximum VL that SVE VL-agnostic software can transparently support */
|
||||
#define SVE_VL_ARCH_MAX 0x100
|
||||
|
||||
#else /* CONFIG_ARM64_SVE */
|
||||
|
||||
@ -80,12 +83,12 @@ static int sve_set_vector_length(struct thread *thread, unsigned long vl, unsign
|
||||
}
|
||||
|
||||
/* for prctl syscall */
|
||||
#define SVE_SET_VL(a,b,c) (-EINVAL)
|
||||
#define SVE_GET_VL(a) (-EINVAL)
|
||||
#define SVE_SET_VL(a) (-EINVAL)
|
||||
#define SVE_GET_VL() (-EINVAL)
|
||||
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
|
||||
extern void init_sve_vl(void);
|
||||
extern void sve_setup(void);
|
||||
extern void fpsimd_save_state(struct fpsimd_state *state);
|
||||
extern void fpsimd_load_state(struct fpsimd_state *state);
|
||||
extern void thread_fpsimd_save(struct thread *thread);
|
||||
|
||||
@ -124,7 +124,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
|
||||
return *(volatile long *)&(v)->counter64;
|
||||
}
|
||||
|
||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
|
||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
|
||||
{
|
||||
v->counter64 = i;
|
||||
}
|
||||
@ -147,6 +147,8 @@ static inline void ihk_atomic64_add(long i, ihk_atomic64_t *v)
|
||||
/* @ref.impl arch/arm64/include/asm/atomic.h::atomic64_inc */
|
||||
#define ihk_atomic64_inc(v) ihk_atomic64_add(1LL, (v))
|
||||
|
||||
#define ihk_atomic64_cmpxchg(p, o, n) cmpxchg(&((p)->counter64), o, n)
|
||||
|
||||
/***********************************************************************
|
||||
* others
|
||||
*/
|
||||
|
||||
@ -29,6 +29,7 @@
|
||||
#define IMP_PF_INJECTION_DISTANCE5_EL0 sys_reg(3, 3, 11, 7, 5)
|
||||
#define IMP_PF_INJECTION_DISTANCE6_EL0 sys_reg(3, 3, 11, 7, 6)
|
||||
#define IMP_PF_INJECTION_DISTANCE7_EL0 sys_reg(3, 3, 11, 7, 7)
|
||||
#define IMP_PF_PMUSERENR_EL0 sys_reg(3, 3, 9, 14, 0)
|
||||
#define IMP_BARRIER_CTRL_EL1 sys_reg(3, 0, 11, 12, 0)
|
||||
#define IMP_BARRIER_BST_BIT_EL1 sys_reg(3, 0, 11, 12, 4)
|
||||
#define IMP_BARRIER_INIT_SYNC_BB0_EL1 sys_reg(3, 0, 15, 13, 0)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
|
||||
#ifndef __HEADER_ARM64_IRQ_H
|
||||
#define __HEADER_ARM64_IRQ_H
|
||||
@ -14,7 +14,8 @@
|
||||
#define INTRID_QUERY_FREE_MEM 2
|
||||
#define INTRID_CPU_STOP 3
|
||||
#define INTRID_TLB_FLUSH 4
|
||||
#define INTRID_STACK_TRACE 6
|
||||
#define INTRID_STACK_TRACE 5
|
||||
#define INTRID_MULTI_INTR 6
|
||||
#define INTRID_MULTI_NMI 7
|
||||
|
||||
/* use PPI interrupt number */
|
||||
@ -29,6 +30,7 @@ extern void gic_dist_init_gicv2(unsigned long dist_base_pa, unsigned long size);
|
||||
extern void gic_cpu_init_gicv2(unsigned long cpu_base_pa, unsigned long size);
|
||||
extern void gic_enable_gicv2(void);
|
||||
extern void arm64_issue_ipi_gicv2(unsigned int cpuid, unsigned int vector);
|
||||
extern void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector);
|
||||
extern void handle_interrupt_gicv2(struct pt_regs *regs);
|
||||
|
||||
/* Functions for GICv3 */
|
||||
@ -36,6 +38,7 @@ extern void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size);
|
||||
extern void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size);
|
||||
extern void gic_enable_gicv3(void);
|
||||
extern void arm64_issue_ipi_gicv3(unsigned int cpuid, unsigned int vector);
|
||||
extern void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector);
|
||||
extern void handle_interrupt_gicv3(struct pt_regs *regs);
|
||||
|
||||
void handle_IPI(unsigned int vector, struct pt_regs *regs);
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017 */
|
||||
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017-2019 */
|
||||
#ifndef __HEADER_ARM64_COMMON_PRCTL_H
|
||||
#define __HEADER_ARM64_COMMON_PRCTL_H
|
||||
|
||||
@ -6,15 +6,12 @@
|
||||
#define PR_GET_THP_DISABLE 42
|
||||
|
||||
/* arm64 Scalable Vector Extension controls */
|
||||
#define PR_SVE_SET_VL 48 /* set task vector length */
|
||||
#define PR_SVE_SET_VL_THREAD (1 << 1) /* set just this thread */
|
||||
#define PR_SVE_SET_VL_INHERIT (1 << 2) /* inherit across exec */
|
||||
#define PR_SVE_SET_VL_ONEXEC (1 << 3) /* defer effect until exec */
|
||||
|
||||
#define PR_SVE_GET_VL 49 /* get task vector length */
|
||||
/* Decode helpers for the return value from PR_SVE_GET_VL: */
|
||||
#define PR_SVE_GET_VL_LEN(ret) ((ret) & 0x3fff) /* vector length */
|
||||
#define PR_SVE_GET_VL_INHERIT (PR_SVE_SET_VL_INHERIT << 16)
|
||||
/* For conveinence, PR_SVE_SET_VL returns the result in the same encoding */
|
||||
/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
|
||||
#define PR_SVE_SET_VL 50 /* set task vector length */
|
||||
# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */
|
||||
#define PR_SVE_GET_VL 51 /* get task vector length */
|
||||
/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
|
||||
# define PR_SVE_VL_LEN_MASK 0xffff
|
||||
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
|
||||
|
||||
#endif /* !__HEADER_ARM64_COMMON_PRCTL_H */
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2017 */
|
||||
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#ifndef __HEADER_ARM64_COMMON_PTRACE_H
|
||||
#define __HEADER_ARM64_COMMON_PTRACE_H
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
||||
#include <lwk/compiler.h>
|
||||
#include <ihk/types.h>
|
||||
|
||||
struct user_hwdebug_state {
|
||||
@ -78,6 +79,70 @@ struct user_sve_header {
|
||||
uint16_t __reserved;
|
||||
};
|
||||
|
||||
enum aarch64_regset {
|
||||
REGSET_GPR,
|
||||
REGSET_FPR,
|
||||
REGSET_TLS,
|
||||
REGSET_HW_BREAK,
|
||||
REGSET_HW_WATCH,
|
||||
REGSET_SYSTEM_CALL,
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
REGSET_SVE,
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
};
|
||||
|
||||
struct thread;
|
||||
struct user_regset;
|
||||
|
||||
typedef int user_regset_active_fn(struct thread *target,
|
||||
const struct user_regset *regset);
|
||||
|
||||
typedef long user_regset_get_fn(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
void *kbuf, void __user *ubuf);
|
||||
|
||||
typedef long user_regset_set_fn(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
const void *kbuf, const void __user *ubuf);
|
||||
|
||||
typedef int user_regset_writeback_fn(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
int immediate);
|
||||
|
||||
typedef unsigned int user_regset_get_size_fn(struct thread *target,
|
||||
const struct user_regset *regset);
|
||||
|
||||
struct user_regset {
|
||||
user_regset_get_fn *get;
|
||||
user_regset_set_fn *set;
|
||||
user_regset_active_fn *active;
|
||||
user_regset_writeback_fn *writeback;
|
||||
user_regset_get_size_fn *get_size;
|
||||
unsigned int n;
|
||||
unsigned int size;
|
||||
unsigned int align;
|
||||
unsigned int bias;
|
||||
unsigned int core_note_type;
|
||||
};
|
||||
|
||||
struct user_regset_view {
|
||||
const char *name;
|
||||
const struct user_regset *regsets;
|
||||
unsigned int n;
|
||||
uint32_t e_flags;
|
||||
uint16_t e_machine;
|
||||
uint8_t ei_osabi;
|
||||
};
|
||||
|
||||
extern const struct user_regset_view *current_user_regset_view(void);
|
||||
extern const struct user_regset *find_regset(
|
||||
const struct user_regset_view *view,
|
||||
unsigned int type);
|
||||
extern unsigned int regset_size(struct thread *target,
|
||||
const struct user_regset *regset);
|
||||
|
||||
/* Definitions for user_sve_header.flags: */
|
||||
#define SVE_PT_REGS_MASK (1 << 0)
|
||||
|
||||
@ -85,7 +150,7 @@ struct user_sve_header {
|
||||
#define SVE_PT_REGS_SVE SVE_PT_REGS_MASK
|
||||
|
||||
#define SVE_PT_VL_THREAD PR_SVE_SET_VL_THREAD
|
||||
#define SVE_PT_VL_INHERIT PR_SVE_SET_VL_INHERIT
|
||||
#define SVE_PT_VL_INHERIT PR_SVE_VL_INHERIT
|
||||
#define SVE_PT_VL_ONEXEC PR_SVE_SET_VL_ONEXEC
|
||||
|
||||
/*
|
||||
@ -99,7 +164,9 @@ struct user_sve_header {
|
||||
*/
|
||||
|
||||
/* Offset from the start of struct user_sve_header to the register data */
|
||||
#define SVE_PT_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
|
||||
#define SVE_PT_REGS_OFFSET \
|
||||
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
|
||||
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||
|
||||
/*
|
||||
* The register data content and layout depends on the value of the
|
||||
@ -174,8 +241,10 @@ struct user_sve_header {
|
||||
#define SVE_PT_SVE_FFR_OFFSET(vq) \
|
||||
__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
|
||||
|
||||
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
|
||||
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + 15) / 16 * 16)
|
||||
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
|
||||
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + \
|
||||
(SVE_VQ_BYTES - 1)) \
|
||||
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||
#define SVE_PT_SVE_FPCR_OFFSET(vq) \
|
||||
(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
|
||||
|
||||
@ -184,9 +253,10 @@ struct user_sve_header {
|
||||
* 128-bit boundary.
|
||||
*/
|
||||
|
||||
#define SVE_PT_SVE_SIZE(vq, flags) \
|
||||
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE - \
|
||||
SVE_PT_SVE_OFFSET + 15) / 16 * 16)
|
||||
#define SVE_PT_SVE_SIZE(vq, flags) \
|
||||
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE \
|
||||
- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1)) \
|
||||
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||
|
||||
#define SVE_PT_SIZE(vq, flags) \
|
||||
(((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#ifndef __HEADER_ARM64_COMMON_SIGNAL_H
|
||||
#define __HEADER_ARM64_COMMON_SIGNAL_H
|
||||
|
||||
@ -298,6 +298,7 @@ struct extra_context {
|
||||
struct _aarch64_ctx head;
|
||||
void *data; /* 16-byte aligned pointer to the extra space */
|
||||
uint32_t size; /* size in bytes of the extra space */
|
||||
uint32_t __reserved[3];
|
||||
};
|
||||
|
||||
#define SVE_MAGIC 0x53564501
|
||||
@ -318,19 +319,25 @@ struct sve_context {
|
||||
* The SVE architecture leaves space for future expansion of the
|
||||
* vector length beyond its initial architectural limit of 2048 bits
|
||||
* (16 quadwords).
|
||||
*
|
||||
* See linux/Documentation/arm64/sve.txt for a description of the VL/VQ
|
||||
* terminology.
|
||||
*/
|
||||
#define SVE_VQ_MIN 1
|
||||
#define SVE_VQ_MAX 0x200
|
||||
#define SVE_VQ_BYTES 16 /* number of bytes per quadword */
|
||||
|
||||
#define SVE_VL_MIN (SVE_VQ_MIN * 0x10)
|
||||
#define SVE_VL_MAX (SVE_VQ_MAX * 0x10)
|
||||
#define SVE_VQ_MIN 1
|
||||
#define SVE_VQ_MAX 512
|
||||
|
||||
#define SVE_VL_MIN (SVE_VQ_MIN * SVE_VQ_BYTES)
|
||||
#define SVE_VL_MAX (SVE_VQ_MAX * SVE_VQ_BYTES)
|
||||
|
||||
#define SVE_NUM_ZREGS 32
|
||||
#define SVE_NUM_PREGS 16
|
||||
|
||||
#define sve_vl_valid(vl) \
|
||||
((vl) % 0x10 == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
|
||||
#define sve_vq_from_vl(vl) ((vl) / 0x10)
|
||||
((vl) % SVE_VQ_BYTES == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
|
||||
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
|
||||
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)
|
||||
|
||||
/*
|
||||
* The total size of meaningful data in the SVE context in bytes,
|
||||
@ -365,11 +372,13 @@ struct sve_context {
|
||||
* Additional data might be appended in the future.
|
||||
*/
|
||||
|
||||
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * 16)
|
||||
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * 2)
|
||||
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * SVE_VQ_BYTES)
|
||||
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * (SVE_VQ_BYTES / 8))
|
||||
#define SVE_SIG_FFR_SIZE(vq) SVE_SIG_PREG_SIZE(vq)
|
||||
|
||||
#define SVE_SIG_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
|
||||
#define SVE_SIG_REGS_OFFSET \
|
||||
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
|
||||
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
|
||||
|
||||
#define SVE_SIG_ZREGS_OFFSET SVE_SIG_REGS_OFFSET
|
||||
#define SVE_SIG_ZREG_OFFSET(vq, n) \
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
SYSCALL_DELEGATED(4, io_getevents)
|
||||
SYSCALL_DELEGATED(17, getcwd)
|
||||
SYSCALL_DELEGATED(22, epoll_pwait)
|
||||
SYSCALL_HANDLED(22, epoll_pwait)
|
||||
SYSCALL_DELEGATED(25, fcntl)
|
||||
SYSCALL_HANDLED(29, ioctl)
|
||||
SYSCALL_DELEGATED(35, unlinkat)
|
||||
@ -17,8 +17,8 @@ SYSCALL_DELEGATED(64, write)
|
||||
SYSCALL_DELEGATED(66, writev)
|
||||
SYSCALL_DELEGATED(67, pread64)
|
||||
SYSCALL_DELEGATED(68, pwrite64)
|
||||
SYSCALL_DELEGATED(72, pselect6)
|
||||
SYSCALL_DELEGATED(73, ppoll)
|
||||
SYSCALL_HANDLED(72, pselect6)
|
||||
SYSCALL_HANDLED(73, ppoll)
|
||||
SYSCALL_HANDLED(74, signalfd4)
|
||||
SYSCALL_DELEGATED(78, readlinkat)
|
||||
SYSCALL_DELEGATED(80, fstat)
|
||||
@ -111,7 +111,7 @@ SYSCALL_HANDLED(236, get_mempolicy)
|
||||
SYSCALL_HANDLED(237, set_mempolicy)
|
||||
SYSCALL_HANDLED(238, migrate_pages)
|
||||
SYSCALL_HANDLED(239, move_pages)
|
||||
#ifdef PERF_ENABLE
|
||||
#ifdef ENABLE_PERF
|
||||
SYSCALL_HANDLED(241, perf_event_open)
|
||||
#else // PERF_ENABLE
|
||||
SYSCALL_DELEGATED(241, perf_event_open)
|
||||
@ -119,12 +119,7 @@ SYSCALL_DELEGATED(241, perf_event_open)
|
||||
SYSCALL_HANDLED(260, wait4)
|
||||
SYSCALL_HANDLED(270, process_vm_readv)
|
||||
SYSCALL_HANDLED(271, process_vm_writev)
|
||||
#ifdef PERF_ENABLE
|
||||
SYSCALL_HANDLED(601, pmc_init)
|
||||
SYSCALL_HANDLED(602, pmc_start)
|
||||
SYSCALL_HANDLED(603, pmc_stop)
|
||||
SYSCALL_HANDLED(604, pmc_reset)
|
||||
#endif // PERF_ENABLE
|
||||
SYSCALL_HANDLED(281, execveat)
|
||||
SYSCALL_HANDLED(700, get_cpu_id)
|
||||
#ifdef PROFILE_ENABLE
|
||||
SYSCALL_HANDLED(__NR_profile, profile)
|
||||
@ -132,6 +127,7 @@ SYSCALL_HANDLED(__NR_profile, profile)
|
||||
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
|
||||
SYSCALL_HANDLED(731, util_indicate_clone)
|
||||
SYSCALL_HANDLED(732, get_system)
|
||||
SYSCALL_HANDLED(733, util_register_desc)
|
||||
|
||||
/* McKernel Specific */
|
||||
SYSCALL_HANDLED(801, swapout)
|
||||
@ -146,3 +142,4 @@ SYSCALL_HANDLED(1045, signalfd)
|
||||
SYSCALL_DELEGATED(1049, stat)
|
||||
SYSCALL_DELEGATED(1060, getpgrp)
|
||||
SYSCALL_HANDLED(1062, time)
|
||||
SYSCALL_DELEGATED(1069, epoll_wait)
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
|
||||
#define __HEADER_ARM64_COMMON_THREAD_INFO_H
|
||||
|
||||
@ -46,9 +46,9 @@ struct thread_info {
|
||||
int cpu; /* cpu */
|
||||
struct cpu_context cpu_context; /* kernel_context */
|
||||
void *sve_state; /* SVE registers, if any */
|
||||
uint16_t sve_vl; /* SVE vector length */
|
||||
uint16_t sve_vl_onexec; /* SVE vl after next exec */
|
||||
uint16_t sve_flags; /* SVE related flags */
|
||||
unsigned int sve_vl; /* SVE vector length */
|
||||
unsigned int sve_vl_onexec; /* SVE vl after next exec */
|
||||
unsigned long sve_flags; /* SVE related flags */
|
||||
unsigned long fault_address; /* fault info */
|
||||
unsigned long fault_code; /* ESR_EL1 value */
|
||||
};
|
||||
@ -56,7 +56,7 @@ struct thread_info {
|
||||
/* Flags for sve_flags (intentionally defined to match the prctl flags) */
|
||||
|
||||
/* Inherit sve_vl and sve_flags across execve(): */
|
||||
#define THREAD_VL_INHERIT PR_SVE_SET_VL_INHERIT
|
||||
#define THREAD_VL_INHERIT PR_SVE_VL_INHERIT
|
||||
|
||||
struct arm64_cpu_local_thread {
|
||||
struct thread_info thread_info;
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
#define __ASM_TRAP_H
|
||||
|
||||
#include <types.h>
|
||||
#include <arch-lock.h>
|
||||
|
||||
struct pt_regs;
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@
|
||||
#include <memory.h>
|
||||
#include <affinity.h>
|
||||
#include <syscall.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <arch-timer.h>
|
||||
#include <cls.h>
|
||||
|
||||
@ -31,10 +31,9 @@ void *cpu_base;
|
||||
* function, it is not necessary to perform the disable/enable
|
||||
* interrupts in this function as gic_raise_softirq() .
|
||||
*/
|
||||
static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
|
||||
static void __arm64_raise_sgi_gicv2(unsigned int hw_cpuid, unsigned int vector)
|
||||
{
|
||||
/* Build interrupt destination of the target cpu */
|
||||
unsigned int hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||
uint8_t cpu_target_list = gic_hwid_to_affinity(hw_cpuid);
|
||||
|
||||
/*
|
||||
@ -50,6 +49,23 @@ static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
|
||||
);
|
||||
}
|
||||
|
||||
static void arm64_raise_sgi_gicv2(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
/* Build interrupt destination of the target CPU */
|
||||
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||
|
||||
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
|
||||
}
|
||||
|
||||
static void arm64_raise_sgi_to_host_gicv2(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
/* Build interrupt destination of the target Linux/host CPU */
|
||||
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
|
||||
|
||||
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* arm64_raise_spi_gicv2
|
||||
* @ref.impl nothing.
|
||||
@ -77,6 +93,11 @@ static void arm64_raise_spi_gicv2(unsigned int cpuid, unsigned int vector)
|
||||
);
|
||||
}
|
||||
|
||||
void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
arm64_raise_sgi_to_host_gicv2(cpuid, vector);
|
||||
}
|
||||
|
||||
/**
|
||||
* arm64_issue_ipi_gicv2
|
||||
* @param cpuid : hardware cpu id
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#include <cputype.h>
|
||||
#include <process.h>
|
||||
#include <syscall.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <arch-timer.h>
|
||||
#include <cls.h>
|
||||
|
||||
@ -195,15 +195,12 @@ static inline void gic_write_bpr1(uint32_t val)
|
||||
}
|
||||
#endif
|
||||
|
||||
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
static void __arm64_raise_sgi_gicv3(uint32_t hw_cpuid, uint32_t vector)
|
||||
{
|
||||
uint64_t mpidr, cluster_id;
|
||||
uint16_t tlist;
|
||||
uint64_t val;
|
||||
|
||||
/* Build interrupt destination of the target cpu */
|
||||
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||
|
||||
/*
|
||||
* Ensure that stores to Normal memory are visible to the
|
||||
* other CPUs before issuing the IPI.
|
||||
@ -239,6 +236,22 @@ static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
}
|
||||
}
|
||||
|
||||
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
/* Build interrupt destination of the target CPU */
|
||||
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
|
||||
|
||||
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
|
||||
}
|
||||
|
||||
static void arm64_raise_sgi_to_host_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
/* Build interrupt destination of the target Linux/host CPU */
|
||||
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
|
||||
|
||||
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
|
||||
}
|
||||
|
||||
static void arm64_raise_spi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
uint64_t spi_reg_offset;
|
||||
@ -268,6 +281,11 @@ static void arm64_raise_lpi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
ekprintf("%s called.\n", __func__);
|
||||
}
|
||||
|
||||
void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
arm64_raise_sgi_to_host_gicv3(cpuid, vector);
|
||||
}
|
||||
|
||||
void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
|
||||
{
|
||||
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
|
||||
@ -292,6 +310,9 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
|
||||
{
|
||||
uint64_t irqnr;
|
||||
const int from_user = interrupt_from_user(regs);
|
||||
struct cpu_local_var *v = get_this_cpu_local_var();
|
||||
//unsigned long irqflags;
|
||||
int do_check = 0;
|
||||
|
||||
irqnr = gic_read_iar();
|
||||
cpu_enable_nmi();
|
||||
@ -305,10 +326,18 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
|
||||
}
|
||||
set_cputime(from_user ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT);
|
||||
|
||||
/* for migration by IPI */
|
||||
if (get_this_cpu_local_var()->flags & CPU_FLAG_NEED_MIGRATE) {
|
||||
schedule();
|
||||
//irqflags = ihk_mc_spinlock_lock(&v->runq_lock);
|
||||
/* For migration by IPI or by timesharing */
|
||||
if (v->flags &
|
||||
(CPU_FLAG_NEED_MIGRATE | CPU_FLAG_NEED_RESCHED)) {
|
||||
v->flags &= ~CPU_FLAG_NEED_RESCHED;
|
||||
do_check = 1;
|
||||
}
|
||||
//ihk_mc_spinlock_unlock(&v->runq_lock, irqflags);
|
||||
|
||||
if (do_check) {
|
||||
check_signal(0, regs, 0);
|
||||
schedule();
|
||||
}
|
||||
}
|
||||
|
||||
@ -344,9 +373,11 @@ static void init_spi_routing(uint32_t irq, uint32_t linux_cpu)
|
||||
|
||||
void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
||||
{
|
||||
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
|
||||
extern int spi_table[];
|
||||
extern int nr_spi_table;
|
||||
int i;
|
||||
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
|
||||
|
||||
dist_base = map_fixed_area(dist_base_pa, size, 1 /*non chachable*/);
|
||||
|
||||
@ -357,6 +388,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
|
||||
/* initialize spi routing */
|
||||
for (i = 0; i < nr_spi_table; i++) {
|
||||
if (spi_table[i] == -1) {
|
||||
@ -364,6 +396,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
|
||||
}
|
||||
init_spi_routing(spi_table[i], i);
|
||||
}
|
||||
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
|
||||
}
|
||||
|
||||
void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size)
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
/* memory.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <types.h>
|
||||
#include <memory.h>
|
||||
@ -14,7 +13,7 @@
|
||||
#include <context.h>
|
||||
#include <kmalloc.h>
|
||||
#include <vdso.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <rusage_private.h>
|
||||
#include <cputype.h>
|
||||
|
||||
@ -2672,17 +2671,28 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
|
||||
}
|
||||
|
||||
phys = args->phys + (base - start);
|
||||
if (__page_offset(base, PTL1_CONT_SIZE) == 0) { //check head pte
|
||||
|
||||
/* Check if we can begin / end a series of contiguous PTEs */
|
||||
if (__page_offset(base, PTL1_CONT_SIZE) == 0) {
|
||||
uintptr_t next_addr = base + PTL1_CONT_SIZE;
|
||||
|
||||
if (end < next_addr) {
|
||||
next_addr = end;
|
||||
}
|
||||
|
||||
// set contiguous bit until the next head pte
|
||||
// if phys is aligned and range does not end early.
|
||||
/* Begin the series if physical address is also aligned and
|
||||
* the range covers the series. Don't start or end it if
|
||||
* physical address is not aligned or the range ends early.
|
||||
*/
|
||||
if (__page_offset(phys | next_addr, PTL1_CONT_SIZE) == 0) {
|
||||
args->attr[0] |= PTE_CONT;
|
||||
if (rusage_memory_stat_add(args->range, phys,
|
||||
PTL1_CONT_SIZE,
|
||||
PTL1_CONT_SIZE)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__, base, phys,
|
||||
PTL1_CONT_SIZE, PTL1_CONT_SIZE);
|
||||
}
|
||||
} else {
|
||||
args->attr[0] &= ~PTE_CONT;
|
||||
}
|
||||
@ -2692,12 +2702,13 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
|
||||
|
||||
error = 0;
|
||||
// call memory_stat_rss_add() here because pgshift is resolved here
|
||||
if (rusage_memory_stat_add(args->range, phys, PTL1_SIZE, PTL1_SIZE)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__, base, phys, PTL1_SIZE, PTL1_SIZE);
|
||||
} else {
|
||||
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
__func__, base, phys, PTL1_SIZE, PTL1_SIZE);
|
||||
if (!(args->attr[0] & PTE_CONT)) {
|
||||
if (rusage_memory_stat_add(args->range, phys,
|
||||
PTL1_SIZE, PTL1_SIZE)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__, base, phys,
|
||||
PTL1_SIZE, PTL1_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
@ -2761,7 +2772,9 @@ retry:
|
||||
|
||||
phys = args->phys + (base - start);
|
||||
|
||||
//check head pte
|
||||
/* Check if we can begin / end a series of
|
||||
* contiguous PTEs
|
||||
*/
|
||||
if (__page_offset(base, tbl.cont_pgsize) == 0) {
|
||||
uintptr_t next_addr = base +
|
||||
tbl.cont_pgsize;
|
||||
@ -2770,11 +2783,24 @@ retry:
|
||||
next_addr = end;
|
||||
}
|
||||
|
||||
// set contiguous bit until the
|
||||
// next head pte if phys is aligned
|
||||
// and range does not end early.
|
||||
/* Begin the series if physical address
|
||||
* is also aligned and the range covers
|
||||
* the series. Don't start or end it if
|
||||
* physical address is not aligned or
|
||||
* the range ends early.
|
||||
*/
|
||||
if (__page_offset(phys | next_addr, tbl.cont_pgsize) == 0) {
|
||||
args->attr[level-1] |= PTE_CONT;
|
||||
if (rusage_memory_stat_add(args->range,
|
||||
phys,
|
||||
tbl.cont_pgsize,
|
||||
tbl.cont_pgsize)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__,
|
||||
base, phys,
|
||||
tbl.cont_pgsize,
|
||||
tbl.cont_pgsize);
|
||||
}
|
||||
} else {
|
||||
args->attr[level-1] &= ~PTE_CONT;
|
||||
}
|
||||
@ -2782,21 +2808,23 @@ retry:
|
||||
|
||||
ptl_set(ptep, phys | args->attr[level-1],
|
||||
level);
|
||||
|
||||
error = 0;
|
||||
dkprintf("set_range_middle(%lx,%lx,%lx,%d):"
|
||||
"large page. %d %lx\n",
|
||||
base, start, end, level, error, *ptep);
|
||||
// Call memory_stat_rss_add() here because pgshift is resolved here
|
||||
if (rusage_memory_stat_add(args->range, phys,
|
||||
tbl.pgsize,
|
||||
tbl.pgsize)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__, base, phys,
|
||||
tbl.pgsize, tbl.pgsize);
|
||||
} else {
|
||||
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
__func__, base, phys,
|
||||
tbl.pgsize, tbl.pgsize);
|
||||
if (!(args->attr[level-1] & PTE_CONT)) {
|
||||
if (rusage_memory_stat_add(args->range,
|
||||
phys,
|
||||
tbl.pgsize,
|
||||
tbl.pgsize)) {
|
||||
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
|
||||
phys, __func__, base,
|
||||
phys,
|
||||
tbl.pgsize,
|
||||
tbl.pgsize);
|
||||
}
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
@ -2848,7 +2876,7 @@ retry:
|
||||
error = 0;
|
||||
out:
|
||||
if (tt_pa) {
|
||||
ihk_mc_free_pages(tt_pa, 1);
|
||||
ihk_mc_free_pages(phys_to_virt((unsigned long)tt_pa), 1);
|
||||
}
|
||||
dkprintf("set_range_middle(%lx,%lx,%lx,%d): %d %lx\n",
|
||||
base, start, end, level, error, *ptep);
|
||||
@ -3200,6 +3228,7 @@ void load_page_table(struct page_table *pt)
|
||||
{
|
||||
if (pt == NULL) {
|
||||
// load page table for idle(EL1) process.
|
||||
switch_mm(init_pt);
|
||||
return;
|
||||
}
|
||||
// load page table for user(EL0) thread.
|
||||
@ -3259,7 +3288,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
|
||||
attr |= PTATTR_UNCACHABLE;
|
||||
}
|
||||
|
||||
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||
paligned, v, npages);
|
||||
|
||||
pt = get_init_page_table();
|
||||
@ -3335,15 +3364,15 @@ unsigned long virt_to_phys(void *v)
|
||||
{
|
||||
unsigned long va = (unsigned long)v;
|
||||
|
||||
if (MAP_KERNEL_START <= va) {
|
||||
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
|
||||
if (va >= MAP_ST_START) {
|
||||
return va - MAP_ST_START + arm64_st_phys_base;
|
||||
}
|
||||
return va - MAP_ST_START;
|
||||
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
|
||||
}
|
||||
|
||||
void *phys_to_virt(unsigned long p)
|
||||
{
|
||||
return (void *)(p | MAP_ST_START);
|
||||
return (void *)((p - arm64_st_phys_base) | MAP_ST_START);
|
||||
}
|
||||
|
||||
int copy_from_user(void *dst, const void *src, size_t siz)
|
||||
@ -3716,44 +3745,6 @@ translation_table_t* get_translation_table_as_paddr(const struct page_table *pt)
|
||||
return pt->tt_pa;
|
||||
}
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_8
|
||||
void remote_flush_tlb_cpumask(struct process_vm *vm,
|
||||
unsigned long addr, int cpu_id)
|
||||
{
|
||||
unsigned long cpu;
|
||||
cpu_set_t _cpu_set;
|
||||
int flush_ind;
|
||||
|
||||
if (addr) {
|
||||
flush_ind = (addr >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
||||
}
|
||||
/* Zero address denotes full TLB flush */
|
||||
else {
|
||||
/* Random.. */
|
||||
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
||||
}
|
||||
|
||||
/* Take a copy of the cpu set so that we don't hold the lock
|
||||
* all the way while interrupting other cores */
|
||||
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
|
||||
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
|
||||
|
||||
/* Loop through CPUs in this address space and interrupt them for
|
||||
* TLB flush on the specified address */
|
||||
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
|
||||
if (ihk_mc_get_processor_id() == cpu)
|
||||
continue;
|
||||
|
||||
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
|
||||
flush_ind, addr, cpu);
|
||||
|
||||
ihk_mc_interrupt_cpu(cpu,
|
||||
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
|
||||
}
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
||||
|
||||
void arch_adjust_allocate_page_size(struct page_table *pt,
|
||||
uintptr_t fault_addr,
|
||||
pte_t *ptep,
|
||||
|
||||
@ -19,7 +19,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
|
||||
|
||||
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
|
||||
|
||||
mikc_queue_pages = ((4 * num_processors * MASTER_IKCQ_PKTSIZE)
|
||||
mikc_queue_pages = ((8 * num_processors * MASTER_IKCQ_PKTSIZE)
|
||||
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
|
||||
|
||||
/* Place both sides in this side */
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include <string.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <irq.h>
|
||||
#include <process.h>
|
||||
|
||||
/*
|
||||
* @ref.impl arch/arm64/kernel/perf_event.c
|
||||
@ -85,25 +86,17 @@ void arm64_disable_user_access_pmu_regs(void)
|
||||
cpu_pmu.disable_user_access_pmu_regs();
|
||||
}
|
||||
|
||||
extern unsigned int *arm64_march_perfmap;
|
||||
|
||||
static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, int mode)
|
||||
{
|
||||
int ret = -1;
|
||||
unsigned long config_base = 0;
|
||||
int mapping;
|
||||
|
||||
mapping = cpu_pmu.map_event(type, config);
|
||||
if (mapping < 0) {
|
||||
return mapping;
|
||||
}
|
||||
|
||||
ret = cpu_pmu.disable_counter(counter);
|
||||
ret = cpu_pmu.disable_counter(1UL << counter);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = cpu_pmu.enable_intens(counter);
|
||||
ret = cpu_pmu.enable_intens(1UL << counter);
|
||||
if (ret < 0) {
|
||||
return ret;
|
||||
}
|
||||
@ -112,7 +105,7 @@ static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, in
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
config_base |= (unsigned long)mapping;
|
||||
config_base |= config;
|
||||
cpu_pmu.write_evtype(counter, config_base);
|
||||
return ret;
|
||||
}
|
||||
@ -124,68 +117,24 @@ int ihk_mc_perfctr_init_raw(int counter, uint64_t config, int mode)
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
|
||||
{
|
||||
int ret;
|
||||
ret = __ihk_mc_perfctr_init(counter, PERF_TYPE_RAW, config, mode);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_start(unsigned long counter_mask)
|
||||
{
|
||||
int ret = 0, i;
|
||||
|
||||
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
|
||||
if (counter_mask & (1UL << i)) {
|
||||
ret = cpu_pmu.enable_counter(i);
|
||||
if (ret < 0) {
|
||||
kprintf("%s: enable failed(idx=%d)\n",
|
||||
__func__, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
return cpu_pmu.enable_counter(counter_mask);
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_stop(unsigned long counter_mask, int flags)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
|
||||
if (!(counter_mask & (1UL << i)))
|
||||
continue;
|
||||
|
||||
int ret = 0;
|
||||
|
||||
ret = cpu_pmu.disable_counter(i);
|
||||
if (ret < 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (flags & IHK_MC_PERFCTR_DISABLE_INTERRUPT) {
|
||||
// when ihk_mc_perfctr_start is called,
|
||||
// ihk_mc_perfctr_init is also called so disable
|
||||
// interrupt
|
||||
ret = cpu_pmu.disable_intens(i);
|
||||
if (ret < 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
return cpu_pmu.disable_counter(counter_mask);
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_reset(int counter)
|
||||
{
|
||||
// TODO[PMU]: ihk_mc_perfctr_setと同様にサンプリングレートの共通部実装の扱いを見てから本実装。
|
||||
cpu_pmu.write_counter(counter, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_set(int counter, long val)
|
||||
{
|
||||
// TODO[PMU]: 共通部でサンプリングレートの計算をして、設定するカウンタ値をvalに渡してくるようになると想定。サンプリングレートの扱いを見てから本実装。
|
||||
uint32_t v = val;
|
||||
cpu_pmu.write_counter(counter, v);
|
||||
return 0;
|
||||
@ -198,6 +147,15 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
|
||||
{
|
||||
const int counters = ihk_mc_perf_get_num_counters();
|
||||
|
||||
return cpu_pmu.get_event_idx(counters,
|
||||
thread->pmc_alloc_map,
|
||||
event->hw_config);
|
||||
}
|
||||
|
||||
unsigned long ihk_mc_perfctr_read(int counter)
|
||||
{
|
||||
unsigned long count;
|
||||
@ -205,6 +163,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
|
||||
return count;
|
||||
}
|
||||
|
||||
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
|
||||
{
|
||||
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
|
||||
|
||||
count &= ((1UL << 32) - 1);
|
||||
return count;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
|
||||
unsigned long pmc_status)
|
||||
{
|
||||
@ -234,12 +200,14 @@ int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
|
||||
|
||||
int ihk_mc_perf_counter_mask_check(unsigned long counter_mask)
|
||||
{
|
||||
return 1;
|
||||
return cpu_pmu.counter_mask_valid(counter_mask);
|
||||
}
|
||||
|
||||
int ihk_mc_perf_get_num_counters(void)
|
||||
{
|
||||
return cpu_pmu.per_cpu[ihk_mc_get_processor_id()].num_events;
|
||||
const struct per_cpu_arm_pmu *per_cpu_arm_pmu = get_per_cpu_pmu();
|
||||
|
||||
return per_cpu_arm_pmu->num_events;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
||||
@ -247,3 +215,83 @@ int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
||||
/* Nothing to do. */
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline uint64_t arm_pmu_event_max_period(struct mc_perf_event *event)
|
||||
{
|
||||
return 0xFFFFFFFF;
|
||||
}
|
||||
|
||||
int hw_perf_event_init(struct mc_perf_event *event)
|
||||
{
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
|
||||
if (!is_sampling_event(event)) {
|
||||
hwc->sample_period = arm_pmu_event_max_period(event) >> 1;
|
||||
hwc->last_period = hwc->sample_period;
|
||||
ihk_atomic64_set(&hwc->period_left, hwc->sample_period);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ihk_mc_event_set_period(struct mc_perf_event *event)
|
||||
{
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
int64_t left = ihk_atomic64_read(&hwc->period_left);
|
||||
int64_t period = hwc->sample_period;
|
||||
uint64_t max_period;
|
||||
int ret = 0;
|
||||
|
||||
max_period = arm_pmu_event_max_period(event);
|
||||
if (unlikely(left <= -period)) {
|
||||
left = period;
|
||||
ihk_atomic64_set(&hwc->period_left, left);
|
||||
hwc->last_period = period;
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
if (unlikely(left <= 0)) {
|
||||
left += period;
|
||||
ihk_atomic64_set(&hwc->period_left, left);
|
||||
hwc->last_period = period;
|
||||
ret = 1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Limit the maximum period to prevent the counter value
|
||||
* from overtaking the one we are about to program. In
|
||||
* effect we are reducing max_period to account for
|
||||
* interrupt latency (and we are being very conservative).
|
||||
*/
|
||||
if (left > (max_period >> 1))
|
||||
left = (max_period >> 1);
|
||||
|
||||
ihk_atomic64_set(&hwc->prev_count, (uint64_t)-left);
|
||||
|
||||
cpu_pmu.write_counter(event->counter_id,
|
||||
(uint64_t)(-left) & max_period);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
|
||||
{
|
||||
struct hw_perf_event *hwc = &event->hw;
|
||||
int64_t delta;
|
||||
uint64_t prev_raw_count, new_raw_count;
|
||||
uint64_t max_period = arm_pmu_event_max_period(event);
|
||||
|
||||
again:
|
||||
prev_raw_count = ihk_atomic64_read(&hwc->prev_count);
|
||||
new_raw_count = cpu_pmu.read_counter(event->counter_id);
|
||||
|
||||
if (ihk_atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
|
||||
new_raw_count) != prev_raw_count)
|
||||
goto again;
|
||||
|
||||
delta = (new_raw_count - prev_raw_count) & max_period;
|
||||
|
||||
ihk_atomic64_add(delta, &event->count);
|
||||
ihk_atomic64_add(-delta, &hwc->period_left);
|
||||
|
||||
return new_raw_count;
|
||||
}
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
#include <ihk/perfctr.h>
|
||||
#include <errno.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <debug.h>
|
||||
#include <sysreg.h>
|
||||
#include <virt.h>
|
||||
#include <bitops.h>
|
||||
@ -21,29 +20,174 @@
|
||||
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* read pmevcntr<n>_el0 functions
|
||||
*/
|
||||
#define read_pmevcntrN_el0(N) \
|
||||
static uint32_t read_pmevcntr##N##_el0(void) \
|
||||
{ \
|
||||
return read_sysreg(pmevcntr##N##_el0); \
|
||||
}
|
||||
|
||||
read_pmevcntrN_el0(0)
|
||||
read_pmevcntrN_el0(1)
|
||||
read_pmevcntrN_el0(2)
|
||||
read_pmevcntrN_el0(3)
|
||||
read_pmevcntrN_el0(4)
|
||||
read_pmevcntrN_el0(5)
|
||||
read_pmevcntrN_el0(6)
|
||||
read_pmevcntrN_el0(7)
|
||||
read_pmevcntrN_el0(8)
|
||||
read_pmevcntrN_el0(9)
|
||||
read_pmevcntrN_el0(10)
|
||||
read_pmevcntrN_el0(11)
|
||||
read_pmevcntrN_el0(12)
|
||||
read_pmevcntrN_el0(13)
|
||||
read_pmevcntrN_el0(14)
|
||||
read_pmevcntrN_el0(15)
|
||||
read_pmevcntrN_el0(16)
|
||||
read_pmevcntrN_el0(17)
|
||||
read_pmevcntrN_el0(18)
|
||||
read_pmevcntrN_el0(19)
|
||||
read_pmevcntrN_el0(20)
|
||||
read_pmevcntrN_el0(21)
|
||||
read_pmevcntrN_el0(22)
|
||||
read_pmevcntrN_el0(23)
|
||||
read_pmevcntrN_el0(24)
|
||||
read_pmevcntrN_el0(25)
|
||||
read_pmevcntrN_el0(26)
|
||||
read_pmevcntrN_el0(27)
|
||||
read_pmevcntrN_el0(28)
|
||||
read_pmevcntrN_el0(29)
|
||||
read_pmevcntrN_el0(30)
|
||||
|
||||
static uint32_t (* const read_pmevcntr_el0[])(void) = {
|
||||
read_pmevcntr0_el0, read_pmevcntr1_el0, read_pmevcntr2_el0,
|
||||
read_pmevcntr3_el0, read_pmevcntr4_el0, read_pmevcntr5_el0,
|
||||
read_pmevcntr6_el0, read_pmevcntr7_el0, read_pmevcntr8_el0,
|
||||
read_pmevcntr9_el0, read_pmevcntr10_el0, read_pmevcntr11_el0,
|
||||
read_pmevcntr12_el0, read_pmevcntr13_el0, read_pmevcntr14_el0,
|
||||
read_pmevcntr15_el0, read_pmevcntr16_el0, read_pmevcntr17_el0,
|
||||
read_pmevcntr18_el0, read_pmevcntr19_el0, read_pmevcntr20_el0,
|
||||
read_pmevcntr21_el0, read_pmevcntr22_el0, read_pmevcntr23_el0,
|
||||
read_pmevcntr24_el0, read_pmevcntr25_el0, read_pmevcntr26_el0,
|
||||
read_pmevcntr27_el0, read_pmevcntr28_el0, read_pmevcntr29_el0,
|
||||
read_pmevcntr30_el0,
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
||||
* Perf Events' indices
|
||||
* write pmevcntr<n>_el0 functions
|
||||
*/
|
||||
#define ARMV8_IDX_CYCLE_COUNTER 0
|
||||
#define ARMV8_IDX_COUNTER0 1
|
||||
#define ARMV8_IDX_COUNTER_LAST (ARMV8_IDX_CYCLE_COUNTER + get_per_cpu_pmu()->num_events - 1)
|
||||
#define write_pmevcntrN_el0(N) \
|
||||
static void write_pmevcntr##N##_el0(uint32_t v) \
|
||||
{ \
|
||||
write_sysreg(v, pmevcntr##N##_el0); \
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h */
|
||||
#define ARMV8_PMU_MAX_COUNTERS 32
|
||||
#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1)
|
||||
write_pmevcntrN_el0(0)
|
||||
write_pmevcntrN_el0(1)
|
||||
write_pmevcntrN_el0(2)
|
||||
write_pmevcntrN_el0(3)
|
||||
write_pmevcntrN_el0(4)
|
||||
write_pmevcntrN_el0(5)
|
||||
write_pmevcntrN_el0(6)
|
||||
write_pmevcntrN_el0(7)
|
||||
write_pmevcntrN_el0(8)
|
||||
write_pmevcntrN_el0(9)
|
||||
write_pmevcntrN_el0(10)
|
||||
write_pmevcntrN_el0(11)
|
||||
write_pmevcntrN_el0(12)
|
||||
write_pmevcntrN_el0(13)
|
||||
write_pmevcntrN_el0(14)
|
||||
write_pmevcntrN_el0(15)
|
||||
write_pmevcntrN_el0(16)
|
||||
write_pmevcntrN_el0(17)
|
||||
write_pmevcntrN_el0(18)
|
||||
write_pmevcntrN_el0(19)
|
||||
write_pmevcntrN_el0(20)
|
||||
write_pmevcntrN_el0(21)
|
||||
write_pmevcntrN_el0(22)
|
||||
write_pmevcntrN_el0(23)
|
||||
write_pmevcntrN_el0(24)
|
||||
write_pmevcntrN_el0(25)
|
||||
write_pmevcntrN_el0(26)
|
||||
write_pmevcntrN_el0(27)
|
||||
write_pmevcntrN_el0(28)
|
||||
write_pmevcntrN_el0(29)
|
||||
write_pmevcntrN_el0(30)
|
||||
|
||||
static void (* const write_pmevcntr_el0[])(uint32_t) = {
|
||||
write_pmevcntr0_el0, write_pmevcntr1_el0, write_pmevcntr2_el0,
|
||||
write_pmevcntr3_el0, write_pmevcntr4_el0, write_pmevcntr5_el0,
|
||||
write_pmevcntr6_el0, write_pmevcntr7_el0, write_pmevcntr8_el0,
|
||||
write_pmevcntr9_el0, write_pmevcntr10_el0, write_pmevcntr11_el0,
|
||||
write_pmevcntr12_el0, write_pmevcntr13_el0, write_pmevcntr14_el0,
|
||||
write_pmevcntr15_el0, write_pmevcntr16_el0, write_pmevcntr17_el0,
|
||||
write_pmevcntr18_el0, write_pmevcntr19_el0, write_pmevcntr20_el0,
|
||||
write_pmevcntr21_el0, write_pmevcntr22_el0, write_pmevcntr23_el0,
|
||||
write_pmevcntr24_el0, write_pmevcntr25_el0, write_pmevcntr26_el0,
|
||||
write_pmevcntr27_el0, write_pmevcntr28_el0, write_pmevcntr29_el0,
|
||||
write_pmevcntr30_el0,
|
||||
};
|
||||
|
||||
/*
|
||||
* ARMv8 low level PMU access
|
||||
* write pmevtyper<n>_el0 functions
|
||||
*/
|
||||
#define write_pmevtyperN_el0(N) \
|
||||
static void write_pmevtyper##N##_el0(uint32_t v) \
|
||||
{ \
|
||||
write_sysreg(v, pmevtyper##N##_el0); \
|
||||
}
|
||||
|
||||
/*
|
||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
||||
* Perf Event to low level counters mapping
|
||||
*/
|
||||
#define ARMV8_IDX_TO_COUNTER(x) \
|
||||
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
|
||||
write_pmevtyperN_el0(0)
|
||||
write_pmevtyperN_el0(1)
|
||||
write_pmevtyperN_el0(2)
|
||||
write_pmevtyperN_el0(3)
|
||||
write_pmevtyperN_el0(4)
|
||||
write_pmevtyperN_el0(5)
|
||||
write_pmevtyperN_el0(6)
|
||||
write_pmevtyperN_el0(7)
|
||||
write_pmevtyperN_el0(8)
|
||||
write_pmevtyperN_el0(9)
|
||||
write_pmevtyperN_el0(10)
|
||||
write_pmevtyperN_el0(11)
|
||||
write_pmevtyperN_el0(12)
|
||||
write_pmevtyperN_el0(13)
|
||||
write_pmevtyperN_el0(14)
|
||||
write_pmevtyperN_el0(15)
|
||||
write_pmevtyperN_el0(16)
|
||||
write_pmevtyperN_el0(17)
|
||||
write_pmevtyperN_el0(18)
|
||||
write_pmevtyperN_el0(19)
|
||||
write_pmevtyperN_el0(20)
|
||||
write_pmevtyperN_el0(21)
|
||||
write_pmevtyperN_el0(22)
|
||||
write_pmevtyperN_el0(23)
|
||||
write_pmevtyperN_el0(24)
|
||||
write_pmevtyperN_el0(25)
|
||||
write_pmevtyperN_el0(26)
|
||||
write_pmevtyperN_el0(27)
|
||||
write_pmevtyperN_el0(28)
|
||||
write_pmevtyperN_el0(29)
|
||||
write_pmevtyperN_el0(30)
|
||||
|
||||
static void (* const write_pmevtyper_el0[])(uint32_t) = {
|
||||
write_pmevtyper0_el0, write_pmevtyper1_el0, write_pmevtyper2_el0,
|
||||
write_pmevtyper3_el0, write_pmevtyper4_el0, write_pmevtyper5_el0,
|
||||
write_pmevtyper6_el0, write_pmevtyper7_el0, write_pmevtyper8_el0,
|
||||
write_pmevtyper9_el0, write_pmevtyper10_el0, write_pmevtyper11_el0,
|
||||
write_pmevtyper12_el0, write_pmevtyper13_el0, write_pmevtyper14_el0,
|
||||
write_pmevtyper15_el0, write_pmevtyper16_el0, write_pmevtyper17_el0,
|
||||
write_pmevtyper18_el0, write_pmevtyper19_el0, write_pmevtyper20_el0,
|
||||
write_pmevtyper21_el0, write_pmevtyper22_el0, write_pmevtyper23_el0,
|
||||
write_pmevtyper24_el0, write_pmevtyper25_el0, write_pmevtyper26_el0,
|
||||
write_pmevtyper27_el0, write_pmevtyper28_el0, write_pmevtyper29_el0,
|
||||
write_pmevtyper30_el0,
|
||||
};
|
||||
|
||||
#define ARMV8_IDX_CYCLE_COUNTER 31
|
||||
#define ARMV8_IDX_COUNTER0 0
|
||||
|
||||
/*
|
||||
* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h
|
||||
@ -175,6 +319,10 @@
|
||||
|
||||
/* PMUv3 HW events mapping. */
|
||||
|
||||
/* disable -Woverride-init for the following initializations */
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Woverride-init"
|
||||
|
||||
/*
|
||||
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
|
||||
* ARMv8 Architectural defined events, not all of these may
|
||||
@ -220,6 +368,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
|
||||
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
|
||||
};
|
||||
|
||||
/* restore warnings */
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 drivers/perf/arm_pmu.c */
|
||||
static int
|
||||
armpmu_map_cache_event(const unsigned (*cache_map)
|
||||
@ -298,11 +449,25 @@ armpmu_map_event(uint32_t type, uint64_t config,
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_counter_mask_valid(unsigned long counter_mask)
|
||||
{
|
||||
int num;
|
||||
unsigned long event;
|
||||
unsigned long cycle;
|
||||
unsigned long invalid_mask;
|
||||
|
||||
num = get_per_cpu_pmu()->num_events;
|
||||
num--; /* Sub the CPU cycles counter */
|
||||
event = ((1UL << num) - 1) << ARMV8_IDX_COUNTER0;
|
||||
cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
|
||||
invalid_mask = ~(event | cycle);
|
||||
|
||||
return !(counter_mask & invalid_mask);
|
||||
}
|
||||
|
||||
static inline int armv8pmu_counter_valid(int idx)
|
||||
{
|
||||
return idx >= ARMV8_IDX_CYCLE_COUNTER &&
|
||||
idx <= ARMV8_IDX_COUNTER_LAST;
|
||||
return armv8pmu_counter_mask_valid(1UL << idx);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
@ -326,6 +491,11 @@ static inline int armv8pmu_has_overflowed(uint32_t pmovsr)
|
||||
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
|
||||
}
|
||||
|
||||
static inline int armv8pmu_counter_has_overflowed(uint32_t pmnc, int idx)
|
||||
{
|
||||
return pmnc & BIT(idx);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static int __armv8_pmuv3_map_event(uint32_t type, uint64_t config,
|
||||
const unsigned int (*extra_event_map)
|
||||
@ -357,6 +527,23 @@ static int armv8_pmuv3_map_event(uint32_t type, uint64_t config)
|
||||
return __armv8_pmuv3_map_event(type, config, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int armv8_pmuv3_map_hw_event(uint64_t config)
|
||||
{
|
||||
return __armv8_pmuv3_map_event(PERF_TYPE_HARDWARE, config, NULL, NULL);
|
||||
}
|
||||
|
||||
|
||||
static int armv8_pmuv3_map_cache_event(uint64_t config)
|
||||
{
|
||||
return __armv8_pmuv3_map_event(PERF_TYPE_HW_CACHE, config, NULL, NULL);
|
||||
}
|
||||
|
||||
static int armv8_pmuv3_map_raw_event(uint64_t config)
|
||||
{
|
||||
return __armv8_pmuv3_map_event(PERF_TYPE_RAW, config, NULL, NULL);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline uint32_t armv8pmu_pmcr_read(void)
|
||||
{
|
||||
@ -371,24 +558,6 @@ static inline void armv8pmu_pmcr_write(uint32_t val)
|
||||
write_sysreg(val, pmcr_el0);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_select_counter(int idx)
|
||||
{
|
||||
uint32_t counter;
|
||||
|
||||
if (!armv8pmu_counter_valid(idx)) {
|
||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||
__func__, idx);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
||||
write_sysreg(counter, pmselr_el0);
|
||||
isb();
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline uint32_t armv8pmu_read_counter(int idx)
|
||||
{
|
||||
@ -401,8 +570,8 @@ static inline uint32_t armv8pmu_read_counter(int idx)
|
||||
else if (idx == ARMV8_IDX_CYCLE_COUNTER) {
|
||||
value = read_sysreg(pmccntr_el0);
|
||||
}
|
||||
else if (armv8pmu_select_counter(idx) == idx) {
|
||||
value = read_sysreg(pmxevcntr_el0);
|
||||
else {
|
||||
value = read_pmevcntr_el0[idx]();
|
||||
}
|
||||
|
||||
return value;
|
||||
@ -421,43 +590,42 @@ static inline void armv8pmu_write_counter(int idx, uint32_t value)
|
||||
* count using the lower 32bits and we want an interrupt when
|
||||
* it overflows.
|
||||
*/
|
||||
uint64_t value64 = 0xffffffff00000000ULL | value;
|
||||
uint64_t value64 = (int32_t)value;
|
||||
|
||||
write_sysreg(value64, pmccntr_el0);
|
||||
}
|
||||
else if (armv8pmu_select_counter(idx) == idx) {
|
||||
write_sysreg(value, pmxevcntr_el0);
|
||||
else {
|
||||
write_pmevcntr_el0[idx](value);
|
||||
}
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_enable_intens(int idx)
|
||||
static inline int armv8pmu_enable_intens(unsigned long counter_mask)
|
||||
{
|
||||
uint32_t counter;
|
||||
|
||||
if (!armv8pmu_counter_valid(idx)) {
|
||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||
__func__, idx);
|
||||
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||
ekprintf("%s: invalid counter mask(%#lx)\n",
|
||||
__func__, counter_mask);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
||||
write_sysreg(BIT(counter), pmintenset_el1);
|
||||
return idx;
|
||||
write_sysreg(counter_mask, pmintenset_el1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_disable_intens(int idx)
|
||||
static inline int armv8pmu_disable_intens(unsigned long counter_mask)
|
||||
{
|
||||
uint32_t counter = ARMV8_IDX_TO_COUNTER(idx);
|
||||
|
||||
write_sysreg(BIT(counter), pmintenclr_el1);
|
||||
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||
ekprintf("%s: invalid counter mask(%#lx)\n",
|
||||
__func__, counter_mask);
|
||||
return -EINVAL;
|
||||
}
|
||||
write_sysreg(counter_mask, pmintenclr_el1);
|
||||
isb();
|
||||
/* Clear the overflow flag in case an interrupt is pending. */
|
||||
write_sysreg(BIT(counter), pmovsclr_el0);
|
||||
write_sysreg(counter_mask, pmovsclr_el0);
|
||||
isb();
|
||||
|
||||
return idx;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
@ -492,42 +660,37 @@ static int armv8pmu_set_event_filter(unsigned long *config_base, int mode)
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline void armv8pmu_write_evtype(int idx, uint32_t val)
|
||||
{
|
||||
if (armv8pmu_select_counter(idx) == idx) {
|
||||
val &= ARMV8_PMU_EVTYPE_MASK;
|
||||
write_sysreg(val, pmxevtyper_el0);
|
||||
if (!armv8pmu_counter_valid(idx)) {
|
||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||
__func__, idx);
|
||||
return;
|
||||
} else if (idx != ARMV8_IDX_CYCLE_COUNTER) {
|
||||
write_pmevtyper_el0[idx](val);
|
||||
}
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_enable_counter(int idx)
|
||||
static inline int armv8pmu_enable_counter(unsigned long counter_mask)
|
||||
{
|
||||
uint32_t counter;
|
||||
|
||||
if (!armv8pmu_counter_valid(idx)) {
|
||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||
__func__, idx);
|
||||
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||
ekprintf("%s: invalid counter mask 0x%lx.\n",
|
||||
__func__, counter_mask);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
||||
write_sysreg(BIT(counter), pmcntenset_el0);
|
||||
return idx;
|
||||
write_sysreg(counter_mask, pmcntenset_el0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static inline int armv8pmu_disable_counter(int idx)
|
||||
static inline int armv8pmu_disable_counter(unsigned long counter_mask)
|
||||
{
|
||||
uint32_t counter;
|
||||
|
||||
if (!armv8pmu_counter_valid(idx)) {
|
||||
ekprintf("%s: The count_register#%d is not implemented.\n",
|
||||
__func__, idx);
|
||||
if (!armv8pmu_counter_mask_valid(counter_mask)) {
|
||||
ekprintf("%s: invalid counter mask 0x%lx.\n",
|
||||
__func__, counter_mask);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
counter = ARMV8_IDX_TO_COUNTER(idx);
|
||||
write_sysreg(BIT(counter), pmcntenclr_el0);
|
||||
return idx;
|
||||
write_sysreg(counter_mask, pmcntenclr_el0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
@ -555,41 +718,20 @@ static void armv8pmu_stop(void)
|
||||
ihk_mc_spinlock_unlock(&pmu_lock, flags);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static void armv8pmu_disable_event(int idx)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Disable counter and interrupt
|
||||
*/
|
||||
flags = ihk_mc_spinlock_lock(&pmu_lock);
|
||||
|
||||
/*
|
||||
* Disable counter
|
||||
*/
|
||||
armv8pmu_disable_counter(idx);
|
||||
|
||||
/*
|
||||
* Disable interrupt for this counter
|
||||
*/
|
||||
armv8pmu_disable_intens(idx);
|
||||
|
||||
ihk_mc_spinlock_unlock(&pmu_lock, flags);
|
||||
}
|
||||
|
||||
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
|
||||
static void armv8pmu_reset(void *info)
|
||||
{
|
||||
struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
|
||||
uint32_t idx, nb_cnt =
|
||||
uint32_t nb_cnt =
|
||||
cpu_pmu->per_cpu[ihk_mc_get_processor_id()].num_events;
|
||||
nb_cnt--; /* Sub the CPU cycles counter */
|
||||
unsigned long event = ((1UL << nb_cnt) - 1) << ARMV8_IDX_COUNTER0;
|
||||
unsigned long cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
|
||||
unsigned long valid_mask = event | cycle;
|
||||
|
||||
/* The counter and interrupt enable registers are unknown at reset. */
|
||||
for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
|
||||
armv8pmu_disable_counter(idx);
|
||||
armv8pmu_disable_intens(idx);
|
||||
}
|
||||
armv8pmu_disable_counter(valid_mask);
|
||||
armv8pmu_disable_intens(valid_mask);
|
||||
|
||||
/*
|
||||
* Initialize & Reset PMNC. Request overflow interrupt for
|
||||
@ -603,7 +745,7 @@ static void armv8pmu_reset(void *info)
|
||||
static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
|
||||
unsigned long config)
|
||||
{
|
||||
int idx;
|
||||
int idx, end;
|
||||
unsigned long evtype = config & ARMV8_PMU_EVTYPE_EVENT;
|
||||
|
||||
/* Always prefer to place a cycle counter into the cycle counter. */
|
||||
@ -615,7 +757,9 @@ static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
|
||||
/*
|
||||
* Otherwise use events counters
|
||||
*/
|
||||
for (idx = ARMV8_IDX_COUNTER0; idx < num_events; ++idx) {
|
||||
end = ARMV8_IDX_COUNTER0 + num_events;
|
||||
end--; /* Sub the CPU cycles counter */
|
||||
for (idx = ARMV8_IDX_COUNTER0; idx < end; ++idx) {
|
||||
if (!(used_mask & (1UL << idx)))
|
||||
return idx;
|
||||
}
|
||||
@ -642,13 +786,11 @@ static uint32_t armv8pmu_read_num_pmnc_events(void)
|
||||
|
||||
static void armv8pmu_handle_irq(void *priv)
|
||||
{
|
||||
struct siginfo info;
|
||||
uint32_t pmovsr;
|
||||
struct thread *thread = cpu_local_var(current);
|
||||
struct process *proc = thread->proc;
|
||||
long irqstate;
|
||||
struct mckfd *fdp;
|
||||
struct pt_regs *regs = (struct pt_regs *)priv;
|
||||
const struct per_cpu_arm_pmu *cpu_pmu = get_per_cpu_pmu();
|
||||
int idx;
|
||||
|
||||
/*
|
||||
* Get and reset the IRQ flags
|
||||
@ -661,27 +803,40 @@ static void armv8pmu_handle_irq(void *priv)
|
||||
if (!armv8pmu_has_overflowed(pmovsr))
|
||||
return;
|
||||
|
||||
if (!proc->monitoring_event) {
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Handle the counter(s) overflow(s)
|
||||
*/
|
||||
/* same as x86_64 mckernel */
|
||||
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
||||
for (fdp = proc->mckfd; fdp; fdp = fdp->next) {
|
||||
if (fdp->sig_no > 0)
|
||||
break;
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
||||
for (idx = 0; idx < cpu_pmu->num_events; idx++) {
|
||||
struct mc_perf_event *event = NULL;
|
||||
struct mc_perf_event *sub;
|
||||
|
||||
if (fdp) {
|
||||
memset(&info, '\0', sizeof(info));
|
||||
info.si_signo = fdp->sig_no;
|
||||
info._sifields._sigfault.si_addr = (void *)regs->pc;
|
||||
info._sifields._sigpoll.si_fd = fdp->fd;
|
||||
set_signal(fdp->sig_no, regs, &info);
|
||||
}
|
||||
else {
|
||||
set_signal(SIGIO, regs, NULL);
|
||||
if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (proc->monitoring_event->counter_id == idx) {
|
||||
event = proc->monitoring_event;
|
||||
} else {
|
||||
list_for_each_entry(sub,
|
||||
&proc->monitoring_event->sibling_list,
|
||||
group_entry) {
|
||||
if (sub->counter_id == idx) {
|
||||
event = sub;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!event) {
|
||||
continue;
|
||||
}
|
||||
ihk_mc_event_update(event);
|
||||
ihk_mc_event_set_period(event);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
static void armv8pmu_enable_user_access_pmu_regs(void)
|
||||
@ -735,11 +890,15 @@ int armv8pmu_init(struct arm_pmu* cpu_pmu)
|
||||
cpu_pmu->write_evtype = armv8pmu_write_evtype;
|
||||
cpu_pmu->get_event_idx = armv8pmu_get_event_idx;
|
||||
cpu_pmu->map_event = armv8_pmuv3_map_event;
|
||||
cpu_pmu->map_hw_event = armv8_pmuv3_map_hw_event;
|
||||
cpu_pmu->map_cache_event = armv8_pmuv3_map_cache_event;
|
||||
cpu_pmu->map_raw_event = armv8_pmuv3_map_raw_event;
|
||||
cpu_pmu->enable_user_access_pmu_regs =
|
||||
armv8pmu_enable_user_access_pmu_regs;
|
||||
cpu_pmu->disable_user_access_pmu_regs =
|
||||
armv8pmu_disable_user_access_pmu_regs;
|
||||
cpu_pmu->handler = &armv8pmu_handler;
|
||||
cpu_pmu->counter_mask_valid = &armv8pmu_counter_mask_valid;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@ -18,10 +18,9 @@
|
||||
#include <psci.h>
|
||||
#include <errno.h>
|
||||
#include <ihk/types.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <compiler.h>
|
||||
#include <lwk/compiler.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
//#define DEBUG_PRINT_PSCI
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
|
||||
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||
#include <errno.h>
|
||||
#include <debug-monitors.h>
|
||||
#include <hw_breakpoint.h>
|
||||
@ -11,7 +11,8 @@
|
||||
#include <hwcap.h>
|
||||
#include <string.h>
|
||||
#include <thread_info.h>
|
||||
#include <debug.h>
|
||||
#include <ptrace.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
//#define DEBUG_PRINT_SC
|
||||
|
||||
@ -25,37 +26,6 @@
|
||||
extern void save_debugreg(unsigned long *debugreg);
|
||||
extern int interrupt_from_user(void *);
|
||||
|
||||
enum aarch64_regset {
|
||||
REGSET_GPR,
|
||||
REGSET_FPR,
|
||||
REGSET_TLS,
|
||||
REGSET_HW_BREAK,
|
||||
REGSET_HW_WATCH,
|
||||
REGSET_SYSTEM_CALL,
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
REGSET_SVE,
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
};
|
||||
|
||||
struct user_regset;
|
||||
typedef long user_regset_get_fn(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
void *kbuf, void __user *ubuf);
|
||||
|
||||
typedef long user_regset_set_fn(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
unsigned int pos, unsigned int count,
|
||||
const void *kbuf, const void __user *ubuf);
|
||||
|
||||
struct user_regset {
|
||||
user_regset_get_fn *get;
|
||||
user_regset_set_fn *set;
|
||||
unsigned int n;
|
||||
unsigned int size;
|
||||
unsigned int core_note_type;
|
||||
};
|
||||
|
||||
long ptrace_read_user(struct thread *thread, long addr, unsigned long *value)
|
||||
{
|
||||
return -EIO;
|
||||
@ -273,6 +243,17 @@ static inline long copy_regset_from_user(struct thread *target,
|
||||
return regset->set(target, regset, offset, size, NULL, data);
|
||||
}
|
||||
|
||||
unsigned int regset_size(struct thread *target,
|
||||
const struct user_regset *regset)
|
||||
{
|
||||
if (!regset->get_size) {
|
||||
return regset->n * regset->size;
|
||||
}
|
||||
else {
|
||||
return regset->get_size(target, regset);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Bits which are always architecturally RES0 per ARM DDI 0487A.h
|
||||
* Userspace cannot use these until they have an architectural meaning.
|
||||
@ -624,6 +605,48 @@ out:
|
||||
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
|
||||
static void sve_init_header_from_thread(struct user_sve_header *header,
|
||||
struct thread *target)
|
||||
{
|
||||
unsigned int vq;
|
||||
|
||||
memset(header, 0, sizeof(*header));
|
||||
|
||||
/* McKernel processes always enable SVE. */
|
||||
header->flags = SVE_PT_REGS_SVE;
|
||||
|
||||
if (target->ctx.thread->sve_flags & SVE_PT_VL_INHERIT) {
|
||||
header->flags |= SVE_PT_VL_INHERIT;
|
||||
}
|
||||
|
||||
header->vl = target->ctx.thread->sve_vl;
|
||||
vq = sve_vq_from_vl(header->vl);
|
||||
|
||||
header->max_vl = sve_max_vl;
|
||||
header->size = SVE_PT_SIZE(vq, header->flags);
|
||||
header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
|
||||
SVE_PT_REGS_SVE);
|
||||
}
|
||||
|
||||
static unsigned int sve_size_from_header(struct user_sve_header const *header)
|
||||
{
|
||||
return ALIGN(header->size, SVE_VQ_BYTES);
|
||||
}
|
||||
|
||||
static unsigned int sve_get_size(struct thread *target,
|
||||
const struct user_regset *regset)
|
||||
{
|
||||
struct user_sve_header header;
|
||||
|
||||
/* Instead of system_supports_sve() */
|
||||
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
sve_init_header_from_thread(&header, target);
|
||||
return sve_size_from_header(&header);
|
||||
}
|
||||
|
||||
/* read NT_ARM_SVE */
|
||||
static long sve_get(struct thread *target,
|
||||
const struct user_regset *regset,
|
||||
@ -646,23 +669,9 @@ static long sve_get(struct thread *target,
|
||||
}
|
||||
|
||||
/* Header */
|
||||
memset(&header, 0, sizeof(header));
|
||||
|
||||
header.vl = target->ctx.thread->sve_vl;
|
||||
|
||||
BUG_ON(!sve_vl_valid(header.vl));
|
||||
sve_init_header_from_thread(&header, target);
|
||||
vq = sve_vq_from_vl(header.vl);
|
||||
|
||||
BUG_ON(!sve_vl_valid(sve_max_vl));
|
||||
header.max_vl = sve_max_vl;
|
||||
|
||||
/* McKernel processes always enable SVE. */
|
||||
header.flags = SVE_PT_REGS_SVE;
|
||||
|
||||
header.size = SVE_PT_SIZE(vq, header.flags);
|
||||
header.max_size = SVE_PT_SIZE(sve_vq_from_vl(header.max_vl),
|
||||
SVE_PT_REGS_SVE);
|
||||
|
||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
|
||||
0, sizeof(header));
|
||||
if (ret) {
|
||||
@ -676,11 +685,9 @@ static long sve_get(struct thread *target,
|
||||
*/
|
||||
|
||||
/* Otherwise: full SVE case */
|
||||
|
||||
start = SVE_PT_SVE_OFFSET;
|
||||
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
||||
|
||||
BUG_ON(end < start);
|
||||
BUG_ON(end - start > sve_state_size(target));
|
||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||
target->ctx.thread->sve_state,
|
||||
start, end);
|
||||
@ -690,24 +697,18 @@ static long sve_get(struct thread *target,
|
||||
|
||||
start = end;
|
||||
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
||||
|
||||
BUG_ON(end < start);
|
||||
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
||||
start, end);
|
||||
if (ret) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy fpsr, and fpcr which must follow contiguously in
|
||||
* struct fpsimd_state:
|
||||
*/
|
||||
start = end;
|
||||
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
||||
|
||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
|
||||
(char *)&target->fp_regs->fpsr);
|
||||
BUG_ON(end < start);
|
||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
|
||||
(char *)&target->fp_regs->fpsr !=
|
||||
end - start);
|
||||
|
||||
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
|
||||
&target->fp_regs->fpsr,
|
||||
start, end);
|
||||
@ -716,9 +717,7 @@ static long sve_get(struct thread *target,
|
||||
}
|
||||
|
||||
start = end;
|
||||
end = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16 * 16;
|
||||
|
||||
BUG_ON(end < start);
|
||||
end = sve_size_from_header(&header);
|
||||
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
|
||||
start, end);
|
||||
out:
|
||||
@ -762,13 +761,12 @@ static long sve_set(struct thread *target,
|
||||
* sve_set_vector_length(), which will also validate them for us:
|
||||
*/
|
||||
ret = sve_set_vector_length(target, header.vl,
|
||||
header.flags & ~SVE_PT_REGS_MASK);
|
||||
((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
|
||||
if (ret) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Actual VL set may be less than the user asked for: */
|
||||
BUG_ON(!sve_vl_valid(target->ctx.thread->sve_vl));
|
||||
vq = sve_vq_from_vl(target->ctx.thread->sve_vl);
|
||||
|
||||
/* Registers: FPSIMD-only case */
|
||||
@ -779,11 +777,19 @@ static long sve_set(struct thread *target,
|
||||
}
|
||||
|
||||
/* Otherwise: full SVE case */
|
||||
|
||||
/*
|
||||
* If setting a different VL from the requested VL and there is
|
||||
* register data, the data layout will be wrong: don't even
|
||||
* try to set the registers in this case.
|
||||
*/
|
||||
if (count && vq != sve_vq_from_vl(header.vl)) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
start = SVE_PT_SVE_OFFSET;
|
||||
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
|
||||
|
||||
BUG_ON(end < start);
|
||||
BUG_ON(end - start > sve_state_size(target));
|
||||
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
target->ctx.thread->sve_state,
|
||||
start, end);
|
||||
@ -793,27 +799,21 @@ static long sve_set(struct thread *target,
|
||||
|
||||
start = end;
|
||||
end = SVE_PT_SVE_FPSR_OFFSET(vq);
|
||||
|
||||
BUG_ON(end < start);
|
||||
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
|
||||
start, end);
|
||||
if (ret) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy fpsr, and fpcr which must follow contiguously in
|
||||
* struct fpsimd_state:
|
||||
*/
|
||||
start = end;
|
||||
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
|
||||
|
||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
|
||||
(char *)&target->fp_regs->fpsr);
|
||||
BUG_ON(end < start);
|
||||
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
|
||||
(char *)&target->fp_regs->fpsr !=
|
||||
end - start);
|
||||
|
||||
user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
&target->fp_regs->fpsr,
|
||||
start, end);
|
||||
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
|
||||
&target->fp_regs->fpsr,
|
||||
start, end);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
@ -825,8 +825,9 @@ static const struct user_regset aarch64_regsets[] = {
|
||||
.core_note_type = NT_PRSTATUS,
|
||||
.n = sizeof(struct user_pt_regs) / sizeof(uint64_t),
|
||||
.size = sizeof(uint64_t),
|
||||
.align = sizeof(uint64_t),
|
||||
.get = gpr_get,
|
||||
.set = gpr_set
|
||||
.set = gpr_set,
|
||||
},
|
||||
[REGSET_FPR] = {
|
||||
.core_note_type = NT_PRFPREG,
|
||||
@ -836,56 +837,75 @@ static const struct user_regset aarch64_regsets[] = {
|
||||
* fpcr are 32-bits wide.
|
||||
*/
|
||||
.size = sizeof(uint32_t),
|
||||
.align = sizeof(uint32_t),
|
||||
.get = fpr_get,
|
||||
.set = fpr_set
|
||||
.set = fpr_set,
|
||||
},
|
||||
[REGSET_TLS] = {
|
||||
.core_note_type = NT_ARM_TLS,
|
||||
.n = 1,
|
||||
.size = sizeof(void *),
|
||||
.align = sizeof(void *),
|
||||
.get = tls_get,
|
||||
.set = tls_set
|
||||
.set = tls_set,
|
||||
},
|
||||
[REGSET_HW_BREAK] = {
|
||||
.core_note_type = NT_ARM_HW_BREAK,
|
||||
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
||||
.size = sizeof(uint32_t),
|
||||
.align = sizeof(uint32_t),
|
||||
.get = hw_break_get,
|
||||
.set = hw_break_set
|
||||
.set = hw_break_set,
|
||||
},
|
||||
[REGSET_HW_WATCH] = {
|
||||
.core_note_type = NT_ARM_HW_WATCH,
|
||||
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
|
||||
.size = sizeof(uint32_t),
|
||||
.align = sizeof(uint32_t),
|
||||
.get = hw_break_get,
|
||||
.set = hw_break_set
|
||||
.set = hw_break_set,
|
||||
},
|
||||
[REGSET_SYSTEM_CALL] = {
|
||||
.core_note_type = NT_ARM_SYSTEM_CALL,
|
||||
.n = 1,
|
||||
.size = sizeof(int),
|
||||
.align = sizeof(int),
|
||||
.get = system_call_get,
|
||||
.set = system_call_set
|
||||
.set = system_call_set,
|
||||
},
|
||||
#ifdef CONFIG_ARM64_SVE
|
||||
[REGSET_SVE] = { /* Scalable Vector Extension */
|
||||
.core_note_type = NT_ARM_SVE,
|
||||
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16,
|
||||
.size = 16,
|
||||
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) +
|
||||
(SVE_VQ_BYTES - 1)) / SVE_VQ_BYTES,
|
||||
.size = SVE_VQ_BYTES,
|
||||
.align = SVE_VQ_BYTES,
|
||||
.get = sve_get,
|
||||
.set = sve_set
|
||||
.set = sve_set,
|
||||
.get_size = sve_get_size,
|
||||
},
|
||||
#endif /* CONFIG_ARM64_SVE */
|
||||
};
|
||||
|
||||
static const struct user_regset *
|
||||
find_regset(const struct user_regset *regset, unsigned int type, int n)
|
||||
static const struct user_regset_view user_aarch64_view = {
|
||||
.name = "aarch64", .e_machine = EM_AARCH64,
|
||||
.regsets = aarch64_regsets,
|
||||
.n = sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0])
|
||||
};
|
||||
|
||||
const struct user_regset_view *current_user_regset_view(void)
|
||||
{
|
||||
return &user_aarch64_view;
|
||||
}
|
||||
|
||||
const struct user_regset *find_regset(const struct user_regset_view *view,
|
||||
unsigned int type)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
if (regset[i].core_note_type == type) {
|
||||
return ®set[i];
|
||||
for (i = 0; i < view->n; i++) {
|
||||
if (view->regsets[i].core_note_type == type) {
|
||||
return &view->regsets[i];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
@ -894,8 +914,8 @@ find_regset(const struct user_regset *regset, unsigned int type, int n)
|
||||
static long ptrace_regset(struct thread *thread, int req, long type, struct iovec *iov)
|
||||
{
|
||||
long rc = -EINVAL;
|
||||
const struct user_regset *regset = find_regset(aarch64_regsets, type,
|
||||
sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0]));
|
||||
const struct user_regset *regset =
|
||||
find_regset(&user_aarch64_view, type);
|
||||
|
||||
if (!regset) {
|
||||
kprintf("%s: not supported type 0x%x\n", __FUNCTION__, type);
|
||||
@ -944,6 +964,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
||||
/* save thread_info, if called by ptrace_report_exec() */
|
||||
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
||||
memcpy(&tinfo, thread->ctx.thread, sizeof(struct thread_info));
|
||||
thread->uctx->user_regs.regs[0] = 0;
|
||||
}
|
||||
|
||||
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
|
||||
@ -956,6 +977,13 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
||||
thread->exit_status = sig;
|
||||
thread->status = PS_TRACED;
|
||||
thread->ptrace &= ~PT_TRACE_SYSCALL;
|
||||
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8))) &&
|
||||
thread->ptrace & PTRACE_O_TRACEEXEC) {
|
||||
/* PTRACE_O_TRACEEXEC: since Linux 3.0, the former
|
||||
* thread ID can be retrieved with PTRACE_GETEVENTMSG.
|
||||
* Report no change. */
|
||||
thread->ptrace_eventmsg = thread->tid;
|
||||
}
|
||||
save_debugreg(thread->ptrace_debugreg);
|
||||
if (sig == SIGSTOP || sig == SIGTSTP ||
|
||||
sig == SIGTTIN || sig == SIGTTOU) {
|
||||
@ -991,6 +1019,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
|
||||
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
|
||||
memcpy(thread->ctx.thread, &tinfo, sizeof(struct thread_info));
|
||||
}
|
||||
arch_flush_icache_all();
|
||||
}
|
||||
|
||||
long
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
|
||||
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
|
||||
#include <cpulocal.h>
|
||||
#include <string.h>
|
||||
#include <kmalloc.h>
|
||||
@ -15,7 +15,8 @@
|
||||
#include <limits.h>
|
||||
#include <uio.h>
|
||||
#include <syscall.h>
|
||||
#include <debug.h>
|
||||
#include <rusage_private.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
void terminate_mcexec(int, int);
|
||||
extern void ptrace_report_signal(struct thread *thread, int sig);
|
||||
@ -42,7 +43,7 @@ uintptr_t debug_constants[] = {
|
||||
offsetof(struct cpu_local_var, runq),
|
||||
offsetof(struct cpu_local_var, status),
|
||||
offsetof(struct cpu_local_var, idle),
|
||||
offsetof(struct thread, ctx) + offsetof(struct thread_info, cpu_context),
|
||||
offsetof(struct thread, ctx),
|
||||
offsetof(struct thread, sched_list),
|
||||
offsetof(struct thread, proc),
|
||||
offsetof(struct thread, status),
|
||||
@ -56,13 +57,34 @@ extern int num_processors;
|
||||
int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
|
||||
{
|
||||
int min_queue_len = -1;
|
||||
int cpu, min_cpu = -1, uti_cpu = -1;
|
||||
unsigned long irqstate;
|
||||
int cpu, min_cpu = -1;
|
||||
#if 0
|
||||
int uti_cpu = -1;
|
||||
#endif
|
||||
unsigned long irqstate = 0;
|
||||
|
||||
irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
|
||||
int start, end, step;
|
||||
|
||||
if (use_last) {
|
||||
start = num_processors - 1;
|
||||
end = -1;
|
||||
step = -1;
|
||||
}
|
||||
else {
|
||||
start = 0;
|
||||
end = num_processors;
|
||||
step = 1;
|
||||
}
|
||||
|
||||
if (!cpu_local_var(current)->proc->nr_processes) {
|
||||
irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
|
||||
}
|
||||
else {
|
||||
irqstate = cpu_disable_interrupt_save();
|
||||
}
|
||||
|
||||
/* Find the first allowed core with the shortest run queue */
|
||||
for (cpu = 0; cpu < num_processors; ++cpu) {
|
||||
for (cpu = start; cpu != end; cpu += step) {
|
||||
struct cpu_local_var *v;
|
||||
|
||||
if (!CPU_ISSET(cpu, cpu_set))
|
||||
@ -73,11 +95,14 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
|
||||
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d\n",
|
||||
__func__, cpu, v->runq_len, v->runq_reserved);
|
||||
if (min_queue_len == -1 ||
|
||||
v->runq_len + v->runq_reserved < min_queue_len) {
|
||||
min_queue_len = v->runq_len + v->runq_reserved;
|
||||
//v->runq_len + v->runq_reserved < min_queue_len) {
|
||||
v->runq_len < min_queue_len) {
|
||||
//min_queue_len = v->runq_len + v->runq_reserved;
|
||||
min_queue_len = v->runq_len;
|
||||
min_cpu = cpu;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* Record the last tie CPU */
|
||||
if (min_cpu != cpu &&
|
||||
v->runq_len + v->runq_reserved == min_queue_len) {
|
||||
@ -86,14 +111,15 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
|
||||
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d,min_cpu=%d,uti_cpu=%d\n",
|
||||
__func__, cpu, v->runq_len, v->runq_reserved,
|
||||
min_cpu, uti_cpu);
|
||||
#else
|
||||
|
||||
ihk_mc_spinlock_unlock_noirq(&v->runq_lock);
|
||||
#if 0
|
||||
if (min_queue_len == 0)
|
||||
break;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
min_cpu = use_last ? uti_cpu : min_cpu;
|
||||
if (min_cpu != -1) {
|
||||
if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
|
||||
@ -102,7 +128,16 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
|
||||
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved,
|
||||
1);
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
|
||||
#else
|
||||
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved, 1);
|
||||
#endif
|
||||
|
||||
if (!cpu_local_var(current)->proc->nr_processes) {
|
||||
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
|
||||
}
|
||||
else {
|
||||
cpu_restore_interrupt(irqstate);
|
||||
}
|
||||
|
||||
return min_cpu;
|
||||
}
|
||||
@ -114,7 +149,7 @@ arch_clear_host_user_space()
|
||||
|
||||
/* XXX: might be unnecessary */
|
||||
clear_host_pte(th->vm->region.user_start,
|
||||
(th->vm->region.user_end - th->vm->region.user_start));
|
||||
(th->vm->region.user_end - th->vm->region.user_start), 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -126,10 +161,18 @@ extern unsigned long do_fork(int clone_flags, unsigned long newsp,
|
||||
|
||||
SYSCALL_DECLARE(clone)
|
||||
{
|
||||
struct process *proc = cpu_local_var(current)->proc;
|
||||
struct mcs_rwlock_node_irqsave lock_dump;
|
||||
unsigned long ret;
|
||||
|
||||
/* mutex coredump */
|
||||
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
|
||||
|
||||
if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
|
||||
return do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0, ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
|
||||
ret = do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0,
|
||||
ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
|
||||
} else {
|
||||
return do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
|
||||
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
|
||||
ihk_mc_syscall_arg1(ctx), /* newsp */
|
||||
ihk_mc_syscall_arg2(ctx), /* parent_tidptr */
|
||||
ihk_mc_syscall_arg4(ctx), /* child_tidptr (swap arg3) */
|
||||
@ -137,6 +180,9 @@ SYSCALL_DECLARE(clone)
|
||||
ihk_mc_syscall_pc(ctx), /* curpc */
|
||||
ihk_mc_syscall_sp(ctx)); /* cursp */
|
||||
}
|
||||
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(rt_sigaction)
|
||||
@ -178,11 +224,10 @@ SYSCALL_DECLARE(prctl)
|
||||
|
||||
switch (option) {
|
||||
case PR_SVE_SET_VL:
|
||||
error = SVE_SET_VL(cpu_local_var(current),
|
||||
ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx));
|
||||
error = SVE_SET_VL(ihk_mc_syscall_arg1(ctx));
|
||||
break;
|
||||
case PR_SVE_GET_VL:
|
||||
error = SVE_GET_VL(cpu_local_var(current));
|
||||
error = SVE_GET_VL();
|
||||
break;
|
||||
case PR_SET_THP_DISABLE:
|
||||
if (arg3 || arg4 || arg5) {
|
||||
@ -657,7 +702,7 @@ void set_single_step(struct thread *thread)
|
||||
set_regs_spsr_ss(thread->uctx);
|
||||
}
|
||||
|
||||
extern void coredump(struct thread *thread, void *regs);
|
||||
extern int coredump(struct thread *thread, void *regs, int sig);
|
||||
|
||||
static int
|
||||
isrestart(int syscallno, unsigned long rc, int sig, int restart)
|
||||
@ -1096,6 +1141,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
int restart = 0;
|
||||
int ret;
|
||||
|
||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||
@ -1270,15 +1316,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
dkprintf("SIGTRAP(): woken up\n");
|
||||
break;
|
||||
case SIGCONT:
|
||||
memset(&info, '\0', sizeof info);
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
|
||||
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
|
||||
proc->status = PS_RUNNING;
|
||||
dkprintf("do_signal,SIGCONT,do nothing\n");
|
||||
break;
|
||||
case SIGQUIT:
|
||||
case SIGILL:
|
||||
@ -1290,9 +1327,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
case SIGXCPU:
|
||||
case SIGXFSZ:
|
||||
core:
|
||||
dkprintf("do_signal,default,core,sig=%d\n", sig);
|
||||
coredump(thread, regs);
|
||||
coredumped = 0x80;
|
||||
thread->coredump_regs =
|
||||
kmalloc(sizeof(struct pt_regs),
|
||||
IHK_MC_AP_NOWAIT);
|
||||
if (!thread->coredump_regs) {
|
||||
kprintf("%s: Out of memory\n", __func__);
|
||||
goto skip;
|
||||
}
|
||||
memcpy(thread->coredump_regs, regs,
|
||||
sizeof(struct pt_regs));
|
||||
|
||||
ret = coredump(thread, regs, sig);
|
||||
switch (ret) {
|
||||
case -EBUSY:
|
||||
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
|
||||
__func__);
|
||||
break;
|
||||
case 0:
|
||||
coredumped = 0x80;
|
||||
break;
|
||||
default:
|
||||
kprintf("%s: ERROR: coredump failed (%d)\n",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
terminate(0, sig | coredumped);
|
||||
break;
|
||||
case SIGCHLD:
|
||||
@ -1422,7 +1481,9 @@ __check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
|
||||
|
||||
if(thread == NULL || thread->proc->pid == 0){
|
||||
struct thread *t;
|
||||
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
|
||||
|
||||
irqstate = cpu_disable_interrupt_save();
|
||||
ihk_mc_spinlock_lock_noirq(&(cpu_local_var(runq_lock)));
|
||||
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
||||
if(t->proc->pid <= 0)
|
||||
continue;
|
||||
@ -1432,7 +1493,8 @@ __check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
|
||||
break;
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
||||
ihk_mc_spinlock_unlock_noirq(&(cpu_local_var(runq_lock)));
|
||||
cpu_restore_interrupt(irqstate);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1498,7 +1560,9 @@ check_sig_pending_thread(struct thread *thread)
|
||||
sig++, x >>= 1)
|
||||
;
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGCHLD && sig != SIGURG) ||
|
||||
if ((sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
sig != SIGCONT) ||
|
||||
(k->sa.sa_handler != SIG_IGN &&
|
||||
k->sa.sa_handler != NULL)) {
|
||||
if (!(pending->sigmask.__val[0] & w)) {
|
||||
@ -1507,6 +1571,7 @@ check_sig_pending_thread(struct thread *thread)
|
||||
found = 1;
|
||||
if (sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
sig != SIGCONT &&
|
||||
!k->sa.sa_handler) {
|
||||
found = 2;
|
||||
break;
|
||||
@ -1590,7 +1655,6 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
|
||||
struct list_head *head = NULL;
|
||||
int rc;
|
||||
unsigned long irqstate = 0;
|
||||
struct k_sigaction *k;
|
||||
int doint;
|
||||
int found = 0;
|
||||
siginfo_t info0;
|
||||
@ -1600,6 +1664,7 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
|
||||
struct process_hash *phash = rset->process_hash;
|
||||
struct mcs_rwlock_node lock;
|
||||
struct mcs_rwlock_node updatelock;
|
||||
struct sig_pending *pending = NULL;
|
||||
|
||||
if(sig > SIGRTMAX || sig < 0)
|
||||
return -EINVAL;
|
||||
@ -1786,47 +1851,61 @@ done:
|
||||
|
||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||
|
||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
||||
because target ptraced thread must call ptrace_report_signal
|
||||
in check_signal */
|
||||
rc = 0;
|
||||
k = tthread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
|
||||
(k->sa.sa_handler != SIG_IGN &&
|
||||
(k->sa.sa_handler != NULL ||
|
||||
(sig != SIGCHLD && sig != SIGURG)))) {
|
||||
struct sig_pending *pending = NULL;
|
||||
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list){
|
||||
if(pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(&pending->list == head)
|
||||
pending = NULL;
|
||||
|
||||
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list) {
|
||||
if (pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(pending == NULL){
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if(!pending){
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else{
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if(sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
if (&pending->list == head)
|
||||
pending = NULL;
|
||||
}
|
||||
if (pending == NULL) {
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if (!pending) {
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else {
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if (sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
}
|
||||
|
||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||
cpu_restore_interrupt(irqstate);
|
||||
|
||||
if (sig == SIGCONT || ptracecont == 1) {
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
|
||||
struct siginfo info;
|
||||
|
||||
tthread->proc->main_thread->signal_flags =
|
||||
SIGNAL_STOP_CONTINUED;
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
memset(&info, '\0', sizeof(info));
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = tthread->proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(tthread, tthread->proc->parent->pid, -1,
|
||||
SIGCHLD, &info, 0);
|
||||
if (thread != tthread) {
|
||||
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||
ihk_mc_get_vector(IHK_GV_IKC));
|
||||
}
|
||||
doint = 0;
|
||||
}
|
||||
}
|
||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||
int status = tthread->status;
|
||||
|
||||
@ -1841,11 +1920,6 @@ done:
|
||||
/* Wake up the target only when stopped by ptrace-reporting */
|
||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||
}
|
||||
else if(sig == SIGCONT || ptracecont == 1){
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
}
|
||||
else {
|
||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||
}
|
||||
@ -1870,7 +1944,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
|
||||
}
|
||||
|
||||
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
||||
coredump(thread, regs0);
|
||||
coredump(thread, regs0, sig);
|
||||
terminate(0, sig | 0x80);
|
||||
}
|
||||
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
||||
@ -1900,7 +1974,7 @@ SYSCALL_DECLARE(mmap)
|
||||
;
|
||||
|
||||
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
||||
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
const int prot = ihk_mc_syscall_arg2(ctx);
|
||||
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
||||
const int fd = ihk_mc_syscall_arg4(ctx);
|
||||
@ -1941,7 +2015,8 @@ SYSCALL_DECLARE(mmap)
|
||||
|
||||
if (hugeshift == 0) {
|
||||
/* default hugepage size */
|
||||
flags |= MAP_HUGE_SECOND_BLOCK;
|
||||
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
} else if ((first_level_block_support &&
|
||||
hugeshift == MAP_HUGE_FIRST_BLOCK) ||
|
||||
(first_level_block_support &&
|
||||
@ -1958,6 +2033,14 @@ SYSCALL_DECLARE(mmap)
|
||||
goto out;
|
||||
}
|
||||
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
||||
/* Round-up map length by pagesize */
|
||||
len0 = ALIGN(len0, pgsize);
|
||||
|
||||
if (rusage_check_overmap(len0,
|
||||
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
||||
@ -2018,7 +2101,8 @@ SYSCALL_DECLARE(shmget)
|
||||
|
||||
if (hugeshift == 0) {
|
||||
/* default hugepage size */
|
||||
shmflg |= SHM_HUGE_SECOND_BLOCK;
|
||||
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
} else if ((first_level_block_support &&
|
||||
hugeshift == SHM_HUGE_FIRST_BLOCK) ||
|
||||
(first_level_block_support &&
|
||||
@ -2082,11 +2166,13 @@ int do_process_vm_read_writev(int pid,
|
||||
struct process *rproc;
|
||||
struct process *lproc = lthread->proc;
|
||||
struct process_vm *rvm = NULL;
|
||||
unsigned long rphys;
|
||||
unsigned long rpage_left;
|
||||
unsigned long psize;
|
||||
void *rva;
|
||||
unsigned long lphys, rphys;
|
||||
unsigned long lpage_left, rpage_left;
|
||||
unsigned long lpsize, rpsize;
|
||||
void *rva, *lva;
|
||||
#if 0
|
||||
struct vm_range *range;
|
||||
#endif
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
struct mcs_rwlock_node update_lock;
|
||||
|
||||
@ -2099,8 +2185,9 @@ int do_process_vm_read_writev(int pid,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* Check if parameters are okay */
|
||||
ihk_mc_spinlock_lock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
range = lookup_process_memory_range(lthread->vm,
|
||||
(uintptr_t)local_iov,
|
||||
@ -2122,11 +2209,12 @@ int do_process_vm_read_writev(int pid,
|
||||
|
||||
ret = 0;
|
||||
arg_out:
|
||||
ihk_mc_spinlock_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (li = 0; li < liovcnt; ++li) {
|
||||
llen += local_iov[li].iov_len;
|
||||
@ -2191,7 +2279,7 @@ arg_out:
|
||||
if (pli != li) {
|
||||
struct vm_range *range;
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
/* Is base valid? */
|
||||
range = lookup_process_memory_range(lthread->vm,
|
||||
@ -2221,7 +2309,7 @@ arg_out:
|
||||
|
||||
ret = 0;
|
||||
pli_out:
|
||||
ihk_mc_spinlock_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
@ -2234,7 +2322,7 @@ pli_out:
|
||||
if (pri != ri) {
|
||||
struct vm_range *range;
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);
|
||||
|
||||
/* Is base valid? */
|
||||
range = lookup_process_memory_range(rvm,
|
||||
@ -2264,7 +2352,7 @@ pli_out:
|
||||
|
||||
ret = 0;
|
||||
pri_out:
|
||||
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
@ -2279,10 +2367,53 @@ pri_out:
|
||||
to_copy = remote_iov[ri].iov_len - roff;
|
||||
}
|
||||
|
||||
retry_lookup:
|
||||
retry_llookup:
|
||||
/* Figure out local physical */
|
||||
/* TODO: remember page and do this only if necessary */
|
||||
ret = ihk_mc_pt_virt_to_phys_size(lthread->vm->address_space->page_table,
|
||||
local_iov[li].iov_base + loff, &lphys, &lpsize);
|
||||
|
||||
if (ret) {
|
||||
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
|
||||
void *addr;
|
||||
|
||||
if (faulted) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Fault in pages */
|
||||
for (addr = (void *)
|
||||
(((unsigned long)local_iov[li].iov_base + loff)
|
||||
& PAGE_MASK);
|
||||
addr < (local_iov[li].iov_base + loff + to_copy);
|
||||
addr += PAGE_SIZE) {
|
||||
|
||||
ret = page_fault_process_vm(lthread->vm, addr, reason);
|
||||
if (ret) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
faulted = 1;
|
||||
goto retry_llookup;
|
||||
}
|
||||
|
||||
lpage_left = ((((unsigned long)local_iov[li].iov_base + loff +
|
||||
lpsize) & ~(lpsize - 1)) -
|
||||
((unsigned long)local_iov[li].iov_base + loff));
|
||||
if (lpage_left < to_copy) {
|
||||
to_copy = lpage_left;
|
||||
}
|
||||
|
||||
lva = phys_to_virt(lphys);
|
||||
|
||||
retry_rlookup:
|
||||
/* Figure out remote physical */
|
||||
/* TODO: remember page and do this only if necessary */
|
||||
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
|
||||
remote_iov[ri].iov_base + roff, &rphys, &psize);
|
||||
remote_iov[ri].iov_base + roff, &rphys, &rpsize);
|
||||
|
||||
if (ret) {
|
||||
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
|
||||
@ -2308,11 +2439,11 @@ retry_lookup:
|
||||
}
|
||||
|
||||
faulted = 1;
|
||||
goto retry_lookup;
|
||||
goto retry_rlookup;
|
||||
}
|
||||
|
||||
rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
|
||||
psize) & ~(psize - 1)) -
|
||||
rpsize) & ~(rpsize - 1)) -
|
||||
((unsigned long)remote_iov[ri].iov_base + roff));
|
||||
if (rpage_left < to_copy) {
|
||||
to_copy = rpage_left;
|
||||
@ -2321,16 +2452,16 @@ retry_lookup:
|
||||
rva = phys_to_virt(rphys);
|
||||
|
||||
fast_memcpy(
|
||||
(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
|
||||
(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
|
||||
(op == PROCESS_VM_READ) ? lva : rva,
|
||||
(op == PROCESS_VM_READ) ? rva : lva,
|
||||
to_copy);
|
||||
|
||||
copied += to_copy;
|
||||
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
|
||||
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, rpsize: %lu, rpage_left: %lu\n",
|
||||
li, local_iov[li].iov_base + loff,
|
||||
(op == PROCESS_VM_READ) ? "<-" : "->",
|
||||
ri, remote_iov[ri].iov_base + roff, to_copy,
|
||||
psize, rpage_left);
|
||||
rpsize, rpage_left);
|
||||
|
||||
loff += to_copy;
|
||||
roff += to_copy;
|
||||
@ -2700,4 +2831,48 @@ SYSCALL_DECLARE(time)
|
||||
return time();
|
||||
}
|
||||
|
||||
void calculate_time_from_tsc(struct timespec *ts)
|
||||
{
|
||||
long ver;
|
||||
unsigned long current_tsc;
|
||||
time_t sec_delta;
|
||||
long ns_delta;
|
||||
|
||||
for (;;) {
|
||||
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
|
||||
/* settimeofday() is in progress */
|
||||
cpu_pause();
|
||||
}
|
||||
rmb(); /* fetch version before time */
|
||||
*ts = tod_data.origin;
|
||||
rmb(); /* fetch time before checking version */
|
||||
if (ver == ihk_atomic64_read(&tod_data.version)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* settimeofday() has intervened */
|
||||
cpu_pause();
|
||||
}
|
||||
|
||||
current_tsc = rdtsc();
|
||||
sec_delta = current_tsc / tod_data.clocks_per_sec;
|
||||
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
|
||||
/ tod_data.clocks_per_sec;
|
||||
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
|
||||
|
||||
ts->tv_sec += sec_delta;
|
||||
ts->tv_nsec += ns_delta;
|
||||
if (ts->tv_nsec >= NS_PER_SEC) {
|
||||
ts->tv_nsec -= NS_PER_SEC;
|
||||
++ts->tv_sec;
|
||||
}
|
||||
}
|
||||
|
||||
extern void ptrace_syscall_event(struct thread *thread);
|
||||
long arch_ptrace_syscall_event(struct thread *thread,
|
||||
ihk_mc_user_context_t *ctx, long setret)
|
||||
{
|
||||
ptrace_syscall_event(thread);
|
||||
return setret;
|
||||
}
|
||||
/*** End of File ***/
|
||||
|
||||
@ -8,7 +8,7 @@
|
||||
#include <cputype.h>
|
||||
#include <irq.h>
|
||||
#include <arch-timer.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
//#define DEBUG_PRINT_TIMER
|
||||
|
||||
@ -111,6 +111,8 @@ static void timer_handler(void *priv)
|
||||
/* set timer re-enable for periodic */
|
||||
arch_timer_reg_write(ARCH_TIMER_REG_TVAL, clocks);
|
||||
arch_timer_reg_write(ARCH_TIMER_REG_CTRL, ctrl);
|
||||
|
||||
do_backlog();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -11,10 +11,9 @@
|
||||
#include <process.h>
|
||||
#include <string.h>
|
||||
#include <syscall.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ikc/queue.h>
|
||||
#include <vdso.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
//#define DEBUG_PRINT_VDSO
|
||||
|
||||
@ -23,7 +22,6 @@
|
||||
#define DDEBUG_DEFAULT DDEBUG_PRINT
|
||||
#endif
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 1
|
||||
struct vdso {
|
||||
long busy;
|
||||
@ -34,7 +32,6 @@ struct vdso {
|
||||
long lbase;
|
||||
long offset_sigtramp;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
extern char vdso_start, vdso_end;
|
||||
static struct vdso vdso;
|
||||
@ -90,6 +87,7 @@ int arch_setup_vdso(void)
|
||||
}
|
||||
|
||||
panic("Only support host mapping vDSO");
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int get_free_area(struct process_vm *vm, size_t len, intptr_t hint,
|
||||
|
||||
@ -18,7 +18,7 @@ extern char data_start[], data_end[];
|
||||
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
|
||||
|
||||
#define MAP_ST_START 0xffff800000000000UL
|
||||
#define MAP_KERNEL_START 0xffffffff80000000UL
|
||||
/* MAP_KERNEL_START is defined by cmake */
|
||||
|
||||
#define PTL4_SHIFT 39
|
||||
#define PTL3_SHIFT 30
|
||||
|
||||
@ -1,8 +1,9 @@
|
||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018 */
|
||||
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
|
||||
#include <process.h>
|
||||
#include <elfcore.h>
|
||||
|
||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
|
||||
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
|
||||
struct thread *thread, void *regs0, int sig)
|
||||
{
|
||||
struct x86_user_context *uctx = regs0;
|
||||
struct x86_basic_regs *regs = &uctx->gpr;
|
||||
@ -18,8 +19,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
||||
short int pr_cursig;
|
||||
a8_uint64_t pr_sigpend;
|
||||
a8_uint64_t pr_sighold;
|
||||
pid_t pr_pid;
|
||||
pid_t pr_ppid;
|
||||
pid_t pr_pgrp;
|
||||
pid_t pr_sid;
|
||||
struct prstatus64_timeval pr_utime;
|
||||
@ -28,6 +27,14 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
||||
struct prstatus64_timeval pr_cstime;
|
||||
*/
|
||||
|
||||
prstatus->pr_pid = thread->tid;
|
||||
if (thread->proc->parent) {
|
||||
prstatus->pr_ppid = thread->proc->parent->pid;
|
||||
}
|
||||
|
||||
prstatus->pr_info.si_signo = sig;
|
||||
prstatus->pr_cursig = sig;
|
||||
|
||||
prstatus->pr_reg[0] = _r15;
|
||||
prstatus->pr_reg[1] = _r14;
|
||||
prstatus->pr_reg[2] = _r13;
|
||||
@ -55,3 +62,13 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
|
||||
|
||||
prstatus->pr_fpvalid = 0; /* We assume no fp */
|
||||
}
|
||||
|
||||
void arch_fill_thread_core_info(struct note *head,
|
||||
struct thread *thread, void *regs)
|
||||
{
|
||||
}
|
||||
|
||||
int arch_get_thread_core_info_size(void)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018 */
|
||||
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
|
||||
/**
|
||||
* \file cpu.c
|
||||
* License details are found in the file LICENSE.
|
||||
@ -16,7 +16,6 @@
|
||||
*/
|
||||
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <types.h>
|
||||
#include <errno.h>
|
||||
@ -32,7 +31,7 @@
|
||||
#include <prctl.h>
|
||||
#include <page.h>
|
||||
#include <kmalloc.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
#define LAPIC_ID 0x020
|
||||
#define LAPIC_TIMER 0x320
|
||||
@ -45,11 +44,9 @@
|
||||
#define LAPIC_ICR0 0x300
|
||||
#define LAPIC_ICR2 0x310
|
||||
#define LAPIC_ESR 0x280
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_75 /* x86 depend hide */
|
||||
#define LOCAL_TIMER_VECTOR 0xef
|
||||
#define LOCAL_PERF_VECTOR 0xf0
|
||||
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_75 */
|
||||
|
||||
#define APIC_INT_LEVELTRIG 0x08000
|
||||
#define APIC_INT_ASSERT 0x04000
|
||||
@ -148,7 +145,7 @@ void reload_idt(void)
|
||||
}
|
||||
|
||||
static struct list_head handlers[256 - 32];
|
||||
extern char nmi[];
|
||||
extern char nmi_handler[];
|
||||
extern char page_fault[], general_protection_exception[];
|
||||
extern char debug_exception[], int3_exception[];
|
||||
|
||||
@ -175,7 +172,7 @@ static void init_idt(void)
|
||||
set_idt_entry(i, generic_common_handlers[i]);
|
||||
}
|
||||
|
||||
set_idt_entry(2, (uintptr_t)nmi);
|
||||
set_idt_entry(2, (uintptr_t)nmi_handler);
|
||||
set_idt_entry(13, (unsigned long)general_protection_exception);
|
||||
set_idt_entry(14, (unsigned long)page_fault);
|
||||
|
||||
@ -955,6 +952,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
|
||||
v->flags |= CPU_FLAG_NEED_RESCHED;
|
||||
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
|
||||
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
|
||||
|
||||
do_backlog();
|
||||
}
|
||||
else if (vector == LOCAL_PERF_VECTOR) {
|
||||
struct siginfo info;
|
||||
@ -1206,6 +1205,15 @@ unsigned long cpu_disable_interrupt_save(void)
|
||||
return flags;
|
||||
}
|
||||
|
||||
unsigned long cpu_enable_interrupt_save(void)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
asm volatile("pushf; pop %0; sti" : "=r"(flags) : : "memory", "cc");
|
||||
|
||||
return flags;
|
||||
}
|
||||
|
||||
/*@
|
||||
@ behavior valid_vector:
|
||||
@ assumes 32 <= vector <= 255;
|
||||
@ -1602,14 +1610,18 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
|
||||
}
|
||||
|
||||
/*@
|
||||
@ requires \valid_apicid(cpu); // valid APIC ID or not
|
||||
@ requires \valid_cpuid(cpu); // valid CPU logical ID
|
||||
@ ensures \result == 0
|
||||
@*/
|
||||
int ihk_mc_interrupt_cpu(int cpu, int vector)
|
||||
{
|
||||
if (cpu < 0 || cpu >= num_processors) {
|
||||
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
|
||||
return -1;
|
||||
}
|
||||
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
|
||||
|
||||
x86_issue_ipi(cpu, vector);
|
||||
x86_issue_ipi(get_x86_cpu_local_variable(cpu)->apic_id, vector);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1624,6 +1636,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
|
||||
/* Set up new TLS.. */
|
||||
ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base);
|
||||
|
||||
#ifdef ENABLE_PERF
|
||||
/* Performance monitoring inherit */
|
||||
if(next->proc->monitoring_event) {
|
||||
if(next->proc->perf_status == PP_RESET)
|
||||
@ -1633,6 +1646,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
|
||||
perf_start(next->proc->monitoring_event);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PROFILE_ENABLE
|
||||
if (prev && prev->profile && prev->profile_start_ts != 0) {
|
||||
@ -1708,7 +1722,7 @@ check_and_allocate_fp_regs(struct thread *thread)
|
||||
|
||||
if (!thread->fp_regs) {
|
||||
kprintf("error: allocating fp_regs pages\n");
|
||||
result = 1;
|
||||
result = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1721,12 +1735,14 @@ out:
|
||||
/*@
|
||||
@ requires \valid(thread);
|
||||
@*/
|
||||
void
|
||||
int
|
||||
save_fp_regs(struct thread *thread)
|
||||
{
|
||||
if (check_and_allocate_fp_regs(thread) != 0) {
|
||||
// alloc error
|
||||
return;
|
||||
int ret = 0;
|
||||
|
||||
ret = check_and_allocate_fp_regs(thread);
|
||||
if (ret) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (xsave_available) {
|
||||
@ -1741,13 +1757,23 @@ save_fp_regs(struct thread *thread)
|
||||
|
||||
dkprintf("fp_regs for TID %d saved\n", thread->tid);
|
||||
}
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void copy_fp_regs(struct thread *from, struct thread *to)
|
||||
int copy_fp_regs(struct thread *from, struct thread *to)
|
||||
{
|
||||
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
|
||||
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
|
||||
int ret = 0;
|
||||
|
||||
if (from->fp_regs != NULL) {
|
||||
ret = check_and_allocate_fp_regs(to);
|
||||
if (!ret) {
|
||||
memcpy(to->fp_regs,
|
||||
from->fp_regs,
|
||||
sizeof(fp_regs_struct));
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*@
|
||||
@ -1820,6 +1846,10 @@ ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
|
||||
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
|
||||
}
|
||||
|
||||
void arch_flush_icache_all(void)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
/*@
|
||||
@ assigns \nothing;
|
||||
@ -1973,6 +2003,92 @@ mod_nmi_ctx(void *nmi_ctx, void (*func)())
|
||||
l[i++] = 0x28; // KERNEL DS
|
||||
}
|
||||
|
||||
void arch_save_panic_regs(void *irq_regs)
|
||||
{
|
||||
struct thread *current = cpu_local_var(current);
|
||||
struct x86_user_context *regs =
|
||||
(struct x86_user_context *)irq_regs;
|
||||
struct x86_cpu_local_variables *x86v =
|
||||
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
|
||||
struct segment_regs {
|
||||
uint32_t rflags;
|
||||
uint32_t cs;
|
||||
uint32_t ss;
|
||||
uint32_t ds;
|
||||
uint32_t es;
|
||||
uint32_t fs;
|
||||
uint32_t gs;
|
||||
} *sregs;
|
||||
|
||||
/* Kernel space? */
|
||||
if (regs->gpr.rip > USER_END) {
|
||||
x86v->panic_regs[0] = regs->gpr.rax;
|
||||
x86v->panic_regs[1] = regs->gpr.rbx;
|
||||
x86v->panic_regs[2] = regs->gpr.rcx;
|
||||
x86v->panic_regs[3] = regs->gpr.rdx;
|
||||
x86v->panic_regs[4] = regs->gpr.rsi;
|
||||
x86v->panic_regs[5] = regs->gpr.rdi;
|
||||
x86v->panic_regs[6] = regs->gpr.rbp;
|
||||
x86v->panic_regs[7] = regs->gpr.rsp;
|
||||
x86v->panic_regs[8] = regs->gpr.r8;
|
||||
x86v->panic_regs[9] = regs->gpr.r9;
|
||||
x86v->panic_regs[10] = regs->gpr.r10;
|
||||
x86v->panic_regs[11] = regs->gpr.r11;
|
||||
x86v->panic_regs[12] = regs->gpr.r12;
|
||||
x86v->panic_regs[13] = regs->gpr.r13;
|
||||
x86v->panic_regs[14] = regs->gpr.r14;
|
||||
x86v->panic_regs[15] = regs->gpr.r15;
|
||||
x86v->panic_regs[16] = regs->gpr.rip;
|
||||
sregs = (struct segment_regs *)&x86v->panic_regs[17];
|
||||
sregs->rflags = regs->gpr.rflags;
|
||||
sregs->cs = regs->gpr.cs;
|
||||
sregs->ss = regs->gpr.ss;
|
||||
sregs->ds = regs->sr.ds;
|
||||
sregs->es = regs->sr.es;
|
||||
sregs->fs = regs->sr.fs;
|
||||
sregs->gs = regs->sr.gs;
|
||||
}
|
||||
/* User-space, show kernel context */
|
||||
else {
|
||||
kprintf("%s: in user-space: %p\n", __func__, regs->gpr.rip);
|
||||
x86v->panic_regs[0] = 0;
|
||||
x86v->panic_regs[1] = current->ctx.rbx;
|
||||
x86v->panic_regs[2] = 0;
|
||||
x86v->panic_regs[3] = 0;
|
||||
x86v->panic_regs[4] = current->ctx.rsi;
|
||||
x86v->panic_regs[5] = current->ctx.rdi;
|
||||
x86v->panic_regs[6] = current->ctx.rbp;
|
||||
x86v->panic_regs[7] = current->ctx.rsp;
|
||||
x86v->panic_regs[8] = 0;
|
||||
x86v->panic_regs[9] = 0;
|
||||
x86v->panic_regs[10] = 0;
|
||||
x86v->panic_regs[11] = 0;
|
||||
x86v->panic_regs[12] = regs->gpr.r12;
|
||||
x86v->panic_regs[13] = regs->gpr.r13;
|
||||
x86v->panic_regs[14] = regs->gpr.r14;
|
||||
x86v->panic_regs[15] = regs->gpr.r15;
|
||||
x86v->panic_regs[16] = (unsigned long)enter_user_mode;
|
||||
sregs = (struct segment_regs *)&x86v->panic_regs[17];
|
||||
sregs->rflags = regs->gpr.rflags;
|
||||
sregs->cs = regs->gpr.cs;
|
||||
sregs->ss = regs->gpr.ss;
|
||||
sregs->ds = regs->sr.ds;
|
||||
sregs->es = regs->sr.es;
|
||||
sregs->fs = regs->sr.fs;
|
||||
sregs->gs = regs->sr.gs;
|
||||
}
|
||||
|
||||
x86v->paniced = 1;
|
||||
}
|
||||
|
||||
void arch_clear_panic(void)
|
||||
{
|
||||
struct x86_cpu_local_variables *x86v =
|
||||
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
|
||||
|
||||
x86v->paniced = 0;
|
||||
}
|
||||
|
||||
int arch_cpu_read_write_register(
|
||||
struct ihk_os_cpu_register *desc,
|
||||
enum mcctrl_os_cpu_operation op)
|
||||
@ -2096,9 +2212,7 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
|
||||
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
|
||||
irq_flags);
|
||||
|
||||
ihk_mc_interrupt_cpu(
|
||||
get_x86_cpu_local_variable(cpu)->apic_id,
|
||||
LOCAL_SMP_FUNC_CALL_VECTOR);
|
||||
ihk_mc_interrupt_cpu(cpu, LOCAL_SMP_FUNC_CALL_VECTOR);
|
||||
|
||||
++cpu_index;
|
||||
}
|
||||
@ -2130,4 +2244,48 @@ free_out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
extern int nmi_mode;
|
||||
extern long freeze_thaw(void *nmi_ctx);
|
||||
|
||||
void multi_nm_interrupt_handler(void *irq_regs)
|
||||
{
|
||||
dkprintf("%s: ...\n", __func__);
|
||||
switch (nmi_mode) {
|
||||
case 1:
|
||||
case 2:
|
||||
/* mode == 1 or 2, for FREEZER NMI */
|
||||
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
|
||||
__func__, nmi_mode);
|
||||
freeze_thaw(NULL);
|
||||
break;
|
||||
|
||||
case 0:
|
||||
/* mode == 0, for MEMDUMP NMI */
|
||||
arch_save_panic_regs(irq_regs);
|
||||
ihk_mc_query_mem_areas();
|
||||
/* memdump-nmi is halted McKernel, break is unnecessary. */
|
||||
/* fall through */
|
||||
case 3:
|
||||
/* mode == 3, for SHUTDOWN-WAIT NMI */
|
||||
kprintf("%s: STOP\n", __func__);
|
||||
while (nmi_mode != 4)
|
||||
cpu_halt();
|
||||
break;
|
||||
|
||||
case 4:
|
||||
/* mode == 4, continue NMI */
|
||||
arch_clear_panic();
|
||||
if (!ihk_mc_get_processor_id()) {
|
||||
ihk_mc_clear_dump_page_completion();
|
||||
}
|
||||
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
|
||||
break;
|
||||
|
||||
default:
|
||||
ekprintf("%s: Unknown nmi-mode(%d) detected.\n",
|
||||
__func__, nmi_mode);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*** end of file ***/
|
||||
|
||||
@ -33,7 +33,7 @@ extern void preempt_disable(void);
|
||||
|
||||
#define IHK_STATIC_SPINLOCK_FUNCS
|
||||
|
||||
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
|
||||
static inline void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
|
||||
{
|
||||
lock->head_tail = 0;
|
||||
}
|
||||
@ -50,10 +50,13 @@ rc = __ihk_mc_spinlock_trylock_noirq(l); \
|
||||
#define ihk_mc_spinlock_trylock_noirq __ihk_mc_spinlock_trylock_noirq
|
||||
#endif
|
||||
|
||||
static int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
|
||||
static inline int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
|
||||
{
|
||||
ihk_spinlock_t cur = { .head_tail = lock->head_tail };
|
||||
ihk_spinlock_t next = { .tickets.head = cur.tickets.head, .tickets.tail = cur.tickets.tail + 2 };
|
||||
ihk_spinlock_t next = { .tickets = {
|
||||
.head = cur.tickets.head,
|
||||
.tail = cur.tickets.tail + 2
|
||||
} };
|
||||
int success;
|
||||
|
||||
if (cur.tickets.head != cur.tickets.tail) {
|
||||
@ -80,7 +83,8 @@ __kprintf("[%d] ret ihk_mc_spinlock_trylock\n", ihk_mc_get_processor_id()); rc;\
|
||||
#else
|
||||
#define ihk_mc_spinlock_trylock __ihk_mc_spinlock_trylock
|
||||
#endif
|
||||
static unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock, int *result)
|
||||
static inline unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock,
|
||||
int *result)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
@ -101,7 +105,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
|
||||
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
|
||||
#endif
|
||||
|
||||
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
|
||||
static inline void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
|
||||
{
|
||||
register struct __raw_tickets inc = { .tail = 0x0002 };
|
||||
|
||||
@ -132,7 +136,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
|
||||
#else
|
||||
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
|
||||
#endif
|
||||
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
|
||||
static inline unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
@ -152,7 +156,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
|
||||
#else
|
||||
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
|
||||
#endif
|
||||
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
|
||||
static inline void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
|
||||
{
|
||||
__ticket_t inc = 0x0002;
|
||||
|
||||
@ -171,100 +175,14 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
|
||||
#else
|
||||
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
|
||||
#endif
|
||||
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
|
||||
static inline void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock,
|
||||
unsigned long flags)
|
||||
{
|
||||
__ihk_mc_spinlock_unlock_noirq(lock);
|
||||
|
||||
cpu_restore_interrupt(flags);
|
||||
}
|
||||
|
||||
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
|
||||
typedef struct mcs_lock_node {
|
||||
unsigned long locked;
|
||||
struct mcs_lock_node *next;
|
||||
unsigned long irqsave;
|
||||
#ifndef ENABLE_UBSAN
|
||||
} __aligned(64) mcs_lock_node_t;
|
||||
#else
|
||||
} mcs_lock_node_t;
|
||||
#endif
|
||||
|
||||
typedef mcs_lock_node_t mcs_lock_t;
|
||||
|
||||
static void mcs_lock_init(struct mcs_lock_node *node)
|
||||
{
|
||||
node->locked = 0;
|
||||
node->next = NULL;
|
||||
}
|
||||
|
||||
static void __mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
struct mcs_lock_node *pred;
|
||||
|
||||
node->next = NULL;
|
||||
node->locked = 0;
|
||||
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
|
||||
(unsigned long)node);
|
||||
|
||||
if (pred) {
|
||||
node->locked = 1;
|
||||
pred->next = node;
|
||||
while (node->locked != 0) {
|
||||
cpu_pause();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
if (node->next == NULL) {
|
||||
struct mcs_lock_node *old = (struct mcs_lock_node *)
|
||||
atomic_cmpxchg8((unsigned long *)&lock->next,
|
||||
(unsigned long)node, (unsigned long)0);
|
||||
|
||||
if (old == node) {
|
||||
return;
|
||||
}
|
||||
|
||||
while (node->next == NULL) {
|
||||
cpu_pause();
|
||||
}
|
||||
}
|
||||
|
||||
node->next->locked = 0;
|
||||
}
|
||||
|
||||
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
preempt_disable();
|
||||
__mcs_lock_lock(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
__mcs_lock_unlock(lock, node);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
static void mcs_lock_lock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
node->irqsave = cpu_disable_interrupt_save();
|
||||
mcs_lock_lock_noirq(lock, node);
|
||||
}
|
||||
|
||||
static void mcs_lock_unlock(struct mcs_lock_node *lock,
|
||||
struct mcs_lock_node *node)
|
||||
{
|
||||
mcs_lock_unlock_noirq(lock, node);
|
||||
cpu_restore_interrupt(node->irqsave);
|
||||
}
|
||||
|
||||
|
||||
#define SPINLOCK_IN_MCS_RWLOCK
|
||||
|
||||
// reader/writer lock
|
||||
@ -310,7 +228,7 @@ typedef struct mcs_rwlock_lock {
|
||||
} mcs_rwlock_lock_t;
|
||||
#endif
|
||||
|
||||
static void
|
||||
static inline void
|
||||
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -331,7 +249,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
|
||||
#else
|
||||
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -358,7 +276,7 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
|
||||
}
|
||||
|
||||
#ifndef SPINLOCK_IN_MCS_RWLOCK
|
||||
static void
|
||||
static inline void
|
||||
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
|
||||
{
|
||||
struct mcs_rwlock_node *p;
|
||||
@ -425,7 +343,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
|
||||
#else
|
||||
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -485,7 +403,7 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
|
||||
return old;
|
||||
}
|
||||
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -551,7 +469,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
|
||||
#else
|
||||
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -598,7 +516,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
|
||||
#else
|
||||
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -618,7 +536,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
|
||||
#else
|
||||
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -638,7 +556,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
|
||||
#else
|
||||
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -658,7 +576,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
|
||||
#else
|
||||
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
|
||||
#endif
|
||||
static void
|
||||
static inline void
|
||||
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
|
||||
{
|
||||
#ifdef SPINLOCK_IN_MCS_RWLOCK
|
||||
@ -674,4 +592,90 @@ static inline int irqflags_can_interrupt(unsigned long flags)
|
||||
return !!(flags & 0x200);
|
||||
}
|
||||
|
||||
struct ihk_rwlock {
|
||||
union {
|
||||
long lock;
|
||||
struct {
|
||||
unsigned int read;
|
||||
int write;
|
||||
};
|
||||
} lock;
|
||||
};
|
||||
|
||||
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
|
||||
{
|
||||
rw->lock.read = 0;
|
||||
rw->lock.write = 1;
|
||||
}
|
||||
|
||||
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
asm volatile("1:\t"
|
||||
"lock; decq %0\n\t"
|
||||
"jns 3f\n\t"
|
||||
"lock incq %0\n\t"
|
||||
"2:\t"
|
||||
"pause\n\t"
|
||||
"cmpq $0x1, %0\n\t"
|
||||
"jns 1b\n\t"
|
||||
"jmp 2b\n\t"
|
||||
"3:"
|
||||
: "+m" (rw->lock.lock) : : "memory");
|
||||
}
|
||||
|
||||
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
asm volatile("1:\t"
|
||||
"lock; decl %0\n\t"
|
||||
"je 3f\n\t"
|
||||
"lock; incl %0\n\t"
|
||||
"2:\t"
|
||||
"pause\n\t"
|
||||
"cmpl $0x1,%0\n\t"
|
||||
"je 1b\n\t"
|
||||
"jmp 2b\n\t"
|
||||
"3:"
|
||||
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
|
||||
}
|
||||
|
||||
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
|
||||
{
|
||||
ihk_atomic64_t *count = (ihk_atomic64_t *)rw;
|
||||
|
||||
if (ihk_atomic64_sub_return(1, count) >= 0)
|
||||
return 1;
|
||||
ihk_atomic64_inc(count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
|
||||
{
|
||||
ihk_atomic_t *count = (ihk_atomic_t *)&rw->lock.write;
|
||||
|
||||
if (ihk_atomic_dec_and_test(count))
|
||||
return 1;
|
||||
ihk_atomic_inc(count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
|
||||
{
|
||||
asm volatile("lock; incq %0" : "+m" (rw->lock.lock) : : "memory");
|
||||
}
|
||||
|
||||
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
|
||||
{
|
||||
asm volatile("lock; incl %0"
|
||||
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
|
||||
}
|
||||
|
||||
static inline int ihk_mc_write_can_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
return rw->lock.write == 1;
|
||||
}
|
||||
|
||||
static inline int ihk_mc_read_can_lock(struct ihk_rwlock *rw)
|
||||
{
|
||||
return rw->lock.lock > 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -17,6 +17,7 @@
|
||||
#define __HEADER_X86_COMMON_ARCH_MEMORY_H
|
||||
|
||||
#include <ihk/types.h>
|
||||
#include <errno.h>
|
||||
|
||||
#define KERNEL_CS_ENTRY 4
|
||||
#define KERNEL_DS_ENTRY 5
|
||||
@ -66,8 +67,8 @@
|
||||
* Placing the LWK image in the virtual address space at the end of
|
||||
* the Linux modules section enables us to map the LWK TEXT in Linux
|
||||
* as well, so that Linux can also call into LWK text.
|
||||
* It's defined by cmake.
|
||||
*/
|
||||
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
|
||||
#define STACK_TOP(region) ((region)->user_end)
|
||||
|
||||
#define MAP_VMAP_SIZE 0x0000000100000000UL
|
||||
@ -183,12 +184,10 @@ enum ihk_mc_pt_attribute {
|
||||
|
||||
enum ihk_mc_pt_attribute attr_mask;
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
||||
static inline int pfn_is_write_combined(uintptr_t pfn)
|
||||
{
|
||||
return ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD));
|
||||
}
|
||||
#endif /* #ifdef POSTK_DEBUG_ARCH_DEP_12 */
|
||||
|
||||
static inline int pte_is_null(pte_t *ptep)
|
||||
{
|
||||
@ -365,6 +364,17 @@ static inline int pgsize_to_tbllv(size_t pgsize)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int pgsize_to_pgshift(size_t pgsize)
|
||||
{
|
||||
switch (pgsize) {
|
||||
case PTL1_SIZE: return PTL1_SHIFT;
|
||||
case PTL2_SIZE: return PTL2_SHIFT;
|
||||
case PTL3_SIZE: return PTL3_SHIFT;
|
||||
case PTL4_SIZE: return PTL4_SHIFT;
|
||||
default: return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
static inline size_t tbllv_to_pgsize(int level)
|
||||
{
|
||||
switch (level) {
|
||||
|
||||
@ -13,19 +13,17 @@
|
||||
#ifndef ARCH_CPU_H
|
||||
#define ARCH_CPU_H
|
||||
|
||||
#define mb() asm volatile("mfence":::"memory")
|
||||
#define rmb() asm volatile("lfence":::"memory")
|
||||
#define wmb() asm volatile("sfence" ::: "memory")
|
||||
|
||||
#define smp_mb() mb()
|
||||
#define smp_rmb() rmb()
|
||||
#define smp_wmb() barrier()
|
||||
|
||||
#define arch_barrier() asm volatile("" : : : "memory")
|
||||
|
||||
static inline void rmb(void)
|
||||
{
|
||||
arch_barrier();
|
||||
}
|
||||
|
||||
static inline void wmb(void)
|
||||
{
|
||||
arch_barrier();
|
||||
}
|
||||
|
||||
static unsigned long read_tsc(void)
|
||||
static inline unsigned long read_tsc(void)
|
||||
{
|
||||
unsigned int low, high;
|
||||
|
||||
@ -34,4 +32,21 @@ static unsigned long read_tsc(void)
|
||||
return (low | ((unsigned long)high << 32));
|
||||
}
|
||||
|
||||
#define smp_load_acquire(p) \
|
||||
({ \
|
||||
typeof(*p) ___p1 = ACCESS_ONCE(*p); \
|
||||
compiletime_assert_atomic_type(*p); \
|
||||
barrier(); \
|
||||
___p1; \
|
||||
})
|
||||
|
||||
#define smp_store_release(p, v) \
|
||||
({ \
|
||||
compiletime_assert_atomic_type(*p); \
|
||||
barrier(); \
|
||||
WRITE_ONCE(*p, v); \
|
||||
})
|
||||
|
||||
void arch_flush_icache_all(void);
|
||||
|
||||
#endif /* ARCH_CPU_H */
|
||||
|
||||
@ -1,32 +0,0 @@
|
||||
#ifndef ARCH_RUSAGE_H_INCLUDED
|
||||
#define ARCH_RUSAGE_H_INCLUDED
|
||||
|
||||
#define DEBUG_RUSAGE
|
||||
|
||||
#define IHK_OS_PGSIZE_4KB 0
|
||||
#define IHK_OS_PGSIZE_2MB 1
|
||||
#define IHK_OS_PGSIZE_1GB 2
|
||||
|
||||
extern struct rusage_global rusage;
|
||||
|
||||
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
|
||||
{
|
||||
int ret = IHK_OS_PGSIZE_4KB;
|
||||
switch (pgsize) {
|
||||
case PTL1_SIZE:
|
||||
ret = IHK_OS_PGSIZE_4KB;
|
||||
break;
|
||||
case PTL2_SIZE:
|
||||
ret = IHK_OS_PGSIZE_2MB;
|
||||
break;
|
||||
case PTL3_SIZE:
|
||||
ret = IHK_OS_PGSIZE_1GB;
|
||||
break;
|
||||
default:
|
||||
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
|
||||
break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */
|
||||
@ -13,6 +13,8 @@
|
||||
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
|
||||
#define HEADER_X86_COMMON_IHK_ATOMIC_H
|
||||
|
||||
#include <lwk/compiler.h>
|
||||
|
||||
/***********************************************************************
|
||||
* ihk_atomic_t
|
||||
*/
|
||||
@ -114,7 +116,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
|
||||
return *(volatile long *)&(v)->counter64;
|
||||
}
|
||||
|
||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
|
||||
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
|
||||
{
|
||||
v->counter64 = i;
|
||||
}
|
||||
@ -124,6 +126,22 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
|
||||
asm volatile ("lock incq %0" : "+m"(v->counter64));
|
||||
}
|
||||
|
||||
static inline long ihk_atomic64_add_return(long i, ihk_atomic64_t *v)
|
||||
{
|
||||
long __i;
|
||||
|
||||
__i = i;
|
||||
asm volatile("lock xaddq %0, %1"
|
||||
: "+r" (i), "+m" (v->counter64)
|
||||
: : "memory");
|
||||
return i + __i;
|
||||
}
|
||||
|
||||
static inline long ihk_atomic64_sub_return(long i, ihk_atomic64_t *v)
|
||||
{
|
||||
return ihk_atomic64_add_return(-i, v);
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* others
|
||||
*/
|
||||
@ -156,43 +174,55 @@ static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
|
||||
return __x;
|
||||
}
|
||||
|
||||
#define __xchg(x, ptr, size) \
|
||||
({ \
|
||||
__typeof(*(ptr)) __x = (x); \
|
||||
switch (size) { \
|
||||
case 1: \
|
||||
asm volatile("xchgb %b0,%1" \
|
||||
: "=q" (__x) \
|
||||
: "m" (*__xg(ptr)), "0" (__x) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
case 2: \
|
||||
asm volatile("xchgw %w0,%1" \
|
||||
: "=r" (__x) \
|
||||
: "m" (*__xg(ptr)), "0" (__x) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
case 4: \
|
||||
asm volatile("xchgl %k0,%1" \
|
||||
: "=r" (__x) \
|
||||
: "m" (*__xg(ptr)), "0" (__x) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
case 8: \
|
||||
asm volatile("xchgq %0,%1" \
|
||||
: "=r" (__x) \
|
||||
: "m" (*__xg(ptr)), "0" (__x) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
default: \
|
||||
panic("xchg for wrong size"); \
|
||||
} \
|
||||
__x; \
|
||||
})
|
||||
#define __X86_CASE_B 1
|
||||
#define __X86_CASE_W 2
|
||||
#define __X86_CASE_L 4
|
||||
#define __X86_CASE_Q 8
|
||||
|
||||
extern void __xchg_wrong_size(void)
|
||||
__compiletime_error("Bad argument size for xchg");
|
||||
|
||||
#define xchg(ptr, v) \
|
||||
__xchg((v), (ptr), sizeof(*ptr))
|
||||
/*
|
||||
* An exchange-type operation, which takes a value and a pointer, and
|
||||
* returns the old value.
|
||||
*/
|
||||
#define __xchg_op(ptr, arg, op, lock) \
|
||||
({ \
|
||||
__typeof__(*(ptr)) __ret = (arg); \
|
||||
switch (sizeof(*(ptr))) { \
|
||||
case __X86_CASE_B: \
|
||||
asm volatile (lock #op "b %b0, %1\n" \
|
||||
: "+q" (__ret), "+m" (*(ptr)) \
|
||||
: : "memory", "cc"); \
|
||||
break; \
|
||||
case __X86_CASE_W: \
|
||||
asm volatile (lock #op "w %w0, %1\n" \
|
||||
: "+r" (__ret), "+m" (*(ptr)) \
|
||||
: : "memory", "cc"); \
|
||||
break; \
|
||||
case __X86_CASE_L: \
|
||||
asm volatile (lock #op "l %0, %1\n" \
|
||||
: "+r" (__ret), "+m" (*(ptr)) \
|
||||
: : "memory", "cc"); \
|
||||
break; \
|
||||
case __X86_CASE_Q: \
|
||||
asm volatile (lock #op "q %q0, %1\n" \
|
||||
: "+r" (__ret), "+m" (*(ptr)) \
|
||||
: : "memory", "cc"); \
|
||||
break; \
|
||||
default: \
|
||||
__xchg_wrong_size(); \
|
||||
} \
|
||||
__ret; \
|
||||
})
|
||||
|
||||
/*
|
||||
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
|
||||
* Since this is generally used to protect other memory information, we
|
||||
* use "asm volatile" and "memory" clobbers to prevent gcc from moving
|
||||
* information around.
|
||||
*/
|
||||
#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
|
||||
|
||||
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
|
||||
unsigned long oldval,
|
||||
@ -241,4 +271,66 @@ static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
|
||||
return i + __i;
|
||||
}
|
||||
|
||||
extern void __cmpxchg_wrong_size(void)
|
||||
__compiletime_error("Bad argument size for cmpxchg");
|
||||
|
||||
/*
|
||||
* Atomic compare and exchange. Compare OLD with MEM, if identical,
|
||||
* store NEW in MEM. Return the initial value in MEM. Success is
|
||||
* indicated by comparing RETURN with OLD.
|
||||
*/
|
||||
#define __raw_cmpxchg(ptr, old, new, size, lock) \
|
||||
({ \
|
||||
__typeof__(*(ptr)) __ret; \
|
||||
__typeof__(*(ptr)) __old = (old); \
|
||||
__typeof__(*(ptr)) __new = (new); \
|
||||
switch (size) { \
|
||||
case __X86_CASE_B: \
|
||||
{ \
|
||||
volatile uint8_t *__ptr = (volatile uint8_t *)(ptr);\
|
||||
asm volatile(lock "cmpxchgb %2,%1" \
|
||||
: "=a" (__ret), "+m" (*__ptr) \
|
||||
: "q" (__new), "0" (__old) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
} \
|
||||
case __X86_CASE_W: \
|
||||
{ \
|
||||
volatile uint16_t *__ptr = (volatile uint16_t *)(ptr);\
|
||||
asm volatile(lock "cmpxchgw %2,%1" \
|
||||
: "=a" (__ret), "+m" (*__ptr) \
|
||||
: "r" (__new), "0" (__old) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
} \
|
||||
case __X86_CASE_L: \
|
||||
{ \
|
||||
volatile uint32_t *__ptr = (volatile uint32_t *)(ptr);\
|
||||
asm volatile(lock "cmpxchgl %2,%1" \
|
||||
: "=a" (__ret), "+m" (*__ptr) \
|
||||
: "r" (__new), "0" (__old) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
} \
|
||||
case __X86_CASE_Q: \
|
||||
{ \
|
||||
volatile uint64_t *__ptr = (volatile uint64_t *)(ptr);\
|
||||
asm volatile(lock "cmpxchgq %2,%1" \
|
||||
: "=a" (__ret), "+m" (*__ptr) \
|
||||
: "r" (__new), "0" (__old) \
|
||||
: "memory"); \
|
||||
break; \
|
||||
} \
|
||||
default: \
|
||||
__cmpxchg_wrong_size(); \
|
||||
} \
|
||||
__ret; \
|
||||
})
|
||||
|
||||
#define __cmpxchg(ptr, old, new, size) \
|
||||
__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
|
||||
|
||||
#define cmpxchg(ptr, old, new) \
|
||||
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
|
||||
|
||||
#endif
|
||||
|
||||
@ -71,7 +71,7 @@
|
||||
#define MSR_PERF_CTL_0 0xc0010000
|
||||
#define MSR_PERF_CTR_0 0xc0010004
|
||||
|
||||
static unsigned long xgetbv(unsigned int index)
|
||||
static inline unsigned long xgetbv(unsigned int index)
|
||||
{
|
||||
unsigned int low, high;
|
||||
|
||||
@ -80,7 +80,7 @@ static unsigned long xgetbv(unsigned int index)
|
||||
return low | ((unsigned long)high << 32);
|
||||
}
|
||||
|
||||
static void xsetbv(unsigned int index, unsigned long val)
|
||||
static inline void xsetbv(unsigned int index, unsigned long val)
|
||||
{
|
||||
unsigned int low, high;
|
||||
|
||||
@ -90,7 +90,8 @@ static void xsetbv(unsigned int index, unsigned long val)
|
||||
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
|
||||
}
|
||||
|
||||
static void wrmsr(unsigned int idx, unsigned long value){
|
||||
static inline void wrmsr(unsigned int idx, unsigned long value)
|
||||
{
|
||||
unsigned int high, low;
|
||||
|
||||
high = value >> 32;
|
||||
@ -99,7 +100,7 @@ static void wrmsr(unsigned int idx, unsigned long value){
|
||||
asm volatile("wrmsr" : : "c" (idx), "a" (low), "d" (high) : "memory");
|
||||
}
|
||||
|
||||
static unsigned long rdpmc(unsigned int counter)
|
||||
static inline unsigned long rdpmc(unsigned int counter)
|
||||
{
|
||||
unsigned int high, low;
|
||||
|
||||
@ -108,7 +109,7 @@ static unsigned long rdpmc(unsigned int counter)
|
||||
return (unsigned long)high << 32 | low;
|
||||
}
|
||||
|
||||
static unsigned long rdmsr(unsigned int index)
|
||||
static inline unsigned long rdmsr(unsigned int index)
|
||||
{
|
||||
unsigned int high, low;
|
||||
|
||||
@ -117,7 +118,7 @@ static unsigned long rdmsr(unsigned int index)
|
||||
return (unsigned long)high << 32 | low;
|
||||
}
|
||||
|
||||
static unsigned long rdtsc(void)
|
||||
static inline unsigned long rdtsc(void)
|
||||
{
|
||||
unsigned int high, low;
|
||||
|
||||
@ -126,7 +127,7 @@ static unsigned long rdtsc(void)
|
||||
return (unsigned long)high << 32 | low;
|
||||
}
|
||||
|
||||
static void set_perfctl(int counter, int event, int mask)
|
||||
static inline void set_perfctl(int counter, int event, int mask)
|
||||
{
|
||||
unsigned long value;
|
||||
|
||||
@ -137,7 +138,7 @@ static void set_perfctl(int counter, int event, int mask)
|
||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||
}
|
||||
|
||||
static void start_perfctr(int counter)
|
||||
static inline void start_perfctr(int counter)
|
||||
{
|
||||
unsigned long value;
|
||||
|
||||
@ -145,7 +146,7 @@ static void start_perfctr(int counter)
|
||||
value |= (1 << 22);
|
||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||
}
|
||||
static void stop_perfctr(int counter)
|
||||
static inline void stop_perfctr(int counter)
|
||||
{
|
||||
unsigned long value;
|
||||
|
||||
@ -154,17 +155,17 @@ static void stop_perfctr(int counter)
|
||||
wrmsr(MSR_PERF_CTL_0 + counter, value);
|
||||
}
|
||||
|
||||
static void clear_perfctl(int counter)
|
||||
static inline void clear_perfctl(int counter)
|
||||
{
|
||||
wrmsr(MSR_PERF_CTL_0 + counter, 0);
|
||||
}
|
||||
|
||||
static void set_perfctr(int counter, unsigned long value)
|
||||
static inline void set_perfctr(int counter, unsigned long value)
|
||||
{
|
||||
wrmsr(MSR_PERF_CTR_0 + counter, value);
|
||||
}
|
||||
|
||||
static unsigned long read_perfctr(int counter)
|
||||
static inline unsigned long read_perfctr(int counter)
|
||||
{
|
||||
return rdpmc(counter);
|
||||
}
|
||||
|
||||
@ -147,24 +147,23 @@ SYSCALL_DELEGATED(266, symlinkat)
|
||||
SYSCALL_DELEGATED(267, readlinkat)
|
||||
SYSCALL_DELEGATED(268, fchmodat)
|
||||
SYSCALL_DELEGATED(269, faccessat)
|
||||
SYSCALL_DELEGATED(270, pselect6)
|
||||
SYSCALL_DELEGATED(271, ppoll)
|
||||
SYSCALL_HANDLED(270, pselect6)
|
||||
SYSCALL_HANDLED(271, ppoll)
|
||||
SYSCALL_HANDLED(273, set_robust_list)
|
||||
SYSCALL_HANDLED(279, move_pages)
|
||||
SYSCALL_DELEGATED(281, epoll_pwait)
|
||||
SYSCALL_HANDLED(281, epoll_pwait)
|
||||
SYSCALL_HANDLED(282, signalfd)
|
||||
SYSCALL_HANDLED(289, signalfd4)
|
||||
#ifdef ENABLE_PERF
|
||||
SYSCALL_HANDLED(298, perf_event_open)
|
||||
#endif
|
||||
#ifdef DCFA_KMOD
|
||||
SYSCALL_HANDLED(303, mod_call)
|
||||
#endif
|
||||
SYSCALL_HANDLED(309, getcpu)
|
||||
SYSCALL_HANDLED(310, process_vm_readv)
|
||||
SYSCALL_HANDLED(311, process_vm_writev)
|
||||
SYSCALL_HANDLED(601, pmc_init)
|
||||
SYSCALL_HANDLED(602, pmc_start)
|
||||
SYSCALL_HANDLED(603, pmc_stop)
|
||||
SYSCALL_HANDLED(604, pmc_reset)
|
||||
SYSCALL_HANDLED(322, execveat)
|
||||
SYSCALL_HANDLED(700, get_cpu_id)
|
||||
#ifdef PROFILE_ENABLE
|
||||
SYSCALL_HANDLED(__NR_profile, profile)
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
/* interrupt.S COPYRIGHT FUJITSU LIMITED 2019 */
|
||||
/**
|
||||
* \file interrupt.S
|
||||
* License details are found in the file LICENSE.
|
||||
@ -91,6 +92,9 @@ vector=vector+1
|
||||
.endr
|
||||
|
||||
common_interrupt:
|
||||
#define MULT_INTR_VECTOR 242
|
||||
cmp $(MULT_INTR_VECTOR),%rdi
|
||||
je 1f
|
||||
PUSH_ALL_REGS
|
||||
movq ERROR_OFFSET(%rsp), %rdi
|
||||
movq %rsp, %rsi
|
||||
@ -99,6 +103,19 @@ common_interrupt:
|
||||
addq $8, %rsp
|
||||
iretq
|
||||
|
||||
|
||||
.globl nmi_handler
|
||||
nmi_handler:
|
||||
cld
|
||||
pushq $0 /* error field of x86_basic_regs */
|
||||
PUSH_ALL_REGS
|
||||
movq %rsp, %rdi
|
||||
call multi_nm_interrupt_handler /* Enter C code */
|
||||
POP_ALL_REGS
|
||||
addq $8, %rsp
|
||||
iretq
|
||||
|
||||
|
||||
.globl __page_fault_handler_address
|
||||
__page_fault_handler_address:
|
||||
.quad 0
|
||||
@ -137,74 +154,6 @@ __freeze:
|
||||
POP_ALL_REGS
|
||||
iretq
|
||||
|
||||
.globl nmi
|
||||
nmi:
|
||||
#define PANICED 232
|
||||
#define PANIC_REGS 240
|
||||
movq %rax,%gs:PANIC_REGS+0x00
|
||||
movq %rsp,%gs:PANIC_REGS+0x08
|
||||
|
||||
movl nmi_mode(%rip),%eax
|
||||
cmp $3,%rax
|
||||
je 4f
|
||||
cmp $1,%rax
|
||||
je 1f
|
||||
cmp $2,%rax
|
||||
jne 3f
|
||||
1:
|
||||
cld
|
||||
movq %gs:PANIC_REGS+0x00,%rax
|
||||
PUSH_ALL_REGS
|
||||
subq $40, %rsp
|
||||
movq %rsp,%gs:PANIC_REGS+0x10
|
||||
movq %rsp, %rdi
|
||||
call freeze_thaw
|
||||
cmpq $0, %rax
|
||||
jnz 2f
|
||||
addq $40, %rsp
|
||||
2:
|
||||
POP_ALL_REGS
|
||||
iretq
|
||||
3:
|
||||
movq %rbx,%gs:PANIC_REGS+0x08
|
||||
movq %rcx,%gs:PANIC_REGS+0x10
|
||||
movq %rdx,%gs:PANIC_REGS+0x18
|
||||
movq %rsi,%gs:PANIC_REGS+0x20
|
||||
movq %rdi,%gs:PANIC_REGS+0x28
|
||||
movq %rbp,%gs:PANIC_REGS+0x30
|
||||
movq 0x18(%rsp),%rax /* rsp */
|
||||
movq %rax,%gs:PANIC_REGS+0x38
|
||||
movq %r8, %gs:PANIC_REGS+0x40
|
||||
movq %r9, %gs:PANIC_REGS+0x48
|
||||
movq %r10,%gs:PANIC_REGS+0x50
|
||||
movq %r11,%gs:PANIC_REGS+0x58
|
||||
movq %r12,%gs:PANIC_REGS+0x60
|
||||
movq %r13,%gs:PANIC_REGS+0x68
|
||||
movq %r14,%gs:PANIC_REGS+0x70
|
||||
movq %r15,%gs:PANIC_REGS+0x78
|
||||
movq 0x00(%rsp),%rax /* rip */
|
||||
movq %rax,%gs:PANIC_REGS+0x80
|
||||
movq 0x10(%rsp),%rax /* rflags */
|
||||
movl %eax,%gs:PANIC_REGS+0x88
|
||||
movq 0x08(%rsp),%rax /* cs */
|
||||
movl %eax,%gs:PANIC_REGS+0x8C
|
||||
movq 0x20(%rsp),%rax /* ss */
|
||||
movl %eax,%gs:PANIC_REGS+0x90
|
||||
xorq %rax,%rax
|
||||
movw %ds,%ax
|
||||
movl %eax,%gs:PANIC_REGS+0x94
|
||||
movw %es,%ax
|
||||
movl %eax,%gs:PANIC_REGS+0x98
|
||||
movw %fs,%ax
|
||||
movl %eax,%gs:PANIC_REGS+0x9C
|
||||
movw %gs,%ax
|
||||
movl %eax,%gs:PANIC_REGS+0xA0
|
||||
movq $1,%gs:PANICED
|
||||
call ihk_mc_query_mem_areas
|
||||
4:
|
||||
hlt
|
||||
jmp 4b
|
||||
|
||||
.globl x86_syscall
|
||||
x86_syscall:
|
||||
cld
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
*/
|
||||
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/mm.h>
|
||||
#include <types.h>
|
||||
#include <memory.h>
|
||||
@ -26,7 +25,7 @@
|
||||
#include <cls.h>
|
||||
#include <kmalloc.h>
|
||||
#include <rusage_private.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
//#define DEBUG
|
||||
|
||||
@ -38,6 +37,7 @@
|
||||
static char *last_page;
|
||||
extern char _head[], _end[];
|
||||
|
||||
extern unsigned long linux_page_offset_base;
|
||||
extern unsigned long x86_kernel_phys_base;
|
||||
|
||||
/* Arch specific early allocation routine */
|
||||
@ -1355,109 +1355,6 @@ struct clear_range_args {
|
||||
int max_nr_addr;
|
||||
};
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_8
|
||||
void remote_flush_tlb_cpumask(struct process_vm *vm,
|
||||
unsigned long addr, int cpu_id)
|
||||
{
|
||||
unsigned long __addr = addr;
|
||||
return remote_flush_tlb_array_cpumask(vm, &__addr, 1, cpu_id);
|
||||
}
|
||||
|
||||
void remote_flush_tlb_array_cpumask(struct process_vm *vm,
|
||||
unsigned long *addr,
|
||||
int nr_addr,
|
||||
int cpu_id)
|
||||
{
|
||||
unsigned long cpu;
|
||||
int flush_ind;
|
||||
struct tlb_flush_entry *flush_entry;
|
||||
cpu_set_t _cpu_set;
|
||||
|
||||
if (addr[0]) {
|
||||
flush_ind = (addr[0] >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
||||
}
|
||||
/* Zero address denotes full TLB flush */
|
||||
else {
|
||||
/* Random.. */
|
||||
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
|
||||
}
|
||||
|
||||
flush_entry = &tlb_flush_vector[flush_ind];
|
||||
|
||||
/* Take a copy of the cpu set so that we don't hold the lock
|
||||
* all the way while interrupting other cores */
|
||||
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
|
||||
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
|
||||
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
|
||||
|
||||
dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(&flush_entry->lock);
|
||||
|
||||
flush_entry->vm = vm;
|
||||
flush_entry->addr = addr;
|
||||
flush_entry->nr_addr = nr_addr;
|
||||
ihk_atomic_set(&flush_entry->pending, 0);
|
||||
|
||||
dkprintf("lock aquired, iterating cpu mask.. flush_ind: %d\n", flush_ind);
|
||||
|
||||
/* Loop through CPUs in this address space and interrupt them for
|
||||
* TLB flush on the specified address */
|
||||
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
|
||||
|
||||
if (ihk_mc_get_processor_id() == cpu)
|
||||
continue;
|
||||
|
||||
ihk_atomic_inc(&flush_entry->pending);
|
||||
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
|
||||
flush_ind, addr, cpu);
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */
|
||||
/* TODO(pka_idke) Interim support */
|
||||
ihk_mc_interrupt_cpu(cpu,
|
||||
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_8 */
|
||||
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu)->apic_id,
|
||||
flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START);
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
||||
}
|
||||
|
||||
#ifdef DEBUG_IC_TLB
|
||||
{
|
||||
unsigned long tsc;
|
||||
tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
|
||||
#endif
|
||||
if (flush_entry->addr[0]) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < flush_entry->nr_addr; ++i) {
|
||||
flush_tlb_single(flush_entry->addr[i] & PAGE_MASK);
|
||||
}
|
||||
}
|
||||
/* Zero address denotes full TLB flush */
|
||||
else {
|
||||
flush_tlb();
|
||||
}
|
||||
|
||||
/* Wait for all cores */
|
||||
while (ihk_atomic_read(&flush_entry->pending) != 0) {
|
||||
cpu_pause();
|
||||
|
||||
#ifdef DEBUG_IC_TLB
|
||||
if (rdtsc() > tsc) {
|
||||
kprintf("waited 10 secs for remote TLB!! -> panic_all()\n");
|
||||
panic_all_cores("waited 10 secs for remote TLB!!\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#ifdef DEBUG_IC_TLB
|
||||
}
|
||||
#endif
|
||||
|
||||
ihk_mc_spinlock_unlock_noirq(&flush_entry->lock);
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
|
||||
|
||||
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
|
||||
unsigned long addr)
|
||||
{
|
||||
@ -1622,7 +1519,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
|
||||
{
|
||||
struct clear_range_args *args = args0;
|
||||
int error;
|
||||
uint64_t phys;
|
||||
uint64_t phys = 0;
|
||||
pte_t old;
|
||||
struct page *page;
|
||||
struct page_table *pt;
|
||||
@ -2572,10 +2469,10 @@ static void init_linux_kernel_mapping(struct page_table *pt)
|
||||
map_start = 0;
|
||||
map_end = 0x20000000000;
|
||||
|
||||
virt = (void *)LINUX_PAGE_OFFSET;
|
||||
virt = (void *)linux_page_offset_base;
|
||||
|
||||
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
||||
LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
|
||||
virt, virt + map_end, 0, map_end);
|
||||
|
||||
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
|
||||
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
||||
@ -2599,9 +2496,11 @@ static void init_linux_kernel_mapping(struct page_table *pt)
|
||||
}
|
||||
|
||||
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
|
||||
LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
|
||||
linux_page_offset_base + map_start,
|
||||
linux_page_offset_base + map_end,
|
||||
map_start, map_end);
|
||||
|
||||
virt = (void *)(LINUX_PAGE_OFFSET + map_start);
|
||||
virt = (void *)(linux_page_offset_base + map_start);
|
||||
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
|
||||
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
|
||||
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
|
||||
@ -2652,7 +2551,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
|
||||
attr |= PTATTR_UNCACHABLE;
|
||||
}
|
||||
|
||||
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
|
||||
paligned, v, npages);
|
||||
|
||||
for (i = 0; i < npages; i++) {
|
||||
@ -2745,12 +2644,12 @@ unsigned long virt_to_phys(void *v)
|
||||
unsigned long va = (unsigned long)v;
|
||||
|
||||
if (va >= MAP_KERNEL_START) {
|
||||
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
|
||||
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= linux_page_offset_base\n",
|
||||
__FUNCTION__, va);
|
||||
return va - MAP_KERNEL_START + x86_kernel_phys_base;
|
||||
}
|
||||
else if (va >= LINUX_PAGE_OFFSET) {
|
||||
return va - LINUX_PAGE_OFFSET;
|
||||
else if (va >= linux_page_offset_base) {
|
||||
return va - linux_page_offset_base;
|
||||
}
|
||||
else if (va >= MAP_FIXED_START) {
|
||||
return va - MAP_FIXED_START;
|
||||
@ -2769,7 +2668,7 @@ void *phys_to_virt(unsigned long p)
|
||||
return (void *)(p + MAP_ST_START);
|
||||
}
|
||||
|
||||
return (void *)(p + LINUX_PAGE_OFFSET);
|
||||
return (void *)(p + linux_page_offset_base);
|
||||
}
|
||||
|
||||
int copy_from_user(void *dst, const void *src, size_t siz)
|
||||
|
||||
@ -12,12 +12,12 @@
|
||||
#include <march.h>
|
||||
#include <errno.h>
|
||||
#include <cls.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <ihk/cpu.h>
|
||||
#include <registers.h>
|
||||
#include <mc_perf_event.h>
|
||||
#include <config.h>
|
||||
#include <debug.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <process.h>
|
||||
|
||||
extern unsigned int *x86_march_perfmap;
|
||||
extern int running_on_kvm(void);
|
||||
@ -223,41 +223,6 @@ int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
|
||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
||||
}
|
||||
|
||||
#ifdef POSTK_DEBUG_TEMP_FIX_29
|
||||
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
|
||||
#else
|
||||
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
|
||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
||||
{
|
||||
#ifdef POSTK_DEBUG_TEMP_FIX_29
|
||||
enum ihk_perfctr_type type;
|
||||
|
||||
switch (config) {
|
||||
case PERF_COUNT_HW_CPU_CYCLES :
|
||||
type = APT_TYPE_CYCLE;
|
||||
break;
|
||||
case PERF_COUNT_HW_INSTRUCTIONS :
|
||||
type = APT_TYPE_INSTRUCTIONS;
|
||||
break;
|
||||
default :
|
||||
// Not supported config.
|
||||
type = PERFCTR_MAX_TYPE;
|
||||
}
|
||||
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
|
||||
|
||||
if (counter < 0 || counter >= NUM_PERF_COUNTERS) {
|
||||
return -EINVAL;
|
||||
}
|
||||
if (type < 0 || type >= PERFCTR_MAX_TYPE) {
|
||||
return -EINVAL;
|
||||
}
|
||||
if (!x86_march_perfmap[type]) {
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return set_perfctr_x86_direct(counter, mode, x86_march_perfmap[type]);
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
|
||||
{
|
||||
struct thread *thread = cpu_local_var(current);
|
||||
@ -412,6 +377,23 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
|
||||
{
|
||||
int ret = -EINVAL;
|
||||
int i = 0;
|
||||
const int counters = ihk_mc_perf_get_num_counters();
|
||||
|
||||
// find avail generic counter
|
||||
for (i = 0; i < counters; i++) {
|
||||
if (!(thread->pmc_alloc_map & (1 << i))) {
|
||||
ret = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
unsigned long ihk_mc_perfctr_read(int counter)
|
||||
{
|
||||
unsigned long retval = 0;
|
||||
@ -439,6 +421,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
|
||||
return retval;
|
||||
}
|
||||
|
||||
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
|
||||
{
|
||||
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
|
||||
|
||||
count &= 0x000000ffffffffffL;
|
||||
return count;
|
||||
}
|
||||
|
||||
// read by rdmsr
|
||||
unsigned long ihk_mc_perfctr_read_msr(int counter)
|
||||
{
|
||||
@ -513,3 +503,18 @@ int ihk_mc_perf_get_num_counters(void)
|
||||
{
|
||||
return NUM_PERF_COUNTERS;
|
||||
}
|
||||
|
||||
int hw_perf_event_init(struct mc_perf_event *event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ihk_mc_event_set_period(struct mc_perf_event *event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -16,7 +16,6 @@
|
||||
*/
|
||||
|
||||
#include <ihk/cpu.h>
|
||||
#include <ihk/debug.h>
|
||||
#include <cls.h>
|
||||
#include <cpulocal.h>
|
||||
#include <syscall.h>
|
||||
@ -32,7 +31,8 @@
|
||||
#include <page.h>
|
||||
#include <limits.h>
|
||||
#include <syscall.h>
|
||||
#include <debug.h>
|
||||
#include <rusage_private.h>
|
||||
#include <ihk/debug.h>
|
||||
|
||||
void terminate_mcexec(int, int);
|
||||
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
|
||||
@ -64,7 +64,6 @@ uintptr_t debug_constants[] = {
|
||||
-1,
|
||||
};
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 2
|
||||
struct vdso {
|
||||
long busy;
|
||||
@ -80,8 +79,24 @@ struct vdso {
|
||||
long hpet_phys;
|
||||
void *pvti_virt;
|
||||
long pvti_phys;
|
||||
void *vgtod_virt;
|
||||
};
|
||||
|
||||
struct vsyscall_gtod_data {
|
||||
int seq;
|
||||
|
||||
struct {
|
||||
int vclock_mode;
|
||||
unsigned long cycle_last;
|
||||
unsigned long mask;
|
||||
unsigned int mult;
|
||||
unsigned int shift;
|
||||
} clock;
|
||||
|
||||
/* open coded 'struct timespec' */
|
||||
time_t wall_time_sec;
|
||||
unsigned long wall_time_snsec;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
static struct vdso vdso;
|
||||
static size_t container_size = 0;
|
||||
@ -139,7 +154,7 @@ arch_clear_host_user_space()
|
||||
|
||||
/* XXX: might be unnecessary */
|
||||
clear_host_pte(th->vm->region.user_start,
|
||||
(th->vm->region.user_end - th->vm->region.user_start));
|
||||
(th->vm->region.user_end - th->vm->region.user_start), 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -558,7 +573,7 @@ long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov)
|
||||
return rc;
|
||||
}
|
||||
|
||||
extern void coredump(struct thread *thread, void *regs);
|
||||
extern int coredump(struct thread *thread, void *regs, int sig);
|
||||
|
||||
void ptrace_report_signal(struct thread *thread, int sig)
|
||||
{
|
||||
@ -726,6 +741,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
struct mcs_rwlock_node_irqsave lock;
|
||||
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
||||
int restart = 0;
|
||||
int ret;
|
||||
|
||||
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
||||
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
|
||||
@ -971,15 +987,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
dkprintf("SIGTRAP(): woken up\n");
|
||||
break;
|
||||
case SIGCONT:
|
||||
memset(&info, '\0', sizeof info);
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
|
||||
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
|
||||
proc->status = PS_RUNNING;
|
||||
dkprintf("do_signal,SIGCONT,do nothing\n");
|
||||
break;
|
||||
case SIGQUIT:
|
||||
case SIGILL:
|
||||
@ -991,9 +998,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
|
||||
case SIGXCPU:
|
||||
case SIGXFSZ:
|
||||
core:
|
||||
dkprintf("do_signal,default,core,sig=%d\n", sig);
|
||||
coredump(thread, regs);
|
||||
coredumped = 0x80;
|
||||
thread->coredump_regs =
|
||||
kmalloc(sizeof(struct x86_user_context),
|
||||
IHK_MC_AP_NOWAIT);
|
||||
if (!thread->coredump_regs) {
|
||||
kprintf("%s: Out of memory\n", __func__);
|
||||
goto skip;
|
||||
}
|
||||
memcpy(thread->coredump_regs, regs,
|
||||
sizeof(struct x86_user_context));
|
||||
|
||||
ret = coredump(thread, regs, sig);
|
||||
switch (ret) {
|
||||
case -EBUSY:
|
||||
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
|
||||
__func__);
|
||||
break;
|
||||
case 0:
|
||||
coredumped = 0x80;
|
||||
break;
|
||||
default:
|
||||
kprintf("%s: ERROR: coredump failed (%d)\n",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
skip:
|
||||
terminate(0, sig | coredumped);
|
||||
break;
|
||||
case SIGCHLD:
|
||||
@ -1038,7 +1067,9 @@ getsigpending(struct thread *thread, int delflag){
|
||||
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
if(delflag ||
|
||||
(sig != SIGCHLD && sig != SIGURG) ||
|
||||
(sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
sig != SIGCONT) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
k->sa.sa_handler != NULL)){
|
||||
if(!(pending->sigmask.__val[0] & w)){
|
||||
@ -1119,7 +1150,8 @@ check_signal(unsigned long rc, void *regs0, int num)
|
||||
if(thread == NULL || thread == &cpu_local_var(idle)){
|
||||
struct thread *t;
|
||||
|
||||
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
|
||||
irqstate = cpu_disable_interrupt_save();
|
||||
ihk_mc_spinlock_lock_noirq(&(cpu_local_var(runq_lock)));
|
||||
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
|
||||
if(t == &cpu_local_var(idle))
|
||||
continue;
|
||||
@ -1129,7 +1161,8 @@ check_signal(unsigned long rc, void *regs0, int num)
|
||||
break;
|
||||
}
|
||||
}
|
||||
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
|
||||
ihk_mc_spinlock_unlock_noirq(&(cpu_local_var(runq_lock)));
|
||||
cpu_restore_interrupt(irqstate);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -1185,7 +1218,9 @@ check_sig_pending_thread(struct thread *thread)
|
||||
for (x = pending->sigmask.__val[0], sig = 0; x;
|
||||
sig++, x >>= 1);
|
||||
k = thread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGCHLD && sig != SIGURG) ||
|
||||
if ((sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
sig != SIGCONT) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
k->sa.sa_handler != NULL)) {
|
||||
if (!(pending->sigmask.__val[0] & w)) {
|
||||
@ -1194,6 +1229,7 @@ check_sig_pending_thread(struct thread *thread)
|
||||
found = 1;
|
||||
if (sig != SIGCHLD &&
|
||||
sig != SIGURG &&
|
||||
sig != SIGCONT &&
|
||||
!k->sa.sa_handler) {
|
||||
found = 2;
|
||||
break;
|
||||
@ -1278,7 +1314,6 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
struct list_head *head = NULL;
|
||||
int rc;
|
||||
unsigned long irqstate = 0;
|
||||
struct k_sigaction *k;
|
||||
int doint;
|
||||
int found = 0;
|
||||
siginfo_t info0;
|
||||
@ -1288,6 +1323,7 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
|
||||
struct process_hash *phash = rset->process_hash;
|
||||
struct mcs_rwlock_node lock;
|
||||
struct mcs_rwlock_node updatelock;
|
||||
struct sig_pending *pending = NULL;
|
||||
|
||||
if(sig > 64 || sig < 0)
|
||||
return -EINVAL;
|
||||
@ -1509,54 +1545,70 @@ done:
|
||||
|
||||
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
|
||||
|
||||
/* Put signal event even when handler is SIG_IGN or SIG_DFL
|
||||
because target ptraced thread must call ptrace_report_signal
|
||||
in check_signal */
|
||||
rc = 0;
|
||||
k = tthread->sigcommon->action + sig - 1;
|
||||
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
|
||||
(k->sa.sa_handler != (void *)1 &&
|
||||
(k->sa.sa_handler != NULL ||
|
||||
(sig != SIGCHLD && sig != SIGURG)))) {
|
||||
struct sig_pending *pending = NULL;
|
||||
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list){
|
||||
if(pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(&pending->list == head)
|
||||
pending = NULL;
|
||||
|
||||
if (sig < 33) { // SIGRTMIN - SIGRTMAX
|
||||
list_for_each_entry(pending, head, list) {
|
||||
if (pending->sigmask.__val[0] == mask &&
|
||||
pending->ptracecont == ptracecont)
|
||||
break;
|
||||
}
|
||||
if(pending == NULL){
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if(!pending){
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else{
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if(sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
if (&pending->list == head)
|
||||
pending = NULL;
|
||||
}
|
||||
if (pending == NULL) {
|
||||
doint = 1;
|
||||
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
||||
if (!pending) {
|
||||
rc = -ENOMEM;
|
||||
}
|
||||
else {
|
||||
memset(pending, 0, sizeof(struct sig_pending));
|
||||
pending->sigmask.__val[0] = mask;
|
||||
memcpy(&pending->info, info, sizeof(siginfo_t));
|
||||
pending->ptracecont = ptracecont;
|
||||
if (sig == SIGKILL || sig == SIGSTOP)
|
||||
list_add(&pending->list, head);
|
||||
else
|
||||
list_add_tail(&pending->list, head);
|
||||
tthread->sigevent = 1;
|
||||
}
|
||||
}
|
||||
|
||||
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
|
||||
cpu_restore_interrupt(irqstate);
|
||||
|
||||
if (sig == SIGCONT || ptracecont == 1) {
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
|
||||
struct siginfo info;
|
||||
|
||||
tthread->proc->main_thread->signal_flags =
|
||||
SIGNAL_STOP_CONTINUED;
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
memset(&info, '\0', sizeof(info));
|
||||
info.si_signo = SIGCHLD;
|
||||
info.si_code = CLD_CONTINUED;
|
||||
info._sifields._sigchld.si_pid = tthread->proc->pid;
|
||||
info._sifields._sigchld.si_status = 0x0000ffff;
|
||||
do_kill(tthread, tthread->proc->parent->pid, -1,
|
||||
SIGCHLD, &info, 0);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
if (thread != tthread) {
|
||||
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||
ihk_mc_get_vector(IHK_GV_IKC));
|
||||
}
|
||||
doint = 0;
|
||||
}
|
||||
}
|
||||
if (doint && !(mask & tthread->sigmask.__val[0])) {
|
||||
int status = tthread->status;
|
||||
|
||||
if (thread != tthread) {
|
||||
dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
|
||||
tproc->pid, tthread->cpu_id);
|
||||
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(tthread->cpu_id)->apic_id, 0xd0);
|
||||
ihk_mc_interrupt_cpu(tthread->cpu_id,
|
||||
ihk_mc_get_vector(IHK_GV_IKC));
|
||||
}
|
||||
|
||||
if (status != PS_RUNNING) {
|
||||
@ -1564,11 +1616,6 @@ done:
|
||||
/* Wake up the target only when stopped by ptrace-reporting */
|
||||
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
|
||||
}
|
||||
else if(sig == SIGCONT || ptracecont == 1){
|
||||
/* Wake up the target only when stopped by SIGSTOP */
|
||||
sched_wakeup_thread(tthread, PS_STOPPED);
|
||||
tthread->proc->status = PS_RUNNING;
|
||||
}
|
||||
else {
|
||||
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
|
||||
}
|
||||
@ -1593,7 +1640,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
|
||||
}
|
||||
|
||||
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
|
||||
coredump(thread, regs0);
|
||||
coredump(thread, regs0, sig);
|
||||
terminate(0, sig | 0x80);
|
||||
}
|
||||
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
|
||||
@ -1629,7 +1676,7 @@ SYSCALL_DECLARE(mmap)
|
||||
;
|
||||
|
||||
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
||||
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
||||
const int prot = ihk_mc_syscall_arg2(ctx);
|
||||
const int flags0 = ihk_mc_syscall_arg3(ctx);
|
||||
const int fd = ihk_mc_syscall_arg4(ctx);
|
||||
@ -1668,7 +1715,9 @@ SYSCALL_DECLARE(mmap)
|
||||
if (flags & MAP_HUGETLB) {
|
||||
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
|
||||
case 0:
|
||||
flags |= MAP_HUGE_2MB; /* default hugepage size */
|
||||
/* default hugepage size */
|
||||
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
break;
|
||||
|
||||
case MAP_HUGE_2MB:
|
||||
@ -1684,16 +1733,29 @@ SYSCALL_DECLARE(mmap)
|
||||
}
|
||||
|
||||
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
|
||||
/* Round-up map length by pagesize */
|
||||
len0 = ALIGN(len0, pgsize);
|
||||
|
||||
if (rusage_check_overmap(len0,
|
||||
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
|
||||
addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
|
||||
addr = addr0;
|
||||
len = (len0 + pgsize - 1) & ~(pgsize - 1);
|
||||
recheck:
|
||||
if ((addr & (pgsize - 1))
|
||||
|| (len == 0)
|
||||
|| !(flags & (MAP_SHARED | MAP_PRIVATE))
|
||||
|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
|
||||
|| (off0 & (pgsize - 1))) {
|
||||
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||
addr = VALID_DUMMY_ADDR;
|
||||
goto recheck;
|
||||
}
|
||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
|
||||
addr0, len0, prot, flags0, fd, off0);
|
||||
error = -EINVAL;
|
||||
@ -1703,6 +1765,10 @@ SYSCALL_DECLARE(mmap)
|
||||
if (addr < region->user_start
|
||||
|| region->user_end <= addr
|
||||
|| len > (region->user_end - region->user_start)) {
|
||||
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
|
||||
addr = VALID_DUMMY_ADDR;
|
||||
goto recheck;
|
||||
}
|
||||
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
|
||||
addr0, len0, prot, flags0, fd, off0);
|
||||
error = -ENOMEM;
|
||||
@ -1730,10 +1796,20 @@ out:
|
||||
|
||||
SYSCALL_DECLARE(clone)
|
||||
{
|
||||
return do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
||||
struct process *proc = cpu_local_var(current)->proc;
|
||||
struct mcs_rwlock_node_irqsave lock_dump;
|
||||
unsigned long ret;
|
||||
|
||||
/* mutex coredump */
|
||||
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
|
||||
|
||||
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
||||
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
|
||||
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
|
||||
ihk_mc_syscall_sp(ctx));
|
||||
|
||||
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
|
||||
return ret;
|
||||
}
|
||||
|
||||
SYSCALL_DECLARE(fork)
|
||||
@ -1761,7 +1837,9 @@ SYSCALL_DECLARE(shmget)
|
||||
int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);
|
||||
|
||||
if (hugeshift == 0) {
|
||||
shmflg |= SHM_HUGE_2MB; /* default hugepage size */
|
||||
/* default hugepage size */
|
||||
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
|
||||
MAP_HUGE_SHIFT;
|
||||
} else if (hugeshift == SHM_HUGE_2MB ||
|
||||
hugeshift == SHM_HUGE_1GB) {
|
||||
/*nop*/
|
||||
@ -2210,7 +2288,7 @@ int do_process_vm_read_writev(int pid,
|
||||
}
|
||||
|
||||
/* Check if parameters are okay */
|
||||
ihk_mc_spinlock_lock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
range = lookup_process_memory_range(lthread->vm,
|
||||
(uintptr_t)local_iov,
|
||||
@ -2232,7 +2310,7 @@ int do_process_vm_read_writev(int pid,
|
||||
|
||||
ret = 0;
|
||||
arg_out:
|
||||
ihk_mc_spinlock_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
@ -2301,7 +2379,7 @@ arg_out:
|
||||
if (pli != li) {
|
||||
struct vm_range *range;
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
/* Is base valid? */
|
||||
range = lookup_process_memory_range(lthread->vm,
|
||||
@ -2331,7 +2409,7 @@ arg_out:
|
||||
|
||||
ret = 0;
|
||||
pli_out:
|
||||
ihk_mc_spinlock_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(<hread->vm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
@ -2344,7 +2422,7 @@ pli_out:
|
||||
if (pri != ri) {
|
||||
struct vm_range *range;
|
||||
|
||||
ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
|
||||
ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);
|
||||
|
||||
/* Is base valid? */
|
||||
range = lookup_process_memory_range(rvm,
|
||||
@ -2374,7 +2452,7 @@ pli_out:
|
||||
|
||||
ret = 0;
|
||||
pri_out:
|
||||
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
|
||||
ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);
|
||||
|
||||
if (ret != 0) {
|
||||
goto out;
|
||||
@ -2811,4 +2889,46 @@ time_t time(void) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
void calculate_time_from_tsc(struct timespec *ts)
|
||||
{
|
||||
unsigned long seq;
|
||||
unsigned long seq2;
|
||||
unsigned long ns;
|
||||
unsigned long delta;
|
||||
struct vsyscall_gtod_data *gtod = vdso.vgtod_virt;
|
||||
|
||||
do {
|
||||
for (;;) {
|
||||
seq = ACCESS_ONCE(gtod->seq);
|
||||
if (unlikely(seq & 1)) {
|
||||
cpu_pause();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
rmb(); /* fetch sequence before time */
|
||||
ts->tv_sec = gtod->wall_time_sec;
|
||||
ns = gtod->wall_time_snsec;
|
||||
delta = rdtsc() - gtod->clock.cycle_last;
|
||||
ns += delta * gtod->clock.mult;
|
||||
ns >>= gtod->clock.shift;
|
||||
seq2 = ACCESS_ONCE(gtod->seq);
|
||||
rmb(); /* fetch time before checking sequence */
|
||||
} while (seq != seq2);
|
||||
ts->tv_nsec = ns;
|
||||
|
||||
if (ts->tv_nsec >= NS_PER_SEC) {
|
||||
ts->tv_nsec -= NS_PER_SEC;
|
||||
++ts->tv_sec;
|
||||
}
|
||||
}
|
||||
|
||||
extern void ptrace_syscall_event(struct thread *thread);
|
||||
long arch_ptrace_syscall_event(struct thread *thread,
|
||||
ihk_mc_user_context_t *ctx, long setret)
|
||||
{
|
||||
ihk_mc_syscall_ret(ctx) = setret;
|
||||
ptrace_syscall_event(thread);
|
||||
return ihk_mc_syscall_ret(ctx);
|
||||
}
|
||||
/*** End of File ***/
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
[Unit]
|
||||
Description=irqbalance daemon
|
||||
After=syslog.target
|
||||
|
||||
[Service]
|
||||
EnvironmentFile=/tmp/irqbalance_mck
|
||||
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
@ -1,150 +0,0 @@
|
||||
# mcoverlay-create-smp-x86.sh.in COPYRIGHT FUJITSU LIMITED 2018
|
||||
# Overlay /proc, /sys with McKernel specific contents
|
||||
|
||||
#
|
||||
# Revert any state that has been initialized before the error occured.
|
||||
#
|
||||
if [ -z "$(declare -f error_exit)" ]; then
|
||||
error_exit() {
|
||||
local status=$1
|
||||
|
||||
case $status in
|
||||
mcos_sys_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/mcos0_sys
|
||||
fi
|
||||
;&
|
||||
mcos_proc_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/mcos0_proc
|
||||
fi
|
||||
;&
|
||||
mcoverlayfs_loaded)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
rmmod mcoverlay 2>/dev/null
|
||||
fi
|
||||
;&
|
||||
linux_proc_bind_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos/linux_proc
|
||||
fi
|
||||
;&
|
||||
tmp_mcos_mounted)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
umount /tmp/mcos
|
||||
fi
|
||||
;&
|
||||
tmp_mcos_created)
|
||||
if [ "$enable_mcoverlay" == "yes" ]; then
|
||||
rm -rf /tmp/mcos
|
||||
fi
|
||||
;&
|
||||
initial)
|
||||
# Nothing more to revert
|
||||
;;
|
||||
esac
|
||||
|
||||
# Retun -EINVAL
|
||||
exit -22
|
||||
}
|
||||
fi
|
||||
|
||||
if [ ! -e /tmp/mcos ]; then
|
||||
mkdir -p /tmp/mcos;
|
||||
fi
|
||||
if ! mount -t tmpfs tmpfs /tmp/mcos; then
|
||||
echo "error: mount /tmp/mcos" >&2
|
||||
error_exit "tmp_mcos_created"
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/linux_proc ]; then
|
||||
mkdir -p /tmp/mcos/linux_proc;
|
||||
fi
|
||||
if ! mount --bind /proc /tmp/mcos/linux_proc; then
|
||||
echo "error: mount /tmp/mcos/linux_proc" >&2
|
||||
error_exit "tmp_mcos_mounted"
|
||||
fi
|
||||
if ! taskset -c 0 insmod @KMODDIR@/mcoverlay.ko 2>/dev/null; then
|
||||
echo "error: inserting mcoverlay.ko" >&2
|
||||
error_exit "linux_proc_bind_mounted"
|
||||
fi
|
||||
while [ ! -e /proc/mcos0 ]
|
||||
do
|
||||
sleep 0.1
|
||||
done
|
||||
if [ ! -e /tmp/mcos/mcos0_proc ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc_upper;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_proc_work;
|
||||
fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
|
||||
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
|
||||
error_exit "mcoverlayfs_loaded"
|
||||
fi
|
||||
# TODO: How de we revert this in case of failure??
|
||||
mount --make-rprivate /proc
|
||||
|
||||
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
|
||||
do
|
||||
sleep 0.1
|
||||
done
|
||||
if [ ! -e /tmp/mcos/mcos0_sys ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys_upper;
|
||||
fi
|
||||
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
|
||||
mkdir -p /tmp/mcos/mcos0_sys_work;
|
||||
fi
|
||||
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
|
||||
echo "error: mount /tmp/mcos/mcos0_sys" >&2
|
||||
error_exit "mcos_proc_mounted"
|
||||
fi
|
||||
# TODO: How de we revert this in case of failure??
|
||||
mount --make-rprivate /sys
|
||||
|
||||
touch /tmp/mcos/mcos0_proc/mckernel
|
||||
|
||||
rm -rf /tmp/mcos/mcos0_sys/setup_complete
|
||||
|
||||
# Hide NUMA related files which are outside the LWK partition
|
||||
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
|
||||
else
|
||||
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
|
||||
fi
|
||||
done
|
||||
fi
|
||||
done
|
||||
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
|
||||
else
|
||||
# Delete non-existent symlinks
|
||||
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
|
||||
fi
|
||||
done
|
||||
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
|
||||
fi
|
||||
done
|
||||
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
|
||||
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
|
||||
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
|
||||
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
|
||||
fi
|
||||
done
|
||||
|
||||
exit 0
|
||||
@ -1,16 +0,0 @@
|
||||
# Remove mcoverlay if loaded
|
||||
|
||||
if grep mcoverlay /proc/modules &>/dev/null; then
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
|
||||
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
|
||||
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
|
||||
if ! rmmod mcoverlay 2>/dev/null; then
|
||||
echo "error: removing mcoverlay" >&2
|
||||
# Return -EINVAL
|
||||
exit -22
|
||||
fi
|
||||
fi
|
||||
|
||||
exit 0
|
||||
383
cmake/modules/AutoconfHelper.cmake
Normal file
383
cmake/modules/AutoconfHelper.cmake
Normal file
@ -0,0 +1,383 @@
|
||||
# Helper functions for translating autoconf projects. Several functions
|
||||
# are lifted from the Mono sources
|
||||
|
||||
include (CheckCSourceCompiles)
|
||||
include (CheckIncludeFile)
|
||||
include (TestBigEndian)
|
||||
include (CheckFunctionExists)
|
||||
include (CheckTypeSize)
|
||||
include (CheckCSourceRuns)
|
||||
|
||||
|
||||
# Function to get the version information from the configure.ac file in the
|
||||
# current directory. Its argument is the name of the library as passed to
|
||||
# AC_INIT. It will set the variables ${LIBNAME}_VERSION and ${LIBNAME}_SOVERSION
|
||||
function (ac_get_version libname)
|
||||
string(TOUPPER "${libname}" libname_upper)
|
||||
|
||||
# Read the relevant content from configure.ac
|
||||
file (STRINGS configure.ac tmp_configure_ac
|
||||
REGEX "${libname_upper}_[_A-Z]+=[ \\t]*[0-9]+")
|
||||
|
||||
# Product version
|
||||
string (REGEX REPLACE ".+MAJOR[_A-Z]+=([0-9]+).+MINOR[_A-Z]+=([0-9]+).+MICRO[_A-Z]+=([0-9]+).*"
|
||||
"\\1.\\2.\\3" ${libname_upper}_VERSION "${tmp_configure_ac}")
|
||||
|
||||
# Library version for libtool
|
||||
string (REGEX REPLACE ".+CURRENT=([0-9]+).+REVISION=([0-9]+).+AGE=([0-9]+).*"
|
||||
"\\1.\\2.\\3" ${libname_upper}_SOVERSION "${tmp_configure_ac}")
|
||||
|
||||
# Checks if the string needs to be displayed
|
||||
set (${libname_upper}_DISPLAYSTR_AUX
|
||||
"Found ${libname} version ${${libname_upper}_VERSION}, soversion ${${libname_upper}_SOVERSION} from configure.ac"
|
||||
)
|
||||
if ((NOT ${libname_upper}_DISPLAYSTR) OR (NOT ${libname_upper}_DISPLAYSTR STREQUAL ${libname_upper}_DISPLAYSTR_AUX))
|
||||
set (${libname_upper}_DISPLAYSTR ${${libname_upper}_DISPLAYSTR_AUX}
|
||||
CACHE INTERNAL "Version string from ${libname}" FORCE)
|
||||
message (STATUS ${${libname_upper}_DISPLAYSTR})
|
||||
endif ()
|
||||
|
||||
# Export the result to the caller
|
||||
set(${libname_upper}_VERSION "${${libname_upper}_VERSION}" PARENT_SCOPE)
|
||||
set(${libname_upper}_SOVERSION "${${libname_upper}_SOVERSION}" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Also from mono's source code
|
||||
# Implementation of AC_CHECK_HEADERS
|
||||
# In addition, it also records the list of variables in the variable
|
||||
# 'autoheader_vars', and for each variable, a documentation string in the
|
||||
# variable ${var}_doc
|
||||
function(ac_check_headers)
|
||||
foreach (header ${ARGV})
|
||||
string(TOUPPER ${header} header_var)
|
||||
string(REPLACE "." "_" header_var ${header_var})
|
||||
string(REPLACE "/" "_" header_var ${header_var})
|
||||
set(header_var "HAVE_${header_var}")
|
||||
check_include_file (${header} ${header_var})
|
||||
set("${header_var}_doc" "Define to 1 if you have the <${header}> header file." PARENT_SCOPE)
|
||||
if (${header_var})
|
||||
set("${header_var}_defined" "1" PARENT_SCOPE)
|
||||
endif()
|
||||
set("${header_var}_val" "1" PARENT_SCOPE)
|
||||
set (autoheader_vars ${autoheader_vars} ${header_var})
|
||||
endforeach()
|
||||
set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# Function taken from mono's source code
|
||||
function (ac_check_funcs)
|
||||
foreach (func ${ARGV})
|
||||
string(TOUPPER ${func} var)
|
||||
set(var "HAVE_${var}")
|
||||
set(${var})
|
||||
check_function_exists (${func} ${var})
|
||||
set("${var}_doc" "Define to 1 if you have the '${func}' function." PARENT_SCOPE)
|
||||
if (${var})
|
||||
set("${var}_defined" "1" PARENT_SCOPE)
|
||||
set(${var} yes PARENT_SCOPE)
|
||||
endif()
|
||||
set("${var}_val" "1" PARENT_SCOPE)
|
||||
set (autoheader_vars ${autoheader_vars} ${var})
|
||||
endforeach()
|
||||
set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Specifically, this macro checks for stdlib.h', stdarg.h',
|
||||
# string.h', and float.h'; if the system has those, it probably
|
||||
# has the rest of the ANSI C header files. This macro also checks
|
||||
# whether string.h' declares memchr' (and thus presumably the
|
||||
# other mem' functions), whether stdlib.h' declare free' (and
|
||||
# thus presumably malloc' and other related functions), and whether
|
||||
# the ctype.h' macros work on characters with the high bit set, as
|
||||
# ANSI C requires.
|
||||
function (ac_header_stdc)
|
||||
if (STDC_HEADERS)
|
||||
return()
|
||||
endif()
|
||||
message(STATUS "Looking for ANSI-C headers")
|
||||
set(code "
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <string.h>
|
||||
#include <float.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
void *ptr;
|
||||
free((void*)1);
|
||||
ptr = memchr((void*)1, 0, 0);
|
||||
|
||||
return (int)ptr;
|
||||
}
|
||||
")
|
||||
# FIXME Check the ctype.h high bit
|
||||
CHECK_C_SOURCE_COMPILES("${code}" STDC_HEADERS)
|
||||
if (STDC_HEADERS)
|
||||
set(STDC_HEADERS 1 PARENT_SCOPE)
|
||||
message(STATUS "Looking for ANSI-C headers - found")
|
||||
else()
|
||||
message(STATUS "Looking for ANSI-C headers - not found")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Also from the mono sources, kind of implements AC_SYS_LARGEFILE
|
||||
function (ac_sys_largefile)
|
||||
CHECK_C_SOURCE_RUNS("
|
||||
#include <sys/types.h>
|
||||
#define BIG_OFF_T (((off_t)1<<62)-1+((off_t)1<<62))
|
||||
int main (int argc, char **argv) {
|
||||
int big_off_t=((BIG_OFF_T%2147483629==721) &&
|
||||
(BIG_OFF_T%2147483647==1));
|
||||
return big_off ? 0 : 1;
|
||||
}
|
||||
" HAVE_LARGE_FILE_SUPPORT)
|
||||
|
||||
# Check if it makes sense to define _LARGE_FILES or _FILE_OFFSET_BITS
|
||||
if (HAVE_LARGE_FILE_SUPPORT)
|
||||
return()
|
||||
endif()
|
||||
|
||||
set (_LARGE_FILE_EXTRA_SRC "
|
||||
#include <sys/types.h>
|
||||
int main (int argc, char **argv) {
|
||||
return sizeof(off_t) == 8 ? 0 : 1;
|
||||
}
|
||||
")
|
||||
CHECK_C_SOURCE_RUNS ("#define _LARGE_FILES\n${_LARGE_FILE_EXTRA_SRC}"
|
||||
HAVE_USEFUL_D_LARGE_FILES)
|
||||
if (NOT HAVE_USEFUL_D_LARGE_FILES)
|
||||
if (NOT DEFINED HAVE_USEFUL_D_FILE_OFFSET_BITS)
|
||||
set (SHOW_LARGE_FILE_WARNING TRUE)
|
||||
endif ()
|
||||
CHECK_C_SOURCE_RUNS ("#define _FILE_OFFSET_BITS 64\n${_LARGE_FILE_EXTRA_SRC}"
|
||||
HAVE_USEFUL_D_FILE_OFFSET_BITS)
|
||||
if (HAVE_USEFUL_D_FILE_OFFSET_BITS)
|
||||
set (_FILE_OFFSET_BITS 64 PARENT_SCOPE)
|
||||
elseif (SHOW_LARGE_FILE_WARNING)
|
||||
message (WARNING "No 64 bit file support through off_t available.")
|
||||
endif ()
|
||||
else ()
|
||||
set (_LARGE_FILES 1 PARENT_SCOPE)
|
||||
endif ()
|
||||
endfunction ()
|
||||
|
||||
|
||||
# Quick way to set some basic variables
|
||||
# FIXME add support for variable number of arguments: only package and version are mandatory
|
||||
# arguments are package version bug_report tarname url
|
||||
function (ac_init)
|
||||
set(package ${ARGV0})
|
||||
set(version ${ARGV1})
|
||||
set(bug_report ${ARGV2})
|
||||
set(tarname ${ARGV3})
|
||||
set(url ${ARGV4})
|
||||
set(PACKAGE_NAME "\"${package}\"" PARENT_SCOPE)
|
||||
set(PACKAGE_VERSION "\"${version}\"" PARENT_SCOPE)
|
||||
set(VERSION "\"${version}\"" PARENT_SCOPE)
|
||||
if(version)
|
||||
set(PACKAGE_STRING "\"${package} ${version}\"" PARENT_SCOPE)
|
||||
else()
|
||||
set(PACKAGE_STRING "\"${package}\"" PARENT_SCOPE)
|
||||
endif()
|
||||
|
||||
set(PACKAGE_BUGREPORT "\"${bug_report}\"" PARENT_SCOPE)
|
||||
|
||||
if(NOT tarname)
|
||||
string(REGEX REPLACE "[^a-zA-Z0-9_]" "-" tarname "${package}")
|
||||
endif()
|
||||
set(PACKAGE_TARNAME "\"${tarname}\"" PARENT_SCOPE)
|
||||
|
||||
set(PACKAGE_URL "\"${url}\"" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Checks for the const keyword, defining "HAS_CONST_SUPPORT"
|
||||
# If it does not have support, defines "const" to 0 in the parent scope
|
||||
function (ac_c_const)
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"int main(int argc, char **argv){const int r = 0;return r;}"
|
||||
HAS_CONST_SUPPORT)
|
||||
if (NOT HAS_CONST_SUPPORT)
|
||||
set(const 0 PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Inline keyword support. Defines "inline" in the parent scope to the
|
||||
# compiler internal keyword for inline in C
|
||||
# TODO write a better test!
|
||||
function (ac_c_inline)
|
||||
if (MSVC)
|
||||
set (inline __inline)
|
||||
elseif(CMAKE_COMPILER_IS_GNUC)
|
||||
set (inline __inline__)
|
||||
endif()
|
||||
set(inline "${inline}" PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Test if you can safely include both <sys/time.h> and <time.h>
|
||||
function (ac_header_time)
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <sys/time.h>\n#include <time.h>\nint main(int argc, char **argv) { return 0; }"
|
||||
TIME_WITH_SYS_TIME)
|
||||
set(TIME_WITH_SYS_TIME ${TIME_WITH_SYS_TIME} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Native cpu byte order: 1 if big-endian (Motorola) or 0 if little-endian
|
||||
# (Intel), setting "WORDS_BIGENDIAN" to 1 if big endian
|
||||
function (ac_c_bigendian)
|
||||
TEST_BIG_ENDIAN(HOST_BIGENDIAN)
|
||||
if (HOST_BIGENDIAN)
|
||||
set(WORDS_BIGENDIAN 1 PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Check for off_t, setting "off_t" in the parent scope
|
||||
function(ac_type_off_t)
|
||||
CHECK_TYPE_SIZE("off_t" SIZEOF_OFF_T)
|
||||
if (NOT SIZEOF_OFF_T)
|
||||
set(off_t "long int")
|
||||
endif()
|
||||
set(off_t ${off_t} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Check for size_t, setting "size_t" in the parent scope
|
||||
function(ac_type_size_t)
|
||||
CHECK_TYPE_SIZE("size_t" SIZEOF_SIZE_T)
|
||||
if (NOT SIZEOF_SIZE_T)
|
||||
set(size_t "unsigned int")
|
||||
endif()
|
||||
set(size_t ${size_t} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
|
||||
# Define "TM_IN_SYS_TIME" to 1 if <sys/time.h> declares "struct tm"
|
||||
function(ac_struct_tm)
|
||||
CHECK_C_SOURCE_COMPILES(
|
||||
"#include <sys/time.h>\nint main(int argc, char **argv) { struct tm x; return 0; }"
|
||||
TM_IN_SYS_TIME
|
||||
)
|
||||
if (TM_IN_SYS_TIME)
|
||||
set (TM_IN_SYS_TIME 1 PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Obtain size of an 'type' and define as SIZEOF_TYPE
|
||||
function (ac_check_sizeof typename)
|
||||
string(TOUPPER "SIZEOF_${typename}" varname)
|
||||
string(REPLACE " " "_" varname "${varname}")
|
||||
string(REPLACE "*" "p" varname "${varname}")
|
||||
CHECK_TYPE_SIZE("${typename}" ${varname} BUILTIN_TYPES_ONLY)
|
||||
if(NOT ${varname})
|
||||
set(${varname} 0 PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Check if the type exists, defines HAVE_<type>
|
||||
function (ac_check_type typename)
|
||||
string(TOUPPER "${typename}" varname)
|
||||
string(REPLACE " " "_" varname "${varname}")
|
||||
string(REPLACE "*" "p" varname "${varname}")
|
||||
CHECK_TYPE_SIZE("${typename}" ${varname})
|
||||
if (NOT "${varname}" STREQUAL "")
|
||||
set("HAVE_${varname}" 1 PARENT_SCOPE)
|
||||
set("${varname}" "${typename}" PARENT_SCOPE)
|
||||
else()
|
||||
set("${varname}" "unknown" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
|
||||
# Verifies if each type on the list exists, using the given prelude
|
||||
function (ac_check_types type_list prelude)
|
||||
foreach(typename ${type_list})
|
||||
string(TOUPPER "HAVE_${typename}" varname)
|
||||
string(REPLACE " " "_" varname "${varname}")
|
||||
string(REPLACE "*" "p" varname "${varname}")
|
||||
CHECK_C_SOURCE_COMPILES("${prelude}\n ${typename} foo;" ${varname})
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
function(ac_path_prog variable prog_to_check_for value_if_not_found env_var)
|
||||
find_program(${variable} NAMES ${prog_to_check_for} PATHS ENV ${env_var} NO_DEFAULT_PATH)
|
||||
if(NOT ${variable})
|
||||
message(STATUS "Looking for ${prog_to_check_for} - not found")
|
||||
set(${variable} ${value_if_not_fount} PARENT_SCOPE)
|
||||
else()
|
||||
message(STATUS "Looking for ${prog_to_check_for} - ${variable}")
|
||||
set(${variable} ${${variable}} PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# check if function func exists in library lib
|
||||
function(ac_check_lib lib func)
|
||||
string(TOUPPER "HAVE_${func}" varname)
|
||||
set(CMAKE_REQUIRED_LIBRARIES ${lib})
|
||||
check_function_exists(${func} ${varname})
|
||||
set(CMAKE_REQUIRED_LIBRARIES)
|
||||
endfunction()
|
||||
|
||||
# check if source compiles without linking
|
||||
function(ac_try_compile SOURCE VAR)
|
||||
set(CMAKE_TMP_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp)
|
||||
if(NOT DEFINED "${VAR}")
|
||||
file(WRITE
|
||||
"${CMAKE_TMP_DIR}/src.c"
|
||||
"${SOURCE}\n"
|
||||
)
|
||||
|
||||
if(NOT CMAKE_REQUIRED_QUIET)
|
||||
message(STATUS "Performing Test ${VAR}")
|
||||
endif()
|
||||
# Set up CMakeLists.txt for static library:
|
||||
file(WRITE
|
||||
${CMAKE_TMP_DIR}/CMakeLists.txt
|
||||
"add_library(compile STATIC src.c)"
|
||||
)
|
||||
|
||||
# Configure:
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
|
||||
WORKING_DIRECTORY ${CMAKE_TMP_DIR}
|
||||
)
|
||||
|
||||
# Build:
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_COMMAND} --build ${CMAKE_TMP_DIR}
|
||||
RESULT_VARIABLE RESVAR
|
||||
OUTPUT_VARIABLE OUTPUT
|
||||
ERROR_VARIABLE OUTPUT
|
||||
)
|
||||
|
||||
# Set up result:
|
||||
if(RESVAR EQUAL 0)
|
||||
set(${VAR} 1 CACHE INTERNAL "Test ${VAR}")
|
||||
if(NOT CMAKE_REQUIRED_QUIET)
|
||||
message(STATUS "Performing Test ${VAR} - Success")
|
||||
endif()
|
||||
|
||||
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
|
||||
"Performing C SOURCE FILE Test ${VAR} succeded with the following output:\n"
|
||||
"${OUTPUT}\n"
|
||||
"Source file was:\n${SOURCE}\n")
|
||||
else()
|
||||
if(NOT CMAKE_REQUIRED_QUIET)
|
||||
message(STATUS "Performing Test ${VAR} - Failed")
|
||||
endif()
|
||||
set(${VAR} "" CACHE INTERNAL "Test ${VAR}")
|
||||
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
|
||||
"Performing C SOURCE FILE Test ${VAR} failed with the following output:\n"
|
||||
"${OUTPUT}\n"
|
||||
"Source file was:\n${SOURCE}\n")
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
64
cmake/modules/FindLibElf.cmake
Normal file
64
cmake/modules/FindLibElf.cmake
Normal file
@ -0,0 +1,64 @@
|
||||
# - Try to find libelf
|
||||
# Once done this will define
|
||||
#
|
||||
# LIBELF_FOUND - system has libelf
|
||||
# LIBELF_INCLUDE_DIRS - the libelf include directory
|
||||
# LIBELF_LIBRARIES - Link these to use libelf
|
||||
# LIBELF_DEFINITIONS - Compiler switches required for using libelf
|
||||
#
|
||||
# This module reads hints about search locations from variables:
|
||||
#
|
||||
# LIBELF_ROOT - Preferred installation prefix
|
||||
#
|
||||
# Copyright (c) 2008 Bernhard Walle <bernhard.walle@gmx.de>
|
||||
#
|
||||
# Redistribution and use is allowed according to the terms of the New
|
||||
# BSD license.
|
||||
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
|
||||
#
|
||||
|
||||
|
||||
if (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
|
||||
set (LibElf_FIND_QUIETLY TRUE)
|
||||
endif (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
|
||||
|
||||
find_path (LIBELF_INCLUDE_DIRS
|
||||
NAMES
|
||||
libelf/libelf.h libelf.h
|
||||
HINTS
|
||||
${LIBELF_ROOT}
|
||||
PATH_SUFFIXES
|
||||
include
|
||||
libelf/include
|
||||
)
|
||||
|
||||
find_library (LIBELF_LIBRARIES
|
||||
NAMES
|
||||
elf libelf
|
||||
HINTS
|
||||
${LIBELF_ROOT}
|
||||
PATH_SUFFIXES
|
||||
lib
|
||||
libelf/lib
|
||||
)
|
||||
|
||||
include (FindPackageHandleStandardArgs)
|
||||
|
||||
|
||||
# handle the QUIETLY and REQUIRED arguments and set LIBELF_FOUND to TRUE if all listed variables are TRUE
|
||||
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibElf DEFAULT_MSG
|
||||
LIBELF_LIBRARIES
|
||||
LIBELF_INCLUDE_DIRS)
|
||||
|
||||
set(CMAKE_REQUIRED_LIBRARIES elf)
|
||||
include(CheckCXXSourceCompiles)
|
||||
check_cxx_source_compiles("#include <libelf.h>
|
||||
int main() {
|
||||
Elf *e = (Elf*)0;
|
||||
size_t sz;
|
||||
elf_getshdrstrndx(e, &sz);
|
||||
return 0;
|
||||
}" ELF_GETSHDRSTRNDX)
|
||||
unset(CMAKE_REQUIRED_LIBRARIES)
|
||||
|
||||
mark_as_advanced(LIBELF_INCLUDE_DIRS LIBELF_LIBRARIES ELF_GETSHDRSTRNDX)
|
||||
@ -14,6 +14,28 @@ mark_as_advanced(
|
||||
KBUILD_MAKE_FLAGS
|
||||
)
|
||||
|
||||
if (${CMAKE_GENERATOR} STREQUAL Ninja)
|
||||
set(MAKE "make")
|
||||
list(APPEND KBUILD_MAKE_FLAGS "-j")
|
||||
else ()
|
||||
set(MAKE "$(MAKE)")
|
||||
endif ()
|
||||
|
||||
# Convert McKernel "arm64" into Linux "aarch64"
|
||||
if ("${ARCH}" STREQUAL "arm64")
|
||||
set(LINUX_ARCH "aarch64")
|
||||
else ()
|
||||
set(LINUX_ARCH "${ARCH}")
|
||||
endif ()
|
||||
|
||||
if (NOT "${LINUX_ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
|
||||
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH}")
|
||||
list(APPEND KBUILD_MAKE_FLAGS "CROSS_COMPILE=${CROSS_COMPILE}")
|
||||
endif()
|
||||
|
||||
string(REPLACE ";" " " KBUILD_MAKE_FLAGS_STR "${KBUILD_MAKE_FLAGS}")
|
||||
|
||||
function(kmod MODULE_NAME)
|
||||
cmake_parse_arguments(KMOD "" "INSTALL_DEST" "C_FLAGS;SOURCES;EXTRA_SYMBOLS;DEPENDS" ${ARGN})
|
||||
|
||||
@ -33,17 +55,6 @@ endif(ENABLE_WERROR)
|
||||
configure_file(${CMAKE_SOURCE_DIR}/cmake/modules/Kbuild.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/Kbuild)
|
||||
|
||||
if (${CMAKE_GENERATOR} STREQUAL Ninja)
|
||||
set(MAKE "make")
|
||||
list(APPEND KBUILD_MAKE_FLAGS "-j")
|
||||
else ()
|
||||
set(MAKE "$(MAKE)")
|
||||
endif ()
|
||||
if (NOT "${ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
|
||||
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
|
||||
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH};CROSS_COMPILE=${CROSS_COMPILE}")
|
||||
endif()
|
||||
|
||||
string(REGEX REPLACE "\\.c(;|$)" ".o.cmd\\1" KMOD_O_CMD "${KMOD_SOURCES}")
|
||||
string(REGEX REPLACE "[^/;]+(;|$)" ".\\0" KMOD_O_CMD "${KMOD_O_CMD}")
|
||||
|
||||
@ -78,6 +89,10 @@ endif(ENABLE_WERROR)
|
||||
# the native build system do these checks, if possible at all...
|
||||
add_custom_command(OUTPUT kmod_always_rebuild COMMAND touch kmod_always_rebuild)
|
||||
|
||||
if (NOT EXISTS "${KERNEL_DIR}/Makefile")
|
||||
message(FATAL_ERROR "${KERNEL_DIR} does not contain a Makefile and is probably missing. install kernel development package or set the KERNEL_DIR variable")
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${MODULE_NAME}.ko"
|
||||
"Module.symvers"
|
||||
|
||||
17
config.h.in
17
config.h.in
@ -6,8 +6,9 @@
|
||||
/* version number */
|
||||
#define MCKERNEL_VERSION "${MCKERNEL_VERSION}"
|
||||
|
||||
/* whether mcoverlayfs is enabled */
|
||||
#cmakedefine ENABLE_MCOVERLAYFS 1
|
||||
/* enable the required code for mcexec to be able to use bind mount
|
||||
* there is no config option as its use is discouraged */
|
||||
// #define MCEXEC_BIND_MOUNT 1
|
||||
|
||||
/* whether memdump feature is enabled */
|
||||
#cmakedefine ENABLE_MEMDUMP 1
|
||||
@ -27,18 +28,12 @@
|
||||
/* whether undefined behaviour sanitizer is enabled */
|
||||
#cmakedefine ENABLE_UBSAN 1
|
||||
|
||||
/* whether per-CPU allocator cache (ThunderX2 workaround) is enabled */
|
||||
#cmakedefine ENABLE_PER_CPU_ALLOC_CACHE 1
|
||||
|
||||
/* Path of bind-mount source directory */
|
||||
#cmakedefine ROOTFSDIR "${ROOTFSDIR}"
|
||||
|
||||
/* Path of install directory for libraries */
|
||||
#cmakedefine MCKERNEL_LIBDIR "${MCKERNEL_LIBDIR}"
|
||||
|
||||
/* Path of install directory for binary */
|
||||
#cmakedefine BINDIR "${BINDIR}"
|
||||
|
||||
/* Path of install directory for system binary */
|
||||
#cmakedefine SBINDIR "${SBINDIR}"
|
||||
|
||||
/* for non-RHEL kernels */
|
||||
#ifndef RHEL_RELEASE_VERSION
|
||||
#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b))
|
||||
|
||||
@ -1,27 +0,0 @@
|
||||
#ifndef IHKLIB_RUSAGE_H_INCLUDED
|
||||
#define IHKLIB_RUSAGE_H_INCLUDED
|
||||
|
||||
#define IHK_MAX_NUM_PGSIZES 4
|
||||
#define IHK_MAX_NUM_NUMA_NODES 1024
|
||||
#define IHK_MAX_NUM_CPUS 1024
|
||||
|
||||
#define IHK_OS_PGSIZE_4KB 0
|
||||
#define IHK_OS_PGSIZE_2MB 1
|
||||
#define IHK_OS_PGSIZE_1GB 2
|
||||
|
||||
struct mckernel_rusage {
|
||||
unsigned long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
|
||||
unsigned long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];
|
||||
unsigned long memory_max_usage;
|
||||
unsigned long memory_kmem_usage;
|
||||
unsigned long memory_kmem_max_usage;
|
||||
unsigned long memory_numa_stat[IHK_MAX_NUM_NUMA_NODES];
|
||||
unsigned long cpuacct_stat_system;
|
||||
unsigned long cpuacct_stat_user;
|
||||
unsigned long cpuacct_usage;
|
||||
unsigned long cpuacct_usage_percpu[IHK_MAX_NUM_CPUS];
|
||||
int num_threads;
|
||||
int max_num_threads;
|
||||
};
|
||||
|
||||
#endif /* !defined(IHKLIB_RUSAGE_H_INCLUDED) */
|
||||
@ -55,7 +55,7 @@
|
||||
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
|
||||
|
||||
#define MCEXEC_UP_UTI_GET_CTX 0x30a02920
|
||||
#define MCEXEC_UP_UTI_SAVE_FS 0x30a02921
|
||||
#define MCEXEC_UP_UTI_SWITCH_CTX 0x30a02921
|
||||
#define MCEXEC_UP_SIG_THREAD 0x30a02922
|
||||
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
|
||||
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
|
||||
@ -193,7 +193,6 @@ struct syscall_response {
|
||||
unsigned long req_thread_status;
|
||||
long ret;
|
||||
unsigned long fault_address;
|
||||
unsigned long fault_reason;
|
||||
};
|
||||
|
||||
struct syscall_ret_desc {
|
||||
@ -359,7 +358,7 @@ struct uti_get_ctx_desc {
|
||||
unsigned long key; /* OUT: struct task_struct* of mcexec thread, used to search struct host_thread */
|
||||
};
|
||||
|
||||
struct uti_save_fs_desc {
|
||||
struct uti_switch_ctx_desc {
|
||||
void *rctx; /* Remote context */
|
||||
void *lctx; /* Local context */
|
||||
};
|
||||
|
||||
@ -4,6 +4,8 @@ if(ARCH STREQUAL "x86_64")
|
||||
set(ARCH_C_FLAGS "-mno-red-zone -mcmodel=kernel")
|
||||
endif()
|
||||
|
||||
set(MCEXEC_PATH "${CMAKE_INSTALL_FULL_BINDIR}/mcexec" CACHE STRING "mcexec path for binfmt")
|
||||
|
||||
kmod(mcctrl
|
||||
C_FLAGS
|
||||
-I${IHK_FULL_SOURCE_DIR}/linux/include
|
||||
@ -16,7 +18,7 @@ kmod(mcctrl
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR}/arch/${ARCH}/include
|
||||
-I${PROJECT_BINARY_DIR}
|
||||
-I${PROJECT_SOURCE_DIR}/kernel/include
|
||||
-DMCEXEC_PATH=\\"${CMAKE_INSTALL_FULL_BINDIR}/mcexec\\"
|
||||
-DMCEXEC_PATH=\\"${MCEXEC_PATH}\\"
|
||||
${ARCH_C_FLAGS}
|
||||
SOURCES
|
||||
driver.c control.c ikc.c syscall.c procfs.c binfmt_mcexec.c
|
||||
|
||||
@ -1,7 +1,12 @@
|
||||
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016 */
|
||||
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
|
||||
#include <linux/version.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE
|
||||
#include <linux/sched/task_stack.h>
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) */
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/vdso.h>
|
||||
#include "config.h"
|
||||
#include "../../mcctrl.h"
|
||||
@ -42,7 +47,6 @@ int arch_symbols_init(void)
|
||||
}
|
||||
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 1
|
||||
struct vdso {
|
||||
long busy;
|
||||
@ -53,7 +57,6 @@ struct vdso {
|
||||
long lbase;
|
||||
long offset_sigtramp;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
unsigned long
|
||||
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
||||
@ -95,6 +98,74 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if KERNEL_VERSION(4, 0, 0) <= LINUX_VERSION_CODE
|
||||
static long elf_search_vdso_sigtramp(void)
|
||||
{
|
||||
int i = 0;
|
||||
long ans = -1;
|
||||
char *shstr = NULL, *dynstr = NULL;
|
||||
Elf64_Ehdr *eh = NULL;
|
||||
Elf64_Shdr *tmp_sh = NULL, *sym_sh = NULL;
|
||||
Elf64_Sym *sym = NULL;
|
||||
|
||||
/* ELF header */
|
||||
eh = (Elf64_Ehdr *)vdso_start;
|
||||
if (eh == NULL) {
|
||||
D("vdso_start is NULL.\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* ELF magic check */
|
||||
if (eh->e_ident[EI_MAG0] != ELFMAG0 &&
|
||||
eh->e_ident[EI_MAG1] != ELFMAG1 &&
|
||||
eh->e_ident[EI_MAG2] != ELFMAG2 &&
|
||||
eh->e_ident[EI_MAG3] != ELFMAG3) {
|
||||
D("vdso_start ELF MAGIC Mismatch.\n"
|
||||
"e_ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n",
|
||||
eh->e_ident[EI_MAG0], eh->e_ident[EI_MAG1],
|
||||
eh->e_ident[EI_MAG2], eh->e_ident[EI_MAG3]);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Search dynsym-table and dynstr-table offset
|
||||
* from section header table
|
||||
*/
|
||||
tmp_sh = (Elf64_Shdr *)(vdso_start + eh->e_shoff);
|
||||
shstr = vdso_start + (tmp_sh + eh->e_shstrndx)->sh_offset;
|
||||
for (i = 0; i < eh->e_shnum; i++, tmp_sh++) {
|
||||
if (tmp_sh->sh_type == SHT_DYNSYM) {
|
||||
sym_sh = tmp_sh;
|
||||
}
|
||||
|
||||
if (tmp_sh->sh_type == SHT_STRTAB &&
|
||||
!strcmp(&shstr[tmp_sh->sh_name], ".dynstr")) {
|
||||
dynstr = vdso_start + tmp_sh->sh_offset;
|
||||
}
|
||||
}
|
||||
|
||||
if (sym_sh == NULL) {
|
||||
D("dynsym-table not found.\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (dynstr == 0) {
|
||||
D("dynstr-table not found.\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Search __kernel_rt_sigreturn offset from dynsym-table */
|
||||
sym = (Elf64_Sym *)(vdso_start + sym_sh->sh_offset);
|
||||
for (i = 0; (i * sym_sh->sh_entsize) < sym_sh->sh_size; i++, sym++) {
|
||||
if (!strcmp(dynstr + sym->st_name, "__kernel_rt_sigreturn")) {
|
||||
ans = sym->st_value;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
return ans;
|
||||
}
|
||||
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
|
||||
|
||||
void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
||||
{
|
||||
ihk_device_t dev = ihk_os_to_dev(os);
|
||||
@ -128,7 +199,12 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
||||
|
||||
/* offsets */
|
||||
vdso->lbase = VDSO_LBASE;
|
||||
vdso->offset_sigtramp = vdso_offset_sigtramp;
|
||||
vdso->offset_sigtramp = elf_search_vdso_sigtramp();
|
||||
|
||||
if (unlikely(vdso->offset_sigtramp == -1)) {
|
||||
D("Use vdso_offset_sigtramp in header-file.\n");
|
||||
vdso->offset_sigtramp = vdso_offset_sigtramp;
|
||||
}
|
||||
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
|
||||
out:
|
||||
wmb();
|
||||
@ -142,59 +218,61 @@ out:
|
||||
void *
|
||||
get_user_sp(void)
|
||||
{
|
||||
/* TODO; skeleton for UTI */
|
||||
return NULL;
|
||||
return (void *)current_pt_regs()->sp;
|
||||
}
|
||||
|
||||
void
|
||||
set_user_sp(void *usp)
|
||||
{
|
||||
/* TODO; skeleton for UTI */
|
||||
current_pt_regs()->sp = (unsigned long)usp;
|
||||
}
|
||||
|
||||
/* TODO; skeleton for UTI */
|
||||
struct trans_uctx {
|
||||
volatile int cond;
|
||||
int fregsize;
|
||||
|
||||
unsigned long rax;
|
||||
unsigned long rbx;
|
||||
unsigned long rcx;
|
||||
unsigned long rdx;
|
||||
unsigned long rsi;
|
||||
unsigned long rdi;
|
||||
unsigned long rbp;
|
||||
unsigned long r8;
|
||||
unsigned long r9;
|
||||
unsigned long r10;
|
||||
unsigned long r11;
|
||||
unsigned long r12;
|
||||
unsigned long r13;
|
||||
unsigned long r14;
|
||||
unsigned long r15;
|
||||
unsigned long rflags;
|
||||
unsigned long rip;
|
||||
unsigned long rsp;
|
||||
unsigned long fs;
|
||||
struct user_pt_regs regs;
|
||||
unsigned long tls_baseaddr;
|
||||
};
|
||||
|
||||
void
|
||||
restore_fs(unsigned long fs)
|
||||
restore_tls(unsigned long addr)
|
||||
{
|
||||
/* TODO; skeleton for UTI */
|
||||
const unsigned long tpidrro = 0;
|
||||
|
||||
asm volatile(
|
||||
" msr tpidr_el0, %0\n"
|
||||
" msr tpidrro_el0, %1"
|
||||
: : "r" (addr), "r" (tpidrro));
|
||||
}
|
||||
|
||||
void
|
||||
save_fs_ctx(void *ctx)
|
||||
save_tls_ctx(void __user *ctx)
|
||||
{
|
||||
/* TODO; skeleton for UTI */
|
||||
struct trans_uctx __user *tctx = ctx;
|
||||
unsigned long baseaddr;
|
||||
|
||||
asm volatile(
|
||||
" mrs %0, tpidr_el0"
|
||||
: "=r" (baseaddr));
|
||||
|
||||
if (copy_to_user(&tctx->tls_baseaddr, &baseaddr,
|
||||
sizeof(tctx->tls_baseaddr))) {
|
||||
pr_err("%s: copy_to_user failed.\n", __func__);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned long
|
||||
get_fs_ctx(void *ctx)
|
||||
get_tls_ctx(void __user *ctx)
|
||||
{
|
||||
/* TODO; skeleton for UTI */
|
||||
return 0;
|
||||
struct trans_uctx __user *tctx = ctx;
|
||||
struct trans_uctx kctx;
|
||||
|
||||
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
return kctx.tls_baseaddr;
|
||||
}
|
||||
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
|
||||
@ -304,3 +382,38 @@ out:
|
||||
error, rva, rpa, pgsize);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Assembler switch_ctx executes only ioctl.
|
||||
* Context register save/load is done on Linux (get from current_pt_regs).
|
||||
* Do TLS save/load and register host_thread with ioctl.
|
||||
*/
|
||||
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
|
||||
{
|
||||
int rc = 0;
|
||||
struct trans_uctx *__user rctx = NULL;
|
||||
struct trans_uctx *__user lctx = NULL;
|
||||
struct trans_uctx klctx = {
|
||||
.regs = current_pt_regs()->user_regs,
|
||||
};
|
||||
|
||||
rctx = desc->rctx;
|
||||
lctx = desc->lctx;
|
||||
|
||||
if (copy_to_user(lctx, &klctx, sizeof(klctx))) {
|
||||
pr_err("%s: Error: copy_to_user failed\n", __func__);
|
||||
rc = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (copy_from_user(¤t_pt_regs()->user_regs,
|
||||
&rctx->regs, sizeof(rctx->regs))) {
|
||||
pr_err("%s: Error: copy_from_user failed\n", __func__);
|
||||
rc = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
restore_tls(get_tls_ctx(rctx));
|
||||
|
||||
out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -9,8 +9,6 @@
|
||||
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
||||
unsigned long *rpap, unsigned long *pgsizep);
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
||||
|
||||
#define PFN_WRITE_COMBINED PTE_ATTRINDX(MT_NORMAL_NC)
|
||||
static inline bool pte_is_write_combined(pte_t pte)
|
||||
{
|
||||
@ -31,9 +29,8 @@ static inline bool pte_is_write_combined(pte_t pte)
|
||||
#endif
|
||||
return ((pte_val(pte) & PTE_ATTRINDX_MASK) == PFN_WRITE_COMBINED);
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
||||
|
||||
#define ARMV8_IDX_COUNTER0 1
|
||||
#define ARMV8_IDX_COUNTER0 0
|
||||
#define ARCH_PERF_COUNTER_START ARMV8_IDX_COUNTER0
|
||||
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
|
||||
|
||||
@ -2,9 +2,13 @@
|
||||
#include <linux/version.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/vsyscall.h>
|
||||
#include <asm/vgtod.h>
|
||||
#include "config.h"
|
||||
#include "../../mcctrl.h"
|
||||
|
||||
#define gtod (&VVAR(vsyscall_gtod_data))
|
||||
|
||||
//#define SC_DEBUG
|
||||
|
||||
#ifdef SC_DEBUG
|
||||
@ -54,7 +58,6 @@ int arch_symbols_init(void)
|
||||
}
|
||||
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 2
|
||||
struct vdso {
|
||||
long busy;
|
||||
@ -70,8 +73,8 @@ struct vdso {
|
||||
long hpet_phys;
|
||||
void *pvti_virt;
|
||||
long pvti_phys;
|
||||
void *vgtod_virt;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
unsigned long
|
||||
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
|
||||
@ -207,6 +210,7 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
|
||||
#endif
|
||||
}
|
||||
|
||||
vdso->vgtod_virt = (void *)gtod;
|
||||
out:
|
||||
wmb();
|
||||
vdso->busy = 0;
|
||||
@ -257,25 +261,35 @@ struct trans_uctx {
|
||||
};
|
||||
|
||||
void
|
||||
restore_fs(unsigned long fs)
|
||||
restore_tls(unsigned long addr)
|
||||
{
|
||||
wrmsrl(MSR_FS_BASE, fs);
|
||||
wrmsrl(MSR_FS_BASE, addr);
|
||||
}
|
||||
|
||||
void
|
||||
save_fs_ctx(void *ctx)
|
||||
save_tls_ctx(void __user *ctx)
|
||||
{
|
||||
struct trans_uctx *tctx = ctx;
|
||||
struct trans_uctx __user *tctx = ctx;
|
||||
struct trans_uctx kctx;
|
||||
|
||||
rdmsrl(MSR_FS_BASE, tctx->fs);
|
||||
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||
return;
|
||||
}
|
||||
rdmsrl(MSR_FS_BASE, kctx.fs);
|
||||
}
|
||||
|
||||
unsigned long
|
||||
get_fs_ctx(void *ctx)
|
||||
get_tls_ctx(void __user *ctx)
|
||||
{
|
||||
struct trans_uctx *tctx = ctx;
|
||||
struct trans_uctx __user *tctx = ctx;
|
||||
struct trans_uctx kctx;
|
||||
|
||||
return tctx->fs;
|
||||
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
|
||||
pr_err("%s: copy_from_user failed.\n", __func__);
|
||||
return 0;
|
||||
}
|
||||
return kctx.fs;
|
||||
}
|
||||
|
||||
unsigned long
|
||||
@ -356,11 +370,17 @@ out:
|
||||
return error;
|
||||
}
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
||||
#define PFN_WRITE_COMBINED _PAGE_PWT
|
||||
static inline bool pte_is_write_combined(pte_t pte)
|
||||
{
|
||||
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
||||
|
||||
/*
|
||||
* The assembler switch_ctx is save/load registers in the context.
|
||||
* Do TLS save/load and register host_thread with ioctl.
|
||||
*/
|
||||
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -9,14 +9,12 @@
|
||||
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
|
||||
unsigned long *rpap, unsigned long *pgsizep);
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
||||
#define PFN_WRITE_COMBINED _PAGE_PWT
|
||||
|
||||
static inline bool pte_is_write_combined(pte_t pte)
|
||||
{
|
||||
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
||||
|
||||
#define ARCH_PERF_COUNTER_START 0
|
||||
|
||||
|
||||
@ -44,7 +44,6 @@
|
||||
#include <config.h>
|
||||
#include "mcctrl.h"
|
||||
#include <ihk/ihk_host_user.h>
|
||||
#include <ihklib_rusage.h>
|
||||
#include <rusage.h>
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
|
||||
#include <uapi/linux/sched/types.h>
|
||||
@ -87,8 +86,19 @@ int syscall_backward(struct mcctrl_usrdata *, int, unsigned long, unsigned long,
|
||||
unsigned long, unsigned long, unsigned long,
|
||||
unsigned long, unsigned long *);
|
||||
|
||||
struct mcos_handler_info {
|
||||
int pid;
|
||||
int cpu;
|
||||
struct mcctrl_usrdata *ud;
|
||||
struct file *file;
|
||||
unsigned long user_start;
|
||||
unsigned long user_end;
|
||||
unsigned long prepare_thread;
|
||||
};
|
||||
|
||||
static long mcexec_prepare_image(ihk_os_t os,
|
||||
struct program_load_desc * __user udesc)
|
||||
struct program_load_desc * __user udesc,
|
||||
struct file *file)
|
||||
{
|
||||
struct program_load_desc *desc = NULL;
|
||||
struct program_load_desc *pdesc = NULL;
|
||||
@ -100,6 +110,7 @@ static long mcexec_prepare_image(ihk_os_t os,
|
||||
struct mcctrl_per_proc_data *ppd = NULL;
|
||||
int num_sections;
|
||||
int free_ikc_pointers = 1;
|
||||
struct mcos_handler_info *info;
|
||||
|
||||
if (!usrdata) {
|
||||
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
|
||||
@ -122,6 +133,14 @@ static long mcexec_prepare_image(ihk_os_t os,
|
||||
goto free_out;
|
||||
}
|
||||
|
||||
info = ihk_os_get_mcos_private_data(file);
|
||||
if (!info) {
|
||||
ret = -EFAULT;
|
||||
goto free_out;
|
||||
}
|
||||
/* To serialize SCD_MSG_SCHEDULE_PROCESS and SCD_MSG_CLEANUP_PROCESS */
|
||||
info->cpu = desc->cpu;
|
||||
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, desc->pid);
|
||||
if (!ppd) {
|
||||
printk("%s: ERROR: no per process data for PID %d\n",
|
||||
@ -193,6 +212,11 @@ static long mcexec_prepare_image(ihk_os_t os,
|
||||
/* either send or remote prepare_process failed */
|
||||
goto put_and_free_out;
|
||||
}
|
||||
/*
|
||||
* Used as SCD_MSG_CLEANUP_PROCESS target which isn't scheduled
|
||||
* with SCD_MSG_SCHEDULE_PROCESS
|
||||
*/
|
||||
info->prepare_thread = pdesc->rprocess;
|
||||
|
||||
/* Update rpgtable */
|
||||
ppd->rpgtable = pdesc->rpgtable;
|
||||
@ -307,30 +331,10 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
|
||||
#endif
|
||||
}
|
||||
|
||||
struct mcos_handler_info {
|
||||
int pid;
|
||||
int cpu;
|
||||
struct mcctrl_usrdata *ud;
|
||||
struct file *file;
|
||||
unsigned long user_start;
|
||||
unsigned long user_end;
|
||||
};
|
||||
|
||||
struct mcos_handler_info;
|
||||
static LIST_HEAD(host_threads); /* Used for FS switch */
|
||||
DEFINE_RWLOCK(host_thread_lock);
|
||||
|
||||
/* Info of Linux counterpart of migrated-to-Linux thread */
|
||||
struct host_thread {
|
||||
struct list_head list;
|
||||
struct mcos_handler_info *handler;
|
||||
int pid;
|
||||
int tid;
|
||||
unsigned long usp;
|
||||
unsigned long lfs;
|
||||
unsigned long rfs;
|
||||
};
|
||||
|
||||
struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
|
||||
{
|
||||
struct mcos_handler_info *info;
|
||||
@ -391,6 +395,7 @@ static void release_handler(ihk_os_t os, void *param)
|
||||
memset(&isp, '\0', sizeof isp);
|
||||
isp.msg = SCD_MSG_CLEANUP_PROCESS;
|
||||
isp.pid = info->pid;
|
||||
isp.arg = info->prepare_thread;
|
||||
|
||||
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
|
||||
__FUNCTION__, info, info->cpu);
|
||||
@ -426,6 +431,7 @@ static long mcexec_start_image(ihk_os_t os,
|
||||
struct mcctrl_channel *c;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
struct mcos_handler_info *info;
|
||||
struct mcos_handler_info *prev_info;
|
||||
int ret = 0;
|
||||
|
||||
if (!usrdata) {
|
||||
@ -446,6 +452,7 @@ static long mcexec_start_image(ihk_os_t os,
|
||||
goto out;
|
||||
}
|
||||
|
||||
prev_info = ihk_os_get_mcos_private_data(file);
|
||||
info = new_mcos_handler_info(os, file);
|
||||
if (info == NULL) {
|
||||
ret = -ENOMEM;
|
||||
@ -456,6 +463,7 @@ static long mcexec_start_image(ihk_os_t os,
|
||||
info->cpu = desc->cpu;
|
||||
info->user_start = desc->user_start;
|
||||
info->user_end = desc->user_end;
|
||||
info->prepare_thread = prev_info->prepare_thread;
|
||||
ihk_os_register_release_handler(file, release_handler, info);
|
||||
ihk_os_set_mcos_private_data(file, info);
|
||||
|
||||
@ -472,8 +480,10 @@ static long mcexec_start_image(ihk_os_t os,
|
||||
ret = mcctrl_ikc_send(os, desc->cpu, &isp);
|
||||
if (ret < 0) {
|
||||
printk("%s: error: sending IKC msg\n", __FUNCTION__);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* clear prepared thread struct */
|
||||
info->prepare_thread = 0;
|
||||
out:
|
||||
kfree(desc);
|
||||
return ret;
|
||||
@ -579,11 +589,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
struct mcctrl_part_exec *pe;
|
||||
struct get_cpu_set_arg req;
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu_top, *cpu_top_i;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cache_topology *cache_top;
|
||||
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
|
||||
int ret = 0;
|
||||
@ -1199,7 +1205,7 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
|
||||
ppd = mcctrl_get_per_proc_data(ud, pid);
|
||||
|
||||
if (unlikely(!ppd)) {
|
||||
kprintf("%s: ERROR: no per-process structure for PID %d, "
|
||||
dprintk("%s: ERROR: no per-process structure for PID %d, "
|
||||
"syscall nr: %lu\n",
|
||||
__FUNCTION__, pid, packet->req.number);
|
||||
|
||||
@ -1414,7 +1420,7 @@ retry_alloc:
|
||||
__FUNCTION__, task_pid_vnr(current), packet->ref);
|
||||
|
||||
mb();
|
||||
if (!packet->req.valid) {
|
||||
if (!smp_load_acquire(&packet->req.valid)) {
|
||||
printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
|
||||
__FUNCTION__,
|
||||
task_tgid_vnr(current),
|
||||
@ -1424,7 +1430,7 @@ retry_alloc:
|
||||
goto retry;
|
||||
}
|
||||
|
||||
packet->req.valid = 0; /* ack */
|
||||
smp_store_release(&packet->req.valid, 0); /* ack */
|
||||
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
|
||||
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
|
||||
__FUNCTION__,
|
||||
@ -2376,7 +2382,7 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
{
|
||||
struct mcctrl_ioctl_getrusage_desc desc;
|
||||
struct rusage_global *rusage_global = ihk_os_get_rusage(ihk_os);
|
||||
struct mckernel_rusage *rusage = NULL;
|
||||
struct ihk_os_rusage *rusage = NULL;
|
||||
int ret = 0;
|
||||
int i;
|
||||
unsigned long ut;
|
||||
@ -2388,13 +2394,13 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
goto out;
|
||||
}
|
||||
|
||||
rusage = kmalloc(sizeof(struct mckernel_rusage), GFP_KERNEL);
|
||||
rusage = kmalloc(sizeof(struct ihk_os_rusage), GFP_KERNEL);
|
||||
if (!rusage) {
|
||||
printk("%s: kmalloc failed\n", __FUNCTION__);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
memset(rusage, 0, sizeof(struct mckernel_rusage));
|
||||
memset(rusage, 0, sizeof(struct ihk_os_rusage));
|
||||
|
||||
/* Compile statistics */
|
||||
for (i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
|
||||
@ -2415,15 +2421,17 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
st += rusage_global->cpu[i].system_tsc * rusage_global->ns_per_tsc / 1000;
|
||||
rusage->cpuacct_usage_percpu[i] = wt;
|
||||
}
|
||||
rusage->cpuacct_stat_system = st / 10000000;
|
||||
rusage->cpuacct_stat_user = ut / 10000000;
|
||||
rusage->cpuacct_stat_system = (st + 10000000 - 1) / 10000000;
|
||||
rusage->cpuacct_stat_user = (ut + 10000000 - 1) / 10000000;
|
||||
rusage->cpuacct_usage = ut;
|
||||
|
||||
rusage->num_threads = rusage_global->num_threads;
|
||||
rusage->max_num_threads = rusage_global->max_num_threads;
|
||||
|
||||
if (desc.size_rusage > sizeof(struct mckernel_rusage)) {
|
||||
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n", __FUNCTION__, desc.size_rusage, sizeof(struct mckernel_rusage));
|
||||
if (desc.size_rusage > sizeof(struct ihk_os_rusage)) {
|
||||
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n",
|
||||
__func__, desc.size_rusage,
|
||||
sizeof(struct ihk_os_rusage));
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
@ -2444,10 +2452,10 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
|
||||
|
||||
extern void *get_user_sp(void);
|
||||
extern void set_user_sp(unsigned long);
|
||||
extern void restore_fs(unsigned long fs);
|
||||
extern void save_fs_ctx(void *);
|
||||
extern unsigned long get_fs_ctx(void *);
|
||||
extern unsigned long get_rsp_ctx(void *);
|
||||
extern void restore_tls(unsigned long addr);
|
||||
extern void save_tls_ctx(void __user *ctx);
|
||||
extern unsigned long get_tls_ctx(void __user *ctx);
|
||||
extern unsigned long get_rsp_ctx(void *ctx);
|
||||
|
||||
long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
|
||||
{
|
||||
@ -2491,14 +2499,15 @@ long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, struct file *file)
|
||||
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *udesc,
|
||||
struct file *file)
|
||||
{
|
||||
int rc = 0;
|
||||
void *usp = get_user_sp();
|
||||
struct mcos_handler_info *info;
|
||||
struct host_thread *thread;
|
||||
unsigned long flags;
|
||||
struct uti_save_fs_desc desc;
|
||||
struct uti_switch_ctx_desc desc;
|
||||
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
|
||||
struct mcctrl_per_proc_data *ppd;
|
||||
|
||||
@ -2508,21 +2517,26 @@ long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, stru
|
||||
goto out;
|
||||
}
|
||||
|
||||
if(copy_from_user(&desc, udesc, sizeof(struct uti_save_fs_desc))) {
|
||||
if (copy_from_user(&desc, udesc, sizeof(struct uti_switch_ctx_desc))) {
|
||||
printk("%s: Error: copy_from_user failed\n", __FUNCTION__);
|
||||
rc = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
save_fs_ctx(desc.lctx);
|
||||
rc = arch_switch_ctx(&desc);
|
||||
if (rc < 0) {
|
||||
goto out;
|
||||
}
|
||||
|
||||
save_tls_ctx(desc.lctx);
|
||||
info = ihk_os_get_mcos_private_data(file);
|
||||
thread = kmalloc(sizeof(struct host_thread), GFP_KERNEL);
|
||||
memset(thread, '\0', sizeof(struct host_thread));
|
||||
thread->pid = task_tgid_vnr(current);
|
||||
thread->tid = task_pid_vnr(current);
|
||||
thread->usp = (unsigned long)usp;
|
||||
thread->lfs = get_fs_ctx(desc.lctx);
|
||||
thread->rfs = get_fs_ctx(desc.rctx);
|
||||
thread->ltls = get_tls_ctx(desc.lctx);
|
||||
thread->rtls = get_tls_ctx(desc.rctx);
|
||||
thread->handler = info;
|
||||
|
||||
write_lock_irqsave(&host_thread_lock, flags);
|
||||
@ -2568,9 +2582,9 @@ mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file)
|
||||
read_unlock_irqrestore(&host_thread_lock, flags);
|
||||
if (thread) {
|
||||
if (arg)
|
||||
restore_fs(thread->lfs);
|
||||
restore_tls(thread->ltls);
|
||||
else
|
||||
restore_fs(thread->rfs);
|
||||
restore_tls(thread->rtls);
|
||||
goto out;
|
||||
}
|
||||
ret = -EINVAL;
|
||||
@ -2774,8 +2788,7 @@ long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* debug */
|
||||
if (0 && param.number == __NR_futex) {
|
||||
if (param.number == __NR_futex) {
|
||||
struct uti_futex_resp resp = {
|
||||
.done = 0
|
||||
};
|
||||
@ -2971,13 +2984,8 @@ mcexec_uti_attr(ihk_os_t os, struct uti_attr_desc __user *_desc)
|
||||
cpumask_t *cpuset = NULL, *env_cpuset = NULL;
|
||||
struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
|
||||
ihk_device_t dev = ihk_os_to_dev(os);
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu_topo;
|
||||
struct mcctrl_cpu_topology *target_cpu = NULL;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu_topo;
|
||||
struct cpu_topology *target_cpu = NULL;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct node_topology *node_topo;
|
||||
struct ihk_cache_topology *lcache_topo;
|
||||
struct ihk_node_topology *lnode_topo;
|
||||
@ -3200,13 +3208,51 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static int __mcctrl_control_perm(unsigned int request)
|
||||
{
|
||||
int ret = 0;
|
||||
kuid_t euid;
|
||||
|
||||
/* black list */
|
||||
switch (request) {
|
||||
case IHK_OS_AUX_PERF_NUM:
|
||||
case IHK_OS_AUX_PERF_SET:
|
||||
case IHK_OS_AUX_PERF_GET:
|
||||
case IHK_OS_AUX_PERF_ENABLE:
|
||||
case IHK_OS_AUX_PERF_DISABLE:
|
||||
case IHK_OS_AUX_PERF_DESTROY:
|
||||
euid = current_euid();
|
||||
pr_debug("%s: request=0x%x, euid=%u\n",
|
||||
__func__, request, euid.val);
|
||||
if (euid.val) {
|
||||
ret = -EPERM;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
pr_debug("%s: request=0x%x, ret=%d\n", __func__, request, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
||||
struct file *file)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = __mcctrl_control_perm(req);
|
||||
if (ret) {
|
||||
pr_err("%s: error: permission denied, req: %x\n",
|
||||
__func__, req);
|
||||
return ret;
|
||||
}
|
||||
|
||||
switch (req) {
|
||||
case MCEXEC_UP_PREPARE_IMAGE:
|
||||
return mcexec_prepare_image(os,
|
||||
(struct program_load_desc *)arg);
|
||||
(struct program_load_desc *)arg,
|
||||
file);
|
||||
case MCEXEC_UP_TRANSFER:
|
||||
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
|
||||
|
||||
@ -3272,8 +3318,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
||||
case MCEXEC_UP_UTI_GET_CTX:
|
||||
return mcexec_uti_get_ctx(os, (struct uti_get_ctx_desc *)arg);
|
||||
|
||||
case MCEXEC_UP_UTI_SAVE_FS:
|
||||
return mcexec_uti_save_fs(os, (struct uti_save_fs_desc *)arg, file);
|
||||
case MCEXEC_UP_UTI_SWITCH_CTX:
|
||||
return mcctrl_switch_ctx(os, (struct uti_switch_ctx_desc *)arg,
|
||||
file);
|
||||
|
||||
case MCEXEC_UP_SIG_THREAD:
|
||||
return mcexec_sig_thread(os, arg, file);
|
||||
@ -3320,14 +3367,6 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* Per-CPU register manipulation functions */
|
||||
struct mcctrl_os_cpu_response {
|
||||
int done;
|
||||
unsigned long val;
|
||||
int err;
|
||||
wait_queue_head_t wq;
|
||||
};
|
||||
|
||||
int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
|
||||
{
|
||||
struct mcctrl_usrdata *usrdata;
|
||||
@ -3379,7 +3418,8 @@ int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
|
||||
*ret_cpu = ch->send.queue->read_cpu;
|
||||
ret = 0;
|
||||
|
||||
printk("%s: OS: %p, CPU: %d\n", __FUNCTION__, os, *ret_cpu);
|
||||
pr_info("%s: OS: %lx, CPU: %d\n",
|
||||
__func__, (unsigned long)os, *ret_cpu);
|
||||
|
||||
out_put_ppd:
|
||||
mcctrl_put_per_thread_data(ptd);
|
||||
@ -3390,73 +3430,67 @@ out_put_ppd:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void mcctrl_os_read_write_cpu_response(ihk_os_t os,
|
||||
struct ikc_scd_packet *pisp)
|
||||
{
|
||||
struct mcctrl_os_cpu_response *resp;
|
||||
|
||||
/* XXX: What if caller thread is unblocked by a signal
|
||||
* before this message arrives? */
|
||||
resp = pisp->resp;
|
||||
if (!resp) {
|
||||
return;
|
||||
}
|
||||
|
||||
resp->val = pisp->desc.val;
|
||||
resp->done = 1;
|
||||
resp->err = pisp->err;
|
||||
wake_up_interruptible(&resp->wq);
|
||||
}
|
||||
|
||||
int __mcctrl_os_read_write_cpu_register(ihk_os_t os, int cpu,
|
||||
struct ihk_os_cpu_register *desc,
|
||||
enum mcctrl_os_cpu_operation op)
|
||||
{
|
||||
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
|
||||
struct ikc_scd_packet isp;
|
||||
struct mcctrl_os_cpu_response resp;
|
||||
struct ihk_os_cpu_register *ldesc = NULL;
|
||||
int do_free = 0;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!udp) {
|
||||
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (cpu < 0 || cpu >= udp->cpu_info->n_cpus) {
|
||||
pr_err("%s: error: cpu (%d) is out of range\n",
|
||||
__func__, cpu);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
|
||||
}
|
||||
|
||||
/* Keep a dynamic structure around that can
|
||||
* survive an early return due to a signal */
|
||||
ldesc = kmalloc(sizeof(*ldesc), GFP_KERNEL);
|
||||
if (!ldesc) {
|
||||
printk("%s: ERROR: allocating cpu register desc\n", __FUNCTION__);
|
||||
return -ENOMEM;
|
||||
}
|
||||
*ldesc = *desc;
|
||||
|
||||
memset(&isp, '\0', sizeof(struct ikc_scd_packet));
|
||||
isp.msg = SCD_MSG_CPU_RW_REG;
|
||||
isp.op = op;
|
||||
isp.desc = *desc;
|
||||
isp.resp = &resp;
|
||||
isp.pdesc = virt_to_phys(ldesc);
|
||||
|
||||
resp.done = 0;
|
||||
resp.err = 0;
|
||||
init_waitqueue_head(&resp.wq);
|
||||
|
||||
mb();
|
||||
ret = mcctrl_ikc_send(os, cpu, &isp);
|
||||
if (ret < 0) {
|
||||
ret = mcctrl_ikc_send_wait(os, cpu, &isp, 0, NULL, &do_free, 1, ldesc);
|
||||
if (ret != 0) {
|
||||
printk("%s: ERROR sending IKC msg: %d\n", __FUNCTION__, ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Wait for response */
|
||||
ret = wait_event_interruptible(resp.wq, resp.done);
|
||||
if (ret < 0) {
|
||||
printk("%s: ERROR after wait: %d\n", __FUNCTION__, ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = resp.err;
|
||||
if (ret != 0) {
|
||||
printk("%s: ERROR receive: %d\n", __FUNCTION__, resp.err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Update if read */
|
||||
if (ret == 0 && op == MCCTRL_OS_CPU_READ_REGISTER) {
|
||||
desc->val = resp.val;
|
||||
if (op == MCCTRL_OS_CPU_READ_REGISTER) {
|
||||
desc->val = ldesc->val;
|
||||
}
|
||||
|
||||
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
|
||||
/* Notify caller (for future async implementation) */
|
||||
atomic_set(&desc->sync, 1);
|
||||
|
||||
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: CPU: %d, addr_ext: 0x%lx, val: 0x%lx\n",
|
||||
__FUNCTION__,
|
||||
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
|
||||
desc->addr, desc->val);
|
||||
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"), cpu,
|
||||
desc->addr_ext, desc->val);
|
||||
|
||||
out:
|
||||
if (do_free) {
|
||||
kfree(ldesc);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@ -21,8 +21,10 @@
|
||||
* 2013/08/19 shirasawa mcexec forward signal to MIC process
|
||||
*/
|
||||
|
||||
#include <linux/sched.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/miscdevice.h>
|
||||
#include <linux/slab.h>
|
||||
@ -80,11 +82,13 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
|
||||
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
|
||||
#ifdef MCEXEC_BIND_MOUNT
|
||||
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
|
||||
#endif // MCEXEC_BIND_MOUNT
|
||||
{ .request = MCEXEC_UP_UTI_GET_CTX, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_UTI_SAVE_FS, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_UTI_SWITCH_CTX, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
|
||||
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
|
||||
@ -170,14 +174,6 @@ error_cleanup_channels:
|
||||
int mcctrl_os_shutdown_notifier(int os_index)
|
||||
{
|
||||
if (os[os_index]) {
|
||||
/* Wait for os running */
|
||||
if (ihk_os_wait_for_status(os[os_index], IHK_OS_STATUS_RUNNING, 0, 200) != 0) {
|
||||
printk("IHK: OS does not become RUNNING in shutdown. Force shutdown.\n");
|
||||
/* send nmi to force shutdown */
|
||||
ihk_os_send_nmi(os[os_index], 3);
|
||||
mdelay(200);
|
||||
}
|
||||
|
||||
pager_cleanup();
|
||||
sysfsm_cleanup(os[os_index]);
|
||||
free_topology_info(os[os_index]);
|
||||
|
||||
@ -208,6 +208,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
case SCD_MSG_PERF_ACK:
|
||||
case SCD_MSG_SEND_SIGNAL_ACK:
|
||||
case SCD_MSG_PROCFS_ANSWER:
|
||||
case SCD_MSG_REMOTE_PAGE_FAULT_ANSWER:
|
||||
case SCD_MSG_CPU_RW_REG_RESP:
|
||||
mcctrl_wakeup_cb(__os, pisp);
|
||||
break;
|
||||
|
||||
@ -238,10 +240,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
|
||||
get_vdso_info(__os, pisp->arg);
|
||||
break;
|
||||
|
||||
case SCD_MSG_CPU_RW_REG_RESP:
|
||||
mcctrl_os_read_write_cpu_response(__os, pisp);
|
||||
break;
|
||||
|
||||
case SCD_MSG_EVENTFD:
|
||||
dkprintf("%s: SCD_MSG_EVENTFD,pisp->eventfd_type=%d\n", __FUNCTION__, pisp->eventfd_type);
|
||||
mcctrl_eventfd(__os, pisp);
|
||||
@ -464,7 +462,7 @@ int prepare_ikc_channels(ihk_os_t os)
|
||||
int i;
|
||||
int ret = 0;
|
||||
|
||||
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
|
||||
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_ATOMIC);
|
||||
if (!usrdata) {
|
||||
printk("%s: error: allocating mcctrl_usrdata\n", __FUNCTION__);
|
||||
ret = -ENOMEM;
|
||||
@ -490,7 +488,7 @@ int prepare_ikc_channels(ihk_os_t os)
|
||||
usrdata->num_channels = usrdata->cpu_info->n_cpus;
|
||||
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
|
||||
usrdata->num_channels,
|
||||
GFP_KERNEL);
|
||||
GFP_ATOMIC);
|
||||
|
||||
if (!usrdata->channels) {
|
||||
printk("Error: cannot allocate channels.\n");
|
||||
@ -499,7 +497,7 @@ int prepare_ikc_channels(ihk_os_t os)
|
||||
}
|
||||
|
||||
usrdata->ikc2linux = kzalloc(sizeof(struct ihk_ikc_channel_desc *) *
|
||||
nr_cpu_ids, GFP_KERNEL);
|
||||
nr_cpu_ids, GFP_ATOMIC);
|
||||
|
||||
if (!usrdata->ikc2linux) {
|
||||
printk("Error: cannot allocate ikc2linux channels.\n");
|
||||
|
||||
@ -69,6 +69,9 @@
|
||||
#define SCD_MSG_PROCFS_ANSWER 0x13
|
||||
#define SCD_MSG_PROCFS_RELEASE 0x15
|
||||
|
||||
#define SCD_MSG_REMOTE_PAGE_FAULT 0x18
|
||||
#define SCD_MSG_REMOTE_PAGE_FAULT_ANSWER 0x19
|
||||
|
||||
#define SCD_MSG_DEBUG_LOG 0x20
|
||||
|
||||
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
|
||||
@ -121,12 +124,6 @@ enum mcctrl_os_cpu_operation {
|
||||
MCCTRL_OS_CPU_MAX_OP
|
||||
};
|
||||
|
||||
/* Used to wake-up a Linux thread futex_wait()-ing */
|
||||
struct uti_futex_resp {
|
||||
int done;
|
||||
wait_queue_head_t wq;
|
||||
};
|
||||
|
||||
struct ikc_scd_packet {
|
||||
struct ihk_ikc_packet_header header;
|
||||
int msg;
|
||||
@ -157,7 +154,7 @@ struct ikc_scd_packet {
|
||||
|
||||
/* SCD_MSG_CPU_RW_REG */
|
||||
struct {
|
||||
struct ihk_os_cpu_register desc;
|
||||
unsigned long pdesc; /* Physical addr of the descriptor */
|
||||
enum mcctrl_os_cpu_operation op;
|
||||
void *resp;
|
||||
};
|
||||
@ -172,6 +169,14 @@ struct ikc_scd_packet {
|
||||
void *resp;
|
||||
int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */
|
||||
} futex;
|
||||
|
||||
/* SCD_MSG_REMOTE_PAGE_FAULT */
|
||||
struct {
|
||||
int target_cpu;
|
||||
int fault_tid;
|
||||
unsigned long fault_address;
|
||||
unsigned long fault_reason;
|
||||
};
|
||||
};
|
||||
/* char padding[8]; */ /* We want the size to be 128 bytes */
|
||||
};
|
||||
@ -289,11 +294,7 @@ struct cache_topology {
|
||||
struct list_head chain;
|
||||
};
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology {
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology {
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
//struct mcctrl_usrdata *udp;
|
||||
struct ihk_cpu_topology *saved;
|
||||
int mckernel_cpu_id;
|
||||
@ -448,40 +449,8 @@ void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
|
||||
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data);
|
||||
void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd);
|
||||
void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* ptd);
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
|
||||
static inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
|
||||
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
|
||||
unsigned long flags;
|
||||
|
||||
/* Check if data for this thread exists */
|
||||
write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
|
||||
|
||||
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
|
||||
if (ptd_iter->task == task) {
|
||||
ptd = ptd_iter;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ptd) {
|
||||
if (atomic_read(&ptd->refcount) <= 0) {
|
||||
printk("%s: ERROR: use-after-free detected (%d)", __FUNCTION__, atomic_read(&ptd->refcount));
|
||||
ptd = NULL;
|
||||
goto out;
|
||||
}
|
||||
atomic_inc(&ptd->refcount);
|
||||
}
|
||||
|
||||
out:
|
||||
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
|
||||
return ptd;
|
||||
}
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_56 */
|
||||
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task);
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_56 */
|
||||
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
||||
struct task_struct *task);
|
||||
int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len);
|
||||
|
||||
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
|
||||
@ -526,24 +495,6 @@ void reply_get_cpu_mapping(long req_pa);
|
||||
void free_topology_info(ihk_os_t os);
|
||||
|
||||
/* archdep.c */
|
||||
#ifndef POSTK_DEBUG_ARCH_DEP_52
|
||||
#define VDSO_MAXPAGES 2
|
||||
struct vdso {
|
||||
long busy;
|
||||
int vdso_npages;
|
||||
char vvar_is_global;
|
||||
char hpet_is_global;
|
||||
char pvti_is_global;
|
||||
char padding;
|
||||
long vdso_physlist[VDSO_MAXPAGES];
|
||||
void *vvar_virt;
|
||||
long vvar_phys;
|
||||
void *hpet_virt;
|
||||
long hpet_phys;
|
||||
void *pvti_virt;
|
||||
long pvti_phys;
|
||||
};
|
||||
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
|
||||
|
||||
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
|
||||
unsigned long *endp);
|
||||
@ -573,8 +524,28 @@ struct ihk_perf_event_attr{
|
||||
};
|
||||
|
||||
struct mcctrl_ioctl_getrusage_desc {
|
||||
void* rusage;
|
||||
struct ihk_os_rusage *rusage;
|
||||
size_t size_rusage;
|
||||
};
|
||||
|
||||
/* uti */
|
||||
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *desc,
|
||||
struct file *file);
|
||||
long arch_switch_ctx(struct uti_switch_ctx_desc *desc);
|
||||
|
||||
struct host_thread {
|
||||
struct list_head list;
|
||||
struct mcos_handler_info *handler;
|
||||
int pid;
|
||||
int tid;
|
||||
unsigned long usp;
|
||||
unsigned long ltls;
|
||||
unsigned long rtls;
|
||||
};
|
||||
|
||||
/* Used to wake-up a Linux thread futex_wait()-ing */
|
||||
struct uti_futex_resp {
|
||||
int done;
|
||||
wait_queue_head_t wq;
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -1110,10 +1110,10 @@ static const struct procfs_entry pid_entry_stuff[] = {
|
||||
// PROC_LNK("exe", mckernel_readlink),
|
||||
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
|
||||
PROC_REG("maps", 0444, &mckernel_buff_io),
|
||||
PROC_REG("mem", 0400, NULL),
|
||||
PROC_REG("mem", 0600, NULL),
|
||||
PROC_REG("pagemap", 0444, NULL),
|
||||
// PROC_REG("smaps", S_IRUGO, NULL),
|
||||
// PROC_REG("stat", 0444, &mckernel_buff_io),
|
||||
PROC_REG("stat", 0444, &mckernel_buff_io),
|
||||
// PROC_REG("statm", S_IRUGO, NULL),
|
||||
PROC_REG("status", 0444, &mckernel_buff_io),
|
||||
// PROC_REG("syscall", S_IRUGO, NULL),
|
||||
|
||||
@ -43,6 +43,8 @@
|
||||
#include <linux/semaphore.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/kdev_t.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/delay.h>
|
||||
#include <asm/io.h>
|
||||
@ -178,8 +180,8 @@ int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data)
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifndef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
|
||||
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task)
|
||||
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
|
||||
struct task_struct *task)
|
||||
{
|
||||
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
|
||||
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
|
||||
@ -208,7 +210,6 @@ struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc
|
||||
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
|
||||
return ptd;
|
||||
}
|
||||
#endif /* !POSTK_DEBUG_ARCH_DEP_56 */
|
||||
|
||||
static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
|
||||
struct syscall_response *res)
|
||||
@ -227,19 +228,19 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
|
||||
c = (usrdata->channels + packet->ref)->c;
|
||||
|
||||
/* If spinning, no need for IKC message */
|
||||
if (__sync_bool_compare_and_swap(&res->req_thread_status,
|
||||
if (cmpxchg(&res->req_thread_status,
|
||||
IHK_SCD_REQ_THREAD_SPINNING,
|
||||
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
|
||||
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) ==
|
||||
IHK_SCD_REQ_THREAD_SPINNING) {
|
||||
dprintk("%s: no need to send IKC message for PID %d\n",
|
||||
__FUNCTION__, packet->pid);
|
||||
__FUNCTION__, packet->pid);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or
|
||||
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing.
|
||||
Note that mcexec_terminate_thread() and remote page fault and
|
||||
returning EINTR would compete. */
|
||||
if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
|
||||
Note that mcexec_terminate_thread() and returning EINTR would compete. */
|
||||
if (smp_load_acquire(&res->req_thread_status) == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
|
||||
printk("%s: INFO: someone else is waking up the McKernel thread, "
|
||||
"pid: %d, req status: %lu, syscall nr: %lu\n",
|
||||
__FUNCTION__, packet->pid,
|
||||
@ -247,9 +248,10 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
|
||||
}
|
||||
|
||||
/* The thread is not spinning any more, make sure it's descheduled */
|
||||
if (!__sync_bool_compare_and_swap(&res->req_thread_status,
|
||||
if (cmpxchg(&res->req_thread_status,
|
||||
IHK_SCD_REQ_THREAD_DESCHEDULED,
|
||||
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
|
||||
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) !=
|
||||
IHK_SCD_REQ_THREAD_DESCHEDULED) {
|
||||
printk("%s: WARNING: inconsistent requester status, "
|
||||
"pid: %d, req status: %lu, syscall nr: %lu\n",
|
||||
__FUNCTION__, packet->pid,
|
||||
@ -273,7 +275,7 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
||||
unsigned long *ret)
|
||||
{
|
||||
struct ikc_scd_packet *packet;
|
||||
struct ikc_scd_packet *free_packet = NULL;
|
||||
struct ikc_scd_packet *free_packet = NULL;
|
||||
struct syscall_request *req;
|
||||
struct syscall_response *resp;
|
||||
unsigned long syscall_ret;
|
||||
@ -282,15 +284,16 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
||||
struct mcctrl_per_proc_data *ppd;
|
||||
struct mcctrl_per_thread_data *ptd;
|
||||
unsigned long phys;
|
||||
struct syscall_request _request[2];
|
||||
struct syscall_request *request;
|
||||
struct syscall_request *request = NULL;
|
||||
int retry;
|
||||
|
||||
if (((unsigned long)_request ^ (unsigned long)(_request + 1)) &
|
||||
~(PAGE_SIZE -1))
|
||||
request = _request + 1;
|
||||
else
|
||||
request = _request;
|
||||
request = kmalloc(sizeof(struct syscall_request), GFP_ATOMIC);
|
||||
if (!request) {
|
||||
printk("%s: ERROR: allocating request\n", __func__);
|
||||
syscall_ret = -ENOMEM;
|
||||
goto no_ppd;
|
||||
}
|
||||
|
||||
request->number = num;
|
||||
request->args[0] = arg1;
|
||||
request->args[1] = arg2;
|
||||
@ -305,8 +308,9 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
|
||||
|
||||
if (!ppd) {
|
||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
||||
__FUNCTION__, task_tgid_vnr(current));
|
||||
return -EINVAL;
|
||||
__func__, task_tgid_vnr(current));
|
||||
syscall_ret = -EINVAL;
|
||||
goto no_ppd;
|
||||
}
|
||||
|
||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
||||
@ -454,11 +458,13 @@ out:
|
||||
out_put_ppd:
|
||||
mcctrl_put_per_thread_data(ptd);
|
||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
||||
no_ptd:
|
||||
no_ptd:
|
||||
dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n",
|
||||
__FUNCTION__, task_pid_vnr(current), num, syscall_ret);
|
||||
|
||||
mcctrl_put_per_proc_data(ppd);
|
||||
no_ppd:
|
||||
kfree(request);
|
||||
return syscall_ret;
|
||||
}
|
||||
|
||||
@ -479,214 +485,43 @@ extern struct host_thread *host_threads;
|
||||
extern rwlock_t host_thread_lock;
|
||||
#endif
|
||||
|
||||
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
|
||||
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
|
||||
uint64_t reason, struct mcctrl_per_proc_data *ppd,
|
||||
struct ikc_scd_packet *packet)
|
||||
{
|
||||
struct ikc_scd_packet *packet;
|
||||
struct ikc_scd_packet *free_packet = NULL;
|
||||
struct syscall_request *req;
|
||||
struct syscall_response *resp;
|
||||
int error;
|
||||
struct wait_queue_head_list_node *wqhln;
|
||||
unsigned long irqflags;
|
||||
struct mcctrl_per_proc_data *ppd;
|
||||
struct mcctrl_per_thread_data *ptd;
|
||||
unsigned long phys;
|
||||
int retry;
|
||||
struct mcctrl_wakeup_desc *desc;
|
||||
int do_frees = 1;
|
||||
|
||||
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
|
||||
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);
|
||||
|
||||
/* Look up per-process structure */
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
|
||||
|
||||
if (!ppd) {
|
||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
||||
__FUNCTION__, task_tgid_vnr(current));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
||||
if (!ptd) {
|
||||
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
|
||||
error = -ENOENT;
|
||||
goto no_ptd;
|
||||
}
|
||||
pr_ptd("get", task_pid_vnr(current), ptd);
|
||||
packet = (struct ikc_scd_packet *)ptd->data;
|
||||
if (!packet) {
|
||||
printk("%s: no packet registered for TID %d\n",
|
||||
__FUNCTION__, task_pid_vnr(current));
|
||||
error = -ENOENT;
|
||||
goto out_put_ppd;
|
||||
}
|
||||
|
||||
req = &packet->req;
|
||||
|
||||
/* Map response structure */
|
||||
phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
|
||||
packet->resp_pa, sizeof(*resp));
|
||||
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
|
||||
phys, sizeof(*resp), NULL, 0);
|
||||
if (!resp) {
|
||||
printk("%s: ERROR: invalid response structure address\n",
|
||||
__FUNCTION__);
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
retry_alloc:
|
||||
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
|
||||
if (!wqhln) {
|
||||
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
|
||||
goto retry_alloc;
|
||||
}
|
||||
memset(wqhln, 0, sizeof(struct wait_queue_head_list_node));
|
||||
|
||||
/* Prepare per-thread wait queue head */
|
||||
wqhln->task = current;
|
||||
/* Save the TID explicitly, because mcexec_syscall(), where the request
|
||||
* will be matched, is in IRQ context and can't call task_pid_vnr() */
|
||||
wqhln->rtid = task_pid_vnr(current);
|
||||
wqhln->req = 0;
|
||||
init_waitqueue_head(&wqhln->wq_syscall);
|
||||
|
||||
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
|
||||
/* Add to exact list */
|
||||
list_add_tail(&wqhln->list, &ppd->wq_list_exact);
|
||||
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
|
||||
|
||||
/* Request page fault */
|
||||
resp->ret = -EFAULT;
|
||||
resp->fault_address = (unsigned long)fault_addr;
|
||||
resp->fault_reason = reason;
|
||||
resp->stid = task_pid_vnr(current);
|
||||
packet->msg = SCD_MSG_REMOTE_PAGE_FAULT;
|
||||
packet->fault_address = (unsigned long)fault_addr;
|
||||
packet->fault_reason = reason;
|
||||
|
||||
#define STATUS_PAGER_COMPLETED 1
|
||||
#define STATUS_PAGE_FAULT 3
|
||||
req->valid = 0;
|
||||
|
||||
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
|
||||
printk("%s: WARNING: failed to notify PID %d\n",
|
||||
__FUNCTION__, packet->pid);
|
||||
/* we need to alloc desc ourselves because GFP_ATOMIC */
|
||||
retry_alloc:
|
||||
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
|
||||
if (!desc) {
|
||||
pr_warn("WARNING: coudln't alloc remote page fault wait desc, retrying..\n");
|
||||
goto retry_alloc;
|
||||
}
|
||||
|
||||
mb();
|
||||
resp->status = STATUS_PAGE_FAULT;
|
||||
|
||||
retry = 0;
|
||||
for (;;) {
|
||||
dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n",
|
||||
__FUNCTION__, task_pid_vnr(current), fault_addr);
|
||||
/* wait for response */
|
||||
error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
|
||||
|
||||
/* Delay signal handling */
|
||||
if (error == -ERESTARTSYS) {
|
||||
printk("%s: INFO: interrupted by signal\n", __FUNCTION__);
|
||||
retry++;
|
||||
if (retry < 5) { /* mcexec is alive */
|
||||
printk("%s: INFO: retry=%d\n", __FUNCTION__, retry);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
/* Remove per-thread wait queue head */
|
||||
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
|
||||
list_del(&wqhln->list);
|
||||
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
|
||||
|
||||
dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n",
|
||||
__FUNCTION__, task_pid_vnr(current), fault_addr);
|
||||
|
||||
if (retry >= 5) {
|
||||
kfree(wqhln);
|
||||
kprintf("%s: INFO: mcexec is gone or retry count exceeded,pid=%d,retry=%d\n", __FUNCTION__, task_tgid_vnr(current), retry);
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (error) {
|
||||
kfree(wqhln);
|
||||
printk("remote_page_fault:interrupted. %d\n", error);
|
||||
goto out;
|
||||
}
|
||||
else {
|
||||
/* Update packet reference */
|
||||
packet = wqhln->packet;
|
||||
free_packet = packet;
|
||||
req = &packet->req;
|
||||
{
|
||||
unsigned long phys2;
|
||||
struct syscall_response *resp2;
|
||||
phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
|
||||
packet->resp_pa, sizeof(*resp));
|
||||
resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
|
||||
phys2, sizeof(*resp), NULL, 0);
|
||||
|
||||
if (resp != resp2) {
|
||||
resp = resp2;
|
||||
phys = phys2;
|
||||
printk("%s: updated new remote PA for resp\n", __FUNCTION__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!req->valid) {
|
||||
printk("remote_page_fault:not valid\n");
|
||||
}
|
||||
req->valid = 0;
|
||||
|
||||
/* check result */
|
||||
if (req->number != __NR_mmap) {
|
||||
printk("remote_page_fault:unexpected response. %lx %lx\n",
|
||||
req->number, req->args[0]);
|
||||
error = -EIO;
|
||||
goto out;
|
||||
}
|
||||
#define PAGER_REQ_RESUME 0x0101
|
||||
else if (req->args[0] != PAGER_REQ_RESUME) {
|
||||
resp->ret = pager_call(usrdata->os, (void *)req);
|
||||
|
||||
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
|
||||
printk("%s: WARNING: failed to notify PID %d\n",
|
||||
__FUNCTION__, packet->pid);
|
||||
}
|
||||
|
||||
mb();
|
||||
resp->status = STATUS_PAGER_COMPLETED;
|
||||
break;
|
||||
//continue;
|
||||
}
|
||||
else {
|
||||
error = req->args[1];
|
||||
if (error) {
|
||||
printk("remote_page_fault:response %d\n", error);
|
||||
kfree(wqhln);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
break;
|
||||
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
|
||||
error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet,
|
||||
0, desc, &do_frees, 0);
|
||||
if (do_frees)
|
||||
kfree(desc);
|
||||
if (error < 0) {
|
||||
pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n",
|
||||
__func__, packet->pid, error);
|
||||
}
|
||||
|
||||
kfree(wqhln);
|
||||
error = 0;
|
||||
out:
|
||||
/* Release remote page-fault response packet */
|
||||
if (free_packet) {
|
||||
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet);
|
||||
}
|
||||
|
||||
ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
|
||||
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));
|
||||
|
||||
out_put_ppd:
|
||||
mcctrl_put_per_thread_data(ptd);
|
||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
||||
no_ptd:
|
||||
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n",
|
||||
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason, error);
|
||||
|
||||
mcctrl_put_per_proc_data(ppd);
|
||||
__func__, task_pid_vnr(current), fault_addr,
|
||||
(unsigned long)reason, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -726,70 +561,70 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
#endif
|
||||
struct mcctrl_per_proc_data *ppd;
|
||||
struct mcctrl_per_thread_data *ptd;
|
||||
struct ikc_scd_packet *packet;
|
||||
struct task_struct *task = current;
|
||||
struct ikc_scd_packet packet = { };
|
||||
unsigned long rsysnum = 0;
|
||||
int ret = 0;
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %#lx page %p\n",
|
||||
vmf->flags, vmf->pgoff, vmf->address, vmf->page);
|
||||
unsigned long addr = vmf->address;
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
|
||||
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
|
||||
void __user *addr = vmf->virtual_address;
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
|
||||
/* Look up per-process structure */
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
|
||||
if (!ppd) {
|
||||
kprintf("%s: INFO: no per-process structure for pid %d (tid %d), try to use pid %d\n",
|
||||
__FUNCTION__, task_tgid_vnr(current), task_pid_vnr(current), vma->vm_mm->owner->pid);
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
|
||||
pr_err("%s: INFO: no per-process structure for "
|
||||
"pid %d (tid %d), trying to use pid %d\n",
|
||||
__func__,
|
||||
task_tgid_vnr(task), task_pid_vnr(task),
|
||||
vma->vm_mm->owner->pid);
|
||||
task = vma->vm_mm->owner;
|
||||
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
|
||||
}
|
||||
|
||||
if (!ppd) {
|
||||
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
|
||||
__FUNCTION__, task_tgid_vnr(current));
|
||||
pr_err("%s: ERROR: no per-process structure for PID %d??\n",
|
||||
__func__, task_tgid_vnr(task));
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto no_ppd;
|
||||
}
|
||||
packet.fault_tid = ppd->pid;
|
||||
|
||||
ptd = mcctrl_get_per_thread_data(ppd, current);
|
||||
if (!ptd) {
|
||||
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto no_ptd;
|
||||
ptd = mcctrl_get_per_thread_data(ppd, task);
|
||||
if (ptd) {
|
||||
struct ikc_scd_packet *ptd_packet;
|
||||
|
||||
pr_ptd("get", task_pid_vnr(task), ptd);
|
||||
ptd_packet = (struct ikc_scd_packet *)ptd->data;
|
||||
if (ptd_packet) {
|
||||
packet.target_cpu = ptd_packet->ref;
|
||||
packet.fault_tid = ptd_packet->req.rtid;
|
||||
rsysnum = ptd_packet->req.number;
|
||||
}
|
||||
mcctrl_put_per_thread_data(ptd);
|
||||
pr_ptd("put", task_pid_vnr(task), ptd);
|
||||
}
|
||||
pr_ptd("get", task_pid_vnr(current), ptd);
|
||||
packet = (struct ikc_scd_packet *)ptd->data;
|
||||
if (!packet) {
|
||||
|
||||
/* Don't even bother looking up NULL */
|
||||
if (!addr) {
|
||||
pr_warn("%s: WARNING: attempted NULL pointer access\n",
|
||||
__func__);
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
printk("%s: no packet registered for TID %d\n",
|
||||
__FUNCTION__, task_pid_vnr(current));
|
||||
goto put_and_out;
|
||||
}
|
||||
|
||||
for (try = 1; ; ++try) {
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
|
||||
vmf->address, &rpa, &pgsize);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
|
||||
(unsigned long)vmf->virtual_address,
|
||||
&rpa, &pgsize);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
(unsigned long)addr, &rpa, &pgsize);
|
||||
#define NTRIES 2
|
||||
if (!error || (try >= NTRIES)) {
|
||||
if (error) {
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
printk("%s: error translating 0x%#lx "
|
||||
"(req: TID: %u, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->address,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
printk("%s: error translating 0x%p "
|
||||
"(req: TID: %u, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->virtual_address,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
pr_err("%s: error translating 0x%#lx "
|
||||
"(req: TID: %u, syscall: %lu)\n",
|
||||
__func__,
|
||||
(unsigned long)addr,
|
||||
packet.fault_tid, rsysnum);
|
||||
}
|
||||
|
||||
break;
|
||||
@ -800,23 +635,14 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
#define PF_WRITE 0x02
|
||||
reason |= PF_WRITE;
|
||||
}
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
error = remote_page_fault(usrdata, (void *)vmf->address, reason);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
error = remote_page_fault(usrdata, vmf->virtual_address, reason);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
error = remote_page_fault(usrdata, (void *)addr,
|
||||
reason, ppd, &packet);
|
||||
if (error) {
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
printk("%s: error forwarding PF for 0x%#lx "
|
||||
"(req: TID: %d, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->address,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
printk("%s: error forwarding PF for 0x%p "
|
||||
"(req: TID: %d, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->virtual_address,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
pr_err("%s: error forwarding PF for 0x%#lx "
|
||||
"(req: TID: %d, syscall: %lu)\n",
|
||||
__func__,
|
||||
(unsigned long)addr,
|
||||
packet.fault_tid, rsysnum);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -825,11 +651,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
goto put_and_out;
|
||||
}
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
rva = vmf->address & ~(pgsize - 1);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
rva = (unsigned long)vmf->virtual_address & ~(pgsize - 1);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
rva = (unsigned long)addr & ~(pgsize - 1);
|
||||
rpa = rpa & ~(pgsize - 1);
|
||||
|
||||
phys = ihk_device_map_memory(dev, rpa, pgsize);
|
||||
@ -849,21 +671,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
|
||||
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
|
||||
if (error) {
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
printk("%s: error inserting mapping for 0x%#lx "
|
||||
"(req: TID: %d, syscall: %lu) error: %d, "
|
||||
"vm_start: 0x%lx, vm_end: 0x%lx\n",
|
||||
__FUNCTION__, vmf->address,
|
||||
packet->req.rtid, packet->req.number, error,
|
||||
vma->vm_start, vma->vm_end);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
printk("%s: error inserting mapping for 0x%p "
|
||||
"(req: TID: %d, syscall: %lu) error: %d, "
|
||||
"vm_start: 0x%lx, vm_end: 0x%lx\n",
|
||||
__FUNCTION__, vmf->virtual_address,
|
||||
packet->req.rtid, packet->req.number, error,
|
||||
vma->vm_start, vma->vm_end);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
pr_err("%s: error inserting mapping for 0x%#lx "
|
||||
"(req: TID: %d, syscall: %lu) error: %d,"
|
||||
" vm_start: 0x%lx, vm_end: 0x%lx\n",
|
||||
__func__,
|
||||
(unsigned long)addr, packet.fault_tid,
|
||||
rsysnum, error,
|
||||
vma->vm_start, vma->vm_end);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -875,16 +689,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
pfn+pix);
|
||||
#endif
|
||||
if (error) {
|
||||
#if 1 /* POSTK_DEBUG_TEMP_FIX_11 */ /* rus_vm_fault() multi-thread fix */
|
||||
printk("%s: vm_insert_pfn returned %d\n", __FUNCTION__, error);
|
||||
pr_err("%s: vm_insert_pfn returned %d\n",
|
||||
__func__, error);
|
||||
if (error == -EBUSY) {
|
||||
error = 0;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
#else /* POSTK_DEBUG_TEMP_FIX_11 */
|
||||
break;
|
||||
#endif /* POSTK_DEBUG_TEMP_FIX_11 */
|
||||
}
|
||||
}
|
||||
#else
|
||||
@ -892,17 +703,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
#endif
|
||||
ihk_device_unmap_memory(dev, phys, pgsize);
|
||||
if (error) {
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
|
||||
printk("%s: remote PF failed for 0x%#lx, pgoff: %lu "
|
||||
"(req: TID: %d, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->address, vmf->pgoff,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
printk("%s: remote PF failed for 0x%p, pgoff: %lu "
|
||||
"(req: TID: %d, syscall: %lu)\n",
|
||||
__FUNCTION__, vmf->virtual_address, vmf->pgoff,
|
||||
packet->req.rtid, packet->req.number);
|
||||
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
|
||||
pr_err("%s: remote PF failed for 0x%#lx, pgoff: %lu"
|
||||
" (req: TID: %d, syscall: %lu)\n",
|
||||
__func__,
|
||||
(unsigned long)addr, vmf->pgoff,
|
||||
packet.fault_tid, rsysnum);
|
||||
ret = VM_FAULT_SIGBUS;
|
||||
goto put_and_out;
|
||||
}
|
||||
@ -910,9 +715,6 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
ret = VM_FAULT_NOPAGE;
|
||||
|
||||
put_and_out:
|
||||
mcctrl_put_per_thread_data(ptd);
|
||||
pr_ptd("put", task_pid_vnr(current), ptd);
|
||||
no_ptd:
|
||||
mcctrl_put_per_proc_data(ppd);
|
||||
no_ppd:
|
||||
return ret;
|
||||
@ -1075,6 +877,7 @@ struct pager_create_result {
|
||||
int maxprot;
|
||||
uint32_t flags;
|
||||
size_t size;
|
||||
int pgshift;
|
||||
char path[PATH_MAX];
|
||||
};
|
||||
|
||||
@ -1136,6 +939,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
struct kstat st;
|
||||
int mf_flags = 0;
|
||||
unsigned long irqflags;
|
||||
int pgshift = 0;
|
||||
|
||||
dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);
|
||||
|
||||
@ -1144,8 +948,16 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
|
||||
goto out;
|
||||
}
|
||||
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
|
||||
/* treat memory devices as regular files */
|
||||
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1) &&
|
||||
(MINOR(st.rdev) == 1 || // /dev/mem
|
||||
MINOR(st.rdev) == 5)) { // /dev/zero
|
||||
/* treat memory devices and zero devices as regular files */
|
||||
}
|
||||
else if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
|
||||
error = -ENODEV;
|
||||
dprintk("%s(%d,%lx):unmappable device %x\n",
|
||||
__func__, fd, (long)result_pa, st.mode);
|
||||
goto out;
|
||||
}
|
||||
else if (!S_ISREG(st.mode)) {
|
||||
error = -ESRCH;
|
||||
@ -1160,6 +972,30 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Shared memory hack */
|
||||
{
|
||||
char *pathbuf, *fullpath;
|
||||
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
|
||||
if (pathbuf) {
|
||||
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
|
||||
if (!IS_ERR(fullpath)) {
|
||||
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
|
||||
!strncmp("/dev/shm/", fullpath, 9) ||
|
||||
(!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
|
||||
fullpath, 30) && !strstr(fullpath, "dstore_sm.lock"))) {
|
||||
printk("%s: treating %s as a device file..\n",
|
||||
__func__, fullpath);
|
||||
kfree(pathbuf);
|
||||
|
||||
error = -ESRCH;
|
||||
goto out;
|
||||
}
|
||||
|
||||
kfree(pathbuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inode = file->f_path.dentry->d_inode;
|
||||
if (!inode) {
|
||||
error = -EBADF;
|
||||
@ -1167,6 +1003,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!strcmp(inode->i_sb->s_type->name, "tmpfs")) {
|
||||
mf_flags = MF_IS_REMOVABLE;
|
||||
}
|
||||
|
||||
if (!strcmp(inode->i_sb->s_type->name, "proc")) {
|
||||
error = -ESRCH;
|
||||
goto out;
|
||||
@ -1188,13 +1028,14 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
}
|
||||
|
||||
if (inode->i_op == mcctrl_hugetlbfs_inode_operations) {
|
||||
struct hstate *h = hstate_file(file);
|
||||
|
||||
pgshift = PAGE_SHIFT + huge_page_order(h);
|
||||
mf_flags = MF_HUGETLBFS;
|
||||
/* pager is used as handle id on mckernel side, use inode */
|
||||
pager = (void *)st.ino;
|
||||
/* retrofit blksize in resp as well through st.size field;
|
||||
* the actual file size is not used
|
||||
*/
|
||||
st.size = st.blksize;
|
||||
/* file size is not used */
|
||||
st.size = 0;
|
||||
goto out_reply;
|
||||
}
|
||||
|
||||
@ -1214,7 +1055,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
pager = newpager;
|
||||
newpager = NULL;
|
||||
|
||||
/* Intel MPI library and shared memory "prefetch" */
|
||||
/* Shared libraries prefetch */
|
||||
{
|
||||
char *pathbuf, *fullpath;
|
||||
|
||||
@ -1222,15 +1063,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
|
||||
if (pathbuf) {
|
||||
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
|
||||
if (!IS_ERR(fullpath)) {
|
||||
if (!strncmp("/dev/shm/Intel_MPI", fullpath, 18)) {
|
||||
mf_flags = (MF_PREMAP | MF_ZEROFILL);
|
||||
dprintk("%s: filename: %s, premap & zerofill\n",
|
||||
__FUNCTION__, fullpath);
|
||||
}
|
||||
else if (strstr(fullpath, "libmpi") ||
|
||||
strstr(fullpath, "libiomp") ||
|
||||
strstr(fullpath, "libpthread") ||
|
||||
strstr(fullpath, "libc.so")) {
|
||||
if (strstr(fullpath, ".so")) {
|
||||
mf_flags = MF_PREFETCH;
|
||||
dprintk("%s: filename: %s, prefetch\n",
|
||||
__FUNCTION__, fullpath);
|
||||
@ -1281,6 +1114,7 @@ out_reply:
|
||||
resp->maxprot = maxprot;
|
||||
resp->flags = mf_flags;
|
||||
resp->size = st.size;
|
||||
resp->pgshift = pgshift;
|
||||
|
||||
error = pager_get_path(file, resp->path);
|
||||
|
||||
@ -1355,8 +1189,9 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
|
||||
uintptr_t phys = -1;
|
||||
ihk_device_t dev = ihk_os_to_dev(os);
|
||||
void *buf = NULL;
|
||||
loff_t pos;
|
||||
loff_t pos, fsize;
|
||||
unsigned long flags;
|
||||
unsigned int major, minor;
|
||||
|
||||
dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa);
|
||||
|
||||
@ -1378,6 +1213,21 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
|
||||
goto out;
|
||||
}
|
||||
|
||||
major = MAJOR(file->f_mapping->host->i_rdev);
|
||||
minor = MINOR(file->f_mapping->host->i_rdev);
|
||||
if ((major == 1 && minor == 1) || // /dev/mem
|
||||
(major == 1 && minor == 5)) { // /dev/zero
|
||||
/* Nothing to check */
|
||||
}
|
||||
else {
|
||||
/* Check if the target page fits in the file */
|
||||
fsize = i_size_read(file->f_mapping->host);
|
||||
if (off > fsize) {
|
||||
ss = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
phys = ihk_device_map_memory(dev, rpa, size);
|
||||
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
|
||||
if (!buf) {
|
||||
@ -1585,6 +1435,26 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
|
||||
#define ANY_WHERE 0
|
||||
if (prot_and_flags & MAP_LOCKED) prot_and_flags |= MAP_POPULATE;
|
||||
|
||||
/* Shared memory hack */
|
||||
{
|
||||
char *pathbuf, *fullpath;
|
||||
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
|
||||
if (pathbuf) {
|
||||
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
|
||||
if (!IS_ERR(fullpath)) {
|
||||
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
|
||||
!strncmp("/dev/shm/", fullpath, 9) ||
|
||||
!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
|
||||
fullpath, 30)) {
|
||||
dprintk("%s: pre-populating %s..\n",
|
||||
__func__, fullpath);
|
||||
prot_and_flags |= MAP_POPULATE;
|
||||
}
|
||||
kfree(pathbuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
|
||||
down_write(¤t->mm->mmap_sem);
|
||||
|
||||
@ -1598,7 +1468,12 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
|
||||
#endif
|
||||
|
||||
if (IS_ERR_VALUE(va)) {
|
||||
printk("pager_req_map(%p,%d,%lx,%lx,%lx):do_mmap_pgoff failed. %d\n", os, fd, len, off, result_rpa, (int)va);
|
||||
if ((int)va != -ENOTSUPP) {
|
||||
pr_err("%s(%p,%d,%lx,%lx,%lx): "
|
||||
"do_mmap_pgoff failed. %d\n",
|
||||
__func__, os, fd, len, off, result_rpa,
|
||||
(int)va);
|
||||
}
|
||||
error = va;
|
||||
goto out;
|
||||
}
|
||||
@ -1712,16 +1587,9 @@ retry:
|
||||
pfn |= PFN_VALID | PFN_PRESENT;
|
||||
|
||||
/* Check if mapping is write-combined */
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_12
|
||||
if (pte_is_write_combined(*pte)) {
|
||||
pfn |= PFN_WRITE_COMBINED;
|
||||
}
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_12 */
|
||||
if ((pte_flags(*pte) & _PAGE_PWT) &&
|
||||
!(pte_flags(*pte) & _PAGE_PCD)) {
|
||||
pfn |= _PAGE_PWT;
|
||||
}
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
|
||||
}
|
||||
pte_unmap(pte);
|
||||
}
|
||||
@ -1754,10 +1622,27 @@ retry:
|
||||
#else
|
||||
fault = handle_mm_fault(current->mm, vma, va, flags);
|
||||
#endif
|
||||
#ifdef SC_DEBUG
|
||||
if (fault != 0) {
|
||||
printk("%s: error: faulting %lx at off: %lu\n",
|
||||
__FUNCTION__, va, off);
|
||||
char *pathbuf = NULL;
|
||||
char *fullpath;
|
||||
|
||||
if (vma->vm_file) {
|
||||
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
|
||||
if (pathbuf) {
|
||||
fullpath = d_path(&vma->vm_file->f_path,
|
||||
pathbuf, PATH_MAX);
|
||||
if (!IS_ERR(fullpath)) {
|
||||
printk("%s: WARNING: couldn't fault 0x%lx"
|
||||
" at off: %lu in %s\n",
|
||||
__FUNCTION__, va, off, fullpath);
|
||||
}
|
||||
|
||||
kfree(pathbuf);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
page_fault_attempted = 1;
|
||||
goto retry;
|
||||
@ -2052,6 +1937,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
|
||||
}
|
||||
if (addr < end) {
|
||||
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
|
||||
/* Revert permission */
|
||||
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
|
||||
error = zap_vma_ptes(vma, addr, end-addr);
|
||||
if (error) {
|
||||
mcctrl_zap_page_range(vma, addr, end-addr,
|
||||
@ -2069,6 +1956,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
|
||||
NULL);
|
||||
}
|
||||
else {
|
||||
/* Revert permission */
|
||||
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
|
||||
zap_vma_ptes(vma, addr, end-addr);
|
||||
}
|
||||
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */
|
||||
@ -2124,7 +2013,10 @@ int release_user_space(uintptr_t start, uintptr_t len)
|
||||
* \param chunks The number of chunks which make a core file image in the whole.
|
||||
*/
|
||||
|
||||
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
|
||||
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks,
|
||||
unsigned long cmdline_rphys, unsigned long cmdline_len)
|
||||
{
|
||||
char *fn = NULL;
|
||||
struct file *file;
|
||||
struct coretable *coretable;
|
||||
int i, tablesize, error = 0;
|
||||
@ -2133,22 +2025,43 @@ static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
|
||||
unsigned long phys, tablephys, rphys;
|
||||
ihk_device_t dev = ihk_os_to_dev(os);
|
||||
char *pt;
|
||||
unsigned long cmdline_phys;
|
||||
char *cmdline;
|
||||
|
||||
dprintk("coredump called as a pseudo syscall\n");
|
||||
|
||||
fn = kmalloc(PATH_MAX, GFP_ATOMIC);
|
||||
if (!fn) {
|
||||
dprintk("%s: ERROR: allocating file name\n", __func__);
|
||||
error = -ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
if (chunks <= 0) {
|
||||
dprintk("no core data found!(%d)\n", chunks);
|
||||
error = -EINVAL;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
cmdline_phys = ihk_device_map_memory(dev, cmdline_rphys, cmdline_len);
|
||||
cmdline = ihk_device_map_virtual(dev, cmdline_phys, cmdline_len, NULL,
|
||||
0);
|
||||
sprintf(fn, "mccore-%s.%d",
|
||||
strrchr(cmdline, '/') ?
|
||||
strrchr(cmdline, '/') + 1 : cmdline,
|
||||
task_tgid_vnr(current));
|
||||
pr_info("%s: fn=%s\n", __func__, fn);
|
||||
|
||||
ihk_device_unmap_virtual(dev, cmdline, cmdline_len);
|
||||
ihk_device_unmap_memory(dev, cmdline_phys, cmdline_len);
|
||||
|
||||
/* Every Linux documentation insists we should not
|
||||
* open a file in the kernel module, but our karma
|
||||
* leads us here. Precisely, Here we emulate the core
|
||||
* dump routine of the Linux kernel in linux/fs/exec.c.
|
||||
* So we have a legitimate reason to do this.
|
||||
*/
|
||||
file = filp_open("core", O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
||||
file = filp_open(fn, O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
|
||||
if (IS_ERR(file) || !file->f_op) {
|
||||
dprintk("cannot open core file\n");
|
||||
error = PTR_ERR(file);
|
||||
@ -2218,6 +2131,7 @@ fail:
|
||||
/* make sure we do not travel to user land */
|
||||
error = -EINVAL;
|
||||
}
|
||||
kfree(fn);
|
||||
return error;
|
||||
}
|
||||
|
||||
@ -2273,7 +2187,8 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
|
||||
}
|
||||
|
||||
case __NR_coredump:
|
||||
ret = writecore(os, sc->args[1], sc->args[0]);
|
||||
ret = writecore(os, sc->args[1], sc->args[0], sc->args[2],
|
||||
sc->args[3]);
|
||||
break;
|
||||
|
||||
case __NR_sched_setparam: {
|
||||
|
||||
@ -157,13 +157,8 @@ static void free_node_topology(struct mcctrl_usrdata *udp)
|
||||
return;
|
||||
} /* free_node_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
struct cache_topology *cache;
|
||||
struct cache_topology *next;
|
||||
@ -179,13 +174,8 @@ static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
|
||||
|
||||
static void free_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu;
|
||||
struct mcctrl_cpu_topology *next;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu;
|
||||
struct cpu_topology *next;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
list_for_each_entry_safe(cpu, next, &udp->cpu_topology_list, chain) {
|
||||
list_del(&cpu->chain);
|
||||
@ -315,13 +305,8 @@ static int translate_cpumap(struct mcctrl_usrdata *udp,
|
||||
return error;
|
||||
} /* translate_cpumap() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
int error;
|
||||
struct cache_topology *topo = NULL;
|
||||
@ -355,21 +340,12 @@ out:
|
||||
return (error)? ERR_PTR(error): topo;
|
||||
} /* get_cache_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static struct mcctrl_cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
int index)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
int index)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
int error;
|
||||
ihk_device_t dev = ihk_os_to_dev(udp->os);
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *topology = NULL;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *topology = NULL;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cache_topology *cache;
|
||||
struct ihk_cache_topology *saved_cache;
|
||||
|
||||
@ -387,12 +363,8 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
topology->saved = ihk_device_get_cpu_topology(dev,
|
||||
mckernel_cpu_2_hw_id(udp, index));
|
||||
|
||||
#ifdef POSTK_DEBUG_TEMP_FIX_21 /* IS_ERR() through return NULL */
|
||||
if (!topology->saved) {
|
||||
#else /* POSTK_DEBUG_TEMP_FIX_21 */
|
||||
if (IS_ERR(topology->saved)) {
|
||||
#endif /* POSTK_DEBUG_TEMP_FIX_21 */
|
||||
error = PTR_ERR(topology->saved);
|
||||
error = -ENOENT;
|
||||
eprintk("mcctrl:get_one_cpu_topology:"
|
||||
"ihk_device_get_cpu_topology failed. %d\n",
|
||||
error);
|
||||
@ -428,6 +400,9 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
|
||||
"get_cache_topology failed. %d\n",
|
||||
error);
|
||||
goto out;
|
||||
} else if (!cache) {
|
||||
error = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_add(&cache->chain, &topology->cache_list);
|
||||
@ -447,11 +422,7 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
int index;
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *topology;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *topology;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
dprintk("get_cpu_topology(%p)\n", udp);
|
||||
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
|
||||
@ -473,13 +444,8 @@ out:
|
||||
return error;
|
||||
} /* get_cpu_topology() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu, struct cache_topology *cache)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu, struct cache_topology *cache)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
@ -531,13 +497,8 @@ static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
|
||||
return;
|
||||
} /* setup_cpu_sysfs_cache_files() */
|
||||
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
struct mcctrl_cpu_topology *cpu)
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
struct cpu_topology *cpu)
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
{
|
||||
char *prefix = "/sys/devices/system/cpu";
|
||||
int cpu_number = cpu->mckernel_cpu_id;
|
||||
@ -586,11 +547,7 @@ static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
|
||||
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
|
||||
{
|
||||
int error;
|
||||
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
|
||||
struct mcctrl_cpu_topology *cpu;
|
||||
#else /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
struct cpu_topology *cpu;
|
||||
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
|
||||
|
||||
error = get_cpu_topology(udp);
|
||||
if (error) {
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
# LESS/GREATER_EQUAL appears somewhere in 3.7... meh compat until we stop caring about 2.x
|
||||
# ...apparently can't define macros ot use inside if, so unfold manually
|
||||
|
||||
if(NOT (LINUX_VERSION_CODE LESS 262144) AND NOT (LINUX_VERSION_CODE GREATER 262400))
|
||||
add_subdirectory("linux-4.0.9")
|
||||
elseif(NOT (LINUX_VERSION_CODE LESS 263680) AND NOT (LINUX_VERSION_CODE GREATER 263936))
|
||||
add_subdirectory("linux-4.6.7")
|
||||
elseif(LINUX_VERSION_CODE EQUAL 199168)
|
||||
add_subdirectory("linux-3.10.0-327.36.1.el7")
|
||||
else()
|
||||
#add_subdirectory("linux-3.10.0-327.36.1.el7")
|
||||
add_subdirectory("linux-4.18.14")
|
||||
#message(FATAL_ERROR "mcoverlayfs enabled but kernel version not compatible")
|
||||
endif()
|
||||
@ -1,7 +0,0 @@
|
||||
kmod(mcoverlay
|
||||
SOURCES
|
||||
copy_up.c dir.c inode.c readdir.c super.c
|
||||
INSTALL_DEST
|
||||
${KMODDIR}
|
||||
)
|
||||
|
||||
@ -1,461 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
static unsigned ovl_check_copy_up = 1;
|
||||
module_param_named(check_copy_up, ovl_check_copy_up, uint,
|
||||
S_IWUSR | S_IRUGO);
|
||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
||||
"Warn on copy-up when causing process also has a R/O fd open");
|
||||
|
||||
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
|
||||
{
|
||||
const struct dentry *dentry = data;
|
||||
|
||||
if (f->f_path.dentry == dentry)
|
||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
||||
f, fd, current->pid, current->comm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the fds open by this process and warn if something like the following
|
||||
* scenario is about to occur:
|
||||
*
|
||||
* fd1 = open("foo", O_RDONLY);
|
||||
* fd2 = open("foo", O_RDWR);
|
||||
*/
|
||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
||||
{
|
||||
if (ovl_check_copy_up)
|
||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
||||
}
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
||||
{
|
||||
ssize_t list_size, size, value_size = 0;
|
||||
char *buf, *name, *value = NULL;
|
||||
int uninitialized_var(error);
|
||||
|
||||
if (!old->d_inode->i_op->getxattr ||
|
||||
!new->d_inode->i_op->getxattr)
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
||||
retry:
|
||||
size = vfs_getxattr(old, name, value, value_size);
|
||||
if (size == -ERANGE)
|
||||
size = vfs_getxattr(old, name, NULL, 0);
|
||||
|
||||
if (size < 0) {
|
||||
error = size;
|
||||
break;
|
||||
}
|
||||
|
||||
if (size > value_size) {
|
||||
void *new;
|
||||
|
||||
new = krealloc(value, size, GFP_KERNEL);
|
||||
if (!new) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
value = new;
|
||||
value_size = size;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
|
||||
{
|
||||
struct file *old_file;
|
||||
struct file *new_file;
|
||||
loff_t old_pos = 0;
|
||||
loff_t new_pos = 0;
|
||||
int error = 0;
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
old_file = ovl_path_open(old, O_RDONLY);
|
||||
if (IS_ERR(old_file))
|
||||
return PTR_ERR(old_file);
|
||||
|
||||
new_file = ovl_path_open(new, O_WRONLY);
|
||||
if (IS_ERR(new_file)) {
|
||||
error = PTR_ERR(new_file);
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
/* FIXME: copy up sparse files efficiently */
|
||||
while (len) {
|
||||
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
|
||||
long bytes;
|
||||
|
||||
if (len < this_len)
|
||||
this_len = len;
|
||||
|
||||
if (signal_pending_state(TASK_KILLABLE, current)) {
|
||||
error = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
bytes = do_splice_direct(old_file, &old_pos,
|
||||
new_file, &new_pos,
|
||||
this_len, SPLICE_F_MOVE);
|
||||
if (bytes <= 0) {
|
||||
error = bytes;
|
||||
break;
|
||||
}
|
||||
WARN_ON(old_pos != new_pos);
|
||||
|
||||
len -= bytes;
|
||||
}
|
||||
|
||||
fput(new_file);
|
||||
out_fput:
|
||||
fput(old_file);
|
||||
return error;
|
||||
}
|
||||
|
||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
||||
{
|
||||
int res;
|
||||
char *buf;
|
||||
struct inode *inode = realdentry->d_inode;
|
||||
mm_segment_t old_fs;
|
||||
|
||||
res = -EINVAL;
|
||||
if (!inode->i_op->readlink)
|
||||
goto err;
|
||||
|
||||
res = -ENOMEM;
|
||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto err;
|
||||
|
||||
old_fs = get_fs();
|
||||
set_fs(get_ds());
|
||||
/* The cast to a user pointer is valid due to the set_fs() */
|
||||
res = inode->i_op->readlink(realdentry,
|
||||
(char __user *)buf, PAGE_SIZE - 1);
|
||||
set_fs(old_fs);
|
||||
if (res < 0) {
|
||||
free_page((unsigned long) buf);
|
||||
goto err;
|
||||
}
|
||||
buf[res] = '\0';
|
||||
|
||||
return buf;
|
||||
|
||||
err:
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
||||
struct dentry *dentry, struct path *lowerpath,
|
||||
struct kstat *stat, struct iattr *attr,
|
||||
const char *link)
|
||||
{
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *upper = NULL;
|
||||
umode_t mode = stat->mode;
|
||||
int err;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out1;
|
||||
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
stat->mode &= S_IFMT;
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
||||
stat->mode = mode;
|
||||
if (err)
|
||||
goto out2;
|
||||
|
||||
if (S_ISREG(stat->mode)) {
|
||||
struct path upperpath;
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = newdentry;
|
||||
|
||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&newdentry->d_inode->i_mutex);
|
||||
err = ovl_set_attr(newdentry, stat);
|
||||
if (!err && attr)
|
||||
err = notify_change(newdentry, attr, NULL);
|
||||
mutex_unlock(&newdentry->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
newdentry = NULL;
|
||||
|
||||
/*
|
||||
* Non-directores become opaque when copied up.
|
||||
*/
|
||||
if (!S_ISDIR(stat->mode))
|
||||
ovl_dentry_set_opaque(dentry, true);
|
||||
out2:
|
||||
dput(upper);
|
||||
out1:
|
||||
dput(newdentry);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* Directory renames only allowed on "pure upper" (already created on
|
||||
* upper filesystem, never copied up). Directories which are on lower or
|
||||
* are merged may not be renamed. For these -EXDEV is returned and
|
||||
* userspace has to deal with it. This means, when copying up a
|
||||
* directory we can rely on it and ancestors being stable.
|
||||
*
|
||||
* Non-directory renames start with copy up of source if necessary. The
|
||||
* actual rename will only proceed once the copy up was successful. Copy
|
||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
||||
* d_parent it is possible that the copy up will lock the old parent. At
|
||||
* that point the file will have already been copied up anyway.
|
||||
*/
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
int err;
|
||||
struct kstat pstat;
|
||||
struct path parentpath;
|
||||
struct dentry *upperdir;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
char *link = NULL;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_do_check_copy_up(lowerpath->dentry);
|
||||
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
upperdir = parentpath.dentry;
|
||||
|
||||
err = vfs_getattr(&parentpath, &pstat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (S_ISLNK(stat->mode)) {
|
||||
link = ovl_read_symlink(lowerpath->dentry);
|
||||
if (IS_ERR(link))
|
||||
return PTR_ERR(link);
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_free_link;
|
||||
|
||||
override_cred->fsuid = stat->uid;
|
||||
override_cred->fsgid = stat->gid;
|
||||
/*
|
||||
* CAP_SYS_ADMIN for copying up extended attributes
|
||||
* CAP_DAC_OVERRIDE for create
|
||||
* CAP_FOWNER for chmod, timestamp update
|
||||
* CAP_FSETID for chmod
|
||||
* CAP_CHOWN for chown
|
||||
* CAP_MKNOD for mknod
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = -EIO;
|
||||
if (lock_rename(workdir, upperdir) != NULL) {
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
goto out_unlock;
|
||||
}
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
unlock_rename(workdir, upperdir);
|
||||
err = 0;
|
||||
/* Raced with another copy-up? Do the setattr here */
|
||||
if (attr) {
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
goto out_put_cred;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
||||
stat, attr, link);
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &pstat);
|
||||
}
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_put_cred:
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
out_free_link:
|
||||
if (link)
|
||||
free_page((unsigned long) link);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = 0;
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
type = ovl_path_type(parent);
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
ovl_path_lower(next, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (!err)
|
||||
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -1,972 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
dget(wdentry);
|
||||
if (S_ISDIR(wdentry->d_inode->i_mode))
|
||||
err = ovl_do_rmdir(wdir, wdentry);
|
||||
else
|
||||
err = ovl_do_unlink(wdir, wdentry);
|
||||
dput(wdentry);
|
||||
|
||||
if (err) {
|
||||
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
|
||||
wdentry, err);
|
||||
}
|
||||
}
|
||||
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
|
||||
{
|
||||
struct dentry *temp;
|
||||
char name[20];
|
||||
|
||||
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
|
||||
|
||||
temp = lookup_one_len(name, workdir, strlen(name));
|
||||
if (!IS_ERR(temp) && temp->d_inode) {
|
||||
pr_err("overlayfs: workdir/%s already exists\n", name);
|
||||
dput(temp);
|
||||
temp = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/* caller holds i_mutex on workdir */
|
||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *whiteout;
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
|
||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
||||
if (IS_ERR(whiteout))
|
||||
return whiteout;
|
||||
|
||||
err = ovl_do_whiteout(wdir, whiteout);
|
||||
if (err) {
|
||||
dput(whiteout);
|
||||
whiteout = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return whiteout;
|
||||
}
|
||||
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (newdentry->d_inode)
|
||||
return -ESTALE;
|
||||
|
||||
if (hardlink) {
|
||||
err = ovl_do_link(hardlink, dir, newdentry, debug);
|
||||
} else {
|
||||
switch (stat->mode & S_IFMT) {
|
||||
case S_IFREG:
|
||||
err = ovl_do_create(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFDIR:
|
||||
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFCHR:
|
||||
case S_IFBLK:
|
||||
case S_IFIFO:
|
||||
case S_IFSOCK:
|
||||
err = ovl_do_mknod(dir, newdentry,
|
||||
stat->mode, stat->rdev, debug);
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
err = ovl_do_symlink(dir, newdentry, link, debug);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = -EPERM;
|
||||
}
|
||||
}
|
||||
if (!err && WARN_ON(!newdentry->d_inode)) {
|
||||
/*
|
||||
* Not quite sure if non-instantiated dentry is legal or not.
|
||||
* VFS doesn't seem to care so check and warn here.
|
||||
*/
|
||||
err = -ENOENT;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_set_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
|
||||
}
|
||||
|
||||
static void ovl_remove_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
|
||||
if (err) {
|
||||
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
|
||||
upperdentry->d_name.name, err);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, stat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
|
||||
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&udir->i_mutex);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_lock_rename_workdir(struct dentry *workdir,
|
||||
struct dentry *upperdir)
|
||||
{
|
||||
/* Workdir should not be the same as upperdir */
|
||||
if (workdir == upperdir)
|
||||
goto err;
|
||||
|
||||
/* Workdir should not be subdir of upperdir and vice versa */
|
||||
if (lock_rename(workdir, upperdir) != NULL)
|
||||
goto err_unlock;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
err:
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_clear_empty(struct dentry *dentry,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct path upperpath;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir;
|
||||
struct kstat stat;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return ERR_PTR(-EROFS);
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
err = vfs_getattr(&upperpath, &stat);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (!S_ISDIR(stat.mode))
|
||||
goto out_unlock;
|
||||
upper = upperpath.dentry;
|
||||
if (upper->d_parent->d_inode != udir)
|
||||
goto out_unlock;
|
||||
|
||||
opaquedir = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out_unlock;
|
||||
|
||||
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_copy_xattr(upper, opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_set_opaque(opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&opaquedir->d_inode->i_mutex);
|
||||
err = ovl_set_attr(opaquedir, &stat);
|
||||
mutex_unlock(&opaquedir->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup_whiteouts(upper, list);
|
||||
ovl_cleanup(wdir, upper);
|
||||
unlock_rename(workdir, upperdir);
|
||||
|
||||
/* dentry's upper doesn't match now, get rid of it */
|
||||
d_drop(dentry);
|
||||
|
||||
return opaquedir;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, opaquedir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *ret = NULL;
|
||||
LIST_HEAD(list);
|
||||
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
if (err)
|
||||
ret = ERR_PTR(err);
|
||||
else {
|
||||
/*
|
||||
* If no upperdentry then skip clearing whiteouts.
|
||||
*
|
||||
* Can race with copy-up, since we don't hold the upperdir
|
||||
* mutex. Doesn't matter, since copy-up can't create a
|
||||
* non-empty directory from an empty one.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry))
|
||||
ret = ovl_clear_empty(dentry, &list);
|
||||
}
|
||||
|
||||
ovl_cache_free(&list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
||||
if (err)
|
||||
goto out_dput2;
|
||||
|
||||
if (S_ISDIR(stat->mode)) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
||||
RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup(wdir, upper);
|
||||
} else {
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput2:
|
||||
dput(upper);
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out_dput2;
|
||||
}
|
||||
|
||||
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link, struct dentry *hardlink)
|
||||
{
|
||||
int err;
|
||||
struct inode *inode;
|
||||
struct kstat stat = {
|
||||
.mode = mode,
|
||||
.rdev = rdev,
|
||||
};
|
||||
|
||||
err = -ENOMEM;
|
||||
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
|
||||
if (!inode)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_iput;
|
||||
|
||||
if (!ovl_dentry_is_opaque(dentry)) {
|
||||
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_iput;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting opaque xattr
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
|
||||
hardlink);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
|
||||
if (!err)
|
||||
inode = NULL;
|
||||
out_iput:
|
||||
iput(inode);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
bool excl)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
/* Don't allow creation of "whiteout" on overlay */
|
||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
||||
return -EPERM;
|
||||
|
||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
||||
}
|
||||
|
||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *link)
|
||||
{
|
||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
||||
}
|
||||
|
||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
||||
struct dentry *new)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upper = ovl_dentry_upper(old);
|
||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *whiteout;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir = NULL;
|
||||
int err;
|
||||
int flags = 0;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
if (is_dir) {
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
|
||||
opaquedir = ovl_check_empty_and_clear(dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out;
|
||||
} else {
|
||||
LIST_HEAD(list);
|
||||
|
||||
/*
|
||||
* When removing an empty opaque directory, then it
|
||||
* makes no sense to replace it with an exact replica of
|
||||
* itself. But emptiness still needs to be checked.
|
||||
*/
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
ovl_cache_free(&list);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if ((opaquedir && upper != opaquedir) ||
|
||||
(!opaquedir && ovl_dentry_upper(dentry) &&
|
||||
upper != ovl_dentry_upper(dentry))) {
|
||||
goto out_dput_upper;
|
||||
}
|
||||
|
||||
whiteout = ovl_whiteout(workdir, dentry);
|
||||
err = PTR_ERR(whiteout);
|
||||
if (IS_ERR(whiteout))
|
||||
goto out_dput_upper;
|
||||
|
||||
if (d_is_dir(upper))
|
||||
flags = RENAME_EXCHANGE;
|
||||
|
||||
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
|
||||
if (err)
|
||||
goto kill_whiteout;
|
||||
if (flags)
|
||||
ovl_cleanup(wdir, upper);
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
out_d_drop:
|
||||
d_drop(dentry);
|
||||
dput(whiteout);
|
||||
out_dput_upper:
|
||||
dput(upper);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
kill_whiteout:
|
||||
ovl_cleanup(wdir, whiteout);
|
||||
goto out_d_drop;
|
||||
}
|
||||
|
||||
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *dir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (upper == ovl_dentry_upper(dentry)) {
|
||||
if (is_dir)
|
||||
err = vfs_rmdir(dir, upper);
|
||||
else
|
||||
err = vfs_unlink(dir, upper, NULL);
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
}
|
||||
dput(upper);
|
||||
|
||||
/*
|
||||
* Keeping this dentry hashed would mean having to release
|
||||
* upperpath/lowerpath, which could only be done if we are the
|
||||
* sole user of this dentry. Too tricky... Just unhash for
|
||||
* now.
|
||||
*/
|
||||
if (!err)
|
||||
d_drop(dentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&dir->i_mutex);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
||||
{
|
||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
||||
|
||||
if (check_sticky(dir, inode))
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
enum ovl_path_type type;
|
||||
int err;
|
||||
|
||||
err = ovl_check_sticky(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
type = ovl_path_type(dentry);
|
||||
if (OVL_TYPE_PURE_UPPER(type)) {
|
||||
err = ovl_remove_upper(dentry, is_dir);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_remove_and_whiteout(dentry, is_dir);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, false);
|
||||
}
|
||||
|
||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, true);
|
||||
}
|
||||
|
||||
static int ovl_rename2(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type old_type;
|
||||
enum ovl_path_type new_type;
|
||||
struct dentry *old_upperdir;
|
||||
struct dentry *new_upperdir;
|
||||
struct dentry *olddentry;
|
||||
struct dentry *newdentry;
|
||||
struct dentry *trap;
|
||||
bool old_opaque;
|
||||
bool new_opaque;
|
||||
bool new_create = false;
|
||||
bool cleanup_whiteout = false;
|
||||
bool overwrite = !(flags & RENAME_EXCHANGE);
|
||||
bool is_dir = S_ISDIR(old->d_inode->i_mode);
|
||||
bool new_is_dir = false;
|
||||
struct dentry *opaquedir = NULL;
|
||||
const struct cred *old_cred = NULL;
|
||||
struct cred *override_cred = NULL;
|
||||
|
||||
err = -EINVAL;
|
||||
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
|
||||
goto out;
|
||||
|
||||
flags &= ~RENAME_NOREPLACE;
|
||||
|
||||
err = ovl_check_sticky(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* Don't copy up directory trees */
|
||||
old_type = ovl_path_type(old);
|
||||
err = -EXDEV;
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
|
||||
goto out;
|
||||
|
||||
if (new->d_inode) {
|
||||
err = ovl_check_sticky(new);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (S_ISDIR(new->d_inode->i_mode))
|
||||
new_is_dir = true;
|
||||
|
||||
new_type = ovl_path_type(new);
|
||||
err = -EXDEV;
|
||||
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
|
||||
goto out;
|
||||
|
||||
err = 0;
|
||||
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_lower(old)->d_inode ==
|
||||
ovl_dentry_lower(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_upper(old)->d_inode ==
|
||||
ovl_dentry_upper(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (ovl_dentry_is_opaque(new))
|
||||
new_type = __OVL_PATH_UPPER;
|
||||
else
|
||||
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
|
||||
}
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(new->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
if (!overwrite) {
|
||||
err = ovl_copy_up(new);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
|
||||
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
}
|
||||
|
||||
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
|
||||
opaquedir = ovl_check_empty_and_clear(new);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir)) {
|
||||
opaquedir = NULL;
|
||||
goto out_revert_creds;
|
||||
}
|
||||
}
|
||||
|
||||
if (overwrite) {
|
||||
if (old_opaque) {
|
||||
if (new->d_inode || !new_opaque) {
|
||||
/* Whiteout source */
|
||||
flags |= RENAME_WHITEOUT;
|
||||
} else {
|
||||
/* Switch whiteouts */
|
||||
flags |= RENAME_EXCHANGE;
|
||||
}
|
||||
} else if (is_dir && !new->d_inode && new_opaque) {
|
||||
flags |= RENAME_EXCHANGE;
|
||||
cleanup_whiteout = true;
|
||||
}
|
||||
}
|
||||
|
||||
old_upperdir = ovl_dentry_upper(old->d_parent);
|
||||
new_upperdir = ovl_dentry_upper(new->d_parent);
|
||||
|
||||
trap = lock_rename(new_upperdir, old_upperdir);
|
||||
|
||||
|
||||
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
|
||||
old->d_name.len);
|
||||
err = PTR_ERR(olddentry);
|
||||
if (IS_ERR(olddentry))
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (olddentry != ovl_dentry_upper(old))
|
||||
goto out_dput_old;
|
||||
|
||||
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
|
||||
new->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_dput_old;
|
||||
|
||||
err = -ESTALE;
|
||||
if (ovl_dentry_upper(new)) {
|
||||
if (opaquedir) {
|
||||
if (newdentry != opaquedir)
|
||||
goto out_dput;
|
||||
} else {
|
||||
if (newdentry != ovl_dentry_upper(new))
|
||||
goto out_dput;
|
||||
}
|
||||
} else {
|
||||
new_create = true;
|
||||
if (!d_is_negative(newdentry) &&
|
||||
(!new_opaque || !ovl_is_whiteout(newdentry)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (olddentry == trap)
|
||||
goto out_dput;
|
||||
if (newdentry == trap)
|
||||
goto out_dput;
|
||||
|
||||
if (is_dir && !old_opaque && new_opaque) {
|
||||
err = ovl_set_opaque(olddentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
flags);
|
||||
} else {
|
||||
/* No debug for the plain case */
|
||||
BUG_ON(flags & ~RENAME_EXCHANGE);
|
||||
err = vfs_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
NULL, flags);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
if (is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
|
||||
if (old_opaque != new_opaque) {
|
||||
ovl_dentry_set_opaque(old, new_opaque);
|
||||
if (!overwrite)
|
||||
ovl_dentry_set_opaque(new, old_opaque);
|
||||
}
|
||||
|
||||
if (cleanup_whiteout)
|
||||
ovl_cleanup(old_upperdir->d_inode, newdentry);
|
||||
|
||||
ovl_dentry_version_inc(old->d_parent);
|
||||
ovl_dentry_version_inc(new->d_parent);
|
||||
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_dput_old:
|
||||
dput(olddentry);
|
||||
out_unlock:
|
||||
unlock_rename(new_upperdir, old_upperdir);
|
||||
out_revert_creds:
|
||||
if (old_opaque || new_opaque) {
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
dput(opaquedir);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_rename(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new)
|
||||
{
|
||||
return ovl_rename2(olddir, old, newdir, new, 0);
|
||||
}
|
||||
|
||||
const struct inode_operations_wrapper ovl_dir_inode_operations = {
|
||||
.ops = {
|
||||
.lookup = ovl_lookup,
|
||||
.mkdir = ovl_mkdir,
|
||||
.symlink = ovl_symlink,
|
||||
.unlink = ovl_unlink,
|
||||
.rmdir = ovl_rmdir,
|
||||
.rename = ovl_rename,
|
||||
.link = ovl_link,
|
||||
.setattr = ovl_setattr,
|
||||
.create = ovl_create,
|
||||
.mknod = ovl_mknod,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_dir_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
},
|
||||
.rename2 = ovl_rename2,
|
||||
};
|
||||
@ -1,442 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
|
||||
bool no_data)
|
||||
{
|
||||
int err;
|
||||
struct dentry *parent;
|
||||
struct kstat stat;
|
||||
struct path lowerpath;
|
||||
|
||||
parent = dget_parent(dentry);
|
||||
err = ovl_copy_up(parent);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
ovl_path_lower(dentry, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
if (no_data)
|
||||
stat.size = 0;
|
||||
|
||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
|
||||
|
||||
out_dput_parent:
|
||||
dput(parent);
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct path realpath;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
return vfs_getattr(&realpath, stat);
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct ovl_entry *oe;
|
||||
struct dentry *alias = NULL;
|
||||
struct inode *realinode;
|
||||
struct dentry *realdentry;
|
||||
bool is_upper;
|
||||
int err;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
oe = inode->i_private;
|
||||
} else if (mask & MAY_NOT_BLOCK) {
|
||||
return -ECHILD;
|
||||
} else {
|
||||
/*
|
||||
* For non-directories find an alias and get the info
|
||||
* from there.
|
||||
*/
|
||||
alias = d_find_any_alias(inode);
|
||||
if (WARN_ON(!alias))
|
||||
return -ENOENT;
|
||||
|
||||
oe = alias->d_fsdata;
|
||||
}
|
||||
|
||||
realdentry = ovl_entry_real(oe, &is_upper);
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
realinode = ACCESS_ONCE(realdentry->d_inode);
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
err = -ENOENT;
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (mask & MAY_WRITE) {
|
||||
umode_t mode = realinode->i_mode;
|
||||
|
||||
/*
|
||||
* Writes will always be redirected to upper layer, so
|
||||
* ignore lower layer being read-only.
|
||||
*
|
||||
* If the overlay itself is read-only then proceed
|
||||
* with the permission check, don't return EROFS.
|
||||
* This will only happen if this is the lower layer of
|
||||
* another overlayfs.
|
||||
*
|
||||
* If upper fs becomes read-only after the overlay was
|
||||
* constructed return EROFS to prevent modification of
|
||||
* upper layer.
|
||||
*/
|
||||
err = -EROFS;
|
||||
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
|
||||
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
err = __inode_permission(realinode, mask);
|
||||
out_dput:
|
||||
dput(alias);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
struct ovl_link_data {
|
||||
struct dentry *realdentry;
|
||||
void *cookie;
|
||||
};
|
||||
|
||||
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
void *ret;
|
||||
struct dentry *realdentry;
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = NULL;
|
||||
|
||||
realdentry = ovl_dentry_real(dentry);
|
||||
realinode = realdentry->d_inode;
|
||||
|
||||
if (WARN_ON(!realinode->i_op->follow_link))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
if (realinode->i_op->put_link) {
|
||||
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
|
||||
if (!data)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
data->realdentry = realdentry;
|
||||
}
|
||||
|
||||
ret = realinode->i_op->follow_link(realdentry, nd);
|
||||
if (IS_ERR(ret)) {
|
||||
kfree(data);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (data)
|
||||
data->cookie = ret;
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
|
||||
{
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = c;
|
||||
|
||||
if (!data)
|
||||
return;
|
||||
|
||||
realinode = data->realdentry->d_inode;
|
||||
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
|
||||
{
|
||||
struct path realpath;
|
||||
struct inode *realinode;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
realinode = realpath.dentry->d_inode;
|
||||
|
||||
if (!realinode->i_op->readlink)
|
||||
return -EINVAL;
|
||||
|
||||
touch_atime(&realpath);
|
||||
|
||||
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
|
||||
}
|
||||
|
||||
|
||||
static bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
||||
}
|
||||
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -EPERM;
|
||||
if (ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
err = vfs_setxattr(upperdentry, name, value, size, flags);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
||||
enum ovl_path_type type)
|
||||
{
|
||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
||||
return S_ISDIR(dentry->d_inode->i_mode);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size)
|
||||
{
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
return -ENODATA;
|
||||
|
||||
return vfs_getxattr(realpath.dentry, name, value, size);
|
||||
}
|
||||
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
|
||||
{
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
ssize_t res;
|
||||
int off;
|
||||
|
||||
res = vfs_listxattr(realpath.dentry, list, size);
|
||||
if (res <= 0 || size == 0)
|
||||
return res;
|
||||
|
||||
if (!ovl_need_xattr_filter(dentry, type))
|
||||
return res;
|
||||
|
||||
/* filter out private xattrs */
|
||||
for (off = 0; off < res;) {
|
||||
char *s = list + off;
|
||||
size_t slen = strlen(s) + 1;
|
||||
|
||||
BUG_ON(off + slen > res);
|
||||
|
||||
if (ovl_is_private_xattr(s)) {
|
||||
res -= slen;
|
||||
memmove(s, s + slen, res - off);
|
||||
} else {
|
||||
off += slen;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -ENODATA;
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
|
||||
err = vfs_removexattr(realpath.dentry, name);
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
||||
struct dentry *realdentry)
|
||||
{
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
return false;
|
||||
|
||||
if (special_file(realdentry->d_inode->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
|
||||
const struct cred *cred)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type;
|
||||
bool want_write = false;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
if (!ovl_is_nocopyupw(dentry)) {
|
||||
if (ovl_open_need_copy_up(file->f_flags, type,
|
||||
realpath.dentry)) {
|
||||
want_write = true;
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (file->f_flags & O_TRUNC)
|
||||
err = ovl_copy_up_last(dentry, NULL, true);
|
||||
else
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
}
|
||||
|
||||
err = vfs_open(&realpath, file, cred);
|
||||
out_drop_write:
|
||||
if (want_write)
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static const struct inode_operations_wrapper ovl_file_inode_operations = {
|
||||
.ops = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
},
|
||||
.dentry_open = ovl_dentry_open,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.follow_link = ovl_follow_link,
|
||||
.put_link = ovl_put_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe)
|
||||
{
|
||||
struct inode *inode;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return NULL;
|
||||
|
||||
mode &= S_IFMT;
|
||||
|
||||
inode->i_ino = get_next_ino();
|
||||
inode->i_mode = mode;
|
||||
inode->i_flags |= S_NOATIME | S_NOCMTIME;
|
||||
|
||||
switch (mode) {
|
||||
case S_IFDIR:
|
||||
inode->i_private = oe;
|
||||
inode->i_op = &ovl_dir_inode_operations.ops;
|
||||
inode->i_fop = &ovl_dir_operations;
|
||||
inode->i_flags |= S_IOPS_WRAPPER;
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
inode->i_op = &ovl_symlink_inode_operations;
|
||||
break;
|
||||
|
||||
case S_IFREG:
|
||||
case S_IFSOCK:
|
||||
case S_IFBLK:
|
||||
case S_IFCHR:
|
||||
case S_IFIFO:
|
||||
inode->i_op = &ovl_file_inode_operations.ops;
|
||||
inode->i_flags |= S_IOPS_WRAPPER;
|
||||
break;
|
||||
|
||||
default:
|
||||
WARN(1, "illegal file type: %i\n", mode);
|
||||
iput(inode);
|
||||
inode = NULL;
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
@ -1,200 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
struct ovl_entry;
|
||||
|
||||
enum ovl_path_type {
|
||||
__OVL_PATH_PURE = (1 << 0),
|
||||
__OVL_PATH_UPPER = (1 << 1),
|
||||
__OVL_PATH_MERGE = (1 << 2),
|
||||
};
|
||||
|
||||
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
|
||||
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
|
||||
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
|
||||
#define OVL_TYPE_MERGE_OR_LOWER(type) \
|
||||
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
|
||||
|
||||
#define OVL_XATTR_PRE_NAME "trusted.overlay."
|
||||
#define OVL_XATTR_PRE_LEN 16
|
||||
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
|
||||
|
||||
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_rmdir(dir, dentry);
|
||||
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_unlink(dir, dentry, NULL);
|
||||
pr_debug("unlink(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
|
||||
struct dentry *new_dentry, bool debug)
|
||||
{
|
||||
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
|
||||
if (debug) {
|
||||
pr_debug("link(%pd2, %pd2) = %i\n",
|
||||
old_dentry, new_dentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_create(dir, dentry, mode, true);
|
||||
if (debug)
|
||||
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_mkdir(dir, dentry, mode);
|
||||
if (debug)
|
||||
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t dev, bool debug)
|
||||
{
|
||||
int err = vfs_mknod(dir, dentry, mode, dev);
|
||||
if (debug) {
|
||||
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
|
||||
dentry, mode, dev, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *oldname, bool debug)
|
||||
{
|
||||
int err = vfs_symlink(dir, dentry, oldname);
|
||||
if (debug)
|
||||
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err = vfs_setxattr(dentry, name, value, size, flags);
|
||||
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
|
||||
dentry, name, (int) size, (char *) value, flags, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err = vfs_removexattr(dentry, name);
|
||||
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
|
||||
struct inode *newdir, struct dentry *newdentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
|
||||
olddentry, newdentry, flags);
|
||||
|
||||
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
|
||||
|
||||
if (err) {
|
||||
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
|
||||
olddentry, newdentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_whiteout(dir, dentry);
|
||||
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
bool ovl_is_nocopyupw(struct dentry *dentry);
|
||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
||||
void ovl_dentry_version_inc(struct dentry *dentry);
|
||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
||||
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
|
||||
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
|
||||
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
|
||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
||||
int ovl_want_write(struct dentry *dentry);
|
||||
void ovl_drop_write(struct dentry *dentry);
|
||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
||||
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
|
||||
bool ovl_is_whiteout(struct dentry *dentry);
|
||||
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
|
||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags);
|
||||
struct file *ovl_path_open(struct path *path, int flags);
|
||||
|
||||
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
|
||||
struct kstat *stat, const char *link);
|
||||
|
||||
/* readdir.c */
|
||||
extern const struct file_operations ovl_dir_operations;
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
||||
void ovl_cache_free(struct list_head *list);
|
||||
|
||||
/* inode.c */
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ovl_permission(struct inode *inode, int mask);
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size);
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name);
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe);
|
||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
||||
{
|
||||
to->i_uid = from->i_uid;
|
||||
to->i_gid = from->i_gid;
|
||||
}
|
||||
|
||||
/* dir.c */
|
||||
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug);
|
||||
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
||||
|
||||
/* copy_up.c */
|
||||
int ovl_copy_up(struct dentry *dentry);
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr);
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
|
||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
||||
@ -1,626 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/version.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
struct ovl_cache_entry {
|
||||
unsigned int len;
|
||||
unsigned int type;
|
||||
u64 ino;
|
||||
struct list_head l_node;
|
||||
struct rb_node node;
|
||||
struct ovl_cache_entry *next_maybe_whiteout;
|
||||
bool is_whiteout;
|
||||
char name[];
|
||||
};
|
||||
|
||||
struct ovl_dir_cache {
|
||||
long refcount;
|
||||
u64 version;
|
||||
struct list_head entries;
|
||||
};
|
||||
|
||||
/* vfs_readdir vs. iterate_dir compat */
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0) || \
|
||||
(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5))
|
||||
#define USE_ITERATE_DIR 1
|
||||
#endif
|
||||
|
||||
#ifndef USE_ITERATE_DIR
|
||||
struct dir_context {
|
||||
const filldir_t actor;
|
||||
//loff_t pos;
|
||||
};
|
||||
#endif
|
||||
|
||||
struct ovl_readdir_data {
|
||||
struct dir_context ctx;
|
||||
bool is_merge;
|
||||
struct rb_root root;
|
||||
struct list_head *list;
|
||||
struct list_head middle;
|
||||
struct ovl_cache_entry *first_maybe_whiteout;
|
||||
int count;
|
||||
int err;
|
||||
};
|
||||
|
||||
struct ovl_dir_file {
|
||||
bool is_real;
|
||||
bool is_upper;
|
||||
struct ovl_dir_cache *cache;
|
||||
struct list_head *cursor;
|
||||
struct file *realfile;
|
||||
struct file *upperfile;
|
||||
};
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
|
||||
{
|
||||
return container_of(n, struct ovl_cache_entry, node);
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
|
||||
const char *name, int len)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
int cmp;
|
||||
|
||||
while (node) {
|
||||
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
|
||||
|
||||
cmp = strncmp(name, p->name, len);
|
||||
if (cmp > 0)
|
||||
node = p->node.rb_right;
|
||||
else if (cmp < 0 || len < p->len)
|
||||
node = p->node.rb_left;
|
||||
else
|
||||
return p;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
|
||||
const char *name, int len,
|
||||
u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
|
||||
|
||||
p = kmalloc(size, GFP_KERNEL);
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
memcpy(p->name, name, len);
|
||||
p->name[len] = '\0';
|
||||
p->len = len;
|
||||
p->type = d_type;
|
||||
p->ino = ino;
|
||||
p->is_whiteout = false;
|
||||
|
||||
if (d_type == DT_CHR) {
|
||||
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
|
||||
const char *name, int len, u64 ino,
|
||||
unsigned int d_type)
|
||||
{
|
||||
struct rb_node **newp = &rdd->root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
while (*newp) {
|
||||
int cmp;
|
||||
struct ovl_cache_entry *tmp;
|
||||
|
||||
parent = *newp;
|
||||
tmp = ovl_cache_entry_from_node(*newp);
|
||||
cmp = strncmp(name, tmp->name, len);
|
||||
if (cmp > 0)
|
||||
newp = &tmp->node.rb_right;
|
||||
else if (cmp < 0 || len < tmp->len)
|
||||
newp = &tmp->node.rb_left;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
|
||||
if (p == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
list_add_tail(&p->l_node, rdd->list);
|
||||
rb_link_node(&p->node, parent, newp);
|
||||
rb_insert_color(&p->node, &rdd->root);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
|
||||
const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
p = ovl_cache_entry_find(&rdd->root, name, namelen);
|
||||
if (p) {
|
||||
list_move_tail(&p->l_node, &rdd->middle);
|
||||
} else {
|
||||
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
|
||||
if (p == NULL)
|
||||
rdd->err = -ENOMEM;
|
||||
else
|
||||
list_add_tail(&p->l_node, &rdd->middle);
|
||||
}
|
||||
|
||||
return rdd->err;
|
||||
}
|
||||
|
||||
void ovl_cache_free(struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
struct ovl_cache_entry *n;
|
||||
|
||||
list_for_each_entry_safe(p, n, list, l_node)
|
||||
kfree(p);
|
||||
|
||||
INIT_LIST_HEAD(list);
|
||||
}
|
||||
|
||||
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
|
||||
{
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
|
||||
WARN_ON(cache->refcount <= 0);
|
||||
cache->refcount--;
|
||||
if (!cache->refcount) {
|
||||
if (ovl_dir_cache(dentry) == cache)
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_fill_merge(void *buf, const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct dir_context *ctx = buf;
|
||||
struct ovl_readdir_data *rdd =
|
||||
container_of(ctx, struct ovl_readdir_data, ctx);
|
||||
|
||||
rdd->count++;
|
||||
if (!rdd->is_merge)
|
||||
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
|
||||
else
|
||||
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
|
||||
}
|
||||
|
||||
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
struct dentry *dentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* CAP_DAC_OVERRIDE for lookup
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = mutex_lock_killable(&dir->d_inode->i_mutex);
|
||||
if (!err) {
|
||||
while (rdd->first_maybe_whiteout) {
|
||||
p = rdd->first_maybe_whiteout;
|
||||
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
|
||||
dentry = lookup_one_len(p->name, dir, p->len);
|
||||
if (!IS_ERR(dentry)) {
|
||||
p->is_whiteout = ovl_is_whiteout(dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
}
|
||||
mutex_unlock(&dir->d_inode->i_mutex);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_dir_read(struct path *realpath,
|
||||
struct ovl_readdir_data *rdd)
|
||||
{
|
||||
struct file *realfile;
|
||||
int err;
|
||||
|
||||
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
|
||||
if (IS_ERR(realfile))
|
||||
return PTR_ERR(realfile);
|
||||
|
||||
rdd->first_maybe_whiteout = NULL;
|
||||
//rdd->ctx.pos = 0;
|
||||
do {
|
||||
rdd->count = 0;
|
||||
rdd->err = 0;
|
||||
#ifdef USE_ITERATE_DIR
|
||||
err = iterate_dir(realfile, &rdd->ctx);
|
||||
#else
|
||||
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
|
||||
#endif
|
||||
if (err >= 0)
|
||||
err = rdd->err;
|
||||
} while (!err && rdd->count);
|
||||
|
||||
if (!err && rdd->first_maybe_whiteout)
|
||||
err = ovl_check_whiteouts(realpath->dentry, rdd);
|
||||
|
||||
fput(realfile);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_dir_reset(struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
|
||||
ovl_cache_put(od, dentry);
|
||||
od->cache = NULL;
|
||||
od->cursor = NULL;
|
||||
}
|
||||
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
|
||||
if (od->is_real && OVL_TYPE_MERGE(type))
|
||||
od->is_real = false;
|
||||
}
|
||||
|
||||
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
struct ovl_readdir_data rdd = {
|
||||
.ctx.actor = ovl_fill_merge,
|
||||
.list = list,
|
||||
.root = RB_ROOT,
|
||||
.is_merge = false,
|
||||
};
|
||||
int idx, next;
|
||||
|
||||
for (idx = 0; idx != -1; idx = next) {
|
||||
next = ovl_path_next(idx, dentry, &realpath);
|
||||
|
||||
if (next != -1) {
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
if (err)
|
||||
break;
|
||||
} else {
|
||||
/*
|
||||
* Insert lowest layer entries before upper ones, this
|
||||
* allows offsets to be reasonably constant
|
||||
*/
|
||||
list_add(&rdd.middle, rdd.list);
|
||||
rdd.is_merge = true;
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
list_del(&rdd.middle);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
|
||||
{
|
||||
struct list_head *p;
|
||||
loff_t off = 0;
|
||||
|
||||
list_for_each(p, &od->cache->entries) {
|
||||
if (off >= pos)
|
||||
break;
|
||||
off++;
|
||||
}
|
||||
/* Cursor is safe since the cache is stable */
|
||||
od->cursor = p;
|
||||
}
|
||||
|
||||
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
|
||||
{
|
||||
int res;
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_dir_cache(dentry);
|
||||
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
|
||||
cache->refcount++;
|
||||
return cache;
|
||||
}
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
|
||||
if (!cache)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
cache->refcount = 1;
|
||||
INIT_LIST_HEAD(&cache->entries);
|
||||
|
||||
res = ovl_dir_read_merged(dentry, &cache->entries);
|
||||
if (res) {
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
cache->version = ovl_dentry_version_get(dentry);
|
||||
ovl_set_dir_cache(dentry, cache);
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
#ifdef USE_ITERATE_DIR
|
||||
struct iterate_wrapper {
|
||||
struct dir_context ctx;
|
||||
filldir_t actor;
|
||||
void *buf;
|
||||
};
|
||||
|
||||
static int ovl_wrap_readdir(void *ctx, const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct iterate_wrapper *w = ctx;
|
||||
|
||||
return w->actor(w->buf, name, namelen, offset, ino, d_type);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct ovl_cache_entry *p;
|
||||
int res;
|
||||
|
||||
if (!file->f_pos)
|
||||
ovl_dir_reset(file);
|
||||
|
||||
if (od->is_real) {
|
||||
#ifdef USE_ITERATE_DIR
|
||||
struct iterate_wrapper w = {
|
||||
.ctx.actor = ovl_wrap_readdir,
|
||||
.actor = filler,
|
||||
.buf = buf,
|
||||
};
|
||||
res = iterate_dir(od->realfile, &w.ctx);
|
||||
#else
|
||||
res = vfs_readdir(od->realfile, filler, buf);
|
||||
#endif
|
||||
file->f_pos = od->realfile->f_pos;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
if (!od->cache) {
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_cache_get(dentry);
|
||||
if (IS_ERR(cache))
|
||||
return PTR_ERR(cache);
|
||||
|
||||
od->cache = cache;
|
||||
ovl_seek_cursor(od, file->f_pos);
|
||||
}
|
||||
|
||||
while (od->cursor != &od->cache->entries) {
|
||||
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
|
||||
if (!p->is_whiteout)
|
||||
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
|
||||
break;
|
||||
od->cursor = p->l_node.next;
|
||||
file->f_pos++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
|
||||
{
|
||||
loff_t res;
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
mutex_lock(&file_inode(file)->i_mutex);
|
||||
if (!file->f_pos)
|
||||
ovl_dir_reset(file);
|
||||
|
||||
if (od->is_real) {
|
||||
res = vfs_llseek(od->realfile, offset, origin);
|
||||
file->f_pos = od->realfile->f_pos;
|
||||
} else {
|
||||
res = -EINVAL;
|
||||
|
||||
switch (origin) {
|
||||
case SEEK_CUR:
|
||||
offset += file->f_pos;
|
||||
break;
|
||||
case SEEK_SET:
|
||||
break;
|
||||
default:
|
||||
goto out_unlock;
|
||||
}
|
||||
if (offset < 0)
|
||||
goto out_unlock;
|
||||
|
||||
if (offset != file->f_pos) {
|
||||
file->f_pos = offset;
|
||||
if (od->cache)
|
||||
ovl_seek_cursor(od, offset);
|
||||
}
|
||||
res = offset;
|
||||
}
|
||||
out_unlock:
|
||||
mutex_unlock(&file_inode(file)->i_mutex);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct file *realfile = od->realfile;
|
||||
|
||||
/*
|
||||
* Need to check if we started out being a lower dir, but got copied up
|
||||
*/
|
||||
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
|
||||
struct inode *inode = file_inode(file);
|
||||
|
||||
realfile = lockless_dereference(od->upperfile);
|
||||
if (!realfile) {
|
||||
struct path upperpath;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
realfile = ovl_path_open(&upperpath, O_RDONLY);
|
||||
smp_mb__before_spinlock();
|
||||
mutex_lock(&inode->i_mutex);
|
||||
if (!od->upperfile) {
|
||||
if (IS_ERR(realfile)) {
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->upperfile = realfile;
|
||||
} else {
|
||||
/* somebody has beaten us to it */
|
||||
if (!IS_ERR(realfile))
|
||||
fput(realfile);
|
||||
realfile = od->upperfile;
|
||||
}
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
}
|
||||
}
|
||||
|
||||
return vfs_fsync_range(realfile, start, end, datasync);
|
||||
}
|
||||
|
||||
static int ovl_dir_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
if (od->cache) {
|
||||
mutex_lock(&inode->i_mutex);
|
||||
ovl_cache_put(od, file->f_path.dentry);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
}
|
||||
fput(od->realfile);
|
||||
if (od->upperfile)
|
||||
fput(od->upperfile);
|
||||
kfree(od);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_dir_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct path realpath;
|
||||
struct file *realfile;
|
||||
struct ovl_dir_file *od;
|
||||
enum ovl_path_type type;
|
||||
|
||||
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
|
||||
if (!od)
|
||||
return -ENOMEM;
|
||||
|
||||
type = ovl_path_real(file->f_path.dentry, &realpath);
|
||||
realfile = ovl_path_open(&realpath, file->f_flags);
|
||||
if (IS_ERR(realfile)) {
|
||||
kfree(od);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->realfile = realfile;
|
||||
od->is_real = !OVL_TYPE_MERGE(type);
|
||||
od->is_upper = OVL_TYPE_UPPER(type);
|
||||
file->private_data = od;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct file_operations ovl_dir_operations = {
|
||||
.read = generic_read_dir,
|
||||
.open = ovl_dir_open,
|
||||
.readdir = ovl_readdir,
|
||||
.llseek = ovl_dir_llseek,
|
||||
.fsync = ovl_dir_fsync,
|
||||
.release = ovl_dir_release,
|
||||
};
|
||||
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
err = ovl_dir_read_merged(dentry, list);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = 0;
|
||||
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
if (p->is_whiteout)
|
||||
continue;
|
||||
|
||||
if (p->name[0] == '.') {
|
||||
if (p->len == 1)
|
||||
continue;
|
||||
if (p->len == 2 && p->name[1] == '.')
|
||||
continue;
|
||||
}
|
||||
err = -ENOTEMPTY;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
struct dentry *dentry;
|
||||
|
||||
if (!p->is_whiteout)
|
||||
continue;
|
||||
|
||||
dentry = lookup_one_len(p->name, upper, p->len);
|
||||
if (IS_ERR(dentry)) {
|
||||
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
|
||||
upper->d_name.name, p->len, p->name,
|
||||
(int) PTR_ERR(dentry));
|
||||
continue;
|
||||
}
|
||||
ovl_cleanup(upper->d_inode, dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
mutex_unlock(&upper->d_inode->i_mutex);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,416 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/namei.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
||||
{
|
||||
ssize_t list_size, size;
|
||||
char *buf, *name, *value;
|
||||
int error;
|
||||
|
||||
if (!old->d_inode->i_op->getxattr ||
|
||||
!new->d_inode->i_op->getxattr)
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
error = -ENOMEM;
|
||||
value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
|
||||
if (!value)
|
||||
goto out;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out_free_value;
|
||||
}
|
||||
|
||||
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
|
||||
size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
|
||||
if (size <= 0) {
|
||||
error = size;
|
||||
goto out_free_value;
|
||||
}
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
goto out_free_value;
|
||||
}
|
||||
|
||||
out_free_value:
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
|
||||
{
|
||||
struct file *old_file;
|
||||
struct file *new_file;
|
||||
loff_t old_pos = 0;
|
||||
loff_t new_pos = 0;
|
||||
int error = 0;
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
old_file = ovl_path_open(old, O_RDONLY);
|
||||
if (IS_ERR(old_file))
|
||||
return PTR_ERR(old_file);
|
||||
|
||||
new_file = ovl_path_open(new, O_WRONLY);
|
||||
if (IS_ERR(new_file)) {
|
||||
error = PTR_ERR(new_file);
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
/* FIXME: copy up sparse files efficiently */
|
||||
while (len) {
|
||||
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
|
||||
long bytes;
|
||||
|
||||
if (len < this_len)
|
||||
this_len = len;
|
||||
|
||||
if (signal_pending_state(TASK_KILLABLE, current)) {
|
||||
error = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
bytes = do_splice_direct(old_file, &old_pos,
|
||||
new_file, &new_pos,
|
||||
this_len, SPLICE_F_MOVE);
|
||||
if (bytes <= 0) {
|
||||
error = bytes;
|
||||
break;
|
||||
}
|
||||
WARN_ON(old_pos != new_pos);
|
||||
|
||||
len -= bytes;
|
||||
}
|
||||
|
||||
fput(new_file);
|
||||
out_fput:
|
||||
fput(old_file);
|
||||
return error;
|
||||
}
|
||||
|
||||
static char *ovl_read_symlink(struct dentry *realdentry)
|
||||
{
|
||||
int res;
|
||||
char *buf;
|
||||
struct inode *inode = realdentry->d_inode;
|
||||
mm_segment_t old_fs;
|
||||
|
||||
res = -EINVAL;
|
||||
if (!inode->i_op->readlink)
|
||||
goto err;
|
||||
|
||||
res = -ENOMEM;
|
||||
buf = (char *) __get_free_page(GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto err;
|
||||
|
||||
old_fs = get_fs();
|
||||
set_fs(get_ds());
|
||||
/* The cast to a user pointer is valid due to the set_fs() */
|
||||
res = inode->i_op->readlink(realdentry,
|
||||
(char __user *)buf, PAGE_SIZE - 1);
|
||||
set_fs(old_fs);
|
||||
if (res < 0) {
|
||||
free_page((unsigned long) buf);
|
||||
goto err;
|
||||
}
|
||||
buf[res] = '\0';
|
||||
|
||||
return buf;
|
||||
|
||||
err:
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
|
||||
struct dentry *dentry, struct path *lowerpath,
|
||||
struct kstat *stat, struct iattr *attr,
|
||||
const char *link)
|
||||
{
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *upper = NULL;
|
||||
umode_t mode = stat->mode;
|
||||
int err;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out1;
|
||||
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
stat->mode &= S_IFMT;
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
|
||||
stat->mode = mode;
|
||||
if (err)
|
||||
goto out2;
|
||||
|
||||
if (S_ISREG(stat->mode)) {
|
||||
struct path upperpath;
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = newdentry;
|
||||
|
||||
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&newdentry->d_inode->i_mutex);
|
||||
err = ovl_set_attr(newdentry, stat);
|
||||
if (!err && attr)
|
||||
err = notify_change(newdentry, attr, NULL);
|
||||
mutex_unlock(&newdentry->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
newdentry = NULL;
|
||||
|
||||
/*
|
||||
* Non-directores become opaque when copied up.
|
||||
*/
|
||||
if (!S_ISDIR(stat->mode))
|
||||
ovl_dentry_set_opaque(dentry, true);
|
||||
out2:
|
||||
dput(upper);
|
||||
out1:
|
||||
dput(newdentry);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* Directory renames only allowed on "pure upper" (already created on
|
||||
* upper filesystem, never copied up). Directories which are on lower or
|
||||
* are merged may not be renamed. For these -EXDEV is returned and
|
||||
* userspace has to deal with it. This means, when copying up a
|
||||
* directory we can rely on it and ancestors being stable.
|
||||
*
|
||||
* Non-directory renames start with copy up of source if necessary. The
|
||||
* actual rename will only proceed once the copy up was successful. Copy
|
||||
* up uses upper parent i_mutex for exclusion. Since rename can change
|
||||
* d_parent it is possible that the copy up will lock the old parent. At
|
||||
* that point the file will have already been copied up anyway.
|
||||
*/
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
int err;
|
||||
struct kstat pstat;
|
||||
struct path parentpath;
|
||||
struct dentry *upperdir;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
char *link = NULL;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
upperdir = parentpath.dentry;
|
||||
|
||||
err = vfs_getattr(&parentpath, &pstat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (S_ISLNK(stat->mode)) {
|
||||
link = ovl_read_symlink(lowerpath->dentry);
|
||||
if (IS_ERR(link))
|
||||
return PTR_ERR(link);
|
||||
}
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_free_link;
|
||||
|
||||
override_cred->fsuid = stat->uid;
|
||||
override_cred->fsgid = stat->gid;
|
||||
/*
|
||||
* CAP_SYS_ADMIN for copying up extended attributes
|
||||
* CAP_DAC_OVERRIDE for create
|
||||
* CAP_FOWNER for chmod, timestamp update
|
||||
* CAP_FSETID for chmod
|
||||
* CAP_CHOWN for chown
|
||||
* CAP_MKNOD for mknod
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
cap_raise(override_cred->cap_effective, CAP_MKNOD);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = -EIO;
|
||||
if (lock_rename(workdir, upperdir) != NULL) {
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
goto out_unlock;
|
||||
}
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
unlock_rename(workdir, upperdir);
|
||||
err = 0;
|
||||
/* Raced with another copy-up? Do the setattr here */
|
||||
if (attr) {
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
}
|
||||
goto out_put_cred;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
|
||||
stat, attr, link);
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &pstat);
|
||||
}
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_put_cred:
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
|
||||
out_free_link:
|
||||
if (link)
|
||||
free_page((unsigned long) link);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = 0;
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
type = ovl_path_type(parent);
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
ovl_path_lower(next, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (!err)
|
||||
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
@ -1,951 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
dget(wdentry);
|
||||
if (d_is_dir(wdentry))
|
||||
err = ovl_do_rmdir(wdir, wdentry);
|
||||
else
|
||||
err = ovl_do_unlink(wdir, wdentry);
|
||||
dput(wdentry);
|
||||
|
||||
if (err) {
|
||||
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
|
||||
wdentry, err);
|
||||
}
|
||||
}
|
||||
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
|
||||
{
|
||||
struct dentry *temp;
|
||||
char name[20];
|
||||
|
||||
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
|
||||
|
||||
temp = lookup_one_len(name, workdir, strlen(name));
|
||||
if (!IS_ERR(temp) && temp->d_inode) {
|
||||
pr_err("overlayfs: workdir/%s already exists\n", name);
|
||||
dput(temp);
|
||||
temp = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
/* caller holds i_mutex on workdir */
|
||||
static struct dentry *ovl_whiteout(struct dentry *workdir,
|
||||
struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *whiteout;
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
|
||||
whiteout = ovl_lookup_temp(workdir, dentry);
|
||||
if (IS_ERR(whiteout))
|
||||
return whiteout;
|
||||
|
||||
err = ovl_do_whiteout(wdir, whiteout);
|
||||
if (err) {
|
||||
dput(whiteout);
|
||||
whiteout = ERR_PTR(err);
|
||||
}
|
||||
|
||||
return whiteout;
|
||||
}
|
||||
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (newdentry->d_inode)
|
||||
return -ESTALE;
|
||||
|
||||
if (hardlink) {
|
||||
err = ovl_do_link(hardlink, dir, newdentry, debug);
|
||||
} else {
|
||||
switch (stat->mode & S_IFMT) {
|
||||
case S_IFREG:
|
||||
err = ovl_do_create(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFDIR:
|
||||
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
|
||||
break;
|
||||
|
||||
case S_IFCHR:
|
||||
case S_IFBLK:
|
||||
case S_IFIFO:
|
||||
case S_IFSOCK:
|
||||
err = ovl_do_mknod(dir, newdentry,
|
||||
stat->mode, stat->rdev, debug);
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
err = ovl_do_symlink(dir, newdentry, link, debug);
|
||||
break;
|
||||
|
||||
default:
|
||||
err = -EPERM;
|
||||
}
|
||||
}
|
||||
if (!err && WARN_ON(!newdentry->d_inode)) {
|
||||
/*
|
||||
* Not quite sure if non-instantiated dentry is legal or not.
|
||||
* VFS doesn't seem to care so check and warn here.
|
||||
*/
|
||||
err = -ENOENT;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_set_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
|
||||
}
|
||||
|
||||
static void ovl_remove_opaque(struct dentry *upperdentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
|
||||
if (err) {
|
||||
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
|
||||
upperdentry->d_name.name, err);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, stat);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
|
||||
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
mutex_unlock(&udir->i_mutex);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_lock_rename_workdir(struct dentry *workdir,
|
||||
struct dentry *upperdir)
|
||||
{
|
||||
/* Workdir should not be the same as upperdir */
|
||||
if (workdir == upperdir)
|
||||
goto err;
|
||||
|
||||
/* Workdir should not be subdir of upperdir and vice versa */
|
||||
if (lock_rename(workdir, upperdir) != NULL)
|
||||
goto err_unlock;
|
||||
|
||||
return 0;
|
||||
|
||||
err_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
err:
|
||||
pr_err("overlayfs: failed to lock workdir+upperdir\n");
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_clear_empty(struct dentry *dentry,
|
||||
struct list_head *list)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct path upperpath;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir;
|
||||
struct kstat stat;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return ERR_PTR(-EROFS);
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
err = vfs_getattr(&upperpath, &stat);
|
||||
if (err)
|
||||
goto out_unlock;
|
||||
|
||||
err = -ESTALE;
|
||||
if (!S_ISDIR(stat.mode))
|
||||
goto out_unlock;
|
||||
upper = upperpath.dentry;
|
||||
if (upper->d_parent->d_inode != udir)
|
||||
goto out_unlock;
|
||||
|
||||
opaquedir = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out_unlock;
|
||||
|
||||
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_copy_xattr(upper, opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_set_opaque(opaquedir);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
mutex_lock(&opaquedir->d_inode->i_mutex);
|
||||
err = ovl_set_attr(opaquedir, &stat);
|
||||
mutex_unlock(&opaquedir->d_inode->i_mutex);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup_whiteouts(upper, list);
|
||||
ovl_cleanup(wdir, upper);
|
||||
unlock_rename(workdir, upperdir);
|
||||
|
||||
/* dentry's upper doesn't match now, get rid of it */
|
||||
d_drop(dentry);
|
||||
|
||||
return opaquedir;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, opaquedir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *ret = NULL;
|
||||
LIST_HEAD(list);
|
||||
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
if (err)
|
||||
ret = ERR_PTR(err);
|
||||
else {
|
||||
/*
|
||||
* If no upperdentry then skip clearing whiteouts.
|
||||
*
|
||||
* Can race with copy-up, since we don't hold the upperdir
|
||||
* mutex. Doesn't matter, since copy-up can't create a
|
||||
* non-empty directory from an empty one.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry))
|
||||
ret = ovl_clear_empty(dentry, &list);
|
||||
}
|
||||
|
||||
ovl_cache_free(&list);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *upper;
|
||||
struct dentry *newdentry;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
newdentry = ovl_lookup_temp(workdir, dentry);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto out_dput;
|
||||
|
||||
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
|
||||
if (err)
|
||||
goto out_dput2;
|
||||
|
||||
if (S_ISDIR(stat->mode)) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper,
|
||||
RENAME_EXCHANGE);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
|
||||
ovl_cleanup(wdir, upper);
|
||||
} else {
|
||||
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
|
||||
if (err)
|
||||
goto out_cleanup;
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
ovl_dentry_update(dentry, newdentry);
|
||||
ovl_copyattr(newdentry->d_inode, inode);
|
||||
d_instantiate(dentry, inode);
|
||||
newdentry = NULL;
|
||||
out_dput2:
|
||||
dput(upper);
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
out_cleanup:
|
||||
ovl_cleanup(wdir, newdentry);
|
||||
goto out_dput2;
|
||||
}
|
||||
|
||||
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link, struct dentry *hardlink)
|
||||
{
|
||||
int err;
|
||||
struct inode *inode;
|
||||
struct kstat stat = {
|
||||
.mode = mode,
|
||||
.rdev = rdev,
|
||||
};
|
||||
|
||||
err = -ENOMEM;
|
||||
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
|
||||
if (!inode)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_iput;
|
||||
|
||||
if (!ovl_dentry_is_opaque(dentry)) {
|
||||
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_iput;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting opaque xattr
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
|
||||
hardlink);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
|
||||
if (!err)
|
||||
inode = NULL;
|
||||
out_iput:
|
||||
iput(inode);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
|
||||
const char *link)
|
||||
{
|
||||
int err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
bool excl)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
|
||||
{
|
||||
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
|
||||
}
|
||||
|
||||
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
dev_t rdev)
|
||||
{
|
||||
/* Don't allow creation of "whiteout" on overlay */
|
||||
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
|
||||
return -EPERM;
|
||||
|
||||
return ovl_create_object(dentry, mode, rdev, NULL);
|
||||
}
|
||||
|
||||
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *link)
|
||||
{
|
||||
return ovl_create_object(dentry, S_IFLNK, 0, link);
|
||||
}
|
||||
|
||||
static int ovl_link(struct dentry *old, struct inode *newdir,
|
||||
struct dentry *new)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upper = ovl_dentry_upper(old);
|
||||
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *workdir = ovl_workdir(dentry);
|
||||
struct inode *wdir = workdir->d_inode;
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *udir = upperdir->d_inode;
|
||||
struct dentry *whiteout;
|
||||
struct dentry *upper;
|
||||
struct dentry *opaquedir = NULL;
|
||||
int err;
|
||||
|
||||
if (WARN_ON(!workdir))
|
||||
return -EROFS;
|
||||
|
||||
if (is_dir) {
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
|
||||
opaquedir = ovl_check_empty_and_clear(dentry);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir))
|
||||
goto out;
|
||||
} else {
|
||||
LIST_HEAD(list);
|
||||
|
||||
/*
|
||||
* When removing an empty opaque directory, then it
|
||||
* makes no sense to replace it with an exact replica of
|
||||
* itself. But emptiness still needs to be checked.
|
||||
*/
|
||||
err = ovl_check_empty_dir(dentry, &list);
|
||||
ovl_cache_free(&list);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
err = ovl_lock_rename_workdir(workdir, upperdir);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
|
||||
whiteout = ovl_whiteout(workdir, dentry);
|
||||
err = PTR_ERR(whiteout);
|
||||
if (IS_ERR(whiteout))
|
||||
goto out_unlock;
|
||||
|
||||
upper = ovl_dentry_upper(dentry);
|
||||
if (!upper) {
|
||||
upper = lookup_one_len(dentry->d_name.name, upperdir,
|
||||
dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR(upper))
|
||||
goto kill_whiteout;
|
||||
|
||||
err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
|
||||
dput(upper);
|
||||
if (err)
|
||||
goto kill_whiteout;
|
||||
} else {
|
||||
int flags = 0;
|
||||
|
||||
if (opaquedir)
|
||||
upper = opaquedir;
|
||||
err = -ESTALE;
|
||||
if (upper->d_parent != upperdir)
|
||||
goto kill_whiteout;
|
||||
|
||||
if (is_dir)
|
||||
flags |= RENAME_EXCHANGE;
|
||||
|
||||
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
|
||||
if (err)
|
||||
goto kill_whiteout;
|
||||
|
||||
if (is_dir)
|
||||
ovl_cleanup(wdir, upper);
|
||||
}
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
out_d_drop:
|
||||
d_drop(dentry);
|
||||
dput(whiteout);
|
||||
out_unlock:
|
||||
unlock_rename(workdir, upperdir);
|
||||
out_dput:
|
||||
dput(opaquedir);
|
||||
out:
|
||||
return err;
|
||||
|
||||
kill_whiteout:
|
||||
ovl_cleanup(wdir, whiteout);
|
||||
goto out_d_drop;
|
||||
}
|
||||
|
||||
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
|
||||
struct inode *dir = upperdir->d_inode;
|
||||
struct dentry *upper = ovl_dentry_upper(dentry);
|
||||
int err;
|
||||
|
||||
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
|
||||
err = -ESTALE;
|
||||
if (upper->d_parent == upperdir) {
|
||||
/* Don't let d_delete() think it can reset d_inode */
|
||||
dget(upper);
|
||||
if (is_dir)
|
||||
err = vfs_rmdir(dir, upper);
|
||||
else
|
||||
err = vfs_unlink(dir, upper, NULL);
|
||||
dput(upper);
|
||||
ovl_dentry_version_inc(dentry->d_parent);
|
||||
}
|
||||
|
||||
/*
|
||||
* Keeping this dentry hashed would mean having to release
|
||||
* upperpath/lowerpath, which could only be done if we are the
|
||||
* sole user of this dentry. Too tricky... Just unhash for
|
||||
* now.
|
||||
*/
|
||||
d_drop(dentry);
|
||||
mutex_unlock(&dir->i_mutex);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_check_sticky(struct dentry *dentry)
|
||||
{
|
||||
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
|
||||
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
|
||||
|
||||
if (check_sticky(dir, inode))
|
||||
return -EPERM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
|
||||
{
|
||||
enum ovl_path_type type;
|
||||
int err;
|
||||
|
||||
err = ovl_check_sticky(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
type = ovl_path_type(dentry);
|
||||
if (OVL_TYPE_PURE_UPPER(type)) {
|
||||
err = ovl_remove_upper(dentry, is_dir);
|
||||
} else {
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir, rename
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
err = ovl_remove_and_whiteout(dentry, is_dir);
|
||||
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, false);
|
||||
}
|
||||
|
||||
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
return ovl_do_remove(dentry, true);
|
||||
}
|
||||
|
||||
static int ovl_rename2(struct inode *olddir, struct dentry *old,
|
||||
struct inode *newdir, struct dentry *new,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
enum ovl_path_type old_type;
|
||||
enum ovl_path_type new_type;
|
||||
struct dentry *old_upperdir;
|
||||
struct dentry *new_upperdir;
|
||||
struct dentry *olddentry;
|
||||
struct dentry *newdentry;
|
||||
struct dentry *trap;
|
||||
bool old_opaque;
|
||||
bool new_opaque;
|
||||
bool new_create = false;
|
||||
bool cleanup_whiteout = false;
|
||||
bool overwrite = !(flags & RENAME_EXCHANGE);
|
||||
bool is_dir = d_is_dir(old);
|
||||
bool new_is_dir = false;
|
||||
struct dentry *opaquedir = NULL;
|
||||
const struct cred *old_cred = NULL;
|
||||
struct cred *override_cred = NULL;
|
||||
|
||||
err = -EINVAL;
|
||||
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
|
||||
goto out;
|
||||
|
||||
flags &= ~RENAME_NOREPLACE;
|
||||
|
||||
err = ovl_check_sticky(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/* Don't copy up directory trees */
|
||||
old_type = ovl_path_type(old);
|
||||
err = -EXDEV;
|
||||
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
|
||||
goto out;
|
||||
|
||||
if (new->d_inode) {
|
||||
err = ovl_check_sticky(new);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (d_is_dir(new))
|
||||
new_is_dir = true;
|
||||
|
||||
new_type = ovl_path_type(new);
|
||||
err = -EXDEV;
|
||||
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
|
||||
goto out;
|
||||
|
||||
err = 0;
|
||||
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_lower(old)->d_inode ==
|
||||
ovl_dentry_lower(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
|
||||
if (ovl_dentry_upper(old)->d_inode ==
|
||||
ovl_dentry_upper(new)->d_inode)
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
if (ovl_dentry_is_opaque(new))
|
||||
new_type = __OVL_PATH_UPPER;
|
||||
else
|
||||
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
|
||||
}
|
||||
|
||||
err = ovl_want_write(old);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(old);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(new->d_parent);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
if (!overwrite) {
|
||||
err = ovl_copy_up(new);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
|
||||
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = -ENOMEM;
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred)
|
||||
goto out_drop_write;
|
||||
|
||||
/*
|
||||
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
|
||||
* CAP_DAC_OVERRIDE for create in workdir
|
||||
* CAP_FOWNER for removing whiteout from sticky dir
|
||||
* CAP_FSETID for chmod of opaque dir
|
||||
* CAP_CHOWN for chown of opaque dir
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
cap_raise(override_cred->cap_effective, CAP_FOWNER);
|
||||
cap_raise(override_cred->cap_effective, CAP_FSETID);
|
||||
cap_raise(override_cred->cap_effective, CAP_CHOWN);
|
||||
old_cred = override_creds(override_cred);
|
||||
}
|
||||
|
||||
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
|
||||
opaquedir = ovl_check_empty_and_clear(new);
|
||||
err = PTR_ERR(opaquedir);
|
||||
if (IS_ERR(opaquedir)) {
|
||||
opaquedir = NULL;
|
||||
goto out_revert_creds;
|
||||
}
|
||||
}
|
||||
|
||||
if (overwrite) {
|
||||
if (old_opaque) {
|
||||
if (new->d_inode || !new_opaque) {
|
||||
/* Whiteout source */
|
||||
flags |= RENAME_WHITEOUT;
|
||||
} else {
|
||||
/* Switch whiteouts */
|
||||
flags |= RENAME_EXCHANGE;
|
||||
}
|
||||
} else if (is_dir && !new->d_inode && new_opaque) {
|
||||
flags |= RENAME_EXCHANGE;
|
||||
cleanup_whiteout = true;
|
||||
}
|
||||
}
|
||||
|
||||
old_upperdir = ovl_dentry_upper(old->d_parent);
|
||||
new_upperdir = ovl_dentry_upper(new->d_parent);
|
||||
|
||||
trap = lock_rename(new_upperdir, old_upperdir);
|
||||
|
||||
olddentry = ovl_dentry_upper(old);
|
||||
newdentry = ovl_dentry_upper(new);
|
||||
if (newdentry) {
|
||||
if (opaquedir) {
|
||||
newdentry = opaquedir;
|
||||
opaquedir = NULL;
|
||||
} else {
|
||||
dget(newdentry);
|
||||
}
|
||||
} else {
|
||||
new_create = true;
|
||||
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
|
||||
new->d_name.len);
|
||||
err = PTR_ERR(newdentry);
|
||||
if (IS_ERR(newdentry))
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
err = -ESTALE;
|
||||
if (olddentry->d_parent != old_upperdir)
|
||||
goto out_dput;
|
||||
if (newdentry->d_parent != new_upperdir)
|
||||
goto out_dput;
|
||||
if (olddentry == trap)
|
||||
goto out_dput;
|
||||
if (newdentry == trap)
|
||||
goto out_dput;
|
||||
|
||||
if (is_dir && !old_opaque && new_opaque) {
|
||||
err = ovl_set_opaque(olddentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
|
||||
err = ovl_set_opaque(newdentry);
|
||||
if (err)
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (old_opaque || new_opaque) {
|
||||
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
flags);
|
||||
} else {
|
||||
/* No debug for the plain case */
|
||||
BUG_ON(flags & ~RENAME_EXCHANGE);
|
||||
err = vfs_rename(old_upperdir->d_inode, olddentry,
|
||||
new_upperdir->d_inode, newdentry,
|
||||
NULL, flags);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
if (is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (is_dir && old_opaque && !new_opaque)
|
||||
ovl_remove_opaque(olddentry);
|
||||
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
|
||||
ovl_remove_opaque(newdentry);
|
||||
|
||||
if (old_opaque != new_opaque) {
|
||||
ovl_dentry_set_opaque(old, new_opaque);
|
||||
if (!overwrite)
|
||||
ovl_dentry_set_opaque(new, old_opaque);
|
||||
}
|
||||
|
||||
if (cleanup_whiteout)
|
||||
ovl_cleanup(old_upperdir->d_inode, newdentry);
|
||||
|
||||
ovl_dentry_version_inc(old->d_parent);
|
||||
ovl_dentry_version_inc(new->d_parent);
|
||||
|
||||
out_dput:
|
||||
dput(newdentry);
|
||||
out_unlock:
|
||||
unlock_rename(new_upperdir, old_upperdir);
|
||||
out_revert_creds:
|
||||
if (old_opaque || new_opaque) {
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
out_drop_write:
|
||||
ovl_drop_write(old);
|
||||
out:
|
||||
dput(opaquedir);
|
||||
return err;
|
||||
}
|
||||
|
||||
const struct inode_operations ovl_dir_inode_operations = {
|
||||
.lookup = ovl_lookup,
|
||||
.mkdir = ovl_mkdir,
|
||||
.symlink = ovl_symlink,
|
||||
.unlink = ovl_unlink,
|
||||
.rmdir = ovl_rmdir,
|
||||
.rename2 = ovl_rename2,
|
||||
.link = ovl_link,
|
||||
.setattr = ovl_setattr,
|
||||
.create = ovl_create,
|
||||
.mknod = ovl_mknod,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_dir_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
@ -1,438 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/xattr.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
|
||||
bool no_data)
|
||||
{
|
||||
int err;
|
||||
struct dentry *parent;
|
||||
struct kstat stat;
|
||||
struct path lowerpath;
|
||||
|
||||
parent = dget_parent(dentry);
|
||||
err = ovl_copy_up(parent);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
ovl_path_lower(dentry, &lowerpath);
|
||||
err = vfs_getattr(&lowerpath, &stat);
|
||||
if (err)
|
||||
goto out_dput_parent;
|
||||
|
||||
if (no_data)
|
||||
stat.size = 0;
|
||||
|
||||
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
|
||||
|
||||
out_dput_parent:
|
||||
dput(parent);
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
if (upperdentry) {
|
||||
mutex_lock(&upperdentry->d_inode->i_mutex);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
mutex_unlock(&upperdentry->d_inode->i_mutex);
|
||||
} else {
|
||||
err = ovl_copy_up_last(dentry, attr, false);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
|
||||
struct kstat *stat)
|
||||
{
|
||||
struct path realpath;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
return vfs_getattr(&realpath, stat);
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct ovl_entry *oe;
|
||||
struct dentry *alias = NULL;
|
||||
struct inode *realinode;
|
||||
struct dentry *realdentry;
|
||||
bool is_upper;
|
||||
int err;
|
||||
|
||||
if (S_ISDIR(inode->i_mode)) {
|
||||
oe = inode->i_private;
|
||||
} else if (mask & MAY_NOT_BLOCK) {
|
||||
return -ECHILD;
|
||||
} else {
|
||||
/*
|
||||
* For non-directories find an alias and get the info
|
||||
* from there.
|
||||
*/
|
||||
alias = d_find_any_alias(inode);
|
||||
if (WARN_ON(!alias))
|
||||
return -ENOENT;
|
||||
|
||||
oe = alias->d_fsdata;
|
||||
}
|
||||
|
||||
realdentry = ovl_entry_real(oe, &is_upper);
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
realinode = ACCESS_ONCE(realdentry->d_inode);
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
err = -ENOENT;
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
if (mask & MAY_WRITE) {
|
||||
umode_t mode = realinode->i_mode;
|
||||
|
||||
/*
|
||||
* Writes will always be redirected to upper layer, so
|
||||
* ignore lower layer being read-only.
|
||||
*
|
||||
* If the overlay itself is read-only then proceed
|
||||
* with the permission check, don't return EROFS.
|
||||
* This will only happen if this is the lower layer of
|
||||
* another overlayfs.
|
||||
*
|
||||
* If upper fs becomes read-only after the overlay was
|
||||
* constructed return EROFS to prevent modification of
|
||||
* upper layer.
|
||||
*/
|
||||
err = -EROFS;
|
||||
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
|
||||
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
|
||||
goto out_dput;
|
||||
}
|
||||
|
||||
err = __inode_permission(realinode, mask);
|
||||
out_dput:
|
||||
dput(alias);
|
||||
return err;
|
||||
}
|
||||
|
||||
|
||||
struct ovl_link_data {
|
||||
struct dentry *realdentry;
|
||||
void *cookie;
|
||||
};
|
||||
|
||||
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
|
||||
{
|
||||
void *ret;
|
||||
struct dentry *realdentry;
|
||||
struct inode *realinode;
|
||||
|
||||
realdentry = ovl_dentry_real(dentry);
|
||||
realinode = realdentry->d_inode;
|
||||
|
||||
if (WARN_ON(!realinode->i_op->follow_link))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
ret = realinode->i_op->follow_link(realdentry, nd);
|
||||
if (IS_ERR(ret))
|
||||
return ret;
|
||||
|
||||
if (realinode->i_op->put_link) {
|
||||
struct ovl_link_data *data;
|
||||
|
||||
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
|
||||
if (!data) {
|
||||
realinode->i_op->put_link(realdentry, nd, ret);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
data->realdentry = realdentry;
|
||||
data->cookie = ret;
|
||||
|
||||
return data;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
|
||||
{
|
||||
struct inode *realinode;
|
||||
struct ovl_link_data *data = c;
|
||||
|
||||
if (!data)
|
||||
return;
|
||||
|
||||
realinode = data->realdentry->d_inode;
|
||||
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
|
||||
kfree(data);
|
||||
}
|
||||
|
||||
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
|
||||
{
|
||||
struct path realpath;
|
||||
struct inode *realinode;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
realinode = realpath.dentry->d_inode;
|
||||
|
||||
if (!realinode->i_op->readlink)
|
||||
return -EINVAL;
|
||||
|
||||
touch_atime(&realpath);
|
||||
|
||||
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
|
||||
}
|
||||
|
||||
|
||||
static bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
|
||||
}
|
||||
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -EPERM;
|
||||
if (ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
err = vfs_setxattr(upperdentry, name, value, size, flags);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_need_xattr_filter(struct dentry *dentry,
|
||||
enum ovl_path_type type)
|
||||
{
|
||||
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
|
||||
return S_ISDIR(dentry->d_inode->i_mode);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size)
|
||||
{
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
return -ENODATA;
|
||||
|
||||
return vfs_getxattr(realpath.dentry, name, value, size);
|
||||
}
|
||||
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
|
||||
{
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
ssize_t res;
|
||||
int off;
|
||||
|
||||
res = vfs_listxattr(realpath.dentry, list, size);
|
||||
if (res <= 0 || size == 0)
|
||||
return res;
|
||||
|
||||
if (!ovl_need_xattr_filter(dentry, type))
|
||||
return res;
|
||||
|
||||
/* filter out private xattrs */
|
||||
for (off = 0; off < res;) {
|
||||
char *s = list + off;
|
||||
size_t slen = strlen(s) + 1;
|
||||
|
||||
BUG_ON(off + slen > res);
|
||||
|
||||
if (ovl_is_private_xattr(s)) {
|
||||
res -= slen;
|
||||
memmove(s, s + slen, res - off);
|
||||
} else {
|
||||
off += slen;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = -ENODATA;
|
||||
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
|
||||
goto out_drop_write;
|
||||
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
|
||||
err = vfs_removexattr(realpath.dentry, name);
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
|
||||
struct dentry *realdentry)
|
||||
{
|
||||
if (OVL_TYPE_UPPER(type))
|
||||
return false;
|
||||
|
||||
if (special_file(realdentry->d_inode->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
|
||||
const struct cred *cred)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
enum ovl_path_type type;
|
||||
bool want_write = false;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
if (!ovl_is_nocopyupw(dentry)) {
|
||||
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
|
||||
want_write = true;
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (file->f_flags & O_TRUNC)
|
||||
err = ovl_copy_up_last(dentry, NULL, true);
|
||||
else
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
ovl_path_upper(dentry, &realpath);
|
||||
}
|
||||
}
|
||||
|
||||
err = vfs_open(&realpath, file, cred);
|
||||
out_drop_write:
|
||||
if (want_write)
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static const struct inode_operations ovl_file_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
.dentry_open = ovl_dentry_open,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.follow_link = ovl_follow_link,
|
||||
.put_link = ovl_put_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.setxattr = ovl_setxattr,
|
||||
.getxattr = ovl_getxattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.removexattr = ovl_removexattr,
|
||||
};
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe)
|
||||
{
|
||||
struct inode *inode;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
return NULL;
|
||||
|
||||
mode &= S_IFMT;
|
||||
|
||||
inode->i_ino = get_next_ino();
|
||||
inode->i_mode = mode;
|
||||
inode->i_flags |= S_NOATIME | S_NOCMTIME;
|
||||
|
||||
switch (mode) {
|
||||
case S_IFDIR:
|
||||
inode->i_private = oe;
|
||||
inode->i_op = &ovl_dir_inode_operations;
|
||||
inode->i_fop = &ovl_dir_operations;
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
inode->i_op = &ovl_symlink_inode_operations;
|
||||
break;
|
||||
|
||||
case S_IFREG:
|
||||
case S_IFSOCK:
|
||||
case S_IFBLK:
|
||||
case S_IFCHR:
|
||||
case S_IFIFO:
|
||||
inode->i_op = &ovl_file_inode_operations;
|
||||
break;
|
||||
|
||||
default:
|
||||
WARN(1, "illegal file type: %i\n", mode);
|
||||
iput(inode);
|
||||
inode = NULL;
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
@ -1,200 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
|
||||
struct ovl_entry;
|
||||
|
||||
enum ovl_path_type {
|
||||
__OVL_PATH_PURE = (1 << 0),
|
||||
__OVL_PATH_UPPER = (1 << 1),
|
||||
__OVL_PATH_MERGE = (1 << 2),
|
||||
};
|
||||
|
||||
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
|
||||
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
|
||||
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
|
||||
#define OVL_TYPE_MERGE_OR_LOWER(type) \
|
||||
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
|
||||
|
||||
#define OVL_XATTR_PRE_NAME "trusted.overlay."
|
||||
#define OVL_XATTR_PRE_LEN 16
|
||||
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
|
||||
|
||||
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_rmdir(dir, dentry);
|
||||
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_unlink(dir, dentry, NULL);
|
||||
pr_debug("unlink(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
|
||||
struct dentry *new_dentry, bool debug)
|
||||
{
|
||||
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
|
||||
if (debug) {
|
||||
pr_debug("link(%pd2, %pd2) = %i\n",
|
||||
old_dentry, new_dentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_create(dir, dentry, mode, true);
|
||||
if (debug)
|
||||
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, bool debug)
|
||||
{
|
||||
int err = vfs_mkdir(dir, dentry, mode);
|
||||
if (debug)
|
||||
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t dev, bool debug)
|
||||
{
|
||||
int err = vfs_mknod(dir, dentry, mode, dev);
|
||||
if (debug) {
|
||||
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
|
||||
dentry, mode, dev, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *oldname, bool debug)
|
||||
{
|
||||
int err = vfs_symlink(dir, dentry, oldname);
|
||||
if (debug)
|
||||
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err = vfs_setxattr(dentry, name, value, size, flags);
|
||||
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
|
||||
dentry, name, (int) size, (char *) value, flags, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err = vfs_removexattr(dentry, name);
|
||||
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
|
||||
struct inode *newdir, struct dentry *newdentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
|
||||
olddentry, newdentry, flags);
|
||||
|
||||
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
|
||||
|
||||
if (err) {
|
||||
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
|
||||
olddentry, newdentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_whiteout(dir, dentry);
|
||||
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
bool ovl_is_nocopyupw(struct dentry *dentry);
|
||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
||||
void ovl_dentry_version_inc(struct dentry *dentry);
|
||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
||||
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
|
||||
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
|
||||
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
|
||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
||||
int ovl_want_write(struct dentry *dentry);
|
||||
void ovl_drop_write(struct dentry *dentry);
|
||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
||||
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
|
||||
bool ovl_is_whiteout(struct dentry *dentry);
|
||||
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
|
||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags);
|
||||
struct file *ovl_path_open(struct path *path, int flags);
|
||||
|
||||
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
|
||||
struct kstat *stat, const char *link);
|
||||
|
||||
/* readdir.c */
|
||||
extern const struct file_operations ovl_dir_operations;
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
||||
void ovl_cache_free(struct list_head *list);
|
||||
|
||||
/* inode.c */
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ovl_permission(struct inode *inode, int mask);
|
||||
int ovl_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
|
||||
void *value, size_t size);
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
||||
int ovl_removexattr(struct dentry *dentry, const char *name);
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
|
||||
struct ovl_entry *oe);
|
||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
||||
{
|
||||
to->i_uid = from->i_uid;
|
||||
to->i_gid = from->i_gid;
|
||||
}
|
||||
|
||||
/* dir.c */
|
||||
extern const struct inode_operations ovl_dir_inode_operations;
|
||||
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
|
||||
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct kstat *stat, const char *link,
|
||||
struct dentry *hardlink, bool debug);
|
||||
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
||||
|
||||
/* copy_up.c */
|
||||
int ovl_copy_up(struct dentry *dentry);
|
||||
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
struct path *lowerpath, struct kstat *stat,
|
||||
struct iattr *attr);
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
|
||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
||||
@ -1,557 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/cred.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
struct ovl_cache_entry {
|
||||
unsigned int len;
|
||||
unsigned int type;
|
||||
u64 ino;
|
||||
struct list_head l_node;
|
||||
struct rb_node node;
|
||||
bool is_whiteout;
|
||||
char name[];
|
||||
};
|
||||
|
||||
struct ovl_dir_cache {
|
||||
long refcount;
|
||||
u64 version;
|
||||
struct list_head entries;
|
||||
};
|
||||
|
||||
struct ovl_readdir_data {
|
||||
struct dir_context ctx;
|
||||
bool is_merge;
|
||||
struct rb_root root;
|
||||
struct list_head *list;
|
||||
struct list_head middle;
|
||||
struct dentry *dir;
|
||||
int count;
|
||||
int err;
|
||||
};
|
||||
|
||||
struct ovl_dir_file {
|
||||
bool is_real;
|
||||
bool is_upper;
|
||||
struct ovl_dir_cache *cache;
|
||||
struct list_head *cursor;
|
||||
struct file *realfile;
|
||||
struct file *upperfile;
|
||||
};
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
|
||||
{
|
||||
return container_of(n, struct ovl_cache_entry, node);
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
|
||||
const char *name, int len)
|
||||
{
|
||||
struct rb_node *node = root->rb_node;
|
||||
int cmp;
|
||||
|
||||
while (node) {
|
||||
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
|
||||
|
||||
cmp = strncmp(name, p->name, len);
|
||||
if (cmp > 0)
|
||||
node = p->node.rb_right;
|
||||
else if (cmp < 0 || len < p->len)
|
||||
node = p->node.rb_left;
|
||||
else
|
||||
return p;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
|
||||
const char *name, int len,
|
||||
u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
|
||||
|
||||
p = kmalloc(size, GFP_KERNEL);
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
memcpy(p->name, name, len);
|
||||
p->name[len] = '\0';
|
||||
p->len = len;
|
||||
p->type = d_type;
|
||||
p->ino = ino;
|
||||
p->is_whiteout = false;
|
||||
|
||||
if (d_type == DT_CHR) {
|
||||
struct dentry *dentry;
|
||||
const struct cred *old_cred;
|
||||
struct cred *override_cred;
|
||||
|
||||
override_cred = prepare_creds();
|
||||
if (!override_cred) {
|
||||
kfree(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* CAP_DAC_OVERRIDE for lookup
|
||||
*/
|
||||
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
|
||||
old_cred = override_creds(override_cred);
|
||||
|
||||
dentry = lookup_one_len(name, dir, len);
|
||||
if (!IS_ERR(dentry)) {
|
||||
p->is_whiteout = ovl_is_whiteout(dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
put_cred(override_cred);
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
|
||||
const char *name, int len, u64 ino,
|
||||
unsigned int d_type)
|
||||
{
|
||||
struct rb_node **newp = &rdd->root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
while (*newp) {
|
||||
int cmp;
|
||||
struct ovl_cache_entry *tmp;
|
||||
|
||||
parent = *newp;
|
||||
tmp = ovl_cache_entry_from_node(*newp);
|
||||
cmp = strncmp(name, tmp->name, len);
|
||||
if (cmp > 0)
|
||||
newp = &tmp->node.rb_right;
|
||||
else if (cmp < 0 || len < tmp->len)
|
||||
newp = &tmp->node.rb_left;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
|
||||
if (p == NULL)
|
||||
return -ENOMEM;
|
||||
|
||||
list_add_tail(&p->l_node, rdd->list);
|
||||
rb_link_node(&p->node, parent, newp);
|
||||
rb_insert_color(&p->node, &rdd->root);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
|
||||
const char *name, int namelen,
|
||||
loff_t offset, u64 ino, unsigned int d_type)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
p = ovl_cache_entry_find(&rdd->root, name, namelen);
|
||||
if (p) {
|
||||
list_move_tail(&p->l_node, &rdd->middle);
|
||||
} else {
|
||||
p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
|
||||
if (p == NULL)
|
||||
rdd->err = -ENOMEM;
|
||||
else
|
||||
list_add_tail(&p->l_node, &rdd->middle);
|
||||
}
|
||||
|
||||
return rdd->err;
|
||||
}
|
||||
|
||||
void ovl_cache_free(struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
struct ovl_cache_entry *n;
|
||||
|
||||
list_for_each_entry_safe(p, n, list, l_node)
|
||||
kfree(p);
|
||||
|
||||
INIT_LIST_HEAD(list);
|
||||
}
|
||||
|
||||
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
|
||||
{
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
|
||||
WARN_ON(cache->refcount <= 0);
|
||||
cache->refcount--;
|
||||
if (!cache->refcount) {
|
||||
if (ovl_dir_cache(dentry) == cache)
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
}
|
||||
}
|
||||
|
||||
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
|
||||
int namelen, loff_t offset, u64 ino,
|
||||
unsigned int d_type)
|
||||
{
|
||||
struct ovl_readdir_data *rdd =
|
||||
container_of(ctx, struct ovl_readdir_data, ctx);
|
||||
|
||||
rdd->count++;
|
||||
if (!rdd->is_merge)
|
||||
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
|
||||
else
|
||||
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
|
||||
}
|
||||
|
||||
static inline int ovl_dir_read(struct path *realpath,
|
||||
struct ovl_readdir_data *rdd)
|
||||
{
|
||||
struct file *realfile;
|
||||
int err;
|
||||
|
||||
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
|
||||
if (IS_ERR(realfile))
|
||||
return PTR_ERR(realfile);
|
||||
|
||||
rdd->dir = realpath->dentry;
|
||||
rdd->ctx.pos = 0;
|
||||
do {
|
||||
rdd->count = 0;
|
||||
rdd->err = 0;
|
||||
err = iterate_dir(realfile, &rdd->ctx);
|
||||
if (err >= 0)
|
||||
err = rdd->err;
|
||||
} while (!err && rdd->count);
|
||||
fput(realfile);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_dir_reset(struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct ovl_dir_cache *cache = od->cache;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
enum ovl_path_type type = ovl_path_type(dentry);
|
||||
|
||||
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
|
||||
ovl_cache_put(od, dentry);
|
||||
od->cache = NULL;
|
||||
od->cursor = NULL;
|
||||
}
|
||||
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
|
||||
if (od->is_real && OVL_TYPE_MERGE(type))
|
||||
od->is_real = false;
|
||||
}
|
||||
|
||||
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct path realpath;
|
||||
struct ovl_readdir_data rdd = {
|
||||
.ctx.actor = ovl_fill_merge,
|
||||
.list = list,
|
||||
.root = RB_ROOT,
|
||||
.is_merge = false,
|
||||
};
|
||||
int idx, next;
|
||||
|
||||
for (idx = 0; idx != -1; idx = next) {
|
||||
next = ovl_path_next(idx, dentry, &realpath);
|
||||
|
||||
if (next != -1) {
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
if (err)
|
||||
break;
|
||||
} else {
|
||||
/*
|
||||
* Insert lowest layer entries before upper ones, this
|
||||
* allows offsets to be reasonably constant
|
||||
*/
|
||||
list_add(&rdd.middle, rdd.list);
|
||||
rdd.is_merge = true;
|
||||
err = ovl_dir_read(&realpath, &rdd);
|
||||
list_del(&rdd.middle);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
|
||||
{
|
||||
struct list_head *p;
|
||||
loff_t off = 0;
|
||||
|
||||
list_for_each(p, &od->cache->entries) {
|
||||
if (off >= pos)
|
||||
break;
|
||||
off++;
|
||||
}
|
||||
/* Cursor is safe since the cache is stable */
|
||||
od->cursor = p;
|
||||
}
|
||||
|
||||
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
|
||||
{
|
||||
int res;
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_dir_cache(dentry);
|
||||
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
|
||||
cache->refcount++;
|
||||
return cache;
|
||||
}
|
||||
ovl_set_dir_cache(dentry, NULL);
|
||||
|
||||
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
|
||||
if (!cache)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
cache->refcount = 1;
|
||||
INIT_LIST_HEAD(&cache->entries);
|
||||
|
||||
res = ovl_dir_read_merged(dentry, &cache->entries);
|
||||
if (res) {
|
||||
ovl_cache_free(&cache->entries);
|
||||
kfree(cache);
|
||||
return ERR_PTR(res);
|
||||
}
|
||||
|
||||
cache->version = ovl_dentry_version_get(dentry);
|
||||
ovl_set_dir_cache(dentry, cache);
|
||||
|
||||
return cache;
|
||||
}
|
||||
|
||||
static int ovl_iterate(struct file *file, struct dir_context *ctx)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
if (!ctx->pos)
|
||||
ovl_dir_reset(file);
|
||||
|
||||
if (od->is_real)
|
||||
return iterate_dir(od->realfile, ctx);
|
||||
|
||||
if (!od->cache) {
|
||||
struct ovl_dir_cache *cache;
|
||||
|
||||
cache = ovl_cache_get(dentry);
|
||||
if (IS_ERR(cache))
|
||||
return PTR_ERR(cache);
|
||||
|
||||
od->cache = cache;
|
||||
ovl_seek_cursor(od, ctx->pos);
|
||||
}
|
||||
|
||||
while (od->cursor != &od->cache->entries) {
|
||||
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
|
||||
if (!p->is_whiteout)
|
||||
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
|
||||
break;
|
||||
od->cursor = p->l_node.next;
|
||||
ctx->pos++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
|
||||
{
|
||||
loff_t res;
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
mutex_lock(&file_inode(file)->i_mutex);
|
||||
if (!file->f_pos)
|
||||
ovl_dir_reset(file);
|
||||
|
||||
if (od->is_real) {
|
||||
res = vfs_llseek(od->realfile, offset, origin);
|
||||
file->f_pos = od->realfile->f_pos;
|
||||
} else {
|
||||
res = -EINVAL;
|
||||
|
||||
switch (origin) {
|
||||
case SEEK_CUR:
|
||||
offset += file->f_pos;
|
||||
break;
|
||||
case SEEK_SET:
|
||||
break;
|
||||
default:
|
||||
goto out_unlock;
|
||||
}
|
||||
if (offset < 0)
|
||||
goto out_unlock;
|
||||
|
||||
if (offset != file->f_pos) {
|
||||
file->f_pos = offset;
|
||||
if (od->cache)
|
||||
ovl_seek_cursor(od, offset);
|
||||
}
|
||||
res = offset;
|
||||
}
|
||||
out_unlock:
|
||||
mutex_unlock(&file_inode(file)->i_mutex);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
struct dentry *dentry = file->f_path.dentry;
|
||||
struct file *realfile = od->realfile;
|
||||
|
||||
/*
|
||||
* Need to check if we started out being a lower dir, but got copied up
|
||||
*/
|
||||
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
|
||||
struct inode *inode = file_inode(file);
|
||||
|
||||
realfile = lockless_dereference(od->upperfile);
|
||||
if (!realfile) {
|
||||
struct path upperpath;
|
||||
|
||||
ovl_path_upper(dentry, &upperpath);
|
||||
realfile = ovl_path_open(&upperpath, O_RDONLY);
|
||||
smp_mb__before_spinlock();
|
||||
mutex_lock(&inode->i_mutex);
|
||||
if (!od->upperfile) {
|
||||
if (IS_ERR(realfile)) {
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->upperfile = realfile;
|
||||
} else {
|
||||
/* somebody has beaten us to it */
|
||||
if (!IS_ERR(realfile))
|
||||
fput(realfile);
|
||||
realfile = od->upperfile;
|
||||
}
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
}
|
||||
}
|
||||
|
||||
return vfs_fsync_range(realfile, start, end, datasync);
|
||||
}
|
||||
|
||||
static int ovl_dir_release(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct ovl_dir_file *od = file->private_data;
|
||||
|
||||
if (od->cache) {
|
||||
mutex_lock(&inode->i_mutex);
|
||||
ovl_cache_put(od, file->f_path.dentry);
|
||||
mutex_unlock(&inode->i_mutex);
|
||||
}
|
||||
fput(od->realfile);
|
||||
if (od->upperfile)
|
||||
fput(od->upperfile);
|
||||
kfree(od);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_dir_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct path realpath;
|
||||
struct file *realfile;
|
||||
struct ovl_dir_file *od;
|
||||
enum ovl_path_type type;
|
||||
|
||||
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
|
||||
if (!od)
|
||||
return -ENOMEM;
|
||||
|
||||
type = ovl_path_real(file->f_path.dentry, &realpath);
|
||||
realfile = ovl_path_open(&realpath, file->f_flags);
|
||||
if (IS_ERR(realfile)) {
|
||||
kfree(od);
|
||||
return PTR_ERR(realfile);
|
||||
}
|
||||
od->realfile = realfile;
|
||||
od->is_real = !OVL_TYPE_MERGE(type);
|
||||
od->is_upper = OVL_TYPE_UPPER(type);
|
||||
file->private_data = od;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct file_operations ovl_dir_operations = {
|
||||
.read = generic_read_dir,
|
||||
.open = ovl_dir_open,
|
||||
.iterate = ovl_iterate,
|
||||
.llseek = ovl_dir_llseek,
|
||||
.fsync = ovl_dir_fsync,
|
||||
.release = ovl_dir_release,
|
||||
};
|
||||
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
|
||||
{
|
||||
int err;
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
err = ovl_dir_read_merged(dentry, list);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = 0;
|
||||
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
if (p->is_whiteout)
|
||||
continue;
|
||||
|
||||
if (p->name[0] == '.') {
|
||||
if (p->len == 1)
|
||||
continue;
|
||||
if (p->len == 2 && p->name[1] == '.')
|
||||
continue;
|
||||
}
|
||||
err = -ENOTEMPTY;
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
|
||||
{
|
||||
struct ovl_cache_entry *p;
|
||||
|
||||
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
|
||||
list_for_each_entry(p, list, l_node) {
|
||||
struct dentry *dentry;
|
||||
|
||||
if (!p->is_whiteout)
|
||||
continue;
|
||||
|
||||
dentry = lookup_one_len(p->name, upper, p->len);
|
||||
if (IS_ERR(dentry)) {
|
||||
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
|
||||
upper->d_name.name, p->len, p->name,
|
||||
(int) PTR_ERR(dentry));
|
||||
continue;
|
||||
}
|
||||
ovl_cleanup(upper->d_inode, dentry);
|
||||
dput(dentry);
|
||||
}
|
||||
mutex_unlock(&upper->d_inode->i_mutex);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,7 +0,0 @@
|
||||
kmod(mcoverlay
|
||||
SOURCES
|
||||
copy_up.c dir.c inode.c readdir.c super.c namei.c util.c export.c
|
||||
INSTALL_DEST
|
||||
${KMODDIR}
|
||||
)
|
||||
|
||||
@ -1,804 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/module.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/splice.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/exportfs.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
|
||||
|
||||
static bool __read_mostly ovl_check_copy_up;
|
||||
module_param_named(check_copy_up, ovl_check_copy_up, bool,
|
||||
S_IWUSR | S_IRUGO);
|
||||
MODULE_PARM_DESC(ovl_check_copy_up,
|
||||
"Warn on copy-up when causing process also has a R/O fd open");
|
||||
|
||||
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
|
||||
{
|
||||
const struct dentry *dentry = data;
|
||||
|
||||
if (file_inode(f) == d_inode(dentry))
|
||||
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
|
||||
f, fd, current->pid, current->comm);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the fds open by this process and warn if something like the following
|
||||
* scenario is about to occur:
|
||||
*
|
||||
* fd1 = open("foo", O_RDONLY);
|
||||
* fd2 = open("foo", O_RDWR);
|
||||
*/
|
||||
static void ovl_do_check_copy_up(struct dentry *dentry)
|
||||
{
|
||||
if (ovl_check_copy_up)
|
||||
iterate_fd(current->files, 0, ovl_check_fd, dentry);
|
||||
}
|
||||
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
|
||||
{
|
||||
ssize_t list_size, size, value_size = 0;
|
||||
char *buf, *name, *value = NULL;
|
||||
int uninitialized_var(error);
|
||||
size_t slen;
|
||||
|
||||
if (!(old->d_inode->i_opflags & IOP_XATTR) ||
|
||||
!(new->d_inode->i_opflags & IOP_XATTR))
|
||||
return 0;
|
||||
|
||||
list_size = vfs_listxattr(old, NULL, 0);
|
||||
if (list_size <= 0) {
|
||||
if (list_size == -EOPNOTSUPP)
|
||||
return 0;
|
||||
return list_size;
|
||||
}
|
||||
|
||||
buf = kzalloc(list_size, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
list_size = vfs_listxattr(old, buf, list_size);
|
||||
if (list_size <= 0) {
|
||||
error = list_size;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (name = buf; list_size; name += slen) {
|
||||
slen = strnlen(name, list_size) + 1;
|
||||
|
||||
/* underlying fs providing us with an broken xattr list? */
|
||||
if (WARN_ON(slen > list_size)) {
|
||||
error = -EIO;
|
||||
break;
|
||||
}
|
||||
list_size -= slen;
|
||||
|
||||
if (ovl_is_private_xattr(name))
|
||||
continue;
|
||||
retry:
|
||||
size = vfs_getxattr(old, name, value, value_size);
|
||||
if (size == -ERANGE)
|
||||
size = vfs_getxattr(old, name, NULL, 0);
|
||||
|
||||
if (size < 0) {
|
||||
/* NOFSCHECK */
|
||||
continue;
|
||||
|
||||
error = size;
|
||||
break;
|
||||
}
|
||||
|
||||
if (size > value_size) {
|
||||
void *new;
|
||||
|
||||
new = krealloc(value, size, GFP_KERNEL);
|
||||
if (!new) {
|
||||
error = -ENOMEM;
|
||||
break;
|
||||
}
|
||||
value = new;
|
||||
value_size = size;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
error = security_inode_copy_up_xattr(name);
|
||||
if (error < 0 && error != -EOPNOTSUPP)
|
||||
break;
|
||||
if (error == 1) {
|
||||
error = 0;
|
||||
continue; /* Discard */
|
||||
}
|
||||
error = vfs_setxattr(new, name, value, size, 0);
|
||||
if (error)
|
||||
break;
|
||||
}
|
||||
kfree(value);
|
||||
out:
|
||||
kfree(buf);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
|
||||
{
|
||||
struct file *old_file;
|
||||
struct file *new_file;
|
||||
loff_t old_pos = 0;
|
||||
loff_t new_pos = 0;
|
||||
int error = 0;
|
||||
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
|
||||
if (IS_ERR(old_file))
|
||||
return PTR_ERR(old_file);
|
||||
|
||||
new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
|
||||
if (IS_ERR(new_file)) {
|
||||
error = PTR_ERR(new_file);
|
||||
goto out_fput;
|
||||
}
|
||||
|
||||
/* Try to use clone_file_range to clone up within the same fs */
|
||||
error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
|
||||
if (!error)
|
||||
goto out;
|
||||
/* Couldn't clone, so now we try to copy the data */
|
||||
error = 0;
|
||||
|
||||
/* FIXME: copy up sparse files efficiently */
|
||||
while (len) {
|
||||
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
|
||||
long bytes;
|
||||
|
||||
if (len < this_len)
|
||||
this_len = len;
|
||||
|
||||
if (signal_pending_state(TASK_KILLABLE, current)) {
|
||||
error = -EINTR;
|
||||
break;
|
||||
}
|
||||
|
||||
bytes = do_splice_direct(old_file, &old_pos,
|
||||
new_file, &new_pos,
|
||||
this_len, SPLICE_F_MOVE);
|
||||
if (bytes <= 0) {
|
||||
error = bytes;
|
||||
break;
|
||||
}
|
||||
WARN_ON(old_pos != new_pos);
|
||||
|
||||
len -= bytes;
|
||||
}
|
||||
out:
|
||||
if (!error)
|
||||
error = vfs_fsync(new_file, 0);
|
||||
fput(new_file);
|
||||
out_fput:
|
||||
fput(old_file);
|
||||
return error;
|
||||
}
|
||||
|
||||
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
struct iattr attr = {
|
||||
.ia_valid =
|
||||
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
|
||||
.ia_atime = stat->atime,
|
||||
.ia_mtime = stat->mtime,
|
||||
};
|
||||
|
||||
return notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
|
||||
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
if (!S_ISLNK(stat->mode)) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_MODE,
|
||||
.ia_mode = stat->mode,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err) {
|
||||
struct iattr attr = {
|
||||
.ia_valid = ATTR_UID | ATTR_GID,
|
||||
.ia_uid = stat->uid,
|
||||
.ia_gid = stat->gid,
|
||||
};
|
||||
err = notify_change(upperdentry, &attr, NULL);
|
||||
}
|
||||
if (!err)
|
||||
ovl_set_timestamps(upperdentry, stat);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
|
||||
{
|
||||
struct ovl_fh *fh;
|
||||
int fh_type, fh_len, dwords;
|
||||
void *buf;
|
||||
int buflen = MAX_HANDLE_SZ;
|
||||
uuid_t *uuid = &real->d_sb->s_uuid;
|
||||
|
||||
buf = kmalloc(buflen, GFP_KERNEL);
|
||||
if (!buf)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
/*
|
||||
* We encode a non-connectable file handle for non-dir, because we
|
||||
* only need to find the lower inode number and we don't want to pay
|
||||
* the price or reconnecting the dentry.
|
||||
*/
|
||||
dwords = buflen >> 2;
|
||||
fh_type = exportfs_encode_fh(real, buf, &dwords, 0);
|
||||
buflen = (dwords << 2);
|
||||
|
||||
fh = ERR_PTR(-EIO);
|
||||
if (WARN_ON(fh_type < 0) ||
|
||||
WARN_ON(buflen > MAX_HANDLE_SZ) ||
|
||||
WARN_ON(fh_type == FILEID_INVALID))
|
||||
goto out;
|
||||
|
||||
BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
|
||||
fh_len = offsetof(struct ovl_fh, fid) + buflen;
|
||||
fh = kmalloc(fh_len, GFP_KERNEL);
|
||||
if (!fh) {
|
||||
fh = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
|
||||
fh->version = OVL_FH_VERSION;
|
||||
fh->magic = OVL_FH_MAGIC;
|
||||
fh->type = fh_type;
|
||||
fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
|
||||
/*
|
||||
* When we will want to decode an overlay dentry from this handle
|
||||
* and all layers are on the same fs, if we get a disconncted real
|
||||
* dentry when we decode fid, the only way to tell if we should assign
|
||||
* it to upperdentry or to lowerstack is by checking this flag.
|
||||
*/
|
||||
if (is_upper)
|
||||
fh->flags |= OVL_FH_FLAG_PATH_UPPER;
|
||||
fh->len = fh_len;
|
||||
fh->uuid = *uuid;
|
||||
memcpy(fh->fid, buf, buflen);
|
||||
|
||||
out:
|
||||
kfree(buf);
|
||||
return fh;
|
||||
}
|
||||
|
||||
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
|
||||
struct dentry *upper)
|
||||
{
|
||||
const struct ovl_fh *fh = NULL;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* When lower layer doesn't support export operations store a 'null' fh,
|
||||
* so we can use the overlay.origin xattr to distignuish between a copy
|
||||
* up and a pure upper inode.
|
||||
*/
|
||||
if (ovl_can_decode_fh(lower->d_sb)) {
|
||||
fh = ovl_encode_real_fh(lower, false);
|
||||
if (IS_ERR(fh))
|
||||
return PTR_ERR(fh);
|
||||
}
|
||||
|
||||
/*
|
||||
* Do not fail when upper doesn't support xattrs.
|
||||
*/
|
||||
err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
|
||||
fh ? fh->len : 0, 0);
|
||||
kfree(fh);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Store file handle of @upper dir in @index dir entry */
|
||||
static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
|
||||
{
|
||||
const struct ovl_fh *fh;
|
||||
int err;
|
||||
|
||||
fh = ovl_encode_real_fh(upper, true);
|
||||
if (IS_ERR(fh))
|
||||
return PTR_ERR(fh);
|
||||
|
||||
err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0);
|
||||
|
||||
kfree(fh);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create and install index entry.
|
||||
*
|
||||
* Caller must hold i_mutex on indexdir.
|
||||
*/
|
||||
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
|
||||
struct dentry *upper)
|
||||
{
|
||||
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
|
||||
struct inode *dir = d_inode(indexdir);
|
||||
struct dentry *index = NULL;
|
||||
struct dentry *temp = NULL;
|
||||
struct qstr name = { };
|
||||
int err;
|
||||
|
||||
/*
|
||||
* For now this is only used for creating index entry for directories,
|
||||
* because non-dir are copied up directly to index and then hardlinked
|
||||
* to upper dir.
|
||||
*
|
||||
* TODO: implement create index for non-dir, so we can call it when
|
||||
* encoding file handle for non-dir in case index does not exist.
|
||||
*/
|
||||
if (WARN_ON(!d_is_dir(dentry)))
|
||||
return -EIO;
|
||||
|
||||
/* Directory not expected to be indexed before copy up */
|
||||
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
|
||||
return -EIO;
|
||||
|
||||
err = ovl_get_index_name(origin, &name);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
|
||||
err = PTR_ERR(temp);
|
||||
if (IS_ERR(temp))
|
||||
goto free_name;
|
||||
|
||||
err = ovl_set_upper_fh(upper, temp);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
index = lookup_one_len(name.name, indexdir, name.len);
|
||||
if (IS_ERR(index)) {
|
||||
err = PTR_ERR(index);
|
||||
} else {
|
||||
err = ovl_do_rename(dir, temp, dir, index, 0);
|
||||
dput(index);
|
||||
}
|
||||
out:
|
||||
if (err)
|
||||
ovl_cleanup(dir, temp);
|
||||
dput(temp);
|
||||
free_name:
|
||||
kfree(name.name);
|
||||
return err;
|
||||
}
|
||||
|
||||
struct ovl_copy_up_ctx {
|
||||
struct dentry *parent;
|
||||
struct dentry *dentry;
|
||||
struct path lowerpath;
|
||||
struct kstat stat;
|
||||
struct kstat pstat;
|
||||
const char *link;
|
||||
struct dentry *destdir;
|
||||
struct qstr destname;
|
||||
struct dentry *workdir;
|
||||
bool tmpfile;
|
||||
bool origin;
|
||||
bool indexed;
|
||||
};
|
||||
|
||||
static int ovl_link_up(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
struct dentry *upperdir = ovl_dentry_upper(c->parent);
|
||||
struct inode *udir = d_inode(upperdir);
|
||||
|
||||
/* Mark parent "impure" because it may now contain non-pure upper */
|
||||
err = ovl_set_impure(c->parent, upperdir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = ovl_set_nlink_lower(c->dentry);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
inode_lock_nested(udir, I_MUTEX_PARENT);
|
||||
upper = lookup_one_len(c->dentry->d_name.name, upperdir,
|
||||
c->dentry->d_name.len);
|
||||
err = PTR_ERR(upper);
|
||||
if (!IS_ERR(upper)) {
|
||||
err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
|
||||
dput(upper);
|
||||
|
||||
if (!err) {
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
ovl_set_timestamps(upperdir, &c->pstat);
|
||||
ovl_dentry_set_upper_alias(c->dentry);
|
||||
}
|
||||
}
|
||||
inode_unlock(udir);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = ovl_set_nlink_upper(c->dentry);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
|
||||
struct dentry **newdentry)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upper;
|
||||
struct inode *udir = d_inode(c->destdir);
|
||||
|
||||
upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
|
||||
if (IS_ERR(upper))
|
||||
return PTR_ERR(upper);
|
||||
|
||||
if (c->tmpfile)
|
||||
err = ovl_do_link(temp, udir, upper);
|
||||
else
|
||||
err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
|
||||
|
||||
if (!err)
|
||||
*newdentry = dget(c->tmpfile ? upper : temp);
|
||||
dput(upper);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
int err;
|
||||
struct dentry *temp;
|
||||
const struct cred *old_creds = NULL;
|
||||
struct cred *new_creds = NULL;
|
||||
struct ovl_cattr cattr = {
|
||||
/* Can't properly set mode on creation because of the umask */
|
||||
.mode = c->stat.mode & S_IFMT,
|
||||
.rdev = c->stat.rdev,
|
||||
.link = c->link
|
||||
};
|
||||
|
||||
err = security_inode_copy_up(c->dentry, &new_creds);
|
||||
temp = ERR_PTR(err);
|
||||
if (err < 0)
|
||||
goto out;
|
||||
|
||||
if (new_creds)
|
||||
old_creds = override_creds(new_creds);
|
||||
|
||||
if (c->tmpfile)
|
||||
temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
|
||||
else
|
||||
temp = ovl_create_temp(c->workdir, &cattr);
|
||||
out:
|
||||
if (new_creds) {
|
||||
revert_creds(old_creds);
|
||||
put_cred(new_creds);
|
||||
}
|
||||
|
||||
return temp;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (S_ISREG(c->stat.mode)) {
|
||||
struct path upperpath;
|
||||
|
||||
ovl_path_upper(c->dentry, &upperpath);
|
||||
BUG_ON(upperpath.dentry != NULL);
|
||||
upperpath.dentry = temp;
|
||||
|
||||
err = ovl_copy_up_data(&c->lowerpath, &upperpath, c->stat.size);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
inode_lock(temp->d_inode);
|
||||
err = ovl_set_attr(temp, &c->stat);
|
||||
inode_unlock(temp->d_inode);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
/*
|
||||
* Store identifier of lower inode in upper inode xattr to
|
||||
* allow lookup of the copy up origin inode.
|
||||
*
|
||||
* Don't set origin when we are breaking the association with a lower
|
||||
* hard link.
|
||||
*/
|
||||
if (c->origin) {
|
||||
err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
struct inode *udir = c->destdir->d_inode;
|
||||
struct inode *inode;
|
||||
struct dentry *newdentry = NULL;
|
||||
struct dentry *temp;
|
||||
int err;
|
||||
|
||||
temp = ovl_get_tmpfile(c);
|
||||
if (IS_ERR(temp))
|
||||
return PTR_ERR(temp);
|
||||
|
||||
err = ovl_copy_up_inode(c, temp);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (S_ISDIR(c->stat.mode) && c->indexed) {
|
||||
err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
|
||||
if (err)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (c->tmpfile) {
|
||||
inode_lock_nested(udir, I_MUTEX_PARENT);
|
||||
err = ovl_install_temp(c, temp, &newdentry);
|
||||
inode_unlock(udir);
|
||||
} else {
|
||||
err = ovl_install_temp(c, temp, &newdentry);
|
||||
}
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
inode = d_inode(c->dentry);
|
||||
ovl_inode_update(inode, newdentry);
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
ovl_set_flag(OVL_WHITEOUTS, inode);
|
||||
|
||||
out:
|
||||
if (err && !c->tmpfile)
|
||||
ovl_cleanup(d_inode(c->workdir), temp);
|
||||
dput(temp);
|
||||
return err;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy up a single dentry
|
||||
*
|
||||
* All renames start with copy up of source if necessary. The actual
|
||||
* rename will only proceed once the copy up was successful. Copy up uses
|
||||
* upper parent i_mutex for exclusion. Since rename can change d_parent it
|
||||
* is possible that the copy up will lock the old parent. At that point
|
||||
* the file will have already been copied up anyway.
|
||||
*/
|
||||
static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
|
||||
{
|
||||
int err;
|
||||
struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info;
|
||||
bool to_index = false;
|
||||
|
||||
/*
|
||||
* Indexed non-dir is copied up directly to the index entry and then
|
||||
* hardlinked to upper dir. Indexed dir is copied up to indexdir,
|
||||
* then index entry is created and then copied up dir installed.
|
||||
* Copying dir up to indexdir instead of workdir simplifies locking.
|
||||
*/
|
||||
if (ovl_need_index(c->dentry)) {
|
||||
c->indexed = true;
|
||||
if (S_ISDIR(c->stat.mode))
|
||||
c->workdir = ovl_indexdir(c->dentry->d_sb);
|
||||
else
|
||||
to_index = true;
|
||||
}
|
||||
|
||||
if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
|
||||
c->origin = true;
|
||||
|
||||
if (to_index) {
|
||||
c->destdir = ovl_indexdir(c->dentry->d_sb);
|
||||
err = ovl_get_index_name(c->lowerpath.dentry, &c->destname);
|
||||
if (err)
|
||||
return err;
|
||||
} else if (WARN_ON(!c->parent)) {
|
||||
/* Disconnected dentry must be copied up to index dir */
|
||||
return -EIO;
|
||||
} else {
|
||||
/*
|
||||
* Mark parent "impure" because it may now contain non-pure
|
||||
* upper
|
||||
*/
|
||||
err = ovl_set_impure(c->parent, c->destdir);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* Should we copyup with O_TMPFILE or with workdir? */
|
||||
if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
|
||||
c->tmpfile = true;
|
||||
err = ovl_copy_up_locked(c);
|
||||
} else {
|
||||
err = ovl_lock_rename_workdir(c->workdir, c->destdir);
|
||||
if (!err) {
|
||||
err = ovl_copy_up_locked(c);
|
||||
unlock_rename(c->workdir, c->destdir);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (c->indexed)
|
||||
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
|
||||
|
||||
if (to_index) {
|
||||
/* Initialize nlink for copy up of disconnected dentry */
|
||||
err = ovl_set_nlink_upper(c->dentry);
|
||||
} else {
|
||||
struct inode *udir = d_inode(c->destdir);
|
||||
|
||||
/* Restore timestamps on parent (best effort) */
|
||||
inode_lock(udir);
|
||||
ovl_set_timestamps(c->destdir, &c->pstat);
|
||||
inode_unlock(udir);
|
||||
|
||||
ovl_dentry_set_upper_alias(c->dentry);
|
||||
}
|
||||
|
||||
out:
|
||||
if (to_index)
|
||||
kfree(c->destname.name);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
|
||||
int flags)
|
||||
{
|
||||
int err;
|
||||
DEFINE_DELAYED_CALL(done);
|
||||
struct path parentpath;
|
||||
struct ovl_copy_up_ctx ctx = {
|
||||
.parent = parent,
|
||||
.dentry = dentry,
|
||||
.workdir = ovl_workdir(dentry),
|
||||
};
|
||||
|
||||
if (WARN_ON(!ctx.workdir))
|
||||
return -EROFS;
|
||||
|
||||
ovl_path_lower(dentry, &ctx.lowerpath);
|
||||
err = vfs_getattr(&ctx.lowerpath, &ctx.stat,
|
||||
STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (parent) {
|
||||
ovl_path_upper(parent, &parentpath);
|
||||
ctx.destdir = parentpath.dentry;
|
||||
ctx.destname = dentry->d_name;
|
||||
|
||||
err = vfs_getattr(&parentpath, &ctx.pstat,
|
||||
STATX_ATIME | STATX_MTIME,
|
||||
AT_STATX_SYNC_AS_STAT);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
/* maybe truncate regular file. this has no effect on dirs */
|
||||
if (flags & O_TRUNC)
|
||||
ctx.stat.size = 0;
|
||||
|
||||
if (S_ISLNK(ctx.stat.mode)) {
|
||||
ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done);
|
||||
if (IS_ERR(ctx.link))
|
||||
return PTR_ERR(ctx.link);
|
||||
}
|
||||
ovl_do_check_copy_up(ctx.lowerpath.dentry);
|
||||
|
||||
err = ovl_copy_up_start(dentry);
|
||||
/* err < 0: interrupted, err > 0: raced with another copy-up */
|
||||
if (unlikely(err)) {
|
||||
if (err > 0)
|
||||
err = 0;
|
||||
} else {
|
||||
if (!ovl_dentry_upper(dentry))
|
||||
err = ovl_do_copy_up(&ctx);
|
||||
if (!err && parent && !ovl_dentry_has_upper_alias(dentry))
|
||||
err = ovl_link_up(&ctx);
|
||||
ovl_copy_up_end(dentry);
|
||||
}
|
||||
do_delayed_call(&done);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up_flags(struct dentry *dentry, int flags)
|
||||
{
|
||||
int err = 0;
|
||||
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
|
||||
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
|
||||
|
||||
/*
|
||||
* With NFS export, copy up can get called for a disconnected non-dir.
|
||||
* In this case, we will copy up lower inode to index dir without
|
||||
* linking it to upper dir.
|
||||
*/
|
||||
if (WARN_ON(disconnected && d_is_dir(dentry)))
|
||||
return -EIO;
|
||||
|
||||
while (!err) {
|
||||
struct dentry *next;
|
||||
struct dentry *parent = NULL;
|
||||
|
||||
/*
|
||||
* Check if copy-up has happened as well as for upper alias (in
|
||||
* case of hard links) is there.
|
||||
*
|
||||
* Both checks are lockless:
|
||||
* - false negatives: will recheck under oi->lock
|
||||
* - false positives:
|
||||
* + ovl_dentry_upper() uses memory barriers to ensure the
|
||||
* upper dentry is up-to-date
|
||||
* + ovl_dentry_has_upper_alias() relies on locking of
|
||||
* upper parent i_rwsem to prevent reordering copy-up
|
||||
* with rename.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
(ovl_dentry_has_upper_alias(dentry) || disconnected))
|
||||
break;
|
||||
|
||||
next = dget(dentry);
|
||||
/* find the topmost dentry not yet copied up */
|
||||
for (; !disconnected;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
if (ovl_dentry_upper(parent))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
err = ovl_copy_up_one(parent, next, flags);
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_copy_up(struct dentry *dentry)
|
||||
{
|
||||
return ovl_copy_up_flags(dentry, 0);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,853 +0,0 @@
|
||||
/*
|
||||
* Overlayfs NFS export support.
|
||||
*
|
||||
* Amir Goldstein <amir73il@gmail.com>
|
||||
*
|
||||
* Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/mount.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/exportfs.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/version.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
static int ovl_encode_maybe_copy_up(struct dentry *dentry)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (ovl_dentry_upper(dentry))
|
||||
return 0;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_copy_up(dentry);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
|
||||
if (err) {
|
||||
pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
|
||||
dentry, err);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
|
||||
* Before encoding a non-upper directory file handle from real layer N, we need
|
||||
* to check if it will be possible to reconnect an overlay dentry from the real
|
||||
* lower decoded dentry. This is done by following the overlay ancestry up to a
|
||||
* "layer N connected" ancestor and verifying that all parents along the way are
|
||||
* "layer N connectable". If an ancestor that is NOT "layer N connectable" is
|
||||
* found, we need to copy up an ancestor, which is "layer N connectable", thus
|
||||
* making that ancestor "layer N connected". For example:
|
||||
*
|
||||
* layer 1: /a
|
||||
* layer 2: /a/b/c
|
||||
*
|
||||
* The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
|
||||
* copied up and renamed, upper dir /a will be indexed by lower dir /a from
|
||||
* layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
|
||||
* in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
|
||||
* dentry from the connected lower dentry /a/b/c.
|
||||
*
|
||||
* To avoid this problem on decode time, we need to copy up an ancestor of
|
||||
* /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
|
||||
* /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
|
||||
* and when the time comes to decode the file handle from lower dentry /a/b/c,
|
||||
* ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
|
||||
* a connected overlay dentry will be accomplished.
|
||||
*
|
||||
* (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
|
||||
* entry /a in the lower layers above layer N and find the indexed dir /a from
|
||||
* layer 1. If that improvement is made, then the check for "layer N connected"
|
||||
* will need to verify there are no redirects in lower layers above N. In the
|
||||
* example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
|
||||
* is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
|
||||
*
|
||||
* layer 1: /A (redirect = /a)
|
||||
* layer 2: /a/b/c
|
||||
*/
|
||||
|
||||
/* Return the lowest layer for encoding a connectable file handle */
|
||||
static int ovl_connectable_layer(struct dentry *dentry)
|
||||
{
|
||||
struct ovl_entry *oe = OVL_E(dentry);
|
||||
|
||||
/* We can get overlay root from root of any layer */
|
||||
if (dentry == dentry->d_sb->s_root)
|
||||
return oe->numlower;
|
||||
|
||||
/*
|
||||
* If it's an unindexed merge dir, then it's not connectable with any
|
||||
* lower layer
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
return 0;
|
||||
|
||||
/* We can get upper/overlay path from indexed/lower dentry */
|
||||
return oe->lowerstack[0].layer->idx;
|
||||
}
|
||||
|
||||
/*
|
||||
* @dentry is "connected" if all ancestors up to root or a "connected" ancestor
|
||||
* have the same uppermost lower layer as the origin's layer. We may need to
|
||||
* copy up a "connectable" ancestor to make it "connected". A "connected" dentry
|
||||
* cannot become non "connected", so cache positive result in dentry flags.
|
||||
*
|
||||
* Return the connected origin layer or < 0 on error.
|
||||
*/
|
||||
static int ovl_connect_layer(struct dentry *dentry)
|
||||
{
|
||||
struct dentry *next, *parent = NULL;
|
||||
int origin_layer;
|
||||
int err = 0;
|
||||
|
||||
if (WARN_ON(dentry == dentry->d_sb->s_root) ||
|
||||
WARN_ON(!ovl_dentry_lower(dentry)))
|
||||
return -EIO;
|
||||
|
||||
origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
|
||||
if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
|
||||
return origin_layer;
|
||||
|
||||
/* Find the topmost origin layer connectable ancestor of @dentry */
|
||||
next = dget(dentry);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
if (WARN_ON(parent == next)) {
|
||||
err = -EIO;
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If @parent is not origin layer connectable, then copy up
|
||||
* @next which is origin layer connectable and we are done.
|
||||
*/
|
||||
if (ovl_connectable_layer(parent) < origin_layer) {
|
||||
err = ovl_encode_maybe_copy_up(next);
|
||||
break;
|
||||
}
|
||||
|
||||
/* If @parent is connected or indexed we are done */
|
||||
if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
|
||||
ovl_test_flag(OVL_INDEX, d_inode(parent)))
|
||||
break;
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
|
||||
if (!err)
|
||||
ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
|
||||
|
||||
return err ?: origin_layer;
|
||||
}
|
||||
|
||||
/*
|
||||
* We only need to encode origin if there is a chance that the same object was
|
||||
* encoded pre copy up and then we need to stay consistent with the same
|
||||
* encoding also after copy up. If non-pure upper is not indexed, then it was
|
||||
* copied up before NFS export was enabled. In that case we don't need to worry
|
||||
* about staying consistent with pre copy up encoding and we encode an upper
|
||||
* file handle. Overlay root dentry is a private case of non-indexed upper.
|
||||
*
|
||||
* The following table summarizes the different file handle encodings used for
|
||||
* different overlay object types:
|
||||
*
|
||||
* Object type | Encoding
|
||||
* --------------------------------
|
||||
* Pure upper | U
|
||||
* Non-indexed upper | U
|
||||
* Indexed upper | L (*)
|
||||
* Non-upper | L (*)
|
||||
*
|
||||
* U = upper file handle
|
||||
* L = lower file handle
|
||||
*
|
||||
* (*) Connecting an overlay dir from real lower dentry is not always
|
||||
* possible when there are redirects in lower layers and non-indexed merge dirs.
|
||||
* To mitigate those case, we may copy up the lower dir ancestor before encode
|
||||
* a lower dir file handle.
|
||||
*
|
||||
* Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
|
||||
*/
|
||||
static int ovl_check_encode_origin(struct dentry *dentry)
|
||||
{
|
||||
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
|
||||
|
||||
/* Upper file handle for pure upper */
|
||||
if (!ovl_dentry_lower(dentry))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Upper file handle for non-indexed upper.
|
||||
*
|
||||
* Root is never indexed, so if there's an upper layer, encode upper for
|
||||
* root.
|
||||
*/
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Decoding a merge dir, whose origin's ancestor is under a redirected
|
||||
* lower dir or under a non-indexed upper is not always possible.
|
||||
* ovl_connect_layer() will try to make origin's layer "connected" by
|
||||
* copying up a "connectable" ancestor.
|
||||
*/
|
||||
if (d_is_dir(dentry) && ofs->upper_mnt)
|
||||
return ovl_connect_layer(dentry);
|
||||
|
||||
/* Lower file handle for indexed and non-upper dir/non-dir */
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
|
||||
{
|
||||
struct ovl_fh *fh = NULL;
|
||||
int err, enc_lower;
|
||||
|
||||
/*
|
||||
* Check if we should encode a lower or upper file handle and maybe
|
||||
* copy up an ancestor to make lower file handle connectable.
|
||||
*/
|
||||
err = enc_lower = ovl_check_encode_origin(dentry);
|
||||
if (enc_lower < 0)
|
||||
goto fail;
|
||||
|
||||
/* Encode an upper or lower file handle */
|
||||
fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
|
||||
ovl_dentry_upper(dentry), !enc_lower);
|
||||
err = PTR_ERR(fh);
|
||||
if (IS_ERR(fh))
|
||||
goto fail;
|
||||
|
||||
err = -EOVERFLOW;
|
||||
if (fh->len > buflen)
|
||||
goto fail;
|
||||
|
||||
memcpy(buf, (char *)fh, fh->len);
|
||||
err = fh->len;
|
||||
|
||||
out:
|
||||
kfree(fh);
|
||||
return err;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
|
||||
dentry, err, buflen, fh ? (int)fh->len : 0,
|
||||
fh ? fh->type : 0);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
|
||||
{
|
||||
int res, len = *max_len << 2;
|
||||
|
||||
res = ovl_d_to_fh(dentry, (char *)fid, len);
|
||||
if (res <= 0)
|
||||
return FILEID_INVALID;
|
||||
|
||||
len = res;
|
||||
|
||||
/* Round up to dwords */
|
||||
*max_len = (len + 3) >> 2;
|
||||
return OVL_FILEID;
|
||||
}
|
||||
|
||||
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
|
||||
struct inode *parent)
|
||||
{
|
||||
struct dentry *dentry;
|
||||
int type;
|
||||
|
||||
/* TODO: encode connectable file handles */
|
||||
if (parent)
|
||||
return FILEID_INVALID;
|
||||
|
||||
dentry = d_find_any_alias(inode);
|
||||
if (WARN_ON(!dentry))
|
||||
return FILEID_INVALID;
|
||||
|
||||
type = ovl_dentry_to_fh(dentry, fid, max_len);
|
||||
|
||||
dput(dentry);
|
||||
return type;
|
||||
}
|
||||
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
|
||||
/*
|
||||
* Find or instantiate an overlay dentry from real dentries and index.
|
||||
*/
|
||||
static struct dentry *ovl_obtain_alias(struct super_block *sb,
|
||||
struct dentry *upper_alias,
|
||||
struct ovl_path *lowerpath,
|
||||
struct dentry *index)
|
||||
{
|
||||
struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
|
||||
struct dentry *upper = upper_alias ?: index;
|
||||
struct dentry *dentry;
|
||||
struct inode *inode;
|
||||
struct ovl_entry *oe;
|
||||
struct ovl_inode_params oip = {
|
||||
.lowerpath = lowerpath,
|
||||
.index = index,
|
||||
.numlower = !!lower
|
||||
};
|
||||
|
||||
/* We get overlay directory dentries with ovl_lookup_real() */
|
||||
if (d_is_dir(upper ?: lower))
|
||||
return ERR_PTR(-EIO);
|
||||
|
||||
oip.upperdentry = dget(upper);
|
||||
inode = ovl_get_inode(sb, &oip);
|
||||
if (IS_ERR(inode)) {
|
||||
dput(upper);
|
||||
return ERR_CAST(inode);
|
||||
}
|
||||
|
||||
dentry = d_find_any_alias(inode);
|
||||
if (!dentry) {
|
||||
dentry = d_alloc_anon(inode->i_sb);
|
||||
if (!dentry)
|
||||
goto nomem;
|
||||
oe = ovl_alloc_entry(lower ? 1 : 0);
|
||||
if (!oe)
|
||||
goto nomem;
|
||||
|
||||
if (lower) {
|
||||
oe->lowerstack->dentry = dget(lower);
|
||||
oe->lowerstack->layer = lowerpath->layer;
|
||||
}
|
||||
dentry->d_fsdata = oe;
|
||||
if (upper_alias)
|
||||
ovl_dentry_set_upper_alias(dentry);
|
||||
}
|
||||
|
||||
return d_instantiate_anon(dentry, inode);
|
||||
|
||||
nomem:
|
||||
iput(inode);
|
||||
dput(dentry);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
/* Get the upper or lower dentry in stach whose on layer @idx */
|
||||
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
|
||||
{
|
||||
struct ovl_entry *oe = dentry->d_fsdata;
|
||||
int i;
|
||||
|
||||
if (!idx)
|
||||
return ovl_dentry_upper(dentry);
|
||||
|
||||
for (i = 0; i < oe->numlower; i++) {
|
||||
if (oe->lowerstack[i].layer->idx == idx)
|
||||
return oe->lowerstack[i].dentry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup a child overlay dentry to get a connected overlay dentry whose real
|
||||
* dentry is @real. If @real is on upper layer, we lookup a child overlay
|
||||
* dentry with the same name as the real dentry. Otherwise, we need to consult
|
||||
* index for lookup.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_one(struct dentry *connected,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct inode *dir = d_inode(connected);
|
||||
struct dentry *this, *parent = NULL;
|
||||
struct name_snapshot name;
|
||||
int err;
|
||||
|
||||
/*
|
||||
* Lookup child overlay dentry by real name. The dir mutex protects us
|
||||
* from racing with overlay rename. If the overlay dentry that is above
|
||||
* real has already been moved to a parent that is not under the
|
||||
* connected overlay dir, we return -ECHILD and restart the lookup of
|
||||
* connected real path from the top.
|
||||
*/
|
||||
inode_lock_nested(dir, I_MUTEX_PARENT);
|
||||
err = -ECHILD;
|
||||
parent = dget_parent(real);
|
||||
if (ovl_dentry_real_at(connected, layer->idx) != parent)
|
||||
goto fail;
|
||||
|
||||
/*
|
||||
* We also need to take a snapshot of real dentry name to protect us
|
||||
* from racing with underlying layer rename. In this case, we don't
|
||||
* care about returning ESTALE, only from dereferencing a free name
|
||||
* pointer because we hold no lock on the real dentry.
|
||||
*/
|
||||
take_dentry_name_snapshot(&name, real);
|
||||
this = lookup_one_len(name.name, connected, strlen(name.name));
|
||||
err = PTR_ERR(this);
|
||||
if (IS_ERR(this)) {
|
||||
goto fail;
|
||||
} else if (!this || !this->d_inode) {
|
||||
dput(this);
|
||||
err = -ENOENT;
|
||||
goto fail;
|
||||
} else if (ovl_dentry_real_at(this, layer->idx) != real) {
|
||||
dput(this);
|
||||
err = -ESTALE;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
out:
|
||||
release_dentry_name_snapshot(&name);
|
||||
dput(parent);
|
||||
inode_unlock(dir);
|
||||
return this;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
|
||||
real, layer->idx, connected, err);
|
||||
this = ERR_PTR(err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_lookup_real(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer);
|
||||
|
||||
/*
|
||||
* Lookup an indexed or hashed overlay dentry by real inode.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
|
||||
struct dentry *index = NULL;
|
||||
struct dentry *this = NULL;
|
||||
struct inode *inode;
|
||||
|
||||
/*
|
||||
* Decoding upper dir from index is expensive, so first try to lookup
|
||||
* overlay dentry in inode/dcache.
|
||||
*/
|
||||
inode = ovl_lookup_inode(sb, real, !layer->idx);
|
||||
if (IS_ERR(inode))
|
||||
return ERR_CAST(inode);
|
||||
if (inode) {
|
||||
this = d_find_any_alias(inode);
|
||||
iput(inode);
|
||||
}
|
||||
|
||||
/*
|
||||
* For decoded lower dir file handle, lookup index by origin to check
|
||||
* if lower dir was copied up and and/or removed.
|
||||
*/
|
||||
if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
|
||||
index = ovl_lookup_index(ofs, NULL, real, false);
|
||||
if (IS_ERR(index))
|
||||
return index;
|
||||
}
|
||||
|
||||
/* Get connected upper overlay dir from index */
|
||||
if (index) {
|
||||
struct dentry *upper = ovl_index_upper(ofs, index);
|
||||
|
||||
dput(index);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
return upper;
|
||||
|
||||
/*
|
||||
* ovl_lookup_real() in lower layer may call recursively once to
|
||||
* ovl_lookup_real() in upper layer. The first level call walks
|
||||
* back lower parents to the topmost indexed parent. The second
|
||||
* recursive call walks back from indexed upper to the topmost
|
||||
* connected/hashed upper parent (or up to root).
|
||||
*/
|
||||
this = ovl_lookup_real(sb, upper, &upper_layer);
|
||||
dput(upper);
|
||||
}
|
||||
|
||||
if (IS_ERR_OR_NULL(this))
|
||||
return this;
|
||||
|
||||
if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
|
||||
dput(this);
|
||||
this = ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup an indexed or hashed overlay dentry, whose real dentry is an
|
||||
* ancestor of @real.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct dentry *next, *parent = NULL;
|
||||
struct dentry *ancestor = ERR_PTR(-EIO);
|
||||
|
||||
if (real == layer->mnt->mnt_root)
|
||||
return dget(sb->s_root);
|
||||
|
||||
/* Find the topmost indexed or hashed ancestor */
|
||||
next = dget(real);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
/*
|
||||
* Lookup a matching overlay dentry in inode/dentry
|
||||
* cache or in index by real inode.
|
||||
*/
|
||||
ancestor = ovl_lookup_real_inode(sb, next, layer);
|
||||
if (ancestor)
|
||||
break;
|
||||
|
||||
if (parent == layer->mnt->mnt_root) {
|
||||
ancestor = dget(sb->s_root);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If @real has been moved out of the layer root directory,
|
||||
* we will eventully hit the real fs root. This cannot happen
|
||||
* by legit overlay rename, so we return error in that case.
|
||||
*/
|
||||
if (parent == next) {
|
||||
ancestor = ERR_PTR(-EXDEV);
|
||||
break;
|
||||
}
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
|
||||
return ancestor;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup a connected overlay dentry whose real dentry is @real.
|
||||
* If @real is on upper layer, we lookup a child overlay dentry with the same
|
||||
* path the real dentry. Otherwise, we need to consult index for lookup.
|
||||
*/
|
||||
static struct dentry *ovl_lookup_real(struct super_block *sb,
|
||||
struct dentry *real,
|
||||
struct ovl_layer *layer)
|
||||
{
|
||||
struct dentry *connected;
|
||||
int err = 0;
|
||||
|
||||
connected = ovl_lookup_real_ancestor(sb, real, layer);
|
||||
if (IS_ERR(connected))
|
||||
return connected;
|
||||
|
||||
while (!err) {
|
||||
struct dentry *next, *this;
|
||||
struct dentry *parent = NULL;
|
||||
struct dentry *real_connected = ovl_dentry_real_at(connected,
|
||||
layer->idx);
|
||||
|
||||
if (real_connected == real)
|
||||
break;
|
||||
|
||||
/* Find the topmost dentry not yet connected */
|
||||
next = dget(real);
|
||||
for (;;) {
|
||||
parent = dget_parent(next);
|
||||
|
||||
if (parent == real_connected)
|
||||
break;
|
||||
|
||||
/*
|
||||
* If real has been moved out of 'real_connected',
|
||||
* we will not find 'real_connected' and hit the layer
|
||||
* root. In that case, we need to restart connecting.
|
||||
* This game can go on forever in the worst case. We
|
||||
* may want to consider taking s_vfs_rename_mutex if
|
||||
* this happens more than once.
|
||||
*/
|
||||
if (parent == layer->mnt->mnt_root) {
|
||||
dput(connected);
|
||||
connected = dget(sb->s_root);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If real file has been moved out of the layer root
|
||||
* directory, we will eventully hit the real fs root.
|
||||
* This cannot happen by legit overlay rename, so we
|
||||
* return error in that case.
|
||||
*/
|
||||
if (parent == next) {
|
||||
err = -EXDEV;
|
||||
break;
|
||||
}
|
||||
|
||||
dput(next);
|
||||
next = parent;
|
||||
}
|
||||
|
||||
if (!err) {
|
||||
this = ovl_lookup_real_one(connected, next, layer);
|
||||
if (IS_ERR(this))
|
||||
err = PTR_ERR(this);
|
||||
|
||||
/*
|
||||
* Lookup of child in overlay can fail when racing with
|
||||
* overlay rename of child away from 'connected' parent.
|
||||
* In this case, we need to restart the lookup from the
|
||||
* top, because we cannot trust that 'real_connected' is
|
||||
* still an ancestor of 'real'. There is a good chance
|
||||
* that the renamed overlay ancestor is now in cache, so
|
||||
* ovl_lookup_real_ancestor() will find it and we can
|
||||
* continue to connect exactly from where lookup failed.
|
||||
*/
|
||||
if (err == -ECHILD) {
|
||||
this = ovl_lookup_real_ancestor(sb, real,
|
||||
layer);
|
||||
err = PTR_ERR_OR_ZERO(this);
|
||||
}
|
||||
if (!err) {
|
||||
dput(connected);
|
||||
connected = this;
|
||||
}
|
||||
}
|
||||
|
||||
dput(parent);
|
||||
dput(next);
|
||||
}
|
||||
|
||||
if (err)
|
||||
goto fail;
|
||||
|
||||
return connected;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
|
||||
real, layer->idx, connected, err);
|
||||
dput(connected);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get an overlay dentry from upper/lower real dentries and index.
|
||||
*/
|
||||
static struct dentry *ovl_get_dentry(struct super_block *sb,
|
||||
struct dentry *upper,
|
||||
struct ovl_path *lowerpath,
|
||||
struct dentry *index)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
|
||||
struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer;
|
||||
struct dentry *real = upper ?: (index ?: lowerpath->dentry);
|
||||
|
||||
/*
|
||||
* Obtain a disconnected overlay dentry from a non-dir real dentry
|
||||
* and index.
|
||||
*/
|
||||
if (!d_is_dir(real))
|
||||
return ovl_obtain_alias(sb, upper, lowerpath, index);
|
||||
|
||||
/* Removed empty directory? */
|
||||
if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real))
|
||||
return ERR_PTR(-ENOENT);
|
||||
|
||||
/*
|
||||
* If real dentry is connected and hashed, get a connected overlay
|
||||
* dentry whose real dentry is @real.
|
||||
*/
|
||||
return ovl_lookup_real(sb, real, layer);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
|
||||
struct ovl_fh *fh)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct dentry *dentry;
|
||||
struct dentry *upper;
|
||||
|
||||
if (!ofs->upper_mnt)
|
||||
return ERR_PTR(-EACCES);
|
||||
|
||||
upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
return upper;
|
||||
|
||||
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
|
||||
dput(upper);
|
||||
|
||||
return dentry;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
|
||||
struct ovl_fh *fh)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
struct ovl_path origin = { };
|
||||
struct ovl_path *stack = &origin;
|
||||
struct dentry *dentry = NULL;
|
||||
struct dentry *index = NULL;
|
||||
struct inode *inode;
|
||||
int err;
|
||||
|
||||
/* First lookup overlay inode in inode cache by origin fh */
|
||||
err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
|
||||
if (err)
|
||||
return ERR_PTR(err);
|
||||
|
||||
if (!d_is_dir(origin.dentry) ||
|
||||
!(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
|
||||
inode = ovl_lookup_inode(sb, origin.dentry, false);
|
||||
err = PTR_ERR(inode);
|
||||
if (IS_ERR(inode))
|
||||
goto out_err;
|
||||
if (inode) {
|
||||
dentry = d_find_any_alias(inode);
|
||||
iput(inode);
|
||||
if (dentry)
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then lookup indexed upper/whiteout by origin fh */
|
||||
if (ofs->indexdir) {
|
||||
index = ovl_get_index_fh(ofs, fh);
|
||||
err = PTR_ERR(index);
|
||||
if (IS_ERR(index)) {
|
||||
index = NULL;
|
||||
goto out_err;
|
||||
}
|
||||
}
|
||||
|
||||
/* Then try to get a connected upper dir by index */
|
||||
if (index && d_is_dir(index)) {
|
||||
struct dentry *upper = ovl_index_upper(ofs, index);
|
||||
|
||||
err = PTR_ERR(upper);
|
||||
if (IS_ERR_OR_NULL(upper))
|
||||
goto out_err;
|
||||
|
||||
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
|
||||
dput(upper);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Otherwise, get a connected non-upper dir or disconnected non-dir */
|
||||
if (d_is_dir(origin.dentry) &&
|
||||
(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
|
||||
dput(origin.dentry);
|
||||
origin.dentry = NULL;
|
||||
err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
|
||||
if (err)
|
||||
goto out_err;
|
||||
}
|
||||
if (index) {
|
||||
err = ovl_verify_origin(index, origin.dentry, false);
|
||||
if (err)
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
dentry = ovl_get_dentry(sb, NULL, &origin, index);
|
||||
|
||||
out:
|
||||
dput(origin.dentry);
|
||||
dput(index);
|
||||
return dentry;
|
||||
|
||||
out_err:
|
||||
dentry = ERR_PTR(err);
|
||||
goto out;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
struct dentry *dentry = NULL;
|
||||
struct ovl_fh *fh = (struct ovl_fh *) fid;
|
||||
int len = fh_len << 2;
|
||||
unsigned int flags = 0;
|
||||
int err;
|
||||
|
||||
err = -EINVAL;
|
||||
if (fh_type != OVL_FILEID)
|
||||
goto out_err;
|
||||
|
||||
err = ovl_check_fh_len(fh, len);
|
||||
if (err)
|
||||
goto out_err;
|
||||
|
||||
flags = fh->flags;
|
||||
dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
|
||||
ovl_upper_fh_to_d(sb, fh) :
|
||||
ovl_lower_fh_to_d(sb, fh);
|
||||
err = PTR_ERR(dentry);
|
||||
if (IS_ERR(dentry) && err != -ESTALE)
|
||||
goto out_err;
|
||||
|
||||
return dentry;
|
||||
|
||||
out_err:
|
||||
pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
|
||||
len, fh_type, flags, err);
|
||||
return ERR_PTR(err);
|
||||
}
|
||||
|
||||
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
|
||||
int fh_len, int fh_type)
|
||||
{
|
||||
pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
|
||||
return ERR_PTR(-EACCES);
|
||||
}
|
||||
|
||||
static int ovl_get_name(struct dentry *parent, char *name,
|
||||
struct dentry *child)
|
||||
{
|
||||
/*
|
||||
* ovl_fh_to_dentry() returns connected dir overlay dentries and
|
||||
* ovl_fh_to_parent() is not implemented, so we should not get here.
|
||||
*/
|
||||
WARN_ON_ONCE(1);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static struct dentry *ovl_get_parent(struct dentry *dentry)
|
||||
{
|
||||
/*
|
||||
* ovl_fh_to_dentry() returns connected dir overlay dentries, so we
|
||||
* should not get here.
|
||||
*/
|
||||
WARN_ON_ONCE(1);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
#endif
|
||||
|
||||
const struct export_operations ovl_export_operations = {
|
||||
.encode_fh = ovl_encode_fh,
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
|
||||
.fh_to_dentry = ovl_fh_to_dentry,
|
||||
.fh_to_parent = ovl_fh_to_parent,
|
||||
.get_name = ovl_get_name,
|
||||
.get_parent = ovl_get_parent,
|
||||
#endif
|
||||
};
|
||||
@ -1,874 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/fs.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/xattr.h>
|
||||
#include <linux/posix_acl.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/version.h>
|
||||
#include "overlayfs.h"
|
||||
|
||||
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry;
|
||||
const struct cred *old_cred;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Check for permissions before trying to copy-up. This is redundant
|
||||
* since it will be rechecked later by ->setattr() on upper dentry. But
|
||||
* without this, copy-up can be triggered by just about anybody.
|
||||
*
|
||||
* We don't initialize inode->size, which just means that
|
||||
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
|
||||
* check for a swapfile (which this won't be anyway).
|
||||
*/
|
||||
err = setattr_prepare(dentry, attr);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
err = ovl_copy_up(dentry);
|
||||
if (!err) {
|
||||
upperdentry = ovl_dentry_upper(dentry);
|
||||
|
||||
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
|
||||
attr->ia_valid &= ~ATTR_MODE;
|
||||
|
||||
inode_lock(upperdentry->d_inode);
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
err = notify_change(upperdentry, attr, NULL);
|
||||
revert_creds(old_cred);
|
||||
if (!err)
|
||||
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
|
||||
inode_unlock(upperdentry->d_inode);
|
||||
}
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
|
||||
struct ovl_layer *lower_layer)
|
||||
{
|
||||
bool samefs = ovl_same_sb(dentry->d_sb);
|
||||
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
|
||||
|
||||
if (samefs) {
|
||||
/*
|
||||
* When all layers are on the same fs, all real inode
|
||||
* number are unique, so we use the overlay st_dev,
|
||||
* which is friendly to du -x.
|
||||
*/
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
return 0;
|
||||
} else if (xinobits) {
|
||||
unsigned int shift = 64 - xinobits;
|
||||
/*
|
||||
* All inode numbers of underlying fs should not be using the
|
||||
* high xinobits, so we use high xinobits to partition the
|
||||
* overlay st_ino address space. The high bits holds the fsid
|
||||
* (upper fsid is 0). This way overlay inode numbers are unique
|
||||
* and all inodes use overlay st_dev. Inode numbers are also
|
||||
* persistent for a given layer configuration.
|
||||
*/
|
||||
if (stat->ino >> shift) {
|
||||
pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
|
||||
dentry, stat->ino, xinobits);
|
||||
} else {
|
||||
if (lower_layer)
|
||||
stat->ino |= ((u64)lower_layer->fsid) << shift;
|
||||
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* The inode could not be mapped to a unified st_ino address space */
|
||||
if (S_ISDIR(dentry->d_inode->i_mode)) {
|
||||
/*
|
||||
* Always use the overlay st_dev for directories, so 'find
|
||||
* -xdev' will scan the entire overlay mount and won't cross the
|
||||
* overlay mount boundaries.
|
||||
*
|
||||
* If not all layers are on the same fs the pair {real st_ino;
|
||||
* overlay st_dev} is not unique, so use the non persistent
|
||||
* overlay st_ino for directories.
|
||||
*/
|
||||
stat->dev = dentry->d_sb->s_dev;
|
||||
stat->ino = dentry->d_inode->i_ino;
|
||||
} else if (lower_layer && lower_layer->fsid) {
|
||||
/*
|
||||
* For non-samefs setup, if we cannot map all layers st_ino
|
||||
* to a unified address space, we need to make sure that st_dev
|
||||
* is unique per lower fs. Upper layer uses real st_dev and
|
||||
* lower layers use the unique anonymous bdev assigned to the
|
||||
* lower fs.
|
||||
*/
|
||||
stat->dev = lower_layer->fs->pseudo_dev;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ovl_getattr(const struct path *path, struct kstat *stat,
|
||||
u32 request_mask, unsigned int flags)
|
||||
{
|
||||
struct dentry *dentry = path->dentry;
|
||||
enum ovl_path_type type;
|
||||
struct path realpath;
|
||||
const struct cred *old_cred;
|
||||
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
|
||||
bool samefs = ovl_same_sb(dentry->d_sb);
|
||||
struct ovl_layer *lower_layer = NULL;
|
||||
int err;
|
||||
|
||||
type = ovl_path_real(dentry, &realpath);
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
err = vfs_getattr(&realpath, stat, request_mask, flags);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* For non-dir or same fs, we use st_ino of the copy up origin.
|
||||
* This guaranties constant st_dev/st_ino across copy up.
|
||||
* With xino feature and non-samefs, we use st_ino of the copy up
|
||||
* origin masked with high bits that represent the layer id.
|
||||
*
|
||||
* If lower filesystem supports NFS file handles, this also guaranties
|
||||
* persistent st_ino across mount cycle.
|
||||
*/
|
||||
if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
|
||||
if (!OVL_TYPE_UPPER(type)) {
|
||||
lower_layer = ovl_layer_lower(dentry);
|
||||
} else if (OVL_TYPE_ORIGIN(type)) {
|
||||
struct kstat lowerstat;
|
||||
u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
|
||||
|
||||
ovl_path_lower(dentry, &realpath);
|
||||
err = vfs_getattr(&realpath, &lowerstat,
|
||||
lowermask, flags);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Lower hardlinks may be broken on copy up to different
|
||||
* upper files, so we cannot use the lower origin st_ino
|
||||
* for those different files, even for the same fs case.
|
||||
*
|
||||
* Similarly, several redirected dirs can point to the
|
||||
* same dir on a lower layer. With the "verify_lower"
|
||||
* feature, we do not use the lower origin st_ino, if
|
||||
* we haven't verified that this redirect is unique.
|
||||
*
|
||||
* With inodes index enabled, it is safe to use st_ino
|
||||
* of an indexed origin. The index validates that the
|
||||
* upper hardlink is not broken and that a redirected
|
||||
* dir is the only redirect to that origin.
|
||||
*/
|
||||
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
|
||||
(!ovl_verify_lower(dentry->d_sb) &&
|
||||
(is_dir || lowerstat.nlink == 1))) {
|
||||
stat->ino = lowerstat.ino;
|
||||
lower_layer = ovl_layer_lower(dentry);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
err = ovl_map_dev_ino(dentry, stat, lower_layer);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* It's probably not worth it to count subdirs to get the
|
||||
* correct link count. nlink=1 seems to pacify 'find' and
|
||||
* other utilities.
|
||||
*/
|
||||
if (is_dir && OVL_TYPE_MERGE(type))
|
||||
stat->nlink = 1;
|
||||
|
||||
/*
|
||||
* Return the overlay inode nlinks for indexed upper inodes.
|
||||
* Overlay inode nlink counts the union of the upper hardlinks
|
||||
* and non-covered lower hardlinks. It does not include the upper
|
||||
* index hardlink.
|
||||
*/
|
||||
if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
|
||||
stat->nlink = dentry->d_inode->i_nlink;
|
||||
|
||||
out:
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_permission(struct inode *inode, int mask)
|
||||
{
|
||||
struct inode *upperinode = ovl_inode_upper(inode);
|
||||
struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
|
||||
const struct cred *old_cred;
|
||||
int err;
|
||||
|
||||
/* Careful in RCU walk mode */
|
||||
if (!realinode) {
|
||||
WARN_ON(!(mask & MAY_NOT_BLOCK));
|
||||
return -ECHILD;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check overlay inode with the creds of task and underlying inode
|
||||
* with creds of mounter
|
||||
*/
|
||||
err = generic_permission(inode, mask);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
old_cred = ovl_override_creds(inode->i_sb);
|
||||
if (!upperinode &&
|
||||
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
|
||||
mask &= ~(MAY_WRITE | MAY_APPEND);
|
||||
/* Make sure mounter can read file for copy up later */
|
||||
mask |= MAY_READ;
|
||||
}
|
||||
err = inode_permission(realinode, mask);
|
||||
revert_creds(old_cred);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
|
||||
{
|
||||
struct path realpath;
|
||||
struct inode *realinode;
|
||||
|
||||
ovl_path_real(dentry, &realpath);
|
||||
realinode = realpath.dentry->d_inode;
|
||||
|
||||
if (!realinode->i_op->readlink)
|
||||
return -EINVAL;
|
||||
|
||||
touch_atime(&realpath);
|
||||
|
||||
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
|
||||
}
|
||||
|
||||
static const char *ovl_get_link(struct dentry *dentry,
|
||||
struct inode *inode,
|
||||
struct delayed_call *done)
|
||||
{
|
||||
const struct cred *old_cred;
|
||||
const char *p;
|
||||
|
||||
if (!dentry)
|
||||
return ERR_PTR(-ECHILD);
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
p = vfs_get_link(ovl_dentry_real(dentry), done);
|
||||
revert_creds(old_cred);
|
||||
return p;
|
||||
}
|
||||
|
||||
bool ovl_is_private_xattr(const char *name)
|
||||
{
|
||||
return strncmp(name, OVL_XATTR_PREFIX,
|
||||
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
|
||||
}
|
||||
|
||||
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err;
|
||||
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
|
||||
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
|
||||
const struct cred *old_cred;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return 0;
|
||||
|
||||
err = ovl_want_write(dentry);
|
||||
if (err)
|
||||
goto out;
|
||||
|
||||
if (!value && !upperdentry) {
|
||||
err = vfs_getxattr(realdentry, name, NULL, 0);
|
||||
if (err < 0)
|
||||
goto out_drop_write;
|
||||
}
|
||||
|
||||
if (!upperdentry) {
|
||||
err = ovl_copy_up(dentry);
|
||||
if (err)
|
||||
goto out_drop_write;
|
||||
|
||||
realdentry = ovl_dentry_upper(dentry);
|
||||
}
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
if (value)
|
||||
err = vfs_setxattr(realdentry, name, value, size, flags);
|
||||
else {
|
||||
WARN_ON(flags != XATTR_REPLACE);
|
||||
err = vfs_removexattr(realdentry, name);
|
||||
}
|
||||
revert_creds(old_cred);
|
||||
|
||||
out_drop_write:
|
||||
ovl_drop_write(dentry);
|
||||
out:
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
void *value, size_t size)
|
||||
{
|
||||
ssize_t res;
|
||||
const struct cred *old_cred;
|
||||
struct dentry *realdentry =
|
||||
ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
res = vfs_getxattr(realdentry, name, value, size);
|
||||
revert_creds(old_cred);
|
||||
return res;
|
||||
}
|
||||
|
||||
static bool ovl_can_list(const char *s)
|
||||
{
|
||||
/* List all non-trusted xatts */
|
||||
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
|
||||
return true;
|
||||
|
||||
/* Never list trusted.overlay, list other trusted for superuser only */
|
||||
return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
|
||||
}
|
||||
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
|
||||
{
|
||||
struct dentry *realdentry = ovl_dentry_real(dentry);
|
||||
ssize_t res;
|
||||
size_t len;
|
||||
char *s;
|
||||
const struct cred *old_cred;
|
||||
|
||||
old_cred = ovl_override_creds(dentry->d_sb);
|
||||
res = vfs_listxattr(realdentry, list, size);
|
||||
revert_creds(old_cred);
|
||||
if (res <= 0 || size == 0)
|
||||
return res;
|
||||
|
||||
/* filter out private xattrs */
|
||||
for (s = list, len = res; len;) {
|
||||
size_t slen = strnlen(s, len) + 1;
|
||||
|
||||
/* underlying fs providing us with an broken xattr list? */
|
||||
if (WARN_ON(slen > len))
|
||||
return -EIO;
|
||||
|
||||
len -= slen;
|
||||
if (!ovl_can_list(s)) {
|
||||
res -= slen;
|
||||
memmove(s, s + slen, len);
|
||||
} else {
|
||||
s += slen;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
struct posix_acl *ovl_get_acl(struct inode *inode, int type)
|
||||
{
|
||||
struct inode *realinode = ovl_inode_real(inode);
|
||||
const struct cred *old_cred;
|
||||
struct posix_acl *acl;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
|
||||
return NULL;
|
||||
|
||||
old_cred = ovl_override_creds(inode->i_sb);
|
||||
acl = get_acl(realinode, type);
|
||||
revert_creds(old_cred);
|
||||
|
||||
return acl;
|
||||
}
|
||||
|
||||
static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
|
||||
{
|
||||
/* Copy up of disconnected dentry does not set upper alias */
|
||||
if (ovl_dentry_upper(dentry) &&
|
||||
(ovl_dentry_has_upper_alias(dentry) ||
|
||||
(dentry->d_flags & DCACHE_DISCONNECTED)))
|
||||
return false;
|
||||
|
||||
if (special_file(d_inode(dentry)->i_mode))
|
||||
return false;
|
||||
|
||||
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
|
||||
{
|
||||
int err = 0;
|
||||
|
||||
/* NOCOPYUPW */
|
||||
return err;
|
||||
|
||||
if (ovl_open_need_copy_up(dentry, file_flags)) {
|
||||
err = ovl_want_write(dentry);
|
||||
if (!err) {
|
||||
err = ovl_copy_up_flags(dentry, file_flags);
|
||||
ovl_drop_write(dentry);
|
||||
}
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
|
||||
{
|
||||
if (flags & S_ATIME) {
|
||||
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
|
||||
struct path upperpath = {
|
||||
.mnt = ofs->upper_mnt,
|
||||
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
|
||||
};
|
||||
|
||||
if (upperpath.dentry) {
|
||||
touch_atime(&upperpath);
|
||||
inode->i_atime = d_inode(upperpath.dentry)->i_atime;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct inode_operations ovl_file_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.permission = ovl_permission,
|
||||
.getattr = ovl_getattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.get_acl = ovl_get_acl,
|
||||
.update_time = ovl_update_time,
|
||||
};
|
||||
|
||||
static const struct inode_operations ovl_symlink_inode_operations = {
|
||||
.setattr = ovl_setattr,
|
||||
.get_link = ovl_get_link,
|
||||
.readlink = ovl_readlink,
|
||||
.getattr = ovl_getattr,
|
||||
.listxattr = ovl_listxattr,
|
||||
.update_time = ovl_update_time,
|
||||
};
|
||||
|
||||
/*
|
||||
* It is possible to stack overlayfs instance on top of another
|
||||
* overlayfs instance as lower layer. We need to annonate the
|
||||
* stackable i_mutex locks according to stack level of the super
|
||||
* block instance. An overlayfs instance can never be in stack
|
||||
* depth 0 (there is always a real fs below it). An overlayfs
|
||||
* inode lock will use the lockdep annotaion ovl_i_mutex_key[depth].
|
||||
*
|
||||
* For example, here is a snip from /proc/lockdep_chains after
|
||||
* dir_iterate of nested overlayfs:
|
||||
*
|
||||
* [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
|
||||
* [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
|
||||
* [...] &type->i_mutex_dir_key (stack_depth=0)
|
||||
*/
|
||||
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
|
||||
|
||||
static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
|
||||
{
|
||||
#ifdef CONFIG_LOCKDEP
|
||||
static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
|
||||
static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
|
||||
static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
|
||||
|
||||
int depth = inode->i_sb->s_stack_depth - 1;
|
||||
|
||||
if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
|
||||
depth = 0;
|
||||
|
||||
if (S_ISDIR(inode->i_mode))
|
||||
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
|
||||
else
|
||||
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
|
||||
|
||||
lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
|
||||
unsigned long ino, int fsid)
|
||||
{
|
||||
int xinobits = ovl_xino_bits(inode->i_sb);
|
||||
|
||||
/*
|
||||
* When NFS export is enabled and d_ino is consistent with st_ino
|
||||
* (samefs or i_ino has enough bits to encode layer), set the same
|
||||
* value used for d_ino to i_ino, because nfsd readdirplus compares
|
||||
* d_ino values to i_ino values of child entries. When called from
|
||||
* ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
|
||||
* upper inode i_ino on ovl_inode_init() or ovl_inode_update().
|
||||
*/
|
||||
if (inode->i_sb->s_export_op &&
|
||||
(ovl_same_sb(inode->i_sb) || xinobits)) {
|
||||
inode->i_ino = ino;
|
||||
if (xinobits && fsid && !(ino >> (64 - xinobits)))
|
||||
inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
|
||||
} else {
|
||||
inode->i_ino = get_next_ino();
|
||||
}
|
||||
inode->i_mode = mode;
|
||||
inode->i_flags |= S_NOCMTIME;
|
||||
#ifdef CONFIG_FS_POSIX_ACL
|
||||
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
|
||||
#endif
|
||||
|
||||
ovl_lockdep_annotate_inode_mutex_key(inode);
|
||||
|
||||
switch (mode & S_IFMT) {
|
||||
case S_IFREG:
|
||||
inode->i_op = &ovl_file_inode_operations;
|
||||
break;
|
||||
|
||||
case S_IFDIR:
|
||||
inode->i_op = &ovl_dir_inode_operations;
|
||||
inode->i_fop = &ovl_dir_operations;
|
||||
break;
|
||||
|
||||
case S_IFLNK:
|
||||
inode->i_op = &ovl_symlink_inode_operations;
|
||||
break;
|
||||
|
||||
default:
|
||||
inode->i_op = &ovl_file_inode_operations;
|
||||
init_special_inode(inode, mode, rdev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* With inodes index enabled, an overlay inode nlink counts the union of upper
|
||||
* hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
|
||||
* upper inode, the following nlink modifying operations can happen:
|
||||
*
|
||||
* 1. Lower hardlink copy up
|
||||
* 2. Upper hardlink created, unlinked or renamed over
|
||||
* 3. Lower hardlink whiteout or renamed over
|
||||
*
|
||||
* For the first, copy up case, the union nlink does not change, whether the
|
||||
* operation succeeds or fails, but the upper inode nlink may change.
|
||||
* Therefore, before copy up, we store the union nlink value relative to the
|
||||
* lower inode nlink in the index inode xattr trusted.overlay.nlink.
|
||||
*
|
||||
* For the second, upper hardlink case, the union nlink should be incremented
|
||||
* or decremented IFF the operation succeeds, aligned with nlink change of the
|
||||
* upper inode. Therefore, before link/unlink/rename, we store the union nlink
|
||||
* value relative to the upper inode nlink in the index inode.
|
||||
*
|
||||
* For the last, lower cover up case, we simplify things by preceding the
|
||||
* whiteout or cover up with copy up. This makes sure that there is an index
|
||||
* upper inode where the nlink xattr can be stored before the copied up upper
|
||||
* entry is unlink.
|
||||
*/
|
||||
#define OVL_NLINK_ADD_UPPER (1 << 0)
|
||||
|
||||
/*
|
||||
* On-disk format for indexed nlink:
|
||||
*
|
||||
* nlink relative to the upper inode - "U[+-]NUM"
|
||||
* nlink relative to the lower inode - "L[+-]NUM"
|
||||
*/
|
||||
|
||||
static int ovl_set_nlink_common(struct dentry *dentry,
|
||||
struct dentry *realdentry, const char *format)
|
||||
{
|
||||
struct inode *inode = d_inode(dentry);
|
||||
struct inode *realinode = d_inode(realdentry);
|
||||
char buf[13];
|
||||
int len;
|
||||
|
||||
len = snprintf(buf, sizeof(buf), format,
|
||||
(int) (inode->i_nlink - realinode->i_nlink));
|
||||
|
||||
if (WARN_ON(len >= sizeof(buf)))
|
||||
return -EIO;
|
||||
|
||||
return ovl_do_setxattr(ovl_dentry_upper(dentry),
|
||||
OVL_XATTR_NLINK, buf, len, 0);
|
||||
}
|
||||
|
||||
int ovl_set_nlink_upper(struct dentry *dentry)
|
||||
{
|
||||
return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
|
||||
}
|
||||
|
||||
int ovl_set_nlink_lower(struct dentry *dentry)
|
||||
{
|
||||
return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
|
||||
}
|
||||
|
||||
unsigned int ovl_get_nlink(struct dentry *lowerdentry,
|
||||
struct dentry *upperdentry,
|
||||
unsigned int fallback)
|
||||
{
|
||||
int nlink_diff;
|
||||
int nlink;
|
||||
char buf[13];
|
||||
int err;
|
||||
|
||||
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
|
||||
return fallback;
|
||||
|
||||
err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
|
||||
if (err < 0)
|
||||
goto fail;
|
||||
|
||||
buf[err] = '\0';
|
||||
if ((buf[0] != 'L' && buf[0] != 'U') ||
|
||||
(buf[1] != '+' && buf[1] != '-'))
|
||||
goto fail;
|
||||
|
||||
err = kstrtoint(buf + 1, 10, &nlink_diff);
|
||||
if (err < 0)
|
||||
goto fail;
|
||||
|
||||
nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
|
||||
nlink += nlink_diff;
|
||||
|
||||
if (nlink <= 0)
|
||||
goto fail;
|
||||
|
||||
return nlink;
|
||||
|
||||
fail:
|
||||
pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
|
||||
upperdentry, err);
|
||||
return fallback;
|
||||
}
|
||||
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
|
||||
{
|
||||
struct inode *inode;
|
||||
|
||||
inode = new_inode(sb);
|
||||
if (inode)
|
||||
ovl_fill_inode(inode, mode, rdev, 0, 0);
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
static int ovl_inode_test(struct inode *inode, void *data)
|
||||
{
|
||||
return inode->i_private == data;
|
||||
}
|
||||
|
||||
static int ovl_inode_set(struct inode *inode, void *data)
|
||||
{
|
||||
inode->i_private = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
|
||||
struct dentry *upperdentry, bool strict)
|
||||
{
|
||||
/*
|
||||
* For directories, @strict verify from lookup path performs consistency
|
||||
* checks, so NULL lower/upper in dentry must match NULL lower/upper in
|
||||
* inode. Non @strict verify from NFS handle decode path passes NULL for
|
||||
* 'unknown' lower/upper.
|
||||
*/
|
||||
if (S_ISDIR(inode->i_mode) && strict) {
|
||||
/* Real lower dir moved to upper layer under us? */
|
||||
if (!lowerdentry && ovl_inode_lower(inode))
|
||||
return false;
|
||||
|
||||
/* Lookup of an uncovered redirect origin? */
|
||||
if (!upperdentry && ovl_inode_upper(inode))
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
|
||||
* This happens when finding a copied up overlay inode for a renamed
|
||||
* or hardlinked overlay dentry and lower dentry cannot be followed
|
||||
* by origin because lower fs does not support file handles.
|
||||
*/
|
||||
if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
|
||||
* This happens when finding a lower alias for a copied up hard link.
|
||||
*/
|
||||
if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
|
||||
bool is_upper)
|
||||
{
|
||||
struct inode *inode, *key = d_inode(real);
|
||||
|
||||
inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
|
||||
if (!inode)
|
||||
return NULL;
|
||||
|
||||
if (!ovl_verify_inode(inode, is_upper ? NULL : real,
|
||||
is_upper ? real : NULL, false)) {
|
||||
iput(inode);
|
||||
return ERR_PTR(-ESTALE);
|
||||
}
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
/*
|
||||
* Does overlay inode need to be hashed by lower inode?
|
||||
*/
|
||||
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
|
||||
struct dentry *lower, struct dentry *index)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
|
||||
/* No, if pure upper */
|
||||
if (!lower)
|
||||
return false;
|
||||
|
||||
/* Yes, if already indexed */
|
||||
if (index)
|
||||
return true;
|
||||
|
||||
/* Yes, if won't be copied up */
|
||||
if (!ofs->upper_mnt)
|
||||
return true;
|
||||
|
||||
/* No, if lower hardlink is or will be broken on copy up */
|
||||
if ((upper || !ovl_indexdir(sb)) &&
|
||||
!d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
|
||||
return false;
|
||||
|
||||
/* No, if non-indexed upper with NFS export */
|
||||
if (sb->s_export_op && upper)
|
||||
return false;
|
||||
|
||||
/* Otherwise, hash by lower inode for fsnotify */
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
|
||||
struct inode *key)
|
||||
{
|
||||
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0)
|
||||
return newinode ? inode_insert5(newinode, (unsigned long) key,
|
||||
ovl_inode_test, ovl_inode_set, key) :
|
||||
#else
|
||||
return
|
||||
#endif
|
||||
iget5_locked(sb, (unsigned long) key,
|
||||
ovl_inode_test, ovl_inode_set, key);
|
||||
}
|
||||
|
||||
struct inode *ovl_get_inode(struct super_block *sb,
|
||||
struct ovl_inode_params *oip)
|
||||
{
|
||||
struct dentry *upperdentry = oip->upperdentry;
|
||||
struct ovl_path *lowerpath = oip->lowerpath;
|
||||
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
|
||||
struct inode *inode;
|
||||
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
|
||||
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
|
||||
oip->index);
|
||||
int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
|
||||
bool is_dir;
|
||||
unsigned long ino = 0;
|
||||
|
||||
if (!realinode)
|
||||
realinode = d_inode(lowerdentry);
|
||||
|
||||
/*
|
||||
* Copy up origin (lower) may exist for non-indexed upper, but we must
|
||||
* not use lower as hash key if this is a broken hardlink.
|
||||
*/
|
||||
is_dir = S_ISDIR(realinode->i_mode);
|
||||
if (upperdentry || bylower) {
|
||||
struct inode *key = d_inode(bylower ? lowerdentry :
|
||||
upperdentry);
|
||||
unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
|
||||
|
||||
inode = ovl_iget5(sb, oip->newinode, key);
|
||||
if (!inode)
|
||||
goto out_nomem;
|
||||
if (!(inode->i_state & I_NEW)) {
|
||||
/*
|
||||
* Verify that the underlying files stored in the inode
|
||||
* match those in the dentry.
|
||||
*/
|
||||
if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
|
||||
true)) {
|
||||
iput(inode);
|
||||
inode = ERR_PTR(-ESTALE);
|
||||
goto out;
|
||||
}
|
||||
|
||||
dput(upperdentry);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Recalculate nlink for non-dir due to indexing */
|
||||
if (!is_dir)
|
||||
nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
|
||||
set_nlink(inode, nlink);
|
||||
ino = key->i_ino;
|
||||
} else {
|
||||
/* Lower hardlink that will be broken on copy up */
|
||||
inode = new_inode(sb);
|
||||
if (!inode)
|
||||
goto out_nomem;
|
||||
}
|
||||
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
|
||||
ovl_inode_init(inode, upperdentry, lowerdentry);
|
||||
|
||||
if (upperdentry && ovl_is_impuredir(upperdentry))
|
||||
ovl_set_flag(OVL_IMPURE, inode);
|
||||
|
||||
if (oip->index)
|
||||
ovl_set_flag(OVL_INDEX, inode);
|
||||
|
||||
/* Check for non-merge dir that may have whiteouts */
|
||||
if (is_dir) {
|
||||
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
|
||||
ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
|
||||
ovl_set_flag(OVL_WHITEOUTS, inode);
|
||||
}
|
||||
}
|
||||
|
||||
if (inode->i_state & I_NEW)
|
||||
unlock_new_inode(inode);
|
||||
out:
|
||||
return inode;
|
||||
|
||||
out_nomem:
|
||||
inode = ERR_PTR(-ENOMEM);
|
||||
goto out;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,381 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/uuid.h>
|
||||
#include "ovl_entry.h"
|
||||
|
||||
enum ovl_path_type {
|
||||
__OVL_PATH_UPPER = (1 << 0),
|
||||
__OVL_PATH_MERGE = (1 << 1),
|
||||
__OVL_PATH_ORIGIN = (1 << 2),
|
||||
};
|
||||
|
||||
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
|
||||
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
|
||||
#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN)
|
||||
|
||||
#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay."
|
||||
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque"
|
||||
#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect"
|
||||
#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin"
|
||||
#define OVL_XATTR_IMPURE OVL_XATTR_PREFIX "impure"
|
||||
#define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink"
|
||||
#define OVL_XATTR_UPPER OVL_XATTR_PREFIX "upper"
|
||||
|
||||
enum ovl_inode_flag {
|
||||
/* Pure upper dir that may contain non pure upper entries */
|
||||
OVL_IMPURE,
|
||||
/* Non-merge dir that may contain whiteout entries */
|
||||
OVL_WHITEOUTS,
|
||||
OVL_INDEX,
|
||||
};
|
||||
|
||||
enum ovl_entry_flag {
|
||||
OVL_E_UPPER_ALIAS,
|
||||
OVL_E_OPAQUE,
|
||||
OVL_E_CONNECTED,
|
||||
};
|
||||
|
||||
/*
|
||||
* The tuple (fh,uuid) is a universal unique identifier for a copy up origin,
|
||||
* where:
|
||||
* origin.fh - exported file handle of the lower file
|
||||
* origin.uuid - uuid of the lower filesystem
|
||||
*/
|
||||
#define OVL_FH_VERSION 0
|
||||
#define OVL_FH_MAGIC 0xfb
|
||||
|
||||
/* CPU byte order required for fid decoding: */
|
||||
#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0)
|
||||
#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1)
|
||||
/* Is the real inode encoded in fid an upper inode? */
|
||||
#define OVL_FH_FLAG_PATH_UPPER (1 << 2)
|
||||
|
||||
#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \
|
||||
OVL_FH_FLAG_PATH_UPPER)
|
||||
|
||||
#if defined(__LITTLE_ENDIAN)
|
||||
#define OVL_FH_FLAG_CPU_ENDIAN 0
|
||||
#elif defined(__BIG_ENDIAN)
|
||||
#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN
|
||||
#else
|
||||
#error Endianness not defined
|
||||
#endif
|
||||
|
||||
/* The type returned by overlay exportfs ops when encoding an ovl_fh handle */
|
||||
#define OVL_FILEID 0xfb
|
||||
|
||||
/* On-disk and in-memeory format for redirect by file handle */
|
||||
struct ovl_fh {
|
||||
u8 version; /* 0 */
|
||||
u8 magic; /* 0xfb */
|
||||
u8 len; /* size of this header + size of fid */
|
||||
u8 flags; /* OVL_FH_FLAG_* */
|
||||
u8 type; /* fid_type of fid */
|
||||
uuid_t uuid; /* uuid of filesystem */
|
||||
u8 fid[0]; /* file identifier */
|
||||
} __packed;
|
||||
|
||||
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_rmdir(dir, dentry);
|
||||
|
||||
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_unlink(dir, dentry, NULL);
|
||||
|
||||
pr_debug("unlink(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
|
||||
struct dentry *new_dentry)
|
||||
{
|
||||
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
|
||||
|
||||
pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode)
|
||||
{
|
||||
int err = vfs_create(dir, dentry, mode, true);
|
||||
|
||||
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode)
|
||||
{
|
||||
int err = vfs_mkdir(dir, dentry, mode);
|
||||
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t dev)
|
||||
{
|
||||
int err = vfs_mknod(dir, dentry, mode, dev);
|
||||
|
||||
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
|
||||
const char *oldname)
|
||||
{
|
||||
int err = vfs_symlink(dir, dentry, oldname);
|
||||
|
||||
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
|
||||
const void *value, size_t size, int flags)
|
||||
{
|
||||
int err = vfs_setxattr(dentry, name, value, size, flags);
|
||||
pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n",
|
||||
dentry, name, min((int)size, 48), value, size, flags, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
|
||||
{
|
||||
int err = vfs_removexattr(dentry, name);
|
||||
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
|
||||
struct inode *newdir, struct dentry *newdentry,
|
||||
unsigned int flags)
|
||||
{
|
||||
int err;
|
||||
|
||||
pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
|
||||
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
|
||||
if (err) {
|
||||
pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
|
||||
olddentry, newdentry, err);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
|
||||
{
|
||||
int err = vfs_whiteout(dir, dentry);
|
||||
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode)
|
||||
{
|
||||
struct dentry *ret = vfs_tmpfile(dentry, mode, 0);
|
||||
int err = PTR_ERR_OR_ZERO(ret);
|
||||
|
||||
pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* util.c */
|
||||
int ovl_want_write(struct dentry *dentry);
|
||||
void ovl_drop_write(struct dentry *dentry);
|
||||
struct dentry *ovl_workdir(struct dentry *dentry);
|
||||
const struct cred *ovl_override_creds(struct super_block *sb);
|
||||
struct super_block *ovl_same_sb(struct super_block *sb);
|
||||
int ovl_can_decode_fh(struct super_block *sb);
|
||||
struct dentry *ovl_indexdir(struct super_block *sb);
|
||||
bool ovl_index_all(struct super_block *sb);
|
||||
bool ovl_verify_lower(struct super_block *sb);
|
||||
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
|
||||
bool ovl_dentry_remote(struct dentry *dentry);
|
||||
bool ovl_dentry_weird(struct dentry *dentry);
|
||||
enum ovl_path_type ovl_path_type(struct dentry *dentry);
|
||||
void ovl_path_upper(struct dentry *dentry, struct path *path);
|
||||
void ovl_path_lower(struct dentry *dentry, struct path *path);
|
||||
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_dentry_upper(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_lower(struct dentry *dentry);
|
||||
struct ovl_layer *ovl_layer_lower(struct dentry *dentry);
|
||||
struct dentry *ovl_dentry_real(struct dentry *dentry);
|
||||
struct dentry *ovl_i_dentry_upper(struct inode *inode);
|
||||
struct inode *ovl_inode_upper(struct inode *inode);
|
||||
struct inode *ovl_inode_lower(struct inode *inode);
|
||||
struct inode *ovl_inode_real(struct inode *inode);
|
||||
struct ovl_dir_cache *ovl_dir_cache(struct inode *inode);
|
||||
void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache);
|
||||
void ovl_dentry_set_flag(unsigned long flag, struct dentry *dentry);
|
||||
void ovl_dentry_clear_flag(unsigned long flag, struct dentry *dentry);
|
||||
bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
|
||||
bool ovl_dentry_is_opaque(struct dentry *dentry);
|
||||
bool ovl_dentry_is_whiteout(struct dentry *dentry);
|
||||
void ovl_dentry_set_opaque(struct dentry *dentry);
|
||||
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
|
||||
void ovl_dentry_set_upper_alias(struct dentry *dentry);
|
||||
bool ovl_redirect_dir(struct super_block *sb);
|
||||
const char *ovl_dentry_get_redirect(struct dentry *dentry);
|
||||
void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect);
|
||||
void ovl_inode_init(struct inode *inode, struct dentry *upperdentry,
|
||||
struct dentry *lowerdentry);
|
||||
void ovl_inode_update(struct inode *inode, struct dentry *upperdentry);
|
||||
void ovl_dentry_version_inc(struct dentry *dentry, bool impurity);
|
||||
u64 ovl_dentry_version_get(struct dentry *dentry);
|
||||
bool ovl_is_whiteout(struct dentry *dentry);
|
||||
struct file *ovl_path_open(struct path *path, int flags);
|
||||
int ovl_copy_up_start(struct dentry *dentry);
|
||||
void ovl_copy_up_end(struct dentry *dentry);
|
||||
bool ovl_check_origin_xattr(struct dentry *dentry);
|
||||
bool ovl_check_dir_xattr(struct dentry *dentry, const char *name);
|
||||
int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry,
|
||||
const char *name, const void *value, size_t size,
|
||||
int xerr);
|
||||
int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry);
|
||||
void ovl_set_flag(unsigned long flag, struct inode *inode);
|
||||
void ovl_clear_flag(unsigned long flag, struct inode *inode);
|
||||
bool ovl_test_flag(unsigned long flag, struct inode *inode);
|
||||
bool ovl_inuse_trylock(struct dentry *dentry);
|
||||
void ovl_inuse_unlock(struct dentry *dentry);
|
||||
bool ovl_need_index(struct dentry *dentry);
|
||||
int ovl_nlink_start(struct dentry *dentry, bool *locked);
|
||||
void ovl_nlink_end(struct dentry *dentry, bool locked);
|
||||
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir);
|
||||
|
||||
static inline bool ovl_is_impuredir(struct dentry *dentry)
|
||||
{
|
||||
return ovl_check_dir_xattr(dentry, OVL_XATTR_IMPURE);
|
||||
}
|
||||
|
||||
static inline unsigned int ovl_xino_bits(struct super_block *sb)
|
||||
{
|
||||
struct ovl_fs *ofs = sb->s_fs_info;
|
||||
|
||||
return ofs->xino_bits;
|
||||
}
|
||||
|
||||
|
||||
/* namei.c */
|
||||
int ovl_check_fh_len(struct ovl_fh *fh, int fh_len);
|
||||
struct dentry *ovl_decode_real_fh(struct ovl_fh *fh, struct vfsmount *mnt,
|
||||
bool connected);
|
||||
int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
|
||||
struct dentry *upperdentry, struct ovl_path **stackp);
|
||||
int ovl_verify_set_fh(struct dentry *dentry, const char *name,
|
||||
struct dentry *real, bool is_upper, bool set);
|
||||
struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index);
|
||||
int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index);
|
||||
int ovl_get_index_name(struct dentry *origin, struct qstr *name);
|
||||
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
|
||||
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
|
||||
struct dentry *origin, bool verify);
|
||||
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
|
||||
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
|
||||
unsigned int flags);
|
||||
bool ovl_lower_positive(struct dentry *dentry);
|
||||
|
||||
static inline int ovl_verify_origin(struct dentry *upper,
|
||||
struct dentry *origin, bool set)
|
||||
{
|
||||
return ovl_verify_set_fh(upper, OVL_XATTR_ORIGIN, origin, false, set);
|
||||
}
|
||||
|
||||
static inline int ovl_verify_upper(struct dentry *index,
|
||||
struct dentry *upper, bool set)
|
||||
{
|
||||
return ovl_verify_set_fh(index, OVL_XATTR_UPPER, upper, true, set);
|
||||
}
|
||||
|
||||
/* readdir.c */
|
||||
extern const struct file_operations ovl_dir_operations;
|
||||
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
|
||||
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
|
||||
void ovl_cache_free(struct list_head *list);
|
||||
void ovl_dir_cache_free(struct inode *inode);
|
||||
int ovl_check_d_type_supported(struct path *realpath);
|
||||
void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt,
|
||||
struct dentry *dentry, int level);
|
||||
int ovl_indexdir_cleanup(struct ovl_fs *ofs);
|
||||
|
||||
/* inode.c */
|
||||
int ovl_set_nlink_upper(struct dentry *dentry);
|
||||
int ovl_set_nlink_lower(struct dentry *dentry);
|
||||
unsigned int ovl_get_nlink(struct dentry *lowerdentry,
|
||||
struct dentry *upperdentry,
|
||||
unsigned int fallback);
|
||||
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
|
||||
int ovl_getattr(const struct path *path, struct kstat *stat,
|
||||
u32 request_mask, unsigned int flags);
|
||||
int ovl_permission(struct inode *inode, int mask);
|
||||
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
const void *value, size_t size, int flags);
|
||||
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
|
||||
void *value, size_t size);
|
||||
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
|
||||
struct posix_acl *ovl_get_acl(struct inode *inode, int type);
|
||||
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
|
||||
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags);
|
||||
bool ovl_is_private_xattr(const char *name);
|
||||
|
||||
struct ovl_inode_params {
|
||||
struct inode *newinode;
|
||||
struct dentry *upperdentry;
|
||||
struct ovl_path *lowerpath;
|
||||
struct dentry *index;
|
||||
unsigned int numlower;
|
||||
};
|
||||
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev);
|
||||
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
|
||||
bool is_upper);
|
||||
struct inode *ovl_get_inode(struct super_block *sb,
|
||||
struct ovl_inode_params *oip);
|
||||
static inline void ovl_copyattr(struct inode *from, struct inode *to)
|
||||
{
|
||||
to->i_uid = from->i_uid;
|
||||
to->i_gid = from->i_gid;
|
||||
to->i_mode = from->i_mode;
|
||||
to->i_atime = from->i_atime;
|
||||
to->i_mtime = from->i_mtime;
|
||||
to->i_ctime = from->i_ctime;
|
||||
}
|
||||
|
||||
/* dir.c */
|
||||
extern const struct inode_operations ovl_dir_inode_operations;
|
||||
int ovl_cleanup_and_whiteout(struct dentry *workdir, struct inode *dir,
|
||||
struct dentry *dentry);
|
||||
struct ovl_cattr {
|
||||
dev_t rdev;
|
||||
umode_t mode;
|
||||
const char *link;
|
||||
struct dentry *hardlink;
|
||||
};
|
||||
|
||||
#define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) })
|
||||
|
||||
struct dentry *ovl_create_real(struct inode *dir, struct dentry *newdentry,
|
||||
struct ovl_cattr *attr);
|
||||
int ovl_cleanup(struct inode *dir, struct dentry *dentry);
|
||||
struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr);
|
||||
|
||||
/* copy_up.c */
|
||||
int ovl_copy_up(struct dentry *dentry);
|
||||
int ovl_copy_up_flags(struct dentry *dentry, int flags);
|
||||
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
|
||||
int ovl_set_attr(struct dentry *upper, struct kstat *stat);
|
||||
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper);
|
||||
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
|
||||
struct dentry *upper);
|
||||
|
||||
/* export.c */
|
||||
extern const struct export_operations ovl_export_operations;
|
||||
@ -1,111 +0,0 @@
|
||||
/*
|
||||
*
|
||||
* Copyright (C) 2011 Novell Inc.
|
||||
* Copyright (C) 2016 Red Hat, Inc.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published by
|
||||
* the Free Software Foundation.
|
||||
*/
|
||||
|
||||
struct ovl_config {
|
||||
char *lowerdir;
|
||||
char *upperdir;
|
||||
char *workdir;
|
||||
bool default_permissions;
|
||||
bool redirect_dir;
|
||||
bool redirect_follow;
|
||||
const char *redirect_mode;
|
||||
bool index;
|
||||
bool nfs_export;
|
||||
int xino;
|
||||
};
|
||||
|
||||
struct ovl_sb {
|
||||
struct super_block *sb;
|
||||
dev_t pseudo_dev;
|
||||
};
|
||||
|
||||
struct ovl_layer {
|
||||
struct vfsmount *mnt;
|
||||
struct ovl_sb *fs;
|
||||
/* Index of this layer in fs root (upper idx == 0) */
|
||||
int idx;
|
||||
/* One fsid per unique underlying sb (upper fsid == 0) */
|
||||
int fsid;
|
||||
};
|
||||
|
||||
struct ovl_path {
|
||||
struct ovl_layer *layer;
|
||||
struct dentry *dentry;
|
||||
};
|
||||
|
||||
/* private information held for overlayfs's superblock */
|
||||
struct ovl_fs {
|
||||
struct vfsmount *upper_mnt;
|
||||
unsigned int numlower;
|
||||
/* Number of unique lower sb that differ from upper sb */
|
||||
unsigned int numlowerfs;
|
||||
struct ovl_layer *lower_layers;
|
||||
struct ovl_sb *lower_fs;
|
||||
/* workbasedir is the path at workdir= mount option */
|
||||
struct dentry *workbasedir;
|
||||
/* workdir is the 'work' directory under workbasedir */
|
||||
struct dentry *workdir;
|
||||
/* index directory listing overlay inodes by origin file handle */
|
||||
struct dentry *indexdir;
|
||||
long namelen;
|
||||
/* pathnames of lower and upper dirs, for show_options */
|
||||
struct ovl_config config;
|
||||
/* creds of process who forced instantiation of super block */
|
||||
const struct cred *creator_cred;
|
||||
bool tmpfile;
|
||||
bool noxattr;
|
||||
/* Did we take the inuse lock? */
|
||||
bool upperdir_locked;
|
||||
bool workdir_locked;
|
||||
/* Inode numbers in all layers do not use the high xino_bits */
|
||||
unsigned int xino_bits;
|
||||
};
|
||||
|
||||
/* private information held for every overlayfs dentry */
|
||||
struct ovl_entry {
|
||||
union {
|
||||
struct {
|
||||
unsigned long flags;
|
||||
};
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
unsigned numlower;
|
||||
struct ovl_path lowerstack[];
|
||||
};
|
||||
|
||||
struct ovl_entry *ovl_alloc_entry(unsigned int numlower);
|
||||
|
||||
static inline struct ovl_entry *OVL_E(struct dentry *dentry)
|
||||
{
|
||||
return (struct ovl_entry *) dentry->d_fsdata;
|
||||
}
|
||||
|
||||
struct ovl_inode {
|
||||
struct ovl_dir_cache *cache;
|
||||
const char *redirect;
|
||||
u64 version;
|
||||
unsigned long flags;
|
||||
struct inode vfs_inode;
|
||||
struct dentry *__upperdentry;
|
||||
struct inode *lower;
|
||||
|
||||
/* synchronize copy up and more */
|
||||
struct mutex lock;
|
||||
};
|
||||
|
||||
static inline struct ovl_inode *OVL_I(struct inode *inode)
|
||||
{
|
||||
return container_of(inode, struct ovl_inode, vfs_inode);
|
||||
}
|
||||
|
||||
static inline struct dentry *ovl_upperdentry_dereference(struct ovl_inode *oi)
|
||||
{
|
||||
return READ_ONCE(oi->__upperdentry);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user