#!/bin/bash
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi
#      Copyright (C) 2014  RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
# Note that the script does not output anything unless an error occurs.
#
# Options:
#   -c <cpus>      CPU list to reserve (default: second half of NUMA node 0)
#   -m <mem>       memory to reserve (default: 512M@0)
#   -f <facility>  facility passed to mcklogd (default: LOG_LOCAL6)
#   -o <owner>     owner given to /dev/mcd* and /dev/mcos* (default: logname)
#   -i <interval>  positive integer passed to mcklogd -i (default: 1)
#   -k <mode>      0..2, passed as ksyslogd= kernel argument; mcklogd is
#                  started when it is non-zero (default: 0)

prefix="@prefix@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"

mem="512M@0"
cpus=""
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=$(logname 2> /dev/null)

# Print the given message on stderr and terminate with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeed if lsmod output contains the given pattern.
module_loaded() {
  lsmod | grep -q "$1"
}

# Succeed if /proc/mounts contains the given pattern.
is_mounted() {
  grep -q "$1" /proc/mounts
}

# Succeed if the numeric IRQ $1 has a row in /proc/interrupts.
irq_in_use() {
  awk -v irq="$1" '$1 == (irq ":") { found = 1 } END { exit !found }' \
    /proc/interrupts
}

# Print the base names of the entries directly under directory $1 whose
# name is prefix $2 followed by a digit (e.g. cpu0, node1).
list_numbered_entries() {
  find "$1"/* -maxdepth 0 -name "$2[0123456789]*" -printf "%f "
}

# Remember whether an irqbalance service is currently active so that it can
# be stopped now and restarted (with McKernel resources banned) at the end.
if systemctl status irqbalance_mck.service 2> /dev/null \
     | grep -q -E 'Active: active' \
   || systemctl status irqbalance.service 2> /dev/null \
     | grep -q -E 'Active: active'; then
  irqbalance_used="yes"
else
  irqbalance_used="no"
fi

# getopts runs in silent mode (leading ':'): for an unknown option OPT is
# '?', for a missing argument OPT is ':'; in both cases the offending option
# character is in OPTARG.
while getopts :i:k:c:m:o:f: OPT; do
  case "${OPT}" in
    f)
      facility=${OPTARG}
      ;;
    o)
      chown_option=${OPTARG}
      ;;
    i)
      INTERVAL=${OPTARG}
      # expr exits with status >= 2 when the operand is not an integer.
      expr "${INTERVAL}" + 1 > /dev/null 2>&1
      if [ $? -ge 2 ]; then
        die "invalid -i value"
      fi
      if [ "${INTERVAL}" -le 0 ]; then
        die "invalid -i value"
      fi
      ;;
    k)
      LOGMODE=${OPTARG}
      expr "${LOGMODE}" + 1 > /dev/null 2>&1
      if [ $? -ge 2 ]; then
        die "invalid -k value"
      fi
      if [ "${LOGMODE}" -lt 0 ] || [ "${LOGMODE}" -gt 2 ]; then
        die "invalid -k value"
      fi
      ;;
    c)
      cpus=${OPTARG}
      ;;
    m)
      mem=${OPTARG}
      ;;
    :)
      die "option -${OPTARG} requires an argument"
      ;;
    *)
      die "invalid option -${OPTARG}"
      ;;
  esac
done

ihk_ikc_irq_core=0

# Derive a numeric kernel version code and, if present, the numeric part
# that follows "major.minor.patch-" in the release string.
release=$(uname -r)
major=$(echo ${release} | sed -e 's/^\([0-9]*\).*/\1/')
minor=$(echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
patch=$(echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
linux_version_code=$(expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch})
rhel_release=$(echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
# sed prints the input unchanged when the pattern does not match.
if [ "${release}" == "${rhel_release}" ]; then
  rhel_release=""
fi

# mcoverlay is only enabled for version codes in [262144, 262400) when no
# release suffix was found, or for version code 199168 with suffix >= 327.
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
  if [ "${rhel_release}" == "" ]; then
    if [ ${linux_version_code} -ge 262144 ] \
       && [ ${linux_version_code} -lt 262400 ]; then
      enable_mcoverlay="yes"
    fi
  else
    if [ ${linux_version_code} -eq 199168 ] \
       && [ ${rhel_release} -ge 327 ]; then
      enable_mcoverlay="yes"
    fi
  fi
fi

if [ "$cpus" == "" ]; then
  # Get the number of CPUs on NUMA node 0
  nr_cpus=$(lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l)

  # Use the second half of the cores
  nr_cpus=$((nr_cpus / 2))
  cpus=$(lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' \
    | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g')
  if [ "$cpus" == "" ]; then
    die "error: no available CPUs on NUMA node 0?"
  fi
fi

# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
  if module_loaded mcoverlay; then
    # Unmount the nested mounts first, then the tmpfs that holds them.
    for mnt in /tmp/mcos/mcos0_sys /tmp/mcos/mcos0_proc \
               /tmp/mcos/linux_proc /tmp/mcos; do
      if is_mounted "${mnt}"; then
        umount -l "${mnt}"
      fi
    done
    if [ -e /tmp/mcos ]; then
      rm -rf /tmp/mcos
    fi
    if ! rmmod mcoverlay; then
      die "error: removing mcoverlay"
    fi
  fi
fi

# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
  # Best effort: the McKernel-specific unit may not be running.
  systemctl stop irqbalance_mck.service 2> /dev/null
  if ! systemctl stop irqbalance.service 2> /dev/null; then
    die "error: stopping irqbalance"
  fi
fi

# Load IHK if not loaded
if ! module_loaded ihk; then
  if ! insmod "${KMODDIR}/ihk.ko"; then
    die "error: loading ihk"
  fi
fi

# Load IHK-SMP if not loaded and reserve CPUs and memory
if ! module_loaded ihk_smp_x86; then
  # Pick the first IRQ in 64..255 that has neither a /proc/irq entry nor a
  # row in /proc/interrupts.
  ihk_irq=""
  for ((i = 64; i <= 255; i++)); do
    if [ ! -d "/proc/irq/${i}" ] && ! irq_in_use "${i}"; then
      ihk_irq=$i
      break
    fi
  done
  if [ "$ihk_irq" == "" ]; then
    die "error: no IRQ available"
  fi
  if ! insmod "${KMODDIR}/ihk-smp-x86.ko" ihk_start_irq=$ihk_irq \
       ihk_ikc_irq_core=$ihk_ikc_irq_core; then
    die "error: loading ihk-smp-x86"
  fi
  if ! "${SBINDIR}/ihkconfig" 0 reserve cpu ${cpus}; then
    die "error: reserving CPUs"
  fi
  if ! "${SBINDIR}/ihkconfig" 0 reserve mem ${mem}; then
    die "error: reserving memory"
  fi
# If loaded, but no resources allocated, get CPUs and memory
else
  if ! "${SBINDIR}/ihkconfig" 0 query cpu > /dev/null; then
    die "error: querying cpus"
  fi
  # NOTE(review): the check above uses ihkconfig but the value is read with
  # ihkosctl; kept as in the original -- confirm which tool is intended.
  cpus_allocated=$("${SBINDIR}/ihkosctl" 0 query cpu)
  if [ "$cpus_allocated" == "" ]; then
    if ! "${SBINDIR}/ihkconfig" 0 reserve cpu ${cpus}; then
      die "error: reserving CPUs"
    fi
  fi

  mem_allocated=$("${SBINDIR}/ihkosctl" 0 query mem) \
    || die "error: querying memory"
  if [ "$mem_allocated" == "" ]; then
    if ! "${SBINDIR}/ihkconfig" 0 reserve mem ${mem}; then
      die "error: reserving memory"
    fi
  fi
fi

# Load mcctrl if not loaded
if ! module_loaded mcctrl; then
  if ! insmod "${KMODDIR}/mcctrl.ko"; then
    die "error: inserting mcctrl.ko"
  fi
fi

# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
  # Query CPU cores and memory of OS instance so that the same values are
  # used as previously
  cpus=$("${SBINDIR}/ihkosctl" 0 query cpu) || die "error: querying cpus"
  mem=$("${SBINDIR}/ihkosctl" 0 query mem) || die "error: querying memory"

  if ! "${SBINDIR}/ihkconfig" 0 destroy 0; then
    echo "warning: destroy failed" >&2
  fi
else
  # Otherwise query IHK-SMP for resources
  cpus=$("${SBINDIR}/ihkconfig" 0 query cpu) || die "error: querying cpus"
  mem=$("${SBINDIR}/ihkconfig" 0 query mem) || die "error: querying memory"
fi

# Create the OS instance, hand it the resources and boot the kernel image.
if ! "${SBINDIR}/ihkconfig" 0 create; then
  die "error: create"
fi
if ! "${SBINDIR}/ihkosctl" 0 assign cpu ${cpus}; then
  die "error: assign CPUs"
fi
if ! "${SBINDIR}/ihkosctl" 0 assign mem ${mem}; then
  die "error: assign memory"
fi
if ! "${SBINDIR}/ihkosctl" 0 load "${KERNDIR}/mckernel.img"; then
  die "error: loading kernel image"
fi
if ! "${SBINDIR}/ihkosctl" 0 kargs "hidos ksyslogd=${LOGMODE}"; then
  die "error: setting kernel arguments"
fi
if ! "${SBINDIR}/ihkosctl" 0 boot; then
  die "error: booting"
fi
# chown_option is intentionally left unquoted: an empty owner keeps failing
# here exactly as in the original script.
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
  die "error: chowning device files"
fi

if [ "$enable_mcoverlay" == "yes" ]; then
  if [ ! -e /tmp/mcos ]; then
    mkdir -p /tmp/mcos
  fi
  if ! mount -t tmpfs tmpfs /tmp/mcos; then
    die "error: mount /tmp/mcos"
  fi
  if [ ! -e /tmp/mcos/linux_proc ]; then
    mkdir -p /tmp/mcos/linux_proc
  fi
  if ! mount --bind /proc /tmp/mcos/linux_proc; then
    die "error: mount /tmp/mcos/linux_proc"
  fi
  if ! insmod "${KMODDIR}/mcoverlay.ko"; then
    die "error: inserting mcoverlay.ko"
  fi

  # Wait until /proc/mcos0 shows up, then overlay it on top of /proc.
  while [ ! -e /proc/mcos0 ]; do
    sleep 1
  done
  for dir in /tmp/mcos/mcos0_proc /tmp/mcos/mcos0_proc_upper \
             /tmp/mcos/mcos0_proc_work; do
    if [ ! -e "${dir}" ]; then
      mkdir -p "${dir}"
    fi
  done
  if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
    die "error: mount /tmp/mcos/mcos0_proc"
  fi
  mount --make-rprivate /proc

  # Same for the sysfs tree exported under /sys/devices/virtual/mcos/mcos0.
  while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/devices/system/node ]; do
    sleep 1
  done
  for dir in /tmp/mcos/mcos0_sys /tmp/mcos/mcos0_sys_upper \
             /tmp/mcos/mcos0_sys_work; do
    if [ ! -e "${dir}" ]; then
      mkdir -p "${dir}"
    fi
  done
  if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
    die "error: mount /tmp/mcos/mcos0_sys"
  fi
  mount --make-rprivate /sys

  mcos_sys=/sys/devices/virtual/mcos/mcos0/sys
  overlay_sys=/tmp/mcos/mcos0_sys

  # Remove from the overlay every CPU that exists under /sys but not in the
  # mcos0 sysfs tree.
  for cpuid in $(list_numbered_entries /sys/devices/system/cpu cpu); do
    if [ ! -e "${mcos_sys}/devices/system/cpu/$cpuid" ]; then
      rm -rf ${overlay_sys}/devices/system/cpu/$cpuid
    fi
  done

  for nodeid in $(list_numbered_entries /sys/devices/system/node node); do
    if [ ! -e "${mcos_sys}/devices/system/node/$nodeid" ]; then
      rm -rf ${overlay_sys}/devices/system/node/$nodeid
    else
      # Delete non-existent symlinks
      for cpuid in $(list_numbered_entries "/sys/devices/system/node/$nodeid" cpu); do
        if [ ! -e "${mcos_sys}/devices/system/node/$nodeid/$cpuid" ]; then
          rm -f ${overlay_sys}/devices/system/node/$nodeid/$cpuid
        fi
      done

      rm -f ${overlay_sys}/devices/system/node/$nodeid/memory*
    fi
  done

  for cpuid in $(list_numbered_entries /sys/bus/cpu/devices cpu); do
    if [ ! -e "${mcos_sys}/bus/cpu/devices/$cpuid" ]; then
      rm -rf ${overlay_sys}/bus/cpu/devices/$cpuid
    fi
  done
fi

if [ "${LOGMODE}" -ne 0 ]; then
  # mcklogd survives when McKernel isn't shut down by mcstop+release.sh
  pkill mcklogd
  SBINDIR=${SBINDIR} "${SBINDIR}/mcklogd" -i ${INTERVAL} -f ${facility}
fi

# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
  # Save every /proc/irq/*/smp_affinity under ${ETCDIR}/proc/irq/...
  if ! etcdir="${ETCDIR}" perl -e '
    use File::Copy qw(copy);
    $etcdir = $ENV{etcdir};
    @files = grep { -f } glob "/proc/irq/*/smp_affinity";
    foreach $file (@files) {
      $rel = substr($file, 1);
      $dir = substr($rel, 0, length($rel) - length("/smp_affinity"));
      if (system("mkdir -p $etcdir/$dir")) { exit 1; }
      if (!copy($file, "$etcdir/$rel")) { exit 1; }
    }'; then
    die "error: saving /proc/irq/*/smp_affinity"
  fi

  ncpus=$(lscpu | awk '/^CPU\(s\):/ {print $2}')

  # Convert the CPU list (e.g. "4-7,12") into a comma separated list of
  # 32-bit hex words, most significant word first.  The top word is printed
  # with ceil((ncpus % 32) / 4) digits, or all 8 digits when ncpus is a
  # multiple of 32.
  smp_affinity_mask=$(echo $cpus | ncpus="${ncpus}" perl -e '
    while (<>) {
      @tokens = split /,/;
      foreach $token (@tokens) {
        @nums = split /-/, $token;
        for ($num = $nums[0]; $num <= $nums[$#nums]; $num++) {
          $ndx = int($num / 32);
          $mask[$ndx] |= (1 << ($num % 32));
        }
      }
    }
    $nint32s = int(($ENV{ncpus} + 31) / 32);
    $rem = $ENV{ncpus} % 32;
    for ($j = $nint32s - 1; $j >= 0; $j--) {
      if ($j != $nint32s - 1) { print ","; }
      $nblks = ($j == $nint32s - 1 && $rem != 0) ? int(($rem + 3) / 4) : 8;
      for ($i = $nblks - 1; $i >= 0; $i--) {
        printf("%01x", ($mask[$j] >> ($i * 4)) & 0xf);
      }
    }')

  # For every IRQ whose current affinity intersects the McKernel CPUs, write
  # the inverted mask to its smp_affinity.  The top word of the inverted
  # mask is truncated to the same width as above; it is left at 8 digits
  # when ncpus is a multiple of 32.
  if ! ncpus="${ncpus}" smp_affinity_mask="${smp_affinity_mask}" perl -e '
    @dirs = grep { -d } glob "/proc/irq/*";
    foreach $dir (@dirs) {
      $hit = 0;
      $affinity_str = `cat $dir/smp_affinity`;
      chomp $affinity_str;
      @int32strs = split /,/, $affinity_str;
      @int32strs_mask = split /,/, $ENV{smp_affinity_mask};
      for ($i = 0; $i <= $#int32strs_mask; $i++) {
        $int32strs_inv[$i] = sprintf("%08x", hex($int32strs_mask[$i]) ^ 0xffffffff);
        if ($i == 0) {
          $len = int((($ENV{ncpus} % 32) + 3) / 4);
          if ($len != 0) {
            $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len);
          }
        }
      }
      $inv = join(",", @int32strs_inv);
      $nint32s = int(($ENV{ncpus} + 31) / 32);
      for ($j = $nint32s - 1; $j >= 0; $j--) {
        if (hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) {
          $hit = 1;
        }
      }
      if ($hit == 1) {
        $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null";
        system $cmd;
      }
    }'; then
    die "error: modifying /proc/irq/*/smp_affinity"
  fi

  # IRQ number of the /proc/interrupts row that ends in "IHK-SMP".
  banirq=$(perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}' < /proc/interrupts)

  # Instantiate the irqbalance configuration from its template.
  sed -e "s/%mask%/$smp_affinity_mask/g" -e "s/%banirq%/$banirq/g" \
    "$ETCDIR/irqbalance_mck.in" > "$ETCDIR/irqbalance_mck"
  if ! systemctl link "$ETCDIR/irqbalance_mck.service" > /dev/null 2> /dev/null; then
    die "error: linking irqbalance_mck"
  fi
  if ! systemctl start irqbalance_mck.service 2> /dev/null; then
    die "error: starting irqbalance_mck"
  fi
  # echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi