Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d06ab3591 | |||
| 9977ca16bb | |||
| 1631044f7b |
Executable
+439
@@ -0,0 +1,439 @@
|
||||
#!/bin/bash
|
||||
# Builds kernel with the new CFS Bandwidth patches
|
||||
# and nsfd/setns syscall patches.
|
||||
# Also builds Open vSwitch against the built kernel version.
|
||||
# Script only to be run on 64-bit systems; needs a few changes to run on
|
||||
# 32-bit ones.
|
||||
|
||||
# If building for i386 (-t), make sure to install the following 32-bit libs:
|
||||
# sudo apt-get install ia32-libs lib32gcc1 libc6-i386 util-linux devscripts
|
||||
|
||||
# Check for uninitialized variables
|
||||
set -o nounset
|
||||
|
||||
# Exit on any failure
|
||||
set -e
|
||||
|
||||
# Location in which to download and build the kernel
|
||||
kdir=/usr/src
|
||||
|
||||
# Kernel version to download
|
||||
kver=3.0.0
|
||||
|
||||
# Save original directory for later.
|
||||
orig_dir=`pwd`
|
||||
|
||||
# Default and custom kernel version string
|
||||
version_string=-with-cfs
|
||||
|
||||
# Run menuconfig later?
|
||||
menuconfig=
|
||||
|
||||
# Use localmodconfig?
|
||||
localmodconfig=
|
||||
|
||||
# Build ubuntu kernel? must be 3.0.0 compatible
|
||||
ubuntu_release=
|
||||
ubuntu_default_release=ubuntu-oneiric
|
||||
ubuntu_base=3.0.0-14 # base tag and version for build
|
||||
ubuntu_tag=Ubuntu-$ubuntu_base.23
|
||||
ubuntu_flavor=generic
|
||||
#ubuntu_config=/boot/config-$ubuntu_base-$ubuntu_flavor
|
||||
ubuntu_config=${orig_dir}/config-3.0.9-with-cfs
|
||||
ubuntu_image=linux-image-$ubuntu_base-$ubuntu_flavor
|
||||
ubuntu_kver=3.0.9 # must match version that is actually built
|
||||
|
||||
# OVS pkg string. Not sure how to find this automatically.
|
||||
ovs_pkg_ver=1.2.0-1ubuntu3
|
||||
|
||||
# Location of kernel config. If not specified, use current .config.
|
||||
# was: ${orig_dir}/config-3.0.0-with-cfs
|
||||
kconfig=
|
||||
|
||||
# Install only?
|
||||
install_only=
|
||||
|
||||
# Use 32-bit?
|
||||
i386=
|
||||
|
||||
function usage {
|
||||
warn "Compiles kernel ${kver} with CBW, setns, and DCTCP patches in ${kdir}"
|
||||
warn "Usage: build.sh [-huimlt] [-v 'versionstring']"
|
||||
warn "-h help"
|
||||
warn "-u build ubuntu kernel"
|
||||
warn "-i install only (don't build)"
|
||||
warn "-m use menuconfig"
|
||||
warn "-l use localmodconfig"
|
||||
warn "-v 'versionstring' use custom version string"
|
||||
warn "-t build for i386 (32-bit)"
|
||||
}
|
||||
|
||||
function parse_opts {
|
||||
custom_version_string=
|
||||
plus=
|
||||
while getopts 'huimltv:' OPTION; do
|
||||
case $OPTION in
|
||||
h) usage; exit 0;;
|
||||
u) ubuntu_release=$ubuntu_default_release;
|
||||
kver=$ubuntu_kver; kconfig=$ubuntu_config;;
|
||||
i) install_only=true;;
|
||||
m) menuconfig=true;;
|
||||
l) localmodconfig=true;;
|
||||
v) custom_version_string=$OPTARG;;
|
||||
t) i386=true; plus=;;
|
||||
?) usage; exit 1;;
|
||||
esac
|
||||
done
|
||||
# Provide feedback which might be useful
|
||||
if [[ "$custom_version_string" != "" ]]; then
|
||||
warn "Using custom version_string: ${custom_version_string}"
|
||||
version_string=$custom_version_string
|
||||
else
|
||||
warn "Using default version_string: ${version_string}"
|
||||
fi
|
||||
if [[ "$ubuntu_release" != "" ]]; then
|
||||
warn "Building Ubuntu kernel for release ${ubuntu_release}"
|
||||
fi
|
||||
}
|
||||
|
||||
function warn {
|
||||
# Echo the provided command in color text.
|
||||
yellow='\e[0;33m' # Yellow
|
||||
reset='\e[0m'
|
||||
echo="echo -e"
|
||||
if [ -n "${2+defined}" ]; then
|
||||
echo="$echo $2"
|
||||
fi
|
||||
$echo "${yellow}$1${reset}"
|
||||
}
|
||||
|
||||
function pre_check {
    # Verify that the tools needed to fetch and build the kernel are
    # available, offering to install any that are missing with apt-get.
    # Pressing Enter at a prompt accepts the default answer (yes).
    warn "Checking for git"
    if [[ -z `which git` ]]; then
        warn "You need git to download kernel. Install? [Y/n] " -n
        read answer
        # The == operator needs spaces on both sides. Written as
        # $answer=="Y", [[ ]] sees a single non-empty word, which is always
        # true, so the package was installed even after answering "n".
        # An explicit if (rather than "[[ ... ]] && cmd") also keeps a "no"
        # answer from becoming the function's non-zero return status, which
        # would abort the script under set -e.
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install git
        fi
    fi

    warn "Checking for kernel-package build utilities"
    if [[ -z `which make-kpkg` ]]; then
        warn "You need kernel-package utilities to build the kernel. Install? [Y/n] " -n
        read answer
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install kernel-package ncurses-dev
        fi
    fi
}
|
||||
|
||||
function fetch_kernel {
|
||||
if [[ "$ubuntu_release" == "" ]]; then
|
||||
srcdir=$kdir/linux-$kver
|
||||
archive=git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-tip.git
|
||||
tag=3.0.0
|
||||
else
|
||||
warn "Pre-installing $ubuntu_image"
|
||||
#sudo apt-get install $ubuntu_image
|
||||
srcdir=$kdir/$ubuntu_release
|
||||
archive=git://kernel.ubuntu.com/ubuntu/$ubuntu_release
|
||||
tag=$ubuntu_tag
|
||||
fi
|
||||
if [[ -d $srcdir ]]; then
|
||||
warn "Linux source exists in $srcdir, skipping.."
|
||||
return
|
||||
fi
|
||||
warn "--> Fetching kernel $srcdir"
|
||||
if git clone $archive $srcdir; then
|
||||
return
|
||||
fi
|
||||
warn "Failed to fetch kernel from $archive"
|
||||
if [[ "$ubuntu_release" == "" ]]; then
|
||||
warn "Trying github"
|
||||
archive=git://github.com/torvalds/linux.git
|
||||
if git clone $archive $srcdir; then
|
||||
return
|
||||
fi
|
||||
fi
|
||||
warn "Giving up."
|
||||
exit 2
|
||||
}
|
||||
|
||||
function work_around_kernel_package_bug {
|
||||
warn "Applying workaround for kernel package bug..."
|
||||
# Fix will likely break on any other kernel version, so watch out.
|
||||
# From:
|
||||
# https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
|
||||
sed -i -s 's/echo "+"/#echo "+"/' $srcdir/scripts/setlocalversion
|
||||
}
|
||||
|
||||
function copy_patches {
|
||||
warn "Copying patches..."
|
||||
rm -rf $srcdir/patches
|
||||
cp -r ${orig_dir}/../../linux-3.0.0-patches/ $srcdir/patches
|
||||
}
|
||||
|
||||
function apply_patches {
|
||||
cd $srcdir
|
||||
if git checkout mininet ; then
|
||||
# Assume mininet
|
||||
warn "Mininet branch already exists - not applying patches"
|
||||
return
|
||||
fi
|
||||
if [[ "$tag" != "" ]] ; then
|
||||
git checkout $tag
|
||||
fi
|
||||
git checkout -b mininet
|
||||
warn "Applying patches..."
|
||||
git am -3 patches/*.patch
|
||||
work_around_kernel_package_bug
|
||||
}
|
||||
|
||||
# lxc/ns and cfs configuration flags
|
||||
|
||||
config_y='
|
||||
CONFIG_GROUP_SCHED
|
||||
CONFIG_FAIR_GROUP_SCHED
|
||||
CONFIG_RT_GROUP_SCHED
|
||||
CONFIG_CGROUP_SCHED
|
||||
CONFIG_CGROUPS
|
||||
CONFIG_CGROUP_FREEZER
|
||||
CONFIG_CGROUP_DEVICE
|
||||
CONFIG_SCHED_AUTOGROUP
|
||||
CONFIG_BLK_CGROUP
|
||||
CONFIG_CFQ_GROUP_IOSCHED
|
||||
CONFIG_CGROUP_PERF
|
||||
CONFIG_CPUSETS
|
||||
CONFIG_PROC_PID_CPUSET
|
||||
CONFIG_CGROUP_CPUACCT
|
||||
CONFIG_RESOURCE_COUNTERS
|
||||
CONFIG_CGROUP_MEM_RES_CTLR
|
||||
CONFIG_CGROUP_MEM_RES_CTLR_SWAP
|
||||
CONFIG_MM_OWNER
|
||||
CONFIG_NAMESPACES
|
||||
CONFIG_UTS_NS
|
||||
CONFIG_IPC_NS
|
||||
CONFIG_USER_NS
|
||||
CONFIG_PID_NS
|
||||
CONFIG_NET_NS
|
||||
CONFIG_NET_CLS_CGROUP
|
||||
CONFIG_SECURITY_FILE_CAPABILITIES
|
||||
CONFIG_DEVPTS_MULTIPLE_INSTANCES
|
||||
CONFIG_VETH
|
||||
CONFIG_VLAN_8021Q
|
||||
CONFIG_MACVLAN
|
||||
CONFIG_CFS_BANDWIDTH
|
||||
CONFIG_NET_SCHED'
|
||||
|
||||
config_m='
|
||||
CONFIG_BRIDGE
|
||||
CONFIG_NET_SCH_CBQ
|
||||
CONFIG_NET_SCH_HTB
|
||||
CONFIG_NET_SCH_HFSC
|
||||
CONFIG_NET_SCH_PRIO
|
||||
CONFIG_NET_SCH_MULTIQ
|
||||
CONFIG_NET_SCH_RED
|
||||
CONFIG_NET_SCH_SFB
|
||||
CONFIG_NET_SCH_SFQ
|
||||
CONFIG_NET_SCH_TEQL
|
||||
CONFIG_NET_SCH_TBF
|
||||
CONFIG_NET_SCH_GRED
|
||||
CONFIG_NET_SCH_DSMARK
|
||||
CONFIG_NET_SCH_NETEM
|
||||
CONFIG_NET_SCH_DRR
|
||||
CONFIG_NET_SCH_MQPRIO
|
||||
CONFIG_NET_SCH_CHOKE
|
||||
CONFIG_NET_SCH_QFQ
|
||||
CONFIG_NET_SCH_INGRESS
|
||||
'
|
||||
|
||||
config_n='
|
||||
CONFIG_SECURITY_APPARMOR
|
||||
'
|
||||
|
||||
function configure_kernel {
    # Produce the kernel .config in $srcdir.
    #
    # With menuconfig enabled the user configures interactively and nothing
    # else is touched. Otherwise the starting point is $kconfig (or the
    # existing .config when $kconfig is empty); oldconfig and optionally
    # localmodconfig are run with default answers, and then every option in
    # config_y / config_m / config_n is forced to y / m / "not set".
    cd $srcdir
    warn "Configuring kernel..."

    if [[ "$menuconfig" == 'true' ]]; then
        make menuconfig
    else
        if [[ "$kconfig" == "" ]]; then
            warn "Using current kernel config..."
        else
            warn "Using specified kernel config: ${kconfig}..."
            cp $kconfig .config
        fi

        warn "Making oldconfig..."
        # For an i386 build, run make under the linux32 wrapper.
        if [[ "$i386" == 'true' ]]; then
            linux32=linux32
        else
            linux32=
        fi
        yes '' | $linux32 make oldconfig 1> /dev/null
        if [[ "$localmodconfig" == 'true' ]]; then
            warn "Making localmodconfig..."
            yes '' | $linux32 make localmodconfig 1> /dev/null
        fi

        warn "Setting kernel flags for lxc and cbw..."
        # All patterns below are anchored to the whole option name. A bare
        # "grep $flag" also matches longer names that share the prefix
        # (CONFIG_BRIDGE vs CONFIG_BRIDGE_NETFILTER), which made a missing
        # option look present so that it was never added.
        for flag in $config_y; do
            if ! grep -q -E "^(# )?$flag(=| is not set)" .config; then
                echo $flag=y >> .config
            else
                sed -i -s "s/^# $flag is not set/$flag=y/" .config
            fi
        done
        for flag in $config_m; do
            if ! grep -q -E "^(# )?$flag(=| is not set)" .config; then
                echo $flag=m >> .config
            else
                sed -i -s "s/^# $flag is not set/$flag=m/" .config
                sed -i -s "s/^$flag=y\$/$flag=m/" .config
            fi
        done
        for flag in $config_n; do
            if ! grep -q -E "^(# )?$flag(=| is not set)" .config; then
                echo "# $flag is not set" >> .config
            else
                sed -i -s "s/^$flag=y\$/# $flag is not set/" .config
                sed -i -s "s/^$flag=m\$/# $flag is not set/" .config
            fi
        done
        # Echo the resulting line for each requested option so the user can
        # review it, and complain about any option that is absent entirely.
        for flag in $config_y $config_m; do
            grep -E "^(# )?$flag(=| is not set)" .config || echo "WARNING: $flag IS MISSING"
        done
        # Keep a copy of the generated configuration for inspection.
        cp .config /tmp
        warn "RAN CONFIG IN `pwd`"
    fi
}
|
||||
|
||||
function build_kernel {
|
||||
# Have your favourite build method here
|
||||
# This is a standard Debian way of building the kernel
|
||||
# The patches select cfs bandwidth automatically
|
||||
warn "Building kernel-$version_string"
|
||||
cd $srcdir
|
||||
procs=`grep -c ^processor /proc/cpuinfo`
|
||||
procs=`echo $procs + 2 | bc`
|
||||
export CONCURRENCY_LEVEL=$procs
|
||||
if [[ "$i386" == 'true' ]]; then
|
||||
mkpkg_extra_args='--cross-compile - --arch i386'
|
||||
else
|
||||
mkpkg_extra_args=
|
||||
fi
|
||||
make-kpkg clean $mkpkg_extra_args
|
||||
yes '' | fakeroot make-kpkg -j $procs $mkpkg_extra_args --initrd --append-to-version=${version_string} \
|
||||
kernel_image kernel_headers
|
||||
}
|
||||
|
||||
function mod_kernel_dpkg {
|
||||
# Only needed for i386.
|
||||
cd /usr/src
|
||||
if [[ "$i386" == 'true' ]]; then
|
||||
warn "Modifying deb-pkg names for i386"
|
||||
# Based on instructions from http://dotcommie.net/?id=165
|
||||
for pkg_type in linux-image linux-headers; do
|
||||
pkg_name_orig=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_amd64.deb
|
||||
#hook=`readlink -f set_debian_control_i386.sh`
|
||||
hook=${orig_dir}/set_debian_control_i386.sh
|
||||
warn "$pkg_name_orig"
|
||||
fakeroot deb-reversion -s "" --hook $hook $pkg_name_orig
|
||||
# Remove the 1 in the name that deb-reversion adds.
|
||||
pkg_name_mod=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom1_i386.deb
|
||||
pkg_name_new=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_i386.deb
|
||||
mv $pkg_name_mod $pkg_name_new
|
||||
warn "Removing original package: $pkg_name_orig"
|
||||
rm -f $pkg_name_orig
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
function install_headers {
|
||||
warn "Installing headers..."
|
||||
sudo dpkg -i /usr/src/linux-headers-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_*.deb
|
||||
}
|
||||
|
||||
function install_kernel {
|
||||
warn "Installing kernel..."
|
||||
sudo dpkg -i /usr/src/linux-image-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_*.deb
|
||||
}
|
||||
|
||||
function build_initrd {
|
||||
# Certain versions of Ubuntu install a make-kpkg
|
||||
# that does not build an initrd along with the rest of the kernel.
|
||||
warn "Building initrd..."
|
||||
#sudo mkdir -p /lib/modules/$kver${version_string}
|
||||
sudo mkinitramfs -v -k -o /boot/initrd.img-$kver${version_string}${plus} $kver${version_string}${plus}
|
||||
}
|
||||
|
||||
function build_ovs_datapath {
    # Install the Open vSwitch datapath source package and build the
    # datapath module with module-assistant for the kernel version
    # $kver${version_string}${plus}.
    sudo apt-get install openvswitch-datapath-source
    if [[ "$i386" == 'true' ]]; then
        # The environment assignment must appear literally as a command
        # prefix. When "DEB_HOST_ARCH=i386" came out of a variable expansion
        # the shell treated it as the command name and failed with
        # "command not found".
        DEB_HOST_ARCH=i386 sudo module-assistant auto-build openvswitch-datapath -l $kver${version_string}${plus}
    else
        sudo module-assistant auto-build openvswitch-datapath -l $kver${version_string}${plus}
    fi
}
|
||||
|
||||
function mod_ovs_dpkg {
|
||||
# Only needed for i386.
|
||||
cd /usr/src
|
||||
if [[ "$i386" == 'true' ]]; then
|
||||
warn "Modifying deb-pkg names for i386"
|
||||
# Based on instructions from http://dotcommie.net/?id=165
|
||||
for pkg_type in openvswitch-datapath-module; do
|
||||
pkg_name_orig=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}_amd64.deb
|
||||
#hook=`readlink -f set_debian_control_i386.sh`
|
||||
hook=${orig_dir}/set_debian_control_i386.sh
|
||||
warn "$pkg_name_orig"
|
||||
fakeroot deb-reversion -s "" --hook $hook $pkg_name_orig
|
||||
# Remove the 1 in the name that deb-reversion adds.
|
||||
pkg_name_mod=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}1_i386.deb
|
||||
pkg_name_new=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}_i386.deb
|
||||
mv $pkg_name_mod $pkg_name_new
|
||||
warn "Removing original package: $pkg_name_orig"
|
||||
rm -f $pkg_name_orig
|
||||
done
|
||||
fi
|
||||
}
|
||||
|
||||
function install_ovs_datapath {
|
||||
warn "Installing ovs datapath"
|
||||
sudo module-assistant install openvswitch-datapath -l $kver${version_string}${plus}
|
||||
}
|
||||
|
||||
# Main sequence: parse options, then either build everything or (with -i)
# only install previously built packages.
#
# "$@" keeps every command-line argument as its own word. The unquoted $*
# re-split arguments on whitespace, so -v 'my version' reached getopts as
# two separate words.
parse_opts "$@"

if [[ "$install_only" != 'true' ]] ; then
    pre_check

    sudo chmod 777 $kdir
    cd $kdir

    fetch_kernel
    copy_patches
    apply_patches
    configure_kernel
    build_kernel
    mod_kernel_dpkg
    warn "******************************************"
    warn "Check for kernel .deb installation file in /usr/src/ along with initrd."
else
    install_headers
fi

if [[ "$i386" != 'true' ]]; then
    # Presumably we'll only want to install on a 64-bit machine.
    install_kernel
fi

build_ovs_datapath
mod_ovs_dpkg

if [[ "$i386" != 'true' ]]; then
    install_ovs_datapath
    build_initrd
fi

cd $orig_dir
warn "Done (hopefully)"
|
||||
+121
@@ -0,0 +1,121 @@
|
||||
#!/bin/bash
|
||||
# Builds lxc for kernel patched with setns
|
||||
|
||||
# Check for uninitialized variables
|
||||
set -o nounset
|
||||
|
||||
# Exit on any failure
|
||||
set -e
|
||||
|
||||
# Kernel version to use
|
||||
kver=3.0
|
||||
|
||||
# Location in which to download and build lxc
|
||||
lxcdir=$HOME
|
||||
kdir=/lib/modules/`uname -r`/build
|
||||
|
||||
# lxc version to use
|
||||
lxcver=lxc-0.7.5
|
||||
|
||||
# Save original directory for later.
|
||||
orig_dir=`pwd`
|
||||
|
||||
function warn {
|
||||
# Echo the provided command in color text.
|
||||
yellow='\e[0;33m' # Yellow
|
||||
reset='\e[0m'
|
||||
echo="echo -e"
|
||||
$echo "${yellow}$1${reset}"
|
||||
}
|
||||
|
||||
function usage {
|
||||
warn "Usage: $0 [lxc download location] [kernel location]"
|
||||
}
|
||||
|
||||
|
||||
# Interpret the optional positional arguments:
#   $1 - directory in which lxc is downloaded and built (default: $lxcdir)
#   $2 - kernel build directory                         (default: $kdir)
# The count is compared with -gt/-eq because > and == inside [[ ]] compare
# strings, not numbers.
if [[ "$#" -gt 2 ]]; then
    warn "Invalid number of args passed."
    usage
    # Report the usage error to the caller instead of exiting with status 0.
    exit 1
elif [[ "$#" -eq 0 ]]; then
    warn "No args passed."
    warn "Using default lxc location: ${lxcdir}."
    warn "Using default kernel location: ${kdir}."
elif [[ "$#" -eq 1 ]]; then
    lxcdir=$1
    warn "Using custom lxc location: ${lxcdir}"
    warn "Using default kernel location: ${kdir}"
elif [[ "$#" -eq 2 ]]; then
    lxcdir=$1
    kdir=$2
    warn "Using custom lxc location: ${lxcdir}"
    warn "Using custom kernel location: ${kdir}"
fi
|
||||
|
||||
function pre_check {
    # Verify that git is available (offering to install it) and that the
    # kernel build directory $kdir exists; exit with an error if it does not.
    warn "Checking for git"
    if [[ -z `which git` ]]; then
        # read -p takes the prompt text itself. The previous form,
        # read -p warn "..." answer, used "warn" as the prompt and the
        # message as a variable name, which read rejects.
        read -p "You need git to download lxc. Install? [Y/n] " answer
        # == needs surrounding spaces; $answer=="Y" is one non-empty word
        # and therefore always true. Enter alone accepts the default (yes).
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install git
        fi
    fi

    warn "Checking for linux source code"
    if [[ ! -d ${kdir} ]]; then
        warn "Error: Kernel doesn't exist in ${kdir}... exiting"
        # Non-zero status so callers can tell the build did not happen.
        exit 1
    fi
}
|
||||
|
||||
function fetch_lxc {
|
||||
cd $lxcdir
|
||||
warn "--> Fetching lxc"
|
||||
if [[ -d lxc ]]; then
|
||||
warn "lxc source exists, skipping.."
|
||||
return
|
||||
fi
|
||||
git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
|
||||
cd lxc
|
||||
git checkout $lxcver
|
||||
cd ..
|
||||
}
|
||||
|
||||
function copy_patches {
|
||||
rm -rf lxc/patches
|
||||
cp -r ${orig_dir}/../../lxc-$kver-patches lxc/patches
|
||||
}
|
||||
|
||||
function apply_patches {
|
||||
cd lxc
|
||||
warn "Applying patches..."
|
||||
git am -3 patches/*.patch
|
||||
}
|
||||
|
||||
function build_lxc {
|
||||
warn "Building lxc with kernel-${kver}..."
|
||||
processors=`grep -c ^processor /proc/cpuinfo`
|
||||
export CONCURRENCY_LEVEL=$processors
|
||||
make distclean || true
|
||||
./autogen.sh
|
||||
./configure --with-linuxdir=${kdir}
|
||||
make
|
||||
}
|
||||
|
||||
function install_lxc {
|
||||
warn "Installing lxc..."
|
||||
sudo make install
|
||||
# Seems to be missing
|
||||
sudo mkdir -p /usr/local/var/lib/lxc
|
||||
}
|
||||
|
||||
usage
|
||||
pre_check
|
||||
fetch_lxc
|
||||
copy_patches
|
||||
apply_patches
|
||||
build_lxc
|
||||
install_lxc
|
||||
cd $orig_dir
|
||||
warn "Done (hopefully)"
|
||||
Executable
+170
@@ -0,0 +1,170 @@
|
||||
#!/bin/bash
|
||||
# Builds kernel with the new CFS Bandwidth patches
|
||||
# and nsfd/setns syscall patches.
|
||||
|
||||
# Check for uninitialized variables
|
||||
set -o nounset
|
||||
|
||||
# Exit on any failure
|
||||
set -e
|
||||
|
||||
# Location in which to download and build the kernel
|
||||
kdir=/usr/src
|
||||
|
||||
# Kernel version to download
|
||||
kver=2.6.35
|
||||
|
||||
# Save original directory for later.
|
||||
orig_dir=`pwd`
|
||||
|
||||
# Kernel version string
|
||||
version_string=-with-cfs
|
||||
|
||||
# Run menuconfig later?
|
||||
menuconfig=
|
||||
|
||||
function warn {
|
||||
# Echo the provided command in color text.
|
||||
yellow='\e[0;33m' # Yellow
|
||||
reset='\e[0m'
|
||||
echo="echo -e"
|
||||
$echo "${yellow}$1${reset}"
|
||||
}
|
||||
|
||||
function usage {
|
||||
warn "Usage: build.sh [version string] [menuconfig]"
|
||||
}
|
||||
|
||||
|
||||
# Interpret the optional positional arguments:
#   $1 - custom kernel version string (default: $version_string)
#   $2 - the literal word "menuconfig" to run make menuconfig
# The count is compared with -gt/-eq because > and == inside [[ ]] compare
# strings, not numbers.
if [[ "$#" -gt 2 ]]; then
    warn "Invalid number of args passed."
    usage
    exit 1
elif [[ "$#" -eq 0 ]]; then
    warn "No args passed. Using default version_string: ${version_string}"
elif [[ "$#" -eq 1 ]]; then
    # Assign first so the message reports the custom string, not the default.
    version_string=$1
    warn "Using custom version_string: ${version_string}"
elif [[ "$2" != 'menuconfig' ]]; then
    warn "Second arg must be menuconfig or omitted."
    usage
    # Stop here: continuing would silently ignore both arguments and build
    # with the default version string.
    exit 1
else
    version_string=$1
    menuconfig=true
fi
|
||||
|
||||
function pre_check {
    # Verify that make-kpkg and quilt are available, offering to install
    # whichever is missing with apt-get. Enter alone accepts the default
    # answer (yes).
    warn "Checking for kernel-package build utilities"
    if [[ -z `which make-kpkg` ]]; then
        # read -p takes the prompt text itself. The previous form,
        # read -p warn "..." answer, used "warn" as the prompt and the
        # message as a variable name, which read rejects.
        read -p "You need kernel-package utilities to build the kernel. Install? [Y/n] " answer
        # == needs surrounding spaces; $answer=="Y" is one non-empty word
        # and therefore always true.
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install kernel-package ncurses-dev
        fi
    fi

    warn "Checking for quilt"
    if [[ -z `which quilt` ]]; then
        read -p "You need quilt to install patches. Install? [Y/n] " answer
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install quilt
        fi
    fi
}
|
||||
|
||||
function fetch_kernel {
|
||||
warn "--> Fetching kernel linux-$kver"
|
||||
if [[ -f linux-$kver.tar.bz2 ]]; then
|
||||
warn "File exists, skipping.."
|
||||
return
|
||||
fi
|
||||
wget http://kernel.org/pub/linux/kernel/v2.6/linux-$kver.tar.bz2
|
||||
warn "Unpacking kernel"
|
||||
tar xjf linux-$kver.tar.bz2
|
||||
}
|
||||
|
||||
function work_around_kernel_package_bug {
|
||||
# Fix will likely break on any other kernel version, so watch out.
|
||||
# From:
|
||||
# https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
|
||||
sed -i -s 's/echo "+"/#echo "+"/' linux-${kver}/scripts/setlocalversion
|
||||
}
|
||||
|
||||
function copy_patches {
|
||||
rm -rf linux-$kver/patches
|
||||
cp -r ${orig_dir}/../../linux-2.6.35-patches linux-$kver/patches
|
||||
}
|
||||
|
||||
function apply_patches {
    # Enter the kernel tree and apply the quilt patch series, unless quilt
    # reports that patches are already applied.
    cd linux-$kver
    # Apply patch series only if not applied previously.
    # A better check would look at patches/series and make sure each entry
    # in `quilt applied` was covered.
    warn "Checking for applied patches"
    # quilt exits non-zero when nothing is applied, so ignore its status and
    # inspect what it wrote to stderr instead.
    quilt applied > quilt_applied_stdout 2> quilt_applied_stderr || true
    if [[ `grep -c "No patches applied" quilt_applied_stderr` == 1 ]]; then
        warn "Applying patches"
        quilt push -a
    else
        warn "Skipped patches"
    fi
    # Remove the two scratch files created above. The previous
    # "rm quilt_applied" named a file that never exists, so rm failed and
    # set -e aborted the script right after patching.
    rm -f quilt_applied_stdout quilt_applied_stderr
}
|
||||
|
||||
function build_kernel {
|
||||
# Have your favourite build method here
|
||||
# This is a standard Debian way of building the kernel
|
||||
# The patches select cfs bandwidth automatically
|
||||
warn "Building kernel..."
|
||||
|
||||
if [[ "$menuconfig" == 'true' ]]; then
|
||||
make menuconfig
|
||||
else
|
||||
warn "Making oldconfig..."
|
||||
yes "" | make oldconfig
|
||||
warn "Making localmodconfig..."
|
||||
make localmodconfig
|
||||
warn "Enabling netns and cpubw..."
|
||||
sed -i -s 's/# CONFIG_VETH is not set/CONFIG_VETH=y/' .config
|
||||
sed -i -s 's/CONFIG_BRIDGE=y/CONFIG_BRIDGE=m/' .config
|
||||
sed -i -s 's/# CONFIG_BRIDGE is not set/CONFIG_BRIDGE=m/' .config
|
||||
sed -i -s 's/# CONFIG_CFS_BANDWIDTH is not set/CONFIG_CFS_BANDWIDTH=y/' .config
|
||||
sed -i -s 's/# CONFIG_NET_NS is not set/CONFIG_NET_NS=y/' .config
|
||||
fi
|
||||
|
||||
warn "Building kernel-$version_string"
|
||||
processors=`grep -c ^processor /proc/cpuinfo`
|
||||
export CONCURRENCY_LEVEL=$processors
|
||||
yes "" | fakeroot make-kpkg --initrd --append-to-version=${version_string} kernel_image
|
||||
|
||||
warn "******************************************"
|
||||
warn "Check for kernel .deb installation file in ../ along with initrd."
|
||||
}
|
||||
|
||||
function install_kernel {
|
||||
warn "Installing kernel..."
|
||||
sudo dpkg -i /usr/src/linux-image-$kver${version_string}_$kver${version_string}-10.00.Custom_amd64.deb
|
||||
}
|
||||
|
||||
function build_initrd {
|
||||
# Certain versions of Ubuntu install a make-kpkg
|
||||
# that does not build an initrd along with the rest of the kernel.
|
||||
warn "Building initrd..."
|
||||
#sudo mkdir -p /lib/modules/$kver${version_string}
|
||||
sudo mkinitramfs -v -k -o /boot/initrd.img-$kver${version_string} $kver${version_string}
|
||||
}
|
||||
|
||||
pre_check
|
||||
|
||||
sudo chmod 777 $kdir
|
||||
cd $kdir
|
||||
|
||||
fetch_kernel
|
||||
work_around_kernel_package_bug
|
||||
copy_patches
|
||||
apply_patches
|
||||
build_kernel
|
||||
install_kernel
|
||||
build_initrd
|
||||
warn "Done (hopefully)"
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+85
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Check for uninitialized variables
|
||||
set -o nounset
|
||||
|
||||
# Exit on any failure
|
||||
set -e
|
||||
|
||||
debdir=/usr/src
|
||||
kver=3.0.9-with-cfs
|
||||
kbuild=/lib/modules/$kver/build
|
||||
|
||||
if arch | grep 64 > /dev/null ; then arch=amd64; else arch=i386; fi
|
||||
|
||||
headers=linux-headers-${kver}_${kver}-10.00.Custom_${arch}.deb
|
||||
image=linux-image-${kver}_${kver}-10.00.Custom_${arch}.deb
|
||||
ovs=openvswitch-datapath-module-${kver}_1.2.0-1ubuntu3_${arch}.deb
|
||||
|
||||
echo "Mininet-hifi installer"
|
||||
|
||||
echo "1. Checking for prereqs"
|
||||
if [[ ! -e $debdir/$headers || ! -e $debdir/$image ||
|
||||
! -e $debdir/$ovs ]]; then
|
||||
echo "Can't find kernel packages"
|
||||
echo "$debdir/$headers or $debdir/$image or $debdir/$ovs is missing"
|
||||
exit 1
|
||||
fi
|
||||
# ssh-add -l exits non-zero both when the agent holds no keys and when no
# agent can be reached. Test the exit status: with an empty agent the command
# still prints "The agent has no identities." on stdout, so comparing its
# output against the empty string never caught that case.
if ! ssh-add -l > /dev/null 2>&1; then
    echo "No SSH keys - nsdi repo checkout will fail."
    exit 1
fi
|
||||
|
||||
echo "2. Getting mainline Mininet from github"
|
||||
cd ~
|
||||
git clone git://github.com/mininet/mininet.git
|
||||
|
||||
echo "3. Installing OpenFlow reference implementation"
|
||||
mininet/util/install.sh -f
|
||||
|
||||
echo "4. Installing Mininet core files"
|
||||
mininet/util/install.sh -n
|
||||
|
||||
echo "5. Adding nsdi repository"
|
||||
cd ~/mininet
|
||||
git remote add nsdi git@gitosis.stanford.edu:mininet-nsdi.git
|
||||
git fetch nsdi
|
||||
git checkout -b mininet-rt remotes/nsdi/mininet-rt
|
||||
sudo make install
|
||||
|
||||
echo "6. Installing kernel packages"
|
||||
sudo dpkg -i $debdir/$headers
|
||||
sudo dpkg -i $debdir/$image
|
||||
sudo dpkg -i $debdir/$ovs
|
||||
|
||||
echo "7. Fetching, building and installing Open vSwitch user code"
|
||||
cd ~
|
||||
git clone git://openvswitch.org/openvswitch
|
||||
cd ~/openvswitch
|
||||
git checkout v1.2.2
|
||||
./boot.sh
|
||||
./configure
|
||||
make all
|
||||
sudo make install
|
||||
sudo cp tests/test-openflowd /usr/local/bin/ovs-openflowd
|
||||
|
||||
echo "8. Building and installing custom lxc package"
|
||||
sudo apt-get -y install libcap-dev
|
||||
cd ~/mininet/util/kbuild/cfs-nsfd-kernel
|
||||
./build-lxc-for-3.0.sh $HOME $kbuild
|
||||
|
||||
echo "9. Setting up /cgroup"
sudo apt-get remove cgroup-lite
# -p: do not fail (and abort under set -e) when /cgroup already exists,
# e.g. when the installer is run a second time.
sudo mkdir -p /cgroup
# Add the mount entry only once so repeated runs do not duplicate it.
if ! grep -q '^cgroup /cgroup cgroup ' /etc/fstab; then
    sudo sh -c "echo 'cgroup /cgroup cgroup defaults 0 0' >> /etc/fstab"
fi
|
||||
|
||||
echo "10. Creating /etc/mn/host.conf"
|
||||
sudo mkdir -p /etc/mn
|
||||
sudo sh -c "echo 'lxc.utsname = mnhost' > /etc/mn/host.conf"
|
||||
sudo sh -c "echo 'lxc.network.type = empty' >> /etc/mn/host.conf"
|
||||
|
||||
echo "11. Getting rid of quiet boot"
|
||||
sudo sed -i 's/quiet/text/' /etc/default/grub
|
||||
|
||||
echo "Done! reboot to test"
|
||||
|
||||
Executable
+17
@@ -0,0 +1,17 @@
|
||||
#!/bin/bash
|
||||
# Install lxc from source, apply patch, install
|
||||
# (instructions tested with 2.6.35 only):
|
||||
sudo apt-get -y install libcap-dev quilt
|
||||
cd ~/
|
||||
git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
|
||||
cd lxc
|
||||
git checkout lxc-0.7.2 -b lxc-0.7.2
|
||||
cp ~/mininet/util/kbuild/cfs-nsfd-kernel/lxc-patches.tar.gz .
|
||||
tar xzf lxc-patches.tar.gz
|
||||
# Modify patch. Small change to the patch: remove the 2nd argument to lxc_cgroup_path_get (it's set to NULL in the patch)
|
||||
sed -i -s 's/cgrouppath, NULL, my_args.name/cgrouppath, my_args.name/' patches/lxc-attach-bug-fix.patch
|
||||
quilt push -a
|
||||
./autogen.sh
|
||||
./configure
|
||||
make
|
||||
sudo make install
|
||||
Binary file not shown.
@@ -0,0 +1,4 @@
|
||||
#!/bin/sh
|
||||
echo `pwd`
|
||||
sed -i -s "s/Architecture: amd64/Architecture: i386/" DEBIAN/control
|
||||
|
||||
Executable
+14
@@ -0,0 +1,14 @@
|
||||
#!/bin/sh
|
||||
# Re-build OVS for the kernel version defined below.
|
||||
|
||||
# Location of the Open vSwitch source tree and the kernel to build against.
OVS_DIR=~/openvswitch
KERNEL_VER=`uname -r`
#KERNEL_VER=2.6.35-with-cfs
PROCESSORS=`grep -c ^processor /proc/cpuinfo`
# Stop if the source tree is missing instead of running configure elsewhere.
cd $OVS_DIR || exit 1
# Configure, build, and copy the module into the kernel's module tree; each
# step runs only if the previous one succeeded. depmod is part of the chain
# so that it no longer runs after a failed build or copy.
./configure --with-linux=/lib/modules/${KERNEL_VER}/build && \
sudo make -j${PROCESSORS} && \
sudo cp ./datapath/linux/openvswitch_mod.ko /lib/modules/${KERNEL_VER}/kernel/drivers/net && \
echo "Running depmod..." && \
sudo depmod -a ${KERNEL_VER}
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
From 57cc69f4a6d27c0b3ef495589a1d4629a9f1fa3e Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Wed, 6 Jul 2011 22:30:37 -0700
|
||||
Subject: [PATCH 01/19] sched: Don't update shares twice on on_rq parent
|
||||
|
||||
In dequeue_task_fair() we bail on dequeue when we encounter a parenting entity
|
||||
with additional weight. However, we perform a double shares update on this
|
||||
entity as we continue the shares update traversal from this point, despite
|
||||
dequeue_entity() having already updated its queuing cfs_rq.
|
||||
Avoid this by starting from the parent when we resume.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110707053059.797714697@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched_fair.c | 3 +++
|
||||
1 files changed, 3 insertions(+), 0 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index c768588..c80f030 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1370,6 +1370,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
*/
|
||||
if (task_sleep && parent_entity(se))
|
||||
set_next_buddy(parent_entity(se));
|
||||
+
|
||||
+ /* avoid re-evaluating load for this entity */
|
||||
+ se = parent_entity(se);
|
||||
break;
|
||||
}
|
||||
flags |= DEQUEUE_SLEEP;
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+168
@@ -0,0 +1,168 @@
|
||||
From 4ec11a3e21874534f9ffa70a8878bb255618bb33 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:27 -0700
|
||||
Subject: [PATCH 02/19] sched: Implement hierarchical task accounting for SCHED_OTHER
|
||||
|
||||
Introduce hierarchical task accounting for the group scheduling case in CFS, as
|
||||
well as promoting the responsibility for maintaining rq->nr_running to the
|
||||
scheduling classes.
|
||||
|
||||
The primary motivation for this is that with scheduling classes supporting
|
||||
bandwidth throttling it is possible for entities participating in throttled
|
||||
sub-trees to not have root visible changes in rq->nr_running across activate
|
||||
and de-activate operations. This in turn leads to incorrect idle and
|
||||
weight-per-task load balance decisions.
|
||||
|
||||
This also allows us to make a small fixlet to the fastpath in pick_next_task()
|
||||
under group scheduling.
|
||||
|
||||
Note: this issue also exists with the existing sched_rt throttling mechanism.
|
||||
This patch does not address that.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 6 ++----
|
||||
kernel/sched_fair.c | 10 ++++++++--
|
||||
kernel/sched_rt.c | 5 ++++-
|
||||
kernel/sched_stoptask.c | 2 ++
|
||||
4 files changed, 16 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index fde6ff9..b015a0e 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -308,7 +308,7 @@ struct task_group root_task_group;
|
||||
/* CFS-related fields in a runqueue */
|
||||
struct cfs_rq {
|
||||
struct load_weight load;
|
||||
- unsigned long nr_running;
|
||||
+ unsigned long nr_running, h_nr_running;
|
||||
|
||||
u64 exec_clock;
|
||||
u64 min_vruntime;
|
||||
@@ -1830,7 +1830,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
rq->nr_uninterruptible--;
|
||||
|
||||
enqueue_task(rq, p, flags);
|
||||
- inc_nr_running(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1842,7 +1841,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
rq->nr_uninterruptible++;
|
||||
|
||||
dequeue_task(rq, p, flags);
|
||||
- dec_nr_running(rq);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
@@ -4226,7 +4224,7 @@ pick_next_task(struct rq *rq)
|
||||
* Optimization: we know that if all tasks are in
|
||||
* the fair class we can call that function directly:
|
||||
*/
|
||||
- if (likely(rq->nr_running == rq->cfs.nr_running)) {
|
||||
+ if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
|
||||
p = fair_sched_class.pick_next_task(rq);
|
||||
if (likely(p))
|
||||
return p;
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index c80f030..f70bb4b 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1332,16 +1332,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
break;
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
enqueue_entity(cfs_rq, se, flags);
|
||||
+ cfs_rq->h_nr_running++;
|
||||
flags = ENQUEUE_WAKEUP;
|
||||
}
|
||||
|
||||
for_each_sched_entity(se) {
|
||||
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
+ cfs_rq = cfs_rq_of(se);
|
||||
+ cfs_rq->h_nr_running++;
|
||||
|
||||
update_cfs_load(cfs_rq, 0);
|
||||
update_cfs_shares(cfs_rq);
|
||||
}
|
||||
|
||||
+ inc_nr_running(rq);
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
@@ -1361,6 +1364,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
for_each_sched_entity(se) {
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
dequeue_entity(cfs_rq, se, flags);
|
||||
+ cfs_rq->h_nr_running--;
|
||||
|
||||
/* Don't dequeue parent if it has other entities besides us */
|
||||
if (cfs_rq->load.weight) {
|
||||
@@ -1379,12 +1383,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
}
|
||||
|
||||
for_each_sched_entity(se) {
|
||||
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
+ cfs_rq = cfs_rq_of(se);
|
||||
+ cfs_rq->h_nr_running--;
|
||||
|
||||
update_cfs_load(cfs_rq, 0);
|
||||
update_cfs_shares(cfs_rq);
|
||||
}
|
||||
|
||||
+ dec_nr_running(rq);
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
|
||||
index 10d0182..1af971b 100644
|
||||
--- a/kernel/sched_rt.c
|
||||
+++ b/kernel/sched_rt.c
|
||||
@@ -949,6 +949,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
|
||||
enqueue_pushable_task(rq, p);
|
||||
+
|
||||
+ inc_nr_running(rq);
|
||||
}
|
||||
|
||||
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
@@ -959,6 +961,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
|
||||
dequeue_rt_entity(rt_se);
|
||||
|
||||
dequeue_pushable_task(rq, p);
|
||||
+
|
||||
+ dec_nr_running(rq);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1851,4 +1855,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
|
||||
rcu_read_unlock();
|
||||
}
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
-
|
||||
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
|
||||
index 6f43763..8b44e7f 100644
|
||||
--- a/kernel/sched_stoptask.c
|
||||
+++ b/kernel/sched_stoptask.c
|
||||
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
|
||||
static void
|
||||
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
+ inc_nr_running(rq);
|
||||
}
|
||||
|
||||
static void
|
||||
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
|
||||
{
|
||||
+ dec_nr_running(rq);
|
||||
}
|
||||
|
||||
static void yield_task_stop(struct rq *rq)
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+380
@@ -0,0 +1,380 @@
|
||||
From 116f22667986ab86f1a00098a0daf9959b1f6df0 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:28 -0700
|
||||
Subject: [PATCH 03/19] sched: Introduce primitives to account for CFS bandwidth tracking
|
||||
|
||||
In this patch we introduce the notion of CFS bandwidth, partitioned into
|
||||
globally unassigned bandwidth, and locally claimed bandwidth.
|
||||
|
||||
- The global bandwidth is per task_group, it represents a pool of unclaimed
|
||||
bandwidth that cfs_rqs can allocate from.
|
||||
- The local bandwidth is tracked per-cfs_rq, this represents allotments from
|
||||
the global pool bandwidth assigned to a specific cpu.
|
||||
|
||||
Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
|
||||
- cpu.cfs_period_us : the bandwidth period in usecs
|
||||
- cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
|
||||
to consume over the period above.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Nikhil Rao <ncrao@google.com>
|
||||
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
init/Kconfig | 12 +++
|
||||
kernel/sched.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
kernel/sched_fair.c | 16 ++++
|
||||
3 files changed, 225 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/init/Kconfig b/init/Kconfig
|
||||
index 412c21b..67579ed 100644
|
||||
--- a/init/Kconfig
|
||||
+++ b/init/Kconfig
|
||||
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
|
||||
depends on CGROUP_SCHED
|
||||
default CGROUP_SCHED
|
||||
|
||||
+config CFS_BANDWIDTH
|
||||
+ bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
|
||||
+ depends on EXPERIMENTAL
|
||||
+ depends on FAIR_GROUP_SCHED
|
||||
+ default n
|
||||
+ help
|
||||
+ This option allows users to define CPU bandwidth rates (limits) for
|
||||
+ tasks running within the fair group scheduler. Groups with no limit
|
||||
+ set are considered to be unconstrained and will run with no
|
||||
+ restriction.
|
||||
+ See tip/Documentation/scheduler/sched-bwc.txt for more information.
|
||||
+
|
||||
config RT_GROUP_SCHED
|
||||
bool "Group scheduling for SCHED_RR/FIFO"
|
||||
depends on EXPERIMENTAL
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index b015a0e..28d838b 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -244,6 +244,14 @@ struct cfs_rq;
|
||||
|
||||
static LIST_HEAD(task_groups);
|
||||
|
||||
+struct cfs_bandwidth {
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ raw_spinlock_t lock;
|
||||
+ ktime_t period;
|
||||
+ u64 quota;
|
||||
+#endif
|
||||
+};
|
||||
+
|
||||
/* task group related information */
|
||||
struct task_group {
|
||||
struct cgroup_subsys_state css;
|
||||
@@ -275,6 +283,8 @@ struct task_group {
|
||||
#ifdef CONFIG_SCHED_AUTOGROUP
|
||||
struct autogroup *autogroup;
|
||||
#endif
|
||||
+
|
||||
+ struct cfs_bandwidth cfs_bandwidth;
|
||||
};
|
||||
|
||||
/* task_group_lock serializes the addition/removal of task groups */
|
||||
@@ -374,9 +384,48 @@ struct cfs_rq {
|
||||
|
||||
unsigned long load_contribution;
|
||||
#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ int runtime_enabled;
|
||||
+ s64 runtime_remaining;
|
||||
+#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
|
||||
+{
|
||||
+ return &tg->cfs_bandwidth;
|
||||
+}
|
||||
+
|
||||
+static inline u64 default_cfs_period(void);
|
||||
+
|
||||
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
+{
|
||||
+ raw_spin_lock_init(&cfs_b->lock);
|
||||
+ cfs_b->quota = RUNTIME_INF;
|
||||
+ cfs_b->period = ns_to_ktime(default_cfs_period());
|
||||
+}
|
||||
+
|
||||
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ cfs_rq->runtime_enabled = 0;
|
||||
+}
|
||||
+
|
||||
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
+{}
|
||||
+#else
|
||||
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
|
||||
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
|
||||
+
|
||||
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
|
||||
+{
|
||||
+ return NULL;
|
||||
+}
|
||||
+#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
+#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
+
|
||||
/* Real-Time classes' related field in a runqueue: */
|
||||
struct rt_rq {
|
||||
struct rt_prio_array active;
|
||||
@@ -7958,6 +8007,12 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
|
||||
tg->cfs_rq[cpu] = cfs_rq;
|
||||
init_cfs_rq(cfs_rq, rq);
|
||||
cfs_rq->tg = tg;
|
||||
+ cfs_rq->rq = rq;
|
||||
+#ifdef CONFIG_SMP
|
||||
+ /* allow initial update_cfs_load() to truncate */
|
||||
+ cfs_rq->load_stamp = 1;
|
||||
+#endif
|
||||
+ init_cfs_rq_runtime(cfs_rq);
|
||||
|
||||
tg->se[cpu] = se;
|
||||
/* se could be NULL for root_task_group */
|
||||
@@ -8093,6 +8148,7 @@ void __init sched_init(void)
|
||||
* We achieve this by letting root_task_group's tasks sit
|
||||
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
|
||||
*/
|
||||
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
|
||||
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
@@ -8336,6 +8392,8 @@ static void free_fair_sched_group(struct task_group *tg)
|
||||
{
|
||||
int i;
|
||||
|
||||
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
|
||||
+
|
||||
for_each_possible_cpu(i) {
|
||||
if (tg->cfs_rq)
|
||||
kfree(tg->cfs_rq[i]);
|
||||
@@ -8363,6 +8421,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
|
||||
|
||||
tg->shares = NICE_0_LOAD;
|
||||
|
||||
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
|
||||
+
|
||||
for_each_possible_cpu(i) {
|
||||
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
|
||||
GFP_KERNEL, cpu_to_node(i));
|
||||
@@ -8734,7 +8794,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
||||
return walk_tg_tree(tg_schedulable, tg_nop, &data);
|
||||
}
|
||||
|
||||
-static int tg_set_bandwidth(struct task_group *tg,
|
||||
+static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
u64 rt_period, u64 rt_runtime)
|
||||
{
|
||||
int i, err = 0;
|
||||
@@ -8773,7 +8833,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
|
||||
if (rt_runtime_us < 0)
|
||||
rt_runtime = RUNTIME_INF;
|
||||
|
||||
- return tg_set_bandwidth(tg, rt_period, rt_runtime);
|
||||
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
|
||||
}
|
||||
|
||||
long sched_group_rt_runtime(struct task_group *tg)
|
||||
@@ -8798,7 +8858,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
|
||||
if (rt_period == 0)
|
||||
return -EINVAL;
|
||||
|
||||
- return tg_set_bandwidth(tg, rt_period, rt_runtime);
|
||||
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
|
||||
}
|
||||
|
||||
long sched_group_rt_period(struct task_group *tg)
|
||||
@@ -8988,6 +9048,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
|
||||
|
||||
return (u64) scale_load_down(tg->shares);
|
||||
}
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
|
||||
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
|
||||
+
|
||||
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
+{
|
||||
+ int i;
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
+ static DEFINE_MUTEX(mutex);
|
||||
+
|
||||
+ if (tg == &root_task_group)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ /*
|
||||
+ * Ensure we have at some amount of bandwidth every period. This is
|
||||
+ * to prevent reaching a state of large arrears when throttled via
|
||||
+ * entity_tick() resulting in prolonged exit starvation.
|
||||
+ */
|
||||
+ if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ /*
|
||||
+ * Likewise, bound things on the otherside by preventing insane quota
|
||||
+ * periods. This also allows us to normalize in computing quota
|
||||
+ * feasibility.
|
||||
+ */
|
||||
+ if (period > max_cfs_quota_period)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ mutex_lock(&mutex);
|
||||
+ raw_spin_lock_irq(&cfs_b->lock);
|
||||
+ cfs_b->period = ns_to_ktime(period);
|
||||
+ cfs_b->quota = quota;
|
||||
+ raw_spin_unlock_irq(&cfs_b->lock);
|
||||
+
|
||||
+ for_each_possible_cpu(i) {
|
||||
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
|
||||
+ struct rq *rq = rq_of(cfs_rq);
|
||||
+
|
||||
+ raw_spin_lock_irq(&rq->lock);
|
||||
+ cfs_rq->runtime_enabled = quota != RUNTIME_INF;
|
||||
+ cfs_rq->runtime_remaining = 0;
|
||||
+ raw_spin_unlock_irq(&rq->lock);
|
||||
+ }
|
||||
+ mutex_unlock(&mutex);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
|
||||
+{
|
||||
+ u64 quota, period;
|
||||
+
|
||||
+ period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
|
||||
+ if (cfs_quota_us < 0)
|
||||
+ quota = RUNTIME_INF;
|
||||
+ else
|
||||
+ quota = (u64)cfs_quota_us * NSEC_PER_USEC;
|
||||
+
|
||||
+ return tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+}
|
||||
+
|
||||
+long tg_get_cfs_quota(struct task_group *tg)
|
||||
+{
|
||||
+ u64 quota_us;
|
||||
+
|
||||
+ if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
|
||||
+ return -1;
|
||||
+
|
||||
+ quota_us = tg_cfs_bandwidth(tg)->quota;
|
||||
+ do_div(quota_us, NSEC_PER_USEC);
|
||||
+
|
||||
+ return quota_us;
|
||||
+}
|
||||
+
|
||||
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
|
||||
+{
|
||||
+ u64 quota, period;
|
||||
+
|
||||
+ period = (u64)cfs_period_us * NSEC_PER_USEC;
|
||||
+ quota = tg_cfs_bandwidth(tg)->quota;
|
||||
+
|
||||
+ if (period <= 0)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ return tg_set_cfs_bandwidth(tg, period, quota);
|
||||
+}
|
||||
+
|
||||
+long tg_get_cfs_period(struct task_group *tg)
|
||||
+{
|
||||
+ u64 cfs_period_us;
|
||||
+
|
||||
+ cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
|
||||
+ do_div(cfs_period_us, NSEC_PER_USEC);
|
||||
+
|
||||
+ return cfs_period_us;
|
||||
+}
|
||||
+
|
||||
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
|
||||
+{
|
||||
+ return tg_get_cfs_quota(cgroup_tg(cgrp));
|
||||
+}
|
||||
+
|
||||
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
|
||||
+ s64 cfs_quota_us)
|
||||
+{
|
||||
+ return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
|
||||
+}
|
||||
+
|
||||
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
|
||||
+{
|
||||
+ return tg_get_cfs_period(cgroup_tg(cgrp));
|
||||
+}
|
||||
+
|
||||
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
|
||||
+ u64 cfs_period_us)
|
||||
+{
|
||||
+ return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
@@ -9022,6 +9204,18 @@ static struct cftype cpu_files[] = {
|
||||
.write_u64 = cpu_shares_write_u64,
|
||||
},
|
||||
#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .name = "cfs_quota_us",
|
||||
+ .read_s64 = cpu_cfs_quota_read_s64,
|
||||
+ .write_s64 = cpu_cfs_quota_write_s64,
|
||||
+ },
|
||||
+ {
|
||||
+ .name = "cfs_period_us",
|
||||
+ .read_u64 = cpu_cfs_period_read_u64,
|
||||
+ .write_u64 = cpu_cfs_period_write_u64,
|
||||
+ },
|
||||
+#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
{
|
||||
.name = "rt_runtime_us",
|
||||
@@ -9331,4 +9525,3 @@ struct cgroup_subsys cpuacct_subsys = {
|
||||
.subsys_id = cpuacct_subsys_id,
|
||||
};
|
||||
#endif /* CONFIG_CGROUP_CPUACCT */
|
||||
-
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index f70bb4b..91624cf 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1256,6 +1256,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
||||
check_preempt_tick(cfs_rq, curr);
|
||||
}
|
||||
|
||||
+
|
||||
+/**************************************************
|
||||
+ * CFS bandwidth control machinery
|
||||
+ */
|
||||
+
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+/*
|
||||
+ * default period for cfs group bandwidth.
|
||||
+ * default: 0.1s, units: nanoseconds
|
||||
+ */
|
||||
+static inline u64 default_cfs_period(void)
|
||||
+{
|
||||
+ return 100000000ULL;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
/**************************************************
|
||||
* CFS operations on tasks:
|
||||
*/
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,221 @@
|
||||
From e68a3cf7b0006f6d8c362833ebc96cbed01a263e Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:29 -0700
|
||||
Subject: [PATCH 04/19] sched: Validate CFS quota hierarchies
|
||||
|
||||
Add constraints validation for CFS bandwidth hierarchies.
|
||||
|
||||
Validate that:
|
||||
max(child bandwidth) <= parent_bandwidth
|
||||
|
||||
In a quota limited hierarchy, an unconstrained entity
|
||||
(e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
|
||||
|
||||
This constraint is chosen over sum(child_bandwidth) as the notion of over-commit is
|
||||
valuable within SCHED_OTHER. Some basic code from the RT case is re-factored
|
||||
for reuse.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++-------
|
||||
1 files changed, 98 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 28d838b..75f2dd7 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -249,6 +249,7 @@ struct cfs_bandwidth {
|
||||
raw_spinlock_t lock;
|
||||
ktime_t period;
|
||||
u64 quota;
|
||||
+ s64 hierarchal_quota;
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -1512,7 +1513,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
|
||||
update_load_sub(&rq->load, load);
|
||||
}
|
||||
|
||||
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
|
||||
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
|
||||
+ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
|
||||
typedef int (*tg_visitor)(struct task_group *, void *);
|
||||
|
||||
/*
|
||||
@@ -8694,12 +8696,7 @@ unsigned long sched_group_shares(struct task_group *tg)
|
||||
}
|
||||
#endif
|
||||
|
||||
-#ifdef CONFIG_RT_GROUP_SCHED
|
||||
-/*
|
||||
- * Ensure that the real time constraints are schedulable.
|
||||
- */
|
||||
-static DEFINE_MUTEX(rt_constraints_mutex);
|
||||
-
|
||||
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
|
||||
static unsigned long to_ratio(u64 period, u64 runtime)
|
||||
{
|
||||
if (runtime == RUNTIME_INF)
|
||||
@@ -8707,6 +8704,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
|
||||
|
||||
return div64_u64(runtime << 20, period);
|
||||
}
|
||||
+#endif
|
||||
+
|
||||
+#ifdef CONFIG_RT_GROUP_SCHED
|
||||
+/*
|
||||
+ * Ensure that the real time constraints are schedulable.
|
||||
+ */
|
||||
+static DEFINE_MUTEX(rt_constraints_mutex);
|
||||
|
||||
/* Must be called with tasklist_lock held */
|
||||
static inline int tg_has_rt_tasks(struct task_group *tg)
|
||||
@@ -8727,7 +8731,7 @@ struct rt_schedulable_data {
|
||||
u64 rt_runtime;
|
||||
};
|
||||
|
||||
-static int tg_schedulable(struct task_group *tg, void *data)
|
||||
+static int tg_rt_schedulable(struct task_group *tg, void *data)
|
||||
{
|
||||
struct rt_schedulable_data *d = data;
|
||||
struct task_group *child;
|
||||
@@ -8791,7 +8795,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
||||
.rt_runtime = runtime,
|
||||
};
|
||||
|
||||
- return walk_tg_tree(tg_schedulable, tg_nop, &data);
|
||||
+ return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
|
||||
}
|
||||
|
||||
static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
@@ -9050,14 +9054,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static DEFINE_MUTEX(cfs_constraints_mutex);
|
||||
+
|
||||
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
|
||||
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
|
||||
|
||||
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
|
||||
+
|
||||
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
{
|
||||
- int i;
|
||||
+ int i, ret = 0;
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
- static DEFINE_MUTEX(mutex);
|
||||
|
||||
if (tg == &root_task_group)
|
||||
return -EINVAL;
|
||||
@@ -9078,7 +9085,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
if (period > max_cfs_quota_period)
|
||||
return -EINVAL;
|
||||
|
||||
- mutex_lock(&mutex);
|
||||
+ mutex_lock(&cfs_constraints_mutex);
|
||||
+ ret = __cfs_schedulable(tg, period, quota);
|
||||
+ if (ret)
|
||||
+ goto out_unlock;
|
||||
+
|
||||
raw_spin_lock_irq(&cfs_b->lock);
|
||||
cfs_b->period = ns_to_ktime(period);
|
||||
cfs_b->quota = quota;
|
||||
@@ -9093,9 +9104,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
cfs_rq->runtime_remaining = 0;
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
}
|
||||
- mutex_unlock(&mutex);
|
||||
+out_unlock:
|
||||
+ mutex_unlock(&cfs_constraints_mutex);
|
||||
|
||||
- return 0;
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
|
||||
@@ -9169,6 +9181,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
|
||||
return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
|
||||
}
|
||||
|
||||
+struct cfs_schedulable_data {
|
||||
+ struct task_group *tg;
|
||||
+ u64 period, quota;
|
||||
+};
|
||||
+
|
||||
+/*
|
||||
+ * normalize group quota/period to be quota/max_period
|
||||
+ * note: units are usecs
|
||||
+ */
|
||||
+static u64 normalize_cfs_quota(struct task_group *tg,
|
||||
+ struct cfs_schedulable_data *d)
|
||||
+{
|
||||
+ u64 quota, period;
|
||||
+
|
||||
+ if (tg == d->tg) {
|
||||
+ period = d->period;
|
||||
+ quota = d->quota;
|
||||
+ } else {
|
||||
+ period = tg_get_cfs_period(tg);
|
||||
+ quota = tg_get_cfs_quota(tg);
|
||||
+ }
|
||||
+
|
||||
+ /* note: these should typically be equivalent */
|
||||
+ if (quota == RUNTIME_INF || quota == -1)
|
||||
+ return RUNTIME_INF;
|
||||
+
|
||||
+ return to_ratio(period, quota);
|
||||
+}
|
||||
+
|
||||
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
|
||||
+{
|
||||
+ struct cfs_schedulable_data *d = data;
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
+ s64 quota = 0, parent_quota = -1;
|
||||
+
|
||||
+ if (!tg->parent) {
|
||||
+ quota = RUNTIME_INF;
|
||||
+ } else {
|
||||
+ struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
|
||||
+
|
||||
+ quota = normalize_cfs_quota(tg, d);
|
||||
+ parent_quota = parent_b->hierarchal_quota;
|
||||
+
|
||||
+ /*
|
||||
+ * ensure max(child_quota) <= parent_quota, inherit when no
|
||||
+ * limit is set
|
||||
+ */
|
||||
+ if (quota == RUNTIME_INF)
|
||||
+ quota = parent_quota;
|
||||
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
|
||||
+ return -EINVAL;
|
||||
+ }
|
||||
+ cfs_b->hierarchal_quota = quota;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
+{
|
||||
+ struct cfs_schedulable_data data = {
|
||||
+ .tg = tg,
|
||||
+ .period = period,
|
||||
+ .quota = quota,
|
||||
+ };
|
||||
+
|
||||
+ if (quota != RUNTIME_INF) {
|
||||
+ do_div(data.period, NSEC_PER_USEC);
|
||||
+ do_div(data.quota, NSEC_PER_USEC);
|
||||
+ }
|
||||
+
|
||||
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
|
||||
+}
|
||||
#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+217
@@ -0,0 +1,217 @@
|
||||
From 50fe68ec9d454eced64cbfc29954ee64cc7225da Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:30 -0700
|
||||
Subject: [PATCH 05/19] sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth
|
||||
|
||||
Account bandwidth usage on the cfs_rq level versus the task_groups to which
|
||||
they belong. Whether we are tracking bandwidth on a given cfs_rq is maintained
|
||||
under cfs_rq->runtime_enabled.
|
||||
|
||||
cfs_rq's which belong to a bandwidth constrained task_group have their runtime
|
||||
accounted via the update_curr() path, which withdraws bandwidth from the global
|
||||
pool as desired. Updates involving the global pool are currently protected
|
||||
under cfs_bandwidth->lock, local runtime is protected by rq->lock.
|
||||
|
||||
This patch only assigns and tracks quota, no action is taken in the case that
|
||||
a cfs_rq overruns its allocation (cfs_rq->runtime_remaining stays <= 0).
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Nikhil Rao <ncrao@google.com>
|
||||
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
include/linux/sched.h | 4 ++
|
||||
kernel/sched.c | 4 ++-
|
||||
kernel/sched_fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
kernel/sysctl.c | 10 ++++++
|
||||
4 files changed, 94 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
||||
index 14a6c7b..adfc8eb 100644
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -2021,6 +2021,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
|
||||
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
|
||||
#endif
|
||||
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
|
||||
+#endif
|
||||
+
|
||||
#ifdef CONFIG_RT_MUTEXES
|
||||
extern int rt_mutex_getprio(struct task_struct *p);
|
||||
extern void rt_mutex_setprio(struct task_struct *p, int prio);
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 75f2dd7..cdbc7d3 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -248,7 +248,7 @@ struct cfs_bandwidth {
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
raw_spinlock_t lock;
|
||||
ktime_t period;
|
||||
- u64 quota;
|
||||
+ u64 quota, runtime;
|
||||
s64 hierarchal_quota;
|
||||
#endif
|
||||
};
|
||||
@@ -404,6 +404,7 @@ static inline u64 default_cfs_period(void);
|
||||
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
{
|
||||
raw_spin_lock_init(&cfs_b->lock);
|
||||
+ cfs_b->runtime = 0;
|
||||
cfs_b->quota = RUNTIME_INF;
|
||||
cfs_b->period = ns_to_ktime(default_cfs_period());
|
||||
}
|
||||
@@ -9093,6 +9094,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
raw_spin_lock_irq(&cfs_b->lock);
|
||||
cfs_b->period = ns_to_ktime(period);
|
||||
cfs_b->quota = quota;
|
||||
+ cfs_b->runtime = quota;
|
||||
raw_spin_unlock_irq(&cfs_b->lock);
|
||||
|
||||
for_each_possible_cpu(i) {
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 91624cf..863c9ec 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
|
||||
*/
|
||||
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
|
||||
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+/*
|
||||
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
|
||||
+ * each time a cfs_rq requests quota.
|
||||
+ *
|
||||
+ * Note: in the case that the slice exceeds the runtime remaining (either due
|
||||
+ * to consumption or the quota being specified to be smaller than the slice)
|
||||
+ * we will always only issue the remaining available time.
|
||||
+ *
|
||||
+ * default: 5 msec, units: microseconds
|
||||
+ */
|
||||
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
|
||||
+#endif
|
||||
+
|
||||
static const struct sched_class fair_sched_class;
|
||||
|
||||
/**************************************************************
|
||||
@@ -305,6 +319,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
|
||||
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
+ unsigned long delta_exec);
|
||||
|
||||
/**************************************************************
|
||||
* Scheduling class tree data structure manipulation methods:
|
||||
@@ -602,6 +618,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
|
||||
cpuacct_charge(curtask, delta_exec);
|
||||
account_group_exec_runtime(curtask, delta_exec);
|
||||
}
|
||||
+
|
||||
+ account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
}
|
||||
|
||||
static inline void
|
||||
@@ -1270,6 +1288,58 @@ static inline u64 default_cfs_period(void)
|
||||
{
|
||||
return 100000000ULL;
|
||||
}
|
||||
+
|
||||
+static inline u64 sched_cfs_bandwidth_slice(void)
|
||||
+{
|
||||
+ return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
|
||||
+}
|
||||
+
|
||||
+static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ struct task_group *tg = cfs_rq->tg;
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
+ u64 amount = 0, min_amount;
|
||||
+
|
||||
+ /* note: this is a positive sum as runtime_remaining <= 0 */
|
||||
+ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ if (cfs_b->quota == RUNTIME_INF)
|
||||
+ amount = min_amount;
|
||||
+ else if (cfs_b->runtime > 0) {
|
||||
+ amount = min(cfs_b->runtime, min_amount);
|
||||
+ cfs_b->runtime -= amount;
|
||||
+ }
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+
|
||||
+ cfs_rq->runtime_remaining += amount;
|
||||
+}
|
||||
+
|
||||
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
+ unsigned long delta_exec)
|
||||
+{
|
||||
+ if (!cfs_rq->runtime_enabled)
|
||||
+ return;
|
||||
+
|
||||
+ cfs_rq->runtime_remaining -= delta_exec;
|
||||
+ if (cfs_rq->runtime_remaining > 0)
|
||||
+ return;
|
||||
+
|
||||
+ assign_cfs_rq_runtime(cfs_rq);
|
||||
+}
|
||||
+
|
||||
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
+ unsigned long delta_exec)
|
||||
+{
|
||||
+ if (!cfs_rq->runtime_enabled)
|
||||
+ return;
|
||||
+
|
||||
+ __account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
+}
|
||||
+
|
||||
+#else
|
||||
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
+ unsigned long delta_exec) {}
|
||||
#endif
|
||||
|
||||
/**************************************************
|
||||
@@ -4264,8 +4334,13 @@ static void set_curr_task_fair(struct rq *rq)
|
||||
{
|
||||
struct sched_entity *se = &rq->curr->se;
|
||||
|
||||
- for_each_sched_entity(se)
|
||||
- set_next_entity(cfs_rq_of(se), se);
|
||||
+ for_each_sched_entity(se) {
|
||||
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
||||
+
|
||||
+ set_next_entity(cfs_rq, se);
|
||||
+ /* ensure bandwidth has been allocated on our new cfs_rq */
|
||||
+ account_cfs_rq_runtime(cfs_rq, 0);
|
||||
+ }
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
|
||||
index f175d98..b38ca7f 100644
|
||||
--- a/kernel/sysctl.c
|
||||
+++ b/kernel/sysctl.c
|
||||
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
|
||||
.extra2 = &one,
|
||||
},
|
||||
#endif
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+ {
|
||||
+ .procname = "sched_cfs_bandwidth_slice_us",
|
||||
+ .data = &sysctl_sched_cfs_bandwidth_slice,
|
||||
+ .maxlen = sizeof(unsigned int),
|
||||
+ .mode = 0644,
|
||||
+ .proc_handler = proc_dointvec_minmax,
|
||||
+ .extra1 = &one,
|
||||
+ },
|
||||
+#endif
|
||||
#ifdef CONFIG_PROVE_LOCKING
|
||||
{
|
||||
.procname = "prove_locking",
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+263
@@ -0,0 +1,263 @@
|
||||
From c127107a0b9f7fe08dd11c84ecb6b307052b7688 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:31 -0700
|
||||
Subject: [PATCH 06/19] sched: Add a timer to handle CFS bandwidth refresh
|
||||
|
||||
This patch adds a per-task_group timer which handles the refresh of the global
|
||||
CFS bandwidth pool.
|
||||
|
||||
Since the RT pool is using a similar timer there's some small refactoring to
|
||||
share this support.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 107 +++++++++++++++++++++++++++++++++++++++++----------
|
||||
kernel/sched_fair.c | 40 +++++++++++++++++-
|
||||
2 files changed, 123 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index cdbc7d3..4bb2d63 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -193,10 +193,28 @@ static inline int rt_bandwidth_enabled(void)
|
||||
return sysctl_sched_rt_runtime >= 0;
|
||||
}
|
||||
|
||||
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
||||
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
|
||||
{
|
||||
- ktime_t now;
|
||||
+ unsigned long delta;
|
||||
+ ktime_t soft, hard, now;
|
||||
|
||||
+ for (;;) {
|
||||
+ if (hrtimer_active(period_timer))
|
||||
+ break;
|
||||
+
|
||||
+ now = hrtimer_cb_get_time(period_timer);
|
||||
+ hrtimer_forward(period_timer, now, period);
|
||||
+
|
||||
+ soft = hrtimer_get_softexpires(period_timer);
|
||||
+ hard = hrtimer_get_expires(period_timer);
|
||||
+ delta = ktime_to_ns(ktime_sub(hard, soft));
|
||||
+ __hrtimer_start_range_ns(period_timer, soft, delta,
|
||||
+ HRTIMER_MODE_ABS_PINNED, 0);
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
||||
+{
|
||||
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
|
||||
return;
|
||||
|
||||
@@ -204,22 +222,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
|
||||
return;
|
||||
|
||||
raw_spin_lock(&rt_b->rt_runtime_lock);
|
||||
- for (;;) {
|
||||
- unsigned long delta;
|
||||
- ktime_t soft, hard;
|
||||
-
|
||||
- if (hrtimer_active(&rt_b->rt_period_timer))
|
||||
- break;
|
||||
-
|
||||
- now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
|
||||
- hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
|
||||
-
|
||||
- soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
|
||||
- hard = hrtimer_get_expires(&rt_b->rt_period_timer);
|
||||
- delta = ktime_to_ns(ktime_sub(hard, soft));
|
||||
- __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
|
||||
- HRTIMER_MODE_ABS_PINNED, 0);
|
||||
- }
|
||||
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
|
||||
raw_spin_unlock(&rt_b->rt_runtime_lock);
|
||||
}
|
||||
|
||||
@@ -250,6 +253,9 @@ struct cfs_bandwidth {
|
||||
ktime_t period;
|
||||
u64 quota, runtime;
|
||||
s64 hierarchal_quota;
|
||||
+
|
||||
+ int idle, timer_active;
|
||||
+ struct hrtimer period_timer;
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -400,6 +406,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
|
||||
}
|
||||
|
||||
static inline u64 default_cfs_period(void);
|
||||
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
|
||||
+
|
||||
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
|
||||
+{
|
||||
+ struct cfs_bandwidth *cfs_b =
|
||||
+ container_of(timer, struct cfs_bandwidth, period_timer);
|
||||
+ ktime_t now;
|
||||
+ int overrun;
|
||||
+ int idle = 0;
|
||||
+
|
||||
+ for (;;) {
|
||||
+ now = hrtimer_cb_get_time(timer);
|
||||
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
|
||||
+
|
||||
+ if (!overrun)
|
||||
+ break;
|
||||
+
|
||||
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
|
||||
+ }
|
||||
+
|
||||
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
|
||||
+}
|
||||
|
||||
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
{
|
||||
@@ -407,6 +435,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
cfs_b->runtime = 0;
|
||||
cfs_b->quota = RUNTIME_INF;
|
||||
cfs_b->period = ns_to_ktime(default_cfs_period());
|
||||
+
|
||||
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
+ cfs_b->period_timer.function = sched_cfs_period_timer;
|
||||
}
|
||||
|
||||
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
@@ -414,8 +445,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
cfs_rq->runtime_enabled = 0;
|
||||
}
|
||||
|
||||
+/* requires cfs_b->lock, may release to reprogram timer */
|
||||
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
+{
|
||||
+ /*
|
||||
+ * The timer may be active because we're trying to set a new bandwidth
|
||||
+ * period or because we're racing with the tear-down path
|
||||
+ * (timer_active==0 becomes visible before the hrtimer call-back
|
||||
+ * terminates). In either case we ensure that it's re-programmed
|
||||
+ */
|
||||
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+ /* ensure cfs_b->lock is available while we wait */
|
||||
+ hrtimer_cancel(&cfs_b->period_timer);
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ /* if someone else restarted the timer then we're done */
|
||||
+ if (cfs_b->timer_active)
|
||||
+ return;
|
||||
+ }
|
||||
+
|
||||
+ cfs_b->timer_active = 1;
|
||||
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
|
||||
+}
|
||||
+
|
||||
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
-{}
|
||||
+{
|
||||
+ hrtimer_cancel(&cfs_b->period_timer);
|
||||
+}
|
||||
#else
|
||||
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
|
||||
@@ -9064,7 +9121,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
|
||||
|
||||
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
{
|
||||
- int i, ret = 0;
|
||||
+ int i, ret = 0, runtime_enabled;
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
|
||||
if (tg == &root_task_group)
|
||||
@@ -9091,10 +9148,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
+ runtime_enabled = quota != RUNTIME_INF;
|
||||
raw_spin_lock_irq(&cfs_b->lock);
|
||||
cfs_b->period = ns_to_ktime(period);
|
||||
cfs_b->quota = quota;
|
||||
cfs_b->runtime = quota;
|
||||
+
|
||||
+ /* restart the period timer (if active) to handle new period expiry */
|
||||
+ if (runtime_enabled && cfs_b->timer_active) {
|
||||
+ /* force a reprogram */
|
||||
+ cfs_b->timer_active = 0;
|
||||
+ __start_cfs_bandwidth(cfs_b);
|
||||
+ }
|
||||
raw_spin_unlock_irq(&cfs_b->lock);
|
||||
|
||||
for_each_possible_cpu(i) {
|
||||
@@ -9102,7 +9167,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
|
||||
raw_spin_lock_irq(&rq->lock);
|
||||
- cfs_rq->runtime_enabled = quota != RUNTIME_INF;
|
||||
+ cfs_rq->runtime_enabled = runtime_enabled;
|
||||
cfs_rq->runtime_remaining = 0;
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
}
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 863c9ec..e34c26c 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1306,9 +1306,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
if (cfs_b->quota == RUNTIME_INF)
|
||||
amount = min_amount;
|
||||
- else if (cfs_b->runtime > 0) {
|
||||
- amount = min(cfs_b->runtime, min_amount);
|
||||
- cfs_b->runtime -= amount;
|
||||
+ else {
|
||||
+ /* ensure bandwidth timer remains active under consumption */
|
||||
+ if (!cfs_b->timer_active)
|
||||
+ __start_cfs_bandwidth(cfs_b);
|
||||
+
|
||||
+ if (cfs_b->runtime > 0) {
|
||||
+ amount = min(cfs_b->runtime, min_amount);
|
||||
+ cfs_b->runtime -= amount;
|
||||
+ cfs_b->idle = 0;
|
||||
+ }
|
||||
}
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
@@ -1337,6 +1344,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
__account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
|
||||
+ * cfs_rqs as appropriate. If there has been no activity within the last
|
||||
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
|
||||
+ * used to track this state.
|
||||
+ */
|
||||
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
+{
|
||||
+ int idle = 1;
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ /* no need to continue the timer with no bandwidth constraint */
|
||||
+ if (cfs_b->quota == RUNTIME_INF)
|
||||
+ goto out_unlock;
|
||||
+
|
||||
+ idle = cfs_b->idle;
|
||||
+ cfs_b->runtime = cfs_b->quota;
|
||||
+
|
||||
+ /* mark as potentially idle for the upcoming period */
|
||||
+ cfs_b->idle = 1;
|
||||
+out_unlock:
|
||||
+ if (idle)
|
||||
+ cfs_b->timer_active = 0;
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+
|
||||
+ return idle;
|
||||
+}
|
||||
#else
|
||||
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
unsigned long delta_exec) {}
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,208 @@
|
||||
From bfd5537a5bca64bb37c64b3156bdbb85dbd46fae Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:32 -0700
|
||||
Subject: [PATCH 07/19] sched: Expire invalid runtime
|
||||
|
||||
Since quota is managed using a global state but consumed on a per-cpu basis
|
||||
we need to ensure that our per-cpu state is appropriately synchronized.
|
||||
Most importantly, runtime that is stale (from a previous period) should not be
|
||||
locally consumable.
|
||||
|
||||
We take advantage of existing sched_clock synchronization about the jiffy to
|
||||
efficiently detect whether we have (globally) crossed a quota boundary above.
|
||||
|
||||
One catch is that the direction of spread on sched_clock is undefined,
|
||||
specifically, we don't know whether our local clock is behind or ahead
|
||||
of the one responsible for the current expiration time.
|
||||
|
||||
Fortunately we can differentiate these by considering whether the
|
||||
global deadline has advanced. If it has not, then we assume our clock to be
|
||||
"fast" and advance our local expiration; otherwise, we know the deadline has
|
||||
truly passed and we expire our local runtime.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 4 ++-
|
||||
kernel/sched_fair.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-----
|
||||
2 files changed, 84 insertions(+), 10 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 4bb2d63..6a0bcd5 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -253,6 +253,7 @@ struct cfs_bandwidth {
|
||||
ktime_t period;
|
||||
u64 quota, runtime;
|
||||
s64 hierarchal_quota;
|
||||
+ u64 runtime_expires;
|
||||
|
||||
int idle, timer_active;
|
||||
struct hrtimer period_timer;
|
||||
@@ -393,6 +394,7 @@ struct cfs_rq {
|
||||
#endif
|
||||
#ifdef CONFIG_CFS_BANDWIDTH
|
||||
int runtime_enabled;
|
||||
+ u64 runtime_expires;
|
||||
s64 runtime_remaining;
|
||||
#endif
|
||||
#endif
|
||||
@@ -9152,8 +9154,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
raw_spin_lock_irq(&cfs_b->lock);
|
||||
cfs_b->period = ns_to_ktime(period);
|
||||
cfs_b->quota = quota;
|
||||
- cfs_b->runtime = quota;
|
||||
|
||||
+ __refill_cfs_bandwidth_runtime(cfs_b);
|
||||
/* restart the period timer (if active) to handle new period expiry */
|
||||
if (runtime_enabled && cfs_b->timer_active) {
|
||||
/* force a reprogram */
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index e34c26c..a97d19e 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1294,11 +1294,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
|
||||
return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Replenish runtime according to assigned quota and update expiration time.
|
||||
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
|
||||
+ * additional synchronization around rq->lock.
|
||||
+ *
|
||||
+ * requires cfs_b->lock
|
||||
+ */
|
||||
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
|
||||
+{
|
||||
+ u64 now;
|
||||
+
|
||||
+ if (cfs_b->quota == RUNTIME_INF)
|
||||
+ return;
|
||||
+
|
||||
+ now = sched_clock_cpu(smp_processor_id());
|
||||
+ cfs_b->runtime = cfs_b->quota;
|
||||
+ cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
|
||||
+}
|
||||
+
|
||||
static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct task_group *tg = cfs_rq->tg;
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
- u64 amount = 0, min_amount;
|
||||
+ u64 amount = 0, min_amount, expires;
|
||||
|
||||
/* note: this is a positive sum as runtime_remaining <= 0 */
|
||||
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
|
||||
@@ -1307,9 +1326,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
if (cfs_b->quota == RUNTIME_INF)
|
||||
amount = min_amount;
|
||||
else {
|
||||
- /* ensure bandwidth timer remains active under consumption */
|
||||
- if (!cfs_b->timer_active)
|
||||
+ /*
|
||||
+ * If the bandwidth pool has become inactive, then at least one
|
||||
+ * period must have elapsed since the last consumption.
|
||||
+ * Refresh the global state and ensure bandwidth timer becomes
|
||||
+ * active.
|
||||
+ */
|
||||
+ if (!cfs_b->timer_active) {
|
||||
+ __refill_cfs_bandwidth_runtime(cfs_b);
|
||||
__start_cfs_bandwidth(cfs_b);
|
||||
+ }
|
||||
|
||||
if (cfs_b->runtime > 0) {
|
||||
amount = min(cfs_b->runtime, min_amount);
|
||||
@@ -1317,19 +1343,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
cfs_b->idle = 0;
|
||||
}
|
||||
}
|
||||
+ expires = cfs_b->runtime_expires;
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
cfs_rq->runtime_remaining += amount;
|
||||
+ /*
|
||||
+ * we may have advanced our local expiration to account for allowed
|
||||
+ * spread between our sched_clock and the one on which runtime was
|
||||
+ * issued.
|
||||
+ */
|
||||
+ if ((s64)(expires - cfs_rq->runtime_expires) > 0)
|
||||
+ cfs_rq->runtime_expires = expires;
|
||||
}
|
||||
|
||||
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
- unsigned long delta_exec)
|
||||
+/*
|
||||
+ * Note: This depends on the synchronization provided by sched_clock and the
|
||||
+ * fact that rq->clock snapshots this value.
|
||||
+ */
|
||||
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
- if (!cfs_rq->runtime_enabled)
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
+ struct rq *rq = rq_of(cfs_rq);
|
||||
+
|
||||
+ /* if the deadline is ahead of our clock, nothing to do */
|
||||
+ if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
|
||||
+ return;
|
||||
+
|
||||
+ if (cfs_rq->runtime_remaining < 0)
|
||||
return;
|
||||
|
||||
+ /*
|
||||
+ * If the local deadline has passed we have to consider the
|
||||
+ * possibility that our sched_clock is 'fast' and the global deadline
|
||||
+ * has not truly expired.
|
||||
+ *
|
||||
+ * Fortunately we can check determine whether this the case by checking
|
||||
+ * whether the global deadline has advanced.
|
||||
+ */
|
||||
+
|
||||
+ if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
|
||||
+ /* extend local deadline, drift is bounded above by 2 ticks */
|
||||
+ cfs_rq->runtime_expires += TICK_NSEC;
|
||||
+ } else {
|
||||
+ /* global deadline is ahead, expiration has passed */
|
||||
+ cfs_rq->runtime_remaining = 0;
|
||||
+ }
|
||||
+}
|
||||
+
|
||||
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
+ unsigned long delta_exec)
|
||||
+{
|
||||
+ /* dock delta_exec before expiring quota (as it could span periods) */
|
||||
cfs_rq->runtime_remaining -= delta_exec;
|
||||
- if (cfs_rq->runtime_remaining > 0)
|
||||
+ expire_cfs_rq_runtime(cfs_rq);
|
||||
+
|
||||
+ if (likely(cfs_rq->runtime_remaining > 0))
|
||||
return;
|
||||
|
||||
assign_cfs_rq_runtime(cfs_rq);
|
||||
@@ -1360,7 +1428,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
goto out_unlock;
|
||||
|
||||
idle = cfs_b->idle;
|
||||
- cfs_b->runtime = cfs_b->quota;
|
||||
+ /* if we're going inactive then everything else can be deferred */
|
||||
+ if (idle)
|
||||
+ goto out_unlock;
|
||||
+
|
||||
+ __refill_cfs_bandwidth_runtime(cfs_b);
|
||||
+
|
||||
|
||||
/* mark as potentially idle for the upcoming period */
|
||||
cfs_b->idle = 1;
|
||||
@@ -1579,7 +1652,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
||||
|
||||
return wl;
|
||||
}
|
||||
-
|
||||
#else
|
||||
|
||||
static inline unsigned long effective_load(struct task_group *tg, int cpu,
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,234 @@
|
||||
From 726bbbeef1579f5f981d2d98afda0304197b7e19 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:33 -0700
|
||||
Subject: [PATCH 08/19] sched: Add support for throttling group entities
|
||||
|
||||
Now that consumption is tracked (via update_curr()) we add support to throttle
|
||||
group entities (and their corresponding cfs_rqs) in the case where there is no
|
||||
run-time remaining.
|
||||
|
||||
Throttled entities are dequeued to prevent scheduling, additionally we mark
|
||||
them as throttled (using cfs_rq->throttled) to prevent them from becoming
|
||||
re-enqueued until they are unthrottled. A list of a task_group's throttled
|
||||
entities are maintained on the cfs_bandwidth structure.
|
||||
|
||||
Note: While the machinery for throttling is added in this patch the act of
|
||||
throttling an entity exceeding its bandwidth is deferred until later within
|
||||
the series.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 7 ++++
|
||||
kernel/sched_fair.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++--
|
||||
2 files changed, 92 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 6a0bcd5..d631e42 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -257,6 +257,8 @@ struct cfs_bandwidth {
|
||||
|
||||
int idle, timer_active;
|
||||
struct hrtimer period_timer;
|
||||
+ struct list_head throttled_cfs_rq;
|
||||
+
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -396,6 +398,9 @@ struct cfs_rq {
|
||||
int runtime_enabled;
|
||||
u64 runtime_expires;
|
||||
s64 runtime_remaining;
|
||||
+
|
||||
+ int throttled;
|
||||
+ struct list_head throttled_list;
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
@@ -438,6 +443,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
cfs_b->quota = RUNTIME_INF;
|
||||
cfs_b->period = ns_to_ktime(default_cfs_period());
|
||||
|
||||
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
|
||||
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
cfs_b->period_timer.function = sched_cfs_period_timer;
|
||||
}
|
||||
@@ -445,6 +451,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
cfs_rq->runtime_enabled = 0;
|
||||
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
|
||||
}
|
||||
|
||||
/* requires cfs_b->lock, may release to reprogram timer */
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index a97d19e..f6823e2 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1313,7 +1313,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
|
||||
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
|
||||
}
|
||||
|
||||
-static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+/* returns 0 on failure to allocate runtime */
|
||||
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct task_group *tg = cfs_rq->tg;
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
@@ -1354,6 +1355,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
*/
|
||||
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
|
||||
cfs_rq->runtime_expires = expires;
|
||||
+
|
||||
+ return cfs_rq->runtime_remaining > 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1400,7 +1403,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
if (likely(cfs_rq->runtime_remaining > 0))
|
||||
return;
|
||||
|
||||
- assign_cfs_rq_runtime(cfs_rq);
|
||||
+ /*
|
||||
+ * if we're unable to extend our runtime we resched so that the active
|
||||
+ * hierarchy can be throttled
|
||||
+ */
|
||||
+ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
|
||||
+ resched_task(rq_of(cfs_rq)->curr);
|
||||
}
|
||||
|
||||
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
@@ -1412,6 +1420,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
__account_cfs_rq_runtime(cfs_rq, delta_exec);
|
||||
}
|
||||
|
||||
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ return cfs_rq->throttled;
|
||||
+}
|
||||
+
|
||||
+static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ struct rq *rq = rq_of(cfs_rq);
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
+ struct sched_entity *se;
|
||||
+ long task_delta, dequeue = 1;
|
||||
+
|
||||
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
|
||||
+
|
||||
+ /* account load preceding throttle */
|
||||
+ update_cfs_load(cfs_rq, 0);
|
||||
+
|
||||
+ task_delta = cfs_rq->h_nr_running;
|
||||
+ for_each_sched_entity(se) {
|
||||
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
+ /* throttled entity or throttle-on-deactivate */
|
||||
+ if (!se->on_rq)
|
||||
+ break;
|
||||
+
|
||||
+ if (dequeue)
|
||||
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
|
||||
+ qcfs_rq->h_nr_running -= task_delta;
|
||||
+
|
||||
+ if (qcfs_rq->load.weight)
|
||||
+ dequeue = 0;
|
||||
+ }
|
||||
+
|
||||
+ if (!se)
|
||||
+ rq->nr_running -= task_delta;
|
||||
+
|
||||
+ cfs_rq->throttled = 1;
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Responsible for refilling a task_group's bandwidth and unthrottling its
|
||||
* cfs_rqs as appropriate. If there has been no activity within the last
|
||||
@@ -1447,6 +1496,11 @@ out_unlock:
|
||||
#else
|
||||
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
unsigned long delta_exec) {}
|
||||
+
|
||||
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
#endif
|
||||
|
||||
/**************************************************
|
||||
@@ -1525,7 +1579,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
break;
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
enqueue_entity(cfs_rq, se, flags);
|
||||
+
|
||||
+ /*
|
||||
+ * end evaluation on encountering a throttled cfs_rq
|
||||
+ *
|
||||
+ * note: in the case of encountering a throttled cfs_rq we will
|
||||
+ * post the final h_nr_running increment below.
|
||||
+ */
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ break;
|
||||
cfs_rq->h_nr_running++;
|
||||
+
|
||||
flags = ENQUEUE_WAKEUP;
|
||||
}
|
||||
|
||||
@@ -1533,11 +1597,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
cfs_rq->h_nr_running++;
|
||||
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ break;
|
||||
+
|
||||
update_cfs_load(cfs_rq, 0);
|
||||
update_cfs_shares(cfs_rq);
|
||||
}
|
||||
|
||||
- inc_nr_running(rq);
|
||||
+ if (!se)
|
||||
+ inc_nr_running(rq);
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
@@ -1557,6 +1625,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
for_each_sched_entity(se) {
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
dequeue_entity(cfs_rq, se, flags);
|
||||
+
|
||||
+ /*
|
||||
+ * end evaluation on encountering a throttled cfs_rq
|
||||
+ *
|
||||
+ * note: in the case of encountering a throttled cfs_rq we will
|
||||
+ * post the final h_nr_running decrement below.
|
||||
+ */
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ break;
|
||||
cfs_rq->h_nr_running--;
|
||||
|
||||
/* Don't dequeue parent if it has other entities besides us */
|
||||
@@ -1579,11 +1656,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
||||
cfs_rq = cfs_rq_of(se);
|
||||
cfs_rq->h_nr_running--;
|
||||
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ break;
|
||||
+
|
||||
update_cfs_load(cfs_rq, 0);
|
||||
update_cfs_shares(cfs_rq);
|
||||
}
|
||||
|
||||
- dec_nr_running(rq);
|
||||
+ if (!se)
|
||||
+ dec_nr_running(rq);
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+197
@@ -0,0 +1,197 @@
|
||||
From b5898b8474a236451416cc68b2bea413c533f095 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:34 -0700
|
||||
Subject: [PATCH 09/19] sched: Add support for unthrottling group entities
|
||||
|
||||
At the start of each period we refresh the global bandwidth pool. At this time
|
||||
we must also unthrottle any cfs_rq entities who are now within bandwidth once
|
||||
more (as quota permits).
|
||||
|
||||
Unthrottled entities have their corresponding cfs_rq->throttled flag cleared
|
||||
and their entities re-enqueued.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 3 +
|
||||
kernel/sched_fair.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++--
|
||||
2 files changed, 126 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index d631e42..4b54a73 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -9178,6 +9178,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
||||
raw_spin_lock_irq(&rq->lock);
|
||||
cfs_rq->runtime_enabled = runtime_enabled;
|
||||
cfs_rq->runtime_remaining = 0;
|
||||
+
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ unthrottle_cfs_rq(cfs_rq);
|
||||
raw_spin_unlock_irq(&rq->lock);
|
||||
}
|
||||
out_unlock:
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index f6823e2..21e1c02 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1461,6 +1461,84 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
}
|
||||
|
||||
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ struct rq *rq = rq_of(cfs_rq);
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
+ struct sched_entity *se;
|
||||
+ int enqueue = 1;
|
||||
+ long task_delta;
|
||||
+
|
||||
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
|
||||
+
|
||||
+ cfs_rq->throttled = 0;
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ list_del_rcu(&cfs_rq->throttled_list);
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+
|
||||
+ if (!cfs_rq->load.weight)
|
||||
+ return;
|
||||
+
|
||||
+ task_delta = cfs_rq->h_nr_running;
|
||||
+ for_each_sched_entity(se) {
|
||||
+ if (se->on_rq)
|
||||
+ enqueue = 0;
|
||||
+
|
||||
+ cfs_rq = cfs_rq_of(se);
|
||||
+ if (enqueue)
|
||||
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
|
||||
+ cfs_rq->h_nr_running += task_delta;
|
||||
+
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ break;
|
||||
+ }
|
||||
+
|
||||
+ if (!se)
|
||||
+ rq->nr_running += task_delta;
|
||||
+
|
||||
+ /* determine whether we need to wake up potentially idle cpu */
|
||||
+ if (rq->curr == rq->idle && rq->cfs.nr_running)
|
||||
+ resched_task(rq->curr);
|
||||
+}
|
||||
+
|
||||
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
|
||||
+ u64 remaining, u64 expires)
|
||||
+{
|
||||
+ struct cfs_rq *cfs_rq;
|
||||
+ u64 runtime = remaining;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
|
||||
+ throttled_list) {
|
||||
+ struct rq *rq = rq_of(cfs_rq);
|
||||
+
|
||||
+ raw_spin_lock(&rq->lock);
|
||||
+ if (!cfs_rq_throttled(cfs_rq))
|
||||
+ goto next;
|
||||
+
|
||||
+ runtime = -cfs_rq->runtime_remaining + 1;
|
||||
+ if (runtime > remaining)
|
||||
+ runtime = remaining;
|
||||
+ remaining -= runtime;
|
||||
+
|
||||
+ cfs_rq->runtime_remaining += runtime;
|
||||
+ cfs_rq->runtime_expires = expires;
|
||||
+
|
||||
+ /* we check whether we're throttled above */
|
||||
+ if (cfs_rq->runtime_remaining > 0)
|
||||
+ unthrottle_cfs_rq(cfs_rq);
|
||||
+
|
||||
+next:
|
||||
+ raw_spin_unlock(&rq->lock);
|
||||
+
|
||||
+ if (!remaining)
|
||||
+ break;
|
||||
+ }
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ return remaining;
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* Responsible for refilling a task_group's bandwidth and unthrottling its
|
||||
* cfs_rqs as appropriate. If there has been no activity within the last
|
||||
@@ -1469,23 +1547,64 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
*/
|
||||
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
{
|
||||
- int idle = 1;
|
||||
+ u64 runtime, runtime_expires;
|
||||
+ int idle = 1, throttled;
|
||||
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
/* no need to continue the timer with no bandwidth constraint */
|
||||
if (cfs_b->quota == RUNTIME_INF)
|
||||
goto out_unlock;
|
||||
|
||||
- idle = cfs_b->idle;
|
||||
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
|
||||
+ /* idle depends on !throttled (for the case of a large deficit) */
|
||||
+ idle = cfs_b->idle && !throttled;
|
||||
+
|
||||
/* if we're going inactive then everything else can be deferred */
|
||||
if (idle)
|
||||
goto out_unlock;
|
||||
|
||||
__refill_cfs_bandwidth_runtime(cfs_b);
|
||||
|
||||
+ if (!throttled) {
|
||||
+ /* mark as potentially idle for the upcoming period */
|
||||
+ cfs_b->idle = 1;
|
||||
+ goto out_unlock;
|
||||
+ }
|
||||
+
|
||||
+ /*
|
||||
+ * There are throttled entities so we must first use the new bandwidth
|
||||
+ * to unthrottle them before making it generally available. This
|
||||
+ * ensures that all existing debts will be paid before a new cfs_rq is
|
||||
+ * allowed to run.
|
||||
+ */
|
||||
+ runtime = cfs_b->runtime;
|
||||
+ runtime_expires = cfs_b->runtime_expires;
|
||||
+ cfs_b->runtime = 0;
|
||||
+
|
||||
+ /*
|
||||
+ * This check is repeated as we are holding onto the new bandwidth
|
||||
+ * while we unthrottle. This can potentially race with an unthrottled
|
||||
+ * group trying to acquire new bandwidth from the global pool.
|
||||
+ */
|
||||
+ while (throttled && runtime > 0) {
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+ /* we can't nest cfs_b->lock while distributing bandwidth */
|
||||
+ runtime = distribute_cfs_runtime(cfs_b, runtime,
|
||||
+ runtime_expires);
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+
|
||||
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
|
||||
+ }
|
||||
|
||||
- /* mark as potentially idle for the upcoming period */
|
||||
- cfs_b->idle = 1;
|
||||
+ /* return (any) remaining runtime */
|
||||
+ cfs_b->runtime = runtime;
|
||||
+ /*
|
||||
+ * While we are ensured activity in the period following an
|
||||
+ * unthrottle, this also covers the case in which the new bandwidth is
|
||||
+ * insufficient to cover the existing bandwidth deficit. (Forcing the
|
||||
+ * timer to remain active while there are any throttled entities.)
|
||||
+ */
|
||||
+ cfs_b->idle = 0;
|
||||
out_unlock:
|
||||
if (idle)
|
||||
cfs_b->timer_active = 0;
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
From b152339efae7eb1bdd9ec4e626121e9205299e9d Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:35 -0700
|
||||
Subject: [PATCH 10/19] sched: Allow for positional tg_tree walks
|
||||
|
||||
Extend walk_tg_tree to accept a positional argument
|
||||
|
||||
static int walk_tg_tree_from(struct task_group *from,
|
||||
tg_visitor down, tg_visitor up, void *data)
|
||||
|
||||
Existing semantics are preserved, caller must hold rcu_lock() or sufficient
|
||||
analogue.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 50 +++++++++++++++++++++++++++++++++++++-------------
|
||||
1 files changed, 37 insertions(+), 13 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 4b54a73..813a4ce 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -1585,20 +1585,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
|
||||
typedef int (*tg_visitor)(struct task_group *, void *);
|
||||
|
||||
/*
|
||||
- * Iterate the full tree, calling @down when first entering a node and @up when
|
||||
- * leaving it for the final time.
|
||||
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
|
||||
+ * node and @up when leaving it for the final time.
|
||||
+ *
|
||||
+ * Caller must hold rcu_lock or sufficient equivalent.
|
||||
*/
|
||||
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
|
||||
+static int walk_tg_tree_from(struct task_group *from,
|
||||
+ tg_visitor down, tg_visitor up, void *data)
|
||||
{
|
||||
struct task_group *parent, *child;
|
||||
int ret;
|
||||
|
||||
- rcu_read_lock();
|
||||
- parent = &root_task_group;
|
||||
+ parent = from;
|
||||
+
|
||||
down:
|
||||
ret = (*down)(parent, data);
|
||||
if (ret)
|
||||
- goto out_unlock;
|
||||
+ goto out;
|
||||
list_for_each_entry_rcu(child, &parent->children, siblings) {
|
||||
parent = child;
|
||||
goto down;
|
||||
@@ -1607,19 +1610,29 @@ up:
|
||||
continue;
|
||||
}
|
||||
ret = (*up)(parent, data);
|
||||
- if (ret)
|
||||
- goto out_unlock;
|
||||
+ if (ret || parent == from)
|
||||
+ goto out;
|
||||
|
||||
child = parent;
|
||||
parent = parent->parent;
|
||||
if (parent)
|
||||
goto up;
|
||||
-out_unlock:
|
||||
- rcu_read_unlock();
|
||||
-
|
||||
+out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
+/*
|
||||
+ * Iterate the full tree, calling @down when first entering a node and @up when
|
||||
+ * leaving it for the final time.
|
||||
+ *
|
||||
+ * Caller must hold rcu_lock or sufficient equivalent.
|
||||
+ */
|
||||
+
|
||||
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
|
||||
+{
|
||||
+ return walk_tg_tree_from(&root_task_group, down, up, data);
|
||||
+}
|
||||
+
|
||||
static int tg_nop(struct task_group *tg, void *data)
|
||||
{
|
||||
return 0;
|
||||
@@ -8856,13 +8869,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
|
||||
|
||||
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
|
||||
{
|
||||
+ int ret;
|
||||
+
|
||||
struct rt_schedulable_data data = {
|
||||
.tg = tg,
|
||||
.rt_period = period,
|
||||
.rt_runtime = runtime,
|
||||
};
|
||||
|
||||
- return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
|
||||
+ rcu_read_lock();
|
||||
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
|
||||
static int tg_set_rt_bandwidth(struct task_group *tg,
|
||||
@@ -9319,6 +9338,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
|
||||
|
||||
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
{
|
||||
+ int ret;
|
||||
struct cfs_schedulable_data data = {
|
||||
.tg = tg,
|
||||
.period = period,
|
||||
@@ -9330,7 +9350,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
do_div(data.quota, NSEC_PER_USEC);
|
||||
}
|
||||
|
||||
- return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
|
||||
+ rcu_read_lock();
|
||||
+ ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ return ret;
|
||||
}
|
||||
#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+230
@@ -0,0 +1,230 @@
|
||||
From b7c5f316287ea56ecbc755110eaa032c588f2374 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:36 -0700
|
||||
Subject: [PATCH 11/19] sched: Prevent interactions with throttled entities
|
||||
|
||||
From the perspective of load-balance and shares distribution, throttled
|
||||
entities should be invisible.
|
||||
|
||||
However, both of these operations work on 'active' lists and are not
|
||||
inherently aware of what group hierarchies may be present. In some cases this
|
||||
may be side-stepped (e.g. we could sideload via tg_load_down in load balance)
|
||||
while in others (e.g. update_shares()) it is more difficult to compute without
|
||||
incurring some O(n^2) costs.
|
||||
|
||||
Instead, track hierarchical throttled state at time of transition. This
|
||||
allows us to easily identify whether an entity belongs to a throttled hierarchy
|
||||
and avoid incorrect interactions with it.
|
||||
|
||||
Also, when an entity leaves a throttled hierarchy we need to advance its
|
||||
time averaging for shares averaging so that the elapsed throttled time is not
|
||||
considered as part of the cfs_rq's operation.
|
||||
|
||||
We also use this information to prevent buddy interactions in the wakeup and
|
||||
yield_to() paths.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 2 +-
|
||||
kernel/sched_fair.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++---
|
||||
2 files changed, 98 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 813a4ce..523464e 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -399,7 +399,7 @@ struct cfs_rq {
|
||||
u64 runtime_expires;
|
||||
s64 runtime_remaining;
|
||||
|
||||
- int throttled;
|
||||
+ int throttled, throttle_count;
|
||||
struct list_head throttled_list;
|
||||
#endif
|
||||
#endif
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 21e1c02..3d7430b 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -725,6 +725,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
+/* we need this in update_cfs_load and load-balance functions below */
|
||||
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
|
||||
# ifdef CONFIG_SMP
|
||||
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
|
||||
int global_update)
|
||||
@@ -747,7 +749,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
|
||||
u64 now, delta;
|
||||
unsigned long load = cfs_rq->load.weight;
|
||||
|
||||
- if (cfs_rq->tg == &root_task_group)
|
||||
+ if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
|
||||
return;
|
||||
|
||||
now = rq_of(cfs_rq)->clock_task;
|
||||
@@ -856,7 +858,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
|
||||
|
||||
tg = cfs_rq->tg;
|
||||
se = tg->se[cpu_of(rq_of(cfs_rq))];
|
||||
- if (!se)
|
||||
+ if (!se || throttled_hierarchy(cfs_rq))
|
||||
return;
|
||||
#ifndef CONFIG_SMP
|
||||
if (likely(se->load.weight == tg->shares))
|
||||
@@ -1425,6 +1427,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
return cfs_rq->throttled;
|
||||
}
|
||||
|
||||
+/* check whether cfs_rq, or any parent, is throttled */
|
||||
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ return cfs_rq->throttle_count;
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * Ensure that neither of the group entities corresponding to src_cpu or
|
||||
+ * dest_cpu are members of a throttled hierarchy when performing group
|
||||
+ * load-balance operations.
|
||||
+ */
|
||||
+static inline int throttled_lb_pair(struct task_group *tg,
|
||||
+ int src_cpu, int dest_cpu)
|
||||
+{
|
||||
+ struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
|
||||
+
|
||||
+ src_cfs_rq = tg->cfs_rq[src_cpu];
|
||||
+ dest_cfs_rq = tg->cfs_rq[dest_cpu];
|
||||
+
|
||||
+ return throttled_hierarchy(src_cfs_rq) ||
|
||||
+ throttled_hierarchy(dest_cfs_rq);
|
||||
+}
|
||||
+
|
||||
+/* updated child weight may affect parent so we have to do this bottom up */
|
||||
+static int tg_unthrottle_up(struct task_group *tg, void *data)
|
||||
+{
|
||||
+ struct rq *rq = data;
|
||||
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
|
||||
+
|
||||
+ cfs_rq->throttle_count--;
|
||||
+#ifdef CONFIG_SMP
|
||||
+ if (!cfs_rq->throttle_count) {
|
||||
+ u64 delta = rq->clock_task - cfs_rq->load_stamp;
|
||||
+
|
||||
+ /* leaving throttled state, advance shares averaging windows */
|
||||
+ cfs_rq->load_stamp += delta;
|
||||
+ cfs_rq->load_last += delta;
|
||||
+
|
||||
+ /* update entity weight now that we are on_rq again */
|
||||
+ update_cfs_shares(cfs_rq);
|
||||
+ }
|
||||
+#endif
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int tg_throttle_down(struct task_group *tg, void *data)
|
||||
+{
|
||||
+ struct rq *rq = data;
|
||||
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
|
||||
+
|
||||
+ /* group is entering throttled state, record last load */
|
||||
+ if (!cfs_rq->throttle_count)
|
||||
+ update_cfs_load(cfs_rq, 0);
|
||||
+ cfs_rq->throttle_count++;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
@@ -1435,7 +1496,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
|
||||
|
||||
/* account load preceding throttle */
|
||||
- update_cfs_load(cfs_rq, 0);
|
||||
+ rcu_read_lock();
|
||||
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
|
||||
+ rcu_read_unlock();
|
||||
|
||||
task_delta = cfs_rq->h_nr_running;
|
||||
for_each_sched_entity(se) {
|
||||
@@ -1476,6 +1539,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
list_del_rcu(&cfs_rq->throttled_list);
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
|
||||
+ update_rq_clock(rq);
|
||||
+ /* update hierarchical throttle state */
|
||||
+ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
|
||||
+
|
||||
if (!cfs_rq->load.weight)
|
||||
return;
|
||||
|
||||
@@ -1620,6 +1687,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
+
|
||||
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static inline int throttled_lb_pair(struct task_group *tg,
|
||||
+ int src_cpu, int dest_cpu)
|
||||
+{
|
||||
+ return 0;
|
||||
+}
|
||||
#endif
|
||||
|
||||
/**************************************************
|
||||
@@ -2521,6 +2599,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
||||
|
||||
for_each_leaf_cfs_rq(busiest, cfs_rq) {
|
||||
list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
|
||||
+ if (throttled_lb_pair(task_group(p),
|
||||
+ busiest->cpu, this_cpu))
|
||||
+ break;
|
||||
|
||||
if (!can_migrate_task(p, busiest, this_cpu,
|
||||
sd, idle, &pinned))
|
||||
@@ -2632,8 +2713,17 @@ static void update_shares(int cpu)
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
rcu_read_lock();
|
||||
- for_each_leaf_cfs_rq(rq, cfs_rq)
|
||||
+ /*
|
||||
+ * Iterates the task_group tree in a bottom up fashion, see
|
||||
+ * list_add_leaf_cfs_rq() for details.
|
||||
+ */
|
||||
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
|
||||
+ /* throttled entities do not contribute to load */
|
||||
+ if (throttled_hierarchy(cfs_rq))
|
||||
+ continue;
|
||||
+
|
||||
update_shares_cpu(cfs_rq->tg, cpu);
|
||||
+ }
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
@@ -2657,9 +2747,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
|
||||
u64 rem_load, moved_load;
|
||||
|
||||
/*
|
||||
- * empty group
|
||||
+ * empty group or part of a throttled hierarchy
|
||||
*/
|
||||
- if (!busiest_cfs_rq->task_weight)
|
||||
+ if (!busiest_cfs_rq->task_weight ||
|
||||
+ throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
|
||||
continue;
|
||||
|
||||
rem_load = (u64)rem_load_move * busiest_weight;
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+65
@@ -0,0 +1,65 @@
|
||||
From 41f8be245b607e16567c13f6be065084b73c4977 Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:37 -0700
|
||||
Subject: [PATCH 12/19] sched: Prevent buddy interactions with throttled entities
|
||||
|
||||
Buddies allow us to select "on-rq" entities without actually selecting them
|
||||
from a cfs_rq's rb_tree. As a result we must ensure that throttled entities
|
||||
are not falsely nominated as buddies. The fact that entities are dequeued
|
||||
within throttle_entity is not sufficient for clearing buddy status as the
|
||||
nomination may occur after throttling.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.886850167@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched_fair.c | 18 +++++++++++++++++-
|
||||
1 files changed, 17 insertions(+), 1 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 3d7430b..3c0120e 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -2372,6 +2372,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
||||
if (unlikely(se == pse))
|
||||
return;
|
||||
|
||||
+ /*
|
||||
+ * This is possible from callers such as pull_task(), in which we
|
||||
+ * unconditionally check_prempt_curr() after an enqueue (which may have
|
||||
+ * lead to a throttle). This both saves work and prevents false
|
||||
+ * next-buddy nomination below.
|
||||
+ */
|
||||
+ if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
|
||||
+ return;
|
||||
+
|
||||
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
|
||||
set_next_buddy(pse);
|
||||
next_buddy_marked = 1;
|
||||
@@ -2380,6 +2389,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
||||
/*
|
||||
* We can come here with TIF_NEED_RESCHED already set from new task
|
||||
* wake up path.
|
||||
+ *
|
||||
+ * Note: this also catches the edge-case of curr being in a throttled
|
||||
+ * group (e.g. via set_curr_task), since update_curr() (in the
|
||||
+ * enqueue of curr) will have resulted in resched being set. This
|
||||
+ * prevents us from potentially nominating it as a false LAST_BUDDY
|
||||
+ * below.
|
||||
*/
|
||||
if (test_tsk_need_resched(curr))
|
||||
return;
|
||||
@@ -2502,7 +2517,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
|
||||
{
|
||||
struct sched_entity *se = &p->se;
|
||||
|
||||
- if (!se->on_rq)
|
||||
+ /* throttled hierarchies are not runnable */
|
||||
+ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
|
||||
return false;
|
||||
|
||||
/* Tell the scheduler that we'd really like pse to run next. */
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
From 7af3c5930e241d0bddc028e3d05ad396c32689be Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:38 -0700
|
||||
Subject: [PATCH 13/19] sched: Migrate throttled tasks on HOTPLUG
|
||||
|
||||
Throttled tasks are invisible to cpu-offline since they are not eligible for
|
||||
selection by pick_next_task(). The regular 'escape' path for a thread that is
|
||||
blocked at offline is via ttwu->select_task_rq, however this will not handle a
|
||||
throttled group since there are no individual thread wakeups on an unthrottle.
|
||||
|
||||
Resolve this by unthrottling offline cpus so that threads can be migrated.
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 27 +++++++++++++++++++++++++++
|
||||
1 files changed, 27 insertions(+), 0 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 523464e..7b99d63 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -6310,6 +6310,30 @@ static void calc_global_load_remove(struct rq *rq)
|
||||
rq->calc_load_active = 0;
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_CFS_BANDWIDTH
|
||||
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
|
||||
+{
|
||||
+ struct cfs_rq *cfs_rq;
|
||||
+
|
||||
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
+
|
||||
+ if (!cfs_rq->runtime_enabled)
|
||||
+ continue;
|
||||
+
|
||||
+ /*
|
||||
+ * clock_task is not advancing so we just need to make sure
|
||||
+ * there's some valid quota amount
|
||||
+ */
|
||||
+ cfs_rq->runtime_remaining = cfs_b->quota;
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ unthrottle_cfs_rq(cfs_rq);
|
||||
+ }
|
||||
+}
|
||||
+#else
|
||||
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
|
||||
+#endif
|
||||
+
|
||||
/*
|
||||
* Migrate all tasks from the rq, sleeping tasks will be migrated by
|
||||
* try_to_wake_up()->select_task_rq().
|
||||
@@ -6335,6 +6359,9 @@ static void migrate_tasks(unsigned int dead_cpu)
|
||||
*/
|
||||
rq->stop = NULL;
|
||||
|
||||
+ /* Ensure any throttled groups are reachable by pick_next_task */
|
||||
+ unthrottle_offline_cfs_rqs(rq);
|
||||
+
|
||||
for ( ; ; ) {
|
||||
/*
|
||||
* There's this thread running, bail when that's the only
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+133
@@ -0,0 +1,133 @@
|
||||
From d71d241613f903255fec91ba9f959a633c724b4e Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:39 -0700
|
||||
Subject: [PATCH 14/19] sched: Throttle entities exceeding their allowed bandwidth
|
||||
|
||||
With the machinery in place to throttle and unthrottle entities, as well as
|
||||
handle their participation (or lack thereof) we can now enable throttling.
|
||||
|
||||
There are 2 points at which we must check whether it's time to set throttled state:
|
||||
put_prev_entity() and enqueue_entity().
|
||||
|
||||
- put_prev_entity() is the typical throttle path, we reach it by exceeding our
|
||||
allocated run-time within update_curr()->account_cfs_rq_runtime() and going
|
||||
through a reschedule.
|
||||
|
||||
- enqueue_entity() covers the case of a wake-up into an already throttled
|
||||
group. In this case we know the group cannot be on_rq and can throttle
|
||||
immediately. Checks are added at time of put_prev_entity() and
|
||||
enqueue_entity()
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184758.091415417@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched_fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++-
|
||||
1 files changed, 50 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 3c0120e..831a300 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -989,6 +989,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
|
||||
se->vruntime = vruntime;
|
||||
}
|
||||
|
||||
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
|
||||
+
|
||||
static void
|
||||
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
@@ -1018,8 +1020,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
__enqueue_entity(cfs_rq, se);
|
||||
se->on_rq = 1;
|
||||
|
||||
- if (cfs_rq->nr_running == 1)
|
||||
+ if (cfs_rq->nr_running == 1) {
|
||||
list_add_leaf_cfs_rq(cfs_rq);
|
||||
+ check_enqueue_throttle(cfs_rq);
|
||||
+ }
|
||||
}
|
||||
|
||||
static void __clear_buddies_last(struct sched_entity *se)
|
||||
@@ -1224,6 +1228,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
|
||||
return se;
|
||||
}
|
||||
|
||||
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
||||
+
|
||||
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
||||
{
|
||||
/*
|
||||
@@ -1233,6 +1239,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
||||
if (prev->on_rq)
|
||||
update_curr(cfs_rq);
|
||||
|
||||
+ /* throttle cfs_rqs exceeding runtime */
|
||||
+ check_cfs_rq_runtime(cfs_rq);
|
||||
+
|
||||
check_spread(cfs_rq, prev);
|
||||
if (prev->on_rq) {
|
||||
update_stats_wait_start(cfs_rq, prev);
|
||||
@@ -1486,7 +1495,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
|
||||
return 0;
|
||||
}
|
||||
|
||||
-static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
struct rq *rq = rq_of(cfs_rq);
|
||||
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
@@ -1679,9 +1688,48 @@ out_unlock:
|
||||
|
||||
return idle;
|
||||
}
|
||||
+
|
||||
+/*
|
||||
+ * When a group wakes up we want to make sure that its quota is not already
|
||||
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
|
||||
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
|
||||
+ */
|
||||
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ /* an active group must be handled by the update_curr()->put() path */
|
||||
+ if (!cfs_rq->runtime_enabled || cfs_rq->curr)
|
||||
+ return;
|
||||
+
|
||||
+ /* ensure the group is not already throttled */
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ return;
|
||||
+
|
||||
+ /* update runtime allocation */
|
||||
+ account_cfs_rq_runtime(cfs_rq, 0);
|
||||
+ if (cfs_rq->runtime_remaining <= 0)
|
||||
+ throttle_cfs_rq(cfs_rq);
|
||||
+}
|
||||
+
|
||||
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
|
||||
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
|
||||
+ return;
|
||||
+
|
||||
+ /*
|
||||
+ * it's possible for a throttled entity to be forced into a running
|
||||
+ * state (e.g. set_curr_task), in this case we're finished.
|
||||
+ */
|
||||
+ if (cfs_rq_throttled(cfs_rq))
|
||||
+ return;
|
||||
+
|
||||
+ throttle_cfs_rq(cfs_rq);
|
||||
+}
|
||||
#else
|
||||
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
unsigned long delta_exec) {}
|
||||
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
|
||||
|
||||
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
+125
@@ -0,0 +1,125 @@
|
||||
From 11db1560b4dec193a20e2c78fb8238d9f14a1782 Mon Sep 17 00:00:00 2001
|
||||
From: Nikhil Rao <ncrao@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:40 -0700
|
||||
Subject: [PATCH 15/19] sched: Add exports tracking cfs bandwidth control statistics
|
||||
|
||||
This change introduces statistics exports for the cpu sub-system, these are
|
||||
added through the use of a stat file similar to that exported by other
|
||||
subsystems.
|
||||
|
||||
The following exports are included:
|
||||
|
||||
nr_periods: number of periods in which execution occurred
|
||||
nr_throttled: the number of periods above in which execution was throttled
|
||||
throttled_time: cumulative wall-time that any cpus have been throttled for
|
||||
this group
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Nikhil Rao <ncrao@google.com>
|
||||
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
|
||||
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 21 +++++++++++++++++++++
|
||||
kernel/sched_fair.c | 7 +++++++
|
||||
2 files changed, 28 insertions(+), 0 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 7b99d63..08d3aa0 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -259,6 +259,9 @@ struct cfs_bandwidth {
|
||||
struct hrtimer period_timer;
|
||||
struct list_head throttled_cfs_rq;
|
||||
|
||||
+ /* statistics */
|
||||
+ int nr_periods, nr_throttled;
|
||||
+ u64 throttled_time;
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -399,6 +402,7 @@ struct cfs_rq {
|
||||
u64 runtime_expires;
|
||||
s64 runtime_remaining;
|
||||
|
||||
+ u64 throttled_timestamp;
|
||||
int throttled, throttle_count;
|
||||
struct list_head throttled_list;
|
||||
#endif
|
||||
@@ -9383,6 +9387,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
|
||||
|
||||
return ret;
|
||||
}
|
||||
+
|
||||
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
|
||||
+ struct cgroup_map_cb *cb)
|
||||
+{
|
||||
+ struct task_group *tg = cgroup_tg(cgrp);
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
|
||||
+
|
||||
+ cb->fill(cb, "nr_periods", cfs_b->nr_periods);
|
||||
+ cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
|
||||
+ cb->fill(cb, "throttled_time", cfs_b->throttled_time);
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
#endif /* CONFIG_CFS_BANDWIDTH */
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
|
||||
@@ -9429,6 +9446,10 @@ static struct cftype cpu_files[] = {
|
||||
.read_u64 = cpu_cfs_period_read_u64,
|
||||
.write_u64 = cpu_cfs_period_write_u64,
|
||||
},
|
||||
+ {
|
||||
+ .name = "stat",
|
||||
+ .read_map = cpu_stats_show,
|
||||
+ },
|
||||
#endif
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
{
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 831a300..2060fc9 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1528,6 +1528,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
rq->nr_running -= task_delta;
|
||||
|
||||
cfs_rq->throttled = 1;
|
||||
+ cfs_rq->throttled_timestamp = rq->clock;
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
@@ -1545,8 +1546,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
|
||||
|
||||
cfs_rq->throttled = 0;
|
||||
raw_spin_lock(&cfs_b->lock);
|
||||
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
|
||||
list_del_rcu(&cfs_rq->throttled_list);
|
||||
raw_spin_unlock(&cfs_b->lock);
|
||||
+ cfs_rq->throttled_timestamp = 0;
|
||||
|
||||
update_rq_clock(rq);
|
||||
/* update hierarchical throttle state */
|
||||
@@ -1634,6 +1637,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
|
||||
/* idle depends on !throttled (for the case of a large deficit) */
|
||||
idle = cfs_b->idle && !throttled;
|
||||
+ cfs_b->nr_periods += overrun;
|
||||
|
||||
/* if we're going inactive then everything else can be deferred */
|
||||
if (idle)
|
||||
@@ -1647,6 +1651,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
+ /* account preceding periods in which throttling occurred */
|
||||
+ cfs_b->nr_throttled += overrun;
|
||||
+
|
||||
/*
|
||||
* There are throttled entities so we must first use the new bandwidth
|
||||
* to unthrottle them before making it generally available. This
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,252 @@
|
||||
From 9baa7b654e1527bfec8f413f7372de6c4aeebb6a Mon Sep 17 00:00:00 2001
|
||||
From: Paul Turner <pjt@google.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:41 -0700
|
||||
Subject: [PATCH 16/19] sched: Return unused runtime on group dequeue
|
||||
|
||||
When a local cfs_rq blocks we return the majority of its remaining quota to the
|
||||
global bandwidth pool for use by other runqueues.
|
||||
|
||||
We do this only when the quota is current and there is more than
|
||||
min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.
|
||||
|
||||
In the case where there are throttled runqueues and we have sufficient
|
||||
bandwidth to meter out a slice, a second timer is kicked off to handle this
|
||||
delivery, unthrottling where appropriate.
|
||||
|
||||
Using a 'worst case' antagonist which executes on each cpu
|
||||
for 1ms before moving onto the next on a fairly large machine:
|
||||
|
||||
no quota generations:
|
||||
|
||||
197.47 ms /cgroup/a/cpuacct.usage
|
||||
199.46 ms /cgroup/a/cpuacct.usage
|
||||
205.46 ms /cgroup/a/cpuacct.usage
|
||||
198.46 ms /cgroup/a/cpuacct.usage
|
||||
208.39 ms /cgroup/a/cpuacct.usage
|
||||
|
||||
Since we are allowed to use "stale" quota our usage is effectively bounded by
|
||||
the rate of input into the global pool and performance is relatively stable.
|
||||
|
||||
with quota generations [1s increments]:
|
||||
|
||||
119.58 ms /cgroup/a/cpuacct.usage
|
||||
119.65 ms /cgroup/a/cpuacct.usage
|
||||
119.64 ms /cgroup/a/cpuacct.usage
|
||||
119.63 ms /cgroup/a/cpuacct.usage
|
||||
119.60 ms /cgroup/a/cpuacct.usage
|
||||
|
||||
The large deficit here is due to quota generations (/intentionally/) preventing
|
||||
us from now using previously stranded slack quota. The cost is that this quota
|
||||
becomes unavailable.
|
||||
|
||||
with quota generations and quota return:
|
||||
|
||||
200.09 ms /cgroup/a/cpuacct.usage
|
||||
200.09 ms /cgroup/a/cpuacct.usage
|
||||
198.09 ms /cgroup/a/cpuacct.usage
|
||||
200.09 ms /cgroup/a/cpuacct.usage
|
||||
200.06 ms /cgroup/a/cpuacct.usage
|
||||
|
||||
By returning unused quota we're able to both stably consume our desired quota
|
||||
and prevent unintentional overages due to the abuse of slack quota from
|
||||
previous quota periods (especially on a large machine).
|
||||
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
kernel/sched.c | 15 +++++++-
|
||||
kernel/sched_fair.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 122 insertions(+), 1 deletions(-)
|
||||
|
||||
diff --git a/kernel/sched.c b/kernel/sched.c
|
||||
index 08d3aa0..8be4ca2 100644
|
||||
--- a/kernel/sched.c
|
||||
+++ b/kernel/sched.c
|
||||
@@ -256,7 +256,7 @@ struct cfs_bandwidth {
|
||||
u64 runtime_expires;
|
||||
|
||||
int idle, timer_active;
|
||||
- struct hrtimer period_timer;
|
||||
+ struct hrtimer period_timer, slack_timer;
|
||||
struct list_head throttled_cfs_rq;
|
||||
|
||||
/* statistics */
|
||||
@@ -418,6 +418,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
|
||||
|
||||
static inline u64 default_cfs_period(void);
|
||||
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
|
||||
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
|
||||
+
|
||||
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
|
||||
+{
|
||||
+ struct cfs_bandwidth *cfs_b =
|
||||
+ container_of(timer, struct cfs_bandwidth, slack_timer);
|
||||
+ do_sched_cfs_slack_timer(cfs_b);
|
||||
+
|
||||
+ return HRTIMER_NORESTART;
|
||||
+}
|
||||
|
||||
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
|
||||
{
|
||||
@@ -450,6 +460,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
|
||||
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
cfs_b->period_timer.function = sched_cfs_period_timer;
|
||||
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
|
||||
}
|
||||
|
||||
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
@@ -485,6 +497,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
{
|
||||
hrtimer_cancel(&cfs_b->period_timer);
|
||||
+ hrtimer_cancel(&cfs_b->slack_timer);
|
||||
}
|
||||
#else
|
||||
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
|
||||
index 2060fc9..edf3b3e 100644
|
||||
--- a/kernel/sched_fair.c
|
||||
+++ b/kernel/sched_fair.c
|
||||
@@ -1071,6 +1071,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
__clear_buddies_skip(se);
|
||||
}
|
||||
|
||||
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
|
||||
+
|
||||
static void
|
||||
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
{
|
||||
@@ -1109,6 +1111,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
||||
if (!(flags & DEQUEUE_SLEEP))
|
||||
se->vruntime -= cfs_rq->min_vruntime;
|
||||
|
||||
+ /* return excess runtime on last dequeue */
|
||||
+ return_cfs_rq_runtime(cfs_rq);
|
||||
+
|
||||
update_min_vruntime(cfs_rq);
|
||||
update_cfs_shares(cfs_rq);
|
||||
}
|
||||
@@ -1696,6 +1701,108 @@ out_unlock:
|
||||
return idle;
|
||||
}
|
||||
|
||||
+/* a cfs_rq won't donate quota below this amount */
|
||||
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
|
||||
+/* minimum remaining period time to redistribute slack quota */
|
||||
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
|
||||
+/* how long we wait to gather additional slack before distributing */
|
||||
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
|
||||
+
|
||||
+/* are we near the end of the current quota period? */
|
||||
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
|
||||
+{
|
||||
+ struct hrtimer *refresh_timer = &cfs_b->period_timer;
|
||||
+ u64 remaining;
|
||||
+
|
||||
+ /* if the call-back is running a quota refresh is already occurring */
|
||||
+ if (hrtimer_callback_running(refresh_timer))
|
||||
+ return 1;
|
||||
+
|
||||
+ /* is a quota refresh about to occur? */
|
||||
+ remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
|
||||
+ if (remaining < min_expire)
|
||||
+ return 1;
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
|
||||
+{
|
||||
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
|
||||
+
|
||||
+ /* if there's a quota refresh soon don't bother with slack */
|
||||
+ if (runtime_refresh_within(cfs_b, min_left))
|
||||
+ return;
|
||||
+
|
||||
+ start_bandwidth_timer(&cfs_b->slack_timer,
|
||||
+ ns_to_ktime(cfs_bandwidth_slack_period));
|
||||
+}
|
||||
+
|
||||
+/* we know any runtime found here is valid as update_curr() precedes return */
|
||||
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
|
||||
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
|
||||
+
|
||||
+ if (slack_runtime <= 0)
|
||||
+ return;
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ if (cfs_b->quota != RUNTIME_INF &&
|
||||
+ cfs_rq->runtime_expires == cfs_b->runtime_expires) {
|
||||
+ cfs_b->runtime += slack_runtime;
|
||||
+
|
||||
+ /* we are under rq->lock, defer unthrottling using a timer */
|
||||
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
|
||||
+ !list_empty(&cfs_b->throttled_cfs_rq))
|
||||
+ start_cfs_slack_bandwidth(cfs_b);
|
||||
+ }
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+
|
||||
+ /* even if it's not valid for return we don't want to try again */
|
||||
+ cfs_rq->runtime_remaining -= slack_runtime;
|
||||
+}
|
||||
+
|
||||
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
|
||||
+{
|
||||
+	/* only return slack once no runnable entities remain on this cfs_rq */
|
||||
+	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
|
||||
+		return;
|
||||
+	__return_cfs_rq_runtime(cfs_rq);
|
||||
+}
|
||||
+
|
||||
+/*
|
||||
+ * This is done with a timer (instead of inline with bandwidth return) since
|
||||
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
|
||||
+ */
|
||||
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
||||
+{
|
||||
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
|
||||
+ u64 expires;
|
||||
+
|
||||
+ /* confirm we're still not at a refresh boundary */
|
||||
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
|
||||
+ return;
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
|
||||
+ runtime = cfs_b->runtime;
|
||||
+ cfs_b->runtime = 0;
|
||||
+ }
|
||||
+ expires = cfs_b->runtime_expires;
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+
|
||||
+ if (!runtime)
|
||||
+ return;
|
||||
+
|
||||
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
|
||||
+
|
||||
+ raw_spin_lock(&cfs_b->lock);
|
||||
+ if (expires == cfs_b->runtime_expires)
|
||||
+ cfs_b->runtime = runtime;
|
||||
+ raw_spin_unlock(&cfs_b->lock);
|
||||
+}
|
||||
+
|
||||
/*
|
||||
* When a group wakes up we want to make sure that its quota is not already
|
||||
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
|
||||
@@ -1737,6 +1844,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
|
||||
unsigned long delta_exec) {}
|
||||
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
|
||||
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
|
||||
|
||||
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
|
||||
{
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
@@ -0,0 +1,148 @@
|
||||
From d5edecf4b5298b11f6d39e3029b7620ee06640e7 Mon Sep 17 00:00:00 2001
|
||||
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
|
||||
Date: Thu, 21 Jul 2011 09:43:43 -0700
|
||||
Subject: [PATCH 17/19] sched: Add documentation for bandwidth control
|
||||
|
||||
Basic description of usage and effect for CFS Bandwidth Control.
|
||||
|
||||
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
|
||||
Signed-off-by: Paul Turner <pjt@google.com>
|
||||
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
|
||||
Link: http://lkml.kernel.org/r/20110721184758.498036116@google.com
|
||||
Signed-off-by: Ingo Molnar <mingo@elte.hu>
|
||||
---
|
||||
Documentation/scheduler/sched-bwc.txt | 122 +++++++++++++++++++++++++++++++++
|
||||
1 files changed, 122 insertions(+), 0 deletions(-)
|
||||
create mode 100644 Documentation/scheduler/sched-bwc.txt
|
||||
|
||||
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
|
||||
new file mode 100644
|
||||
index 0000000..f6b1873
|
||||
--- /dev/null
|
||||
+++ b/Documentation/scheduler/sched-bwc.txt
|
||||
@@ -0,0 +1,122 @@
|
||||
+CFS Bandwidth Control
|
||||
+=====================
|
||||
+
|
||||
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
|
||||
+ The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
|
||||
+
|
||||
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
|
||||
+specification of the maximum CPU bandwidth available to a group or hierarchy.
|
||||
+
|
||||
+The bandwidth allowed for a group is specified using a quota and period. Within
|
||||
+each given "period" (microseconds), a group is allowed to consume only up to
|
||||
+"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
|
||||
+group exceeds this limit (for that period), the tasks belonging to its
|
||||
+hierarchy will be throttled and are not allowed to run again until the next
|
||||
+period.
|
||||
+
|
||||
+A group's unused runtime is globally tracked, being refreshed with quota units
|
||||
+above at each period boundary. As threads consume this bandwidth it is
|
||||
+transferred to cpu-local "silos" on a demand basis. The amount transferred
|
||||
+within each of these updates is tunable and described as the "slice".
|
||||
+
|
||||
+Management
|
||||
+----------
|
||||
+Quota and period are managed within the cpu subsystem via cgroupfs.
|
||||
+
|
||||
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
|
||||
+cpu.cfs_period_us: the length of a period (in microseconds)
|
||||
+cpu.stat: exports throttling statistics [explained further below]
|
||||
+
|
||||
+The default values are:
|
||||
+ cpu.cfs_period_us=100ms
|
||||
+ cpu.cfs_quota_us=-1
|
||||
+
|
||||
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
|
||||
+bandwidth restriction in place; such a group is described as an unconstrained
|
||||
+bandwidth group. This represents the traditional work-conserving behavior for
|
||||
+CFS.
|
||||
+
|
||||
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
|
||||
+The minimum value allowed for the quota or period is 1ms. There is also an
|
||||
+upper bound on the period length of 1s. Additional restrictions exist when
|
||||
+bandwidth limits are used in a hierarchical fashion; these are explained in
|
||||
+more detail below.
|
||||
+
|
||||
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
|
||||
+and return the group to an unconstrained state once more.
|
||||
+
|
||||
+Any updates to a group's bandwidth specification will result in it becoming
|
||||
+unthrottled if it is in a constrained state.
|
||||
+
|
||||
+System wide settings
|
||||
+--------------------
|
||||
+For efficiency run-time is transferred between the global pool and CPU local
|
||||
+"silos" in a batch fashion. This greatly reduces global accounting pressure
|
||||
+on large systems. The amount transferred each time such an update is required
|
||||
+is described as the "slice".
|
||||
+
|
||||
+This is tunable via procfs:
|
||||
+ /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
|
||||
+
|
||||
+Larger slice values will reduce transfer overheads, while smaller values allow
|
||||
+for more fine-grained consumption.
|
||||
+
|
||||
+Statistics
|
||||
+----------
|
||||
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
|
||||
+
|
||||
+cpu.stat:
|
||||
+- nr_periods: Number of enforcement intervals that have elapsed.
|
||||
+- nr_throttled: Number of periods in which the group was throttled/limited.
|
||||
+- throttled_time: The total time duration (in nanoseconds) for which entities
|
||||
+ of the group have been throttled.
|
||||
+
|
||||
+This interface is read-only.
|
||||
+
|
||||
+Hierarchical considerations
|
||||
+---------------------------
|
||||
+The interface enforces that an individual entity's bandwidth is always
|
||||
+attainable, that is: max(c_i) <= C. However, over-subscription in the
|
||||
+aggregate case is explicitly allowed to enable work-conserving semantics
|
||||
+within a hierarchy.
|
||||
+ e.g. \Sum (c_i) may exceed C
|
||||
+[ Where C is the parent's bandwidth, and c_i its children ]
|
||||
+
|
||||
+
|
||||
+There are two ways in which a group may become throttled:
|
||||
+ a. it fully consumes its own quota within a period
|
||||
+ b. a parent's quota is fully consumed within its period
|
||||
+
|
||||
+In case b) above, even though the child may have runtime remaining it will not
|
||||
+be allowed to run until the parent's runtime is refreshed.
|
||||
+
|
||||
+Examples
|
||||
+--------
|
||||
+1. Limit a group to 1 CPU worth of runtime.
|
||||
+
|
||||
+ If period is 250ms and quota is also 250ms, the group will get
|
||||
+ 1 CPU worth of runtime every 250ms.
|
||||
+
|
||||
+ # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
|
||||
+ # echo 250000 > cpu.cfs_period_us /* period = 250ms */
|
||||
+
|
||||
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
|
||||
+
|
||||
+ With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
|
||||
+ runtime every 500ms.
|
||||
+
|
||||
+ # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
|
||||
+ # echo 500000 > cpu.cfs_period_us /* period = 500ms */
|
||||
+
|
||||
+ The larger period here allows for increased burst capacity.
|
||||
+
|
||||
+3. Limit a group to 20% of 1 CPU.
|
||||
+
|
||||
+ With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
|
||||
+
|
||||
+ # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
|
||||
+ # echo 50000 > cpu.cfs_period_us /* period = 50ms */
|
||||
+
|
||||
+ By using a small period here we are ensuring a consistent latency
|
||||
+ response at the expense of burst capacity.
|
||||
+
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,437 @@
|
||||
From 46394a392b85376e0c17a4f84e2468a0b62ec5b6 Mon Sep 17 00:00:00 2001
|
||||
From: Vimalkumar <j.vimal+nf@gmail.com>
|
||||
Date: Wed, 7 Sep 2011 14:17:32 -0700
|
||||
Subject: [PATCH 19/19] dctcp patch
|
||||
|
||||
---
|
||||
include/linux/sysctl.h | 3 +
|
||||
include/linux/tcp.h | 10 +++
|
||||
include/net/tcp.h | 3 +
|
||||
kernel/sysctl_binary.c | 3 +
|
||||
net/ipv4/sysctl_net_ipv4.c | 21 +++++
|
||||
net/ipv4/tcp_input.c | 182 ++++++++++++++++++++++++++++++++++++++++----
|
||||
net/ipv4/tcp_output.c | 19 +++++-
|
||||
7 files changed, 225 insertions(+), 16 deletions(-)
|
||||
|
||||
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
|
||||
index 11684d9..fd8c73a 100644
|
||||
--- a/include/linux/sysctl.h
|
||||
+++ b/include/linux/sysctl.h
|
||||
@@ -425,6 +425,9 @@ enum
|
||||
NET_TCP_ALLOWED_CONG_CONTROL=123,
|
||||
NET_TCP_MAX_SSTHRESH=124,
|
||||
NET_TCP_FRTO_RESPONSE=125,
|
||||
+ NET_TCP_DELAYED_ACK=126,
|
||||
+ NET_TCP_DCTCP_ENABLE=127,
|
||||
+ NET_TCP_DCTCP_SHIFT_G=128,
|
||||
};
|
||||
|
||||
enum {
|
||||
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
|
||||
index e64f4c6..9d2ec1c 100644
|
||||
--- a/include/linux/tcp.h
|
||||
+++ b/include/linux/tcp.h
|
||||
@@ -455,6 +455,16 @@ struct tcp_sock {
|
||||
struct tcp_md5sig_info *md5sig_info;
|
||||
#endif
|
||||
|
||||
+/* DCTCP Specific Parameters */
|
||||
+ u32 acked_bytes_ecn;
|
||||
+ u32 acked_bytes_total;
|
||||
+ u32 prior_ack;
|
||||
+ u32 prior_rcv_nxt;
|
||||
+ u32 dctcp_alpha;
|
||||
+ u32 next_seq;
|
||||
+ u32 ce_state; /* 0: last pkt was non-ce , 1: last pkt was ce */
|
||||
+ u32 delayed_ack_reserved;
|
||||
+
|
||||
/* When the cookie options are generated and exchanged, then this
|
||||
* object holds a reference to them (cookie_values->kref). Also
|
||||
* contains related tcp_cookie_transactions fields.
|
||||
diff --git a/include/net/tcp.h b/include/net/tcp.h
|
||||
index cda30ea..b6b1480 100644
|
||||
--- a/include/net/tcp.h
|
||||
+++ b/include/net/tcp.h
|
||||
@@ -223,6 +223,9 @@ extern int sysctl_tcp_max_orphans;
|
||||
extern int sysctl_tcp_fack;
|
||||
extern int sysctl_tcp_reordering;
|
||||
extern int sysctl_tcp_ecn;
|
||||
+extern int sysctl_tcp_delayed_ack;
|
||||
+extern int sysctl_tcp_dctcp_enable;
|
||||
+extern int sysctl_tcp_dctcp_shift_g;
|
||||
extern int sysctl_tcp_dsack;
|
||||
extern long sysctl_tcp_mem[3];
|
||||
extern int sysctl_tcp_wmem[3];
|
||||
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
|
||||
index 20dfc21..f232b5a 100644
|
||||
--- a/kernel/sysctl_binary.c
|
||||
+++ b/kernel/sysctl_binary.c
|
||||
@@ -373,6 +373,9 @@ static const struct bin_table bin_net_ipv4_table[] = {
|
||||
{ CTL_INT, NET_TCP_FACK, "tcp_fack" },
|
||||
{ CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
|
||||
{ CTL_INT, NET_TCP_ECN, "tcp_ecn" },
|
||||
+ { CTL_INT, NET_TCP_DELAYED_ACK, "tcp_delayed_ack" },
|
||||
+ { CTL_INT, NET_TCP_DCTCP_ENABLE, "tcp_dctcp_enable" },
|
||||
+ { CTL_INT, NET_TCP_DCTCP_SHIFT_G, "tcp_dctcp_shift_g" },
|
||||
{ CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
|
||||
{ CTL_INT, NET_TCP_MEM, "tcp_mem" },
|
||||
{ CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
|
||||
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
|
||||
index 57d0752..c896edf 100644
|
||||
--- a/net/ipv4/sysctl_net_ipv4.c
|
||||
+++ b/net/ipv4/sysctl_net_ipv4.c
|
||||
@@ -440,6 +440,27 @@ static struct ctl_table ipv4_table[] = {
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
+ .procname = "tcp_delayed_ack",
|
||||
+ .data = &sysctl_tcp_delayed_ack,
|
||||
+ .maxlen = sizeof(int),
|
||||
+ .mode = 0644,
|
||||
+ .proc_handler = proc_dointvec
|
||||
+ },
|
||||
+ {
|
||||
+ .procname = "tcp_dctcp_enable",
|
||||
+ .data = &sysctl_tcp_dctcp_enable,
|
||||
+ .maxlen = sizeof(int),
|
||||
+ .mode = 0644,
|
||||
+ .proc_handler = proc_dointvec
|
||||
+ },
|
||||
+ {
|
||||
+ .procname = "tcp_dctcp_shift_g",
|
||||
+ .data = &sysctl_tcp_dctcp_shift_g,
|
||||
+ .maxlen = sizeof(int),
|
||||
+ .mode = 0644,
|
||||
+ .proc_handler = proc_dointvec
|
||||
+ },
|
||||
+ {
|
||||
.procname = "tcp_dsack",
|
||||
.data = &sysctl_tcp_dsack,
|
||||
.maxlen = sizeof(int),
|
||||
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
|
||||
index bef9f04..7b9829b 100644
|
||||
--- a/net/ipv4/tcp_input.c
|
||||
+++ b/net/ipv4/tcp_input.c
|
||||
@@ -98,6 +98,13 @@ int sysctl_tcp_thin_dupack __read_mostly;
|
||||
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
|
||||
int sysctl_tcp_abc __read_mostly;
|
||||
|
||||
+int sysctl_tcp_delayed_ack __read_mostly = 1;
|
||||
+EXPORT_SYMBOL(sysctl_tcp_delayed_ack);
|
||||
+int sysctl_tcp_dctcp_enable __read_mostly;
|
||||
+EXPORT_SYMBOL(sysctl_tcp_dctcp_enable);
|
||||
+int sysctl_tcp_dctcp_shift_g __read_mostly = 4; /* g=1/2^4 */
|
||||
+EXPORT_SYMBOL(sysctl_tcp_dctcp_shift_g);
|
||||
+
|
||||
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
|
||||
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
|
||||
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
|
||||
@@ -217,16 +224,70 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
|
||||
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
|
||||
}
|
||||
|
||||
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
|
||||
+static inline void TCP_ECN_dctcp_check_ce(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
|
||||
{
|
||||
if (tp->ecn_flags & TCP_ECN_OK) {
|
||||
- if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
|
||||
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
|
||||
- /* Funny extension: if ECT is not set on a segment,
|
||||
- * it is surely retransmit. It is not in ECN RFC,
|
||||
- * but Linux follows this rule. */
|
||||
- else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
|
||||
- tcp_enter_quickack_mode((struct sock *)tp);
|
||||
+ u32 temp_rcv_nxt;
|
||||
+
|
||||
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) {
|
||||
+
|
||||
+ /* rcv_nxt has already been updated by earlier processing (tcp_rcv_established) */
|
||||
+
|
||||
+ if(sysctl_tcp_dctcp_enable) {
|
||||
+
|
||||
+ /* state has changed from CE=0 to CE=1 && delayed ack has not been sent yet */
|
||||
+ if(tp->ce_state == 0 && tp->delayed_ack_reserved) {
|
||||
+
|
||||
+ /* save current rcv_nxt */
|
||||
+ temp_rcv_nxt = tp->rcv_nxt;
|
||||
+ /* generate previous ack with CE=0 */
|
||||
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
|
||||
+ tp->rcv_nxt = tp->prior_rcv_nxt;
|
||||
+ /* printk("CE=0 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt); */
|
||||
+ tcp_send_ack(sk);
|
||||
+ /* recover current rcv_nxt */
|
||||
+ tp->rcv_nxt = temp_rcv_nxt;
|
||||
+ }
|
||||
+
|
||||
+ tp->ce_state = 1;
|
||||
+ }
|
||||
+
|
||||
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
|
||||
+
|
||||
+
|
||||
+ /* Funny extension: if ECT is not set on a segment,
|
||||
+ * it is surely retransmit. It is not in ECN RFC,
|
||||
+ * but Linux follows this rule. */
|
||||
+ } else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) {
|
||||
+ tcp_enter_quickack_mode((struct sock *)tp);
|
||||
+ }else {
|
||||
+ /* It has ECT but it doesn't have CE */
|
||||
+
|
||||
+ if(sysctl_tcp_dctcp_enable) {
|
||||
+
|
||||
+ if(tp->ce_state != 0 && tp->delayed_ack_reserved) {
|
||||
+
|
||||
+ /* save current rcv_nxt */
|
||||
+ temp_rcv_nxt = tp->rcv_nxt;
|
||||
+ /* generate previous ack with CE=1 */
|
||||
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
|
||||
+ tp->rcv_nxt = tp->prior_rcv_nxt;
|
||||
+ /* printk("CE=1 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt); */
|
||||
+ tcp_send_ack(sk);
|
||||
+ /* recover current rcv_nxt */
|
||||
+ tp->rcv_nxt = temp_rcv_nxt;
|
||||
+ }
|
||||
+
|
||||
+ tp->ce_state = 0;
|
||||
+
|
||||
+ /* deassert only when DCTCP is enabled */
|
||||
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
|
||||
+ }
|
||||
+
|
||||
+ }
|
||||
+
|
||||
+ /* set current rcv_nxt to prior_rcv_nxt */
|
||||
+ tp->prior_rcv_nxt = tp->rcv_nxt;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -581,6 +642,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
|
||||
*/
|
||||
tcp_incr_quickack(sk);
|
||||
icsk->icsk_ack.ato = TCP_ATO_MIN;
|
||||
+
|
||||
+ tp->ce_state = 0;
|
||||
} else {
|
||||
int m = now - icsk->icsk_ack.lrcvtime;
|
||||
|
||||
@@ -601,7 +664,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
|
||||
}
|
||||
icsk->icsk_ack.lrcvtime = now;
|
||||
|
||||
- TCP_ECN_check_ce(tp, skb);
|
||||
+ TCP_ECN_dctcp_check_ce(sk, tp, skb);
|
||||
|
||||
if (skb->len >= 128)
|
||||
tcp_grow_window(sk, skb);
|
||||
@@ -827,19 +890,54 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
const struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
|
||||
+ __u32 ssthresh_old;
|
||||
+ __u32 cwnd_old;
|
||||
+ __u32 cwnd_new;
|
||||
+
|
||||
tp->prior_ssthresh = 0;
|
||||
tp->bytes_acked = 0;
|
||||
if (icsk->icsk_ca_state < TCP_CA_CWR) {
|
||||
tp->undo_marker = 0;
|
||||
- if (set_ssthresh)
|
||||
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
|
||||
- tp->snd_cwnd = min(tp->snd_cwnd,
|
||||
- tcp_packets_in_flight(tp) + 1U);
|
||||
+
|
||||
+ if(!sysctl_tcp_dctcp_enable) {
|
||||
+
|
||||
+ if (set_ssthresh)
|
||||
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
|
||||
+
|
||||
+ tp->snd_cwnd = min(tp->snd_cwnd,
|
||||
+ tcp_packets_in_flight(tp) + 1U);
|
||||
+
|
||||
+ }else {
|
||||
+
|
||||
+ cwnd_new = max (tp->snd_cwnd - ((tp->snd_cwnd * tp->dctcp_alpha)>>11) , 2U);
|
||||
+
|
||||
+ if(set_ssthresh) {
|
||||
+
|
||||
+ ssthresh_old = tp->snd_ssthresh;
|
||||
+ tp->snd_ssthresh = cwnd_new;
|
||||
+
|
||||
+ /* printk("%llu alpha= %d ssth old= %d new= %d\n", */
|
||||
+ /* ktime_to_us(ktime_get_real()), */
|
||||
+ /* tp->dctcp_alpha, */
|
||||
+ /* ssthresh_old, */
|
||||
+ /* tp->snd_ssthresh); */
|
||||
+ }
|
||||
+
|
||||
+ cwnd_old = tp->snd_cwnd;
|
||||
+ tp->snd_cwnd = cwnd_new;
|
||||
+
|
||||
+ /* printk("%llu alpha= %d cwnd old= %d new= %d\n", */
|
||||
+ /* ktime_to_us(ktime_get_real()), */
|
||||
+ /* tp->dctcp_alpha, */
|
||||
+ /* cwnd_old, */
|
||||
+ /* tp->snd_cwnd); */
|
||||
+ }
|
||||
+
|
||||
tp->snd_cwnd_cnt = 0;
|
||||
tp->high_seq = tp->snd_nxt;
|
||||
tp->snd_cwnd_stamp = tcp_time_stamp;
|
||||
TCP_ECN_queue_cwr(tp);
|
||||
-
|
||||
+
|
||||
tcp_set_ca_state(sk, TCP_CA_CWR);
|
||||
}
|
||||
}
|
||||
@@ -2859,6 +2957,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
|
||||
tcp_try_keep_open(sk);
|
||||
tcp_moderate_cwnd(tp);
|
||||
} else {
|
||||
+ if(!sysctl_tcp_dctcp_enable)
|
||||
tcp_cwnd_down(sk, flag);
|
||||
}
|
||||
}
|
||||
@@ -3624,6 +3723,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
|
||||
int prior_packets;
|
||||
int frto_cwnd = 0;
|
||||
|
||||
+ __u32 alpha_old;
|
||||
+ __u32 acked_bytes;
|
||||
+
|
||||
/* If the ack is older than previous acks
|
||||
* then we can probably ignore it.
|
||||
*/
|
||||
@@ -3680,6 +3782,54 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
|
||||
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
|
||||
}
|
||||
|
||||
+ /* START: DCTCP Processing */
|
||||
+
|
||||
+ /* calc acked bytes */
|
||||
+ if(after(ack,tp->prior_ack)) {
|
||||
+ acked_bytes = ack - tp->prior_ack;
|
||||
+ } else {
|
||||
+
|
||||
+ if(flag & FLAG_WIN_UPDATE) {
|
||||
+ /* Don't count window-update ACKs */
|
||||
+ acked_bytes = 0;
|
||||
+ /* printk("acked_byte=0\n"); */
|
||||
+ }else {
|
||||
+ /* Count duplicate ACKs (e.g. for retransmitted packets) as one MSS each */
|
||||
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if(flag & FLAG_ECE)
|
||||
+ tp->acked_bytes_ecn += acked_bytes;
|
||||
+
|
||||
+ tp->acked_bytes_total += acked_bytes;
|
||||
+
|
||||
+ tp->prior_ack = ack;
|
||||
+
|
||||
+ /* Expired RTT */
|
||||
+ if (!before(tp->snd_una,tp->next_seq)) {
|
||||
+
|
||||
+ /* Avoid denominator == 0 in the division below */
|
||||
+ if(tp->acked_bytes_total == 0) tp->acked_bytes_total = 1;
|
||||
+
|
||||
+ alpha_old = tp->dctcp_alpha;
|
||||
+
|
||||
+ /* alpha = (1-g) * alpha + g * F */
|
||||
+ tp->dctcp_alpha = alpha_old - (alpha_old >> sysctl_tcp_dctcp_shift_g)
|
||||
+ + (tp->acked_bytes_ecn << (10 - sysctl_tcp_dctcp_shift_g)) / tp->acked_bytes_total;
|
||||
+
|
||||
+ if(tp->dctcp_alpha > 1024) tp->dctcp_alpha = 1024; /* round to 0-1024 */
|
||||
+
|
||||
+ /* printk("bytes_ecn= %d total= %d alpha: old= %d new= %d\n", */
|
||||
+ /* tp->acked_bytes_ecn, tp->acked_bytes_total, alpha_old, tp->dctcp_alpha); */
|
||||
+
|
||||
+ tp->acked_bytes_ecn = 0;
|
||||
+ tp->acked_bytes_total = 0;
|
||||
+ tp->next_seq = tp->snd_nxt;
|
||||
+ }
|
||||
+
|
||||
+ /* END: DCTCP Processing */
|
||||
+
|
||||
/* We passed data and got it acked, remove any soft error
|
||||
* log. Something worked...
|
||||
*/
|
||||
@@ -4480,7 +4630,7 @@ drop:
|
||||
goto queue_and_out;
|
||||
}
|
||||
|
||||
- TCP_ECN_check_ce(tp, skb);
|
||||
+ TCP_ECN_dctcp_check_ce(sk, tp, skb);
|
||||
|
||||
if (tcp_try_rmem_schedule(sk, skb->truesize))
|
||||
goto drop;
|
||||
@@ -4931,6 +5081,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
|
||||
__tcp_select_window(sk) >= tp->rcv_wnd) ||
|
||||
/* We ACK each frame or... */
|
||||
tcp_in_quickack_mode(sk) ||
|
||||
+ /* Delayed ACK is disabled or ... */
|
||||
+ sysctl_tcp_delayed_ack == 0 ||
|
||||
/* We have out of order data. */
|
||||
(ofo_possible && skb_peek(&tp->out_of_order_queue))) {
|
||||
/* Then ack it now */
|
||||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
|
||||
index 882e0b0..2a4d1dc 100644
|
||||
--- a/net/ipv4/tcp_output.c
|
||||
+++ b/net/ipv4/tcp_output.c
|
||||
@@ -308,7 +308,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
|
||||
tp->ecn_flags = 0;
|
||||
- if (sysctl_tcp_ecn == 1) {
|
||||
+ if (sysctl_tcp_ecn == 1 || sysctl_tcp_dctcp_enable) {
|
||||
TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
|
||||
tp->ecn_flags = TCP_ECN_OK;
|
||||
}
|
||||
@@ -878,6 +878,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
||||
if (likely((tcb->flags & TCPHDR_SYN) == 0))
|
||||
TCP_ECN_send(sk, skb, tcp_header_size);
|
||||
|
||||
+ /* In DCTCP, set the ECT bit on all packets */
|
||||
+ if(sysctl_tcp_dctcp_enable)
|
||||
+ INET_ECN_xmit(sk);
|
||||
+
|
||||
#ifdef CONFIG_TCP_MD5SIG
|
||||
/* Calculate the MD5 hash, as we have all we need now */
|
||||
if (md5) {
|
||||
@@ -2624,6 +2628,11 @@ int tcp_connect(struct sock *sk)
|
||||
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
|
||||
TCP_ECN_send_syn(sk, buff);
|
||||
|
||||
+ /* Initialize DCTCP internal parameters */
|
||||
+ tp->next_seq = tp->snd_nxt;
|
||||
+ tp->acked_bytes_ecn = 0;
|
||||
+ tp->acked_bytes_total = 0;
|
||||
+
|
||||
/* Send it off. */
|
||||
TCP_SKB_CB(buff)->when = tcp_time_stamp;
|
||||
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
|
||||
@@ -2660,6 +2669,10 @@ void tcp_send_delayed_ack(struct sock *sk)
|
||||
int ato = icsk->icsk_ack.ato;
|
||||
unsigned long timeout;
|
||||
|
||||
+ /* Delayed ACK reserved flag for DCTCP */
|
||||
+ struct tcp_sock *tp = tcp_sk(sk);
|
||||
+ tp->delayed_ack_reserved = 1;
|
||||
+
|
||||
if (ato > TCP_DELACK_MIN) {
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
int max_ato = HZ / 2;
|
||||
@@ -2711,6 +2724,10 @@ void tcp_send_ack(struct sock *sk)
|
||||
{
|
||||
struct sk_buff *buff;
|
||||
|
||||
+ /* Delayed ACK reserved flag for DCTCP */
|
||||
+ struct tcp_sock *tp = tcp_sk(sk);
|
||||
+ tp->delayed_ack_reserved = 0;
|
||||
+
|
||||
/* If we have been reset, we may not send again. */
|
||||
if (sk->sk_state == TCP_CLOSE)
|
||||
return;
|
||||
--
|
||||
1.7.0.4
|
||||
|
||||
Reference in New Issue
Block a user