Compare commits

...

3 Commits

Author SHA1 Message Date
Brandon Heller 6d06ab3591 Add .config 2012-06-03 00:49:20 -07:00
Brandon Heller 9977ca16bb Fix build error w/.config not there 2012-06-03 05:34:58 +00:00
Brandon Heller 1631044f7b Add 3.0 kernel build files 2012-06-03 03:51:08 +00:00
29 changed files with 17967 additions and 0 deletions
+439
View File
@@ -0,0 +1,439 @@
#!/bin/bash
# Builds kernel with the new CFS Bandwidth patches
# and nsfd/setns syscall patches.
# Also builds Open vSwitch against the built kernel version.
# Script only to be run on 64-bit systems; needs a few changes to run on
# 32-bit ones.
# If building for i386 (-t), make sure to install the following 32-bit libs:
# sudo apt-get install ia32-libs lib32gcc1 libc6-i386 util-linux devscripts
# Check for uninitialized variables
set -o nounset
# Exit on any failure
set -e
# Location in which to download and build the kernel
kdir=/usr/src
# Kernel version to download
kver=3.0.0
# Save original directory for later.
orig_dir=`pwd`
# Default and custom kernel version string
version_string=-with-cfs
# Run menuconfig later?
menuconfig=
# Use localmodconfig?
localmodconfig=
# Build ubuntu kernel? must be 3.0.0 compatible
# Empty means "build the mainline tree"; the -u option sets it to
# $ubuntu_default_release.
ubuntu_release=
ubuntu_default_release=ubuntu-oneiric
ubuntu_base=3.0.0-14 # base tag and version for build
ubuntu_tag=Ubuntu-$ubuntu_base.23
ubuntu_flavor=generic
# Kernel config used for -u builds: a config file expected next to this
# script. The commented-out alternative would use the installed Ubuntu config.
#ubuntu_config=/boot/config-$ubuntu_base-$ubuntu_flavor
ubuntu_config=${orig_dir}/config-3.0.9-with-cfs
# Only referenced in a status message; the matching apt-get call in
# fetch_kernel is commented out.
ubuntu_image=linux-image-$ubuntu_base-$ubuntu_flavor
ubuntu_kver=3.0.9 # must match version that is actually built
# OVS pkg string. Not sure how to find this automatically.
ovs_pkg_ver=1.2.0-1ubuntu3
# Location of kernel config. If not specified, use current .config.
# was: ${orig_dir}/config-3.0.0-with-cfs
kconfig=
# Install only?
install_only=
# Use 32-bit?
i386=
function usage {
    # Print what the script builds, followed by one line per supported option.
    local help_line
    warn "Compiles kernel ${kver} with CBW, setns, and DCTCP patches in ${kdir}"
    for help_line in \
        "Usage: build.sh [-huimlt] [-v 'versionstring']" \
        "-h help" \
        "-u build ubuntu kernel" \
        "-i install only (don't build)" \
        "-m use menuconfig" \
        "-l use localmodconfig" \
        "-v 'versionstring' use custom version string" \
        "-t build for i386 (32-bit)"
    do
        warn "$help_line"
    done
}
function parse_opts {
    # Translate the command-line flags into the script's global settings and
    # report the resulting version string (and Ubuntu release, if any).
    custom_version_string=
    plus=
    while getopts 'huimltv:' OPTION; do
        case $OPTION in
            h)
                usage
                exit 0
                ;;
            u)
                # Ubuntu build: switch release, kernel version and config together.
                ubuntu_release=$ubuntu_default_release
                kver=$ubuntu_kver
                kconfig=$ubuntu_config
                ;;
            i)
                install_only=true
                ;;
            m)
                menuconfig=true
                ;;
            l)
                localmodconfig=true
                ;;
            v)
                custom_version_string=$OPTARG
                ;;
            t)
                i386=true
                plus=
                ;;
            *)
                # getopts reports unknown options as '?'.
                usage
                exit 1
                ;;
        esac
    done
    # Provide feedback which might be useful
    if [[ -z "$custom_version_string" ]]; then
        warn "Using default version_string: ${version_string}"
    else
        warn "Using custom version_string: ${custom_version_string}"
        version_string=$custom_version_string
    fi
    if [[ -n "$ubuntu_release" ]]; then
        warn "Building Ubuntu kernel for release ${ubuntu_release}"
    fi
}
function warn {
    # Print $1 to stdout wrapped in ANSI yellow. When a second argument is
    # supplied it is handed to echo as an extra option (callers use -n to
    # suppress the trailing newline).
    reset='\e[0m'
    yellow='\e[0;33m' # Yellow
    # ${2+ $2} expands to " $2" only when a second argument was passed.
    echo="echo -e${2+ $2}"
    $echo "${yellow}$1${reset}"
}
# Show the prompt in $1, read an answer, and install the remaining arguments
# as packages via apt-get when the answer is empty, "Y" or "y".
function offer_install {
    local prompt=$1 answer
    shift
    warn "$prompt" -n
    read -r answer
    # The spaces around == matter: without them [[ ]] sees a single non-empty
    # word and the test is always true.
    if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
        sudo apt-get install "$@"
    fi
}

function pre_check {
    # Make sure the tools needed to fetch and package the kernel are present,
    # offering to install them when they are not. Declining does not stop the
    # script; later steps will fail when the tool is actually needed.
    warn "Checking for git"
    if ! command -v git > /dev/null; then
        offer_install "You need git to download kernel. Install? [Y/n] " git
    fi
    warn "Checking for kernel-package build utilities"
    if ! command -v make-kpkg > /dev/null; then
        offer_install \
            "You need kernel-package utilities to build the kernel. Install? [Y/n] " \
            kernel-package ncurses-dev
    fi
}
function fetch_kernel {
    # Choose the source directory, git URL and tag for the requested kernel
    # (mainline or Ubuntu), then clone it unless the directory already exists.
    # Sets the globals srcdir, archive and tag used by later steps.
    if [[ -z "$ubuntu_release" ]]; then
        srcdir=$kdir/linux-$kver
        archive=git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-tip.git
        tag=3.0.0
    else
        warn "Pre-installing $ubuntu_image"
        #sudo apt-get install $ubuntu_image
        srcdir=$kdir/$ubuntu_release
        archive=git://kernel.ubuntu.com/ubuntu/$ubuntu_release
        tag=$ubuntu_tag
    fi

    if [[ -d $srcdir ]]; then
        warn "Linux source exists in $srcdir, skipping.."
        return
    fi

    warn "--> Fetching kernel $srcdir"
    git clone $archive $srcdir && return

    warn "Failed to fetch kernel from $archive"
    if [[ -z "$ubuntu_release" ]]; then
        # Mainline only: fall back to the GitHub mirror.
        warn "Trying github"
        archive=git://github.com/torvalds/linux.git
        git clone $archive $srcdir && return
    fi

    warn "Giving up."
    exit 2
}
function work_around_kernel_package_bug {
    # Comment out the `echo "+"` statement in the kernel's setlocalversion
    # script. The edit is tied to this kernel version and will likely break
    # on any other, so watch out. Background:
    # https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
    local setlocalversion=$srcdir/scripts/setlocalversion
    warn "Applying workaround for kernel package bug..."
    sed -i -s 's/echo "+"/#echo "+"/' $setlocalversion
}
function copy_patches {
    # Replace any existing patches/ directory in the kernel tree with a fresh
    # copy of the 3.0.0 patch set, located relative to the start directory.
    local patch_source=${orig_dir}/../../linux-3.0.0-patches/
    local patch_target=$srcdir/patches
    warn "Copying patches..."
    rm -rf $patch_target
    cp -r $patch_source $patch_target
}
function apply_patches {
    # Apply the copied patch series on a new "mininet" branch. If that branch
    # can already be checked out, the patches are assumed to be applied.
    cd $srcdir
    if ! git checkout mininet ; then
        # Start the branch from the selected tag when one is set.
        if [[ -n "$tag" ]] ; then
            git checkout $tag
        fi
        git checkout -b mininet
        warn "Applying patches..."
        git am -3 patches/*.patch
        work_around_kernel_package_bug
    else
        # Assume mininet
        warn "Mininet branch already exists - not applying patches"
        return
    fi
}
# lxc/ns and cfs configuration flags
# The three lists below are whitespace-separated option names consumed by
# configure_kernel. Options in config_y are switched to =y when they are
# unset or absent from .config.
config_y='
CONFIG_GROUP_SCHED
CONFIG_FAIR_GROUP_SCHED
CONFIG_RT_GROUP_SCHED
CONFIG_CGROUP_SCHED
CONFIG_CGROUPS
CONFIG_CGROUP_FREEZER
CONFIG_CGROUP_DEVICE
CONFIG_SCHED_AUTOGROUP
CONFIG_BLK_CGROUP
CONFIG_CFQ_GROUP_IOSCHED
CONFIG_CGROUP_PERF
CONFIG_CPUSETS
CONFIG_PROC_PID_CPUSET
CONFIG_CGROUP_CPUACCT
CONFIG_RESOURCE_COUNTERS
CONFIG_CGROUP_MEM_RES_CTLR
CONFIG_CGROUP_MEM_RES_CTLR_SWAP
CONFIG_MM_OWNER
CONFIG_NAMESPACES
CONFIG_UTS_NS
CONFIG_IPC_NS
CONFIG_USER_NS
CONFIG_PID_NS
CONFIG_NET_NS
CONFIG_NET_CLS_CGROUP
CONFIG_SECURITY_FILE_CAPABILITIES
CONFIG_DEVPTS_MULTIPLE_INSTANCES
CONFIG_VETH
CONFIG_VLAN_8021Q
CONFIG_MACVLAN
CONFIG_CFS_BANDWIDTH
CONFIG_NET_SCHED'
# Options in config_m are switched to =m (module), including ones that are
# currently =y.
config_m='
CONFIG_BRIDGE
CONFIG_NET_SCH_CBQ
CONFIG_NET_SCH_HTB
CONFIG_NET_SCH_HFSC
CONFIG_NET_SCH_PRIO
CONFIG_NET_SCH_MULTIQ
CONFIG_NET_SCH_RED
CONFIG_NET_SCH_SFB
CONFIG_NET_SCH_SFQ
CONFIG_NET_SCH_TEQL
CONFIG_NET_SCH_TBF
CONFIG_NET_SCH_GRED
CONFIG_NET_SCH_DSMARK
CONFIG_NET_SCH_NETEM
CONFIG_NET_SCH_DRR
CONFIG_NET_SCH_MQPRIO
CONFIG_NET_SCH_CHOKE
CONFIG_NET_SCH_QFQ
CONFIG_NET_SCH_INGRESS
'
# Options in config_n are turned off (rewritten to "# OPTION is not set").
config_n='
CONFIG_SECURITY_APPARMOR
'
# Succeeds when .config in the current directory already carries an entry for
# option $1, either as an assignment (OPTION=...) or as the
# "# OPTION is not set" marker. The match is anchored so that OPTION does not
# match longer names that merely start with it.
function config_has_entry {
    grep -Eq "^($1=|# $1 is not set)" .config
}

function configure_kernel {
    # Produce the .config for the build in $srcdir.
    # With -m the user configures interactively and nothing else is done.
    # Otherwise: start from $kconfig (or the existing .config), run oldconfig
    # (and localmodconfig with -l), then force the options listed in
    # config_y / config_m / config_n and report any that are still not set.
    local flag linux32=
    cd $srcdir
    warn "Configuring kernel..."
    if [[ "$menuconfig" == 'true' ]]; then
        make menuconfig
        return
    fi

    if [[ "$kconfig" == "" ]]; then
        warn "Using current kernel config..."
    else
        warn "Using specified kernel config: ${kconfig}..."
        cp $kconfig .config
    fi

    warn "Making oldconfig..."
    if [[ "$i386" == 'true' ]]; then
        # Run make under the linux32 personality for 32-bit builds.
        linux32=linux32
    fi
    # `yes ''` answers every new-option prompt with its default.
    yes '' | $linux32 make oldconfig 1> /dev/null
    if [[ "$localmodconfig" == 'true' ]]; then
        warn "Making localmodconfig..."
        yes '' | $linux32 make localmodconfig 1> /dev/null
    fi

    warn "Setting kernel flags for lxc and cbw..."
    # Built-in options: enable when unset, append when absent. An option that
    # is already =m is left as a module.
    for flag in $config_y; do
        if config_has_entry $flag; then
            sed -i -s "s/^# $flag is not set\$/$flag=y/" .config
        else
            echo $flag=y >> .config
        fi
    done
    # Module options: unset or built-in both become =m.
    for flag in $config_m; do
        if config_has_entry $flag; then
            sed -i -s \
                -e "s/^# $flag is not set\$/$flag=m/" \
                -e "s/^$flag=y\$/$flag=m/" .config
        else
            echo $flag=m >> .config
        fi
    done
    # Disabled options: =y or =m both become "is not set".
    for flag in $config_n; do
        if config_has_entry $flag; then
            sed -i -s "s/^$flag=[ym]\$/# $flag is not set/" .config
        else
            echo "# $flag is not set" >> .config
        fi
    done

    # Print each wanted option as it now stands; warn when it is not assigned.
    for flag in $config_y $config_m; do
        grep "^$flag=" .config || echo "WARNING: $flag IS MISSING"
    done
    # Keep a copy of the generated config for inspection.
    cp .config /tmp
    warn "RAN CONFIG IN `pwd`"
}
function build_kernel {
    # Have your favourite build method here
    # This is a standard Debian way of building the kernel
    # The patches select cfs bandwidth automatically
    #
    # Builds kernel_image and kernel_headers packages with make-kpkg, using
    # (number of processors + 2) parallel jobs.
    warn "Building kernel-$version_string"
    cd $srcdir
    # Shell arithmetic instead of bc: bc is not among the tools pre_check
    # verifies, and its absence would abort the script under `set -e`.
    procs=$(( $(grep -c ^processor /proc/cpuinfo) + 2 ))
    export CONCURRENCY_LEVEL=$procs
    if [[ "$i386" == 'true' ]]; then
        mkpkg_extra_args='--cross-compile - --arch i386'
    else
        mkpkg_extra_args=
    fi
    make-kpkg clean $mkpkg_extra_args
    # `yes ''` accepts the default for any prompt raised during the build.
    yes '' | fakeroot make-kpkg -j $procs $mkpkg_extra_args --initrd --append-to-version=${version_string} \
        kernel_image kernel_headers
}
function mod_kernel_dpkg {
    # Only needed for i386: make-kpkg names the packages *_amd64.deb, so
    # rewrite each package's control data through the hook script and rename
    # the result to *_i386.deb.
    cd /usr/src
    [[ "$i386" == 'true' ]] || return 0

    warn "Modifying deb-pkg names for i386"
    # Based on instructions from http://dotcommie.net/?id=165
    local control_hook=${orig_dir}/set_debian_control_i386.sh
    local release=$kver${version_string}${plus}
    local kind stem amd64_pkg
    for kind in linux-image linux-headers; do
        stem=${kind}-${release}_${release}-10.00.Custom
        amd64_pkg=${stem}_amd64.deb
        warn "$amd64_pkg"
        fakeroot deb-reversion -s "" --hook $control_hook $amd64_pkg
        # Remove the 1 in the name that deb-reversion adds.
        mv ${stem}1_i386.deb ${stem}_i386.deb
        warn "Removing original package: $amd64_pkg"
        rm -f $amd64_pkg
    done
}
function install_headers {
    # Install the linux-headers package produced by the build from /usr/src.
    local release=$kver${version_string}${plus}
    warn "Installing headers..."
    sudo dpkg -i /usr/src/linux-headers-${release}_${release}-10.00.Custom_*.deb
}
function install_kernel {
    # Install the linux-image package produced by the build from /usr/src.
    local release=$kver${version_string}${plus}
    warn "Installing kernel..."
    sudo dpkg -i /usr/src/linux-image-${release}_${release}-10.00.Custom_*.deb
}
function build_initrd {
    # Certain versions of Ubuntu install a make-kpkg that does not build an
    # initrd along with the rest of the kernel, so create one explicitly.
    local release=$kver${version_string}${plus}
    warn "Building initrd..."
    sudo mkinitramfs -v -k -o /boot/initrd.img-${release} ${release}
}
function build_ovs_datapath {
    # Build the Open vSwitch datapath module package against the kernel
    # release that was just built, using module-assistant.
    local release=$kver${version_string}${plus}
    sudo apt-get install openvswitch-datapath-source
    if [[ "$i386" == 'true' ]]; then
        # The variable has to be set through `env` on the far side of sudo.
        # A "VAR=value" word that comes out of a parameter expansion is run
        # as a command name, not treated as an assignment.
        sudo env DEB_HOST_ARCH=i386 \
            module-assistant auto-build openvswitch-datapath -l $release
    else
        sudo module-assistant auto-build openvswitch-datapath -l $release
    fi
}
function mod_ovs_dpkg {
    # Only needed for i386: convert the amd64-labelled OVS datapath module
    # package into an i386-labelled one via the control-file hook script.
    cd /usr/src
    [[ "$i386" == 'true' ]] || return 0

    warn "Modifying deb-pkg names for i386"
    # Based on instructions from http://dotcommie.net/?id=165
    local control_hook=${orig_dir}/set_debian_control_i386.sh
    local stem=openvswitch-datapath-module-$kver${version_string}${plus}_${ovs_pkg_ver}
    local amd64_pkg=${stem}_amd64.deb
    warn "$amd64_pkg"
    fakeroot deb-reversion -s "" --hook $control_hook $amd64_pkg
    # Remove the 1 in the name that deb-reversion adds.
    mv ${stem}1_i386.deb ${stem}_i386.deb
    warn "Removing original package: $amd64_pkg"
    rm -f $amd64_pkg
}
function install_ovs_datapath {
    # Install the OVS datapath module built for the new kernel release.
    local release=$kver${version_string}${plus}
    warn "Installing ovs datapath"
    sudo module-assistant install openvswitch-datapath -l ${release}
}
# Main flow: parse options, build the kernel packages (unless -i was given),
# install them on 64-bit hosts, then build and install the OVS datapath.
# "$@" hands every argument to parse_opts unchanged. The unquoted $* would
# re-split arguments on whitespace and expand glob characters in them.
parse_opts "$@"
if [[ "$install_only" != 'true' ]] ; then
    pre_check
    # NOTE(review): this makes $kdir world-writable so the build can run
    # without root; consider a narrower permission change.
    sudo chmod 777 $kdir
    cd $kdir
    fetch_kernel
    copy_patches
    apply_patches
    configure_kernel
    build_kernel
    mod_kernel_dpkg
    warn "******************************************"
    warn "Check for kernel .deb installation file in /usr/src/ along with initrd."
else
    install_headers
fi
if [[ "$i386" != 'true' ]]; then
    # Presumably we'll only want to install on a 64-bit machine.
    install_kernel
fi
build_ovs_datapath
mod_ovs_dpkg
if [[ "$i386" != 'true' ]]; then
    install_ovs_datapath
    build_initrd
fi
cd $orig_dir
warn "Done (hopefully)"
+121
View File
@@ -0,0 +1,121 @@
#!/bin/bash
# Builds lxc for kernel patched with setns
# Check for uninitialized variables
set -o nounset
# Exit on any failure
set -e
# Kernel version to use
# (selects the lxc-$kver-patches directory and appears in status messages)
kver=3.0
# Location in which to download and build lxc
lxcdir=$HOME
# Kernel build tree handed to lxc's configure; defaults to the running kernel's
kdir=/lib/modules/`uname -r`/build
# lxc version to use
lxcver=lxc-0.7.5
# Save original directory for later.
orig_dir=`pwd`
function warn {
    # Print the message in $1 to stdout wrapped in ANSI yellow.
    echo="echo -e"
    reset='\e[0m'
    yellow='\e[0;33m' # Yellow
    $echo "${yellow}${1}${reset}"
}
function usage {
    # Print the one-line synopsis: both positional arguments are optional.
    local synopsis="Usage: $0 [lxc download location] [kernel location]"
    warn "$synopsis"
}
# Positional arguments: [lxc download location] [kernel location].
# `case $#` replaces the [[ "$#" > 2 ]] test, which compared strings (so 10
# arguments slipped through), and the error path now exits non-zero.
case $# in
    0)
        warn "No args passed."
        warn "Using default lxc location: ${lxcdir}."
        warn "Using default kernel location: ${kdir}."
        ;;
    1)
        lxcdir=$1
        warn "Using custom lxc location: ${lxcdir}"
        warn "Using default kernel location: ${kdir}"
        ;;
    2)
        lxcdir=$1
        kdir=$2
        warn "Using custom lxc location: ${lxcdir}"
        warn "Using custom kernel location: ${kdir}"
        ;;
    *)
        warn "Invalid number of args passed."
        usage
        exit 1
        ;;
esac
function pre_check {
    # Verify that git is available (offering to install it) and that the
    # kernel build tree in $kdir exists; abort with status 1 when it does not.
    local answer
    warn "Checking for git"
    if ! command -v git > /dev/null; then
        # The coloured text must reach `read -p` as a single prompt argument.
        # Written as `read -p warn "..." answer`, read uses "warn" as the
        # prompt and treats the message as a variable name.
        read -r -p "$(warn "You need git to download lxc. Install? [Y/n] ")" answer
        # Spaces around == are required; `$answer=="Y"` is one non-empty word
        # and therefore always true.
        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
            sudo apt-get install git
        fi
    fi
    warn "Checking for linux source code"
    if [[ ! -d ${kdir} ]]; then
        warn "Error: Kernel doesn't exist in ${kdir}... exiting"
        exit 1
    fi
}
function fetch_lxc {
    # Clone lxc into $lxcdir/lxc and check out $lxcver, unless an lxc
    # directory is already there. Leaves the working directory at $lxcdir.
    cd $lxcdir
    warn "--> Fetching lxc"
    if [[ ! -d lxc ]]; then
        git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
        # Subshell keeps the caller's working directory at $lxcdir.
        ( cd lxc; git checkout $lxcver )
    else
        warn "lxc source exists, skipping.."
    fi
}
function copy_patches {
    # Refresh lxc/patches (relative to the current directory) from the
    # lxc-$kver-patches directory located relative to the start directory.
    local patch_source=${orig_dir}/../../lxc-$kver-patches
    rm -rf lxc/patches
    cp -r $patch_source lxc/patches
}
function apply_patches {
    # Enter the lxc checkout and apply every patch in patches/ with git am,
    # using 3-way merge fallback. Leaves the working directory inside lxc.
    warn "Applying patches..."
    cd lxc
    git am -3 patches/*.patch
}
function build_lxc {
    # Configure lxc against the kernel tree in $kdir and build it in the
    # current directory.
    warn "Building lxc with kernel-${kver}..."
    processors=$(grep -c ^processor /proc/cpuinfo)
    export CONCURRENCY_LEVEL=$processors
    # distclean fails on a tree that was never configured; that is fine.
    if ! make distclean; then
        true
    fi
    ./autogen.sh
    ./configure --with-linuxdir=${kdir}
    make
}
function install_lxc {
    # Install the freshly built lxc and create its state directory.
    local state_dir=/usr/local/var/lib/lxc
    warn "Installing lxc..."
    sudo make install
    # Seems to be missing
    sudo mkdir -p $state_dir
}
# Main flow. The usage line is printed on every run, not only on errors.
usage
pre_check
# fetch_lxc changes into $lxcdir; copy_patches relies on that for its
# relative lxc/ path, and apply_patches then enters lxc/ where the build
# and install steps run.
fetch_lxc
copy_patches
apply_patches
build_lxc
install_lxc
# Return to the directory the script was started from.
cd $orig_dir
warn "Done (hopefully)"
+170
View File
@@ -0,0 +1,170 @@
#!/bin/bash
# Builds kernel with the new CFS Bandwidth patches
# and nsfd/setns syscall patches.
# Check for uninitialized variables
set -o nounset
# Exit on any failure
set -e
# Location in which to download and build the kernel
kdir=/usr/src
# Kernel version to download
kver=2.6.35
# Save original directory for later.
orig_dir=`pwd`
# Kernel version string
# (passed to make-kpkg --append-to-version; replaced by the first argument)
version_string=-with-cfs
# Run menuconfig later?
menuconfig=
function warn {
    # Print the message in $1 to stdout wrapped in ANSI yellow.
    echo="echo -e"
    reset='\e[0m'
    yellow='\e[0;33m' # Yellow
    $echo "${yellow}${1}${reset}"
}
function usage {
    # Print the one-line synopsis for this script.
    local synopsis="Usage: build.sh [version string] [menuconfig]"
    warn "$synopsis"
}
# Positional arguments: [version string] [menuconfig].
# Fixes over the previous if/elif chain:
#  - the argument count is matched with `case` rather than a string ">" test;
#  - the custom version string is assigned before it is reported;
#  - an invalid second argument now stops the script instead of building
#    with the defaults;
#  - error paths exit with a non-zero status.
case $# in
    0)
        warn "No args passed. Using default version_string: ${version_string}"
        ;;
    1)
        version_string=$1
        warn "Using custom version_string: ${version_string}"
        ;;
    2)
        if [[ $2 != 'menuconfig' ]]; then
            warn "Second arg is either menuconfig or missing."
            usage
            exit 1
        fi
        version_string=$1
        menuconfig=true
        ;;
    *)
        warn "Invalid number of args passed."
        usage
        exit 1
        ;;
esac
# Prompt with $1 and, when the answer is empty, "Y" or "y", install the
# remaining arguments as packages via apt-get.
function offer_install {
    local prompt=$1 answer
    shift
    # The coloured text must reach `read -p` as a single prompt argument.
    # Written as `read -p warn "..." answer`, read uses "warn" as the prompt
    # and treats the message as a variable name.
    read -r -p "$(warn "$prompt")" answer
    # Spaces around == are required; `$answer=="Y"` is one non-empty word and
    # therefore always true.
    if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
        sudo apt-get install "$@"
    fi
}

function pre_check {
    # Make sure make-kpkg and quilt are available, offering to install the
    # packages that provide them. Declining does not stop the script.
    warn "Checking for kernel-package build utilities"
    if ! command -v make-kpkg > /dev/null; then
        offer_install \
            "You need kernel-package utilities to build the kernel. Install? [Y/n] " \
            kernel-package ncurses-dev
    fi
    warn "Checking for quilt"
    if ! command -v quilt > /dev/null; then
        offer_install "You need quilt to install patches. Install? [Y/n] " quilt
    fi
}
function fetch_kernel {
    # Download the linux-$kver tarball into the current directory unless it
    # is already there, and unpack it unless the source tree already exists.
    # The two checks are independent, so a run that stopped between download
    # and unpack can be resumed. An existing (possibly patched) tree is never
    # overwritten.
    warn "--> Fetching kernel linux-$kver"
    if [[ -f linux-$kver.tar.bz2 ]]; then
        warn "File exists, skipping.."
    else
        wget http://kernel.org/pub/linux/kernel/v2.6/linux-$kver.tar.bz2
    fi
    if [[ ! -d linux-$kver ]]; then
        warn "Unpacking kernel"
        tar xjf linux-$kver.tar.bz2
    fi
}
function work_around_kernel_package_bug {
    # Comment out the `echo "+"` statement in the kernel's setlocalversion
    # script. The edit is tied to this kernel version and will likely break
    # on any other, so watch out. Background:
    # https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
    local setlocalversion=linux-${kver}/scripts/setlocalversion
    sed -i -s 's/echo "+"/#echo "+"/' $setlocalversion
}
function copy_patches {
    # Replace linux-$kver/patches with a fresh copy of the 2.6.35 patch set,
    # located relative to the directory the script was started from.
    local patch_target=linux-$kver/patches
    rm -rf $patch_target
    cp -r ${orig_dir}/../../linux-2.6.35-patches $patch_target
}
function apply_patches {
    # Enter the kernel tree and apply the quilt series unless quilt reports
    # that patches are already applied. Leaves the working directory inside
    # linux-$kver, which build_kernel relies on.
    cd linux-$kver
    # Apply patch series only if not applied previously.
    # A better check would look at patches/series and make sure each entry
    # in `quilt applied` was covered.
    warn "Checking for applied patches"
    # quilt exits non-zero when nothing is applied, hence `|| true`.
    quilt applied > quilt_applied_stdout 2> quilt_applied_stderr || true
    if grep -q "No patches applied" quilt_applied_stderr; then
        warn "Applying patches"
        quilt push -a
    else
        warn "Skipped patches"
    fi
    # Remove the two scratch files created above. The former
    # `rm quilt_applied` named a file that never exists, so rm failed and
    # `set -e` aborted the whole script here.
    rm -f quilt_applied_stdout quilt_applied_stderr
}
function build_kernel {
    # Have your favourite build method here
    # This is a standard Debian way of building the kernel
    # The patches select cfs bandwidth automatically
    #
    # Configure the tree in the current directory (interactively with
    # menuconfig, otherwise oldconfig + localmodconfig + forced options) and
    # build a kernel_image package with make-kpkg.
    warn "Building kernel..."
    if [[ "$menuconfig" != 'true' ]]; then
        warn "Making oldconfig..."
        yes "" | make oldconfig
        warn "Making localmodconfig..."
        make localmodconfig
        warn "Enabling netns and cpubw..."
        # One sed pass; the expressions are applied to each line in this order.
        sed -i -s \
            -e 's/# CONFIG_VETH is not set/CONFIG_VETH=y/' \
            -e 's/CONFIG_BRIDGE=y/CONFIG_BRIDGE=m/' \
            -e 's/# CONFIG_BRIDGE is not set/CONFIG_BRIDGE=m/' \
            -e 's/# CONFIG_CFS_BANDWIDTH is not set/CONFIG_CFS_BANDWIDTH=y/' \
            -e 's/# CONFIG_NET_NS is not set/CONFIG_NET_NS=y/' \
            .config
    else
        make menuconfig
    fi
    warn "Building kernel-$version_string"
    processors=$(grep -c ^processor /proc/cpuinfo)
    export CONCURRENCY_LEVEL=$processors
    yes "" | fakeroot make-kpkg --initrd --append-to-version=${version_string} kernel_image
    warn "******************************************"
    warn "Check for kernel .deb installation file in ../ along with initrd."
}
function install_kernel {
    # Install the amd64 linux-image package produced by the build.
    local release=$kver${version_string}
    warn "Installing kernel..."
    sudo dpkg -i /usr/src/linux-image-${release}_${release}-10.00.Custom_amd64.deb
}
function build_initrd {
    # Certain versions of Ubuntu install a make-kpkg that does not build an
    # initrd along with the rest of the kernel, so create one explicitly.
    local release=$kver${version_string}
    warn "Building initrd..."
    sudo mkinitramfs -v -k -o /boot/initrd.img-${release} ${release}
}
# Main flow: check tools, fetch and patch the kernel in $kdir, then build
# and install it.
pre_check
# NOTE(review): this makes $kdir world-writable so the later steps can write
# there without sudo; consider a narrower permission change.
sudo chmod 777 $kdir
cd $kdir
fetch_kernel
work_around_kernel_package_bug
copy_patches
# apply_patches changes into linux-$kver; build_kernel runs from there.
apply_patches
build_kernel
install_kernel
build_initrd
warn "Done (hopefully)"
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+85
View File
@@ -0,0 +1,85 @@
#!/bin/bash
# Mininet-hifi installer: setup and prerequisite checks.
# Check for uninitialized variables
set -o nounset
# Exit on any failure
set -e
# Directory expected to hold the kernel and OVS datapath packages.
debdir=/usr/src
# Kernel release the packages were built for.
kver=3.0.9-with-cfs
kbuild=/lib/modules/$kver/build
# Map the machine type reported by `arch` to a Debian architecture name.
if arch | grep 64 > /dev/null ; then arch=amd64; else arch=i386; fi
headers=linux-headers-${kver}_${kver}-10.00.Custom_${arch}.deb
image=linux-image-${kver}_${kver}-10.00.Custom_${arch}.deb
ovs=openvswitch-datapath-module-${kver}_1.2.0-1ubuntu3_${arch}.deb
echo "Mininet-hifi installer"
echo "1. Checking for prereqs"
if [[ ! -e $debdir/$headers || ! -e $debdir/$image ||
      ! -e $debdir/$ovs ]]; then
    echo "Can't find kernel packages"
    echo "$debdir/$headers or $debdir/$image or $debdir/$ovs is missing"
    exit 1
fi
# Check ssh-add's exit status, not its output. With a running agent that
# holds no keys it still prints a message on stdout, so comparing the output
# with "" never detects that case. The status is non-zero both when no keys
# are loaded and when no agent is reachable.
if ! ssh-add -l > /dev/null 2>&1; then
    echo "No SSH keys - nsdi repo checkout will fail."
    exit 1
fi
# Steps 2-8: fetch and install Mininet, the kernel packages, Open vSwitch
# userspace and lxc. Every step runs from the home directory or a checkout
# below it; any failing command aborts the installer.
echo "2. Getting mainline Mininet from github"
cd ~
git clone git://github.com/mininet/mininet.git
echo "3. Installing OpenFlow reference implementation"
mininet/util/install.sh -f
echo "4. Installing Mininet core files"
mininet/util/install.sh -n
echo "5. Adding nsdi repository"
cd ~/mininet
# The nsdi remote is reached over SSH (git@...), which is why the SSH key
# check runs earlier.
git remote add nsdi git@gitosis.stanford.edu:mininet-nsdi.git
git fetch nsdi
git checkout -b mininet-rt remotes/nsdi/mininet-rt
sudo make install
echo "6. Installing kernel packages"
sudo dpkg -i $debdir/$headers
sudo dpkg -i $debdir/$image
sudo dpkg -i $debdir/$ovs
echo "7. Fetching, building and installing Open vSwitch user code"
cd ~
git clone git://openvswitch.org/openvswitch
cd ~/openvswitch
git checkout v1.2.2
./boot.sh
./configure
make all
sudo make install
# Install the test-openflowd binary from the build tree under the name
# ovs-openflowd.
sudo cp tests/test-openflowd /usr/local/bin/ovs-openflowd
echo "8. Building and installing custom lxc package"
sudo apt-get -y install libcap-dev
cd ~/mininet/util/kbuild/cfs-nsfd-kernel
# Arguments: lxc download location, kernel build tree.
./build-lxc-for-3.0.sh $HOME $kbuild
# Steps 9-11: system configuration. Written so that running the installer a
# second time neither aborts nor duplicates entries.
echo "9. Setting up /cgroup"
sudo apt-get remove cgroup-lite
# -p: do not fail when /cgroup is left over from an earlier run.
sudo mkdir -p /cgroup
# Append the mount entry only once.
if ! grep -q '^cgroup /cgroup cgroup ' /etc/fstab; then
    sudo sh -c "echo 'cgroup /cgroup cgroup defaults 0 0' >> /etc/fstab"
fi
echo "10. Creating /etc/mn/host.conf"
sudo mkdir -p /etc/mn
# The first write truncates the file, so it always ends up with these two lines.
sudo sh -c "echo 'lxc.utsname = mnhost' > /etc/mn/host.conf"
sudo sh -c "echo 'lxc.network.type = empty' >> /etc/mn/host.conf"
echo "11. Getting rid of quiet boot"
# Replaces the first "quiet" on each line of the grub defaults with "text".
sudo sed -i 's/quiet/text/' /etc/default/grub
echo "Done! reboot to test"
+17
View File
@@ -0,0 +1,17 @@
#!/bin/bash
# Install lxc from source, apply patch, install
# (instructions tested with 2.6.35 only):
# NOTE(review): no `set -e` here, so a failing step does not stop the
# commands that follow it.
sudo apt-get -y install libcap-dev quilt
cd ~/
git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
cd lxc
# Create a local branch named after the lxc-0.7.2 ref and switch to it.
git checkout lxc-0.7.2 -b lxc-0.7.2
# The patch tarball is taken from a Mininet checkout in the home directory.
cp ~/mininet/util/kbuild/cfs-nsfd-kernel/lxc-patches.tar.gz .
tar xzf lxc-patches.tar.gz
# Modify patch. Small change to the patch: remove the 2nd argument to lxc_cgroup_path_get (it's set to NULL in the patch)
sed -i -s 's/cgrouppath, NULL, my_args.name/cgrouppath, my_args.name/' patches/lxc-attach-bug-fix.patch
# Apply the whole quilt series, then build and install.
quilt push -a
./autogen.sh
./configure
make
sudo make install
Binary file not shown.
+4
View File
@@ -0,0 +1,4 @@
#!/bin/sh
# Hook for deb-reversion: relabel the unpacked package as i386.
# Expects to run in a directory containing DEBIAN/control.
control_file=DEBIAN/control
# Show where the hook is running.
echo $(pwd)
sed -i -s "s/Architecture: amd64/Architecture: i386/" $control_file
+14
View File
@@ -0,0 +1,14 @@
#!/bin/sh
# Re-build OVS for the kernel version defined below.
# Reconfigures the checkout in $OVS_DIR against the selected kernel's build
# tree, rebuilds it, copies the datapath module into that kernel's module
# directory and refreshes the module dependency data.
OVS_DIR=~/openvswitch
KERNEL_VER=`uname -r`
#KERNEL_VER=2.6.35-with-cfs
PROCESSORS=`grep -c ^processor /proc/cpuinfo`
# Stop right away if the checkout is missing, so that configure is not run
# in whatever directory the script was started from.
cd $OVS_DIR || exit 1
# Stop on the first failing step. Previously depmod ran even when the build
# or the copy had failed, because the && chain ended at the echo.
./configure --with-linux=/lib/modules/${KERNEL_VER}/build && \
sudo make -j${PROCESSORS} && \
sudo cp ./datapath/linux/openvswitch_mod.ko /lib/modules/${KERNEL_VER}/kernel/drivers/net || exit 1
echo "Running depmod..."
sudo depmod -a ${KERNEL_VER}
@@ -0,0 +1,36 @@
From 57cc69f4a6d27c0b3ef495589a1d4629a9f1fa3e Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 6 Jul 2011 22:30:37 -0700
Subject: [PATCH 01/19] sched: Don't update shares twice on on_rq parent
In dequeue_task_fair() we bail on dequeue when we encounter a parenting entity
with additional weight. However, we perform a double shares update on this
entity as we continue the shares update traversal from this point, despite
dequeue_entity() having already updated its queuing cfs_rq.
Avoid this by starting from the parent when we resume.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110707053059.797714697@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched_fair.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c768588..c80f030 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1370,6 +1370,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*/
if (task_sleep && parent_entity(se))
set_next_buddy(parent_entity(se));
+
+ /* avoid re-evaluating load for this entity */
+ se = parent_entity(se);
break;
}
flags |= DEQUEUE_SLEEP;
--
1.7.0.4
@@ -0,0 +1,168 @@
From 4ec11a3e21874534f9ffa70a8878bb255618bb33 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:27 -0700
Subject: [PATCH 02/19] sched: Implement hierarchical task accounting for SCHED_OTHER
Introduce hierarchical task accounting for the group scheduling case in CFS, as
well as promoting the responsibility for maintaining rq->nr_running to the
scheduling classes.
The primary motivation for this is that with scheduling classes supporting
bandwidth throttling it is possible for entities participating in throttled
sub-trees to not have root visible changes in rq->nr_running across activate
and de-activate operations. This in turn leads to incorrect idle and
weight-per-task load balance decisions.
This also allows us to make a small fixlet to the fastpath in pick_next_task()
under group scheduling.
Note: this issue also exists with the existing sched_rt throttling mechanism.
This patch does not address that.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 6 ++----
kernel/sched_fair.c | 10 ++++++++--
kernel/sched_rt.c | 5 ++++-
kernel/sched_stoptask.c | 2 ++
4 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index fde6ff9..b015a0e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -308,7 +308,7 @@ struct task_group root_task_group;
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned long nr_running;
+ unsigned long nr_running, h_nr_running;
u64 exec_clock;
u64 min_vruntime;
@@ -1830,7 +1830,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible--;
enqueue_task(rq, p, flags);
- inc_nr_running(rq);
}
/*
@@ -1842,7 +1841,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
rq->nr_uninterruptible++;
dequeue_task(rq, p, flags);
- dec_nr_running(rq);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -4226,7 +4224,7 @@ pick_next_task(struct rq *rq)
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.nr_running)) {
+ if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq);
if (likely(p))
return p;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c80f030..f70bb4b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1332,16 +1332,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
+ cfs_rq->h_nr_running++;
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running++;
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
}
+ inc_nr_running(rq);
hrtick_update(rq);
}
@@ -1361,6 +1364,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+ cfs_rq->h_nr_running--;
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
@@ -1379,12 +1383,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ cfs_rq = cfs_rq_of(se);
+ cfs_rq->h_nr_running--;
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
}
+ dec_nr_running(rq);
hrtick_update(rq);
}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d0182..1af971b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -949,6 +949,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ inc_nr_running(rq);
}
static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -959,6 +961,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
+
+ dec_nr_running(rq);
}
/*
@@ -1851,4 +1855,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
-
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f43763..8b44e7f 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
static void
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
+ inc_nr_running(rq);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
+ dec_nr_running(rq);
}
static void yield_task_stop(struct rq *rq)
--
1.7.0.4
@@ -0,0 +1,380 @@
From 116f22667986ab86f1a00098a0daf9959b1f6df0 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:28 -0700
Subject: [PATCH 03/19] sched: Introduce primitives to account for CFS bandwidth tracking
In this patch we introduce the notion of CFS bandwidth, partitioned into
globally unassigned bandwidth, and locally claimed bandwidth.
- The global bandwidth is per task_group, it represents a pool of unclaimed
bandwidth that cfs_rqs can allocate from.
- The local bandwidth is tracked per-cfs_rq, this represents allotments from
the global pool bandwidth assigned to a specific cpu.
Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
- cpu.cfs_period_us : the bandwidth period in usecs
- cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
to consume over period above.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
init/Kconfig | 12 +++
kernel/sched.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sched_fair.c | 16 ++++
3 files changed, 225 insertions(+), 4 deletions(-)
diff --git a/init/Kconfig b/init/Kconfig
index 412c21b..67579ed 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
depends on CGROUP_SCHED
default CGROUP_SCHED
+config CFS_BANDWIDTH
+ bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+ depends on EXPERIMENTAL
+ depends on FAIR_GROUP_SCHED
+ default n
+ help
+ This option allows users to define CPU bandwidth rates (limits) for
+ tasks running within the fair group scheduler. Groups with no limit
+ set are considered to be unconstrained and will run with no
+ restriction.
+ See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
depends on EXPERIMENTAL
diff --git a/kernel/sched.c b/kernel/sched.c
index b015a0e..28d838b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -244,6 +244,14 @@ struct cfs_rq;
static LIST_HEAD(task_groups);
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota;
+#endif
+};
+
/* task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -275,6 +283,8 @@ struct task_group {
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
};
/* task_group_lock serializes the addition/removal of task groups */
@@ -374,9 +384,48 @@ struct cfs_rq {
unsigned long load_contribution;
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ int runtime_enabled;
+ s64 runtime_remaining;
+#endif
#endif
};
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->quota = RUNTIME_INF;
+ cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
struct rt_prio_array active;
@@ -7958,6 +8007,12 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
tg->cfs_rq[cpu] = cfs_rq;
init_cfs_rq(cfs_rq, rq);
cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
+#endif
+ init_cfs_rq_runtime(cfs_rq);
tg->se[cpu] = se;
/* se could be NULL for root_task_group */
@@ -8093,6 +8148,7 @@ void __init sched_init(void)
* We achieve this by letting root_task_group's tasks sit
* directly in rq->cfs (i.e root_task_group->se[] = NULL).
*/
+ init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8336,6 +8392,8 @@ static void free_fair_sched_group(struct task_group *tg)
{
int i;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -8363,6 +8421,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(i) {
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
@@ -8734,7 +8794,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
return walk_tg_tree(tg_schedulable, tg_nop, &data);
}
-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
u64 rt_period, u64 rt_runtime)
{
int i, err = 0;
@@ -8773,7 +8833,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
if (rt_runtime_us < 0)
rt_runtime = RUNTIME_INF;
- return tg_set_bandwidth(tg, rt_period, rt_runtime);
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
long sched_group_rt_runtime(struct task_group *tg)
@@ -8798,7 +8858,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
if (rt_period == 0)
return -EINVAL;
- return tg_set_bandwidth(tg, rt_period, rt_runtime);
+ return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
long sched_group_rt_period(struct task_group *tg)
@@ -8988,6 +9048,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
return (u64) scale_load_down(tg->shares);
}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+ int i;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ static DEFINE_MUTEX(mutex);
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ /*
+ * Ensure we have at some amount of bandwidth every period. This is
+ * to prevent reaching a state of large arrears when throttled via
+ * entity_tick() resulting in prolonged exit starvation.
+ */
+ if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+ return -EINVAL;
+
+ /*
+ * Likewise, bound things on the otherside by preventing insane quota
+ * periods. This also allows us to normalize in computing quota
+ * feasibility.
+ */
+ if (period > max_cfs_quota_period)
+ return -EINVAL;
+
+ mutex_lock(&mutex);
+ raw_spin_lock_irq(&cfs_b->lock);
+ cfs_b->period = ns_to_ktime(period);
+ cfs_b->quota = quota;
+ raw_spin_unlock_irq(&cfs_b->lock);
+
+ for_each_possible_cpu(i) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+ struct rq *rq = rq_of(cfs_rq);
+
+ raw_spin_lock_irq(&rq->lock);
+ cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+ cfs_rq->runtime_remaining = 0;
+ raw_spin_unlock_irq(&rq->lock);
+ }
+ mutex_unlock(&mutex);
+
+ return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+ u64 quota, period;
+
+ period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ if (cfs_quota_us < 0)
+ quota = RUNTIME_INF;
+ else
+ quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+ u64 quota_us;
+
+ if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+ return -1;
+
+ quota_us = tg_cfs_bandwidth(tg)->quota;
+ do_div(quota_us, NSEC_PER_USEC);
+
+ return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+ u64 quota, period;
+
+ period = (u64)cfs_period_us * NSEC_PER_USEC;
+ quota = tg_cfs_bandwidth(tg)->quota;
+
+ if (period <= 0)
+ return -EINVAL;
+
+ return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+ u64 cfs_period_us;
+
+ cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ do_div(cfs_period_us, NSEC_PER_USEC);
+
+ return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+ return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+ s64 cfs_quota_us)
+{
+ return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+ return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+ u64 cfs_period_us)
+{
+ return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
@@ -9022,6 +9204,18 @@ static struct cftype cpu_files[] = {
.write_u64 = cpu_shares_write_u64,
},
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .name = "cfs_quota_us",
+ .read_s64 = cpu_cfs_quota_read_s64,
+ .write_s64 = cpu_cfs_quota_write_s64,
+ },
+ {
+ .name = "cfs_period_us",
+ .read_u64 = cpu_cfs_period_read_u64,
+ .write_u64 = cpu_cfs_period_write_u64,
+ },
+#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
.name = "rt_runtime_us",
@@ -9331,4 +9525,3 @@ struct cgroup_subsys cpuacct_subsys = {
.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
-
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f70bb4b..91624cf 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1256,6 +1256,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
check_preempt_tick(cfs_rq, curr);
}
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+ return 100000000ULL;
+}
+#endif
+
/**************************************************
* CFS operations on tasks:
*/
--
1.7.0.4
@@ -0,0 +1,221 @@
From e68a3cf7b0006f6d8c362833ebc96cbed01a263e Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:29 -0700
Subject: [PATCH 04/19] sched: Validate CFS quota hierarchies
Add constraints validation for CFS bandwidth hierarchies.
Validate that:
max(child bandwidth) <= parent_bandwidth
In a quota limited hierarchy, an unconstrained entity
(e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
This constraint is chosen over sum(child_bandwidth) as the notion of over-commit is
valuable within SCHED_OTHER. Some basic code from the RT case is re-factored
for reuse.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 98 insertions(+), 14 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 28d838b..75f2dd7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -249,6 +249,7 @@ struct cfs_bandwidth {
raw_spinlock_t lock;
ktime_t period;
u64 quota;
+ s64 hierarchal_quota;
#endif
};
@@ -1512,7 +1513,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
update_load_sub(&rq->load, load);
}
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
typedef int (*tg_visitor)(struct task_group *, void *);
/*
@@ -8694,12 +8696,7 @@ unsigned long sched_group_shares(struct task_group *tg)
}
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-/*
- * Ensure that the real time constraints are schedulable.
- */
-static DEFINE_MUTEX(rt_constraints_mutex);
-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
if (runtime == RUNTIME_INF)
@@ -8707,6 +8704,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
return div64_u64(runtime << 20, period);
}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
/* Must be called with tasklist_lock held */
static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8727,7 +8731,7 @@ struct rt_schedulable_data {
u64 rt_runtime;
};
-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
{
struct rt_schedulable_data *d = data;
struct task_group *child;
@@ -8791,7 +8795,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
.rt_runtime = runtime,
};
- return walk_tg_tree(tg_schedulable, tg_nop, &data);
+ return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
}
static int tg_set_rt_bandwidth(struct task_group *tg,
@@ -9050,14 +9054,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
}
#ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i;
+ int i, ret = 0;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- static DEFINE_MUTEX(mutex);
if (tg == &root_task_group)
return -EINVAL;
@@ -9078,7 +9085,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;
- mutex_lock(&mutex);
+ mutex_lock(&cfs_constraints_mutex);
+ ret = __cfs_schedulable(tg, period, quota);
+ if (ret)
+ goto out_unlock;
+
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -9093,9 +9104,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
- mutex_unlock(&mutex);
+out_unlock:
+ mutex_unlock(&cfs_constraints_mutex);
- return 0;
+ return ret;
}
int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
@@ -9169,6 +9181,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
}
+struct cfs_schedulable_data {
+ struct task_group *tg;
+ u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+ struct cfs_schedulable_data *d)
+{
+ u64 quota, period;
+
+ if (tg == d->tg) {
+ period = d->period;
+ quota = d->quota;
+ } else {
+ period = tg_get_cfs_period(tg);
+ quota = tg_get_cfs_quota(tg);
+ }
+
+ /* note: these should typically be equivalent */
+ if (quota == RUNTIME_INF || quota == -1)
+ return RUNTIME_INF;
+
+ return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+ struct cfs_schedulable_data *d = data;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ s64 quota = 0, parent_quota = -1;
+
+ if (!tg->parent) {
+ quota = RUNTIME_INF;
+ } else {
+ struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+ quota = normalize_cfs_quota(tg, d);
+ parent_quota = parent_b->hierarchal_quota;
+
+ /*
+ * ensure max(child_quota) <= parent_quota, inherit when no
+ * limit is set
+ */
+ if (quota == RUNTIME_INF)
+ quota = parent_quota;
+ else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+ return -EINVAL;
+ }
+ cfs_b->hierarchal_quota = quota;
+
+ return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+ struct cfs_schedulable_data data = {
+ .tg = tg,
+ .period = period,
+ .quota = quota,
+ };
+
+ if (quota != RUNTIME_INF) {
+ do_div(data.period, NSEC_PER_USEC);
+ do_div(data.quota, NSEC_PER_USEC);
+ }
+
+ return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
--
1.7.0.4
@@ -0,0 +1,217 @@
From 50fe68ec9d454eced64cbfc29954ee64cc7225da Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:30 -0700
Subject: [PATCH 05/19] sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth
Account bandwidth usage on the cfs_rq level versus the task_groups to which
they belong. Whether we are tracking bandwidth on a given cfs_rq is maintained
under cfs_rq->runtime_enabled.
cfs_rq's which belong to a bandwidth constrained task_group have their runtime
accounted via the update_curr() path, which withdraws bandwidth from the global
pool as desired. Updates involving the global pool are currently protected
under cfs_bandwidth->lock, local runtime is protected by rq->lock.
This patch only assigns and tracks quota; no action is taken in the case that
a cfs_rq has consumed all of its assigned runtime (cfs_rq->runtime_remaining <= 0).
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
include/linux/sched.h | 4 ++
kernel/sched.c | 4 ++-
kernel/sched_fair.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sysctl.c | 10 ++++++
4 files changed, 94 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14a6c7b..adfc8eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2021,6 +2021,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/kernel/sched.c b/kernel/sched.c
index 75f2dd7..cdbc7d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -248,7 +248,7 @@ struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
raw_spinlock_t lock;
ktime_t period;
- u64 quota;
+ u64 quota, runtime;
s64 hierarchal_quota;
#endif
};
@@ -404,6 +404,7 @@ static inline u64 default_cfs_period(void);
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
}
@@ -9093,6 +9094,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
+ cfs_b->runtime = quota;
raw_spin_unlock_irq(&cfs_b->lock);
for_each_possible_cpu(i) {
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 91624cf..863c9ec 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
*/
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
static const struct sched_class fair_sched_class;
/**************************************************************
@@ -305,6 +319,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -602,6 +618,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
cpuacct_charge(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
+
+ account_cfs_rq_runtime(cfs_rq, delta_exec);
}
static inline void
@@ -1270,6 +1288,58 @@ static inline u64 default_cfs_period(void)
{
return 100000000ULL;
}
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+ return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct task_group *tg = cfs_rq->tg;
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ u64 amount = 0, min_amount;
+
+ /* note: this is a positive sum as runtime_remaining <= 0 */
+ min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota == RUNTIME_INF)
+ amount = min_amount;
+ else if (cfs_b->runtime > 0) {
+ amount = min(cfs_b->runtime, min_amount);
+ cfs_b->runtime -= amount;
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ cfs_rq->runtime_remaining += amount;
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec)
+{
+ if (!cfs_rq->runtime_enabled)
+ return;
+
+ cfs_rq->runtime_remaining -= delta_exec;
+ if (cfs_rq->runtime_remaining > 0)
+ return;
+
+ assign_cfs_rq_runtime(cfs_rq);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec)
+{
+ if (!cfs_rq->runtime_enabled)
+ return;
+
+ __account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+#else
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec) {}
#endif
/**************************************************
@@ -4264,8 +4334,13 @@ static void set_curr_task_fair(struct rq *rq)
{
struct sched_entity *se = &rq->curr->se;
- for_each_sched_entity(se)
- set_next_entity(cfs_rq_of(se), se);
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ set_next_entity(cfs_rq, se);
+ /* ensure bandwidth has been allocated on our new cfs_rq */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ }
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f175d98..b38ca7f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ {
+ .procname = "sched_cfs_bandwidth_slice_us",
+ .data = &sysctl_sched_cfs_bandwidth_slice,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
--
1.7.0.4
@@ -0,0 +1,263 @@
From c127107a0b9f7fe08dd11c84ecb6b307052b7688 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:31 -0700
Subject: [PATCH 06/19] sched: Add a timer to handle CFS bandwidth refresh
This patch adds a per-task_group timer which handles the refresh of the global
CFS bandwidth pool.
Since the RT pool is using a similar timer there's some small refactoring to
share this support.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 107 +++++++++++++++++++++++++++++++++++++++++----------
kernel/sched_fair.c | 40 +++++++++++++++++-
2 files changed, 123 insertions(+), 24 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index cdbc7d3..4bb2d63 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -193,10 +193,28 @@ static inline int rt_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
- ktime_t now;
+ unsigned long delta;
+ ktime_t soft, hard, now;
+ for (;;) {
+ if (hrtimer_active(period_timer))
+ break;
+
+ now = hrtimer_cb_get_time(period_timer);
+ hrtimer_forward(period_timer, now, period);
+
+ soft = hrtimer_get_softexpires(period_timer);
+ hard = hrtimer_get_expires(period_timer);
+ delta = ktime_to_ns(ktime_sub(hard, soft));
+ __hrtimer_start_range_ns(period_timer, soft, delta,
+ HRTIMER_MODE_ABS_PINNED, 0);
+ }
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
return;
@@ -204,22 +222,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
return;
raw_spin_lock(&rt_b->rt_runtime_lock);
- for (;;) {
- unsigned long delta;
- ktime_t soft, hard;
-
- if (hrtimer_active(&rt_b->rt_period_timer))
- break;
-
- now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
- hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-
- soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
- hard = hrtimer_get_expires(&rt_b->rt_period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -250,6 +253,9 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota, runtime;
s64 hierarchal_quota;
+
+ int idle, timer_active;
+ struct hrtimer period_timer;
#endif
};
@@ -400,6 +406,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}
static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
@@ -407,6 +435,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->runtime = 0;
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -414,8 +445,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_rq->runtime_enabled = 0;
}
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /*
+ * The timer may be active because we're trying to set a new bandwidth
+ * period or because we're racing with the tear-down path
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed
+ */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ raw_spin_unlock(&cfs_b->lock);
+ /* ensure cfs_b->lock is available while we wait */
+ hrtimer_cancel(&cfs_b->period_timer);
+
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (cfs_b->timer_active)
+ return;
+ }
+
+ cfs_b->timer_active = 1;
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{}
+{
+ hrtimer_cancel(&cfs_b->period_timer);
+}
#else
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
@@ -9064,7 +9121,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i, ret = 0;
+ int i, ret = 0, runtime_enabled;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
if (tg == &root_task_group)
@@ -9091,10 +9148,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (ret)
goto out_unlock;
+ runtime_enabled = quota != RUNTIME_INF;
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
cfs_b->runtime = quota;
+
+ /* restart the period timer (if active) to handle new period expiry */
+ if (runtime_enabled && cfs_b->timer_active) {
+ /* force a reprogram */
+ cfs_b->timer_active = 0;
+ __start_cfs_bandwidth(cfs_b);
+ }
raw_spin_unlock_irq(&cfs_b->lock);
for_each_possible_cpu(i) {
@@ -9102,7 +9167,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
struct rq *rq = rq_of(cfs_rq);
raw_spin_lock_irq(&rq->lock);
- cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+ cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
raw_spin_unlock_irq(&rq->lock);
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 863c9ec..e34c26c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1306,9 +1306,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
- else if (cfs_b->runtime > 0) {
- amount = min(cfs_b->runtime, min_amount);
- cfs_b->runtime -= amount;
+ else {
+ /* ensure bandwidth timer remains active under consumption */
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b);
+
+ if (cfs_b->runtime > 0) {
+ amount = min(cfs_b->runtime, min_amount);
+ cfs_b->runtime -= amount;
+ cfs_b->idle = 0;
+ }
}
raw_spin_unlock(&cfs_b->lock);
@@ -1337,6 +1344,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
__account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+ int idle = 1;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* no need to continue the timer with no bandwidth constraint */
+ if (cfs_b->quota == RUNTIME_INF)
+ goto out_unlock;
+
+ idle = cfs_b->idle;
+ cfs_b->runtime = cfs_b->quota;
+
+ /* mark as potentially idle for the upcoming period */
+ cfs_b->idle = 1;
+out_unlock:
+ if (idle)
+ cfs_b->timer_active = 0;
+ raw_spin_unlock(&cfs_b->lock);
+
+ return idle;
+}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
--
1.7.0.4
@@ -0,0 +1,208 @@
From bfd5537a5bca64bb37c64b3156bdbb85dbd46fae Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:32 -0700
Subject: [PATCH 07/19] sched: Expire invalid runtime
Since quota is managed using a global state but consumed on a per-cpu basis
we need to ensure that our per-cpu state is appropriately synchronized.
Most importantly, runtime that is stale (from a previous period) should not be
locally consumable.
We take advantage of existing sched_clock synchronization about the jiffy to
efficiently detect whether we have (globally) crossed a quota boundary above.
One catch is that the direction of spread on sched_clock is undefined,
specifically, we don't know whether our local clock is behind or ahead
of the one responsible for the current expiration time.
Fortunately we can differentiate these by considering whether the
global deadline has advanced. If it has not, then we assume our clock to be
"fast" and advance our local expiration; otherwise, we know the deadline has
truly passed and we expire our local runtime.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 4 ++-
kernel/sched_fair.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 84 insertions(+), 10 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4bb2d63..6a0bcd5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -253,6 +253,7 @@ struct cfs_bandwidth {
ktime_t period;
u64 quota, runtime;
s64 hierarchal_quota;
+ u64 runtime_expires;
int idle, timer_active;
struct hrtimer period_timer;
@@ -393,6 +394,7 @@ struct cfs_rq {
#endif
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
+ u64 runtime_expires;
s64 runtime_remaining;
#endif
#endif
@@ -9152,8 +9154,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
- cfs_b->runtime = quota;
+ __refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
if (runtime_enabled && cfs_b->timer_active) {
/* force a reprogram */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e34c26c..a97d19e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1294,11 +1294,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
}
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+ u64 now;
+
+ if (cfs_b->quota == RUNTIME_INF)
+ return;
+
+ now = sched_clock_cpu(smp_processor_id());
+ cfs_b->runtime = cfs_b->quota;
+ cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 amount = 0, min_amount, expires;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -1307,9 +1326,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
- /* ensure bandwidth timer remains active under consumption */
- if (!cfs_b->timer_active)
+ /*
+ * If the bandwidth pool has become inactive, then at least one
+ * period must have elapsed since the last consumption.
+ * Refresh the global state and ensure bandwidth timer becomes
+ * active.
+ */
+ if (!cfs_b->timer_active) {
+ __refill_cfs_bandwidth_runtime(cfs_b);
__start_cfs_bandwidth(cfs_b);
+ }
if (cfs_b->runtime > 0) {
amount = min(cfs_b->runtime, min_amount);
@@ -1317,19 +1343,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
+ expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
+ /*
+ * we may have advanced our local expiration to account for allowed
+ * spread between our sched_clock and the one on which runtime was
+ * issued.
+ */
+ if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ cfs_rq->runtime_expires = expires;
}
-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled)
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct rq *rq = rq_of(cfs_rq);
+
+ /* if the deadline is ahead of our clock, nothing to do */
+ if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+ return;
+
+ if (cfs_rq->runtime_remaining < 0)
return;
+ /*
+ * If the local deadline has passed we have to consider the
+ * possibility that our sched_clock is 'fast' and the global deadline
+ * has not truly expired.
+ *
+ * Fortunately we can determine whether this is the case by checking
+ * whether the global deadline has advanced.
+ */
+
+ if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ /* extend local deadline, drift is bounded above by 2 ticks */
+ cfs_rq->runtime_expires += TICK_NSEC;
+ } else {
+ /* global deadline is ahead, expiration has passed */
+ cfs_rq->runtime_remaining = 0;
+ }
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ unsigned long delta_exec)
+{
+ /* dock delta_exec before expiring quota (as it could span periods) */
cfs_rq->runtime_remaining -= delta_exec;
- if (cfs_rq->runtime_remaining > 0)
+ expire_cfs_rq_runtime(cfs_rq);
+
+ if (likely(cfs_rq->runtime_remaining > 0))
return;
assign_cfs_rq_runtime(cfs_rq);
@@ -1360,7 +1428,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
goto out_unlock;
idle = cfs_b->idle;
- cfs_b->runtime = cfs_b->quota;
+ /* if we're going inactive then everything else can be deferred */
+ if (idle)
+ goto out_unlock;
+
+ __refill_cfs_bandwidth_runtime(cfs_b);
+
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
@@ -1579,7 +1652,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
return wl;
}
-
#else
static inline unsigned long effective_load(struct task_group *tg, int cpu,
--
1.7.0.4
@@ -0,0 +1,234 @@
From 726bbbeef1579f5f981d2d98afda0304197b7e19 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:33 -0700
Subject: [PATCH 08/19] sched: Add support for throttling group entities
Now that consumption is tracked (via update_curr()) we add support to throttle
group entities (and their corresponding cfs_rqs) in the case where there is no
run-time remaining.
Throttled entities are dequeued to prevent scheduling, additionally we mark
them as throttled (using cfs_rq->throttled) to prevent them from becoming
re-enqueued until they are unthrottled. A list of a task_group's throttled
entities is maintained on the cfs_bandwidth structure.
Note: While the machinery for throttling is added in this patch the act of
throttling an entity exceeding its bandwidth is deferred until later within
the series.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 7 ++++
kernel/sched_fair.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 92 insertions(+), 4 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6a0bcd5..d631e42 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -257,6 +257,8 @@ struct cfs_bandwidth {
int idle, timer_active;
struct hrtimer period_timer;
+ struct list_head throttled_cfs_rq;
+
#endif
};
@@ -396,6 +398,9 @@ struct cfs_rq {
int runtime_enabled;
u64 runtime_expires;
s64 runtime_remaining;
+
+ int throttled;
+ struct list_head throttled_list;
#endif
#endif
};
@@ -438,6 +443,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
cfs_b->quota = RUNTIME_INF;
cfs_b->period = ns_to_ktime(default_cfs_period());
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
}
@@ -445,6 +451,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
}
/* requires cfs_b->lock, may release to reprogram timer */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a97d19e..f6823e2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1313,7 +1313,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
-static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
@@ -1354,6 +1355,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
*/
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
cfs_rq->runtime_expires = expires;
+
+ return cfs_rq->runtime_remaining > 0;
}
/*
@@ -1400,7 +1403,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
if (likely(cfs_rq->runtime_remaining > 0))
return;
- assign_cfs_rq_runtime(cfs_rq);
+ /*
+ * if we're unable to extend our runtime we resched so that the active
+ * hierarchy can be throttled
+ */
+ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+ resched_task(rq_of(cfs_rq)->curr);
}
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
@@ -1412,6 +1420,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
__account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->throttled;
+}
+
+static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ long task_delta, dequeue = 1;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ /* account load preceding throttle */
+ update_cfs_load(cfs_rq, 0);
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ break;
+
+ if (dequeue)
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ qcfs_rq->h_nr_running -= task_delta;
+
+ if (qcfs_rq->load.weight)
+ dequeue = 0;
+ }
+
+ if (!se)
+ rq->nr_running -= task_delta;
+
+ cfs_rq->throttled = 1;
+ raw_spin_lock(&cfs_b->lock);
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ raw_spin_unlock(&cfs_b->lock);
+}
+
/*
* Responsible for refilling a task_group's bandwidth and unthrottling its
* cfs_rqs as appropriate. If there has been no activity within the last
@@ -1447,6 +1496,11 @@ out_unlock:
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
#endif
/**************************************************
@@ -1525,7 +1579,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running increment below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
cfs_rq->h_nr_running++;
+
flags = ENQUEUE_WAKEUP;
}
@@ -1533,11 +1597,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
}
- inc_nr_running(rq);
+ if (!se)
+ inc_nr_running(rq);
hrtick_update(rq);
}
@@ -1557,6 +1625,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
+
+ /*
+ * end evaluation on encountering a throttled cfs_rq
+ *
+ * note: in the case of encountering a throttled cfs_rq we will
+ * post the final h_nr_running decrement below.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ break;
cfs_rq->h_nr_running--;
/* Don't dequeue parent if it has other entities besides us */
@@ -1579,11 +1656,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+
update_cfs_load(cfs_rq, 0);
update_cfs_shares(cfs_rq);
}
- dec_nr_running(rq);
+ if (!se)
+ dec_nr_running(rq);
hrtick_update(rq);
}
--
1.7.0.4
@@ -0,0 +1,197 @@
From b5898b8474a236451416cc68b2bea413c533f095 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:34 -0700
Subject: [PATCH 09/19] sched: Add support for unthrottling group entities
At the start of each period we refresh the global bandwidth pool. At this time
we must also unthrottle any cfs_rq entities who are now within bandwidth once
more (as quota permits).
Unthrottled entities have their corresponding cfs_rq->throttled flag cleared
and their entities re-enqueued.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 3 +
kernel/sched_fair.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 126 insertions(+), 4 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index d631e42..4b54a73 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9178,6 +9178,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
raw_spin_lock_irq(&rq->lock);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
+
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
out_unlock:
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f6823e2..21e1c02 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1461,6 +1461,84 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
}
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ struct sched_entity *se;
+ int enqueue = 1;
+ long task_delta;
+
+ se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+ cfs_rq->throttled = 0;
+ raw_spin_lock(&cfs_b->lock);
+ list_del_rcu(&cfs_rq->throttled_list);
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!cfs_rq->load.weight)
+ return;
+
+ task_delta = cfs_rq->h_nr_running;
+ for_each_sched_entity(se) {
+ if (se->on_rq)
+ enqueue = 0;
+
+ cfs_rq = cfs_rq_of(se);
+ if (enqueue)
+ enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ cfs_rq->h_nr_running += task_delta;
+
+ if (cfs_rq_throttled(cfs_rq))
+ break;
+ }
+
+ if (!se)
+ rq->nr_running += task_delta;
+
+ /* determine whether we need to wake up potentially idle cpu */
+ if (rq->curr == rq->idle && rq->cfs.nr_running)
+ resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+ u64 remaining, u64 expires)
+{
+ struct cfs_rq *cfs_rq;
+ u64 runtime = remaining;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+ throttled_list) {
+ struct rq *rq = rq_of(cfs_rq);
+
+ raw_spin_lock(&rq->lock);
+ if (!cfs_rq_throttled(cfs_rq))
+ goto next;
+
+ runtime = -cfs_rq->runtime_remaining + 1;
+ if (runtime > remaining)
+ runtime = remaining;
+ remaining -= runtime;
+
+ cfs_rq->runtime_remaining += runtime;
+ cfs_rq->runtime_expires = expires;
+
+ /* we check whether we're throttled above */
+ if (cfs_rq->runtime_remaining > 0)
+ unthrottle_cfs_rq(cfs_rq);
+
+next:
+ raw_spin_unlock(&rq->lock);
+
+ if (!remaining)
+ break;
+ }
+ rcu_read_unlock();
+
+ return remaining;
+}
+
/*
* Responsible for refilling a task_group's bandwidth and unthrottling its
* cfs_rqs as appropriate. If there has been no activity within the last
@@ -1469,23 +1547,64 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
*/
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
- int idle = 1;
+ u64 runtime, runtime_expires;
+ int idle = 1, throttled;
raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
goto out_unlock;
- idle = cfs_b->idle;
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ /* idle depends on !throttled (for the case of a large deficit) */
+ idle = cfs_b->idle && !throttled;
+
/* if we're going inactive then everything else can be deferred */
if (idle)
goto out_unlock;
__refill_cfs_bandwidth_runtime(cfs_b);
+ if (!throttled) {
+ /* mark as potentially idle for the upcoming period */
+ cfs_b->idle = 1;
+ goto out_unlock;
+ }
+
+ /*
+ * There are throttled entities so we must first use the new bandwidth
+ * to unthrottle them before making it generally available. This
+ * ensures that all existing debts will be paid before a new cfs_rq is
+ * allowed to run.
+ */
+ runtime = cfs_b->runtime;
+ runtime_expires = cfs_b->runtime_expires;
+ cfs_b->runtime = 0;
+
+ /*
+ * This check is repeated as we are holding onto the new bandwidth
+ * while we unthrottle. This can potentially race with an unthrottled
+ * group trying to acquire new bandwidth from the global pool.
+ */
+ while (throttled && runtime > 0) {
+ raw_spin_unlock(&cfs_b->lock);
+ /* we can't nest cfs_b->lock while distributing bandwidth */
+ runtime = distribute_cfs_runtime(cfs_b, runtime,
+ runtime_expires);
+ raw_spin_lock(&cfs_b->lock);
+
+ throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ }
- /* mark as potentially idle for the upcoming period */
- cfs_b->idle = 1;
+ /* return (any) remaining runtime */
+ cfs_b->runtime = runtime;
+ /*
+ * While we are ensured activity in the period following an
+ * unthrottle, this also covers the case in which the new bandwidth is
+ * insufficient to cover the existing bandwidth deficit. (Forcing the
+ * timer to remain active while there are any throttled entities.)
+ */
+ cfs_b->idle = 0;
out_unlock:
if (idle)
cfs_b->timer_active = 0;
--
1.7.0.4
@@ -0,0 +1,136 @@
From b152339efae7eb1bdd9ec4e626121e9205299e9d Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:35 -0700
Subject: [PATCH 10/19] sched: Allow for positional tg_tree walks
Extend walk_tg_tree to accept a positional argument
static int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
Existing semantics are preserved, caller must hold rcu_lock() or sufficient
analogue.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 50 +++++++++++++++++++++++++++++++++++++-------------
1 files changed, 37 insertions(+), 13 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4b54a73..813a4ce 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1585,20 +1585,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
typedef int (*tg_visitor)(struct task_group *, void *);
/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
*/
-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
int ret;
- rcu_read_lock();
- parent = &root_task_group;
+ parent = from;
+
down:
ret = (*down)(parent, data);
if (ret)
- goto out_unlock;
+ goto out;
list_for_each_entry_rcu(child, &parent->children, siblings) {
parent = child;
goto down;
@@ -1607,19 +1610,29 @@ up:
continue;
}
ret = (*up)(parent, data);
- if (ret)
- goto out_unlock;
+ if (ret || parent == from)
+ goto out;
child = parent;
parent = parent->parent;
if (parent)
goto up;
-out_unlock:
- rcu_read_unlock();
-
+out:
return ret;
}
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
static int tg_nop(struct task_group *tg, void *data)
{
return 0;
@@ -8856,13 +8869,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{
+ int ret;
+
struct rt_schedulable_data data = {
.tg = tg,
.rt_period = period,
.rt_runtime = runtime,
};
- return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
}
static int tg_set_rt_bandwidth(struct task_group *tg,
@@ -9319,6 +9338,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
{
+ int ret;
struct cfs_schedulable_data data = {
.tg = tg,
.period = period,
@@ -9330,7 +9350,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
do_div(data.quota, NSEC_PER_USEC);
}
- return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+ rcu_read_lock();
+ ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+ rcu_read_unlock();
+
+ return ret;
}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
--
1.7.0.4
@@ -0,0 +1,230 @@
From b7c5f316287ea56ecbc755110eaa032c588f2374 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:36 -0700
Subject: [PATCH 11/19] sched: Prevent interactions with throttled entities
From the perspective of load-balance and shares distribution, throttled
entities should be invisible.
However, both of these operations work on 'active' lists and are not
inherently aware of what group hierarchies may be present. In some cases this
may be side-stepped (e.g. we could sideload via tg_load_down in load balance)
while in others (e.g. update_shares()) it is more difficult to compute without
incurring some O(n^2) costs.
Instead, track hierarchical throttled state at time of transition. This
allows us to easily identify whether an entity belongs to a throttled hierarchy
and avoid incorrect interactions with it.
Also, when an entity leaves a throttled hierarchy we need to advance its
time averaging for shares averaging so that the elapsed throttled time is not
considered as part of the cfs_rq's operation.
We also use this information to prevent buddy interactions in the wakeup and
yield_to() paths.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 2 +-
kernel/sched_fair.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++---
2 files changed, 98 insertions(+), 7 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 813a4ce..523464e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -399,7 +399,7 @@ struct cfs_rq {
u64 runtime_expires;
s64 runtime_remaining;
- int throttled;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif
#endif
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 21e1c02..3d7430b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -725,6 +725,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
# ifdef CONFIG_SMP
static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
int global_update)
@@ -747,7 +749,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
u64 now, delta;
unsigned long load = cfs_rq->load.weight;
- if (cfs_rq->tg == &root_task_group)
+ if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
return;
now = rq_of(cfs_rq)->clock_task;
@@ -856,7 +858,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
tg = cfs_rq->tg;
se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se)
+ if (!se || throttled_hierarchy(cfs_rq))
return;
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
@@ -1425,6 +1427,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
return cfs_rq->throttled;
}
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+ src_cfs_rq = tg->cfs_rq[src_cpu];
+ dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+ return throttled_hierarchy(src_cfs_rq) ||
+ throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+ if (!cfs_rq->throttle_count) {
+ u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+ /* leaving throttled state, advance shares averaging windows */
+ cfs_rq->load_stamp += delta;
+ cfs_rq->load_last += delta;
+
+ /* update entity weight now that we are on_rq again */
+ update_cfs_shares(cfs_rq);
+ }
+#endif
+
+ return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+ struct rq *rq = data;
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+ /* group is entering throttled state, record last load */
+ if (!cfs_rq->throttle_count)
+ update_cfs_load(cfs_rq, 0);
+ cfs_rq->throttle_count++;
+
+ return 0;
+}
+
static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -1435,7 +1496,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
/* account load preceding throttle */
- update_cfs_load(cfs_rq, 0);
+ rcu_read_lock();
+ walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+ rcu_read_unlock();
task_delta = cfs_rq->h_nr_running;
for_each_sched_entity(se) {
@@ -1476,6 +1539,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
+ update_rq_clock(rq);
+ /* update hierarchical throttle state */
+ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
if (!cfs_rq->load.weight)
return;
@@ -1620,6 +1687,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
return 0;
}
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+ int src_cpu, int dest_cpu)
+{
+ return 0;
+}
#endif
/**************************************************
@@ -2521,6 +2599,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
for_each_leaf_cfs_rq(busiest, cfs_rq) {
list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p),
+ busiest->cpu, this_cpu))
+ break;
if (!can_migrate_task(p, busiest, this_cpu,
sd, idle, &pinned))
@@ -2632,8 +2713,17 @@ static void update_shares(int cpu)
struct rq *rq = cpu_rq(cpu);
rcu_read_lock();
- for_each_leaf_cfs_rq(rq, cfs_rq)
+ /*
+ * Iterates the task_group tree in a bottom up fashion, see
+ * list_add_leaf_cfs_rq() for details.
+ */
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ /* throttled entities do not contribute to load */
+ if (throttled_hierarchy(cfs_rq))
+ continue;
+
update_shares_cpu(cfs_rq->tg, cpu);
+ }
rcu_read_unlock();
}
@@ -2657,9 +2747,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
u64 rem_load, moved_load;
/*
- * empty group
+ * empty group or part of a throttled hierarchy
*/
- if (!busiest_cfs_rq->task_weight)
+ if (!busiest_cfs_rq->task_weight ||
+ throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
continue;
rem_load = (u64)rem_load_move * busiest_weight;
--
1.7.0.4
@@ -0,0 +1,65 @@
From 41f8be245b607e16567c13f6be065084b73c4977 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:37 -0700
Subject: [PATCH 12/19] sched: Prevent buddy interactions with throttled entities
Buddies allow us to select "on-rq" entities without actually selecting them
from a cfs_rq's rb_tree. As a result we must ensure that throttled entities
are not falsely nominated as buddies. The fact that entities are dequeued
within throttle_entity is not sufficient for clearing buddy status as the
nomination may occur after throttling.
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.886850167@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched_fair.c | 18 +++++++++++++++++-
1 files changed, 17 insertions(+), 1 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3d7430b..3c0120e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2372,6 +2372,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (unlikely(se == pse))
return;
+ /*
+ * This is possible from callers such as pull_task(), in which we
+ * unconditionally check_preempt_curr() after an enqueue (which may have
+ * led to a throttle). This both saves work and prevents false
+ * next-buddy nomination below.
+ */
+ if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+ return;
+
if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
next_buddy_marked = 1;
@@ -2380,6 +2389,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
/*
* We can come here with TIF_NEED_RESCHED already set from new task
* wake up path.
+ *
+ * Note: this also catches the edge-case of curr being in a throttled
+ * group (e.g. via set_curr_task), since update_curr() (in the
+ * enqueue of curr) will have resulted in resched being set. This
+ * prevents us from potentially nominating it as a false LAST_BUDDY
+ * below.
*/
if (test_tsk_need_resched(curr))
return;
@@ -2502,7 +2517,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
{
struct sched_entity *se = &p->se;
- if (!se->on_rq)
+ /* throttled hierarchies are not runnable */
+ if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
return false;
/* Tell the scheduler that we'd really like pse to run next. */
--
1.7.0.4
@@ -0,0 +1,69 @@
From 7af3c5930e241d0bddc028e3d05ad396c32689be Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:38 -0700
Subject: [PATCH 13/19] sched: Migrate throttled tasks on HOTPLUG
Throttled tasks are invisible to cpu-offline since they are not eligible for
selection by pick_next_task(). The regular 'escape' path for a thread that is
blocked at offline is via ttwu->select_task_rq, however this will not handle a
throttled group since there are no individual thread wakeups on an unthrottle.
Resolve this by unthrottling offline cpus so that threads can be migrated.
Signed-off-by: Paul Turner <pjt@google.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 27 +++++++++++++++++++++++++++
1 files changed, 27 insertions(+), 0 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 523464e..7b99d63 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6310,6 +6310,30 @@ static void calc_global_load_remove(struct rq *rq)
rq->calc_load_active = 0;
}
+#ifdef CONFIG_CFS_BANDWIDTH
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ if (!cfs_rq->runtime_enabled)
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = cfs_b->quota;
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+ }
+}
+#else
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#endif
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -6335,6 +6359,9 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
+
for ( ; ; ) {
/*
* There's this thread running, bail when that's the only
--
1.7.0.4
@@ -0,0 +1,133 @@
From d71d241613f903255fec91ba9f959a633c724b4e Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:39 -0700
Subject: [PATCH 14/19] sched: Throttle entities exceeding their allowed bandwidth
With the machinery in place to throttle and unthrottle entities, as well as
handle their participation (or lack thereof) we can now enable throttling.
There are 2 points at which we must check whether it's time to set throttled state:
put_prev_entity() and enqueue_entity().
- put_prev_entity() is the typical throttle path, we reach it by exceeding our
allocated run-time within update_curr()->account_cfs_rq_runtime() and going
through a reschedule.
- enqueue_entity() covers the case of a wake-up into an already throttled
group. In this case we know the group cannot be on_rq and can throttle
immediately. Checks are added at time of put_prev_entity() and
enqueue_entity()
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.091415417@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched_fair.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 50 insertions(+), 2 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3c0120e..831a300 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -989,6 +989,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
se->vruntime = vruntime;
}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -1018,8 +1020,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
list_add_leaf_cfs_rq(cfs_rq);
+ check_enqueue_throttle(cfs_rq);
+ }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -1224,6 +1228,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
/*
@@ -1233,6 +1239,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
if (prev->on_rq)
update_curr(cfs_rq);
+ /* throttle cfs_rqs exceeding runtime */
+ check_cfs_rq_runtime(cfs_rq);
+
check_spread(cfs_rq, prev);
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
@@ -1486,7 +1495,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}
-static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1679,9 +1688,48 @@ out_unlock:
return idle;
}
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+ /* an active group must be handled by the update_curr()->put() path */
+ if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+ return;
+
+ /* ensure the group is not already throttled */
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ /* update runtime allocation */
+ account_cfs_rq_runtime(cfs_rq, 0);
+ if (cfs_rq->runtime_remaining <= 0)
+ throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+ return;
+
+ /*
+ * it's possible for a throttled entity to be forced into a running
+ * state (e.g. set_curr_task), in this case we're finished.
+ */
+ if (cfs_rq_throttled(cfs_rq))
+ return;
+
+ throttle_cfs_rq(cfs_rq);
+}
#else
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
--
1.7.0.4
@@ -0,0 +1,125 @@
From 11db1560b4dec193a20e2c78fb8238d9f14a1782 Mon Sep 17 00:00:00 2001
From: Nikhil Rao <ncrao@google.com>
Date: Thu, 21 Jul 2011 09:43:40 -0700
Subject: [PATCH 15/19] sched: Add exports tracking cfs bandwidth control statistics
This change introduces statistics exports for the cpu sub-system, these are
added through the use of a stat file similar to that exported by other
subsystems.
The following exports are included:
nr_periods: number of periods in which execution occurred
nr_throttled: the number of periods above in which execution was throttled
throttled_time: cumulative wall-time that any cpus have been throttled for
this group
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Nikhil Rao <ncrao@google.com>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 21 +++++++++++++++++++++
kernel/sched_fair.c | 7 +++++++
2 files changed, 28 insertions(+), 0 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 7b99d63..08d3aa0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -259,6 +259,9 @@ struct cfs_bandwidth {
struct hrtimer period_timer;
struct list_head throttled_cfs_rq;
+ /* statistics */
+ int nr_periods, nr_throttled;
+ u64 throttled_time;
#endif
};
@@ -399,6 +402,7 @@ struct cfs_rq {
u64 runtime_expires;
s64 runtime_remaining;
+ u64 throttled_timestamp;
int throttled, throttle_count;
struct list_head throttled_list;
#endif
@@ -9383,6 +9387,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
+
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct task_group *tg = cgroup_tg(cgrp);
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+ cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+ cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+ cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+ return 0;
+}
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -9429,6 +9446,10 @@ static struct cftype cpu_files[] = {
.read_u64 = cpu_cfs_period_read_u64,
.write_u64 = cpu_cfs_period_write_u64,
},
+ {
+ .name = "stat",
+ .read_map = cpu_stats_show,
+ },
#endif
#ifdef CONFIG_RT_GROUP_SCHED
{
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 831a300..2060fc9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1528,6 +1528,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
rq->nr_running -= task_delta;
cfs_rq->throttled = 1;
+ cfs_rq->throttled_timestamp = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
raw_spin_unlock(&cfs_b->lock);
@@ -1545,8 +1546,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
raw_spin_lock(&cfs_b->lock);
+ cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
list_del_rcu(&cfs_rq->throttled_list);
raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled_timestamp = 0;
update_rq_clock(rq);
/* update hierarchical throttle state */
@@ -1634,6 +1637,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
/* idle depends on !throttled (for the case of a large deficit) */
idle = cfs_b->idle && !throttled;
+ cfs_b->nr_periods += overrun;
/* if we're going inactive then everything else can be deferred */
if (idle)
@@ -1647,6 +1651,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
goto out_unlock;
}
+ /* account preceding periods in which throttling occurred */
+ cfs_b->nr_throttled += overrun;
+
/*
* There are throttled entities so we must first use the new bandwidth
* to unthrottle them before making it generally available. This
--
1.7.0.4
@@ -0,0 +1,252 @@
From 9baa7b654e1527bfec8f413f7372de6c4aeebb6a Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Thu, 21 Jul 2011 09:43:41 -0700
Subject: [PATCH 16/19] sched: Return unused runtime on group dequeue
When a local cfs_rq blocks we return the majority of its remaining quota to the
global bandwidth pool for use by other runqueues.
We do this only when the quota is current and there is more than
min_cfs_rq_runtime [1ms by default] of runtime remaining on the rq.
In the case where there are throttled runqueues and we have sufficient
bandwidth to meter out a slice, a second timer is kicked off to handle this
delivery, unthrottling where appropriate.
Using a 'worst case' antagonist which executes on each cpu
for 1ms before moving onto the next on a fairly large machine:
no quota generations:
197.47 ms /cgroup/a/cpuacct.usage
199.46 ms /cgroup/a/cpuacct.usage
205.46 ms /cgroup/a/cpuacct.usage
198.46 ms /cgroup/a/cpuacct.usage
208.39 ms /cgroup/a/cpuacct.usage
Since we are allowed to use "stale" quota our usage is effectively bounded by
the rate of input into the global pool and performance is relatively stable.
with quota generations [1s increments]:
119.58 ms /cgroup/a/cpuacct.usage
119.65 ms /cgroup/a/cpuacct.usage
119.64 ms /cgroup/a/cpuacct.usage
119.63 ms /cgroup/a/cpuacct.usage
119.60 ms /cgroup/a/cpuacct.usage
The large deficit here is due to quota generations (/intentionally/) preventing
us from now using previously stranded slack quota. The cost is that this quota
becomes unavailable.
with quota generations and quota return:
200.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
198.09 ms /cgroup/a/cpuacct.usage
200.09 ms /cgroup/a/cpuacct.usage
200.06 ms /cgroup/a/cpuacct.usage
By returning unused quota we're able to both stably consume our desired quota
and prevent unintentional overages due to the abuse of slack quota from
previous quota periods (especially on a large machine).
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/sched.c | 15 +++++++-
kernel/sched_fair.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 1 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 08d3aa0..8be4ca2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -256,7 +256,7 @@ struct cfs_bandwidth {
u64 runtime_expires;
int idle, timer_active;
- struct hrtimer period_timer;
+ struct hrtimer period_timer, slack_timer;
struct list_head throttled_cfs_rq;
/* statistics */
@@ -418,6 +418,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 default_cfs_period(void);
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
@@ -450,6 +460,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -485,6 +497,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
}
#else
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2060fc9..edf3b3e 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1071,6 +1071,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
__clear_buddies_skip(se);
}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -1109,6 +1111,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+ /* return excess runtime on last dequeue */
+ return_cfs_rq_runtime(cfs_rq);
+
update_min_vruntime(cfs_rq);
update_cfs_shares(cfs_rq);
}
@@ -1696,6 +1701,108 @@ out_unlock:
return idle;
}
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	s64 remaining;	/* signed: ktime_to_ns() goes negative once overdue */
+
+	/* if the call-back is running a quota refresh is already occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur (or already overdue)? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < (s64)min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+ /* if there's a quota refresh soon don't bother with slack */
+ if (runtime_refresh_within(cfs_b, min_left))
+ return;
+
+ start_bandwidth_timer(&cfs_b->slack_timer,
+ ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+ if (slack_runtime <= 0)
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF &&
+ cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+ cfs_b->runtime += slack_runtime;
+
+ /* we are under rq->lock, defer unthrottling using a timer */
+ if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+ !list_empty(&cfs_b->throttled_cfs_rq))
+ start_cfs_slack_bandwidth(cfs_b);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ /* even if it's not valid for return we don't want to try again */
+ cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	/* only hand quota back once the last entity has left this cfs_rq */
+	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+		return;
+	__return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+ u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+ u64 expires;
+
+ /* confirm we're still not at a refresh boundary */
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ return;
+
+ raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ runtime = cfs_b->runtime;
+ cfs_b->runtime = 0;
+ }
+ expires = cfs_b->runtime_expires;
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!runtime)
+ return;
+
+ runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+ raw_spin_lock(&cfs_b->lock);
+ if (expires == cfs_b->runtime_expires)
+ cfs_b->runtime = runtime;
+ raw_spin_unlock(&cfs_b->lock);
+}
+
/*
* When a group wakes up we want to make sure that its quota is not already
* expired/exceeded, otherwise it may be allowed to steal additional ticks of
@@ -1737,6 +1844,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
--
1.7.0.4
@@ -0,0 +1,148 @@
From d5edecf4b5298b11f6d39e3029b7620ee06640e7 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Thu, 21 Jul 2011 09:43:43 -0700
Subject: [PATCH 17/19] sched: Add documentation for bandwidth control
Basic description of usage and effect for CFS Bandwidth Control.
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110721184758.498036116@google.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
Documentation/scheduler/sched-bwc.txt | 122 +++++++++++++++++++++++++++++++++
1 files changed, 122 insertions(+), 0 deletions(-)
create mode 100644 Documentation/scheduler/sched-bwc.txt
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 0000000..f6b1873
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+ The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed with quota units
+above at each period boundary. As threads consume this bandwidth it is
+transferred to cpu-local "silos" on a demand basis. The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are:
+ cpu.cfs_period_us=100ms
+ cpu.cfs_quota_us=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place, such a group is described as an unconstrained
+bandwidth group. This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
+The minimum value allowed for the quota or period is 1ms. There is also an
+upper bound on the period length of 1s. Additional restrictions exist when
+bandwidth limits are used in a hierarchical fashion, these are explained in
+more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency run-time is transferred between the global pool and CPU local
+"silos" in a batch fashion. This greatly reduces global accounting pressure
+on large systems. The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs:
+ /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+ of the group have been throttled.
+
+This interface is read-only.
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy.
+ e.g. \Sum (c_i) may exceed C
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+
+There are two ways in which a group may become throttled:
+ a. it fully consumes its own quota within a period
+ b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining it will not
+be allowed to run until the parent's runtime is refreshed.
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime.
+
+ If period is 250ms and quota is also 250ms, the group will get
+ 1 CPU worth of runtime every 250ms.
+
+ # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
+ # echo 250000 > cpu.cfs_period_us /* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
+
+ With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
+ runtime every 500ms.
+
+ # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
+ # echo 500000 > cpu.cfs_period_us /* period = 500ms */
+
+ The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+ With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
+
+ # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
+ # echo 50000 > cpu.cfs_period_us /* period = 50ms */
+
+ By using a small period here we are ensuring a consistent latency
+ response at the expense of burst capacity.
+
--
1.7.0.4
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,437 @@
From 46394a392b85376e0c17a4f84e2468a0b62ec5b6 Mon Sep 17 00:00:00 2001
From: Vimalkumar <j.vimal+nf@gmail.com>
Date: Wed, 7 Sep 2011 14:17:32 -0700
Subject: [PATCH 19/19] dctcp patch
---
include/linux/sysctl.h | 3 +
include/linux/tcp.h | 10 +++
include/net/tcp.h | 3 +
kernel/sysctl_binary.c | 3 +
net/ipv4/sysctl_net_ipv4.c | 21 +++++
net/ipv4/tcp_input.c | 182 ++++++++++++++++++++++++++++++++++++++++----
net/ipv4/tcp_output.c | 19 +++++-
7 files changed, 225 insertions(+), 16 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 11684d9..fd8c73a 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -425,6 +425,9 @@ enum
NET_TCP_ALLOWED_CONG_CONTROL=123,
NET_TCP_MAX_SSTHRESH=124,
NET_TCP_FRTO_RESPONSE=125,
+ NET_TCP_DELAYED_ACK=126,
+ NET_TCP_DCTCP_ENABLE=127,
+ NET_TCP_DCTCP_SHIFT_G=128,
};
enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index e64f4c6..9d2ec1c 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -455,6 +455,16 @@ struct tcp_sock {
struct tcp_md5sig_info *md5sig_info;
#endif
+/* DCTCP Specific Parameters */
+ u32 acked_bytes_ecn;
+ u32 acked_bytes_total;
+ u32 prior_ack;
+ u32 prior_rcv_nxt;
+ u32 dctcp_alpha;
+ u32 next_seq;
+ u32 ce_state; /* 0: last pkt was non-ce , 1: last pkt was ce */
+ u32 delayed_ack_reserved;
+
/* When the cookie options are generated and exchanged, then this
* object holds a reference to them (cookie_values->kref). Also
* contains related tcp_cookie_transactions fields.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index cda30ea..b6b1480 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -223,6 +223,9 @@ extern int sysctl_tcp_max_orphans;
extern int sysctl_tcp_fack;
extern int sysctl_tcp_reordering;
extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_delayed_ack;
+extern int sysctl_tcp_dctcp_enable;
+extern int sysctl_tcp_dctcp_shift_g;
extern int sysctl_tcp_dsack;
extern long sysctl_tcp_mem[3];
extern int sysctl_tcp_wmem[3];
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 20dfc21..f232b5a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -373,6 +373,9 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_FACK, "tcp_fack" },
{ CTL_INT, NET_TCP_REORDERING, "tcp_reordering" },
{ CTL_INT, NET_TCP_ECN, "tcp_ecn" },
+ { CTL_INT, NET_TCP_DELAYED_ACK, "tcp_delayed_ack" },
+ { CTL_INT, NET_TCP_DCTCP_ENABLE, "tcp_dctcp_enable" },
+ { CTL_INT, NET_TCP_DCTCP_SHIFT_G, "tcp_dctcp_shift_g" },
{ CTL_INT, NET_TCP_DSACK, "tcp_dsack" },
{ CTL_INT, NET_TCP_MEM, "tcp_mem" },
{ CTL_INT, NET_TCP_WMEM, "tcp_wmem" },
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 57d0752..c896edf 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -440,6 +440,27 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "tcp_delayed_ack",
+ .data = &sysctl_tcp_delayed_ack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dctcp_enable",
+ .data = &sysctl_tcp_dctcp_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_dctcp_shift_g",
+ .data = &sysctl_tcp_dctcp_shift_g,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bef9f04..7b9829b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,13 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
+int sysctl_tcp_delayed_ack __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_delayed_ack);
+int sysctl_tcp_dctcp_enable __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_dctcp_enable);
+int sysctl_tcp_dctcp_shift_g __read_mostly = 4; /* g=1/2^4 */
+EXPORT_SYMBOL(sysctl_tcp_dctcp_shift_g);
+
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
#define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */
@@ -217,16 +224,70 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_dctcp_check_ce(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
{
if (tp->ecn_flags & TCP_ECN_OK) {
- if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
- /* Funny extension: if ECT is not set on a segment,
- * it is surely retransmit. It is not in ECN RFC,
- * but Linux follows this rule. */
- else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
- tcp_enter_quickack_mode((struct sock *)tp);
+ u32 temp_rcv_nxt;
+
+ if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) {
+
+ /* rcv_nxt has already been updated earlier (tcp_rcv_established) */
+
+ if(sysctl_tcp_dctcp_enable) {
+
+ /* state has changed from CE=0 to CE=1 && delayed ack has not been sent yet */
+ if(tp->ce_state == 0 && tp->delayed_ack_reserved) {
+
+ /* save current rcv_nxt */
+ temp_rcv_nxt = tp->rcv_nxt;
+ /* generate previous ack with CE=0 */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = tp->prior_rcv_nxt;
+ /* printk("CE=0 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt); */
+ tcp_send_ack(sk);
+ /* recover current rcv_nxt */
+ tp->rcv_nxt = temp_rcv_nxt;
+ }
+
+ tp->ce_state = 1;
+ }
+
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+
+
+ /* Funny extension: if ECT is not set on a segment,
+ * it is surely retransmit. It is not in ECN RFC,
+ * but Linux follows this rule. */
+ } else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) {
+ tcp_enter_quickack_mode((struct sock *)tp);
+ }else {
+ /* It has ECT but it doesn't have CE */
+
+ if(sysctl_tcp_dctcp_enable) {
+
+ if(tp->ce_state != 0 && tp->delayed_ack_reserved) {
+
+ /* save current rcv_nxt */
+ temp_rcv_nxt = tp->rcv_nxt;
+ /* generate previous ack with CE=1 */
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ tp->rcv_nxt = tp->prior_rcv_nxt;
+ /* printk("CE=1 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt); */
+ tcp_send_ack(sk);
+ /* recover current rcv_nxt */
+ tp->rcv_nxt = temp_rcv_nxt;
+ }
+
+ tp->ce_state = 0;
+
+ /* deassert only when DCTCP is enabled */
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ }
+
+ }
+
+ /* set current rcv_nxt to prior_rcv_nxt */
+ tp->prior_rcv_nxt = tp->rcv_nxt;
}
}
@@ -581,6 +642,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
*/
tcp_incr_quickack(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
+
+ tp->ce_state = 0;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
@@ -601,7 +664,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
}
icsk->icsk_ack.lrcvtime = now;
- TCP_ECN_check_ce(tp, skb);
+ TCP_ECN_dctcp_check_ce(sk, tp, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -827,19 +890,54 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
+ __u32 ssthresh_old;
+ __u32 cwnd_old;
+ __u32 cwnd_new;
+
tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
- if (set_ssthresh)
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + 1U);
+
+ if(!sysctl_tcp_dctcp_enable) {
+
+ if (set_ssthresh)
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp) + 1U);
+
+ }else {
+
+ cwnd_new = max (tp->snd_cwnd - ((tp->snd_cwnd * tp->dctcp_alpha)>>11) , 2U);
+
+ if(set_ssthresh) {
+
+ ssthresh_old = tp->snd_ssthresh;
+ tp->snd_ssthresh = cwnd_new;
+
+ /* printk("%llu alpha= %d ssth old= %d new= %d\n", */
+ /* ktime_to_us(ktime_get_real()), */
+ /* tp->dctcp_alpha, */
+ /* ssthresh_old, */
+ /* tp->snd_ssthresh); */
+ }
+
+ cwnd_old = tp->snd_cwnd;
+ tp->snd_cwnd = cwnd_new;
+
+ /* printk("%llu alpha= %d cwnd old= %d new= %d\n", */
+ /* ktime_to_us(ktime_get_real()), */
+ /* tp->dctcp_alpha, */
+ /* cwnd_old, */
+ /* tp->snd_cwnd); */
+ }
+
tp->snd_cwnd_cnt = 0;
tp->high_seq = tp->snd_nxt;
tp->snd_cwnd_stamp = tcp_time_stamp;
TCP_ECN_queue_cwr(tp);
-
+
tcp_set_ca_state(sk, TCP_CA_CWR);
}
}
@@ -2859,6 +2957,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
tcp_try_keep_open(sk);
tcp_moderate_cwnd(tp);
} else {
+ if(!sysctl_tcp_dctcp_enable)
tcp_cwnd_down(sk, flag);
}
}
@@ -3624,6 +3723,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
int prior_packets;
int frto_cwnd = 0;
+ __u32 alpha_old;
+ __u32 acked_bytes;
+
/* If the ack is older than previous acks
* then we can probably ignore it.
*/
@@ -3680,6 +3782,54 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
}
+ /* START: DCTCP Processing */
+
+ /* calc acked bytes */
+ if(after(ack,tp->prior_ack)) {
+ acked_bytes = ack - tp->prior_ack;
+ } else {
+
+ if(flag & FLAG_WIN_UPDATE) {
+ /* Don't count when it is Window Updated ACK */
+ acked_bytes = 0;
+ /* printk("acked_byte=0\n"); */
+ }else {
+ /* Count duplicate ACKs for Retransmission packets and so on as MSS size */
+ acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+ }
+ }
+
+ if(flag & FLAG_ECE)
+ tp->acked_bytes_ecn += acked_bytes;
+
+ tp->acked_bytes_total += acked_bytes;
+
+ tp->prior_ack = ack;
+
+ /* Expired RTT */
+ if (!before(tp->snd_una,tp->next_seq)) {
+
+ /* Avoid a zero denominator in the division below */
+ if(tp->acked_bytes_total == 0) tp->acked_bytes_total = 1;
+
+ alpha_old = tp->dctcp_alpha;
+
+ /* alpha = (1-g) * alpha + g * F */
+ tp->dctcp_alpha = alpha_old - (alpha_old >> sysctl_tcp_dctcp_shift_g)
+ + (tp->acked_bytes_ecn << (10 - sysctl_tcp_dctcp_shift_g)) / tp->acked_bytes_total;
+
+ if(tp->dctcp_alpha > 1024) tp->dctcp_alpha = 1024; /* round to 0-1024 */
+
+ /* printk("bytes_ecn= %d total= %d alpha: old= %d new= %d\n", */
+ /* tp->acked_bytes_ecn, tp->acked_bytes_total, alpha_old, tp->dctcp_alpha); */
+
+ tp->acked_bytes_ecn = 0;
+ tp->acked_bytes_total = 0;
+ tp->next_seq = tp->snd_nxt;
+ }
+
+ /* END: DCTCP Processing */
+
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
@@ -4480,7 +4630,7 @@ drop:
goto queue_and_out;
}
- TCP_ECN_check_ce(tp, skb);
+ TCP_ECN_dctcp_check_ce(sk, tp, skb);
if (tcp_try_rmem_schedule(sk, skb->truesize))
goto drop;
@@ -4931,6 +5081,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
__tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
+ /* Delayed ACK is disabled or ... */
+ sysctl_tcp_delayed_ack == 0 ||
/* We have out of order data. */
(ofo_possible && skb_peek(&tp->out_of_order_queue))) {
/* Then ack it now */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..2a4d1dc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -308,7 +308,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sysctl_tcp_ecn == 1) {
+ if (sysctl_tcp_ecn == 1 || sysctl_tcp_dctcp_enable) {
TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
@@ -878,6 +878,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (likely((tcb->flags & TCPHDR_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
+ /* In DCTCP, set the ECT bit on all packets */
+ if(sysctl_tcp_dctcp_enable)
+ INET_ECN_xmit(sk);
+
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -2624,6 +2628,11 @@ int tcp_connect(struct sock *sk)
tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
TCP_ECN_send_syn(sk, buff);
+ /* Initialize DCTCP internal parameters */
+ tp->next_seq = tp->snd_nxt;
+ tp->acked_bytes_ecn = 0;
+ tp->acked_bytes_total = 0;
+
/* Send it off. */
TCP_SKB_CB(buff)->when = tcp_time_stamp;
tp->retrans_stamp = TCP_SKB_CB(buff)->when;
@@ -2660,6 +2669,10 @@ void tcp_send_delayed_ack(struct sock *sk)
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
+ /* Delayed ACK reserved flag for DCTCP */
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->delayed_ack_reserved = 1;
+
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
@@ -2711,6 +2724,10 @@ void tcp_send_ack(struct sock *sk)
{
struct sk_buff *buff;
+ /* Delayed ACK reserved flag for DCTCP */
+ struct tcp_sock *tp = tcp_sk(sk);
+ tp->delayed_ack_reserved = 0;
+
/* If we have been reset, we may not send again. */
if (sk->sk_state == TCP_CLOSE)
return;
--
1.7.0.4