Add .config

Fix build error w/.config not there
Add 3.0 kernel build files
2012-06-03 00:49:20 -07:00 · 2012-06-03 05:34:58 +00:00 · 2012-06-03 03:51:08 +00:00
29 changed files with 17967 additions and 0 deletions
@@ -0,0 +1,439 @@
+#!/bin/bash
+# Builds kernel with the new CFS Bandwidth patches
+# and nsfd/setns syscall patches.
+# Also builds Open vSwitch against the built kernel version.
+# Script only to be run on 64-bit systems; needs a few changes to run on
+# 32-bit ones.
+
+# If building for i386 (-t), make sure to install the following 32-bit libs:
+# sudo apt-get install ia32-libs lib32gcc1 libc6-i386 util-linux devscripts
+
+# Check for uninitialized variables
+set -o nounset
+
+# Exit on any failure
+set -e
+
+# Location in which to download and build the kernel
+kdir=/usr/src
+
+# Kernel version to download
+kver=3.0.0
+
+# Save original directory for later.
+orig_dir=`pwd`
+
+# Default and custom kernel version string
+version_string=-with-cfs
+
+# Run menuconfig later?
+menuconfig=
+
+# Use localmodconfig?
+localmodconfig=
+
+# Build ubuntu kernel? must be 3.0.0 compatible
+ubuntu_release=
+ubuntu_default_release=ubuntu-oneiric
+ubuntu_base=3.0.0-14  # base tag and version for build
+ubuntu_tag=Ubuntu-$ubuntu_base.23
+ubuntu_flavor=generic
+#ubuntu_config=/boot/config-$ubuntu_base-$ubuntu_flavor
+ubuntu_config=${orig_dir}/config-3.0.9-with-cfs
+ubuntu_image=linux-image-$ubuntu_base-$ubuntu_flavor
+ubuntu_kver=3.0.9 # must match version that is actually built
+
+# OVS pkg string.  Not sure how to find this automatically.
+ovs_pkg_ver=1.2.0-1ubuntu3
+
+# Location of kernel config.  If not specified, use current .config.
+# was: ${orig_dir}/config-3.0.0-with-cfs
+kconfig=
+
+# Install only?
+install_only=
+
+# Use 32-bit?
+i386=
+
+function usage {
+    warn "Compiles kernel ${kver} with CBW, setns, and DCTCP patches in ${kdir}"
+    warn "Usage: build.sh [-huimlt] [-v 'versionstring']"
+    warn "-h help"
+    warn "-u build ubuntu kernel"
+    warn "-i install only (don't build)"
+    warn "-m use menuconfig"
+    warn "-l use localmodconfig"
+    warn "-v 'versionstring' use custom version string"
+    warn "-t build for i386 (32-bit)"
+}
+
+function parse_opts {
+    custom_version_string=
+    plus=
+    while getopts 'huimltv:' OPTION; do
+        case $OPTION in
+            h) usage; exit 0;;
+            u) ubuntu_release=$ubuntu_default_release;
+		kver=$ubuntu_kver; kconfig=$ubuntu_config;;
+	    i) install_only=true;;
+	    m) menuconfig=true;;
+	    l) localmodconfig=true;;
+	    v) custom_version_string=$OPTARG;;
+	    t) i386=true; plus=;;
+	    ?) usage; exit 1;;
+        esac
+    done
+    # Provide feedback which might be useful
+    if [[ "$custom_version_string" != "" ]]; then
+        warn "Using custom version_string: ${custom_version_string}"
+        version_string=$custom_version_string
+    else
+        warn "Using default version_string: ${version_string}"
+    fi
+    if [[ "$ubuntu_release" != "" ]]; then
+        warn "Building Ubuntu kernel for release ${ubuntu_release}"
+    fi
+}
+
+function warn {
+    # Echo the provided command in color text.
+    yellow='\e[0;33m' # Yellow
+    reset='\e[0m'
+    echo="echo -e"
+    if [ -n "${2+defined}" ]; then
+        echo="$echo $2"
+    fi
+    $echo "${yellow}$1${reset}"
+}
+
+# Make sure the tools needed to fetch and build the kernel are present,
+# offering to install the missing ones with apt-get.
+#
+# An empty answer, "Y" or "y" accepts the installation; anything else
+# declines it and the script carries on without the tool.
+function pre_check {
+    warn "Checking for git"
+    if ! command -v git > /dev/null 2>&1; then
+        warn "You need git to download kernel.  Install? [Y/n] " -n
+        read -r answer
+        # The operators need surrounding spaces: [[ $answer=="Y" ]] is a
+        # single non-empty word and therefore always true.
+        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
+            sudo apt-get install git
+        fi
+    fi
+
+    warn "Checking for kernel-package build utilities"
+    if ! command -v make-kpkg > /dev/null 2>&1; then
+        warn "You need kernel-package utilities to build the kernel.  Install? [Y/n] " -n
+        read -r answer
+        # A plain "if" (rather than "[[ ... ]] && cmd") keeps a declined
+        # install from becoming this function's non-zero return status,
+        # which would abort the script under "set -e".
+        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
+            sudo apt-get install kernel-package ncurses-dev
+        fi
+    fi
+}
+
+# Clone the kernel source into $srcdir unless that directory already exists.
+#
+# Sets the globals srcdir, archive and tag, which later steps read.  For the
+# non-Ubuntu build a second clone from github is attempted when the first
+# one fails; if nothing could be cloned the script exits with status 2.
+fetch_kernel() {
+    if [[ -z "$ubuntu_release" ]]; then
+        srcdir=$kdir/linux-$kver
+        archive=git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-tip.git
+        tag=3.0.0
+    else
+        warn "Pre-installing $ubuntu_image"
+        #sudo apt-get install $ubuntu_image
+        srcdir=$kdir/$ubuntu_release
+        archive=git://kernel.ubuntu.com/ubuntu/$ubuntu_release
+        tag=$ubuntu_tag
+    fi
+
+    if [[ -d $srcdir ]]; then
+        warn "Linux source exists in $srcdir, skipping.."
+        return
+    fi
+
+    warn "--> Fetching kernel $srcdir"
+    git clone $archive $srcdir && return
+
+    warn "Failed to fetch kernel from $archive"
+    if [[ -z "$ubuntu_release" ]]; then
+        warn "Trying github"
+        archive=git://github.com/torvalds/linux.git
+        git clone $archive $srcdir && return
+    fi
+
+    warn "Giving up."
+    exit 2
+}
+
+function work_around_kernel_package_bug {
+    warn "Applying workaround for kernel package bug..."
+    # Fix will likely break on any other kernel version, so watch out.
+    # From:
+    # https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
+    sed -i -s 's/echo "+"/#echo "+"/' $srcdir/scripts/setlocalversion
+}
+
+function copy_patches {
+    warn "Copying patches..."
+    rm -rf $srcdir/patches
+    cp -r ${orig_dir}/../../linux-3.0.0-patches/ $srcdir/patches
+}
+
+# Create a "mininet" branch from $tag in $srcdir and apply the copied patch
+# series to it.  If a mininet branch can already be checked out, the
+# patches are assumed to be applied and nothing else happens.
+apply_patches() {
+    cd $srcdir
+    if git checkout mininet ; then
+        # Assume mininet
+        warn "Mininet branch already exists - not applying patches"
+        return
+    fi
+    # Start from the requested tag when one is set.
+    [[ -z "$tag" ]] || git checkout $tag
+    git checkout -b mininet
+    warn "Applying patches..."
+    git am -3 patches/*.patch
+    work_around_kernel_package_bug
+}
+
+# lxc/ns and cfs configuration flags
+
+config_y='
+CONFIG_GROUP_SCHED
+CONFIG_FAIR_GROUP_SCHED
+CONFIG_RT_GROUP_SCHED
+CONFIG_CGROUP_SCHED
+CONFIG_CGROUPS
+CONFIG_CGROUP_FREEZER
+CONFIG_CGROUP_DEVICE
+CONFIG_SCHED_AUTOGROUP
+CONFIG_BLK_CGROUP
+CONFIG_CFQ_GROUP_IOSCHED
+CONFIG_CGROUP_PERF
+CONFIG_CPUSETS
+CONFIG_PROC_PID_CPUSET
+CONFIG_CGROUP_CPUACCT
+CONFIG_RESOURCE_COUNTERS
+CONFIG_CGROUP_MEM_RES_CTLR
+CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+CONFIG_MM_OWNER
+CONFIG_NAMESPACES
+CONFIG_UTS_NS
+CONFIG_IPC_NS
+CONFIG_USER_NS
+CONFIG_PID_NS
+CONFIG_NET_NS
+CONFIG_NET_CLS_CGROUP
+CONFIG_SECURITY_FILE_CAPABILITIES
+CONFIG_DEVPTS_MULTIPLE_INSTANCES
+CONFIG_VETH
+CONFIG_VLAN_8021Q
+CONFIG_MACVLAN
+CONFIG_CFS_BANDWIDTH
+CONFIG_NET_SCHED'
+
+config_m='
+CONFIG_BRIDGE
+CONFIG_NET_SCH_CBQ
+CONFIG_NET_SCH_HTB
+CONFIG_NET_SCH_HFSC
+CONFIG_NET_SCH_PRIO
+CONFIG_NET_SCH_MULTIQ
+CONFIG_NET_SCH_RED
+CONFIG_NET_SCH_SFB
+CONFIG_NET_SCH_SFQ
+CONFIG_NET_SCH_TEQL
+CONFIG_NET_SCH_TBF
+CONFIG_NET_SCH_GRED
+CONFIG_NET_SCH_DSMARK
+CONFIG_NET_SCH_NETEM
+CONFIG_NET_SCH_DRR
+CONFIG_NET_SCH_MQPRIO
+CONFIG_NET_SCH_CHOKE
+CONFIG_NET_SCH_QFQ
+CONFIG_NET_SCH_INGRESS
+'
+
+config_n='
+CONFIG_SECURITY_APPARMOR
+'
+
+function configure_kernel {
+    cd $srcdir
+    warn "Configuring kernel..."
+
+    if [[ "$menuconfig" == 'true' ]]; then
+        make menuconfig
+    else
+        if [[ "$kconfig" == "" ]]; then
+            warn "Using current kernel config..."
+        else
+            warn "Using specified kernel config: ${kconfig}..."
+            cp $kconfig .config
+        fi
+
+        warn "Making oldconfig..."
+        if [[ "$i386" == 'true' ]]; then
+            linux32=linux32
+        else
+            linux32=
+        fi
+        yes '' | $linux32 make oldconfig 1> /dev/null
+        if [[ "$localmodconfig" == 'true' ]]; then
+            warn "Making localmodconfig..."
+            yes '' | $linux32 make localmodconfig 1> /dev/null
+        fi
+        warn "Setting kernel flags for lxc and cbw..."
+        for flag in $config_y; do
+            if ! grep $flag .config 1> /dev/null; then
+                echo $flag=y >> .config
+            else
+                sed -i -s "s/# $flag is not set/$flag=y/" .config
+            fi
+        done
+        for flag in $config_m; do
+            if ! grep $flag .config 1> /dev/null; then 
+                echo $flag=m >> .config
+            else
+                sed -i -s "s/# $flag is not set/$flag=m/" .config
+                sed -i -s "s/$flag=y/$flag=m/" .config
+            fi
+        done
+        for flag in $config_n; do
+            if ! grep $flag .config 1> /dev/null; then
+                echo "# $flag is not set" >> .config
+            else
+                sed -i -s "s/$flag=y/# $flag is not set/" .config
+                sed -i -s "s/$flag=m/# $flag is not set/" .config
+            fi
+        done
+        for flag in $config_y $config_m; do 
+            grep $flag .config || echo "WARNING: $flag IS MISSING"
+        done
+        cp .config /tmp
+        warn "RAN CONFIG IN `pwd`"
+    fi
+}
+
+# Build the kernel image and headers packages with make-kpkg in $srcdir.
+#
+# The job count is the number of "processor" entries in /proc/cpuinfo plus
+# two.  For i386 the extra cross-compile arguments are passed to make-kpkg.
+function build_kernel {
+    # Have your favourite build method here
+    # This is a standard Debian way of building the kernel
+    # The patches select cfs bandwidth automatically
+    warn "Building kernel-$version_string"
+    cd $srcdir
+    procs=`grep -c ^processor /proc/cpuinfo`
+    # Shell arithmetic avoids depending on bc, which pre_check never
+    # verifies is installed.
+    procs=$((procs + 2))
+    export CONCURRENCY_LEVEL=$procs
+    if [[ "$i386" == 'true' ]]; then
+        mkpkg_extra_args='--cross-compile - --arch i386'
+    else
+        mkpkg_extra_args=
+    fi
+    make-kpkg clean $mkpkg_extra_args
+    yes '' | fakeroot make-kpkg -j $procs $mkpkg_extra_args --initrd --append-to-version=${version_string} \
+        kernel_image kernel_headers
+}
+
+function mod_kernel_dpkg {
+    # Only needed for i386.
+    cd /usr/src
+    if [[ "$i386" == 'true' ]]; then
+        warn "Modifying deb-pkg names for i386"
+        # Based on instructions from http://dotcommie.net/?id=165
+        for pkg_type in linux-image linux-headers; do
+            pkg_name_orig=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_amd64.deb
+            #hook=`readlink -f set_debian_control_i386.sh`
+            hook=${orig_dir}/set_debian_control_i386.sh
+            warn "$pkg_name_orig"
+            fakeroot deb-reversion -s "" --hook $hook $pkg_name_orig
+            # Remove the 1 in the name that deb-reversion adds.
+            pkg_name_mod=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom1_i386.deb
+            pkg_name_new=${pkg_type}-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_i386.deb
+            mv $pkg_name_mod $pkg_name_new
+            warn "Removing original package: $pkg_name_orig"
+            rm -f $pkg_name_orig
+       done
+    fi
+}
+
+function install_headers {
+    warn "Installing headers..."
+    sudo dpkg -i /usr/src/linux-headers-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_*.deb
+}
+
+function install_kernel {
+    warn "Installing kernel..."
+    sudo dpkg -i /usr/src/linux-image-$kver${version_string}${plus}_$kver${version_string}${plus}-10.00.Custom_*.deb
+}
+
+function build_initrd {
+    # Certain versions of Ubuntu install a make-kpkg"
+    # that does not build an initrd along with the rest of the kernel."
+    warn "Building initrd..."
+    #sudo mkdir -p /lib/modules/$kver${version_string}
+    sudo mkinitramfs -v -k -o /boot/initrd.img-$kver${version_string}${plus} $kver${version_string}${plus}
+}
+
+# Install the Open vSwitch datapath source package and build the datapath
+# module for the freshly built kernel with module-assistant.
+function build_ovs_datapath {
+    sudo apt-get install openvswitch-datapath-source
+    if [[ "$i386" == 'true' ]]; then
+        # The variable has to be set by env inside sudo.  Expanding a string
+        # such as 'DEB_HOST_ARCH=i386' in front of a command makes the shell
+        # look for a program of that name instead of performing an
+        # assignment, and sudo may not pass the caller's environment on.
+        sudo env DEB_HOST_ARCH=i386 module-assistant auto-build openvswitch-datapath -l $kver${version_string}${plus}
+    else
+        sudo module-assistant auto-build openvswitch-datapath -l $kver${version_string}${plus}
+    fi
+}
+
+function mod_ovs_dpkg {
+    # Only needed for i386.
+    cd /usr/src
+	if [[ "$i386" == 'true' ]]; then
+        warn "Modifying deb-pkg names for i386"
+	    # Based on instructions from http://dotcommie.net/?id=165
+        for pkg_type in openvswitch-datapath-module; do
+            pkg_name_orig=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}_amd64.deb
+            #hook=`readlink -f set_debian_control_i386.sh`
+            hook=${orig_dir}/set_debian_control_i386.sh
+            warn "$pkg_name_orig"
+            fakeroot deb-reversion -s "" --hook $hook $pkg_name_orig
+            # Remove the 1 in the name that deb-reversion adds.
+            pkg_name_mod=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}1_i386.deb
+            pkg_name_new=${pkg_type}-$kver${version_string}${plus}_${ovs_pkg_ver}_i386.deb
+            mv $pkg_name_mod $pkg_name_new
+		    warn "Removing original package: $pkg_name_orig"
+		    rm -f $pkg_name_orig
+	   done
+	fi
+}
+
+function install_ovs_datapath {
+    warn "Installing ovs datapath"
+    sudo module-assistant install openvswitch-datapath -l $kver${version_string}${plus}
+}
+
+parse_opts $*
+
+if [[ "$install_only" != 'true' ]] ; then
+    pre_check
+
+    sudo chmod 777 $kdir
+    cd $kdir
+
+    fetch_kernel
+    copy_patches
+    apply_patches
+    configure_kernel
+    build_kernel
+    mod_kernel_dpkg
+    warn "******************************************"
+    warn "Check for kernel .deb installation file in /usr/src/ along with initrd."
+else
+    install_headers
+fi
+
+if [[ "$i386" != 'true' ]]; then
+    # Presumably we'll only want to install on a 64-bit machine.
+    install_kernel
+fi
+
+build_ovs_datapath
+mod_ovs_dpkg
+
+if [[ "$i386" != 'true' ]]; then
+    install_ovs_datapath
+    build_initrd
+fi
+
+cd $orig_dir
+warn "Done (hopefully)"
@@ -0,0 +1,121 @@
+#!/bin/bash
+# Builds lxc for kernel patched with setns
+
+# Check for uninitialized variables
+set -o nounset
+
+# Exit on any failure
+set -e
+
+# Kernel version to use
+kver=3.0
+
+# Location in which to download and build lxc
+lxcdir=$HOME
+kdir=/lib/modules/`uname -r`/build
+
+# lxc version to use
+lxcver=lxc-0.7.5
+
+# Save original directory for later.
+orig_dir=`pwd`
+
+function warn {
+    # Echo the provided command in color text.
+    yellow='\e[0;33m' # Yellow
+    reset='\e[0m'
+    echo="echo -e"
+    $echo "${yellow}$1${reset}"
+}
+
+function usage {
+    warn "Usage: $0 [lxc download location] [kernel location]"
+}
+
+
+if [[ "$#" > 2 ]]; then
+    warn "Invalid number of args passed."
+    usage
+    exit
+elif [[ "$#" == 0 ]]; then
+    warn "No args passed."
+    warn "Using default lxc location: ${lxcdir}." 
+    warn "Using default kernel location: ${kdir}."
+elif [[ "$#" == 1 ]]; then
+    lxcdir=$1
+    warn "Using custom lxc location: ${lxcdir}"
+    warn "Using default kernel location: ${kdir}"
+elif [[ "$#" == 2 ]]; then
+    lxcdir=$1
+    kdir=$2
+    warn "Using custom lxc location: ${lxcdir}"
+    warn "Using custom kernel location: ${kdir}"
+fi
+
+function pre_check {
+    warn "Checking for git"
+    if [[ -z `which git` ]]; then
+        read -p \
+            warn "You need git to download lxc.  Install? [Y/n] " \
+            answer;
+        [[ -z $answer || $answer=="Y" || $answer == "y" ]] && \
+            sudo apt-get install git;
+    fi
+
+    warn "Checking for linux source code"
+    if [[ ! -d ${kdir} ]]; then
+        warn "Error: Kernel doesn't exist in ${kdir}... exiting"
+        exit
+    fi
+}
+
+# Clone lxc into $lxcdir/lxc and check out $lxcver, unless an lxc directory
+# is already there.  Leaves the working directory at $lxcdir.
+fetch_lxc() {
+    cd $lxcdir
+    warn "--> Fetching lxc"
+    if [[ ! -d lxc ]]; then
+        git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
+        cd lxc
+        git checkout $lxcver
+        cd ..
+    else
+        warn "lxc source exists, skipping.."
+    fi
+}
+
+function copy_patches {
+    rm -rf lxc/patches
+    cp -r ${orig_dir}/../../lxc-$kver-patches lxc/patches
+}
+
+function apply_patches {
+    cd lxc
+    warn "Applying patches..."
+    git am -3 patches/*.patch
+}
+
+function build_lxc {
+    warn "Building lxc with kernel-${kver}..."
+    processors=`grep -c ^processor /proc/cpuinfo`
+    export CONCURRENCY_LEVEL=$processors
+    make distclean || true
+    ./autogen.sh
+    ./configure --with-linuxdir=${kdir}
+    make
+}
+
+function install_lxc {
+    warn "Installing lxc..."
+    sudo make install
+    # Seems to be missing
+    sudo mkdir -p /usr/local/var/lib/lxc
+}
+
+usage
+pre_check
+fetch_lxc
+copy_patches
+apply_patches
+build_lxc
+install_lxc
+cd $orig_dir
+warn "Done (hopefully)"
@@ -0,0 +1,170 @@
+#!/bin/bash
+# Builds kernel with the new CFS Bandwidth patches
+# and nsfd/setns syscall patches.
+
+# Check for uninitialized variables
+set -o nounset
+
+# Exit on any failure
+set -e
+
+# Location in which to download and build the kernel
+kdir=/usr/src
+
+# Kernel version to download
+kver=2.6.35
+
+# Save original directory for later.
+orig_dir=`pwd`
+
+# Kernel version string
+version_string=-with-cfs
+
+# Run menuconfig later?
+menuconfig=
+
+function warn {
+    # Echo the provided command in color text.
+    yellow='\e[0;33m' # Yellow
+    reset='\e[0m'
+    echo="echo -e"
+    $echo "${yellow}$1${reset}"
+}
+
+function usage {
+    warn "Usage: build.sh [version string] [menuconfig]"
+}
+
+
+if [[ "$#" > 2 ]]; then
+    warn "Invalid number of args passed."
+    usage
+    exit
+elif [[ "$#" == 0 ]]; then
+    warn "No args passed.  Using default version_string: ${version_string}"
+elif [[ "$#" == 1 ]]; then
+    warn "Using custom version_string: ${version_string}"
+    version_string=$1
+elif [[ "$#" == 2 && $2 != 'menuconfig' ]]; then
+    warn "Second arg is either menuconfig or missing."
+    usage
+else
+    version_string=$1
+    menuconfig=true
+fi
+
+# Make sure make-kpkg and quilt are available, offering to install the
+# missing ones with apt-get.
+#
+# An empty answer, "Y" or "y" accepts the installation; anything else
+# declines it and the script carries on without the tool.
+function pre_check {
+    warn "Checking for kernel-package build utilities"
+    if ! command -v make-kpkg > /dev/null 2>&1; then
+        # read -p takes the prompt text itself as its argument; placing
+        # "warn" there turns the message into an invalid variable name.
+        read -r -p "You need kernel-package utilities to build the kernel.  Install? [Y/n] " answer
+        # The operators need surrounding spaces: [[ $answer=="Y" ]] is a
+        # single non-empty word and therefore always true.
+        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
+            sudo apt-get install kernel-package ncurses-dev
+        fi
+    fi
+
+    warn "Checking for quilt"
+    if ! command -v quilt > /dev/null 2>&1; then
+        read -r -p "You need quilt to install patches.  Install? [Y/n] " answer
+        # A plain "if" (rather than "[[ ... ]] && cmd") keeps a declined
+        # install from becoming this function's non-zero return status,
+        # which would abort the script under "set -e".
+        if [[ -z $answer || $answer == "Y" || $answer == "y" ]]; then
+            sudo apt-get install quilt
+        fi
+    fi
+}
+
+function fetch_kernel {
+    warn "--> Fetching kernel linux-$kver"
+    if [[ -f linux-$kver.tar.bz2 ]]; then
+        warn "File exists, skipping.."
+        return
+    fi
+    wget http://kernel.org/pub/linux/kernel/v2.6/linux-$kver.tar.bz2
+    warn "Unpacking kernel"
+    tar xjf linux-$kver.tar.bz2
+}
+
+function work_around_kernel_package_bug {
+    # Fix will likely break on any other kernel version, so watch out.
+    # From:
+    # https://bugs.launchpad.net/ubuntu/+source/kernel-package/+bug/58307/comments/16
+    sed -i -s 's/echo "+"/#echo "+"/' linux-${kver}/scripts/setlocalversion
+}
+
+function copy_patches {
+    rm -rf linux-$kver/patches
+    cp -r ${orig_dir}/../../linux-2.6.35-patches linux-$kver/patches
+}
+
+# Enter linux-$kver and push the whole quilt series, unless quilt reports
+# that patches are already applied.
+function apply_patches {
+    cd linux-$kver
+    # Apply patch series only if not applied previously.
+    # A better check would look at patches/series and make sure each entry
+    # in `quilt applied` was covered.
+    warn "Checking for applied patches"
+    # Inspect quilt's output directly.  This needs no scratch files in the
+    # kernel tree, so there is nothing to remove afterwards (the previous
+    # "rm quilt_applied" named a file that was never created and aborted
+    # the script under "set -e").  quilt's own exit status is ignored; only
+    # grep's result decides the branch.
+    if quilt applied 2>&1 | grep -q "No patches applied"; then
+        warn "Applying patches"
+        quilt push -a
+    else
+        warn "Skipped patches"
+    fi
+}
+
+function build_kernel {
+    # Have your favourite build method here
+    # This is a standard Debian way of building the kernel
+    # The patches select cfs bandwidth automatically
+    warn "Building kernel..."
+
+    if [[ "$menuconfig" == 'true' ]]; then
+        make menuconfig
+    else
+        warn "Making oldconfig..."
+        yes "" | make oldconfig
+        warn "Making localmodconfig..."
+        make localmodconfig
+        warn "Enabling netns and cpubw..."
+        sed -i -s 's/# CONFIG_VETH is not set/CONFIG_VETH=y/' .config
+        sed -i -s 's/CONFIG_BRIDGE=y/CONFIG_BRIDGE=m/' .config
+        sed -i -s 's/# CONFIG_BRIDGE is not set/CONFIG_BRIDGE=m/' .config
+        sed -i -s 's/# CONFIG_CFS_BANDWIDTH is not set/CONFIG_CFS_BANDWIDTH=y/' .config
+        sed -i -s 's/# CONFIG_NET_NS is not set/CONFIG_NET_NS=y/' .config
+    fi
+
+    warn "Building kernel-$version_string"
+    processors=`grep -c ^processor /proc/cpuinfo`
+    export CONCURRENCY_LEVEL=$processors
+    yes "" | fakeroot make-kpkg --initrd --append-to-version=${version_string} kernel_image
+  
+    warn "******************************************"
+    warn "Check for kernel .deb installation file in ../ along with initrd."
+}
+
+function install_kernel {
+    warn "Installing kernel..."
+    sudo dpkg -i /usr/src/linux-image-$kver${version_string}_$kver${version_string}-10.00.Custom_amd64.deb
+}
+
+function build_initrd {
+    # Certain versions of Ubuntu install a make-kpkg"
+    # that does not build an initrd along with the rest of the kernel."
+    warn "Building initrd..."
+    #sudo mkdir -p /lib/modules/$kver${version_string}
+    sudo mkinitramfs -v -k -o /boot/initrd.img-$kver${version_string} $kver${version_string}
+}
+
+pre_check
+
+sudo chmod 777 $kdir
+cd $kdir
+
+fetch_kernel
+work_around_kernel_package_bug
+copy_patches
+apply_patches
+build_kernel
+install_kernel
+build_initrd
+warn "Done (hopefully)"
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+# Check for uninitialized variables
+set -o nounset
+
+# Exit on any failure
+set -e
+
+debdir=/usr/src
+kver=3.0.9-with-cfs
+kbuild=/lib/modules/$kver/build
+
+if arch | grep 64 > /dev/null ; then arch=amd64; else arch=i386; fi
+
+headers=linux-headers-${kver}_${kver}-10.00.Custom_${arch}.deb
+image=linux-image-${kver}_${kver}-10.00.Custom_${arch}.deb
+ovs=openvswitch-datapath-module-${kver}_1.2.0-1ubuntu3_${arch}.deb
+
+echo "Mininet-hifi installer"
+
+echo "1. Checking for prereqs"
+  # All three kernel-related packages must already be present in $debdir.
+  if [[ ! -e $debdir/$headers || ! -e $debdir/$image ||
+	! -e $debdir/$ovs ]]; then
+    echo "Can't find kernel packages"
+    echo "$debdir/$headers or $debdir/$image or $debdir/$ovs is missing"
+    exit 1
+  fi
+  # Decide by exit status, not by output: with a running agent that holds
+  # no keys, ssh-add -l prints a message on stdout, so testing for an empty
+  # string lets that case through.  The status is non-zero both when the
+  # agent has no identities and when no agent can be contacted.
+  if ! ssh-add -l > /dev/null 2>&1; then
+    echo "No SSH keys - nsdi repo checkout will fail."
+    exit 1
+  fi
+
+echo "2. Getting mainline Mininet from github"
+  cd ~
+  git clone git://github.com/mininet/mininet.git
+
+echo "3. Installing OpenFlow reference implementation"
+  mininet/util/install.sh -f
+
+echo "4. Installing Mininet core files"
+  mininet/util/install.sh -n
+
+echo "5. Adding nsdi repository"
+  cd ~/mininet
+  git remote add nsdi git@gitosis.stanford.edu:mininet-nsdi.git
+  git fetch nsdi
+  git checkout -b mininet-rt remotes/nsdi/mininet-rt
+  sudo make install
+
+echo "6. Installing kernel packages"
+  sudo dpkg -i $debdir/$headers
+  sudo dpkg -i $debdir/$image 
+  sudo dpkg -i $debdir/$ovs
+  
+echo "7. Fetching, building and installing Open vSwitch user code"
+  cd ~
+  git clone git://openvswitch.org/openvswitch
+  cd ~/openvswitch
+  git checkout v1.2.2
+  ./boot.sh
+  ./configure
+  make all
+  sudo make install
+  sudo cp tests/test-openflowd /usr/local/bin/ovs-openflowd
+
+echo "8. Building and installing custom lxc package"
+  sudo apt-get -y install libcap-dev
+  cd ~/mininet/util/kbuild/cfs-nsfd-kernel
+  ./build-lxc-for-3.0.sh $HOME $kbuild 
+
+echo "9. Setting up /cgroup"
+  sudo apt-get remove cgroup-lite
+  sudo mkdir /cgroup
+  sudo sh -c "echo 'cgroup /cgroup cgroup defaults 0 0' >> /etc/fstab"
+
+echo "10. Creating /etc/mn/host.conf"
+  sudo mkdir -p /etc/mn
+  sudo sh -c "echo 'lxc.utsname = mnhost' > /etc/mn/host.conf"
+  sudo sh -c "echo 'lxc.network.type = empty' >> /etc/mn/host.conf"
+
+echo "11. Getting rid of quiet boot"
+  sudo sed -i 's/quiet/text/' /etc/default/grub
+
+echo "Done! reboot to test"
+
@@ -0,0 +1,17 @@
+#!/bin/bash
+# Install lxc from source, apply patch, install
+# (instructions tested with 2.6.35 only):
+# Stop at the first failing command.  Without this a failed clone or cd
+# would let the remaining steps (configure, make, sudo make install) run in
+# whatever directory the script happened to be in.
+set -e
+
+sudo apt-get -y install libcap-dev quilt
+cd ~/
+git clone git://lxc.git.sourceforge.net/gitroot/lxc/lxc
+cd lxc
+git checkout lxc-0.7.2 -b lxc-0.7.2
+cp ~/mininet/util/kbuild/cfs-nsfd-kernel/lxc-patches.tar.gz .
+tar xzf lxc-patches.tar.gz
+# Modify patch.  Small change to the patch:  remove the 2nd argument to lxc_cgroup_path_get (it's set to NULL in the patch)
+sed -i -s 's/cgrouppath, NULL, my_args.name/cgrouppath, my_args.name/' patches/lxc-attach-bug-fix.patch
+quilt push -a
+./autogen.sh
+./configure
+make
+sudo make install
@@ -0,0 +1,4 @@
+#!/bin/sh
+echo `pwd`
+sed -i -s "s/Architecture: amd64/Architecture: i386/" DEBIAN/control
+
@@ -0,0 +1,14 @@
+#!/bin/sh
+# Re-build OVS for the kernel version defined below.
+
+# Stop at the first failing step.  Previously the && chain ended at the
+# echo, so depmod ran even after a failed configure, build or copy, and a
+# failed cd let ./configure run in the wrong directory.
+set -e
+
+OVS_DIR=~/openvswitch
+KERNEL_VER=`uname -r`
+#KERNEL_VER=2.6.35-with-cfs
+PROCESSORS=`grep -c ^processor /proc/cpuinfo`
+cd "$OVS_DIR"
+# Build the datapath module against the kernel's build directory, copy it
+# into that kernel's module tree and refresh the module dependency data.
+./configure --with-linux=/lib/modules/${KERNEL_VER}/build
+sudo make -j${PROCESSORS}
+sudo cp ./datapath/linux/openvswitch_mod.ko /lib/modules/${KERNEL_VER}/kernel/drivers/net
+echo "Running depmod..."
+sudo depmod -a ${KERNEL_VER}
+
@@ -0,0 +1,36 @@
+From 57cc69f4a6d27c0b3ef495589a1d4629a9f1fa3e Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Wed, 6 Jul 2011 22:30:37 -0700
+Subject: [PATCH 01/19] sched: Don't update shares twice on on_rq parent
+
+In dequeue_task_fair() we bail on dequeue when we encounter a parenting entity
+with additional weight.  However, we perform a double shares update on this
+entity as we continue the shares update traversal from this point, despite
+dequeue_entity() having already updated its queuing cfs_rq.
+Avoid this by starting from the parent when we resume.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110707053059.797714697@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched_fair.c |    3 +++
+ 1 files changed, 3 insertions(+), 0 deletions(-)
+
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index c768588..c80f030 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1370,6 +1370,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 			 */
+ 			if (task_sleep && parent_entity(se))
+ 				set_next_buddy(parent_entity(se));
+
+			/* avoid re-evaluating load for this entity */
+			se = parent_entity(se);
+ 			break;
+ 		}
+ 		flags |= DEQUEUE_SLEEP;
+-- 
+1.7.0.4
+
@@ -0,0 +1,168 @@
+From 4ec11a3e21874534f9ffa70a8878bb255618bb33 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:27 -0700
+Subject: [PATCH 02/19] sched: Implement hierarchical task accounting for SCHED_OTHER
+
+Introduce hierarchical task accounting for the group scheduling case in CFS, as
+well as promoting the responsibility for maintaining rq->nr_running to the
+scheduling classes.
+
+The primary motivation for this is that with scheduling classes supporting
+bandwidth throttling it is possible for entities participating in throttled
+sub-trees to not have root visible changes in rq->nr_running across activate
+and de-activate operations.  This in turn leads to incorrect idle and
+weight-per-task load balance decisions.
+
+This also allows us to make a small fixlet to the fastpath in pick_next_task()
+under group scheduling.
+
+Note: this issue also exists with the existing sched_rt throttling mechanism.
+This patch does not address that.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184756.878333391@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c          |    6 ++----
+ kernel/sched_fair.c     |   10 ++++++++--
+ kernel/sched_rt.c       |    5 ++++-
+ kernel/sched_stoptask.c |    2 ++
+ 4 files changed, 16 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index fde6ff9..b015a0e 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -308,7 +308,7 @@ struct task_group root_task_group;
+ /* CFS-related fields in a runqueue */
+ struct cfs_rq {
+ 	struct load_weight load;
+-	unsigned long nr_running;
+	unsigned long nr_running, h_nr_running;
+ 
+ 	u64 exec_clock;
+ 	u64 min_vruntime;
+@@ -1830,7 +1830,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+ 		rq->nr_uninterruptible--;
+ 
+ 	enqueue_task(rq, p, flags);
+-	inc_nr_running(rq);
+ }
+ 
+ /*
+@@ -1842,7 +1841,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+ 		rq->nr_uninterruptible++;
+ 
+ 	dequeue_task(rq, p, flags);
+-	dec_nr_running(rq);
+ }
+ 
+ #ifdef CONFIG_IRQ_TIME_ACCOUNTING
+@@ -4226,7 +4224,7 @@ pick_next_task(struct rq *rq)
+ 	 * Optimization: we know that if all tasks are in
+ 	 * the fair class we can call that function directly:
+ 	 */
+-	if (likely(rq->nr_running == rq->cfs.nr_running)) {
+	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ 		p = fair_sched_class.pick_next_task(rq);
+ 		if (likely(p))
+ 			return p;
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index c80f030..f70bb4b 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1332,16 +1332,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 			break;
+ 		cfs_rq = cfs_rq_of(se);
+ 		enqueue_entity(cfs_rq, se, flags);
+		cfs_rq->h_nr_running++;
+ 		flags = ENQUEUE_WAKEUP;
+ 	}
+ 
+ 	for_each_sched_entity(se) {
+-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running++;
+ 
+ 		update_cfs_load(cfs_rq, 0);
+ 		update_cfs_shares(cfs_rq);
+ 	}
+ 
+	inc_nr_running(rq);
+ 	hrtick_update(rq);
+ }
+ 
+@@ -1361,6 +1364,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 	for_each_sched_entity(se) {
+ 		cfs_rq = cfs_rq_of(se);
+ 		dequeue_entity(cfs_rq, se, flags);
+		cfs_rq->h_nr_running--;
+ 
+ 		/* Don't dequeue parent if it has other entities besides us */
+ 		if (cfs_rq->load.weight) {
+@@ -1379,12 +1383,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 	}
+ 
+ 	for_each_sched_entity(se) {
+-		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_nr_running--;
+ 
+ 		update_cfs_load(cfs_rq, 0);
+ 		update_cfs_shares(cfs_rq);
+ 	}
+ 
+	dec_nr_running(rq);
+ 	hrtick_update(rq);
+ }
+ 
+diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
+index 10d0182..1af971b 100644
+--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
+@@ -949,6 +949,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+ 
+ 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+ 		enqueue_pushable_task(rq, p);
+
+	inc_nr_running(rq);
+ }
+ 
+ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+@@ -959,6 +961,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
+ 	dequeue_rt_entity(rt_se);
+ 
+ 	dequeue_pushable_task(rq, p);
+
+	dec_nr_running(rq);
+ }
+ 
+ /*
+@@ -1851,4 +1855,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
+ 	rcu_read_unlock();
+ }
+ #endif /* CONFIG_SCHED_DEBUG */
+-
+diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
+index 6f43763..8b44e7f 100644
+--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
+@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
+ static void
+ enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+ {
+	inc_nr_running(rq);
+ }
+ 
+ static void
+ dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+ {
+	dec_nr_running(rq);
+ }
+ 
+ static void yield_task_stop(struct rq *rq)
+-- 
+1.7.0.4
+
@@ -0,0 +1,380 @@
+From 116f22667986ab86f1a00098a0daf9959b1f6df0 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:28 -0700
+Subject: [PATCH 03/19] sched: Introduce primitives to account for CFS bandwidth tracking
+
+In this patch we introduce the notion of CFS bandwidth, partitioned into
+globally unassigned bandwidth, and locally claimed bandwidth.
+
+ - The global bandwidth is per task_group, it represents a pool of unclaimed
+   bandwidth that cfs_rqs can allocate from.
+ - The local bandwidth is tracked per-cfs_rq, this represents allotments from
+   the global pool bandwidth assigned to a specific cpu.
+
+Bandwidth is managed via cgroupfs, adding two new interfaces to the cpu subsystem:
+ - cpu.cfs_period_us : the bandwidth period in usecs
+ - cpu.cfs_quota_us : the cpu bandwidth (in usecs) that this tg will be allowed
+   to consume over the period above.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Nikhil Rao <ncrao@google.com>
+Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184756.972636699@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ init/Kconfig        |   12 +++
+ kernel/sched.c      |  201 ++++++++++++++++++++++++++++++++++++++++++++++++++-
+ kernel/sched_fair.c |   16 ++++
+ 3 files changed, 225 insertions(+), 4 deletions(-)
+
+diff --git a/init/Kconfig b/init/Kconfig
+index 412c21b..67579ed 100644
+--- a/init/Kconfig
+++ b/init/Kconfig
+@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
+ 	depends on CGROUP_SCHED
+ 	default CGROUP_SCHED
+ 
+config CFS_BANDWIDTH
+	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED
+	default n
+	help
+	  This option allows users to define CPU bandwidth rates (limits) for
+	  tasks running within the fair group scheduler.  Groups with no limit
+	  set are considered to be unconstrained and will run with no
+	  restriction.
+	  See tip/Documentation/scheduler/sched-bwc.txt for more information.
+
+ config RT_GROUP_SCHED
+ 	bool "Group scheduling for SCHED_RR/FIFO"
+ 	depends on EXPERIMENTAL
+diff --git a/kernel/sched.c b/kernel/sched.c
+index b015a0e..28d838b 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -244,6 +244,14 @@ struct cfs_rq;
+ 
+ static LIST_HEAD(task_groups);
+ 
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+	raw_spinlock_t lock;
+	ktime_t period;
+	u64 quota;
+#endif
+};
+
+ /* task group related information */
+ struct task_group {
+ 	struct cgroup_subsys_state css;
+@@ -275,6 +283,8 @@ struct task_group {
+ #ifdef CONFIG_SCHED_AUTOGROUP
+ 	struct autogroup *autogroup;
+ #endif
+
+	struct cfs_bandwidth cfs_bandwidth;
+ };
+ 
+ /* task_group_lock serializes the addition/removal of task groups */
+@@ -374,9 +384,48 @@ struct cfs_rq {
+ 
+ 	unsigned long load_contribution;
+ #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	int runtime_enabled;
+	s64 runtime_remaining;
+#endif
+ #endif
+ };
+ 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_CFS_BANDWIDTH
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return &tg->cfs_bandwidth;
+}
+
+static inline u64 default_cfs_period(void);
+
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->quota = RUNTIME_INF;
+	cfs_b->period = ns_to_ktime(default_cfs_period());
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->runtime_enabled = 0;
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{}
+#else
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+	return NULL;
+}
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+ /* Real-Time classes' related field in a runqueue: */
+ struct rt_rq {
+ 	struct rt_prio_array active;
+@@ -7958,6 +8007,12 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ 	tg->cfs_rq[cpu] = cfs_rq;
+ 	init_cfs_rq(cfs_rq, rq);
+ 	cfs_rq->tg = tg;
+	cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
+#endif
+	init_cfs_rq_runtime(cfs_rq);
+ 
+ 	tg->se[cpu] = se;
+ 	/* se could be NULL for root_task_group */
+@@ -8093,6 +8148,7 @@ void __init sched_init(void)
+ 		 * We achieve this by letting root_task_group's tasks sit
+ 		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
+ 		 */
+		init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
+ 		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+ 
+@@ -8336,6 +8392,8 @@ static void free_fair_sched_group(struct task_group *tg)
+ {
+ 	int i;
+ 
+	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ 	for_each_possible_cpu(i) {
+ 		if (tg->cfs_rq)
+ 			kfree(tg->cfs_rq[i]);
+@@ -8363,6 +8421,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+ 
+ 	tg->shares = NICE_0_LOAD;
+ 
+	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ 	for_each_possible_cpu(i) {
+ 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ 				      GFP_KERNEL, cpu_to_node(i));
+@@ -8734,7 +8794,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ 	return walk_tg_tree(tg_schedulable, tg_nop, &data);
+ }
+ 
+-static int tg_set_bandwidth(struct task_group *tg,
+static int tg_set_rt_bandwidth(struct task_group *tg,
+ 		u64 rt_period, u64 rt_runtime)
+ {
+ 	int i, err = 0;
+@@ -8773,7 +8833,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+ 	if (rt_runtime_us < 0)
+ 		rt_runtime = RUNTIME_INF;
+ 
+-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+ }
+ 
+ long sched_group_rt_runtime(struct task_group *tg)
+@@ -8798,7 +8858,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+ 	if (rt_period == 0)
+ 		return -EINVAL;
+ 
+-	return tg_set_bandwidth(tg, rt_period, rt_runtime);
+	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
+ }
+ 
+ long sched_group_rt_period(struct task_group *tg)
+@@ -8988,6 +9048,128 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+ 
+ 	return (u64) scale_load_down(tg->shares);
+ }
+
+#ifdef CONFIG_CFS_BANDWIDTH
+const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int i;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	static DEFINE_MUTEX(mutex);
+
+	if (tg == &root_task_group)
+		return -EINVAL;
+
+	/*
+	 * Ensure we have at some amount of bandwidth every period.  This is
+	 * to prevent reaching a state of large arrears when throttled via
+	 * entity_tick() resulting in prolonged exit starvation.
+	 */
+	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
+		return -EINVAL;
+
+	/*
+	 * Likewise, bound things on the otherside by preventing insane quota
+	 * periods.  This also allows us to normalize in computing quota
+	 * feasibility.
+	 */
+	if (period > max_cfs_quota_period)
+		return -EINVAL;
+
+	mutex_lock(&mutex);
+	raw_spin_lock_irq(&cfs_b->lock);
+	cfs_b->period = ns_to_ktime(period);
+	cfs_b->quota = quota;
+	raw_spin_unlock_irq(&cfs_b->lock);
+
+	for_each_possible_cpu(i) {
+		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock_irq(&rq->lock);
+		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_remaining = 0;
+		raw_spin_unlock_irq(&rq->lock);
+	}
+	mutex_unlock(&mutex);
+
+	return 0;
+}
+
+int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+{
+	u64 quota, period;
+
+	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	if (cfs_quota_us < 0)
+		quota = RUNTIME_INF;
+	else
+		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_quota(struct task_group *tg)
+{
+	u64 quota_us;
+
+	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+		return -1;
+
+	quota_us = tg_cfs_bandwidth(tg)->quota;
+	do_div(quota_us, NSEC_PER_USEC);
+
+	return quota_us;
+}
+
+int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+{
+	u64 quota, period;
+
+	period = (u64)cfs_period_us * NSEC_PER_USEC;
+	quota = tg_cfs_bandwidth(tg)->quota;
+
+	if (period <= 0)
+		return -EINVAL;
+
+	return tg_set_cfs_bandwidth(tg, period, quota);
+}
+
+long tg_get_cfs_period(struct task_group *tg)
+{
+	u64 cfs_period_us;
+
+	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	do_div(cfs_period_us, NSEC_PER_USEC);
+
+	return cfs_period_us;
+}
+
+static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_quota(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
+				s64 cfs_quota_us)
+{
+	return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
+}
+
+static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return tg_get_cfs_period(cgroup_tg(cgrp));
+}
+
+static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+				u64 cfs_period_us)
+{
+	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+ 
+ #ifdef CONFIG_RT_GROUP_SCHED
+@@ -9022,6 +9204,18 @@ static struct cftype cpu_files[] = {
+ 		.write_u64 = cpu_shares_write_u64,
+ 	},
+ #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "cfs_quota_us",
+		.read_s64 = cpu_cfs_quota_read_s64,
+		.write_s64 = cpu_cfs_quota_write_s64,
+	},
+	{
+		.name = "cfs_period_us",
+		.read_u64 = cpu_cfs_period_read_u64,
+		.write_u64 = cpu_cfs_period_write_u64,
+	},
+#endif
+ #ifdef CONFIG_RT_GROUP_SCHED
+ 	{
+ 		.name = "rt_runtime_us",
+@@ -9331,4 +9525,3 @@ struct cgroup_subsys cpuacct_subsys = {
+ 	.subsys_id = cpuacct_subsys_id,
+ };
+ #endif	/* CONFIG_CGROUP_CPUACCT */
+-
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index f70bb4b..91624cf 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1256,6 +1256,22 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
+ 		check_preempt_tick(cfs_rq, curr);
+ }
+ 
+
+/**************************************************
+ * CFS bandwidth control machinery
+ */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
+ /**************************************************
+  * CFS operations on tasks:
+  */
+-- 
+1.7.0.4
+
@@ -0,0 +1,221 @@
+From e68a3cf7b0006f6d8c362833ebc96cbed01a263e Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:29 -0700
+Subject: [PATCH 04/19] sched: Validate CFS quota hierarchies
+
+Add constraints validation for CFS bandwidth hierarchies.
+
+Validate that:
+   max(child bandwidth) <= parent_bandwidth
+
+In a quota limited hierarchy, an unconstrained entity
+(e.g. bandwidth==RUNTIME_INF) inherits the bandwidth of its parent.
+
+This constraint is chosen over sum(child_bandwidth) as the notion of over-commit
+is valuable within SCHED_OTHER.  Some basic code from the RT case is re-factored
+for reuse.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.083774572@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c |  112 +++++++++++++++++++++++++++++++++++++++++++++++++-------
+ 1 files changed, 98 insertions(+), 14 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 28d838b..75f2dd7 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -249,6 +249,7 @@ struct cfs_bandwidth {
+ 	raw_spinlock_t lock;
+ 	ktime_t period;
+ 	u64 quota;
+	s64 hierarchal_quota;
+ #endif
+ };
+ 
+@@ -1512,7 +1513,8 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+ 	update_load_sub(&rq->load, load);
+ }
+ 
+-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
+#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
+			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
+ typedef int (*tg_visitor)(struct task_group *, void *);
+ 
+ /*
+@@ -8694,12 +8696,7 @@ unsigned long sched_group_shares(struct task_group *tg)
+ }
+ #endif
+ 
+-#ifdef CONFIG_RT_GROUP_SCHED
+-/*
+- * Ensure that the real time constraints are schedulable.
+- */
+-static DEFINE_MUTEX(rt_constraints_mutex);
+-
+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
+ static unsigned long to_ratio(u64 period, u64 runtime)
+ {
+ 	if (runtime == RUNTIME_INF)
+@@ -8707,6 +8704,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
+ 
+ 	return div64_u64(runtime << 20, period);
+ }
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Ensure that the real time constraints are schedulable.
+ */
+static DEFINE_MUTEX(rt_constraints_mutex);
+ 
+ /* Must be called with tasklist_lock held */
+ static inline int tg_has_rt_tasks(struct task_group *tg)
+@@ -8727,7 +8731,7 @@ struct rt_schedulable_data {
+ 	u64 rt_runtime;
+ };
+ 
+-static int tg_schedulable(struct task_group *tg, void *data)
+static int tg_rt_schedulable(struct task_group *tg, void *data)
+ {
+ 	struct rt_schedulable_data *d = data;
+ 	struct task_group *child;
+@@ -8791,7 +8795,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ 		.rt_runtime = runtime,
+ 	};
+ 
+-	return walk_tg_tree(tg_schedulable, tg_nop, &data);
+	return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+ }
+ 
+ static int tg_set_rt_bandwidth(struct task_group *tg,
+@@ -9050,14 +9054,17 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
+ }
+ 
+ #ifdef CONFIG_CFS_BANDWIDTH
+static DEFINE_MUTEX(cfs_constraints_mutex);
+
+ const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
+ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+ 
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+
+ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ {
+-	int i;
+	int i, ret = 0;
+ 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+-	static DEFINE_MUTEX(mutex);
+ 
+ 	if (tg == &root_task_group)
+ 		return -EINVAL;
+@@ -9078,7 +9085,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 	if (period > max_cfs_quota_period)
+ 		return -EINVAL;
+ 
+-	mutex_lock(&mutex);
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __cfs_schedulable(tg, period, quota);
+	if (ret)
+		goto out_unlock;
+
+ 	raw_spin_lock_irq(&cfs_b->lock);
+ 	cfs_b->period = ns_to_ktime(period);
+ 	cfs_b->quota = quota;
+@@ -9093,9 +9104,10 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 		cfs_rq->runtime_remaining = 0;
+ 		raw_spin_unlock_irq(&rq->lock);
+ 	}
+-	mutex_unlock(&mutex);
+out_unlock:
+	mutex_unlock(&cfs_constraints_mutex);
+ 
+-	return 0;
+	return ret;
+ }
+ 
+ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+@@ -9169,6 +9181,78 @@ static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+ 	return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
+ }
+ 
+struct cfs_schedulable_data {
+	struct task_group *tg;
+	u64 period, quota;
+};
+
+/*
+ * normalize group quota/period to be quota/max_period
+ * note: units are usecs
+ */
+static u64 normalize_cfs_quota(struct task_group *tg,
+			       struct cfs_schedulable_data *d)
+{
+	u64 quota, period;
+
+	if (tg == d->tg) {
+		period = d->period;
+		quota = d->quota;
+	} else {
+		period = tg_get_cfs_period(tg);
+		quota = tg_get_cfs_quota(tg);
+	}
+
+	/* note: these should typically be equivalent */
+	if (quota == RUNTIME_INF || quota == -1)
+		return RUNTIME_INF;
+
+	return to_ratio(period, quota);
+}
+
+static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+{
+	struct cfs_schedulable_data *d = data;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	s64 quota = 0, parent_quota = -1;
+
+	if (!tg->parent) {
+		quota = RUNTIME_INF;
+	} else {
+		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+
+		quota = normalize_cfs_quota(tg, d);
+		parent_quota = parent_b->hierarchal_quota;
+
+		/*
+		 * ensure max(child_quota) <= parent_quota, inherit when no
+		 * limit is set
+		 */
+		if (quota == RUNTIME_INF)
+			quota = parent_quota;
+		else if (parent_quota != RUNTIME_INF && quota > parent_quota)
+			return -EINVAL;
+	}
+	cfs_b->hierarchal_quota = quota;
+
+	return 0;
+}
+
+static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+{
+	struct cfs_schedulable_data data = {
+		.tg = tg,
+		.period = period,
+		.quota = quota,
+	};
+
+	if (quota != RUNTIME_INF) {
+		do_div(data.period, NSEC_PER_USEC);
+		do_div(data.quota, NSEC_PER_USEC);
+	}
+
+	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+}
+ #endif /* CONFIG_CFS_BANDWIDTH */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+ 
+-- 
+1.7.0.4
+
@@ -0,0 +1,217 @@
+From 50fe68ec9d454eced64cbfc29954ee64cc7225da Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:30 -0700
+Subject: [PATCH 05/19] sched: Accumulate per-cfs_rq cpu usage and charge against bandwidth
+
+Account bandwidth usage on the cfs_rq level versus the task_groups to which
+they belong.  Whether we are tracking bandwidth on a given cfs_rq is maintained
+under cfs_rq->runtime_enabled.
+
+cfs_rq's which belong to a bandwidth constrained task_group have their runtime
+accounted via the update_curr() path, which withdraws bandwidth from the global
+pool as desired.  Updates involving the global pool are currently protected
+under cfs_bandwidth->lock, local runtime is protected by rq->lock.
+
+This patch only assigns and tracks quota; no action is taken in the case that
+a cfs_rq cannot obtain more runtime (cfs_rq->runtime_remaining stays <= 0).
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Nikhil Rao <ncrao@google.com>
+Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.179386821@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ include/linux/sched.h |    4 ++
+ kernel/sched.c        |    4 ++-
+ kernel/sched_fair.c   |   79 +++++++++++++++++++++++++++++++++++++++++++++++-
+ kernel/sysctl.c       |   10 ++++++
+ 4 files changed, 94 insertions(+), 3 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 14a6c7b..adfc8eb 100644
+--- a/include/linux/sched.h
+++ b/include/linux/sched.h
+@@ -2021,6 +2021,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
+ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
+ #endif
+ 
+#ifdef CONFIG_CFS_BANDWIDTH
+extern unsigned int sysctl_sched_cfs_bandwidth_slice;
+#endif
+
+ #ifdef CONFIG_RT_MUTEXES
+ extern int rt_mutex_getprio(struct task_struct *p);
+ extern void rt_mutex_setprio(struct task_struct *p, int prio);
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 75f2dd7..cdbc7d3 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -248,7 +248,7 @@ struct cfs_bandwidth {
+ #ifdef CONFIG_CFS_BANDWIDTH
+ 	raw_spinlock_t lock;
+ 	ktime_t period;
+-	u64 quota;
+	u64 quota, runtime;
+ 	s64 hierarchal_quota;
+ #endif
+ };
+@@ -404,6 +404,7 @@ static inline u64 default_cfs_period(void);
+ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ {
+ 	raw_spin_lock_init(&cfs_b->lock);
+	cfs_b->runtime = 0;
+ 	cfs_b->quota = RUNTIME_INF;
+ 	cfs_b->period = ns_to_ktime(default_cfs_period());
+ }
+@@ -9093,6 +9094,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 	raw_spin_lock_irq(&cfs_b->lock);
+ 	cfs_b->period = ns_to_ktime(period);
+ 	cfs_b->quota = quota;
+	cfs_b->runtime = quota;
+ 	raw_spin_unlock_irq(&cfs_b->lock);
+ 
+ 	for_each_possible_cpu(i) {
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 91624cf..863c9ec 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+  */
+ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+ 
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
+ * each time a cfs_rq requests quota.
+ *
+ * Note: in the case that the slice exceeds the runtime remaining (either due
+ * to consumption or the quota being specified to be smaller than the slice)
+ * we will always only issue the remaining available time.
+ *
+ * default: 5 msec, units: microseconds
+  */
+unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+#endif
+
+ static const struct sched_class fair_sched_class;
+ 
+ /**************************************************************
+@@ -305,6 +319,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
+ 
+ #endif	/* CONFIG_FAIR_GROUP_SCHED */
+ 
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				   unsigned long delta_exec);
+ 
+ /**************************************************************
+  * Scheduling class tree data structure manipulation methods:
+@@ -602,6 +618,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
+ 		cpuacct_charge(curtask, delta_exec);
+ 		account_group_exec_runtime(curtask, delta_exec);
+ 	}
+
+	account_cfs_rq_runtime(cfs_rq, delta_exec);
+ }
+ 
+ static inline void
+@@ -1270,6 +1288,58 @@ static inline u64 default_cfs_period(void)
+ {
+ 	return 100000000ULL;
+ }
+
+static inline u64 sched_cfs_bandwidth_slice(void)
+{
+	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+}
+
+static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct task_group *tg = cfs_rq->tg;
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	u64 amount = 0, min_amount;
+
+	/* note: this is a positive sum as runtime_remaining <= 0 */
+	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota == RUNTIME_INF)
+		amount = min_amount;
+	else if (cfs_b->runtime > 0) {
+		amount = min(cfs_b->runtime, min_amount);
+		cfs_b->runtime -= amount;
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	cfs_rq->runtime_remaining += amount;
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	cfs_rq->runtime_remaining -= delta_exec;
+	if (cfs_rq->runtime_remaining > 0)
+		return;
+
+	assign_cfs_rq_runtime(cfs_rq);
+}
+
+static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+						   unsigned long delta_exec)
+{
+	if (!cfs_rq->runtime_enabled)
+		return;
+
+	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+}
+
+#else
+static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec) {}
+ #endif
+ 
+ /**************************************************
+@@ -4264,8 +4334,13 @@ static void set_curr_task_fair(struct rq *rq)
+ {
+ 	struct sched_entity *se = &rq->curr->se;
+ 
+-	for_each_sched_entity(se)
+-		set_next_entity(cfs_rq_of(se), se);
+	for_each_sched_entity(se) {
+		struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+		set_next_entity(cfs_rq, se);
+		/* ensure bandwidth has been allocated on our new cfs_rq */
+		account_cfs_rq_runtime(cfs_rq, 0);
+	}
+ }
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index f175d98..b38ca7f 100644
+--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
+@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
+ 		.extra2		= &one,
+ 	},
+ #endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.procname	= "sched_cfs_bandwidth_slice_us",
+		.data		= &sysctl_sched_cfs_bandwidth_slice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+	},
+#endif
+ #ifdef CONFIG_PROVE_LOCKING
+ 	{
+ 		.procname	= "prove_locking",
+-- 
+1.7.0.4
+
@@ -0,0 +1,263 @@
+From c127107a0b9f7fe08dd11c84ecb6b307052b7688 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:31 -0700
+Subject: [PATCH 06/19] sched: Add a timer to handle CFS bandwidth refresh
+
+This patch adds a per-task_group timer which handles the refresh of the global
+CFS bandwidth pool.
+
+Since the RT pool is using a similar timer there's some small refactoring to
+share this support.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.277271273@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |  107 +++++++++++++++++++++++++++++++++++++++++----------
+ kernel/sched_fair.c |   40 +++++++++++++++++-
+ 2 files changed, 123 insertions(+), 24 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index cdbc7d3..4bb2d63 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -193,10 +193,28 @@ static inline int rt_bandwidth_enabled(void)
+ 	return sysctl_sched_rt_runtime >= 0;
+ }
+ 
+-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+ {
+-	ktime_t now;
+	unsigned long delta;
+	ktime_t soft, hard, now;
+ 
+	for (;;) {
+		if (hrtimer_active(period_timer))
+			break;
+
+		now = hrtimer_cb_get_time(period_timer);
+		hrtimer_forward(period_timer, now, period);
+
+		soft = hrtimer_get_softexpires(period_timer);
+		hard = hrtimer_get_expires(period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(period_timer, soft, delta,
+					 HRTIMER_MODE_ABS_PINNED, 0);
+	}
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ 		return;
+ 
+@@ -204,22 +222,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+ 		return;
+ 
+ 	raw_spin_lock(&rt_b->rt_runtime_lock);
+-	for (;;) {
+-		unsigned long delta;
+-		ktime_t soft, hard;
+-
+-		if (hrtimer_active(&rt_b->rt_period_timer))
+-			break;
+-
+-		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
+-		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
+-
+-		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+-		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+-		delta = ktime_to_ns(ktime_sub(hard, soft));
+-		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+-				HRTIMER_MODE_ABS_PINNED, 0);
+-	}
+	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ 	raw_spin_unlock(&rt_b->rt_runtime_lock);
+ }
+ 
+@@ -250,6 +253,9 @@ struct cfs_bandwidth {
+ 	ktime_t period;
+ 	u64 quota, runtime;
+ 	s64 hierarchal_quota;
+
+	int idle, timer_active;
+	struct hrtimer period_timer;
+ #endif
+ };
+ 
+@@ -400,6 +406,28 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+ }
+ 
+ static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+		if (!overrun)
+			break;
+
+		idle = do_sched_cfs_period_timer(cfs_b, overrun);
+	}
+
+	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+ 
+ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ {
+@@ -407,6 +435,9 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ 	cfs_b->runtime = 0;
+ 	cfs_b->quota = RUNTIME_INF;
+ 	cfs_b->period = ns_to_ktime(default_cfs_period());
+
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->period_timer.function = sched_cfs_period_timer;
+ }
+ 
+ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+@@ -414,8 +445,34 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ 	cfs_rq->runtime_enabled = 0;
+ }
+ 
+/* requires cfs_b->lock, may release to reprogram timer */
+static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	/*
+	 * The timer may be active because we're trying to set a new bandwidth
+	 * period or because we're racing with the tear-down path
+	 * (timer_active==0 becomes visible before the hrtimer call-back
+	 * terminates).  In either case we ensure that it's re-programmed
+	 */
+	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* ensure cfs_b->lock is available while we wait */
+		hrtimer_cancel(&cfs_b->period_timer);
+
+		raw_spin_lock(&cfs_b->lock);
+		/* if someone else restarted the timer then we're done */
+		if (cfs_b->timer_active)
+			return;
+	}
+
+	cfs_b->timer_active = 1;
+	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+-{}
+{
+	hrtimer_cancel(&cfs_b->period_timer);
+}
+ #else
+ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+@@ -9064,7 +9121,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
+ 
+ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ {
+-	int i, ret = 0;
+	int i, ret = 0, runtime_enabled;
+ 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ 
+ 	if (tg == &root_task_group)
+@@ -9091,10 +9148,18 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 	if (ret)
+ 		goto out_unlock;
+ 
+	runtime_enabled = quota != RUNTIME_INF;
+ 	raw_spin_lock_irq(&cfs_b->lock);
+ 	cfs_b->period = ns_to_ktime(period);
+ 	cfs_b->quota = quota;
+ 	cfs_b->runtime = quota;
+
+	/* restart the period timer (if active) to handle new period expiry */
+	if (runtime_enabled && cfs_b->timer_active) {
+		/* force a reprogram */
+		cfs_b->timer_active = 0;
+		__start_cfs_bandwidth(cfs_b);
+	}
+ 	raw_spin_unlock_irq(&cfs_b->lock);
+ 
+ 	for_each_possible_cpu(i) {
+@@ -9102,7 +9167,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 		struct rq *rq = rq_of(cfs_rq);
+ 
+ 		raw_spin_lock_irq(&rq->lock);
+-		cfs_rq->runtime_enabled = quota != RUNTIME_INF;
+		cfs_rq->runtime_enabled = runtime_enabled;
+ 		cfs_rq->runtime_remaining = 0;
+ 		raw_spin_unlock_irq(&rq->lock);
+ 	}
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 863c9ec..e34c26c 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1306,9 +1306,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ 	raw_spin_lock(&cfs_b->lock);
+ 	if (cfs_b->quota == RUNTIME_INF)
+ 		amount = min_amount;
+-	else if (cfs_b->runtime > 0) {
+-		amount = min(cfs_b->runtime, min_amount);
+-		cfs_b->runtime -= amount;
+	else {
+		/* ensure bandwidth timer remains active under consumption */
+		if (!cfs_b->timer_active)
+			__start_cfs_bandwidth(cfs_b);
+
+		if (cfs_b->runtime > 0) {
+			amount = min(cfs_b->runtime, min_amount);
+			cfs_b->runtime -= amount;
+			cfs_b->idle = 0;
+		}
+ 	}
+ 	raw_spin_unlock(&cfs_b->lock);
+ 
+@@ -1337,6 +1344,33 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+ }
+ 
+/*
+ * Responsible for refilling a task_group's bandwidth and unthrottling its
+ * cfs_rqs as appropriate. If there has been no activity within the last
+ * period the timer is deactivated until scheduling resumes; cfs_b->idle is
+ * used to track this state.
+ */
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+{
+	int idle = 1;
+
+	raw_spin_lock(&cfs_b->lock);
+	/* no need to continue the timer with no bandwidth constraint */
+	if (cfs_b->quota == RUNTIME_INF)
+		goto out_unlock;
+
+	idle = cfs_b->idle;
+	cfs_b->runtime = cfs_b->quota;
+
+	/* mark as potentially idle for the upcoming period */
+	cfs_b->idle = 1;
+out_unlock:
+	if (idle)
+		cfs_b->timer_active = 0;
+	raw_spin_unlock(&cfs_b->lock);
+
+	return idle;
+}
+ #else
+ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 				     unsigned long delta_exec) {}
+-- 
+1.7.0.4
+
@@ -0,0 +1,208 @@
+From bfd5537a5bca64bb37c64b3156bdbb85dbd46fae Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:32 -0700
+Subject: [PATCH 07/19] sched: Expire invalid runtime
+
+Since quota is managed using a global state but consumed on a per-cpu basis
+we need to ensure that our per-cpu state is appropriately synchronized.
+Most importantly, runtime that is stale (from a previous period) should not be
+locally consumable.
+
+We take advantage of existing sched_clock synchronization about the jiffy to
+efficiently detect whether we have (globally) crossed a quota boundary above.
+
+One catch is that the direction of spread on sched_clock is undefined,
+specifically, we don't know whether our local clock is behind or ahead
+of the one responsible for the current expiration time.
+
+Fortunately we can differentiate these by considering whether the
+global deadline has advanced.  If it has not, then we assume our clock to be
+"fast" and advance our local expiration; otherwise, we know the deadline has
+truly passed and we expire our local runtime.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.379275352@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |    4 ++-
+ kernel/sched_fair.c |   90 +++++++++++++++++++++++++++++++++++++++++++++-----
+ 2 files changed, 84 insertions(+), 10 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 4bb2d63..6a0bcd5 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -253,6 +253,7 @@ struct cfs_bandwidth {
+ 	ktime_t period;
+ 	u64 quota, runtime;
+ 	s64 hierarchal_quota;
+	u64 runtime_expires;
+ 
+ 	int idle, timer_active;
+ 	struct hrtimer period_timer;
+@@ -393,6 +394,7 @@ struct cfs_rq {
+ #endif
+ #ifdef CONFIG_CFS_BANDWIDTH
+ 	int runtime_enabled;
+	u64 runtime_expires;
+ 	s64 runtime_remaining;
+ #endif
+ #endif
+@@ -9152,8 +9154,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 	raw_spin_lock_irq(&cfs_b->lock);
+ 	cfs_b->period = ns_to_ktime(period);
+ 	cfs_b->quota = quota;
+-	cfs_b->runtime = quota;
+ 
+	__refill_cfs_bandwidth_runtime(cfs_b);
+ 	/* restart the period timer (if active) to handle new period expiry */
+ 	if (runtime_enabled && cfs_b->timer_active) {
+ 		/* force a reprogram */
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index e34c26c..a97d19e 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1294,11 +1294,30 @@ static inline u64 sched_cfs_bandwidth_slice(void)
+ 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
+ }
+ 
+/*
+ * Replenish runtime according to assigned quota and update expiration time.
+ * We use sched_clock_cpu directly instead of rq->clock to avoid adding
+ * additional synchronization around rq->lock.
+ *
+ * requires cfs_b->lock
+ */
+static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+{
+	u64 now;
+
+	if (cfs_b->quota == RUNTIME_INF)
+		return;
+
+	now = sched_clock_cpu(smp_processor_id());
+	cfs_b->runtime = cfs_b->quota;
+	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+}
+
+ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ {
+ 	struct task_group *tg = cfs_rq->tg;
+ 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+-	u64 amount = 0, min_amount;
+	u64 amount = 0, min_amount, expires;
+ 
+ 	/* note: this is a positive sum as runtime_remaining <= 0 */
+ 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+@@ -1307,9 +1326,16 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ 	if (cfs_b->quota == RUNTIME_INF)
+ 		amount = min_amount;
+ 	else {
+-		/* ensure bandwidth timer remains active under consumption */
+-		if (!cfs_b->timer_active)
+		/*
+		 * If the bandwidth pool has become inactive, then at least one
+		 * period must have elapsed since the last consumption.
+		 * Refresh the global state and ensure bandwidth timer becomes
+		 * active.
+		 */
+		if (!cfs_b->timer_active) {
+			__refill_cfs_bandwidth_runtime(cfs_b);
+ 			__start_cfs_bandwidth(cfs_b);
+		}
+ 
+ 		if (cfs_b->runtime > 0) {
+ 			amount = min(cfs_b->runtime, min_amount);
+@@ -1317,19 +1343,61 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ 			cfs_b->idle = 0;
+ 		}
+ 	}
+	expires = cfs_b->runtime_expires;
+ 	raw_spin_unlock(&cfs_b->lock);
+ 
+ 	cfs_rq->runtime_remaining += amount;
+	/*
+	 * we may have advanced our local expiration to account for allowed
+	 * spread between our sched_clock and the one on which runtime was
+	 * issued.
+	 */
+	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+		cfs_rq->runtime_expires = expires;
+ }
+ 
+-static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+-				     unsigned long delta_exec)
+/*
+ * Note: This depends on the synchronization provided by sched_clock and the
+ * fact that rq->clock snapshots this value.
+ */
+static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ {
+-	if (!cfs_rq->runtime_enabled)
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct rq *rq = rq_of(cfs_rq);
+
+	/* if the deadline is ahead of our clock, nothing to do */
+	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
+		return;
+
+	if (cfs_rq->runtime_remaining < 0)
+ 		return;
+ 
+	/*
+	 * If the local deadline has passed we have to consider the
+	 * possibility that our sched_clock is 'fast' and the global deadline
+	 * has not truly expired.
+	 *
+	 * Fortunately we can check determine whether this the case by checking
+	 * whether the global deadline has advanced.
+	 */
+
+	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+		/* extend local deadline, drift is bounded above by 2 ticks */
+		cfs_rq->runtime_expires += TICK_NSEC;
+	} else {
+		/* global deadline is ahead, expiration has passed */
+		cfs_rq->runtime_remaining = 0;
+	}
+}
+
+static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+				     unsigned long delta_exec)
+{
+	/* dock delta_exec before expiring quota (as it could span periods) */
+ 	cfs_rq->runtime_remaining -= delta_exec;
+-	if (cfs_rq->runtime_remaining > 0)
+	expire_cfs_rq_runtime(cfs_rq);
+
+	if (likely(cfs_rq->runtime_remaining > 0))
+ 		return;
+ 
+ 	assign_cfs_rq_runtime(cfs_rq);
+@@ -1360,7 +1428,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+ 		goto out_unlock;
+ 
+ 	idle = cfs_b->idle;
+-	cfs_b->runtime = cfs_b->quota;
+	/* if we're going inactive then everything else can be deferred */
+	if (idle)
+		goto out_unlock;
+
+	__refill_cfs_bandwidth_runtime(cfs_b);
+
+ 
+ 	/* mark as potentially idle for the upcoming period */
+ 	cfs_b->idle = 1;
+@@ -1579,7 +1652,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
+ 
+ 	return wl;
+ }
+-
+ #else
+ 
+ static inline unsigned long effective_load(struct task_group *tg, int cpu,
+-- 
+1.7.0.4
+
@@ -0,0 +1,234 @@
+From 726bbbeef1579f5f981d2d98afda0304197b7e19 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:33 -0700
+Subject: [PATCH 08/19] sched: Add support for throttling group entities
+
+Now that consumption is tracked (via update_curr()) we add support to throttle
+group entities (and their corresponding cfs_rqs) in the case where there is no
+run-time remaining.
+
+Throttled entities are dequeued to prevent scheduling, additionally we mark
+them as throttled (using cfs_rq->throttled) to prevent them from becoming
+re-enqueued until they are unthrottled.  A list of a task_group's throttled
+entities is maintained on the cfs_bandwidth structure.
+
+Note: While the machinery for throttling is added in this patch the act of
+throttling an entity exceeding its bandwidth is deferred until later within
+the series.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.480608533@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |    7 ++++
+ kernel/sched_fair.c |   89 ++++++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 92 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 6a0bcd5..d631e42 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -257,6 +257,8 @@ struct cfs_bandwidth {
+ 
+ 	int idle, timer_active;
+ 	struct hrtimer period_timer;
+	struct list_head throttled_cfs_rq;
+
+ #endif
+ };
+ 
+@@ -396,6 +398,9 @@ struct cfs_rq {
+ 	int runtime_enabled;
+ 	u64 runtime_expires;
+ 	s64 runtime_remaining;
+
+	int throttled;
+	struct list_head throttled_list;
+ #endif
+ #endif
+ };
+@@ -438,6 +443,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ 	cfs_b->quota = RUNTIME_INF;
+ 	cfs_b->period = ns_to_ktime(default_cfs_period());
+ 
+	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ 	cfs_b->period_timer.function = sched_cfs_period_timer;
+ }
+@@ -445,6 +451,7 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ {
+ 	cfs_rq->runtime_enabled = 0;
+	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+ }
+ 
+ /* requires cfs_b->lock, may release to reprogram timer */
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index a97d19e..f6823e2 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1313,7 +1313,8 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+ 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+ }
+ 
+-static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ {
+ 	struct task_group *tg = cfs_rq->tg;
+ 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+@@ -1354,6 +1355,8 @@ static void assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+ 	 */
+ 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ 		cfs_rq->runtime_expires = expires;
+
+	return cfs_rq->runtime_remaining > 0;
+ }
+ 
+ /*
+@@ -1400,7 +1403,12 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 	if (likely(cfs_rq->runtime_remaining > 0))
+ 		return;
+ 
+-	assign_cfs_rq_runtime(cfs_rq);
+	/*
+	 * if we're unable to extend our runtime we resched so that the active
+	 * hierarchy can be throttled
+	 */
+	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
+		resched_task(rq_of(cfs_rq)->curr);
+ }
+ 
+ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+@@ -1412,6 +1420,47 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
+ }
+ 
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttled;
+}
+
+static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	long task_delta, dequeue = 1;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	/* account load preceding throttle */
+	update_cfs_load(cfs_rq, 0);
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			break;
+
+		if (dequeue)
+			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		qcfs_rq->h_nr_running -= task_delta;
+
+		if (qcfs_rq->load.weight)
+			dequeue = 0;
+	}
+
+	if (!se)
+		rq->nr_running -= task_delta;
+
+	cfs_rq->throttled = 1;
+	raw_spin_lock(&cfs_b->lock);
+	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+ /*
+  * Responsible for refilling a task_group's bandwidth and unthrottling its
+  * cfs_rqs as appropriate. If there has been no activity within the last
+@@ -1447,6 +1496,11 @@ out_unlock:
+ #else
+ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 				     unsigned long delta_exec) {}
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+ #endif
+ 
+ /**************************************************
+@@ -1525,7 +1579,17 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 			break;
+ 		cfs_rq = cfs_rq_of(se);
+ 		enqueue_entity(cfs_rq, se, flags);
+
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running increment below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+ 		cfs_rq->h_nr_running++;
+
+ 		flags = ENQUEUE_WAKEUP;
+ 	}
+ 
+@@ -1533,11 +1597,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 		cfs_rq = cfs_rq_of(se);
+ 		cfs_rq->h_nr_running++;
+ 
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+ 		update_cfs_load(cfs_rq, 0);
+ 		update_cfs_shares(cfs_rq);
+ 	}
+ 
+-	inc_nr_running(rq);
+	if (!se)
+		inc_nr_running(rq);
+ 	hrtick_update(rq);
+ }
+ 
+@@ -1557,6 +1625,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 	for_each_sched_entity(se) {
+ 		cfs_rq = cfs_rq_of(se);
+ 		dequeue_entity(cfs_rq, se, flags);
+
+		/*
+		 * end evaluation on encountering a throttled cfs_rq
+		 *
+		 * note: in the case of encountering a throttled cfs_rq we will
+		 * post the final h_nr_running decrement below.
+		*/
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+ 		cfs_rq->h_nr_running--;
+ 
+ 		/* Don't dequeue parent if it has other entities besides us */
+@@ -1579,11 +1656,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
+ 		cfs_rq = cfs_rq_of(se);
+ 		cfs_rq->h_nr_running--;
+ 
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+ 		update_cfs_load(cfs_rq, 0);
+ 		update_cfs_shares(cfs_rq);
+ 	}
+ 
+-	dec_nr_running(rq);
+	if (!se)
+		dec_nr_running(rq);
+ 	hrtick_update(rq);
+ }
+ 
+-- 
+1.7.0.4
+
@@ -0,0 +1,197 @@
+From b5898b8474a236451416cc68b2bea413c533f095 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:34 -0700
+Subject: [PATCH 09/19] sched: Add support for unthrottling group entities
+
+At the start of each period we refresh the global bandwidth pool.  At this time
+we must also unthrottle any cfs_rq entities who are now within bandwidth once
+more (as quota permits).
+
+Unthrottled entities have their corresponding cfs_rq->throttled flag cleared
+and their entities re-enqueued.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.574628950@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |    3 +
+ kernel/sched_fair.c |  127 +++++++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 126 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index d631e42..4b54a73 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -9178,6 +9178,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+ 		raw_spin_lock_irq(&rq->lock);
+ 		cfs_rq->runtime_enabled = runtime_enabled;
+ 		cfs_rq->runtime_remaining = 0;
+
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+ 		raw_spin_unlock_irq(&rq->lock);
+ 	}
+ out_unlock:
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index f6823e2..21e1c02 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1461,6 +1461,84 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+ 	raw_spin_unlock(&cfs_b->lock);
+ }
+ 
+static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	struct sched_entity *se;
+	int enqueue = 1;
+	long task_delta;
+
+	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+
+	cfs_rq->throttled = 0;
+	raw_spin_lock(&cfs_b->lock);
+	list_del_rcu(&cfs_rq->throttled_list);
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!cfs_rq->load.weight)
+		return;
+
+	task_delta = cfs_rq->h_nr_running;
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			enqueue = 0;
+
+		cfs_rq = cfs_rq_of(se);
+		if (enqueue)
+			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		cfs_rq->h_nr_running += task_delta;
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+	}
+
+	if (!se)
+		rq->nr_running += task_delta;
+
+	/* determine whether we need to wake up potentially idle cpu */
+	if (rq->curr == rq->idle && rq->cfs.nr_running)
+		resched_task(rq->curr);
+}
+
+static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
+		u64 remaining, u64 expires)
+{
+	struct cfs_rq *cfs_rq;
+	u64 runtime = remaining;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
+				throttled_list) {
+		struct rq *rq = rq_of(cfs_rq);
+
+		raw_spin_lock(&rq->lock);
+		if (!cfs_rq_throttled(cfs_rq))
+			goto next;
+
+		runtime = -cfs_rq->runtime_remaining + 1;
+		if (runtime > remaining)
+			runtime = remaining;
+		remaining -= runtime;
+
+		cfs_rq->runtime_remaining += runtime;
+		cfs_rq->runtime_expires = expires;
+
+		/* we check whether we're throttled above */
+		if (cfs_rq->runtime_remaining > 0)
+			unthrottle_cfs_rq(cfs_rq);
+
+next:
+		raw_spin_unlock(&rq->lock);
+
+		if (!remaining)
+			break;
+	}
+	rcu_read_unlock();
+
+	return remaining;
+}
+
+ /*
+  * Responsible for refilling a task_group's bandwidth and unthrottling its
+  * cfs_rqs as appropriate. If there has been no activity within the last
+@@ -1469,23 +1547,64 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+  */
+ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+ {
+-	int idle = 1;
+	u64 runtime, runtime_expires;
+	int idle = 1, throttled;
+ 
+ 	raw_spin_lock(&cfs_b->lock);
+ 	/* no need to continue the timer with no bandwidth constraint */
+ 	if (cfs_b->quota == RUNTIME_INF)
+ 		goto out_unlock;
+ 
+-	idle = cfs_b->idle;
+	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	/* idle depends on !throttled (for the case of a large deficit) */
+	idle = cfs_b->idle && !throttled;
+
+ 	/* if we're going inactive then everything else can be deferred */
+ 	if (idle)
+ 		goto out_unlock;
+ 
+ 	__refill_cfs_bandwidth_runtime(cfs_b);
+ 
+	if (!throttled) {
+		/* mark as potentially idle for the upcoming period */
+		cfs_b->idle = 1;
+		goto out_unlock;
+	}
+
+	/*
+	 * There are throttled entities so we must first use the new bandwidth
+	 * to unthrottle them before making it generally available.  This
+	 * ensures that all existing debts will be paid before a new cfs_rq is
+	 * allowed to run.
+	 */
+	runtime = cfs_b->runtime;
+	runtime_expires = cfs_b->runtime_expires;
+	cfs_b->runtime = 0;
+
+	/*
+	 * This check is repeated as we are holding onto the new bandwidth
+	 * while we unthrottle.  This can potentially race with an unthrottled
+	 * group trying to acquire new bandwidth from the global pool.
+	 */
+	while (throttled && runtime > 0) {
+		raw_spin_unlock(&cfs_b->lock);
+		/* we can't nest cfs_b->lock while distributing bandwidth */
+		runtime = distribute_cfs_runtime(cfs_b, runtime,
+						 runtime_expires);
+		raw_spin_lock(&cfs_b->lock);
+
+		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+	}
+ 
+-	/* mark as potentially idle for the upcoming period */
+-	cfs_b->idle = 1;
+	/* return (any) remaining runtime */
+	cfs_b->runtime = runtime;
+	/*
+	 * While we are ensured activity in the period following an
+	 * unthrottle, this also covers the case in which the new bandwidth is
+	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
+	 * timer to remain active while there are any throttled entities.)
+	 */
+	cfs_b->idle = 0;
+ out_unlock:
+ 	if (idle)
+ 		cfs_b->timer_active = 0;
+-- 
+1.7.0.4
+
@@ -0,0 +1,136 @@
+From b152339efae7eb1bdd9ec4e626121e9205299e9d Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:35 -0700
+Subject: [PATCH 10/19] sched: Allow for positional tg_tree walks
+
+Extend walk_tg_tree to accept a positional argument
+
+static int walk_tg_tree_from(struct task_group *from,
+			     tg_visitor down, tg_visitor up, void *data)
+
+Existing semantics are preserved, caller must hold rcu_lock() or sufficient
+analogue.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.677889157@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c |   50 +++++++++++++++++++++++++++++++++++++-------------
+ 1 files changed, 37 insertions(+), 13 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 4b54a73..813a4ce 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -1585,20 +1585,23 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+ typedef int (*tg_visitor)(struct task_group *, void *);
+ 
+ /*
+- * Iterate the full tree, calling @down when first entering a node and @up when
+- * leaving it for the final time.
+ * Iterate task_group tree rooted at *from, calling @down when first entering a
+ * node and @up when leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+  */
+-static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+static int walk_tg_tree_from(struct task_group *from,
+			     tg_visitor down, tg_visitor up, void *data)
+ {
+ 	struct task_group *parent, *child;
+ 	int ret;
+ 
+-	rcu_read_lock();
+-	parent = &root_task_group;
+	parent = from;
+
+ down:
+ 	ret = (*down)(parent, data);
+ 	if (ret)
+-		goto out_unlock;
+		goto out;
+ 	list_for_each_entry_rcu(child, &parent->children, siblings) {
+ 		parent = child;
+ 		goto down;
+@@ -1607,19 +1610,29 @@ up:
+ 		continue;
+ 	}
+ 	ret = (*up)(parent, data);
+-	if (ret)
+-		goto out_unlock;
+	if (ret || parent == from)
+		goto out;
+ 
+ 	child = parent;
+ 	parent = parent->parent;
+ 	if (parent)
+ 		goto up;
+-out_unlock:
+-	rcu_read_unlock();
+-
+out:
+ 	return ret;
+ }
+ 
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+	return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+ static int tg_nop(struct task_group *tg, void *data)
+ {
+ 	return 0;
+@@ -8856,13 +8869,19 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
+ 
+ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+ {
+	int ret;
+
+ 	struct rt_schedulable_data data = {
+ 		.tg = tg,
+ 		.rt_period = period,
+ 		.rt_runtime = runtime,
+ 	};
+ 
+-	return walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
+ }
+ 
+ static int tg_set_rt_bandwidth(struct task_group *tg,
+@@ -9319,6 +9338,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
+ 
+ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+ {
+	int ret;
+ 	struct cfs_schedulable_data data = {
+ 		.tg = tg,
+ 		.period = period,
+@@ -9330,7 +9350,11 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+ 		do_div(data.quota, NSEC_PER_USEC);
+ 	}
+ 
+-	return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+	rcu_read_lock();
+	ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
+	rcu_read_unlock();
+
+	return ret;
+ }
+ #endif /* CONFIG_CFS_BANDWIDTH */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+-- 
+1.7.0.4
+
@@ -0,0 +1,230 @@
+From b7c5f316287ea56ecbc755110eaa032c588f2374 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:36 -0700
+Subject: [PATCH 11/19] sched: Prevent interactions with throttled entities
+
+From the perspective of load-balance and shares distribution, throttled
+entities should be invisible.
+
+However, both of these operations work on 'active' lists and are not
+inherently aware of what group hierarchies may be present.  In some cases this
+may be side-stepped (e.g. we could sideload via tg_load_down in load balance)
+while in others (e.g. update_shares()) it is more difficult to compute without
+incurring some O(n^2) costs.
+
+Instead, track hierarchical throttled state at time of transition.  This
+allows us to easily identify whether an entity belongs to a throttled hierarchy
+and avoid incorrect interactions with it.
+
+Also, when an entity leaves a throttled hierarchy we need to advance its
+time averaging for shares averaging so that the elapsed throttled time is not
+considered as part of the cfs_rq's operation.
+
+We also use this information to prevent buddy interactions in the wakeup and
+yield_to() paths.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.777916795@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |    2 +-
+ kernel/sched_fair.c |  103 ++++++++++++++++++++++++++++++++++++++++++++++++---
+ 2 files changed, 98 insertions(+), 7 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 813a4ce..523464e 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -399,7 +399,7 @@ struct cfs_rq {
+ 	u64 runtime_expires;
+ 	s64 runtime_remaining;
+ 
+-	int throttled;
+	int throttled, throttle_count;
+ 	struct list_head throttled_list;
+ #endif
+ #endif
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 21e1c02..3d7430b 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -725,6 +725,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ }
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+/* we need this in update_cfs_load and load-balance functions below */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+ # ifdef CONFIG_SMP
+ static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
+ 					    int global_update)
+@@ -747,7 +749,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
+ 	u64 now, delta;
+ 	unsigned long load = cfs_rq->load.weight;
+ 
+-	if (cfs_rq->tg == &root_task_group)
+	if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
+ 		return;
+ 
+ 	now = rq_of(cfs_rq)->clock_task;
+@@ -856,7 +858,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
+ 
+ 	tg = cfs_rq->tg;
+ 	se = tg->se[cpu_of(rq_of(cfs_rq))];
+-	if (!se)
+	if (!se || throttled_hierarchy(cfs_rq))
+ 		return;
+ #ifndef CONFIG_SMP
+ 	if (likely(se->load.weight == tg->shares))
+@@ -1425,6 +1427,65 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+ 	return cfs_rq->throttled;
+ }
+ 
+/* check whether cfs_rq, or any parent, is throttled */
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->throttle_count;
+}
+
+/*
+ * Ensure that neither of the group entities corresponding to src_cpu or
+ * dest_cpu are members of a throttled hierarchy when performing group
+ * load-balance operations.
+ */
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
+
+	src_cfs_rq = tg->cfs_rq[src_cpu];
+	dest_cfs_rq = tg->cfs_rq[dest_cpu];
+
+	return throttled_hierarchy(src_cfs_rq) ||
+	       throttled_hierarchy(dest_cfs_rq);
+}
+
+/* updated child weight may affect parent so we have to do this bottom up */
+static int tg_unthrottle_up(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	cfs_rq->throttle_count--;
+#ifdef CONFIG_SMP
+	if (!cfs_rq->throttle_count) {
+		u64 delta = rq->clock_task - cfs_rq->load_stamp;
+
+		/* leaving throttled state, advance shares averaging windows */
+		cfs_rq->load_stamp += delta;
+		cfs_rq->load_last += delta;
+
+		/* update entity weight now that we are on_rq again */
+		update_cfs_shares(cfs_rq);
+	}
+#endif
+
+	return 0;
+}
+
+static int tg_throttle_down(struct task_group *tg, void *data)
+{
+	struct rq *rq = data;
+	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
+
+	/* group is entering throttled state, record last load */
+	if (!cfs_rq->throttle_count)
+		update_cfs_load(cfs_rq, 0);
+	cfs_rq->throttle_count++;
+
+	return 0;
+}
+
+ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+ {
+ 	struct rq *rq = rq_of(cfs_rq);
+@@ -1435,7 +1496,9 @@ static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+ 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
+ 
+ 	/* account load preceding throttle */
+-	update_cfs_load(cfs_rq, 0);
+	rcu_read_lock();
+	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
+	rcu_read_unlock();
+ 
+ 	task_delta = cfs_rq->h_nr_running;
+ 	for_each_sched_entity(se) {
+@@ -1476,6 +1539,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+ 	list_del_rcu(&cfs_rq->throttled_list);
+ 	raw_spin_unlock(&cfs_b->lock);
+ 
+	update_rq_clock(rq);
+	/* update hierarchical throttle state */
+	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
+
+ 	if (!cfs_rq->load.weight)
+ 		return;
+ 
+@@ -1620,6 +1687,17 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+ {
+ 	return 0;
+ }
+
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int throttled_lb_pair(struct task_group *tg,
+				    int src_cpu, int dest_cpu)
+{
+	return 0;
+}
+ #endif
+ 
+ /**************************************************
+@@ -2521,6 +2599,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ 
+ 	for_each_leaf_cfs_rq(busiest, cfs_rq) {
+ 		list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
+			if (throttled_lb_pair(task_group(p),
+					      busiest->cpu, this_cpu))
+				break;
+ 
+ 			if (!can_migrate_task(p, busiest, this_cpu,
+ 						sd, idle, &pinned))
+@@ -2632,8 +2713,17 @@ static void update_shares(int cpu)
+ 	struct rq *rq = cpu_rq(cpu);
+ 
+ 	rcu_read_lock();
+-	for_each_leaf_cfs_rq(rq, cfs_rq)
+	/*
+	 * Iterates the task_group tree in a bottom up fashion, see
+	 * list_add_leaf_cfs_rq() for details.
+	 */
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		/* throttled entities do not contribute to load */
+		if (throttled_hierarchy(cfs_rq))
+			continue;
+
+ 		update_shares_cpu(cfs_rq->tg, cpu);
+	}
+ 	rcu_read_unlock();
+ }
+ 
+@@ -2657,9 +2747,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ 		u64 rem_load, moved_load;
+ 
+ 		/*
+-		 * empty group
+		 * empty group or part of a throttled hierarchy
+ 		 */
+-		if (!busiest_cfs_rq->task_weight)
+		if (!busiest_cfs_rq->task_weight ||
+		    throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
+ 			continue;
+ 
+ 		rem_load = (u64)rem_load_move * busiest_weight;
+-- 
+1.7.0.4
+
@@ -0,0 +1,65 @@
+From 41f8be245b607e16567c13f6be065084b73c4977 Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:37 -0700
+Subject: [PATCH 12/19] sched: Prevent buddy interactions with throttled entities
+
+Buddies allow us to select "on-rq" entities without actually selecting them
+from a cfs_rq's rb_tree.  As a result we must ensure that throttled entities
+are not falsely nominated as buddies.  The fact that entities are dequeued
+within throttle_entity is not sufficient for clearing buddy status as the
+nomination may occur after throttling.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.886850167@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched_fair.c |   18 +++++++++++++++++-
+ 1 files changed, 17 insertions(+), 1 deletions(-)
+
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 3d7430b..3c0120e 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -2372,6 +2372,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+ 	if (unlikely(se == pse))
+ 		return;
+ 
+	/*
+	 * This is possible from callers such as pull_task(), in which we
+	 * unconditionally check_prempt_curr() after an enqueue (which may have
+	 * lead to a throttle).  This both saves work and prevents false
+	 * next-buddy nomination below.
+	 */
+	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
+		return;
+
+ 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ 		set_next_buddy(pse);
+ 		next_buddy_marked = 1;
+@@ -2380,6 +2389,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+ 	/*
+ 	 * We can come here with TIF_NEED_RESCHED already set from new task
+ 	 * wake up path.
+	 *
+	 * Note: this also catches the edge-case of curr being in a throttled
+	 * group (e.g. via set_curr_task), since update_curr() (in the
+	 * enqueue of curr) will have resulted in resched being set.  This
+	 * prevents us from potentially nominating it as a false LAST_BUDDY
+	 * below.
+ 	 */
+ 	if (test_tsk_need_resched(curr))
+ 		return;
+@@ -2502,7 +2517,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
+ {
+ 	struct sched_entity *se = &p->se;
+ 
+-	if (!se->on_rq)
+	/* throttled hierarchies are not runnable */
+	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
+ 		return false;
+ 
+ 	/* Tell the scheduler that we'd really like pse to run next. */
+-- 
+1.7.0.4
+
@@ -0,0 +1,69 @@
+From 7af3c5930e241d0bddc028e3d05ad396c32689be Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:38 -0700
+Subject: [PATCH 13/19] sched: Migrate throttled tasks on HOTPLUG
+
+Throttled tasks are invisible to cpu-offline since they are not eligible for
+selection by pick_next_task().  The regular 'escape' path for a thread that is
+blocked at offline is via ttwu->select_task_rq, however this will not handle a
+throttled group since there are no individual thread wakeups on an unthrottle.
+
+Resolve this by unthrottling offline cpus so that threads can be migrated.
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184757.989000590@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c |   27 +++++++++++++++++++++++++++
+ 1 files changed, 27 insertions(+), 0 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 523464e..7b99d63 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -6310,6 +6310,30 @@ static void calc_global_load_remove(struct rq *rq)
+ 	rq->calc_load_active = 0;
+ }
+ 
+#ifdef CONFIG_CFS_BANDWIDTH
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+		if (!cfs_rq->runtime_enabled)
+			continue;
+
+		/*
+		 * clock_task is not advancing so we just need to make sure
+		 * there's some valid quota amount
+		 */
+		cfs_rq->runtime_remaining = cfs_b->quota;
+		if (cfs_rq_throttled(cfs_rq))
+			unthrottle_cfs_rq(cfs_rq);
+	}
+}
+#else
+static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+#endif
+
+ /*
+  * Migrate all tasks from the rq, sleeping tasks will be migrated by
+  * try_to_wake_up()->select_task_rq().
+@@ -6335,6 +6359,9 @@ static void migrate_tasks(unsigned int dead_cpu)
+ 	 */
+ 	rq->stop = NULL;
+ 
+	/* Ensure any throttled groups are reachable by pick_next_task */
+	unthrottle_offline_cfs_rqs(rq);
+
+ 	for ( ; ; ) {
+ 		/*
+ 		 * There's this thread running, bail when that's the only
+-- 
+1.7.0.4
+
@@ -0,0 +1,133 @@
+From d71d241613f903255fec91ba9f959a633c724b4e Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:39 -0700
+Subject: [PATCH 14/19] sched: Throttle entities exceeding their allowed bandwidth
+
+With the machinery in place to throttle and unthrottle entities, as well as
+handle their participation (or lack thereof), we can now enable throttling.
+
+There are 2 points at which we must check whether it's time to set throttled state:
+ put_prev_entity() and enqueue_entity().
+
+- put_prev_entity() is the typical throttle path, we reach it by exceeding our
+  allocated run-time within update_curr()->account_cfs_rq_runtime() and going
+  through a reschedule.
+
+- enqueue_entity() covers the case of a wake-up into an already throttled
+  group.  In this case we know the group cannot be on_rq and can throttle
+  immediately.  Checks are added at time of put_prev_entity() and
+  enqueue_entity()
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184758.091415417@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched_fair.c |   52 +++++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 files changed, 50 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 3c0120e..831a300 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -989,6 +989,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
+ 	se->vruntime = vruntime;
+ }
+ 
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+
+ static void
+ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+ {
+@@ -1018,8 +1020,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+ 		__enqueue_entity(cfs_rq, se);
+ 	se->on_rq = 1;
+ 
+-	if (cfs_rq->nr_running == 1)
+	if (cfs_rq->nr_running == 1) {
+ 		list_add_leaf_cfs_rq(cfs_rq);
+		check_enqueue_throttle(cfs_rq);
+	}
+ }
+ 
+ static void __clear_buddies_last(struct sched_entity *se)
+@@ -1224,6 +1228,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+ 	return se;
+ }
+ 
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+ {
+ 	/*
+@@ -1233,6 +1239,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
+ 	if (prev->on_rq)
+ 		update_curr(cfs_rq);
+ 
+	/* throttle cfs_rqs exceeding runtime */
+	check_cfs_rq_runtime(cfs_rq);
+
+ 	check_spread(cfs_rq, prev);
+ 	if (prev->on_rq) {
+ 		update_stats_wait_start(cfs_rq, prev);
+@@ -1486,7 +1495,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
+ 	return 0;
+ }
+ 
+-static __used void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+ {
+ 	struct rq *rq = rq_of(cfs_rq);
+ 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+@@ -1679,9 +1688,48 @@ out_unlock:
+ 
+ 	return idle;
+ }
+
+/*
+ * When a group wakes up we want to make sure that its quota is not already
+ * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+ * runtime as update_curr() throttling can not not trigger until it's on-rq.
+ */
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+{
+	/* an active group must be handled by the update_curr()->put() path */
+	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
+		return;
+
+	/* ensure the group is not already throttled */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	/* update runtime allocation */
+	account_cfs_rq_runtime(cfs_rq, 0);
+	if (cfs_rq->runtime_remaining <= 0)
+		throttle_cfs_rq(cfs_rq);
+}
+
+/* conditionally throttle active cfs_rq's from put_prev_entity() */
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
+		return;
+
+	/*
+	 * it's possible for a throttled entity to be forced into a running
+	 * state (e.g. set_curr_task), in this case we're finished.
+	 */
+	if (cfs_rq_throttled(cfs_rq))
+		return;
+
+	throttle_cfs_rq(cfs_rq);
+}
+ #else
+ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 				     unsigned long delta_exec) {}
+static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+ 
+ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+ {
+-- 
+1.7.0.4
+
@@ -0,0 +1,125 @@
+From 11db1560b4dec193a20e2c78fb8238d9f14a1782 Mon Sep 17 00:00:00 2001
+From: Nikhil Rao <ncrao@google.com>
+Date: Thu, 21 Jul 2011 09:43:40 -0700
+Subject: [PATCH 15/19] sched: Add exports tracking cfs bandwidth control statistics
+
+This change introduces statistics exports for the cpu sub-system, these are
+added through the use of a stat file similar to that exported by other
+subsystems.
+
+The following exports are included:
+
+nr_periods:	number of periods in which execution occurred
+nr_throttled:	the number of periods above in which execution was throttled
+throttled_time:	cumulative wall-time that any cpus have been throttled for
+this group
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Nikhil Rao <ncrao@google.com>
+Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
+Reviewed-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184758.198901931@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |   21 +++++++++++++++++++++
+ kernel/sched_fair.c |    7 +++++++
+ 2 files changed, 28 insertions(+), 0 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 7b99d63..08d3aa0 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -259,6 +259,9 @@ struct cfs_bandwidth {
+ 	struct hrtimer period_timer;
+ 	struct list_head throttled_cfs_rq;
+ 
+	/* statistics */
+	int nr_periods, nr_throttled;
+	u64 throttled_time;
+ #endif
+ };
+ 
+@@ -399,6 +402,7 @@ struct cfs_rq {
+ 	u64 runtime_expires;
+ 	s64 runtime_remaining;
+ 
+	u64 throttled_timestamp;
+ 	int throttled, throttle_count;
+ 	struct list_head throttled_list;
+ #endif
+@@ -9383,6 +9387,19 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
+ 
+ 	return ret;
+ }
+
+static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
+		struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+
+	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
+	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
+	cb->fill(cb, "throttled_time", cfs_b->throttled_time);
+
+	return 0;
+}
+ #endif /* CONFIG_CFS_BANDWIDTH */
+ #endif /* CONFIG_FAIR_GROUP_SCHED */
+ 
+@@ -9429,6 +9446,10 @@ static struct cftype cpu_files[] = {
+ 		.read_u64 = cpu_cfs_period_read_u64,
+ 		.write_u64 = cpu_cfs_period_write_u64,
+ 	},
+	{
+		.name = "stat",
+		.read_map = cpu_stats_show,
+	},
+ #endif
+ #ifdef CONFIG_RT_GROUP_SCHED
+ 	{
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 831a300..2060fc9 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1528,6 +1528,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+ 		rq->nr_running -= task_delta;
+ 
+ 	cfs_rq->throttled = 1;
+	cfs_rq->throttled_timestamp = rq->clock;
+ 	raw_spin_lock(&cfs_b->lock);
+ 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ 	raw_spin_unlock(&cfs_b->lock);
+@@ -1545,8 +1546,10 @@ static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+ 
+ 	cfs_rq->throttled = 0;
+ 	raw_spin_lock(&cfs_b->lock);
+	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
+ 	list_del_rcu(&cfs_rq->throttled_list);
+ 	raw_spin_unlock(&cfs_b->lock);
+	cfs_rq->throttled_timestamp = 0;
+ 
+ 	update_rq_clock(rq);
+ 	/* update hierarchical throttle state */
+@@ -1634,6 +1637,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+ 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+ 	/* idle depends on !throttled (for the case of a large deficit) */
+ 	idle = cfs_b->idle && !throttled;
+	cfs_b->nr_periods += overrun;
+ 
+ 	/* if we're going inactive then everything else can be deferred */
+ 	if (idle)
+@@ -1647,6 +1651,9 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+ 		goto out_unlock;
+ 	}
+ 
+	/* account preceding periods in which throttling occurred */
+	cfs_b->nr_throttled += overrun;
+
+ 	/*
+ 	 * There are throttled entities so we must first use the new bandwidth
+ 	 * to unthrottle them before making it generally available.  This
+-- 
+1.7.0.4
+
@@ -0,0 +1,252 @@
+From 9baa7b654e1527bfec8f413f7372de6c4aeebb6a Mon Sep 17 00:00:00 2001
+From: Paul Turner <pjt@google.com>
+Date: Thu, 21 Jul 2011 09:43:41 -0700
+Subject: [PATCH 16/19] sched: Return unused runtime on group dequeue
+
+When a local cfs_rq blocks we return the majority of its remaining quota to the
+global bandwidth pool for use by other runqueues.
+
+We do this only when the quota is current and there is more than
+min_cfs_rq_quota [1ms by default] of runtime remaining on the rq.
+
+In the case where there are throttled runqueues and we have sufficient
+bandwidth to meter out a slice, a second timer is kicked off to handle this
+delivery, unthrottling where appropriate.
+
+Using a 'worst case' antagonist which executes on each cpu
+for 1ms before moving onto the next on a fairly large machine:
+
+no quota generations:
+
+ 197.47 ms       /cgroup/a/cpuacct.usage
+ 199.46 ms       /cgroup/a/cpuacct.usage
+ 205.46 ms       /cgroup/a/cpuacct.usage
+ 198.46 ms       /cgroup/a/cpuacct.usage
+ 208.39 ms       /cgroup/a/cpuacct.usage
+
+Since we are allowed to use "stale" quota our usage is effectively bounded by
+the rate of input into the global pool and performance is relatively stable.
+
+with quota generations [1s increments]:
+
+ 119.58 ms       /cgroup/a/cpuacct.usage
+ 119.65 ms       /cgroup/a/cpuacct.usage
+ 119.64 ms       /cgroup/a/cpuacct.usage
+ 119.63 ms       /cgroup/a/cpuacct.usage
+ 119.60 ms       /cgroup/a/cpuacct.usage
+
+The large deficit here is due to quota generations (/intentionally/) preventing
+us from now using previously stranded slack quota.  The cost is that this quota
+becomes unavailable.
+
+with quota generations and quota return:
+
+ 200.09 ms       /cgroup/a/cpuacct.usage
+ 200.09 ms       /cgroup/a/cpuacct.usage
+ 198.09 ms       /cgroup/a/cpuacct.usage
+ 200.09 ms       /cgroup/a/cpuacct.usage
+ 200.06 ms       /cgroup/a/cpuacct.usage
+
+By returning unused quota we're able to both stably consume our desired quota
+and prevent unintentional overages due to the abuse of slack quota from
+previous quota periods (especially on a large machine).
+
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184758.306848658@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ kernel/sched.c      |   15 +++++++-
+ kernel/sched_fair.c |  108 +++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 122 insertions(+), 1 deletions(-)
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 08d3aa0..8be4ca2 100644
+--- a/kernel/sched.c
+++ b/kernel/sched.c
+@@ -256,7 +256,7 @@ struct cfs_bandwidth {
+ 	u64 runtime_expires;
+ 
+ 	int idle, timer_active;
+-	struct hrtimer period_timer;
+	struct hrtimer period_timer, slack_timer;
+ 	struct list_head throttled_cfs_rq;
+ 
+ 	/* statistics */
+@@ -418,6 +418,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+ 
+ static inline u64 default_cfs_period(void);
+ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+	struct cfs_bandwidth *cfs_b =
+		container_of(timer, struct cfs_bandwidth, slack_timer);
+	do_sched_cfs_slack_timer(cfs_b);
+
+	return HRTIMER_NORESTART;
+}
+ 
+ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+ {
+@@ -450,6 +460,8 @@ static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ 	cfs_b->period_timer.function = sched_cfs_period_timer;
+	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ }
+ 
+ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+@@ -485,6 +497,7 @@ static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+ {
+ 	hrtimer_cancel(&cfs_b->period_timer);
+	hrtimer_cancel(&cfs_b->slack_timer);
+ }
+ #else
+ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 2060fc9..edf3b3e 100644
+--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
+@@ -1071,6 +1071,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+ 		__clear_buddies_skip(se);
+ }
+ 
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+
+ static void
+ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+ {
+@@ -1109,6 +1111,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+ 	if (!(flags & DEQUEUE_SLEEP))
+ 		se->vruntime -= cfs_rq->min_vruntime;
+ 
+	/* return excess runtime on last dequeue */
+	return_cfs_rq_runtime(cfs_rq);
+
+ 	update_min_vruntime(cfs_rq);
+ 	update_cfs_shares(cfs_rq);
+ }
+@@ -1696,6 +1701,108 @@ out_unlock:
+ 	return idle;
+ }
+ 
+/* a cfs_rq won't donate quota below this amount */
+static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
+/* minimum remaining period time to redistribute slack quota */
+static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
+/* how long we wait to gather additional slack before distributing */
+static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
+
+/* are we near the end of the current quota period? */
+static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
+{
+	struct hrtimer *refresh_timer = &cfs_b->period_timer;
+	u64 remaining;
+
+	/* if the call-back is running a quota refresh is already occurring */
+	if (hrtimer_callback_running(refresh_timer))
+		return 1;
+
+	/* is a quota refresh about to occur? */
+	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
+	if (remaining < min_expire)
+		return 1;
+
+	return 0;
+}
+
+static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
+
+	/* if there's a quota refresh soon don't bother with slack */
+	if (runtime_refresh_within(cfs_b, min_left))
+		return;
+
+	start_bandwidth_timer(&cfs_b->slack_timer,
+				ns_to_ktime(cfs_bandwidth_slack_period));
+}
+
+/* we know any runtime found here is valid as update_curr() precedes return */
+static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
+
+	if (slack_runtime <= 0)
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF &&
+	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
+		cfs_b->runtime += slack_runtime;
+
+		/* we are under rq->lock, defer unthrottling using a timer */
+		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
+		    !list_empty(&cfs_b->throttled_cfs_rq))
+			start_cfs_slack_bandwidth(cfs_b);
+	}
+	raw_spin_unlock(&cfs_b->lock);
+
+	/* even if it's not valid for return we don't want to try again */
+	cfs_rq->runtime_remaining -= slack_runtime;
+}
+
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+	if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+		return;
+
+	__return_cfs_rq_runtime(cfs_rq);
+}
+
+/*
+ * This is done with a timer (instead of inline with bandwidth return) since
+ * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
+ */
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
+{
+	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+	u64 expires;
+
+	/* confirm we're still not at a refresh boundary */
+	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+		return;
+
+	raw_spin_lock(&cfs_b->lock);
+	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+		runtime = cfs_b->runtime;
+		cfs_b->runtime = 0;
+	}
+	expires = cfs_b->runtime_expires;
+	raw_spin_unlock(&cfs_b->lock);
+
+	if (!runtime)
+		return;
+
+	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
+
+	raw_spin_lock(&cfs_b->lock);
+	if (expires == cfs_b->runtime_expires)
+		cfs_b->runtime = runtime;
+	raw_spin_unlock(&cfs_b->lock);
+}
+
+ /*
+  * When a group wakes up we want to make sure that its quota is not already
+  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
+@@ -1737,6 +1844,7 @@ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
+ 				     unsigned long delta_exec) {}
+ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+ 
+ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+ {
+-- 
+1.7.0.4
+
@@ -0,0 +1,148 @@
+From d5edecf4b5298b11f6d39e3029b7620ee06640e7 Mon Sep 17 00:00:00 2001
+From: Bharata B Rao <bharata@linux.vnet.ibm.com>
+Date: Thu, 21 Jul 2011 09:43:43 -0700
+Subject: [PATCH 17/19] sched: Add documentation for bandwidth control
+
+Basic description of usage and effect for CFS Bandwidth Control.
+
+Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
+Signed-off-by: Paul Turner <pjt@google.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Link: http://lkml.kernel.org/r/20110721184758.498036116@google.com
+Signed-off-by: Ingo Molnar <mingo@elte.hu>
+---
+ Documentation/scheduler/sched-bwc.txt |  122 +++++++++++++++++++++++++++++++++
+ 1 files changed, 122 insertions(+), 0 deletions(-)
+ create mode 100644 Documentation/scheduler/sched-bwc.txt
+
+diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
+new file mode 100644
+index 0000000..f6b1873
+--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
+@@ -0,0 +1,122 @@
+CFS Bandwidth Control
+=====================
+
+[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
+  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]
+
+CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
+specification of the maximum CPU bandwidth available to a group or hierarchy.
+
+The bandwidth allowed for a group is specified using a quota and period. Within
+each given "period" (microseconds), a group is allowed to consume only up to
+"quota" microseconds of CPU time.  When the CPU bandwidth consumption of a
+group exceeds this limit (for that period), the tasks belonging to its
+hierarchy will be throttled and are not allowed to run again until the next
+period.
+
+A group's unused runtime is globally tracked, being refreshed with quota units
+above at each period boundary.  As threads consume this bandwidth it is
+transferred to cpu-local "silos" on a demand basis.  The amount transferred
+within each of these updates is tunable and described as the "slice".
+
+Management
+----------
+Quota and period are managed within the cpu subsystem via cgroupfs.
+
+cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
+cpu.cfs_period_us: the length of a period (in microseconds)
+cpu.stat: exports throttling statistics [explained further below]
+
+The default values are:
+	cpu.cfs_period_us=100ms
+	cpu.cfs_quota_us=-1
+
+A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
+bandwidth restriction in place, such a group is described as an unconstrained
+bandwidth group.  This represents the traditional work-conserving behavior for
+CFS.
+
+Writing any (valid) positive value(s) will enact the specified bandwidth limit.
+The minimum value allowed for the quota or period is 1ms.  There is also an
+upper bound on the period length of 1s.  Additional restrictions exist when
+bandwidth limits are used in a hierarchical fashion, these are explained in
+more detail below.
+
+Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
+and return the group to an unconstrained state once more.
+
+Any updates to a group's bandwidth specification will result in it becoming
+unthrottled if it is in a constrained state.
+
+System wide settings
+--------------------
+For efficiency run-time is transferred between the global pool and CPU local
+"silos" in a batch fashion.  This greatly reduces global accounting pressure
+on large systems.  The amount transferred each time such an update is required
+is described as the "slice".
+
+This is tunable via procfs:
+	/proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)
+
+Larger slice values will reduce transfer overheads, while smaller values allow
+for more fine-grained consumption.
+
+Statistics
+----------
+A group's bandwidth statistics are exported via 3 fields in cpu.stat.
+
+cpu.stat:
+- nr_periods: Number of enforcement intervals that have elapsed.
+- nr_throttled: Number of times the group has been throttled/limited.
+- throttled_time: The total time duration (in nanoseconds) for which entities
+  of the group have been throttled.
+
+This interface is read-only.
+
+Hierarchical considerations
+---------------------------
+The interface enforces that an individual entity's bandwidth is always
+attainable, that is: max(c_i) <= C. However, over-subscription in the
+aggregate case is explicitly allowed to enable work-conserving semantics
+within a hierarchy.
+  e.g. \Sum (c_i) may exceed C
+[ Where C is the parent's bandwidth, and c_i its children ]
+
+
+There are two ways in which a group may become throttled:
+	a. it fully consumes its own quota within a period
+	b. a parent's quota is fully consumed within its period
+
+In case b) above, even though the child may have runtime remaining it will not
+be allowed to run until the parent's runtime is refreshed.
+
+Examples
+--------
+1. Limit a group to 1 CPU worth of runtime.
+
+	If period is 250ms and quota is also 250ms, the group will get
+	1 CPU worth of runtime every 250ms.
+
+	# echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
+	# echo 250000 > cpu.cfs_period_us /* period = 250ms */
+
+2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.
+
+	With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
+	runtime every 500ms.
+
+	# echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
+	# echo 500000 > cpu.cfs_period_us /* period = 500ms */
+
+	The larger period here allows for increased burst capacity.
+
+3. Limit a group to 20% of 1 CPU.
+
+	With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.
+
+	# echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
+	# echo 50000 > cpu.cfs_period_us /* period = 50ms */
+
+	By using a small period here we are ensuring a consistent latency
+	response at the expense of burst capacity.
+
+-- 
+1.7.0.4
+
@@ -0,0 +1,437 @@
+From 46394a392b85376e0c17a4f84e2468a0b62ec5b6 Mon Sep 17 00:00:00 2001
+From: Vimalkumar <j.vimal+nf@gmail.com>
+Date: Wed, 7 Sep 2011 14:17:32 -0700
+Subject: [PATCH 19/19] dctcp patch
+
+---
+ include/linux/sysctl.h     |    3 +
+ include/linux/tcp.h        |   10 +++
+ include/net/tcp.h          |    3 +
+ kernel/sysctl_binary.c     |    3 +
+ net/ipv4/sysctl_net_ipv4.c |   21 +++++
+ net/ipv4/tcp_input.c       |  182 ++++++++++++++++++++++++++++++++++++++++----
+ net/ipv4/tcp_output.c      |   19 +++++-
+ 7 files changed, 225 insertions(+), 16 deletions(-)
+
+diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
+index 11684d9..fd8c73a 100644
+--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
+@@ -425,6 +425,9 @@ enum
+ 	NET_TCP_ALLOWED_CONG_CONTROL=123,
+ 	NET_TCP_MAX_SSTHRESH=124,
+ 	NET_TCP_FRTO_RESPONSE=125,
+	NET_TCP_DELAYED_ACK=126,
+	NET_TCP_DCTCP_ENABLE=127,
+	NET_TCP_DCTCP_SHIFT_G=128,
+ };
+ 
+ enum {
+diff --git a/include/linux/tcp.h b/include/linux/tcp.h
+index e64f4c6..9d2ec1c 100644
+--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
+@@ -455,6 +455,16 @@ struct tcp_sock {
+ 	struct tcp_md5sig_info	*md5sig_info;
+ #endif
+ 
+/* DCTCP Specific Parameters */
+ 	u32	acked_bytes_ecn;
+ 	u32	acked_bytes_total;
+ 	u32	prior_ack;
+ 	u32	prior_rcv_nxt;
+ 	u32	dctcp_alpha;
+ 	u32	next_seq;
+ 	u32	ce_state;	/* 0: last pkt was non-ce , 1: last pkt was ce */
+ 	u32	delayed_ack_reserved;
+
+ 	/* When the cookie options are generated and exchanged, then this
+ 	 * object holds a reference to them (cookie_values->kref).  Also
+ 	 * contains related tcp_cookie_transactions fields.
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index cda30ea..b6b1480 100644
+--- a/include/net/tcp.h
+++ b/include/net/tcp.h
+@@ -223,6 +223,9 @@ extern int sysctl_tcp_max_orphans;
+ extern int sysctl_tcp_fack;
+ extern int sysctl_tcp_reordering;
+ extern int sysctl_tcp_ecn;
+extern int sysctl_tcp_delayed_ack;
+extern int sysctl_tcp_dctcp_enable;
+extern int sysctl_tcp_dctcp_shift_g;
+ extern int sysctl_tcp_dsack;
+ extern long sysctl_tcp_mem[3];
+ extern int sysctl_tcp_wmem[3];
+diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
+index 20dfc21..f232b5a 100644
+--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
+@@ -373,6 +373,9 @@ static const struct bin_table bin_net_ipv4_table[] = {
+ 	{ CTL_INT,	NET_TCP_FACK,				"tcp_fack" },
+ 	{ CTL_INT,	NET_TCP_REORDERING,			"tcp_reordering" },
+ 	{ CTL_INT,	NET_TCP_ECN,				"tcp_ecn" },
+	{ CTL_INT,	NET_TCP_DELAYED_ACK,			"tcp_delayed_ack" },
+	{ CTL_INT,	NET_TCP_DCTCP_ENABLE,			"tcp_dctcp_enable" },
+	{ CTL_INT,	NET_TCP_DCTCP_SHIFT_G,			"tcp_dctcp_shift_g" },
+ 	{ CTL_INT,	NET_TCP_DSACK,				"tcp_dsack" },
+ 	{ CTL_INT,	NET_TCP_MEM,				"tcp_mem" },
+ 	{ CTL_INT,	NET_TCP_WMEM,				"tcp_wmem" },
+diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
+index 57d0752..c896edf 100644
+--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
+@@ -440,6 +440,27 @@ static struct ctl_table ipv4_table[] = {
+ 		.proc_handler	= proc_dointvec
+ 	},
+ 	{
+		.procname	= "tcp_delayed_ack",
+		.data		= &sysctl_tcp_delayed_ack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_dctcp_enable",
+		.data		= &sysctl_tcp_dctcp_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+		.procname	= "tcp_dctcp_shift_g",
+		.data		= &sysctl_tcp_dctcp_shift_g,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+	{
+ 		.procname	= "tcp_dsack",
+ 		.data		= &sysctl_tcp_dsack,
+ 		.maxlen		= sizeof(int),
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index bef9f04..7b9829b 100644
+--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
+@@ -98,6 +98,13 @@ int sysctl_tcp_thin_dupack __read_mostly;
+ int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+ int sysctl_tcp_abc __read_mostly;
+ 
+int sysctl_tcp_delayed_ack __read_mostly = 1;
+EXPORT_SYMBOL(sysctl_tcp_delayed_ack);
+int sysctl_tcp_dctcp_enable __read_mostly;
+EXPORT_SYMBOL(sysctl_tcp_dctcp_enable);
+int sysctl_tcp_dctcp_shift_g  __read_mostly = 4; /* g=1/2^4 */
+EXPORT_SYMBOL(sysctl_tcp_dctcp_shift_g);
+
+ #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
+ #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
+ #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
+@@ -217,16 +224,70 @@ static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
+ 	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ }
+ 
+-static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
+static inline void TCP_ECN_dctcp_check_ce(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+ {
+ 	if (tp->ecn_flags & TCP_ECN_OK) {
+-		if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
+-			tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+-		/* Funny extension: if ECT is not set on a segment,
+-		 * it is surely retransmit. It is not in ECN RFC,
+-		 * but Linux follows this rule. */
+-		else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
+-			tcp_enter_quickack_mode((struct sock *)tp);
+	  u32 temp_rcv_nxt;
+
+	  if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) {
+
+	    /* rcv_nxt was already updated earlier in processing (tcp_rcv_established) */
+
+	    if(sysctl_tcp_dctcp_enable) {
+
+	      /* state has changed from CE=0 to CE=1 and the delayed ACK has not been sent yet */
+	      if(tp->ce_state == 0 && tp->delayed_ack_reserved) {
+
+		/* save current rcv_nxt */
+		temp_rcv_nxt = tp->rcv_nxt;
+		/* generate previous ack with CE=0 */
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+		tp->rcv_nxt = tp->prior_rcv_nxt;
+		/* printk("CE=0 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt);  */
+		tcp_send_ack(sk);
+		/* recover current rcv_nxt */
+		tp->rcv_nxt = temp_rcv_nxt;
+	      }
+	      
+	      tp->ce_state = 1;
+	    }
+
+	    tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+
+
+	    /* Funny extension: if ECT is not set on a segment,
+	     * it is surely retransmit. It is not in ECN RFC,
+	     * but Linux follows this rule. */
+	  } else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) {
+	    tcp_enter_quickack_mode((struct sock *)tp);
+	  }else {
+	    /* It has ECT but it doesn't have CE */
+	    
+	    if(sysctl_tcp_dctcp_enable) {
+	      
+	      if(tp->ce_state != 0 && tp->delayed_ack_reserved) {
+		
+		/* save current rcv_nxt */
+		temp_rcv_nxt = tp->rcv_nxt;
+		/* generate previous ack with CE=1 */
+		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+		tp->rcv_nxt = tp->prior_rcv_nxt;
+		/* printk("CE=1 rcv_nxt= %u nxt= %u\n",tp->rcv_nxt, temp_rcv_nxt);  */
+		tcp_send_ack(sk);
+		/* recover current rcv_nxt */
+		tp->rcv_nxt = temp_rcv_nxt;
+	      }
+
+	      tp->ce_state = 0;
+
+	      /* deassert only when DCTCP is enabled */
+	      tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+	    }
+
+	  }
+	    
+	  /* save the current rcv_nxt as prior_rcv_nxt */
+	  tp->prior_rcv_nxt = tp->rcv_nxt;
+ 	}
+ }
+ 
+@@ -581,6 +642,8 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+ 		 */
+ 		tcp_incr_quickack(sk);
+ 		icsk->icsk_ack.ato = TCP_ATO_MIN;
+
+		tp->ce_state = 0;
+ 	} else {
+ 		int m = now - icsk->icsk_ack.lrcvtime;
+ 
+@@ -601,7 +664,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
+ 	}
+ 	icsk->icsk_ack.lrcvtime = now;
+ 
+-	TCP_ECN_check_ce(tp, skb);
+	TCP_ECN_dctcp_check_ce(sk, tp, skb);
+ 
+ 	if (skb->len >= 128)
+ 		tcp_grow_window(sk, skb);
+@@ -827,19 +890,54 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	const struct inet_connection_sock *icsk = inet_csk(sk);
+ 
+	__u32 ssthresh_old; 
+	__u32 cwnd_old;
+	__u32 cwnd_new;
+
+ 	tp->prior_ssthresh = 0;
+ 	tp->bytes_acked = 0;
+ 	if (icsk->icsk_ca_state < TCP_CA_CWR) {
+ 		tp->undo_marker = 0;
+-		if (set_ssthresh)
+-			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+-		tp->snd_cwnd = min(tp->snd_cwnd,
+-				   tcp_packets_in_flight(tp) + 1U);
+
+		if(!sysctl_tcp_dctcp_enable) {
+
+		  if (set_ssthresh)
+		    tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+
+		  tp->snd_cwnd = min(tp->snd_cwnd,
+				     tcp_packets_in_flight(tp) + 1U);
+		  
+		}else {
+
+		  cwnd_new = max (tp->snd_cwnd - ((tp->snd_cwnd * tp->dctcp_alpha)>>11) , 2U);
+
+		  if(set_ssthresh) {
+		    
+		    ssthresh_old = tp->snd_ssthresh;
+		    tp->snd_ssthresh =  cwnd_new;
+		    
+		    /* printk("%llu alpha= %d ssth old= %d new= %d\n", */
+		    /* 		    			   ktime_to_us(ktime_get_real()), */
+		    /* 		    			   tp->dctcp_alpha, */
+		    /* 		    			   ssthresh_old, */
+		    /* 		    			   tp->snd_ssthresh); */
+		  }
+		  
+		  cwnd_old = tp->snd_cwnd;
+		  tp->snd_cwnd = cwnd_new;
+		  
+		  /* printk("%llu alpha= %d cwnd old= %d new= %d\n", */
+		  /* 		  			 ktime_to_us(ktime_get_real()), */
+		  /* 		  			 tp->dctcp_alpha, */
+		  /* 		  			 cwnd_old, */
+		  /* 		  			 tp->snd_cwnd); */
+		}
+		
+ 		tp->snd_cwnd_cnt = 0;
+ 		tp->high_seq = tp->snd_nxt;
+ 		tp->snd_cwnd_stamp = tcp_time_stamp;
+ 		TCP_ECN_queue_cwr(tp);
+-
+		
+ 		tcp_set_ca_state(sk, TCP_CA_CWR);
+ 	}
+ }
+@@ -2859,6 +2957,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
+ 		tcp_try_keep_open(sk);
+ 		tcp_moderate_cwnd(tp);
+ 	} else {
+	  if(!sysctl_tcp_dctcp_enable)
+ 		tcp_cwnd_down(sk, flag);
+ 	}
+ }
+@@ -3624,6 +3723,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+ 	int prior_packets;
+ 	int frto_cwnd = 0;
+ 
+	__u32 alpha_old;
+	__u32 acked_bytes;
+
+ 	/* If the ack is older than previous acks
+ 	 * then we can probably ignore it.
+ 	 */
+@@ -3680,6 +3782,54 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+ 		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
+ 	}
+ 
+	/* START: DCTCP Processing */
+
+	/* calc acked bytes */
+	if(after(ack,tp->prior_ack)) {
+	  acked_bytes = ack - tp->prior_ack;
+	} else {
+	  
+	  if(flag & FLAG_WIN_UPDATE) {
+	    /* Don't count when it is Window Updated ACK */
+	    acked_bytes = 0; 
+	    /* printk("acked_byte=0\n"); */
+	  }else {
+	    /* Count a duplicate ACK (e.g. for a retransmitted packet) as one MSS */
+	    acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
+	  }
+	}
+
+	if(flag & FLAG_ECE) 
+	  tp->acked_bytes_ecn += acked_bytes;
+
+	tp->acked_bytes_total += acked_bytes;
+
+	tp->prior_ack = ack;
+
+	/* Expired RTT */
+        if (!before(tp->snd_una,tp->next_seq)) {
+
+	  /* Avoid a zero denominator in the division below */
+	  if(tp->acked_bytes_total == 0) tp->acked_bytes_total = 1;
+
+          alpha_old = tp->dctcp_alpha; 
+
+	  /* alpha = (1-g) * alpha + g * F */
+	  tp->dctcp_alpha = alpha_old - (alpha_old >> sysctl_tcp_dctcp_shift_g)
+	    + (tp->acked_bytes_ecn << (10 - sysctl_tcp_dctcp_shift_g)) / tp->acked_bytes_total;  
+	  
+	  if(tp->dctcp_alpha > 1024) tp->dctcp_alpha = 1024; /* round to 0-1024 */
+
+          /* printk("bytes_ecn= %d total= %d alpha: old= %d new= %d\n", */
+	  /* 	  		 tp->acked_bytes_ecn, tp->acked_bytes_total, alpha_old, tp->dctcp_alpha); */
+	  
+	  tp->acked_bytes_ecn = 0;
+	  tp->acked_bytes_total = 0;
+	  tp->next_seq = tp->snd_nxt;
+        }
+
+	/* END: DCTCP Processing */
+
+ 	/* We passed data and got it acked, remove any soft error
+ 	 * log. Something worked...
+ 	 */
+@@ -4480,7 +4630,7 @@ drop:
+ 		goto queue_and_out;
+ 	}
+ 
+-	TCP_ECN_check_ce(tp, skb);
+	TCP_ECN_dctcp_check_ce(sk, tp, skb);
+ 
+ 	if (tcp_try_rmem_schedule(sk, skb->truesize))
+ 		goto drop;
+@@ -4931,6 +5081,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+ 	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ 	    /* We ACK each frame or... */
+ 	    tcp_in_quickack_mode(sk) ||
+	    /* Delayed ACK is disabled or ... */
+	    sysctl_tcp_delayed_ack == 0 ||
+ 	    /* We have out of order data. */
+ 	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
+ 		/* Then ack it now */
+diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
+index 882e0b0..2a4d1dc 100644
+--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
+@@ -308,7 +308,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 
+ 	tp->ecn_flags = 0;
+-	if (sysctl_tcp_ecn == 1) {
+	if (sysctl_tcp_ecn == 1 || sysctl_tcp_dctcp_enable) {
+ 		TCP_SKB_CB(skb)->flags |= TCPHDR_ECE | TCPHDR_CWR;
+ 		tp->ecn_flags = TCP_ECN_OK;
+ 	}
+@@ -878,6 +878,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ 	if (likely((tcb->flags & TCPHDR_SYN) == 0))
+ 		TCP_ECN_send(sk, skb, tcp_header_size);
+ 
+	/* In DCTCP, set the ECT bit on all packets */
+	if(sysctl_tcp_dctcp_enable)
+		INET_ECN_xmit(sk);
+
+ #ifdef CONFIG_TCP_MD5SIG
+ 	/* Calculate the MD5 hash, as we have all we need now */
+ 	if (md5) {
+@@ -2624,6 +2628,11 @@ int tcp_connect(struct sock *sk)
+ 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+ 	TCP_ECN_send_syn(sk, buff);
+ 
+	/* Initialize DCTCP internal parameters */
+	tp->next_seq = tp->snd_nxt;
+	tp->acked_bytes_ecn = 0;
+	tp->acked_bytes_total = 0;
+
+ 	/* Send it off. */
+ 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+ 	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
+@@ -2660,6 +2669,10 @@ void tcp_send_delayed_ack(struct sock *sk)
+ 	int ato = icsk->icsk_ack.ato;
+ 	unsigned long timeout;
+ 
+	/* Delayed ACK reserved flag for DCTCP */
+	struct tcp_sock *tp = tcp_sk(sk);
+	tp->delayed_ack_reserved = 1;
+
+ 	if (ato > TCP_DELACK_MIN) {
+ 		const struct tcp_sock *tp = tcp_sk(sk);
+ 		int max_ato = HZ / 2;
+@@ -2711,6 +2724,10 @@ void tcp_send_ack(struct sock *sk)
+ {
+ 	struct sk_buff *buff;
+ 
+	/* Delayed ACK reserved flag for DCTCP */
+	struct tcp_sock *tp = tcp_sk(sk);
+	tp->delayed_ack_reserved = 0;
+
+ 	/* If we have been reset, we may not send again. */
+ 	if (sk->sk_state == TCP_CLOSE)
+ 		return;
+-- 
+1.7.0.4
+
Author	SHA1	Message	Date
Brandon Heller	6d06ab3591	Add .config	2012-06-03 00:49:20 -07:00
Brandon Heller	9977ca16bb	Fix build error w/.config not there	2012-06-03 05:34:58 +00:00
Brandon Heller	1631044f7b	Add 3.0 kernel build files	2012-06-03 03:51:08 +00:00