struct list_head queue[MAX_RT_PRIO];
};
-struct load_stat {
- struct load_weight load;
- u64 load_update_start, load_update_last;
- unsigned long delta_fair, delta_exec, delta_stat;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#include <linux/container.h>
+
+struct cfs_rq;
+
+/* task group related information */
+struct task_grp {
+ struct container_subsys_state css;
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
};
+/* Default task group's sched entity on each cpu */
+static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
+/* Default task group's cfs_rq on each cpu */
+static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
+
+static struct sched_entity *init_sched_entity_p[CONFIG_NR_CPUS];
+static struct cfs_rq *init_cfs_rq_p[CONFIG_NR_CPUS];
+
+/* Default task group.
+ * Every task in system belong to this group at bootup.
+ */
+static struct task_grp init_task_grp = {
+ .se = init_sched_entity_p,
+ .cfs_rq = init_cfs_rq_p,
+ };
+
+/* return group to which a task belongs */
+static inline struct task_grp *task_grp(struct task_struct *p)
+{
+ return container_of(task_subsys_state(p, cpu_subsys_id),
+ struct task_grp, css);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+ p->se.cfs_rq = task_grp(p)->cfs_rq[task_cpu(p)];
+ p->se.parent = task_grp(p)->se[task_cpu(p)];
+}
+
+#else
+
+static inline void set_task_cfs_rq(struct task_struct *p) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
- s64 fair_clock;
u64 exec_clock;
- s64 wait_runtime;
- u64 sleeper_bonus;
- unsigned long wait_runtime_overruns, wait_runtime_underruns;
+ u64 min_vruntime;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct rb_node *rb_load_balance_curr;
-#ifdef CONFIG_FAIR_GROUP_SCHED
/* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
struct sched_entity *curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
* list is used during load balance.
*/
struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+ struct task_grp *tg; /* group that "owns" this runqueue */
#endif
};
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
- struct load_stat ls; /* capture load from *all* tasks on this cpu */
+ struct load_weight load; /* capture load from *all* tasks on this cpu */
unsigned long nr_load_updates;
u64 nr_switches;
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
+/*
+ * Debugging: various feature bits
+ */
+enum {
+ SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
+ SCHED_FEAT_START_DEBIT = 2,
+ SCHED_FEAT_USE_TREE_AVG = 4,
+ SCHED_FEAT_APPROX_AVG = 8,
+};
+
+const_debug unsigned int sysctl_sched_features =
+ SCHED_FEAT_NEW_FAIR_SLEEPERS *1 |
+ SCHED_FEAT_START_DEBIT *1 |
+ SCHED_FEAT_USE_TREE_AVG *0 |
+ SCHED_FEAT_APPROX_AVG *0;
+
+#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
+
/*
* For kernel-internal use: high-speed (but slightly incorrect) per-cpu
* clock constructed from sched_clock():
return now;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Change a task's ->cfs_rq if it moves across CPUs */
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
- p->se.cfs_rq = &task_rq(p)->cfs;
-}
-#else
-static inline void set_task_cfs_rq(struct task_struct *p)
-{
-}
-#endif
-
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
}
#endif
-static u64 div64_likely32(u64 divident, unsigned long divisor)
-{
-#if BITS_PER_LONG == 32
- if (likely(divident <= 0xffffffffULL))
- return (u32)divident / divisor;
- do_div(divident, divisor);
-
- return divident;
-#else
- return divident / divisor;
-#endif
-}
-
#if BITS_PER_LONG == 32
# define WMULT_CONST (~0UL)
#else
return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}
-static void update_load_add(struct load_weight *lw, unsigned long inc)
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
- lw->inv_weight = 0;
}
-static void update_load_sub(struct load_weight *lw, unsigned long dec)
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
lw->weight -= dec;
- lw->inv_weight = 0;
}
/*
#define sched_class_highest (&rt_sched_class)
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
- if (rq->curr != rq->idle && ls->load.weight) {
- ls->delta_exec += ls->delta_stat;
- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
- ls->delta_stat = 0;
- }
-}
-
/*
* Update delta_exec, delta_fair fields for rq.
*
* delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
+ * total load (rq->load.weight) on the runqueue, while
* delta_exec advances at the same rate as wall-clock (provided
* cpu is not idle).
*
* runqueue over any given interval. This (smoothened) load is used
* during load balance.
*
- * This function is called /before/ updating rq->ls.load
+ * This function is called /before/ updating rq->load
* and when switching tasks.
*/
-static void update_curr_load(struct rq *rq)
-{
- struct load_stat *ls = &rq->ls;
- u64 start;
-
- start = ls->load_update_start;
- ls->load_update_start = rq->clock;
- ls->delta_stat += rq->clock - start;
- /*
- * Stagger updates to ls->delta_fair. Very frequent updates
- * can be expensive.
- */
- if (ls->delta_stat)
- __update_curr_load(rq, ls);
-}
-
static inline void inc_load(struct rq *rq, const struct task_struct *p)
{
- update_curr_load(rq);
- update_load_add(&rq->ls.load, p->se.load.weight);
+ update_load_add(&rq->load, p->se.load.weight);
}
static inline void dec_load(struct rq *rq, const struct task_struct *p)
{
- update_curr_load(rq);
- update_load_sub(&rq->ls.load, p->se.load.weight);
+ update_load_sub(&rq->load, p->se.load.weight);
}
static void inc_nr_running(struct task_struct *p, struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
- p->se.wait_runtime = 0;
-
if (task_has_rt_policy(p)) {
p->se.load.weight = prio_to_weight[0] * 2;
p->se.load.inv_weight = prio_to_wmult[0] >> 1;
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return cpu_rq(cpu)->ls.load.weight;
+ return cpu_rq(cpu)->load.weight;
}
static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_SMP
task_thread_info(p)->cpu = cpu;
- set_task_cfs_rq(p);
#endif
+ set_task_cfs_rq(p);
}
#ifdef CONFIG_SMP
{
int old_cpu = task_cpu(p);
struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
- u64 clock_offset, fair_clock_offset;
+ u64 clock_offset;
clock_offset = old_rq->clock - new_rq->clock;
- fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
-
- if (p->se.wait_start_fair)
- p->se.wait_start_fair -= fair_clock_offset;
- if (p->se.sleep_start_fair)
- p->se.sleep_start_fair -= fair_clock_offset;
#ifdef CONFIG_SCHEDSTATS
if (p->se.wait_start)
if (p->se.block_start)
p->se.block_start -= clock_offset;
#endif
+ if (likely(new_rq->cfs.min_vruntime))
+ p->se.vruntime -= old_rq->cfs.min_vruntime -
+ new_rq->cfs.min_vruntime;
__set_task_cpu(p, new_cpu);
}
*/
static void __sched_fork(struct task_struct *p)
{
- p->se.wait_start_fair = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
- p->se.wait_runtime = 0;
- p->se.sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
- p->se.sum_wait_runtime = 0;
p->se.sum_sleep_runtime = 0;
p->se.sleep_start = 0;
p->se.block_start = 0;
p->se.exec_max = 0;
p->se.slice_max = 0;
p->se.wait_max = 0;
- p->se.wait_runtime_overruns = 0;
- p->se.wait_runtime_underruns = 0;
#endif
INIT_LIST_HEAD(&p->run_list);
*/
static void update_cpu_load(struct rq *this_rq)
{
- unsigned long total_load = this_rq->ls.load.weight;
- unsigned long this_load = total_load;
+ unsigned long this_load = this_rq->load.weight;
int i, scale;
this_rq->nr_load_updates++;
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
+ int oldprio, on_rq, running;
unsigned long flags;
- int oldprio, on_rq;
struct rq *rq;
BUG_ON(prio < 0 || prio > MAX_PRIO);
oldprio = p->prio;
on_rq = p->se.on_rq;
- if (on_rq)
+ running = task_running(rq, p);
+ if (on_rq) {
dequeue_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+ }
if (rt_prio(prio))
p->sched_class = &rt_sched_class;
p->prio = prio;
if (on_rq) {
+ if (running)
+ p->sched_class->set_curr_task(rq);
enqueue_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_running(rq, p)) {
+ if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
int sched_setscheduler(struct task_struct *p, int policy,
struct sched_param *param)
{
- int retval, oldprio, oldpolicy = -1, on_rq;
+ int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
struct rq *rq;
}
update_rq_clock(rq);
on_rq = p->se.on_rq;
- if (on_rq)
+ running = task_running(rq, p);
+ if (on_rq) {
deactivate_task(rq, p, 0);
+ if (running)
+ p->sched_class->put_prev_task(rq, p);
+ }
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
if (on_rq) {
+ if (running)
+ p->sched_class->set_curr_task(rq);
activate_task(rq, p, 0);
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_running(rq, p)) {
+ if (running) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else {
static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
{
cfs_rq->tasks_timeline = RB_ROOT;
- cfs_rq->fair_clock = 1;
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
#endif
init_cfs_rq(&rq->cfs, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
- list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+ {
+ struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
+ struct sched_entity *se =
+ &per_cpu(init_sched_entity, i);
+
+ init_cfs_rq_p[i] = cfs_rq;
+ init_cfs_rq(cfs_rq, rq);
+ cfs_rq->tg = &init_task_grp;
+ list_add(&cfs_rq->leaf_cfs_rq_list,
+ &rq->leaf_cfs_rq_list);
+
+ init_sched_entity_p[i] = se;
+ se->cfs_rq = &rq->cfs;
+ se->my_q = cfs_rq;
+ se->load.weight = NICE_0_LOAD;
+ se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+ se->parent = NULL;
+ }
+ init_task_grp.shares = NICE_0_LOAD;
#endif
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
read_lock_irq(&tasklist_lock);
do_each_thread(g, p) {
- p->se.fair_key = 0;
- p->se.wait_runtime = 0;
p->se.exec_start = 0;
- p->se.wait_start_fair = 0;
- p->se.sleep_start_fair = 0;
#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
p->se.sleep_start = 0;
p->se.block_start = 0;
#endif
- task_rq(p)->cfs.fair_clock = 0;
task_rq(p)->clock = 0;
if (!rt_task(p)) {
}
#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* return corresponding task_grp object of a container */
+static inline struct task_grp *container_tg(struct container *cont)
+{
+ return container_of(container_subsys_state(cont, cpu_subsys_id),
+ struct task_grp, css);
+}
+
+/* allocate runqueue etc for a new task group */
+static struct container_subsys_state *
+sched_create_group(struct container_subsys *ss, struct container *cont)
+{
+ struct task_grp *tg;
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ if (!cont->parent) {
+ /* This is early initialization for the top container */
+ init_task_grp.css.container = cont;
+ return &init_task_grp.css;
+ }
+
+ /* we support only 1-level deep hierarchical scheduler atm */
+ if (cont->parent->parent)
+ return ERR_PTR(-EINVAL);
+
+ tg = kzalloc(sizeof(*tg), GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * num_possible_cpus(), GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * num_possible_cpus(), GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+
+ cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
+ cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
+ cpu_to_node(i));
+ if (!se)
+ goto err;
+
+ memset(cfs_rq, 0, sizeof(struct cfs_rq));
+ memset(se, 0, sizeof(struct sched_entity));
+
+ tg->cfs_rq[i] = cfs_rq;
+ init_cfs_rq(cfs_rq, rq);
+ cfs_rq->tg = tg;
+ list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+ tg->se[i] = se;
+ se->cfs_rq = &rq->cfs;
+ se->my_q = cfs_rq;
+ se->load.weight = NICE_0_LOAD;
+ se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
+ se->parent = NULL;
+ }
+
+ tg->shares = NICE_0_LOAD;
+
+ /* Bind the container to task_grp object we just created */
+ tg->css.container = cont;
+
+ return &tg->css;
+
+err:
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq && tg->cfs_rq[i])
+ kfree(tg->cfs_rq[i]);
+ if (tg->se && tg->se[i])
+ kfree(tg->se[i]);
+ }
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq);
+ if (tg->se)
+ kfree(tg->se);
+ if (tg)
+ kfree(tg);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+
+/* destroy runqueue etc associated with a task group */
+static void sched_destroy_group(struct container_subsys *ss,
+ struct container *cont)
+{
+ struct task_grp *tg = container_tg(cont);
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ for_each_possible_cpu(i) {
+ cfs_rq = tg->cfs_rq[i];
+ list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+ }
+
+ /* wait for possible concurrent references to cfs_rqs complete */
+ synchronize_sched();
+
+ /* now it should be safe to free those cfs_rqs */
+ for_each_possible_cpu(i) {
+ cfs_rq = tg->cfs_rq[i];
+ kfree(cfs_rq);
+
+ se = tg->se[i];
+ kfree(se);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+ kfree(tg);
+}
+
+static int sched_can_attach(struct container_subsys *ss,
+ struct container *cont, struct task_struct *tsk)
+{
+ /* We don't support RT-tasks being in separate groups */
+ if (tsk->sched_class != &fair_sched_class)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* change task's runqueue when it moves between groups */
+static void sched_move_task(struct container_subsys *ss, struct container *cont,
+ struct container *old_cont, struct task_struct *tsk)
+{
+ int on_rq, running;
+ unsigned long flags;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &flags);
+
+ if (tsk->sched_class != &fair_sched_class)
+ goto done;
+
+ update_rq_clock(rq);
+
+ running = task_running(rq, tsk);
+ on_rq = tsk->se.on_rq;
+
+ if (on_rq) {
+ dequeue_task(rq, tsk, 0);
+ if (unlikely(running))
+ tsk->sched_class->put_prev_task(rq, tsk);
+ }
+
+ set_task_cfs_rq(tsk);
+
+ if (on_rq) {
+ if (unlikely(running))
+ tsk->sched_class->set_curr_task(rq);
+ enqueue_task(rq, tsk, 0);
+ }
+
+done:
+ task_rq_unlock(rq, &flags);
+}
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
+ int on_rq;
+
+ spin_lock_irq(&rq->lock);
+
+ on_rq = se->on_rq;
+ if (on_rq)
+ dequeue_entity(cfs_rq, se, 0);
+
+ se->load.weight = shares;
+ se->load.inv_weight = div64_64((1ULL<<32), shares);
+
+ if (on_rq)
+ enqueue_entity(cfs_rq, se, 0);
+
+ spin_unlock_irq(&rq->lock);
+}
+
+static ssize_t cpu_shares_write(struct container *cont, struct cftype *cftype,
+ struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ int i;
+ unsigned long shareval;
+ struct task_grp *tg = container_tg(cont);
+ char buffer[2*sizeof(unsigned long) + 1];
+
+ if (nbytes > 2*sizeof(unsigned long)) /* safety check */
+ return -E2BIG;
+
+ if (copy_from_user(buffer, userbuf, nbytes))
+ return -EFAULT;
+
+ buffer[nbytes] = 0; /* nul-terminate */
+ shareval = simple_strtoul(buffer, NULL, 10);
+
+ tg->shares = shareval;
+ for_each_possible_cpu(i)
+ set_se_shares(tg->se[i], shareval);
+
+ return nbytes;
+}
+
+static u64 cpu_shares_read_uint(struct container *cont, struct cftype *cft)
+{
+ struct task_grp *tg = container_tg(cont);
+
+ return (u64) tg->shares;
+}
+
+struct cftype cpuctl_share = {
+ .name = "shares",
+ .read_uint = cpu_shares_read_uint,
+ .write = cpu_shares_write,
+};
+
+static int sched_populate(struct container_subsys *ss, struct container *cont)
+{
+ return container_add_file(cont, ss, &cpuctl_share);
+}
+
+struct container_subsys cpu_subsys = {
+ .name = "cpu",
+ .create = sched_create_group,
+ .destroy = sched_destroy_group,
+ .can_attach = sched_can_attach,
+ .attach = sched_move_task,
+ .populate = sched_populate,
+ .subsys_id = cpu_subsys_id,
+ .early_init = 1,
+};
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */