* by Peter Williams
* 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ * Thomas Gleixner, Mike Kravetz
*/
#include <linux/mm.h>
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
- * Some helpers for converting nanosecond timing to jiffy resolution
+ * Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-#define JIFFIES_TO_NS(TIME) ((TIME) * (NSEC_PER_SEC / HZ))
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
*
* Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
* Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
- * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+ * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
*
* The weight assigned to a task group's schedulable entities on every
* cpu (task_group.se[a_cpu]->load.weight) is derived from the task
* tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
*
* Note: It's not necessary that each of a task's group schedulable
- * entity have the same weight on all CPUs. If the group
- * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
- * better distribution of weight could be:
+ * entity have the same weight on all CPUs. If the group
+ * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+ * better distribution of weight could be:
*
* tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
* tg_A->se[1]->load.weight = 1/2 * 2000 = 667
* Every task in system belong to this group at bootup.
*/
struct task_group init_task_group = {
- .se = init_sched_entity_p,
+ .se = init_sched_entity_p,
.cfs_rq = init_cfs_rq_p,
};
#ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GROUP_LOAD 2*NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
-#define MIN_GROUP_SHARES 2
+#define MIN_GROUP_SHARES 2
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
unsigned long rt_nr_migratory;
/* highest queued rt task prio */
int highest_prio;
+ int overloaded;
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ cpumask_t span;
+ cpumask_t online;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_t rto_mask;
+ atomic_t rto_count;
};
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif
+
/*
* This is the main, per-CPU runqueue data structure.
*
atomic_t nr_iowait;
#ifdef CONFIG_SMP
+ struct root_domain *rd;
struct sched_domain *sd;
/* For active balancing */
#endif
}
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class,
+ int oldprio, int running)
+{
+ if (prev_class != p->sched_class) {
+ if (prev_class->switched_from)
+ prev_class->switched_from(rq, p, running);
+ p->sched_class->switched_to(rq, p, running);
+ } else
+ p->sched_class->prio_changed(rq, p, oldprio, running);
+}
+
#ifdef CONFIG_SMP
/*
unsigned long flags;
long old_state;
struct rq *rq;
-#ifdef CONFIG_SMP
- int new_cpu;
-#endif
rq = task_rq_lock(p, &flags);
old_state = p->state;
if (unlikely(task_running(rq, p)))
goto out_activate;
- new_cpu = p->sched_class->select_task_rq(p, sync);
- if (new_cpu != cpu) {
- set_task_cpu(p, new_cpu);
+ cpu = p->sched_class->select_task_rq(p, sync);
+ if (cpu != orig_cpu) {
+ set_task_cpu(p, cpu);
task_rq_unlock(rq, &flags);
/* might preempt at this point */
rq = task_rq_lock(p, &flags);
}
}
}
-
#endif
-
out_activate:
#endif /* CONFIG_SMP */
schedstat_inc(p, se.nr_wakeups);
out_running:
p->state = TASK_RUNNING;
- wakeup_balance_rt(rq, p);
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_wake_up)
+ p->sched_class->task_wake_up(rq, p);
+#endif
out:
task_rq_unlock(rq, &flags);
p->se.wait_max = 0;
#endif
- INIT_LIST_HEAD(&p->run_list);
+ INIT_LIST_HEAD(&p->rt.run_list);
p->se.on_rq = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
inc_nr_running(p, rq);
}
check_preempt_curr(rq, p);
+#ifdef CONFIG_SMP
+ if (p->sched_class->task_wake_up)
+ p->sched_class->task_wake_up(rq, p);
+#endif
task_rq_unlock(rq, &flags);
}
prev_state = prev->state;
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
- schedule_tail_balance_rt(rq);
+#ifdef CONFIG_SMP
+ if (current->sched_class->post_schedule)
+ current->sched_class->post_schedule(rq);
+#endif
fire_sched_in_preempt_notifiers(current);
if (mm)
switch_count = &prev->nvcsw;
}
- schedule_balance_rt(rq, prev);
+#ifdef CONFIG_SMP
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
+#endif
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
+ const struct sched_class *prev_class = p->sched_class;
BUG_ON(prio < 0 || prio > MAX_PRIO);
if (on_rq) {
if (running)
p->sched_class->set_curr_task(rq);
+
enqueue_task(rq, p, 0);
- /*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
- */
- if (running) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else {
- check_preempt_curr(rq, p);
- }
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
}
task_rq_unlock(rq, &flags);
}
{
int retval, oldprio, oldpolicy = -1, on_rq, running;
unsigned long flags;
+ const struct sched_class *prev_class = p->sched_class;
struct rq *rq;
/* may grab non-irq protected spin_locks */
if (on_rq) {
if (running)
p->sched_class->set_curr_task(rq);
+
activate_task(rq, p, 0);
- /*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
- */
- if (running) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else {
- check_preempt_curr(rq, p);
- }
+
+ check_class_changed(rq, p, prev_class, oldprio, running);
}
__task_rq_unlock(rq);
spin_unlock_irqrestore(&p->pi_lock, flags);
} while (need_resched());
}
-int __sched cond_resched(void)
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+int __sched _cond_resched(void)
{
if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
system_state == SYSTEM_RUNNING) {
}
return 0;
}
-EXPORT_SYMBOL(cond_resched);
+EXPORT_SYMBOL(_cond_resched);
+#endif
/*
* cond_resched_lock() - if a reschedule is pending, drop the given lock,
if (p->sched_class->set_cpus_allowed)
p->sched_class->set_cpus_allowed(p, &new_mask);
else {
- p->cpus_allowed = new_mask;
+ p->cpus_allowed = new_mask;
p->nr_cpus_allowed = cpus_weight(new_mask);
}
case CPU_ONLINE_FROZEN:
/* Strictly unnecessary, as first user will wake it. */
wake_up_process(cpu_rq(cpu)->migration_thread);
+
+ /* Update our root-domain */
+ rq = cpu_rq(cpu);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ cpu_set(cpu, rq->rd->online);
+ }
+ spin_unlock_irqrestore(&rq->lock, flags);
break;
#ifdef CONFIG_HOTPLUG_CPU
}
spin_unlock_irq(&rq->lock);
break;
+
+ case CPU_DOWN_PREPARE:
+ /* Update our root-domain */
+ rq = cpu_rq(cpu);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->rd) {
+ BUG_ON(!cpu_isset(cpu, rq->rd->span));
+ cpu_clear(cpu, rq->rd->online);
+ }
+ spin_unlock_irqrestore(&rq->lock, flags);
+ break;
#endif
}
return NOTIFY_OK;
return 1;
}
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+ unsigned long flags;
+ const struct sched_class *class;
+
+ spin_lock_irqsave(&rq->lock, flags);
+
+ if (rq->rd) {
+ struct root_domain *old_rd = rq->rd;
+
+ for (class = sched_class_highest; class; class = class->next) {
+ if (class->leave_domain)
+ class->leave_domain(rq);
+ }
+
+ cpu_clear(rq->cpu, old_rd->span);
+ cpu_clear(rq->cpu, old_rd->online);
+
+ if (atomic_dec_and_test(&old_rd->refcount))
+ kfree(old_rd);
+ }
+
+ atomic_inc(&rd->refcount);
+ rq->rd = rd;
+
+ cpu_set(rq->cpu, rd->span);
+ if (cpu_isset(rq->cpu, cpu_online_map))
+ cpu_set(rq->cpu, rd->online);
+
+ for (class = sched_class_highest; class; class = class->next) {
+ if (class->join_domain)
+ class->join_domain(rq);
+ }
+
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void init_rootdomain(struct root_domain *rd)
+{
+ memset(rd, 0, sizeof(*rd));
+
+ cpus_clear(rd->span);
+ cpus_clear(rd->online);
+}
+
+static void init_defrootdomain(void)
+{
+ init_rootdomain(&def_root_domain);
+ atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+ struct root_domain *rd;
+
+ rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ if (!rd)
+ return NULL;
+
+ init_rootdomain(rd);
+
+ return rd;
+}
+
/*
- * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
sched_domain_debug(sd, cpu);
+ rq_attach_root(rq, rd);
rcu_assign_pointer(rq->sd, sd);
}
static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+ struct root_domain *rd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
int sd_allnodes = 0;
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
#endif
+ rd = alloc_rootdomain();
+ if (!rd) {
+ printk(KERN_WARNING "Cannot alloc root domain\n");
+ return -ENOMEM;
+ }
+
/*
* Set up domains for cpus specified by the cpu_map.
*/
#else
sd = &per_cpu(phys_domains, i);
#endif
- cpu_attach_domain(sd, i);
+ cpu_attach_domain(sd, rd, i);
}
return 0;
unregister_sched_domain_sysctl();
for_each_cpu_mask(i, *cpu_map)
- cpu_attach_domain(NULL, i);
+ cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
arch_destroy_sched_domains(cpu_map);
}
int highest_cpu = 0;
int i, j;
+#ifdef CONFIG_SMP
+ init_defrootdomain();
+#endif
+
for_each_possible_cpu(i) {
struct rt_prio_array *array;
struct rq *rq;
rq->cpu_load[j] = 0;
#ifdef CONFIG_SMP
rq->sd = NULL;
+ rq->rd = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
rq->migration_thread = NULL;
INIT_LIST_HEAD(&rq->migration_queue);
rq->rt.highest_prio = MAX_RT_PRIO;
+ rq->rt.overloaded = 0;
+ rq_attach_root(rq, &def_root_domain);
#endif
atomic_set(&rq->nr_iowait, 0);
for_each_cpu_mask(i, sdspan)
total_load += tg->cfs_rq[i]->load.weight;
- /* Nothing to do if this group has no load */
+ /* Nothing to do if this group has no load */
if (!total_load)
continue;