*/
#ifdef CONFIG_SMP
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-static inline int rt_overloaded(void)
-{
- return atomic_read(&rto_count);
-}
-static inline cpumask_t *rt_overload(void)
+
+static inline int rt_overloaded(struct rq *rq)
{
- return &rt_overload_mask;
+ return atomic_read(&rq->rd->rto_count);
}
+
static inline void rt_set_overload(struct rq *rq)
{
- cpu_set(rq->cpu, rt_overload_mask);
+ cpu_set(rq->cpu, rq->rd->rto_mask);
/*
* Make sure the mask is visible before we set
* the overload count. That is checked to determine
* updated yet.
*/
wmb();
- atomic_inc(&rto_count);
+ atomic_inc(&rq->rd->rto_count);
}
+
static inline void rt_clear_overload(struct rq *rq)
{
/* the order here really doesn't matter */
- atomic_dec(&rto_count);
- cpu_clear(rq->cpu, rt_overload_mask);
+ atomic_dec(&rq->rd->rto_count);
+ cpu_clear(rq->cpu, rq->rd->rto_mask);
}
static void update_rt_migration(struct rq *rq)
{
- if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
- rt_set_overload(rq);
- else
+ if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+ if (!rq->rt.overloaded) {
+ rt_set_overload(rq);
+ rq->rt.overloaded = 1;
+ }
+ } else if (rq->rt.overloaded) {
rt_clear_overload(rq);
+ rq->rt.overloaded = 0;
+ }
}
#endif /* CONFIG_SMP */
}
#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
static int select_task_rq_rt(struct task_struct *p, int sync)
{
+ struct rq *rq = task_rq(p);
+
+ /*
+ * If the current task is an RT task, then
+ * try to see if we can wake this RT task up on another
+ * runqueue. Otherwise simply start this RT task
+ * on its current runqueue.
+ *
+ * We want to avoid overloading runqueues. Even if
+ * the RT task is of higher priority than the current RT task.
+ * RT tasks behave differently than other tasks. If
+ * one gets preempted, we try to push it off to another queue.
+ * So trying to keep a preempting RT task on the same
+ * cache hot CPU will force the running RT task to
+ * a cold CPU. So we waste all the cache for the lower
+ * RT task in hopes of saving some of a RT task
+ * that is just being woken and probably will have
+ * cold cache anyway.
+ */
+ if (unlikely(rt_task(rq->curr)) &&
+ (p->nr_cpus_allowed > 1)) {
+ int cpu = find_lowest_rq(p);
+
+ return (cpu == -1) ? task_cpu(p) : cpu;
+ }
+
+ /*
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ */
return task_cpu(p);
}
#endif /* CONFIG_SMP */
}
/* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
- int cpu)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
struct rt_prio_array *array = &rq->rt.active;
struct task_struct *next;
struct list_head *queue;
int idx;
- assert_spin_locked(&rq->lock);
-
if (likely(rq->rt.rt_nr_running < 2))
return NULL;
if (queue->next->next != queue) {
/* same prio task */
- next = list_entry(queue->next->next, struct task_struct, run_list);
+ next = list_entry(queue->next->next, struct task_struct,
+ run_list);
if (pick_rt_task(rq, next, cpu))
goto out;
}
static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-/* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
- struct rq *this_rq)
+static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
{
- struct rq *lowest_rq = NULL;
- int cpu;
- int tries;
- cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+ int lowest_prio = -1;
+ int lowest_cpu = -1;
+ int count = 0;
+ int cpu;
- cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+ cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
- for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ /*
+ * Scan each rq for the lowest prio.
+ */
+ for_each_cpu_mask(cpu, *lowest_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ /* We look for lowest RT prio or non-rt CPU */
+ if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+ /*
+ * if we already found a low RT queue
+ * and now we found this non-rt queue
+ * clear the mask and set our bit.
+ * Otherwise just return the queue as is
+ * and the count==1 will cause the algorithm
+ * to use the first bit found.
+ */
+ if (lowest_cpu != -1) {
+ cpus_clear(*lowest_mask);
+ cpu_set(rq->cpu, *lowest_mask);
+ }
+ return 1;
+ }
+
+ /* no locking for now */
+ if ((rq->rt.highest_prio > task->prio)
+ && (rq->rt.highest_prio >= lowest_prio)) {
+ if (rq->rt.highest_prio > lowest_prio) {
+ /* new low - clear old data */
+ lowest_prio = rq->rt.highest_prio;
+ lowest_cpu = cpu;
+ count = 0;
+ }
+ count++;
+ } else
+ cpu_clear(cpu, *lowest_mask);
+ }
+
+ /*
+ * Clear out all the set bits that represent
+ * runqueues that were of higher prio than
+ * the lowest_prio.
+ */
+ if (lowest_cpu > 0) {
/*
- * Scan each rq for the lowest prio.
+ * Perhaps we could add another cpumask op to
+ * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+ * Then that could be optimized to use memset and such.
*/
- for_each_cpu_mask(cpu, *cpu_mask) {
- struct rq *rq = &per_cpu(runqueues, cpu);
+ for_each_cpu_mask(cpu, *lowest_mask) {
+ if (cpu >= lowest_cpu)
+ break;
+ cpu_clear(cpu, *lowest_mask);
+ }
+ }
- if (cpu == this_rq->cpu)
- continue;
+ return count;
+}
- /* We look for lowest RT prio or non-rt CPU */
- if (rq->rt.highest_prio >= MAX_RT_PRIO) {
- lowest_rq = rq;
- break;
- }
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+{
+ int first;
- /* no locking for now */
- if (rq->rt.highest_prio > task->prio &&
- (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
- lowest_rq = rq;
- }
+ /* "this_cpu" is cheaper to preempt than a remote processor */
+ if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+ return this_cpu;
+
+ first = first_cpu(*mask);
+ if (first != NR_CPUS)
+ return first;
+
+ return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+ struct sched_domain *sd;
+ cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+ int this_cpu = smp_processor_id();
+ int cpu = task_cpu(task);
+ int count = find_lowest_cpus(task, lowest_mask);
+
+ if (!count)
+ return -1; /* No targets found */
+
+ /*
+ * There is no sense in performing an optimal search if only one
+ * target is found.
+ */
+ if (count == 1)
+ return first_cpu(*lowest_mask);
+
+ /*
+ * At this point we have built a mask of cpus representing the
+ * lowest priority tasks in the system. Now we want to elect
+ * the best one based on our affinity and topology.
+ *
+ * We prioritize the last cpu that the task executed on since
+ * it is most likely cache-hot in that location.
+ */
+ if (cpu_isset(cpu, *lowest_mask))
+ return cpu;
+
+ /*
+ * Otherwise, we consult the sched_domains span maps to figure
+ * out which cpu is logically closest to our hot cache data.
+ */
+ if (this_cpu == cpu)
+ this_cpu = -1; /* Skip this_cpu opt if the same */
+
+ for_each_domain(cpu, sd) {
+ if (sd->flags & SD_WAKE_AFFINE) {
+ cpumask_t domain_mask;
+ int best_cpu;
+
+ cpus_and(domain_mask, sd->span, *lowest_mask);
+
+ best_cpu = pick_optimal_cpu(this_cpu,
+ &domain_mask);
+ if (best_cpu != -1)
+ return best_cpu;
}
+ }
+
+ /*
+ * And finally, if there were no matches within the domains
+ * just give the caller *something* to work with from the compatible
+ * locations.
+ */
+ return pick_optimal_cpu(this_cpu, lowest_mask);
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+ struct rq *lowest_rq = NULL;
+ int tries;
+ int cpu;
- if (!lowest_rq)
+ for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+ cpu = find_lowest_rq(task);
+
+ if ((cpu == -1) || (cpu == rq->cpu))
break;
+ lowest_rq = cpu_rq(cpu);
+
/* if the prio of this runqueue changed, try again */
- if (double_lock_balance(this_rq, lowest_rq)) {
+ if (double_lock_balance(rq, lowest_rq)) {
/*
* We had to unlock the run queue. In
* the mean time, task could have
* migrated already or had its affinity changed.
* Also make sure that it wasn't scheduled on its rq.
*/
- if (unlikely(task_rq(task) != this_rq ||
- !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
- task_running(this_rq, task) ||
+ if (unlikely(task_rq(task) != rq ||
+ !cpu_isset(lowest_rq->cpu,
+ task->cpus_allowed) ||
+ task_running(rq, task) ||
!task->se.on_rq)) {
+
spin_unlock(&lowest_rq->lock);
lowest_rq = NULL;
break;
int ret = 0;
int paranoid = RT_MAX_TRIES;
- assert_spin_locked(&rq->lock);
+ if (!rq->rt.overloaded)
+ return 0;
next_task = pick_next_highest_task_rt(rq, -1);
if (!next_task)
goto out;
}
- assert_spin_locked(&lowest_rq->lock);
-
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, lowest_rq->cpu);
activate_task(lowest_rq, next_task, 0);
static int pull_rt_task(struct rq *this_rq)
{
- struct task_struct *next;
- struct task_struct *p;
+ int this_cpu = this_rq->cpu, ret = 0, cpu;
+ struct task_struct *p, *next;
struct rq *src_rq;
- cpumask_t *rto_cpumask;
- int this_cpu = this_rq->cpu;
- int cpu;
- int ret = 0;
-
- assert_spin_locked(&this_rq->lock);
- /*
- * If cpusets are used, and we have overlapping
- * run queue cpusets, then this algorithm may not catch all.
- * This is just the price you pay on trying to keep
- * dirtying caches down on large SMP machines.
- */
- if (likely(!rt_overloaded()))
+ if (likely(!rt_overloaded(this_rq)))
return 0;
next = pick_next_task_rt(this_rq);
- rto_cpumask = rt_overload();
-
- for_each_cpu_mask(cpu, *rto_cpumask) {
+ for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
if (double_lock_balance(this_rq, src_rq)) {
/* unlocked our runqueue lock */
struct task_struct *old_next = next;
+
next = pick_next_task_rt(this_rq);
if (next != old_next)
ret = 1;
}
- if (likely(src_rq->rt.rt_nr_running <= 1))
+ if (likely(src_rq->rt.rt_nr_running <= 1)) {
/*
* Small chance that this_rq->curr changed
* but it's really harmless here.
*/
rt_clear_overload(this_rq);
- else
+ } else {
/*
* Heh, the src_rq is now overloaded, since
* we already have the src_rq lock, go straight
* to pulling tasks from it.
*/
goto try_pulling;
+ }
spin_unlock(&src_rq->lock);
continue;
}
*/
if (double_lock_balance(this_rq, src_rq)) {
struct task_struct *old_next = next;
+
next = pick_next_task_rt(this_rq);
if (next != old_next)
ret = 1;
*/
if (p->prio < src_rq->curr->prio ||
(next && next->prio < src_rq->curr->prio))
- goto bail;
+ goto out;
ret = 1;
* case there's an even higher prio task
* in another runqueue. (low likelyhood
* but possible)
- */
-
- /*
+ *
* Update next so that we won't pick a task
* on another cpu with a priority lower (or equal)
* than the one we just picked.
next = p;
}
- bail:
+ out:
spin_unlock(&src_rq->lock);
}
return ret;
}
-static void schedule_balance_rt(struct rq *rq,
- struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
- if (unlikely(rt_task(prev)) &&
- rq->rt.highest_prio > prev->prio)
+ if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
pull_rt_task(rq);
}
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
{
/*
* If we have more than one rt_task queued, then
* the lock was owned by prev, we need to release it
* first via finish_lock_switch and then reaquire it here.
*/
- if (unlikely(rq->rt.rt_nr_running > 1)) {
+ if (unlikely(rq->rt.overloaded)) {
spin_lock_irq(&rq->lock);
push_rt_tasks(rq);
spin_unlock_irq(&rq->lock);
}
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
- if (unlikely(rt_task(p)) &&
- !task_running(rq, p) &&
- (p->prio >= rq->curr->prio))
+ if (!task_running(rq, p) &&
+ (p->prio >= rq->rt.highest_prio) &&
+ rq->rt.overloaded)
push_rt_tasks(rq);
}
/* don't touch RT tasks */
return 0;
}
+
static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
{
int weight = cpus_weight(*new_mask);
if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
- if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+ if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
- else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+ } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
BUG_ON(!rq->rt.rt_nr_migratory);
rq->rt.rt_nr_migratory--;
}
p->cpus_allowed = *new_mask;
p->nr_cpus_allowed = weight;
}
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq) do { } while (0)
-# define schedule_balance_rt(rq, prev) do { } while (0)
-# define wakeup_balance_rt(rq, p) do { } while (0)
+
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+ if (rq->rt.overloaded)
+ rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ /*
+ * If there are other RT tasks then we will reschedule
+ * and the scheduling of the other RT tasks will handle
+ * the balancing. But if we are the last RT task
+ * we may need to handle the pulling of RT tasks
+ * now.
+ */
+ if (!rq->rt.rt_nr_running)
+ pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+ int running)
+{
+ int check_resched = 1;
+
+ /*
+ * If we are already running, then there's nothing
+ * that needs to be done. But if we are not running
+ * we may need to preempt the current running task.
+ * If that current running task is also an RT task
+ * then see if we can move to another run queue.
+ */
+ if (!running) {
+#ifdef CONFIG_SMP
+ if (rq->rt.overloaded && push_rt_task(rq) &&
+ /* Don't resched if we changed runqueues */
+ rq != task_rq(p))
+ check_resched = 0;
+#endif /* CONFIG_SMP */
+ if (check_resched && p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+{
+ if (running) {
+#ifdef CONFIG_SMP
+ /*
+ * If our priority decreases while running, we
+ * may need to pull tasks to this runqueue.
+ */
+ if (oldprio < p->prio)
+ pull_rt_task(rq);
+ /*
+ * If there's a higher priority task waiting to run
+ * then reschedule.
+ */
+ if (p->prio > rq->rt.highest_prio)
+ resched_task(p);
+#else
+ /* For UP simply resched on drop of prio */
+ if (oldprio < p->prio)
+ resched_task(p);
#endif /* CONFIG_SMP */
+ } else {
+ /*
+ * This task is not running, but if it is
+ * greater than the current running task
+ * then reschedule.
+ */
+ if (p->prio < rq->curr->prio)
+ resched_task(rq->curr);
+ }
+}
+
static void task_tick_rt(struct rq *rq, struct task_struct *p)
{
.load_balance = load_balance_rt,
.move_one_task = move_one_task_rt,
.set_cpus_allowed = set_cpus_allowed_rt,
+ .join_domain = join_domain_rt,
+ .leave_domain = leave_domain_rt,
+ .pre_schedule = pre_schedule_rt,
+ .post_schedule = post_schedule_rt,
+ .task_wake_up = task_wake_up_rt,
+ .switched_from = switched_from_rt,
#endif
.set_curr_task = set_curr_task_rt,
.task_tick = task_tick_rt,
+
+ .prio_changed = prio_changed_rt,
+ .switched_to = switched_to_rt,
};