sched: RT-balance, only adjust overload state when changing
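
The diff below moves the RT-overload bookkeeping from the old global rt_overload_mask/rto_count into the per-root-domain rd->rto_mask/rd->rto_count, and makes update_rt_migration() touch that shared state only when rq->rt.overloaded actually changes, rather than re-arming it on every call. A minimal userspace sketch of that edge-triggered pattern follows; the types are simplified stand-ins for struct rq and struct root_domain (and the rt.* counters are flattened onto rq), not the kernel's definitions:

    #include <stdatomic.h>
    #include <stdio.h>

    /* Simplified stand-ins for the kernel's structures; illustration only. */
    struct root_domain {
    	atomic_int rto_count;	/* number of overloaded runqueues        */
    	unsigned long rto_mask;	/* one bit per overloaded CPU (toy mask) */
    };

    struct rq {
    	int cpu;
    	int rt_nr_running;	/* runnable RT tasks on this rq          */
    	int rt_nr_migratory;	/* of those, how many may migrate        */
    	int overloaded;		/* cached state, set/cleared on change   */
    	struct root_domain *rd;
    };

    static void rt_set_overload(struct rq *rq)
    {
    	rq->rd->rto_mask |= 1UL << rq->cpu;
    	/* the kernel issues a wmb() between these two steps */
    	atomic_fetch_add(&rq->rd->rto_count, 1);
    }

    static void rt_clear_overload(struct rq *rq)
    {
    	atomic_fetch_sub(&rq->rd->rto_count, 1);
    	rq->rd->rto_mask &= ~(1UL << rq->cpu);
    }

    /* Touch the shared root-domain state only on an actual transition. */
    static void update_rt_migration(struct rq *rq)
    {
    	if (rq->rt_nr_migratory && rq->rt_nr_running > 1) {
    		if (!rq->overloaded) {
    			rt_set_overload(rq);
    			rq->overloaded = 1;
    		}
    	} else if (rq->overloaded) {
    		rt_clear_overload(rq);
    		rq->overloaded = 0;
    	}
    }

    int main(void)
    {
    	struct root_domain rd;
    	struct rq rq = { .cpu = 1, .rt_nr_running = 0, .rt_nr_migratory = 0,
    			 .overloaded = 0, .rd = &rd };

    	atomic_init(&rd.rto_count, 0);
    	rd.rto_mask = 0;

    	rq.rt_nr_running = 2;
    	rq.rt_nr_migratory = 1;
    	update_rt_migration(&rq);	/* transition: becomes overloaded */
    	update_rt_migration(&rq);	/* no-op: state did not change    */
    	printf("rto_count=%d\n", atomic_load(&rd.rto_count));	/* 1 */

    	rq.rt_nr_running = 1;
    	update_rt_migration(&rq);	/* transition: overload cleared   */
    	printf("rto_count=%d\n", atomic_load(&rd.rto_count));	/* 0 */
    	return 0;
    }
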
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index deff0c77d7059909f1744629e98980e5662e2768..a386758ffebb14c701a3cca37cdb9962162c71c9 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -5,22 +5,14 @@
 
 #ifdef CONFIG_SMP
 
-/*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- */
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-
-static inline int rt_overloaded(void)
+static inline int rt_overloaded(struct rq *rq)
 {
-       return atomic_read(&rto_count);
+       return atomic_read(&rq->rd->rto_count);
 }
 
 static inline void rt_set_overload(struct rq *rq)
 {
-       rq->rt.overloaded = 1;
-       cpu_set(rq->cpu, rt_overload_mask);
+       cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
@@ -29,23 +21,27 @@ static inline void rt_set_overload(struct rq *rq)
         * updated yet.
         */
        wmb();
-       atomic_inc(&rto_count);
+       atomic_inc(&rq->rd->rto_count);
 }
 
 static inline void rt_clear_overload(struct rq *rq)
 {
        /* the order here really doesn't matter */
-       atomic_dec(&rto_count);
-       cpu_clear(rq->cpu, rt_overload_mask);
-       rq->rt.overloaded = 0;
+       atomic_dec(&rq->rd->rto_count);
+       cpu_clear(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
 {
-       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
-               rt_set_overload(rq);
-       else
+       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+               if (!rq->rt.overloaded) {
+                       rt_set_overload(rq);
+                       rq->rt.overloaded = 1;
+               }
+       } else if (rq->rt.overloaded) {
                rt_clear_overload(rq);
+               rq->rt.overloaded = 0;
+       }
 }
 #endif /* CONFIG_SMP */
 
@@ -253,8 +249,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
        struct list_head *queue;
        int idx;
 
-       assert_spin_locked(&rq->lock);
-
        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;
 
@@ -308,7 +302,7 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
        int       count       = 0;
        int       cpu;
 
-       cpus_and(*lowest_mask, cpu_online_map, task->cpus_allowed);
+       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 
        /*
         * Scan each rq for the lowest prio.
@@ -500,8 +494,6 @@ static int push_rt_task(struct rq *rq)
        int ret = 0;
        int paranoid = RT_MAX_TRIES;
 
-       assert_spin_locked(&rq->lock);
-
        if (!rq->rt.overloaded)
                return 0;
 
@@ -546,8 +538,6 @@ static int push_rt_task(struct rq *rq)
                goto out;
        }
 
-       assert_spin_locked(&lowest_rq->lock);
-
        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);
@@ -582,27 +572,16 @@ static void push_rt_tasks(struct rq *rq)
 
 static int pull_rt_task(struct rq *this_rq)
 {
-       struct task_struct *next;
-       struct task_struct *p;
+       int this_cpu = this_rq->cpu, ret = 0, cpu;
+       struct task_struct *p, *next;
        struct rq *src_rq;
-       int this_cpu = this_rq->cpu;
-       int cpu;
-       int ret = 0;
-
-       assert_spin_locked(&this_rq->lock);
 
-       /*
-        * If cpusets are used, and we have overlapping
-        * run queue cpusets, then this algorithm may not catch all.
-        * This is just the price you pay on trying to keep
-        * dirtying caches down on large SMP machines.
-        */
-       if (likely(!rt_overloaded()))
+       if (likely(!rt_overloaded(this_rq)))
                return 0;
 
        next = pick_next_task_rt(this_rq);
 
-       for_each_cpu_mask(cpu, rt_overload_mask) {
+       for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
 
@@ -616,23 +595,25 @@ static int pull_rt_task(struct rq *this_rq)
                        if (double_lock_balance(this_rq, src_rq)) {
                                /* unlocked our runqueue lock */
                                struct task_struct *old_next = next;
+
                                next = pick_next_task_rt(this_rq);
                                if (next != old_next)
                                        ret = 1;
                        }
-                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                       if (likely(src_rq->rt.rt_nr_running <= 1)) {
                                /*
                                 * Small chance that this_rq->curr changed
                                 * but it's really harmless here.
                                 */
                                rt_clear_overload(this_rq);
-                       else
+                       } else {
                                /*
                                 * Heh, the src_rq is now overloaded, since
                                 * we already have the src_rq lock, go straight
                                 * to pulling tasks from it.
                                 */
                                goto try_pulling;
+                       }
                        spin_unlock(&src_rq->lock);
                        continue;
                }
@@ -646,6 +627,7 @@ static int pull_rt_task(struct rq *this_rq)
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;
+
                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
@@ -682,7 +664,7 @@ static int pull_rt_task(struct rq *this_rq)
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
-                               goto bail;
+                               goto out;
 
                        ret = 1;
 
@@ -694,9 +676,7 @@ static int pull_rt_task(struct rq *this_rq)
                         * case there's an even higher prio task
                         * in another runqueue. (low likelyhood
                         * but possible)
-                        */
-
-                       /*
+                        *
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
@@ -704,23 +684,21 @@ static int pull_rt_task(struct rq *this_rq)
                        next = p;
 
                }
-bail:
+out:
                spin_unlock(&src_rq->lock);
        }
 
        return ret;
 }
 
-static void schedule_balance_rt(struct rq *rq,
-                               struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
        /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) &&
-           rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
 }
 
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
 {
        /*
         * If we have more than one rt_task queued, then
@@ -737,10 +715,9 @@ static void schedule_tail_balance_rt(struct rq *rq)
 }
 
 
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
-       if (unlikely(rt_task(p)) &&
-           !task_running(rq, p) &&
+       if (!task_running(rq, p) &&
            (p->prio >= rq->rt.highest_prio) &&
            rq->rt.overloaded)
                push_rt_tasks(rq);
@@ -791,11 +768,105 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
        p->nr_cpus_allowed = weight;
 }
 
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq)  do { } while (0)
-# define schedule_balance_rt(rq, prev) do { } while (0)
-# define wakeup_balance_rt(rq, p)      do { } while (0)
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       /*
+        * If there are other RT tasks then we will reschedule
+        * and the scheduling of the other RT tasks will handle
+        * the balancing. But if we are the last RT task
+        * we may need to handle the pulling of RT tasks
+        * now.
+        */
+       if (!rq->rt.rt_nr_running)
+               pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       int check_resched = 1;
+
+       /*
+        * If we are already running, then there's nothing
+        * that needs to be done. But if we are not running
+        * we may need to preempt the current running task.
+        * If that current running task is also an RT task
+        * then see if we can move to another run queue.
+        */
+       if (!running) {
+#ifdef CONFIG_SMP
+               if (rq->rt.overloaded && push_rt_task(rq) &&
+                   /* Don't resched if we changed runqueues */
+                   rq != task_rq(p))
+                       check_resched = 0;
+#endif /* CONFIG_SMP */
+               if (check_resched && p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+                           int oldprio, int running)
+{
+       if (running) {
+#ifdef CONFIG_SMP
+               /*
+                * If our priority decreases while running, we
+                * may need to pull tasks to this runqueue.
+                */
+               if (oldprio < p->prio)
+                       pull_rt_task(rq);
+               /*
+                * If there's a higher priority task waiting to run
+                * then reschedule.
+                */
+               if (p->prio > rq->rt.highest_prio)
+                       resched_task(p);
+#else
+               /* For UP simply resched on drop of prio */
+               if (oldprio < p->prio)
+                       resched_task(p);
 #endif /* CONFIG_SMP */
+       } else {
+               /*
+                * This task is not running, but if it is
+                * greater than the current running task
+                * then reschedule.
+                */
+               if (p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
@@ -848,8 +919,17 @@ const struct sched_class rt_sched_class = {
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
+       .join_domain            = join_domain_rt,
+       .leave_domain           = leave_domain_rt,
+       .pre_schedule           = pre_schedule_rt,
+       .post_schedule          = post_schedule_rt,
+       .task_wake_up           = task_wake_up_rt,
+       .switched_from          = switched_from_rt,
 #endif
 
        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
+
+       .prio_changed           = prio_changed_rt,
+       .switched_to            = switched_to_rt,
 };
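
For reference, the rq->rd fields used throughout this diff (rd->rto_mask, rd->rto_count, rd->online) come from the root-domain support added in kernel/sched.c by the same patch series. A rough sketch of that structure is below; since it is not part of this file, treat the exact layout as an assumption:

    /*
     * Assumed shape of the root domain (defined in kernel/sched.c, not in
     * this diff): one instance per exclusive cpuset, reached from every
     * runqueue via rq->rd.
     */
    struct root_domain {
    	atomic_t refcount;
    	cpumask_t span;		/* CPUs covered by this root domain      */
    	cpumask_t online;	/* members of span that are online       */
    	cpumask_t rto_mask;	/* CPUs with more than one runnable RT   */
    	atomic_t rto_count;	/* number of bits set in rto_mask        */
    };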