sched: RT-balance, only adjust overload state when changing
[linux-2.6-omap-h63xx.git] / kernel / sched_rt.c
index 9becc3710b609c6b6ae44ce9c413287503c80c49..a386758ffebb14c701a3cca37cdb9962162c71c9 100644
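
The first two hunks below move the RT overload bookkeeping off the global rt_overload_mask/rto_count pair and onto the per-root-domain rq->rd->rto_mask and rq->rd->rto_count, and update_rt_migration() now calls rt_set_overload()/rt_clear_overload() only when the runqueue's overload state actually changes, caching that state in rq->rt.overloaded. What follows is a stand-alone userspace C sketch of that transition-guarded update, not kernel code: the struct and field names are illustrative stand-ins for the ones in the diff, and the seq_cst atomic increment stands in for the kernel's cpu_set() + wmb() + atomic_inc() publication order.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the root-domain fields used in the diff. */
struct rd_model {
	atomic_int rto_count;		/* number of overloaded runqueues */
	unsigned long rto_mask;		/* one bit per overloaded CPU */
};

struct rq_model {
	struct rd_model *rd;
	int cpu;
	int rt_nr_running;		/* queued RT tasks */
	int rt_nr_migratory;		/* of those, how many may migrate */
	bool overloaded;		/* cached state; toggled only on transitions */
};

static void update_rt_migration_model(struct rq_model *rq)
{
	if (rq->rt_nr_migratory && rq->rt_nr_running > 1) {
		if (!rq->overloaded) {
			/* 0 -> 1: publish the mask bit before the count. */
			rq->rd->rto_mask |= 1UL << rq->cpu;
			atomic_fetch_add(&rq->rd->rto_count, 1);
			rq->overloaded = true;
		}
	} else if (rq->overloaded) {
		/* 1 -> 0: the order really doesn't matter here. */
		atomic_fetch_sub(&rq->rd->rto_count, 1);
		rq->rd->rto_mask &= ~(1UL << rq->cpu);
		rq->overloaded = false;
	}
}

int main(void)
{
	struct rd_model rd = { .rto_count = 0, .rto_mask = 0 };
	struct rq_model rq = { .rd = &rd, .cpu = 1,
			       .rt_nr_running = 2, .rt_nr_migratory = 1 };

	update_rt_migration_model(&rq);	/* transitions to overloaded */
	update_rt_migration_model(&rq);	/* no-op: state unchanged */
	printf("rto_count=%d rto_mask=%#lx\n",
	       atomic_load(&rd.rto_count), rd.rto_mask);
	return 0;
}

With the guard in place, a runqueue that stays overloaded across repeated updates no longer touches the shared root-domain state on every change in task count, which is the point of the subject line above.
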
@@ -4,20 +4,15 @@
  */
 
 #ifdef CONFIG_SMP
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-static inline int rt_overloaded(void)
-{
-       return atomic_read(&rto_count);
-}
-static inline cpumask_t *rt_overload(void)
+
+static inline int rt_overloaded(struct rq *rq)
 {
-       return &rt_overload_mask;
+       return atomic_read(&rq->rd->rto_count);
 }
+
 static inline void rt_set_overload(struct rq *rq)
 {
-       rq->rt.overloaded = 1;
-       cpu_set(rq->cpu, rt_overload_mask);
+       cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
@@ -26,22 +21,27 @@ static inline void rt_set_overload(struct rq *rq)
         * updated yet.
         */
        wmb();
-       atomic_inc(&rto_count);
+       atomic_inc(&rq->rd->rto_count);
 }
+
 static inline void rt_clear_overload(struct rq *rq)
 {
        /* the order here really doesn't matter */
-       atomic_dec(&rto_count);
-       cpu_clear(rq->cpu, rt_overload_mask);
-       rq->rt.overloaded = 0;
+       atomic_dec(&rq->rd->rto_count);
+       cpu_clear(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
 {
-       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
-               rt_set_overload(rq);
-       else
+       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+               if (!rq->rt.overloaded) {
+                       rt_set_overload(rq);
+                       rq->rt.overloaded = 1;
+               }
+       } else if (rq->rt.overloaded) {
                rt_clear_overload(rq);
+               rq->rt.overloaded = 0;
+       }
 }
 #endif /* CONFIG_SMP */
 
@@ -176,7 +176,8 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
         * that is just being woken and probably will have
         * cold cache anyway.
         */
-       if (unlikely(rt_task(rq->curr))) {
+       if (unlikely(rt_task(rq->curr)) &&
+           (p->nr_cpus_allowed > 1)) {
                int cpu = find_lowest_rq(p);
 
                return (cpu == -1) ? task_cpu(p) : cpu;
@@ -241,16 +242,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 }
 
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
-                                                    int cpu)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 {
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;
 
-       assert_spin_locked(&rq->lock);
-
        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;
 
@@ -269,7 +267,8 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 
        if (queue->next->next != queue) {
                /* same prio task */
-               next = list_entry(queue->next->next, struct task_struct, run_list);
+               next = list_entry(queue->next->next, struct task_struct,
+                                 run_list);
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }
@@ -295,29 +294,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 }
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
 static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
-       int       cpu;
-       cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
        int       lowest_prio = -1;
-       int       ret         = 0;
+       int       lowest_cpu  = -1;
+       int       count       = 0;
+       int       cpu;
 
-       cpus_clear(*lowest_mask);
-       cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
+       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 
        /*
         * Scan each rq for the lowest prio.
         */
-       for_each_cpu_mask(cpu, *valid_mask) {
+       for_each_cpu_mask(cpu, *lowest_mask) {
                struct rq *rq = cpu_rq(cpu);
 
                /* We look for lowest RT prio or non-rt CPU */
                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                       if (ret)
+                       /*
+                        * if we already found a low RT queue
+                        * and now we found this non-rt queue
+                        * clear the mask and set our bit.
+                        * Otherwise just return the queue as is
+                        * and the count==1 will cause the algorithm
+                        * to use the first bit found.
+                        */
+                       if (lowest_cpu != -1) {
                                cpus_clear(*lowest_mask);
-                       cpu_set(rq->cpu, *lowest_mask);
+                               cpu_set(rq->cpu, *lowest_mask);
+                       }
                        return 1;
                }
 
@@ -327,14 +333,33 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
                        if (rq->rt.highest_prio > lowest_prio) {
                                /* new low - clear old data */
                                lowest_prio = rq->rt.highest_prio;
-                               cpus_clear(*lowest_mask);
+                               lowest_cpu = cpu;
+                               count = 0;
                        }
-                       cpu_set(rq->cpu, *lowest_mask);
-                       ret = 1;
+                       count++;
+               } else
+                       cpu_clear(cpu, *lowest_mask);
+       }
+
+       /*
+        * Clear out all the set bits that represent
+        * runqueues that were of higher prio than
+        * the lowest_prio.
+        */
+       if (lowest_cpu > 0) {
+               /*
+                * Perhaps we could add another cpumask op to
+                * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+                * Then that could be optimized to use memset and such.
+                */
+               for_each_cpu_mask(cpu, *lowest_mask) {
+                       if (cpu >= lowest_cpu)
+                               break;
+                       cpu_clear(cpu, *lowest_mask);
                }
        }
 
-       return ret;
+       return count;
 }
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
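
The reworked find_lowest_cpus() above now returns how many candidate CPUs share the lowest priority instead of a plain boolean, remembers the first such CPU in lowest_cpu, and prunes every earlier bit afterwards. Below is a stand-alone userspace C sketch of the same two-pass search over a plain 32-bit mask; MAX_RT_PRIO, the per-CPU priority array and all names are illustrative, and lower numbers mean higher priority, as in the kernel.

#include <stdint.h>
#include <stdio.h>

#define MAX_RT_PRIO	100	/* illustrative stand-in for the kernel constant */
#define NR_CPUS		8

/*
 * Pass one tracks the lowest priority seen, the first CPU that had it and
 * how many CPUs share it, clearing CPUs that are no better than the task;
 * pass two strips the bits scanned before that first CPU.  Returns the count.
 */
static int find_lowest_cpus_model(const int highest_prio[NR_CPUS],
				  int task_prio, uint32_t *lowest_mask)
{
	int lowest_prio = -1, lowest_cpu = -1, count = 0, cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(*lowest_mask & (1u << cpu)))
			continue;

		/* A CPU running no RT task at all is always good enough. */
		if (highest_prio[cpu] >= MAX_RT_PRIO) {
			if (lowest_cpu != -1)
				*lowest_mask = 1u << cpu; /* drop earlier RT candidates */
			return 1;
		}

		if (highest_prio[cpu] > task_prio &&
		    highest_prio[cpu] >= lowest_prio) {
			if (highest_prio[cpu] > lowest_prio) {
				/* new low - restart the count here */
				lowest_prio = highest_prio[cpu];
				lowest_cpu = cpu;
				count = 0;
			}
			count++;
		} else {
			*lowest_mask &= ~(1u << cpu);
		}
	}

	/* As in the diff: nothing to prune when the first candidate is CPU 0. */
	if (lowest_cpu > 0)
		for (cpu = 0; cpu < lowest_cpu; cpu++)
			*lowest_mask &= ~(1u << cpu);

	return count;
}

int main(void)
{
	/* lower number == higher RT priority */
	int prio[NR_CPUS] = { 10, 50, 50, 5, 90, 50, 20, 3 };
	uint32_t mask = 0xff;	/* all eight CPUs online and allowed */
	int count = find_lowest_cpus_model(prio, 40, &mask);

	printf("count=%d mask=%#x\n", count, mask);	/* count=1 mask=0x10 */
	return 0;
}

The next hunk shows how find_lowest_rq() consumes the return value: zero means no target, one means first_cpu(*lowest_mask) with no further search, and only a genuine tie falls through to the cache-affinity selection.
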
@@ -358,9 +383,17 @@ static int find_lowest_rq(struct task_struct *task)
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
+       int count    = find_lowest_cpus(task, lowest_mask);
+
+       if (!count)
+               return -1; /* No targets found */
 
-       if (!find_lowest_cpus(task, lowest_mask))
-               return -1;
+       /*
+        * There is no sense in performing an optimal search if only one
+        * target is found.
+        */
+       if (count == 1)
+               return first_cpu(*lowest_mask);
 
        /*
         * At this point we have built a mask of cpus representing the
@@ -403,12 +436,11 @@ static int find_lowest_rq(struct task_struct *task)
 }
 
 /* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
-                                     struct rq *rq)
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 {
        struct rq *lowest_rq = NULL;
-       int cpu;
        int tries;
+       int cpu;
 
        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                cpu = find_lowest_rq(task);
@@ -427,9 +459,11 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
-                                    !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+                                    !cpu_isset(lowest_rq->cpu,
+                                               task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
+
                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
@@ -460,8 +494,6 @@ static int push_rt_task(struct rq *rq)
        int ret = 0;
        int paranoid = RT_MAX_TRIES;
 
-       assert_spin_locked(&rq->lock);
-
        if (!rq->rt.overloaded)
                return 0;
 
@@ -506,8 +538,6 @@ static int push_rt_task(struct rq *rq)
                goto out;
        }
 
-       assert_spin_locked(&lowest_rq->lock);
-
        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);
@@ -542,30 +572,16 @@ static void push_rt_tasks(struct rq *rq)
 
 static int pull_rt_task(struct rq *this_rq)
 {
-       struct task_struct *next;
-       struct task_struct *p;
+       int this_cpu = this_rq->cpu, ret = 0, cpu;
+       struct task_struct *p, *next;
        struct rq *src_rq;
-       cpumask_t *rto_cpumask;
-       int this_cpu = this_rq->cpu;
-       int cpu;
-       int ret = 0;
 
-       assert_spin_locked(&this_rq->lock);
-
-       /*
-        * If cpusets are used, and we have overlapping
-        * run queue cpusets, then this algorithm may not catch all.
-        * This is just the price you pay on trying to keep
-        * dirtying caches down on large SMP machines.
-        */
-       if (likely(!rt_overloaded()))
+       if (likely(!rt_overloaded(this_rq)))
                return 0;
 
        next = pick_next_task_rt(this_rq);
 
-       rto_cpumask = rt_overload();
-
-       for_each_cpu_mask(cpu, *rto_cpumask) {
+       for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
 
@@ -579,23 +595,25 @@ static int pull_rt_task(struct rq *this_rq)
                        if (double_lock_balance(this_rq, src_rq)) {
                                /* unlocked our runqueue lock */
                                struct task_struct *old_next = next;
+
                                next = pick_next_task_rt(this_rq);
                                if (next != old_next)
                                        ret = 1;
                        }
-                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                       if (likely(src_rq->rt.rt_nr_running <= 1)) {
                                /*
                                 * Small chance that this_rq->curr changed
                                 * but it's really harmless here.
                                 */
                                rt_clear_overload(this_rq);
-                       else
+                       } else {
                                /*
                                 * Heh, the src_rq is now overloaded, since
                                 * we already have the src_rq lock, go straight
                                 * to pulling tasks from it.
                                 */
                                goto try_pulling;
+                       }
                        spin_unlock(&src_rq->lock);
                        continue;
                }
@@ -609,6 +627,7 @@ static int pull_rt_task(struct rq *this_rq)
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;
+
                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
@@ -645,7 +664,7 @@ static int pull_rt_task(struct rq *this_rq)
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
-                               goto bail;
+                               goto out;
 
                        ret = 1;
 
@@ -657,9 +676,7 @@ static int pull_rt_task(struct rq *this_rq)
                         * case there's an even higher prio task
                         * in another runqueue. (low likelyhood
                         * but possible)
-                        */
-
-                       /*
+                        *
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
@@ -667,23 +684,21 @@ static int pull_rt_task(struct rq *this_rq)
                        next = p;
 
                }
-bail:
+out:
                spin_unlock(&src_rq->lock);
        }
 
        return ret;
 }
 
-static void schedule_balance_rt(struct rq *rq,
-                               struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
        /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) &&
-           rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
 }
 
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
 {
        /*
         * If we have more than one rt_task queued, then
@@ -700,10 +715,9 @@ static void schedule_tail_balance_rt(struct rq *rq)
 }
 
 
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
-       if (unlikely(rt_task(p)) &&
-           !task_running(rq, p) &&
+       if (!task_running(rq, p) &&
            (p->prio >= rq->rt.highest_prio) &&
            rq->rt.overloaded)
                push_rt_tasks(rq);
@@ -726,6 +740,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        /* don't touch RT tasks */
        return 0;
 }
+
 static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 {
        int weight = cpus_weight(*new_mask);
@@ -739,9 +754,9 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
        if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);
 
-               if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+               if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
-               else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+               } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
                        BUG_ON(!rq->rt.rt_nr_migratory);
                        rq->rt.rt_nr_migratory--;
                }
@@ -752,12 +767,107 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
        p->cpus_allowed    = *new_mask;
        p->nr_cpus_allowed = weight;
 }
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq)  do { } while (0)
-# define schedule_balance_rt(rq, prev) do { } while (0)
-# define wakeup_balance_rt(rq, p)      do { } while (0)
+
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       /*
+        * If there are other RT tasks then we will reschedule
+        * and the scheduling of the other RT tasks will handle
+        * the balancing. But if we are the last RT task
+        * we may need to handle the pulling of RT tasks
+        * now.
+        */
+       if (!rq->rt.rt_nr_running)
+               pull_rt_task(rq);
+}
 #endif /* CONFIG_SMP */
 
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       int check_resched = 1;
+
+       /*
+        * If we are already running, then there's nothing
+        * that needs to be done. But if we are not running
+        * we may need to preempt the current running task.
+        * If that current running task is also an RT task
+        * then see if we can move to another run queue.
+        */
+       if (!running) {
+#ifdef CONFIG_SMP
+               if (rq->rt.overloaded && push_rt_task(rq) &&
+                   /* Don't resched if we changed runqueues */
+                   rq != task_rq(p))
+                       check_resched = 0;
+#endif /* CONFIG_SMP */
+               if (check_resched && p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+                           int oldprio, int running)
+{
+       if (running) {
+#ifdef CONFIG_SMP
+               /*
+                * If our priority decreases while running, we
+                * may need to pull tasks to this runqueue.
+                */
+               if (oldprio < p->prio)
+                       pull_rt_task(rq);
+               /*
+                * If there's a higher priority task waiting to run
+                * then reschedule.
+                */
+               if (p->prio > rq->rt.highest_prio)
+                       resched_task(p);
+#else
+               /* For UP simply resched on drop of prio */
+               if (oldprio < p->prio)
+                       resched_task(p);
+#endif /* CONFIG_SMP */
+       } else {
+               /*
+                * This task is not running, but if it is
+                * greater than the current running task
+                * then reschedule.
+                */
+               if (p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
        update_curr_rt(rq);
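
The new prio_changed_rt() above reduces, on the SMP side, to a small decision table: a running RT task whose priority dropped looks for tasks to pull, a running task now outranked by something queued on its own rq reschedules itself, and a non-running task that now outranks rq->curr forces a preemption. Here is a stand-alone userspace C model of just that table; the flag names and the reading of rq_highest_prio as "best priority queued on this rq" are illustrative, and lower numbers mean higher priority.

#include <stdbool.h>
#include <stdio.h>

/* Reactions to a priority change; more than one can apply, hence flags. */
#define RT_ACT_PULL		0x1	/* look for RT tasks to pull here */
#define RT_ACT_RESCHED_SELF	0x2	/* the changed task should yield */
#define RT_ACT_RESCHED_CURR	0x4	/* preempt the currently running task */

static unsigned int prio_changed_rt_model(bool running, int oldprio,
					  int newprio, int rq_highest_prio,
					  int curr_prio)
{
	unsigned int act = 0;

	if (running) {
		if (oldprio < newprio)		/* priority dropped: maybe pull */
			act |= RT_ACT_PULL;
		if (newprio > rq_highest_prio)	/* a queued task now outranks us */
			act |= RT_ACT_RESCHED_SELF;
	} else if (newprio < curr_prio) {	/* we now outrank the running task */
		act |= RT_ACT_RESCHED_CURR;
	}

	return act;
}

int main(void)
{
	/* A running task lowered from prio 10 to 60 with a prio-40 task queued. */
	unsigned int act = prio_changed_rt_model(true, 10, 60, 40, 10);

	printf("pull=%d resched_self=%d resched_curr=%d\n",
	       !!(act & RT_ACT_PULL),
	       !!(act & RT_ACT_RESCHED_SELF),
	       !!(act & RT_ACT_RESCHED_CURR));
	return 0;
}

switched_to_rt() above follows the same shape for a task newly switched to RT: try to push it away first when the rq is overloaded, and only skip the resched check when the push actually moved it to another runqueue.
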
@@ -809,8 +919,17 @@ const struct sched_class rt_sched_class = {
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
+       .join_domain            = join_domain_rt,
+       .leave_domain           = leave_domain_rt,
+       .pre_schedule           = pre_schedule_rt,
+       .post_schedule          = post_schedule_rt,
+       .task_wake_up           = task_wake_up_rt,
+       .switched_from          = switched_from_rt,
 #endif
 
        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
+
+       .prio_changed           = prio_changed_rt,
+       .switched_to            = switched_to_rt,
 };
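
For reference, the hooks added to rt_sched_class above all use the plain designated-initializer callback-table idiom. A toy, compilable illustration of that idiom follows; none of these types or names are the kernel's.

#include <stdio.h>

/* Toy callback table in the same designated-initializer style. */
struct toy_class {
	void (*pre_schedule)(int cpu);
	void (*post_schedule)(int cpu);
	void (*prio_changed)(int cpu, int oldprio, int newprio);
};

static void toy_pre_schedule(int cpu)  { printf("pre_schedule on cpu%d\n", cpu); }
static void toy_post_schedule(int cpu) { printf("post_schedule on cpu%d\n", cpu); }
static void toy_prio_changed(int cpu, int oldprio, int newprio)
{
	printf("cpu%d: prio %d -> %d\n", cpu, oldprio, newprio);
}

static const struct toy_class toy_sched_class = {
	.pre_schedule	= toy_pre_schedule,
	.post_schedule	= toy_post_schedule,
	.prio_changed	= toy_prio_changed,
};

int main(void)
{
	toy_sched_class.pre_schedule(0);
	toy_sched_class.prio_changed(0, 10, 60);
	toy_sched_class.post_schedule(0);
	return 0;
}

The core scheduler invokes whichever hooks a class fills in at the matching points (domain attach/detach, around a schedule, on wakeup, and on class or priority changes), which is how the RT push/pull balancing in the hunks above gets driven.
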