sched: RT-balance, only adjust overload state when changing

[linux-2.6-omap-h63xx.git] / kernel / sched_rt.c
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index 5de1aebdbd1b1f132f90594a49d82ea19bf07e57..a386758ffebb14c701a3cca37cdb9962162c71c9 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -4,19 +4,15 @@
   */
  
  #ifdef CONFIG_SMP
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-static inline int rt_overloaded(void)
-{
-       return atomic_read(&rto_count);
-}
-static inline cpumask_t *rt_overload(void)
+
+static inline int rt_overloaded(struct rq *rq)
  {
-       return &rt_overload_mask;
+       return atomic_read(&rq->rd->rto_count);
  }
+
  static inline void rt_set_overload(struct rq *rq)
  {
-       cpu_set(rq->cpu, rt_overload_mask);
+       cpu_set(rq->cpu, rq->rd->rto_mask);
         /*
          * Make sure the mask is visible before we set
          * the overload count. That is checked to determine
@@ -25,21 +21,27 @@ static inline void rt_set_overload(struct rq *rq)
          * updated yet.
          */
         wmb();
-       atomic_inc(&rto_count);
+       atomic_inc(&rq->rd->rto_count);
  }
+
  static inline void rt_clear_overload(struct rq *rq)
  {
         /* the order here really doesn't matter */
-       atomic_dec(&rto_count);
-       cpu_clear(rq->cpu, rt_overload_mask);
+       atomic_dec(&rq->rd->rto_count);
+       cpu_clear(rq->cpu, rq->rd->rto_mask);
  }
  
  static void update_rt_migration(struct rq *rq)
  {
-       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
-               rt_set_overload(rq);
-       else
+       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+               if (!rq->rt.overloaded) {
+                       rt_set_overload(rq);
+                       rq->rt.overloaded = 1;
+               }
+       } else if (rq->rt.overloaded) {
                 rt_clear_overload(rq);
+               rq->rt.overloaded = 0;
+       }
  }
  #endif /* CONFIG_SMP */
  
@@ -151,8 +153,40 @@ yield_task_rt(struct rq *rq)
  }
  
  #ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
  static int select_task_rq_rt(struct task_struct *p, int sync)
  {
+       struct rq *rq = task_rq(p);
+
+       /*
+        * If the current task is an RT task, then
+        * try to see if we can wake this RT task up on another
+        * runqueue. Otherwise simply start this RT task
+        * on its current runqueue.
+        *
+        * We want to avoid overloading runqueues. Even if
+        * the RT task is of higher priority than the current RT task.
+        * RT tasks behave differently than other tasks. If
+        * one gets preempted, we try to push it off to another queue.
+        * So trying to keep a preempting RT task on the same
+        * cache hot CPU will force the running RT task to
+        * a cold CPU. So we waste all the cache for the lower
+        * RT task in hopes of saving some of a RT task
+        * that is just being woken and probably will have
+        * cold cache anyway.
+        */
+       if (unlikely(rt_task(rq->curr)) &&
+           (p->nr_cpus_allowed > 1)) {
+               int cpu = find_lowest_rq(p);
+
+               return (cpu == -1) ? task_cpu(p) : cpu;
+       }
+
+       /*
+        * Otherwise, just let it ride on the affined RQ and the
+        * post-schedule router will push the preempted task away
+        */
         return task_cpu(p);
  }
  #endif /* CONFIG_SMP */
@@ -208,16 +242,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  }
  
  /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
-                                                    int cpu)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
  {
         struct rt_prio_array *array = &rq->rt.active;
         struct task_struct *next;
         struct list_head *queue;
         int idx;
  
-       assert_spin_locked(&rq->lock);
-
         if (likely(rq->rt.rt_nr_running < 2))
                 return NULL;
  
@@ -236,7 +267,8 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
  
         if (queue->next->next != queue) {
                 /* same prio task */
-               next = list_entry(queue->next->next, struct task_struct, run_list);
+               next = list_entry(queue->next->next, struct task_struct,
+                                 run_list);
                 if (pick_rt_task(rq, next, cpu))
                         goto out;
         }
@@ -263,55 +295,175 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
  
  static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
  
-/* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
-                                     struct rq *this_rq)
+static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
  {
-       struct rq *lowest_rq = NULL;
-       int cpu;
-       int tries;
-       cpumask_t *cpu_mask = &__get_cpu_var(local_cpu_mask);
+       int       lowest_prio = -1;
+       int       lowest_cpu  = -1;
+       int       count       = 0;
+       int       cpu;
  
-       cpus_and(*cpu_mask, cpu_online_map, task->cpus_allowed);
+       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
  
-       for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+       /*
+        * Scan each rq for the lowest prio.
+        */
+       for_each_cpu_mask(cpu, *lowest_mask) {
+               struct rq *rq = cpu_rq(cpu);
+
+               /* We look for lowest RT prio or non-rt CPU */
+               if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+                       /*
+                        * if we already found a low RT queue
+                        * and now we found this non-rt queue
+                        * clear the mask and set our bit.
+                        * Otherwise just return the queue as is
+                        * and the count==1 will cause the algorithm
+                        * to use the first bit found.
+                        */
+                       if (lowest_cpu != -1) {
+                               cpus_clear(*lowest_mask);
+                               cpu_set(rq->cpu, *lowest_mask);
+                       }
+                       return 1;
+               }
+
+               /* no locking for now */
+               if ((rq->rt.highest_prio > task->prio)
+                   && (rq->rt.highest_prio >= lowest_prio)) {
+                       if (rq->rt.highest_prio > lowest_prio) {
+                               /* new low - clear old data */
+                               lowest_prio = rq->rt.highest_prio;
+                               lowest_cpu = cpu;
+                               count = 0;
+                       }
+                       count++;
+               } else
+                       cpu_clear(cpu, *lowest_mask);
+       }
+
+       /*
+        * Clear out all the set bits that represent
+        * runqueues that were of higher prio than
+        * the lowest_prio.
+        */
+       if (lowest_cpu > 0) {
                 /*
-                * Scan each rq for the lowest prio.
+                * Perhaps we could add another cpumask op to
+                * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+                * Then that could be optimized to use memset and such.
                  */
-               for_each_cpu_mask(cpu, *cpu_mask) {
-                       struct rq *rq = &per_cpu(runqueues, cpu);
+               for_each_cpu_mask(cpu, *lowest_mask) {
+                       if (cpu >= lowest_cpu)
+                               break;
+                       cpu_clear(cpu, *lowest_mask);
+               }
+       }
  
-                       if (cpu == this_rq->cpu)
-                               continue;
+       return count;
+}
  
-                       /* We look for lowest RT prio or non-rt CPU */
-                       if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                               lowest_rq = rq;
-                               break;
-                       }
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+{
+       int first;
  
-                       /* no locking for now */
-                       if (rq->rt.highest_prio > task->prio &&
-                           (!lowest_rq || rq->rt.highest_prio > lowest_rq->rt.highest_prio)) {
-                               lowest_rq = rq;
-                       }
+       /* "this_cpu" is cheaper to preempt than a remote processor */
+       if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+               return this_cpu;
+
+       first = first_cpu(*mask);
+       if (first != NR_CPUS)
+               return first;
+
+       return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+       struct sched_domain *sd;
+       cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+       int this_cpu = smp_processor_id();
+       int cpu      = task_cpu(task);
+       int count    = find_lowest_cpus(task, lowest_mask);
+
+       if (!count)
+               return -1; /* No targets found */
+
+       /*
+        * There is no sense in performing an optimal search if only one
+        * target is found.
+        */
+       if (count == 1)
+               return first_cpu(*lowest_mask);
+
+       /*
+        * At this point we have built a mask of cpus representing the
+        * lowest priority tasks in the system.  Now we want to elect
+        * the best one based on our affinity and topology.
+        *
+        * We prioritize the last cpu that the task executed on since
+        * it is most likely cache-hot in that location.
+        */
+       if (cpu_isset(cpu, *lowest_mask))
+               return cpu;
+
+       /*
+        * Otherwise, we consult the sched_domains span maps to figure
+        * out which cpu is logically closest to our hot cache data.
+        */
+       if (this_cpu == cpu)
+               this_cpu = -1; /* Skip this_cpu opt if the same */
+
+       for_each_domain(cpu, sd) {
+               if (sd->flags & SD_WAKE_AFFINE) {
+                       cpumask_t domain_mask;
+                       int       best_cpu;
+
+                       cpus_and(domain_mask, sd->span, *lowest_mask);
+
+                       best_cpu = pick_optimal_cpu(this_cpu,
+                                                   &domain_mask);
+                       if (best_cpu != -1)
+                               return best_cpu;
                 }
+       }
+
+       /*
+        * And finally, if there were no matches within the domains
+        * just give the caller *something* to work with from the compatible
+        * locations.
+        */
+       return pick_optimal_cpu(this_cpu, lowest_mask);
+}
+
+/* Will lock the rq it finds */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+       struct rq *lowest_rq = NULL;
+       int tries;
+       int cpu;
  
-               if (!lowest_rq)
+       for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+               cpu = find_lowest_rq(task);
+
+               if ((cpu == -1) || (cpu == rq->cpu))
                         break;
  
+               lowest_rq = cpu_rq(cpu);
+
                 /* if the prio of this runqueue changed, try again */
-               if (double_lock_balance(this_rq, lowest_rq)) {
+               if (double_lock_balance(rq, lowest_rq)) {
                         /*
                          * We had to unlock the run queue. In
                          * the mean time, task could have
                          * migrated already or had its affinity changed.
                          * Also make sure that it wasn't scheduled on its rq.
                          */
-                       if (unlikely(task_rq(task) != this_rq ||
-                                    !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
-                                    task_running(this_rq, task) ||
+                       if (unlikely(task_rq(task) != rq ||
+                                    !cpu_isset(lowest_rq->cpu,
+                                               task->cpus_allowed) ||
+                                    task_running(rq, task) ||
                                      !task->se.on_rq)) {
+
                                 spin_unlock(&lowest_rq->lock);
                                 lowest_rq = NULL;
                                 break;
@@ -342,7 +494,8 @@ static int push_rt_task(struct rq *rq)
         int ret = 0;
         int paranoid = RT_MAX_TRIES;
  
-       assert_spin_locked(&rq->lock);
+       if (!rq->rt.overloaded)
+               return 0;
  
         next_task = pick_next_highest_task_rt(rq, -1);
         if (!next_task)
@@ -385,8 +538,6 @@ static int push_rt_task(struct rq *rq)
                 goto out;
         }
  
-       assert_spin_locked(&lowest_rq->lock);
-
         deactivate_task(rq, next_task, 0);
         set_task_cpu(next_task, lowest_rq->cpu);
         activate_task(lowest_rq, next_task, 0);
@@ -421,30 +572,16 @@ static void push_rt_tasks(struct rq *rq)
  
  static int pull_rt_task(struct rq *this_rq)
  {
-       struct task_struct *next;
-       struct task_struct *p;
+       int this_cpu = this_rq->cpu, ret = 0, cpu;
+       struct task_struct *p, *next;
         struct rq *src_rq;
-       cpumask_t *rto_cpumask;
-       int this_cpu = this_rq->cpu;
-       int cpu;
-       int ret = 0;
-
-       assert_spin_locked(&this_rq->lock);
  
-       /*
-        * If cpusets are used, and we have overlapping
-        * run queue cpusets, then this algorithm may not catch all.
-        * This is just the price you pay on trying to keep
-        * dirtying caches down on large SMP machines.
-        */
-       if (likely(!rt_overloaded()))
+       if (likely(!rt_overloaded(this_rq)))
                 return 0;
  
         next = pick_next_task_rt(this_rq);
  
-       rto_cpumask = rt_overload();
-
-       for_each_cpu_mask(cpu, *rto_cpumask) {
+       for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                 if (this_cpu == cpu)
                         continue;
  
@@ -458,23 +595,25 @@ static int pull_rt_task(struct rq *this_rq)
                         if (double_lock_balance(this_rq, src_rq)) {
                                 /* unlocked our runqueue lock */
                                 struct task_struct *old_next = next;
+
                                 next = pick_next_task_rt(this_rq);
                                 if (next != old_next)
                                         ret = 1;
                         }
-                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                       if (likely(src_rq->rt.rt_nr_running <= 1)) {
                                 /*
                                  * Small chance that this_rq->curr changed
                                  * but it's really harmless here.
                                  */
                                 rt_clear_overload(this_rq);
-                       else
+                       } else {
                                 /*
                                  * Heh, the src_rq is now overloaded, since
                                  * we already have the src_rq lock, go straight
                                  * to pulling tasks from it.
                                  */
                                 goto try_pulling;
+                       }
                         spin_unlock(&src_rq->lock);
                         continue;
                 }
@@ -488,6 +627,7 @@ static int pull_rt_task(struct rq *this_rq)
                  */
                 if (double_lock_balance(this_rq, src_rq)) {
                         struct task_struct *old_next = next;
+
                         next = pick_next_task_rt(this_rq);
                         if (next != old_next)
                                 ret = 1;
@@ -524,7 +664,7 @@ static int pull_rt_task(struct rq *this_rq)
                          */
                         if (p->prio < src_rq->curr->prio ||
                             (next && next->prio < src_rq->curr->prio))
-                               goto bail;
+                               goto out;
  
                         ret = 1;
  
@@ -536,9 +676,7 @@ static int pull_rt_task(struct rq *this_rq)
                          * case there's an even higher prio task
                          * in another runqueue. (low likelyhood
                          * but possible)
-                        */
-
-                       /*
+                        *
                          * Update next so that we won't pick a task
                          * on another cpu with a priority lower (or equal)
                          * than the one we just picked.
@@ -546,23 +684,21 @@ static int pull_rt_task(struct rq *this_rq)
                         next = p;
  
                 }
- bail:
+ out:
                 spin_unlock(&src_rq->lock);
         }
  
         return ret;
  }
  
-static void schedule_balance_rt(struct rq *rq,
-                               struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  {
         /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) &&
-           rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                 pull_rt_task(rq);
  }
  
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
  {
         /*
          * If we have more than one rt_task queued, then
@@ -571,7 +707,7 @@ static void schedule_tail_balance_rt(struct rq *rq)
          * the lock was owned by prev, we need to release it
          * first via finish_lock_switch and then reaquire it here.
          */
-       if (unlikely(rq->rt.rt_nr_running > 1)) {
+       if (unlikely(rq->rt.overloaded)) {
                 spin_lock_irq(&rq->lock);
                 push_rt_tasks(rq);
                 spin_unlock_irq(&rq->lock);
@@ -579,11 +715,11 @@ static void schedule_tail_balance_rt(struct rq *rq)
  }
  
  
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
  {
-       if (unlikely(rt_task(p)) &&
-           !task_running(rq, p) &&
-           (p->prio >= rq->curr->prio))
+       if (!task_running(rq, p) &&
+           (p->prio >= rq->rt.highest_prio) &&
+           rq->rt.overloaded)
                 push_rt_tasks(rq);
  }
  
@@ -604,6 +740,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
         /* don't touch RT tasks */
         return 0;
  }
+
  static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
  {
         int weight = cpus_weight(*new_mask);
@@ -617,9 +754,9 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
         if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
                 struct rq *rq = task_rq(p);
  
-               if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+               if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
                         rq->rt.rt_nr_migratory++;
-               else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+               } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
                         BUG_ON(!rq->rt.rt_nr_migratory);
                         rq->rt.rt_nr_migratory--;
                 }
@@ -630,11 +767,106 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
         p->cpus_allowed    = *new_mask;
         p->nr_cpus_allowed = weight;
  }
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq)  do { } while (0)
-# define schedule_balance_rt(rq, prev) do { } while (0)
-# define wakeup_balance_rt(rq, p)      do { } while (0)
+
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       /*
+        * If there are other RT tasks then we will reschedule
+        * and the scheduling of the other RT tasks will handle
+        * the balancing. But if we are the last RT task
+        * we may need to handle the pulling of RT tasks
+        * now.
+        */
+       if (!rq->rt.rt_nr_running)
+               pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       int check_resched = 1;
+
+       /*
+        * If we are already running, then there's nothing
+        * that needs to be done. But if we are not running
+        * we may need to preempt the current running task.
+        * If that current running task is also an RT task
+        * then see if we can move to another run queue.
+        */
+       if (!running) {
+#ifdef CONFIG_SMP
+               if (rq->rt.overloaded && push_rt_task(rq) &&
+                   /* Don't resched if we changed runqueues */
+                   rq != task_rq(p))
+                       check_resched = 0;
+#endif /* CONFIG_SMP */
+               if (check_resched && p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+                           int oldprio, int running)
+{
+       if (running) {
+#ifdef CONFIG_SMP
+               /*
+                * If our priority decreases while running, we
+                * may need to pull tasks to this runqueue.
+                */
+               if (oldprio < p->prio)
+                       pull_rt_task(rq);
+               /*
+                * If there's a higher priority task waiting to run
+                * then reschedule.
+                */
+               if (p->prio > rq->rt.highest_prio)
+                       resched_task(p);
+#else
+               /* For UP simply resched on drop of prio */
+               if (oldprio < p->prio)
+                       resched_task(p);
  #endif /* CONFIG_SMP */
+       } else {
+               /*
+                * This task is not running, but if it is
+                * greater than the current running task
+                * then reschedule.
+                */
+               if (p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
  
  static void task_tick_rt(struct rq *rq, struct task_struct *p)
  {
@@ -687,8 +919,17 @@ const struct sched_class rt_sched_class = {
         .load_balance           = load_balance_rt,
         .move_one_task          = move_one_task_rt,
         .set_cpus_allowed       = set_cpus_allowed_rt,
+       .join_domain            = join_domain_rt,
+       .leave_domain           = leave_domain_rt,
+       .pre_schedule           = pre_schedule_rt,
+       .post_schedule          = post_schedule_rt,
+       .task_wake_up           = task_wake_up_rt,
+       .switched_from          = switched_from_rt,
  #endif
  
         .set_curr_task          = set_curr_task_rt,
         .task_tick              = task_tick_rt,
+
+       .prio_changed           = prio_changed_rt,
+       .switched_to            = switched_to_rt,
  };