sched: RT-balance, only adjust overload state when changing
[linux-2.6-omap-h63xx.git] / kernel / sched_rt.c
index 9becc3710b609c6b6ae44ce9c413287503c80c49..a386758ffebb14c701a3cca37cdb9962162c71c9 100644
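
The first two hunks below move the RT overload bookkeeping off the global rt_overload_mask/rto_count pair and onto the per-root-domain rq->rd->rto_mask and rq->rd->rto_count, and update_rt_migration() now calls rt_set_overload()/rt_clear_overload() only when the runqueue's overload state actually changes, caching that state in rq->rt.overloaded. What follows is a stand-alone userspace C sketch of that transition-guarded update, not kernel code: the struct and field names are illustrative stand-ins for the ones in the diff, and the seq_cst atomic increment stands in for the kernel's cpu_set() + wmb() + atomic_inc() publication order.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the root-domain fields used in the diff. */
struct rd_model {
	atomic_int rto_count;		/* number of overloaded runqueues */
	unsigned long rto_mask;		/* one bit per overloaded CPU */
};

struct rq_model {
	struct rd_model *rd;
	int cpu;
	int rt_nr_running;		/* queued RT tasks */
	int rt_nr_migratory;		/* of those, how many may migrate */
	bool overloaded;		/* cached state; toggled only on transitions */
};

static void update_rt_migration_model(struct rq_model *rq)
{
	if (rq->rt_nr_migratory && rq->rt_nr_running > 1) {
		if (!rq->overloaded) {
			/* 0 -> 1: publish the mask bit before the count. */
			rq->rd->rto_mask |= 1UL << rq->cpu;
			atomic_fetch_add(&rq->rd->rto_count, 1);
			rq->overloaded = true;
		}
	} else if (rq->overloaded) {
		/* 1 -> 0: the order really doesn't matter here. */
		atomic_fetch_sub(&rq->rd->rto_count, 1);
		rq->rd->rto_mask &= ~(1UL << rq->cpu);
		rq->overloaded = false;
	}
}

int main(void)
{
	struct rd_model rd = { .rto_count = 0, .rto_mask = 0 };
	struct rq_model rq = { .rd = &rd, .cpu = 1,
			       .rt_nr_running = 2, .rt_nr_migratory = 1 };

	update_rt_migration_model(&rq);	/* transitions to overloaded */
	update_rt_migration_model(&rq);	/* no-op: state unchanged */
	printf("rto_count=%d rto_mask=%#lx\n",
	       atomic_load(&rd.rto_count), rd.rto_mask);
	return 0;
}

With the guard in place, a runqueue that stays overloaded across repeated updates no longer touches the shared root-domain state on every change in task count, which is the point of the subject line above.
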
@@ -4,20 +4,15 @@
  */
 
 #ifdef CONFIG_SMP
-static cpumask_t rt_overload_mask;
-static atomic_t rto_count;
-static inline int rt_overloaded(void)
-{
-       return atomic_read(&rto_count);
-}
-static inline cpumask_t *rt_overload(void)
+
+static inline int rt_overloaded(struct rq *rq)
 {
-       return &rt_overload_mask;
+       return atomic_read(&rq->rd->rto_count);
 }
+
 static inline void rt_set_overload(struct rq *rq)
 {
-       rq->rt.overloaded = 1;
-       cpu_set(rq->cpu, rt_overload_mask);
+       cpu_set(rq->cpu, rq->rd->rto_mask);
        /*
         * Make sure the mask is visible before we set
         * the overload count. That is checked to determine
@@ -26,22 +21,27 @@ static inline void rt_set_overload(struct rq *rq)
         * updated yet.
         */
        wmb();
-       atomic_inc(&rto_count);
+       atomic_inc(&rq->rd->rto_count);
 }
+
 static inline void rt_clear_overload(struct rq *rq)
 {
        /* the order here really doesn't matter */
-       atomic_dec(&rto_count);
-       cpu_clear(rq->cpu, rt_overload_mask);
-       rq->rt.overloaded = 0;
+       atomic_dec(&rq->rd->rto_count);
+       cpu_clear(rq->cpu, rq->rd->rto_mask);
 }
 
 static void update_rt_migration(struct rq *rq)
 {
-       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1))
-               rt_set_overload(rq);
-       else
+       if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+               if (!rq->rt.overloaded) {
+                       rt_set_overload(rq);
+                       rq->rt.overloaded = 1;
+               }
+       } else if (rq->rt.overloaded) {
                rt_clear_overload(rq);
+               rq->rt.overloaded = 0;
+       }
 }
 #endif /* CONFIG_SMP */
 
@@ -176,7 +176,8 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
         * that is just being woken and probably will have
         * cold cache anyway.
         */
-       if (unlikely(rt_task(rq->curr))) {
+       if (unlikely(rt_task(rq->curr)) &&
+           (p->nr_cpus_allowed > 1)) {
                int cpu = find_lowest_rq(p);
 
                return (cpu == -1) ? task_cpu(p) : cpu;
@@ -241,16 +242,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 }
 
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
-                                                    int cpu)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 {
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
        struct list_head *queue;
        int idx;
 
-       assert_spin_locked(&rq->lock);
-
        if (likely(rq->rt.rt_nr_running < 2))
                return NULL;
 
@@ -269,7 +267,8 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 
        if (queue->next->next != queue) {
                /* same prio task */
-               next = list_entry(queue->next->next, struct task_struct, run_list);
+               next = list_entry(queue->next->next, struct task_struct,
+                                 run_list);
                if (pick_rt_task(rq, next, cpu))
                        goto out;
        }
@@ -295,29 +294,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
 }
 
 static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
-static DEFINE_PER_CPU(cpumask_t, valid_cpu_mask);
 
 static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
 {
-       int       cpu;
-       cpumask_t *valid_mask = &__get_cpu_var(valid_cpu_mask);
        int       lowest_prio = -1;
-       int       ret         = 0;
+       int       lowest_cpu  = -1;
+       int       count       = 0;
+       int       cpu;
 
-       cpus_clear(*lowest_mask);
-       cpus_and(*valid_mask, cpu_online_map, task->cpus_allowed);
+       cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 
        /*
         * Scan each rq for the lowest prio.
         */
-       for_each_cpu_mask(cpu, *valid_mask) {
+       for_each_cpu_mask(cpu, *lowest_mask) {
                struct rq *rq = cpu_rq(cpu);
 
                /* We look for lowest RT prio or non-rt CPU */
                if (rq->rt.highest_prio >= MAX_RT_PRIO) {
-                       if (ret)
+                       /*
+                        * if we already found a low RT queue
+                        * and now we found this non-rt queue
+                        * clear the mask and set our bit.
+                        * Otherwise just return the queue as is
+                        * and the count==1 will cause the algorithm
+                        * to use the first bit found.
+                        */
+                       if (lowest_cpu != -1) {
                                cpus_clear(*lowest_mask);
-                       cpu_set(rq->cpu, *lowest_mask);
+                               cpu_set(rq->cpu, *lowest_mask);
+                       }
                        return 1;
                }
 
@@ -327,14 +333,33 @@ static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
                        if (rq->rt.highest_prio > lowest_prio) {
                                /* new low - clear old data */
                                lowest_prio = rq->rt.highest_prio;
-                               cpus_clear(*lowest_mask);
+                               lowest_cpu = cpu;
+                               count = 0;
                        }
-                       cpu_set(rq->cpu, *lowest_mask);
-                       ret = 1;
+                       count++;
+               } else
+                       cpu_clear(cpu, *lowest_mask);
+       }
+
+       /*
+        * Clear out all the set bits that represent
+        * runqueues that were of higher prio than
+        * the lowest_prio.
+        */
+       if (lowest_cpu > 0) {
+               /*
+                * Perhaps we could add another cpumask op to
+                * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+                * Then that could be optimized to use memset and such.
+                */
+               for_each_cpu_mask(cpu, *lowest_mask) {
+                       if (cpu >= lowest_cpu)
+                               break;
+                       cpu_clear(cpu, *lowest_mask);
                }
        }
 
-       return ret;
+       return count;
 }
 
 static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
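
The reworked find_lowest_cpus() above now returns how many candidate CPUs share the lowest priority instead of a plain boolean, remembers the first such CPU in lowest_cpu, and prunes every earlier bit afterwards. Below is a stand-alone userspace C sketch of the same two-pass search over a plain 32-bit mask; MAX_RT_PRIO, the per-CPU priority array and all names are illustrative, and lower numbers mean higher priority, as in the kernel.

#include <stdint.h>
#include <stdio.h>

#define MAX_RT_PRIO	100	/* illustrative stand-in for the kernel constant */
#define NR_CPUS		8

/*
 * Pass one tracks the lowest priority seen, the first CPU that had it and
 * how many CPUs share it, clearing CPUs that are no better than the task;
 * pass two strips the bits scanned before that first CPU.  Returns the count.
 */
static int find_lowest_cpus_model(const int highest_prio[NR_CPUS],
				  int task_prio, uint32_t *lowest_mask)
{
	int lowest_prio = -1, lowest_cpu = -1, count = 0, cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(*lowest_mask & (1u << cpu)))
			continue;

		/* A CPU running no RT task at all is always good enough. */
		if (highest_prio[cpu] >= MAX_RT_PRIO) {
			if (lowest_cpu != -1)
				*lowest_mask = 1u << cpu; /* drop earlier RT candidates */
			return 1;
		}

		if (highest_prio[cpu] > task_prio &&
		    highest_prio[cpu] >= lowest_prio) {
			if (highest_prio[cpu] > lowest_prio) {
				/* new low - restart the count here */
				lowest_prio = highest_prio[cpu];
				lowest_cpu = cpu;
				count = 0;
			}
			count++;
		} else {
			*lowest_mask &= ~(1u << cpu);
		}
	}

	/* As in the diff: nothing to prune when the first candidate is CPU 0. */
	if (lowest_cpu > 0)
		for (cpu = 0; cpu < lowest_cpu; cpu++)
			*lowest_mask &= ~(1u << cpu);

	return count;
}

int main(void)
{
	/* lower number == higher RT priority */
	int prio[NR_CPUS] = { 10, 50, 50, 5, 90, 50, 20, 3 };
	uint32_t mask = 0xff;	/* all eight CPUs online and allowed */
	int count = find_lowest_cpus_model(prio, 40, &mask);

	printf("count=%d mask=%#x\n", count, mask);	/* count=1 mask=0x10 */
	return 0;
}

The next hunk shows how find_lowest_rq() consumes the return value: zero means no target, one means first_cpu(*lowest_mask) with no further search, and only a genuine tie falls through to the cache-affinity selection.
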
@@ -358,9 +383,17 @@ static int find_lowest_rq(struct task_struct *task)
        cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);
+       int count    = find_lowest_cpus(task, lowest_mask);
+
+       if (!count)
+               return -1; /* No targets found */
 
-       if (!find_lowest_cpus(task, lowest_mask))
-               return -1;
+       /*
+        * There is no sense in performing an optimal search if only one
+        * target is found.
+        */
+       if (count == 1)
+               return first_cpu(*lowest_mask);
 
        /*
         * At this point we have built a mask of cpus representing the
@@ -403,12 +436,11 @@ static int find_lowest_rq(struct task_struct *task)
 }
 
 /* Will lock the rq it finds */
-static struct rq *find_lock_lowest_rq(struct task_struct *task,
-                                     struct rq *rq)
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 {
        struct rq *lowest_rq = NULL;
-       int cpu;
        int tries;
+       int cpu;
 
        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                cpu = find_lowest_rq(task);
@@ -427,9 +459,11 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
-                                    !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
+                                    !cpu_isset(lowest_rq->cpu,
+                                               task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !task->se.on_rq)) {
+
                                spin_unlock(&lowest_rq->lock);
                                lowest_rq = NULL;
                                break;
@@ -460,8 +494,6 @@ static int push_rt_task(struct rq *rq)
        int ret = 0;
        int paranoid = RT_MAX_TRIES;
 
-       assert_spin_locked(&rq->lock);
-
        if (!rq->rt.overloaded)
                return 0;
 
@@ -506,8 +538,6 @@ static int push_rt_task(struct rq *rq)
                goto out;
        }
 
-       assert_spin_locked(&lowest_rq->lock);
-
        deactivate_task(rq, next_task, 0);
        set_task_cpu(next_task, lowest_rq->cpu);
        activate_task(lowest_rq, next_task, 0);
@@ -542,30 +572,16 @@ static void push_rt_tasks(struct rq *rq)
 
 static int pull_rt_task(struct rq *this_rq)
 {
-       struct task_struct *next;
-       struct task_struct *p;
+       int this_cpu = this_rq->cpu, ret = 0, cpu;
+       struct task_struct *p, *next;
        struct rq *src_rq;
-       cpumask_t *rto_cpumask;
-       int this_cpu = this_rq->cpu;
-       int cpu;
-       int ret = 0;
 
-       assert_spin_locked(&this_rq->lock);
-
-       /*
-        * If cpusets are used, and we have overlapping
-        * run queue cpusets, then this algorithm may not catch all.
-        * This is just the price you pay on trying to keep
-        * dirtying caches down on large SMP machines.
-        */
-       if (likely(!rt_overloaded()))
+       if (likely(!rt_overloaded(this_rq)))
                return 0;
 
        next = pick_next_task_rt(this_rq);
 
-       rto_cpumask = rt_overload();
-
-       for_each_cpu_mask(cpu, *rto_cpumask) {
+       for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
                if (this_cpu == cpu)
                        continue;
 
@@ -579,23 +595,25 @@ static int pull_rt_task(struct rq *this_rq)
                        if (double_lock_balance(this_rq, src_rq)) {
                                /* unlocked our runqueue lock */
                                struct task_struct *old_next = next;
+
                                next = pick_next_task_rt(this_rq);
                                if (next != old_next)
                                        ret = 1;
                        }
-                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                       if (likely(src_rq->rt.rt_nr_running <= 1)) {
                                /*
                                 * Small chance that this_rq->curr changed
                                 * but it's really harmless here.
                                 */
                                rt_clear_overload(this_rq);
-                       else
+                       } else {
                                /*
                                 * Heh, the src_rq is now overloaded, since
                                 * we already have the src_rq lock, go straight
                                 * to pulling tasks from it.
                                 */
                                goto try_pulling;
+                       }
                        spin_unlock(&src_rq->lock);
                        continue;
                }
@@ -609,6 +627,7 @@ static int pull_rt_task(struct rq *this_rq)
                 */
                if (double_lock_balance(this_rq, src_rq)) {
                        struct task_struct *old_next = next;
+
                        next = pick_next_task_rt(this_rq);
                        if (next != old_next)
                                ret = 1;
@@ -645,7 +664,7 @@ static int pull_rt_task(struct rq *this_rq)
                         */
                        if (p->prio < src_rq->curr->prio ||
                            (next && next->prio < src_rq->curr->prio))
-                               goto bail;
+                               goto out;
 
                        ret = 1;
 
@@ -657,9 +676,7 @@ static int pull_rt_task(struct rq *this_rq)
                         * case there's an even higher prio task
                         * in another runqueue. (low likelyhood
                         * but possible)
-                        */
-
-                       /*
+                        *
                         * Update next so that we won't pick a task
                         * on another cpu with a priority lower (or equal)
                         * than the one we just picked.
@@ -667,23 +684,21 @@ static int pull_rt_task(struct rq *this_rq)
                        next = p;
 
                }
-bail:
+out:
                spin_unlock(&src_rq->lock);
        }
 
        return ret;
 }
 
-static void schedule_balance_rt(struct rq *rq,
-                               struct task_struct *prev)
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
        /* Try to pull RT tasks here if we lower this rq's prio */
-       if (unlikely(rt_task(prev)) &&
-           rq->rt.highest_prio > prev->prio)
+       if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
                pull_rt_task(rq);
 }
 
-static void schedule_tail_balance_rt(struct rq *rq)
+static void post_schedule_rt(struct rq *rq)
 {
        /*
         * If we have more than one rt_task queued, then
@@ -700,10 +715,9 @@ static void schedule_tail_balance_rt(struct rq *rq)
 }
 
 
-static void wakeup_balance_rt(struct rq *rq, struct task_struct *p)
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
-       if (unlikely(rt_task(p)) &&
-           !task_running(rq, p) &&
+       if (!task_running(rq, p) &&
            (p->prio >= rq->rt.highest_prio) &&
            rq->rt.overloaded)
                push_rt_tasks(rq);
@@ -726,6 +740,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        /* don't touch RT tasks */
        return 0;
 }
+
 static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
 {
        int weight = cpus_weight(*new_mask);
@@ -739,9 +754,9 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
        if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
                struct rq *rq = task_rq(p);
 
-               if ((p->nr_cpus_allowed <= 1) && (weight > 1))
+               if ((p->nr_cpus_allowed <= 1) && (weight > 1)) {
                        rq->rt.rt_nr_migratory++;
-               else if((p->nr_cpus_allowed > 1) && (weight <= 1)) {
+               } else if ((p->nr_cpus_allowed > 1) && (weight <= 1)) {
                        BUG_ON(!rq->rt.rt_nr_migratory);
                        rq->rt.rt_nr_migratory--;
                }
@@ -752,12 +767,107 @@ static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
        p->cpus_allowed    = *new_mask;
        p->nr_cpus_allowed = weight;
 }
-#else /* CONFIG_SMP */
-# define schedule_tail_balance_rt(rq)  do { } while (0)
-# define schedule_balance_rt(rq, prev) do { } while (0)
-# define wakeup_balance_rt(rq, p)      do { } while (0)
+
+/* Assumes rq->lock is held */
+static void join_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_set_overload(rq);
+}
+
+/* Assumes rq->lock is held */
+static void leave_domain_rt(struct rq *rq)
+{
+       if (rq->rt.overloaded)
+               rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       /*
+        * If there are other RT tasks then we will reschedule
+        * and the scheduling of the other RT tasks will handle
+        * the balancing. But if we are the last RT task
+        * we may need to handle the pulling of RT tasks
+        * now.
+        */
+       if (!rq->rt.rt_nr_running)
+               pull_rt_task(rq);
+}
 #endif /* CONFIG_SMP */
 
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+                          int running)
+{
+       int check_resched = 1;
+
+       /*
+        * If we are already running, then there's nothing
+        * that needs to be done. But if we are not running
+        * we may need to preempt the current running task.
+        * If that current running task is also an RT task
+        * then see if we can move to another run queue.
+        */
+       if (!running) {
+#ifdef CONFIG_SMP
+               if (rq->rt.overloaded && push_rt_task(rq) &&
+                   /* Don't resched if we changed runqueues */
+                   rq != task_rq(p))
+                       check_resched = 0;
+#endif /* CONFIG_SMP */
+               if (check_resched && p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+                           int oldprio, int running)
+{
+       if (running) {
+#ifdef CONFIG_SMP
+               /*
+                * If our priority decreases while running, we
+                * may need to pull tasks to this runqueue.
+                */
+               if (oldprio < p->prio)
+                       pull_rt_task(rq);
+               /*
+                * If there's a higher priority task waiting to run
+                * then reschedule.
+                */
+               if (p->prio > rq->rt.highest_prio)
+                       resched_task(p);
+#else
+               /* For UP simply resched on drop of prio */
+               if (oldprio < p->prio)
+                       resched_task(p);
+#endif /* CONFIG_SMP */
+       } else {
+               /*
+                * This task is not running, but if it is
+                * greater than the current running task
+                * then reschedule.
+                */
+               if (p->prio < rq->curr->prio)
+                       resched_task(rq->curr);
+       }
+}
+
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
        update_curr_rt(rq);
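
The new prio_changed_rt() above reduces, on the SMP side, to a small decision table: a running RT task whose priority dropped looks for tasks to pull, a running task now outranked by something queued on its own rq reschedules itself, and a non-running task that now outranks rq->curr forces a preemption. Here is a stand-alone userspace C model of just that table; the flag names and the reading of rq_highest_prio as "best priority queued on this rq" are illustrative, and lower numbers mean higher priority.

#include <stdbool.h>
#include <stdio.h>

/* Reactions to a priority change; more than one can apply, hence flags. */
#define RT_ACT_PULL		0x1	/* look for RT tasks to pull here */
#define RT_ACT_RESCHED_SELF	0x2	/* the changed task should yield */
#define RT_ACT_RESCHED_CURR	0x4	/* preempt the currently running task */

static unsigned int prio_changed_rt_model(bool running, int oldprio,
					  int newprio, int rq_highest_prio,
					  int curr_prio)
{
	unsigned int act = 0;

	if (running) {
		if (oldprio < newprio)		/* priority dropped: maybe pull */
			act |= RT_ACT_PULL;
		if (newprio > rq_highest_prio)	/* a queued task now outranks us */
			act |= RT_ACT_RESCHED_SELF;
	} else if (newprio < curr_prio) {	/* we now outrank the running task */
		act |= RT_ACT_RESCHED_CURR;
	}

	return act;
}

int main(void)
{
	/* A running task lowered from prio 10 to 60 with a prio-40 task queued. */
	unsigned int act = prio_changed_rt_model(true, 10, 60, 40, 10);

	printf("pull=%d resched_self=%d resched_curr=%d\n",
	       !!(act & RT_ACT_PULL),
	       !!(act & RT_ACT_RESCHED_SELF),
	       !!(act & RT_ACT_RESCHED_CURR));
	return 0;
}

switched_to_rt() above follows the same shape for a task newly switched to RT: try to push it away first when the rq is overloaded, and only skip the resched check when the push actually moved it to another runqueue.
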
@@ -809,8 +919,17 @@ const struct sched_class rt_sched_class = {
        .load_balance           = load_balance_rt,
        .move_one_task          = move_one_task_rt,
        .set_cpus_allowed       = set_cpus_allowed_rt,
+       .join_domain            = join_domain_rt,
+       .leave_domain           = leave_domain_rt,
+       .pre_schedule           = pre_schedule_rt,
+       .post_schedule          = post_schedule_rt,
+       .task_wake_up           = task_wake_up_rt,
+       .switched_from          = switched_from_rt,
 #endif
 
        .set_curr_task          = set_curr_task_rt,
        .task_tick              = task_tick_rt,
+
+       .prio_changed           = prio_changed_rt,
+       .switched_to            = switched_to_rt,
 };
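
For reference, the hooks added to rt_sched_class above all use the plain designated-initializer callback-table idiom. A toy, compilable illustration of that idiom follows; none of these types or names are the kernel's.

#include <stdio.h>

/* Toy callback table in the same designated-initializer style. */
struct toy_class {
	void (*pre_schedule)(int cpu);
	void (*post_schedule)(int cpu);
	void (*prio_changed)(int cpu, int oldprio, int newprio);
};

static void toy_pre_schedule(int cpu)  { printf("pre_schedule on cpu%d\n", cpu); }
static void toy_post_schedule(int cpu) { printf("post_schedule on cpu%d\n", cpu); }
static void toy_prio_changed(int cpu, int oldprio, int newprio)
{
	printf("cpu%d: prio %d -> %d\n", cpu, oldprio, newprio);
}

static const struct toy_class toy_sched_class = {
	.pre_schedule	= toy_pre_schedule,
	.post_schedule	= toy_post_schedule,
	.prio_changed	= toy_prio_changed,
};

int main(void)
{
	toy_sched_class.pre_schedule(0);
	toy_sched_class.prio_changed(0, 10, 60);
	toy_sched_class.post_schedule(0);
	return 0;
}

The core scheduler invokes whichever hooks a class fills in at the matching points (domain attach/detach, around a schedule, on wakeup, and on class or priority changes), which is how the RT push/pull balancing in the hunks above gets driven.
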