[PATCH] sched: force /sbin/init off isolated cpus
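The headline change here lands in sched_init_smp() (see the hunk near the end of this diff): the boot code computes the set of online, non-isolated CPUs and calls set_cpus_allowed() on current, i.e. the thread that will go on to exec /sbin/init, so init and everything it later forks inherits an affinity mask that excludes the CPUs given in isolcpus=. As a rough sanity check from userspace (not part of the patch; it only relies on the standard sched_getaffinity(2) interface), something like the following prints the CPUs a process inherited from init:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;
	int cpu;

	/* pid 0 queries the calling thread's affinity mask */
	if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_getaffinity");
		return 1;
	}

	printf("allowed cpus:");
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			printf(" %d", cpu);
	printf("\n");
	return 0;
}

Booting with e.g. isolcpus=2,3 and running this from a login shell should show CPUs 2 and 3 missing from the mask once the patch is applied.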
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ee400f9d56bc950c33c6afefd616ff1591359fc..ddf418810c3930eb7c15c617e0127c7014555154 100644
@@ -49,8 +49,9 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
-#include <linux/acct.h>
+#include <linux/tsacct_kern.h>
 #include <linux/kprobes.h>
+#include <linux/delayacct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -237,6 +238,7 @@ struct rq {
        /* For active balancing */
        int active_balance;
        int push_cpu;
+       int cpu;                /* cpu of this runqueue */
 
        struct task_struct *migration_thread;
        struct list_head migration_queue;
@@ -266,6 +268,15 @@ struct rq {
 
 static DEFINE_PER_CPU(struct rq, runqueues);
 
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       return rq->cpu;
+#else
+       return 0;
+#endif
+}
+
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
  * See detach_destroy_domains: synchronize_sched for details.
@@ -501,9 +512,36 @@ struct file_operations proc_schedstat_operations = {
        .release = single_release,
 };
 
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{
+       if (rq) {
+               rq->rq_sched_info.run_delay += delta_jiffies;
+               rq->rq_sched_info.pcnt++;
+       }
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{
+       if (rq)
+               rq->rq_sched_info.cpu_time += delta_jiffies;
+}
 # define schedstat_inc(rq, field)      do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
 #else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{}
 # define schedstat_inc(rq, field)      do { } while (0)
 # define schedstat_add(rq, field, amt) do { } while (0)
 #endif
@@ -523,7 +561,7 @@ static inline struct rq *this_rq_lock(void)
        return rq;
 }
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -551,21 +589,16 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-       unsigned long now = jiffies, diff = 0;
-       struct rq *rq = task_rq(t);
+       unsigned long now = jiffies, delta_jiffies = 0;
 
        if (t->sched_info.last_queued)
-               diff = now - t->sched_info.last_queued;
+               delta_jiffies = now - t->sched_info.last_queued;
        sched_info_dequeued(t);
-       t->sched_info.run_delay += diff;
+       t->sched_info.run_delay += delta_jiffies;
        t->sched_info.last_arrival = now;
        t->sched_info.pcnt++;
 
-       if (!rq)
-               return;
-
-       rq->rq_sched_info.run_delay += diff;
-       rq->rq_sched_info.pcnt++;
+       rq_sched_info_arrive(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -585,8 +618,9 @@ static void sched_info_arrive(struct task_struct *t)
  */
 static inline void sched_info_queued(struct task_struct *t)
 {
-       if (!t->sched_info.last_queued)
-               t->sched_info.last_queued = jiffies;
+       if (unlikely(sched_info_on()))
+               if (!t->sched_info.last_queued)
+                       t->sched_info.last_queued = jiffies;
 }
 
 /*
@@ -595,13 +629,10 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-       struct rq *rq = task_rq(t);
-       unsigned long diff = jiffies - t->sched_info.last_arrival;
-
-       t->sched_info.cpu_time += diff;
+       unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
 
-       if (rq)
-               rq->rq_sched_info.cpu_time += diff;
+       t->sched_info.cpu_time += delta_jiffies;
+       rq_sched_info_depart(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -610,7 +641,7 @@ static inline void sched_info_depart(struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
 {
        struct rq *rq = task_rq(prev);
 
@@ -625,10 +656,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
        if (next != rq->idle)
                sched_info_arrive(next);
 }
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+       if (unlikely(sched_info_on()))
+               __sched_info_switch(prev, next);
+}
 #else
 #define sched_info_queued(t)           do { } while (0)
 #define sched_info_switch(t, next)     do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
 /*
  * Adding/removing a task to/from a priority array:
@@ -1530,8 +1567,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
 
        INIT_LIST_HEAD(&p->run_list);
        p->array = NULL;
-#ifdef CONFIG_SCHEDSTATS
-       memset(&p->sched_info, 0, sizeof(p->sched_info));
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+       if (unlikely(sched_info_on()))
+               memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
        p->oncpu = 0;
@@ -1717,27 +1755,27 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
        __releases(rq->lock)
 {
        struct mm_struct *mm = rq->prev_mm;
-       unsigned long prev_task_flags;
+       long prev_state;
 
        rq->prev_mm = NULL;
 
        /*
         * A task struct has one reference for the use as "current".
-        * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and
-        * calls schedule one last time. The schedule call will never return,
-        * and the scheduled task must drop that reference.
-        * The test for EXIT_ZOMBIE must occur while the runqueue locks are
+        * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+        * schedule one last time. The schedule call will never return, and
+        * the scheduled task must drop that reference.
+        * The test for TASK_DEAD must occur while the runqueue locks are
         * still held, otherwise prev could be scheduled on another cpu, die
         * there before we look at prev->state, and then the reference would
         * be dropped twice.
         *              Manfred Spraul <manfred@colorfullife.com>
         */
-       prev_task_flags = prev->flags;
+       prev_state = prev->state;
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        if (mm)
                mmdrop(mm);
-       if (unlikely(prev_task_flags & PF_DEAD)) {
+       if (unlikely(prev_state == TASK_DEAD)) {
                /*
                 * Remove function-return probe instances associated with this
                 * task and put them back on the free list.
@@ -1788,7 +1826,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
                WARN_ON(rq->prev_mm);
                rq->prev_mm = oldmm;
        }
+       /*
+        * The runqueue lock will be released by the next task
+        * (which is an invalid locking op, but in the case of
+        * the scheduler it's an obvious special-case), so we do
+        * an early lockdep release here:
+        */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
 
        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);
@@ -2175,7 +2221,8 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle, int *sd_idle)
+                  unsigned long *imbalance, enum idle_type idle, int *sd_idle,
+                  cpumask_t *cpus)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@ -2212,7 +2259,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                sum_weighted_load = sum_nr_running = avg_load = 0;
 
                for_each_cpu_mask(i, group->cpumask) {
-                       struct rq *rq = cpu_rq(i);
+                       struct rq *rq;
+
+                       if (!cpu_isset(i, *cpus))
+                               continue;
+
+                       rq = cpu_rq(i);
 
                        if (*sd_idle && !idle_cpu(i))
                                *sd_idle = 0;
@@ -2430,13 +2482,17 @@ ret:
  */
 static struct rq *
 find_busiest_queue(struct sched_group *group, enum idle_type idle,
-                  unsigned long imbalance)
+                  unsigned long imbalance, cpumask_t *cpus)
 {
        struct rq *busiest = NULL, *rq;
        unsigned long max_load = 0;
        int i;
 
        for_each_cpu_mask(i, group->cpumask) {
+
+               if (!cpu_isset(i, *cpus))
+                       continue;
+
                rq = cpu_rq(i);
 
                if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
@@ -2475,6 +2531,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
+       cpumask_t cpus = CPU_MASK_ALL;
 
        if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
            !sched_smt_power_savings)
@@ -2482,13 +2539,15 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
        schedstat_inc(sd, lb_cnt[idle]);
 
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
+redo:
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+                                                       &cpus);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[idle]);
                goto out_balanced;
        }
 
-       busiest = find_busiest_queue(group, idle, imbalance);
+       busiest = find_busiest_queue(group, idle, imbalance, &cpus);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[idle]);
                goto out_balanced;
@@ -2513,8 +2572,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                double_rq_unlock(this_rq, busiest);
 
                /* All tasks on this runqueue were pinned by CPU affinity */
-               if (unlikely(all_pinned))
+               if (unlikely(all_pinned)) {
+                       cpu_clear(cpu_of(busiest), cpus);
+                       if (!cpus_empty(cpus))
+                               goto redo;
                        goto out_balanced;
+               }
        }
 
        if (!nr_moved) {
@@ -2603,18 +2666,22 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
        unsigned long imbalance;
        int nr_moved = 0;
        int sd_idle = 0;
+       cpumask_t cpus = CPU_MASK_ALL;
 
        if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                sd_idle = 1;
 
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
+redo:
+       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
+                               &sd_idle, &cpus);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out_balanced;
        }
 
-       busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
+       busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
+                               &cpus);
        if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                goto out_balanced;
@@ -2632,6 +2699,12 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
                                        minus_1_or_zero(busiest->nr_running),
                                        imbalance, sd, NEWLY_IDLE, NULL);
                spin_unlock(&busiest->lock);
+
+               if (!nr_moved) {
+                       cpu_clear(cpu_of(busiest), cpus);
+                       if (!cpus_empty(cpus))
+                               goto redo;
+               }
        }
 
        if (!nr_moved) {
@@ -3275,9 +3348,6 @@ need_resched_nonpreemptible:
 
        spin_lock_irq(&rq->lock);
 
-       if (unlikely(prev->flags & PF_DEAD))
-               prev->state = EXIT_DEAD;
-
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                switch_count = &prev->nvcsw;
@@ -3384,7 +3454,7 @@ EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_PREEMPT
 /*
- * this is is the entry point to schedule() from in-kernel preemption
+ * this is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable.  Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
@@ -3427,7 +3497,7 @@ need_resched:
 EXPORT_SYMBOL(preempt_schedule);
 
 /*
- * this is is the entry point to schedule() from kernel preemption
+ * this is the entry point to schedule() from kernel preemption
  * off of irq context.
 * Note that this is called and returns with irqs disabled. This will
  * protect us against recursive calling from irq.
@@ -3439,7 +3509,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
        struct task_struct *task = current;
        int saved_lock_depth;
 #endif
-       /* Catch callers which need to be fixed*/
+       /* Catch callers which need to be fixed */
        BUG_ON(ti->preempt_count || !irqs_disabled());
 
 need_resched:
@@ -4007,6 +4077,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
  * @p: the task in question.
  * @policy: new policy.
  * @param: structure containing the new RT priority.
+ *
+ * NOTE: the task may be already dead
  */
 int sched_setscheduler(struct task_struct *p, int policy,
                       struct sched_param *param)
@@ -4034,28 +4106,32 @@ recheck:
            (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
            (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
                return -EINVAL;
-       if ((policy == SCHED_NORMAL || policy == SCHED_BATCH)
-                                       != (param->sched_priority == 0))
+       if (is_rt_policy(policy) != (param->sched_priority != 0))
                return -EINVAL;
 
        /*
         * Allow unprivileged RT tasks to decrease priority:
         */
        if (!capable(CAP_SYS_NICE)) {
-               /*
-                * can't change policy, except between SCHED_NORMAL
-                * and SCHED_BATCH:
-                */
-               if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) &&
-                       (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) &&
-                               !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
-                       return -EPERM;
-               /* can't increase priority */
-               if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) &&
-                   param->sched_priority > p->rt_priority &&
-                   param->sched_priority >
-                               p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
-                       return -EPERM;
+               if (is_rt_policy(policy)) {
+                       unsigned long rlim_rtprio;
+                       unsigned long flags;
+
+                       if (!lock_task_sighand(p, &flags))
+                               return -ESRCH;
+                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                       unlock_task_sighand(p, &flags);
+
+                       /* can't set/change the rt policy */
+                       if (policy != p->policy && !rlim_rtprio)
+                               return -EPERM;
+
+                       /* can't increase priority */
+                       if (param->sched_priority > p->rt_priority &&
+                           param->sched_priority > rlim_rtprio)
+                               return -EPERM;
+               }
+
                /* can't change other user's priorities */
                if ((current->euid != p->euid) &&
                    (current->euid != p->uid))
@@ -4120,16 +4196,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
                return -EINVAL;
        if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
                return -EFAULT;
-       read_lock_irq(&tasklist_lock);
+
+       rcu_read_lock();
+       retval = -ESRCH;
        p = find_process_by_pid(pid);
-       if (!p) {
-               read_unlock_irq(&tasklist_lock);
-               return -ESRCH;
-       }
-       get_task_struct(p);
-       read_unlock_irq(&tasklist_lock);
-       retval = sched_setscheduler(p, policy, &lparam);
-       put_task_struct(p);
+       if (p != NULL)
+               retval = sched_setscheduler(p, policy, &lparam);
+       rcu_read_unlock();
 
        return retval;
 }
@@ -4311,7 +4384,10 @@ EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
 #endif
 
 long sched_getaffinity(pid_t pid, cpumask_t *mask)
@@ -4420,9 +4496,9 @@ asmlinkage long sys_sched_yield(void)
        return 0;
 }
 
-static inline int __resched_legal(void)
+static inline int __resched_legal(int expected_preempt_count)
 {
-       if (unlikely(preempt_count()))
+       if (unlikely(preempt_count() != expected_preempt_count))
                return 0;
        if (unlikely(system_state != SYSTEM_RUNNING))
                return 0;
@@ -4448,7 +4524,7 @@ static void __cond_resched(void)
 
 int __sched cond_resched(void)
 {
-       if (need_resched() && __resched_legal()) {
+       if (need_resched() && __resched_legal(0)) {
                __cond_resched();
                return 1;
        }
@@ -4474,7 +4550,7 @@ int cond_resched_lock(spinlock_t *lock)
                ret = 1;
                spin_lock(lock);
        }
-       if (need_resched() && __resched_legal()) {
+       if (need_resched() && __resched_legal(1)) {
                spin_release(&lock->dep_map, 1, _THIS_IP_);
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
@@ -4490,7 +4566,7 @@ int __sched cond_resched_softirq(void)
 {
        BUG_ON(!in_softirq());
 
-       if (need_resched() && __resched_legal()) {
+       if (need_resched() && __resched_legal(0)) {
                raw_local_irq_disable();
                _local_bh_enable();
                raw_local_irq_enable();
@@ -4526,9 +4602,11 @@ void __sched io_schedule(void)
 {
        struct rq *rq = &__raw_get_cpu_var(runqueues);
 
+       delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
        schedule();
        atomic_dec(&rq->nr_iowait);
+       delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 
@@ -4537,9 +4615,11 @@ long __sched io_schedule_timeout(long timeout)
        struct rq *rq = &__raw_get_cpu_var(runqueues);
        long ret;
 
+       delayacct_blkio_start();
        atomic_inc(&rq->nr_iowait);
        ret = schedule_timeout(timeout);
        atomic_dec(&rq->nr_iowait);
+       delayacct_blkio_end();
        return ret;
 }
 
@@ -4650,7 +4730,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
        return list_entry(p->sibling.next,struct task_struct,sibling);
 }
 
-static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+static const char stat_nam[] = "RSDTtZX";
 
 static void show_task(struct task_struct *p)
 {
@@ -4658,12 +4738,9 @@ static void show_task(struct task_struct *p)
        unsigned long free = 0;
        unsigned state;
 
-       printk("%-13.13s ", p->comm);
        state = p->state ? __ffs(p->state) + 1 : 0;
-       if (state < ARRAY_SIZE(stat_nam))
-               printk(stat_nam[state]);
-       else
-               printk("?");
+       printk("%-13.13s %c", p->comm,
+               state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if (BITS_PER_LONG == 32)
        if (state == TASK_RUNNING)
                printk(" running ");
@@ -4740,7 +4817,7 @@ void show_state(void)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
@@ -4877,7 +4954,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
                p->timestamp = p->timestamp - rq_src->timestamp_last_tick
                                + rq_dest->timestamp_last_tick;
                deactivate_task(p, rq_src);
-               activate_task(p, rq_dest, 0);
+               __activate_task(p, rq_dest);
                if (TASK_PREEMPTS_CURR(p, rq_dest))
                        resched_task(rq_dest->curr);
        }
@@ -5079,7 +5156,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
        BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
 
        /* Cannot have done final schedule yet: would have vanished. */
-       BUG_ON(p->flags & PF_DEAD);
+       BUG_ON(p->state == TASK_DEAD);
 
        get_task_struct(p);
 
@@ -5200,9 +5277,11 @@ static struct notifier_block __cpuinitdata migration_notifier = {
 int __init migration_init(void)
 {
        void *cpu = (void *)(long)smp_processor_id();
+       int err;
 
        /* Start one for the boot CPU: */
-       migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+       err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
+       BUG_ON(err == NOTIFY_BAD);
        migration_call(&migration_notifier, CPU_ONLINE, cpu);
        register_cpu_notifier(&migration_notifier);
 
@@ -5382,7 +5461,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -5776,7 +5855,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
        cache = vmalloc(max_size);
        if (!cache) {
                printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
-               return 1000000; // return 1 msec on very small boxen
+               return 1000000; /* return 1 msec on very small boxen */
        }
 
        while (size <= max_size) {
@@ -6457,7 +6536,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
        for (i = 0; i < MAX_NUMNODES; i++)
                init_numa_sched_groups_power(sched_group_nodes[i]);
 
-       init_numa_sched_groups_power(sched_group_allnodes);
+       if (sched_group_allnodes) {
+               int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+               struct sched_group *sg = &sched_group_allnodes[group];
+
+               init_numa_sched_groups_power(sg);
+       }
 #endif
 
        /* Attach the domains */
@@ -6663,11 +6747,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
+       cpumask_t non_isolated_cpus;
+
        lock_cpu_hotplug();
        arch_init_sched_domains(&cpu_online_map);
+       cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+       if (cpus_empty(non_isolated_cpus))
+               cpu_set(smp_processor_id(), non_isolated_cpus);
        unlock_cpu_hotplug();
        /* XXX: Theoretical race here - CPU may be hotplugged now */
        hotcpu_notifier(update_sched_domains, 0);
+
+       /* Move init over to a non-isolated CPU */
+       if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+               BUG();
 }
 #else
 void __init sched_init_smp(void)
@@ -6707,6 +6800,7 @@ void __init sched_init(void)
                        rq->cpu_load[j] = 0;
                rq->active_balance = 0;
                rq->push_cpu = 0;
+               rq->cpu = i;
                rq->migration_thread = NULL;
                INIT_LIST_HEAD(&rq->migration_queue);
 #endif
@@ -6724,6 +6818,11 @@ void __init sched_init(void)
        }
 
        set_load_weight(&init_task);
+
+#ifdef CONFIG_RT_MUTEXES
+       plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
+#endif
+
        /*
         * The boot idle thread does lazy MMU switching as well:
         */