X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=9474b23c28bf41f5989df3b94c5e810b0f1e971a;hb=e552b6617067ab785256dcec5ca29eeea981aacb;hp=2368a0d882e3c9c06034a2a1b26024650dd14f3e;hpb=9a897c5a6701bcb6f099f7ca20194999102729fd;p=linux-2.6-omap-h63xx.git diff --git a/kernel/sched.c b/kernel/sched.c index 2368a0d882e..9474b23c28b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,8 @@ struct rt_prio_array { struct cfs_rq; +static LIST_HEAD(task_groups); + /* task group related information */ struct task_group { #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -170,6 +173,11 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + unsigned int rt_ratio; + /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. The more shares a group has, the more is @@ -182,7 +190,7 @@ struct task_group { * * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% * * The weight assigned to a task group's schedulable entities on every * cpu (task_group.se[a_cpu]->load.weight) is derived from the task @@ -192,9 +200,9 @@ struct task_group { * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; * * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: * * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 @@ -207,6 +215,7 @@ struct task_group { unsigned long shares; struct rcu_head rcu; + struct list_head list; }; /* Default task group's sched entity on each cpu */ @@ -214,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; + static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; +static struct rt_rq *init_rt_rq_p[NR_CPUS]; + /* task_group_mutex serializes add/remove of task groups and also changes to * a task group's cpu shares. */ @@ -239,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares); struct task_group init_task_group = { .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, + + .rt_se = init_sched_rt_entity_p, + .rt_rq = init_rt_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED @@ -268,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p) } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; + + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; } static inline void lock_task_group_list(void) @@ -296,7 +317,7 @@ static inline void unlock_doms_cur(void) #else -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_task_group_list(void) { } static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } @@ -341,13 +362,23 @@ struct cfs_rq { /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif +#ifdef CONFIG_SMP unsigned long rt_nr_migratory; - /* highest queued rt task prio */ - int highest_prio; int overloaded; +#endif + int rt_throttled; + u64 rt_time; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif }; #ifdef CONFIG_SMP @@ -359,8 +390,6 @@ struct rt_rq { * exclusive cpuset is created, we also create and attach a new root-domain * object. * - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). */ struct root_domain { atomic_t refcount; @@ -375,6 +404,10 @@ struct root_domain { atomic_t rto_count; }; +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ static struct root_domain def_root_domain; #endif @@ -407,11 +440,15 @@ struct rq { u64 nr_switches; struct cfs_rq cfs; + struct rt_rq rt; + u64 rt_period_expire; + int rt_throttled; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head leaf_rt_rq_list; #endif - struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -428,7 +465,7 @@ struct rq { u64 clock, prev_clock_raw; s64 clock_max_delta; - unsigned int clock_warps, clock_overflows; + unsigned int clock_warps, clock_overflows, clock_underflows; u64 idle_clock; unsigned int clock_deep_idle_events; u64 tick_timestamp; @@ -449,6 +486,12 @@ struct rq { struct list_head migration_queue; #endif +#ifdef CONFIG_SCHED_HRTICK + unsigned long hrtick_flags; + ktime_t hrtick_expire; + struct hrtimer hrtick_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -552,6 +595,23 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq->rt_throttled) + return 0; + + if (rq->clock > rq->rt_period_expire) + return 1; + + delta = rq->rt_period_expire - rq->clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -570,6 +630,8 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, }; const_debug unsigned int sysctl_sched_features = @@ -577,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_HRTICK * 1 | + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -587,6 +651,21 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * period over which we measure -rt task cpu usage in ms. + * default: 1s + */ +const_debug unsigned int sysctl_sched_rt_period = 1000; + +#define SCHED_RT_FRAC_SHIFT 16 +#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) + +/* + * ratio of time -rt tasks may consume. + * default: 95% + */ +const_debug unsigned int sysctl_sched_rt_ratio = 62259; + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -779,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); - touch_softlockup_watchdog(); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all @@ -791,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) rq->prev_clock_raw = now; rq->clock += delta_ns; spin_unlock(&rq->lock); + touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +static void __resched_task(struct task_struct *p, int tif_bit); + +static inline void resched_task(struct task_struct *p) +{ + __resched_task(p, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ +static inline void resched_hrt(struct task_struct *p) +{ + __resched_task(p, TIF_HRTICK_RESCHED); +} + +static inline void resched_rq(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + resched_task(rq->curr); + spin_unlock_irqrestore(&rq->lock, flags); +} + +enum { + HRTICK_SET, /* re-programm hrtick_timer */ + HRTICK_RESET, /* not a new slice */ +}; + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay, int reset) +{ + assert_spin_locked(&rq->lock); + + /* + * preempt at: now + delay + */ + rq->hrtick_expire = + ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); + /* + * indicate we need to program the timer + */ + __set_bit(HRTICK_SET, &rq->hrtick_flags); + if (reset) + __set_bit(HRTICK_RESET, &rq->hrtick_flags); + + /* + * New slices are called from the schedule path and don't need a + * forced reschedule. + */ + if (reset) + resched_hrt(rq->curr); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * Update the timer from the possible pending state. + */ +static void hrtick_set(struct rq *rq) +{ + ktime_t time; + int set, reset; + unsigned long flags; + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock_irqsave(&rq->lock, flags); + set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); + reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); + time = rq->hrtick_expire; + clear_thread_flag(TIF_HRTICK_RESCHED); + spin_unlock_irqrestore(&rq->lock, flags); + + if (set) { + hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); + if (reset && !hrtimer_active(&rq->hrtick_timer)) + resched_rq(rq); + } else + hrtick_clear(rq); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static inline void init_rq_hrtick(struct rq *rq) +{ + rq->hrtick_flags = 0; + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +void hrtick_resched(void) +{ + struct rq *rq; + unsigned long flags; + + if (!test_thread_flag(TIF_HRTICK_RESCHED)) + return; + + local_irq_save(flags); + rq = cpu_rq(smp_processor_id()); + hrtick_set(rq); + local_irq_restore(flags); +} +#else +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void hrtick_set(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +void hrtick_resched(void) +{ +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -807,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (unlikely(test_tsk_thread_flag(p, tif_bit))) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_thread_flag(p, tif_bit); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -839,10 +1085,10 @@ static void resched_cpu(int cpu) spin_unlock_irqrestore(&rq->lock, flags); } #else -static inline void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); } #endif @@ -1009,12 +1255,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; } @@ -1104,11 +1350,11 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - if (p->state == TASK_UNINTERRUPTIBLE) + if (task_contributes_to_load(p)) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1116,11 +1362,11 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - if (p->state == TASK_UNINTERRUPTIBLE) + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -1140,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { - set_task_cfs_rq(p, cpu); + set_task_rq(p, cpu); #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be @@ -1152,6 +1398,18 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) #endif } +static inline void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, + int oldprio, int running) +{ + if (prev_class != p->sched_class) { + if (prev_class->switched_from) + prev_class->switched_from(rq, p, running); + p->sched_class->switched_to(rq, p, running); + } else + p->sched_class->prio_changed(rq, p, oldprio, running); +} + #ifdef CONFIG_SMP /* @@ -1637,8 +1895,7 @@ out: int fastcall wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + return try_to_wake_up(p, TASK_ALL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1671,7 +1928,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; #endif - INIT_LIST_HEAD(&p->run_list); + INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1748,7 +2005,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3478,12 +3735,14 @@ void scheduler_tick(void) /* * Let rq->clock advance by at least TICK_NSEC: */ - if (unlikely(rq->clock < next_tick)) + if (unlikely(rq->clock < next_tick)) { rq->clock = next_tick; + rq->clock_underflows++; + } rq->tick_timestamp = rq->clock; update_cpu_load(rq); - if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); + curr->sched_class->task_tick(rq, curr, 0); + update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3629,6 +3888,8 @@ need_resched_nonpreemptible: schedule_debug(prev); + hrtick_clear(rq); + /* * Do the rq-clock update outside the rq lock: */ @@ -3666,14 +3927,20 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + hrtick_set(rq); + + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; - } + preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3689,10 +3956,9 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -3708,14 +3974,10 @@ asmlinkage void __sched preempt_schedule(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif schedule(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3736,10 +3998,9 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); @@ -3751,16 +4012,12 @@ asmlinkage void __sched preempt_schedule_irq(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif local_irq_enable(); schedule(); local_irq_disable(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3866,8 +4123,7 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -3878,8 +4134,7 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -3893,8 +4148,10 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if (state == TASK_INTERRUPTIBLE && - signal_pending(current)) { + if ((state == TASK_INTERRUPTIBLE && + signal_pending(current)) || + (state == TASK_KILLABLE && + fatal_signal_pending(current))) { __remove_wait_queue(&x->wait, &wait); return -ERESTARTSYS; } @@ -3954,6 +4211,15 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -4017,6 +4283,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq, running; struct rq *rq; + const struct sched_class *prev_class = p->sched_class; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -4042,18 +4309,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + enqueue_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } task_rq_unlock(rq, &flags); } @@ -4253,6 +4512,7 @@ int sched_setscheduler(struct task_struct *p, int policy, { int retval, oldprio, oldpolicy = -1, on_rq, running; unsigned long flags; + const struct sched_class *prev_class = p->sched_class; struct rq *rq; /* may grab non-irq protected spin_locks */ @@ -4346,18 +4606,10 @@ recheck: if (on_rq) { if (running) p->sched_class->set_curr_task(rq); + activate_task(rq, p, 0); - /* - * Reschedule if we are currently running on this runqueue and - * our priority decreased, or if we are not currently running on - * this runqueue and our priority is higher than the current's - */ - if (running) { - if (p->prio > oldprio) - resched_task(rq->curr); - } else { - check_preempt_curr(rq, p); - } + + check_class_changed(rq, p, prev_class, oldprio, running); } __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4678,7 +4930,8 @@ static void __cond_resched(void) } while (need_resched()); } -int __sched cond_resched(void) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) +int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && system_state == SYSTEM_RUNNING) { @@ -4687,7 +4940,8 @@ int __sched cond_resched(void) } return 0; } -EXPORT_SYMBOL(cond_resched); +EXPORT_SYMBOL(_cond_resched); +#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4699,19 +4953,15 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t *lock) { + int resched = need_resched() && system_state == SYSTEM_RUNNING; int ret = 0; - if (need_lockbreak(lock)) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); - cpu_relax(); - ret = 1; - spin_lock(lock); - } - if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); ret = 1; spin_lock(lock); } @@ -4915,8 +5165,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); - if (state != TASK_RUNNING) - show_stack(p, NULL); + show_stack(p, NULL); } void show_state_filter(unsigned long state_filter) @@ -4987,11 +5236,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ @@ -5076,7 +5322,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) p->sched_class->set_cpus_allowed(p, &new_mask); else { p->cpus_allowed = new_mask; - p->nr_cpus_allowed = cpus_weight(new_mask); + p->rt.nr_cpus_allowed = cpus_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ @@ -5861,6 +6107,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) class->leave_domain(rq); } + cpu_clear(rq->cpu, old_rd->span); + cpu_clear(rq->cpu, old_rd->online); + if (atomic_dec_and_test(&old_rd->refcount)) kfree(old_rd); } @@ -5868,6 +6117,10 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) atomic_inc(&rd->refcount); rq->rd = rd; + cpu_set(rq->cpu, rd->span); + if (cpu_isset(rq->cpu, cpu_online_map)) + cpu_set(rq->cpu, rd->online); + for (class = sched_class_highest; class; class = class->next) { if (class->join_domain) class->join_domain(rq); @@ -5876,23 +6129,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) spin_unlock_irqrestore(&rq->lock, flags); } -static void init_rootdomain(struct root_domain *rd, const cpumask_t *map) +static void init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - rd->span = *map; - cpus_and(rd->online, rd->span, cpu_online_map); + cpus_clear(rd->span); + cpus_clear(rd->online); } static void init_defrootdomain(void) { - cpumask_t cpus = CPU_MASK_ALL; - - init_rootdomain(&def_root_domain, &cpus); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } -static struct root_domain *alloc_rootdomain(const cpumask_t *map) +static struct root_domain *alloc_rootdomain(void) { struct root_domain *rd; @@ -5900,7 +6151,7 @@ static struct root_domain *alloc_rootdomain(const cpumask_t *map) if (!rd) return NULL; - init_rootdomain(rd, map); + init_rootdomain(rd); return rd; } @@ -6321,7 +6572,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; #endif - rd = alloc_rootdomain(cpu_map); + rd = alloc_rootdomain(); if (!rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); return -ENOMEM; @@ -6848,6 +7099,73 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + +#ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq->rq = rq; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, + struct cfs_rq *cfs_rq, struct sched_entity *se, + int cpu, int add) +{ + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); + se->parent = NULL; +} + +static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, + struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int cpu, int add) +{ + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + rt_se->rt_rq = &rq->rt; + rt_se->my_q = rt_rq; + rt_se->parent = NULL; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + void __init sched_init(void) { int highest_cpu = 0; @@ -6857,8 +7175,11 @@ void __init sched_init(void) init_defrootdomain(); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); +#endif + for_each_possible_cpu(i) { - struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); @@ -6867,55 +7188,39 @@ void __init sched_init(void) rq->nr_running = 0; rq->clock = 1; init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_group; - list_add(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_group_load; - se->load.inv_weight = - div64_64(1ULL<<32, init_task_group_load); - se->parent = NULL; - } init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + init_tg_cfs_entry(rq, &init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1); + + init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(rq, &init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1); #endif + rq->rt_period_expire = 0; + rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq_attach_root(rq, &def_root_domain); rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->rt.highest_prio = MAX_RT_PRIO; - rq->rt.overloaded = 0; + rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); - } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -7087,7 +7392,7 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, - * to reflect load distrbution across cpus. + * to reflect load distribution across cpus. */ static int rebalance_shares(struct sched_domain *sd, int this_cpu) { @@ -7154,7 +7459,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu) * sysctl_sched_max_bal_int_shares represents the maximum interval between * consecutive calls to rebalance_shares() in the same sched domain. * - * These settings allows for the appropriate tradeoff between accuracy of + * These settings allows for the appropriate trade-off between accuracy of * fairness and the associated overhead. * */ @@ -7235,12 +7540,36 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ +static void free_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg->rt_rq); + kfree(tg->rt_se); + kfree(tg); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(void) { struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -7254,100 +7583,89 @@ struct task_group *sched_create_group(void) tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->shares = NICE_0_LOAD; + tg->rt_ratio = 0; /* XXX */ for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, - cpu_to_node(i)); + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, - cpu_to_node(i)); + se = kmalloc_node(sizeof(struct sched_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!se) goto err; - memset(cfs_rq, 0, sizeof(struct cfs_rq)); - memset(se, 0, sizeof(struct sched_entity)); + rt_rq = kmalloc_node(sizeof(struct rt_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_rq) + goto err; - tg->cfs_rq[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; + rt_se = kmalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_se) + goto err; - tg->se[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); - se->parent = NULL; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - tg->shares = NICE_0_LOAD; - lock_task_group_list(); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); } + list_add_rcu(&tg->list, &task_groups); unlock_task_group_list(); return tg; err: - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); - + free_sched_group(tg); return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group(struct rcu_head *rhp) +static void free_sched_group_rcu(struct rcu_head *rhp) { - struct task_group *tg = container_of(rhp, struct task_group, rcu); - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - /* now it should be safe to free those cfs_rqs */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - kfree(cfs_rq); - - se = tg->se[i]; - kfree(se); - } - - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); + free_sched_group(container_of(rhp, struct task_group, rcu)); } /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq = NULL; + struct rt_rq *rt_rq = NULL; int i; lock_task_group_list(); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_del_rcu(&rt_rq->leaf_rt_rq_list); } + list_del_rcu(&tg->list); unlock_task_group_list(); BUG_ON(!cfs_rq); /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group); + call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7363,11 +7681,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - if (tsk->sched_class != &fair_sched_class) { - set_task_cfs_rq(tsk, task_cpu(tsk)); - goto done; - } - update_rq_clock(rq); running = task_current(rq, tsk); @@ -7379,7 +7692,7 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->put_prev_task(rq, tsk); } - set_task_cfs_rq(tsk, task_cpu(tsk)); + set_task_rq(tsk, task_cpu(tsk)); if (on_rq) { if (unlikely(running)) @@ -7387,7 +7700,6 @@ void sched_move_task(struct task_struct *tsk) enqueue_task(rq, tsk, 0); } -done: task_rq_unlock(rq, &flags); } @@ -7472,6 +7784,31 @@ unsigned long sched_group_shares(struct task_group *tg) return tg->shares; } +/* + * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + */ +int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +{ + struct task_group *tgi; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &task_groups, list) + total += tgi->rt_ratio; + rcu_read_unlock(); + + if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) + return -EINVAL; + + tg->rt_ratio = rt_ratio; + return 0; +} + +unsigned long sched_group_rt_ratio(struct task_group *tg) +{ + return tg->rt_ratio; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -7547,12 +7884,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_ratio_val) +{ + return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); +} + +static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->rt_ratio; +} + static struct cftype cpu_files[] = { { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, + { + .name = "rt_ratio", + .read_uint = cpu_rt_ratio_read_uint, + .write_uint = cpu_rt_ratio_write_uint, + }, }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)