X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=b387a8de26a5f9ca8b6e2e9d3ad33f66fbf65882;hb=1ff82fe0024e8070c38346b8abc1ff09612dea4c;hp=b9ee0f4db66a07a749a7f774dc0fec1053e9e3ed;hpb=03319ec8b06849051747a17aa2a0f9aba9277980;p=linux-2.6-omap-h63xx.git diff --git a/kernel/sched.c b/kernel/sched.c index b9ee0f4db66..b387a8de26a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -154,17 +155,21 @@ struct rt_prio_array { struct list_head queue[MAX_RT_PRIO]; }; -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED #include struct cfs_rq; +static LIST_HEAD(task_groups); + /* task group related information */ struct task_group { -#ifdef CONFIG_FAIR_CGROUP_SCHED +#ifdef CONFIG_CGROUP_SCHED struct cgroup_subsys_state css; #endif + +#ifdef CONFIG_FAIR_GROUP_SCHED /* schedulable entities of this group on each cpu */ struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ @@ -205,10 +210,20 @@ struct task_group { * */ unsigned long shares; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + u64 rt_runtime; +#endif struct rcu_head rcu; + struct list_head list; }; +#ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ @@ -216,15 +231,25 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; -/* task_group_mutex serializes add/remove of task groups and also changes to +static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; +static struct rt_rq *init_rt_rq_p[NR_CPUS]; +#endif + +/* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. */ -static DEFINE_MUTEX(task_group_mutex); +static DEFINE_SPINLOCK(task_group_lock); /* doms_cur_mutex serializes access to doms_cur[] array */ static DEFINE_MUTEX(doms_cur_mutex); +#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* kernel thread that runs rebalance_shares() periodically */ static struct task_struct *lb_monitor_task; @@ -233,15 +258,7 @@ static int load_balance_monitor(void *unused); static void set_se_shares(struct sched_entity *se, unsigned long shares); -/* Default task group. - * Every task in system belong to this group at bootup. - */ -struct task_group init_task_group = { - .se = init_sched_entity_p, - .cfs_rq = init_cfs_rq_p, -}; - -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GROUP_LOAD NICE_0_LOAD @@ -250,15 +267,31 @@ struct task_group init_task_group = { #define MIN_GROUP_SHARES 2 static int init_task_group_load = INIT_TASK_GROUP_LOAD; +#endif + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +struct task_group init_task_group = { +#ifdef CONFIG_FAIR_GROUP_SCHED + .se = init_sched_entity_p, + .cfs_rq = init_cfs_rq_p, +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + .rt_se = init_sched_rt_entity_p, + .rt_rq = init_rt_rq_p, +#endif +}; /* return group to which a task belongs */ static inline struct task_group *task_group(struct task_struct *p) { struct task_group *tg; -#ifdef CONFIG_FAIR_USER_SCHED +#ifdef CONFIG_USER_SCHED tg = p->user->tg; -#elif defined(CONFIG_FAIR_CGROUP_SCHED) +#elif defined(CONFIG_CGROUP_SCHED) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else @@ -268,20 +301,17 @@ static inline struct task_group *task_group(struct task_struct *p) } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { +#ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; -} - -static inline void lock_task_group_list(void) -{ - mutex_lock(&task_group_mutex); -} +#endif -static inline void unlock_task_group_list(void) -{ - mutex_unlock(&task_group_mutex); +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif } static inline void lock_doms_cur(void) @@ -296,13 +326,11 @@ static inline void unlock_doms_cur(void) #else -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_task_group_list(void) { } -static inline void unlock_task_group_list(void) { } +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_doms_cur(void) { } static inline void unlock_doms_cur(void) { } -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_GROUP_SCHED */ /* CFS-related fields in a runqueue */ struct cfs_rq { @@ -341,13 +369,25 @@ struct cfs_rq { /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif +#ifdef CONFIG_SMP unsigned long rt_nr_migratory; - /* highest queued rt task prio */ - int highest_prio; int overloaded; +#endif + int rt_throttled; + u64 rt_time; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif }; #ifdef CONFIG_SMP @@ -409,11 +449,17 @@ struct rq { u64 nr_switches; struct cfs_rq cfs; + struct rt_rq rt; + u64 rt_period_expire; + int rt_throttled; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; #endif - struct rt_rq rt; +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif /* * This is part of a global counter where only the total sum @@ -430,7 +476,7 @@ struct rq { u64 clock, prev_clock_raw; s64 clock_max_delta; - unsigned int clock_warps, clock_overflows; + unsigned int clock_warps, clock_overflows, clock_underflows; u64 idle_clock; unsigned int clock_deep_idle_events; u64 tick_timestamp; @@ -451,6 +497,12 @@ struct rq { struct list_head migration_queue; #endif +#ifdef CONFIG_SCHED_HRTICK + unsigned long hrtick_flags; + ktime_t hrtick_expire; + struct hrtimer hrtick_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -554,6 +606,23 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq->rt_throttled) + return 0; + + if (rq->clock > rq->rt_period_expire) + return 1; + + delta = rq->rt_period_expire - rq->clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -572,6 +641,8 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, }; const_debug unsigned int sysctl_sched_features = @@ -579,7 +650,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_HRTICK * 1 | + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -589,6 +662,23 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -781,7 +871,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); - touch_softlockup_watchdog(); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all @@ -793,9 +882,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) rq->prev_clock_raw = now; rq->clock += delta_ns; spin_unlock(&rq->lock); + touch_softlockup_watchdog(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +static void __resched_task(struct task_struct *p, int tif_bit); + +static inline void resched_task(struct task_struct *p) +{ + __resched_task(p, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ +static inline void resched_hrt(struct task_struct *p) +{ + __resched_task(p, TIF_HRTICK_RESCHED); +} + +static inline void resched_rq(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + resched_task(rq->curr); + spin_unlock_irqrestore(&rq->lock, flags); +} + +enum { + HRTICK_SET, /* re-programm hrtick_timer */ + HRTICK_RESET, /* not a new slice */ +}; + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay, int reset) +{ + assert_spin_locked(&rq->lock); + + /* + * preempt at: now + delay + */ + rq->hrtick_expire = + ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); + /* + * indicate we need to program the timer + */ + __set_bit(HRTICK_SET, &rq->hrtick_flags); + if (reset) + __set_bit(HRTICK_RESET, &rq->hrtick_flags); + + /* + * New slices are called from the schedule path and don't need a + * forced reschedule. + */ + if (reset) + resched_hrt(rq->curr); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * Update the timer from the possible pending state. + */ +static void hrtick_set(struct rq *rq) +{ + ktime_t time; + int set, reset; + unsigned long flags; + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock_irqsave(&rq->lock, flags); + set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); + reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); + time = rq->hrtick_expire; + clear_thread_flag(TIF_HRTICK_RESCHED); + spin_unlock_irqrestore(&rq->lock, flags); + + if (set) { + hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); + if (reset && !hrtimer_active(&rq->hrtick_timer)) + resched_rq(rq); + } else + hrtick_clear(rq); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static inline void init_rq_hrtick(struct rq *rq) +{ + rq->hrtick_flags = 0; + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +void hrtick_resched(void) +{ + struct rq *rq; + unsigned long flags; + + if (!test_thread_flag(TIF_HRTICK_RESCHED)) + return; + + local_irq_save(flags); + rq = cpu_rq(smp_processor_id()); + hrtick_set(rq); + local_irq_restore(flags); +} +#else +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void hrtick_set(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +void hrtick_resched(void) +{ +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -809,16 +1066,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (unlikely(test_tsk_thread_flag(p, tif_bit))) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_thread_flag(p, tif_bit); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -841,10 +1098,10 @@ static void resched_cpu(int cpu) spin_unlock_irqrestore(&rq->lock, flags); } #else -static inline void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); } #endif @@ -1011,12 +1268,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct task_struct *p, struct rq *rq) +static void inc_nr_running(struct rq *rq) { rq->nr_running++; } -static void dec_nr_running(struct task_struct *p, struct rq *rq) +static void dec_nr_running(struct rq *rq) { rq->nr_running--; } @@ -1106,11 +1363,11 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - if (p->state == TASK_UNINTERRUPTIBLE) + if (task_contributes_to_load(p)) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(p, rq); + inc_nr_running(rq); } /* @@ -1118,11 +1375,11 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - if (p->state == TASK_UNINTERRUPTIBLE) + if (task_contributes_to_load(p)) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(p, rq); + dec_nr_running(rq); } /** @@ -1142,7 +1399,7 @@ unsigned long weighted_cpuload(const int cpu) static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { - set_task_cfs_rq(p, cpu); + set_task_rq(p, cpu); #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be @@ -1574,6 +1831,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1649,14 +1907,13 @@ out: return success; } -int fastcall wake_up_process(struct task_struct *p) +int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + return try_to_wake_up(p, TASK_ALL, 0); } EXPORT_SYMBOL(wake_up_process); -int fastcall wake_up_state(struct task_struct *p, unsigned int state) +int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); } @@ -1743,7 +2000,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; @@ -1762,7 +2019,7 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(p, rq); + inc_nr_running(rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -3492,12 +3749,14 @@ void scheduler_tick(void) /* * Let rq->clock advance by at least TICK_NSEC: */ - if (unlikely(rq->clock < next_tick)) + if (unlikely(rq->clock < next_tick)) { rq->clock = next_tick; + rq->clock_underflows++; + } rq->tick_timestamp = rq->clock; update_cpu_load(rq); - if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); + curr->sched_class->task_tick(rq, curr, 0); + update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3508,7 +3767,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void fastcall add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3524,7 +3783,7 @@ void fastcall add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void fastcall sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? @@ -3643,6 +3902,8 @@ need_resched_nonpreemptible: schedule_debug(prev); + hrtick_clear(rq); + /* * Do the rq-clock update outside the rq lock: */ @@ -3680,14 +3941,20 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + hrtick_set(rq); + + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; - } + preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -3703,10 +3970,9 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. @@ -3722,14 +3988,10 @@ asmlinkage void __sched preempt_schedule(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif schedule(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3750,10 +4012,9 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); -#ifdef CONFIG_PREEMPT_BKL struct task_struct *task = current; int saved_lock_depth; -#endif + /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); @@ -3765,16 +4026,12 @@ asmlinkage void __sched preempt_schedule_irq(void) * clear ->lock_depth so that schedule() doesnt * auto-release the semaphore: */ -#ifdef CONFIG_PREEMPT_BKL saved_lock_depth = task->lock_depth; task->lock_depth = -1; -#endif local_irq_enable(); schedule(); local_irq_disable(); -#ifdef CONFIG_PREEMPT_BKL task->lock_depth = saved_lock_depth; -#endif sub_preempt_count(PREEMPT_ACTIVE); /* @@ -3824,7 +4081,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function */ -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; @@ -3838,7 +4095,7 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } @@ -3856,7 +4113,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; @@ -3880,8 +4137,7 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done++; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 1, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); @@ -3892,8 +4148,7 @@ void complete_all(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, - 0, 0, NULL); + __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); @@ -3907,8 +4162,10 @@ do_wait_for_common(struct completion *x, long timeout, int state) wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { - if (state == TASK_INTERRUPTIBLE && - signal_pending(current)) { + if ((state == TASK_INTERRUPTIBLE && + signal_pending(current)) || + (state == TASK_KILLABLE && + fatal_signal_pending(current))) { __remove_wait_queue(&x->wait, &wait); return -ERESTARTSYS; } @@ -3968,6 +4225,15 @@ wait_for_completion_interruptible_timeout(struct completion *x, } EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -4319,6 +4585,15 @@ recheck: return -EPERM; } +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_runtime == 0) + return -EPERM; +#endif + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4678,7 +4953,8 @@ static void __cond_resched(void) } while (need_resched()); } -int __sched cond_resched(void) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) +int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && system_state == SYSTEM_RUNNING) { @@ -4687,7 +4963,8 @@ int __sched cond_resched(void) } return 0; } -EXPORT_SYMBOL(cond_resched); +EXPORT_SYMBOL(_cond_resched); +#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -4699,19 +4976,15 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t *lock) { + int resched = need_resched() && system_state == SYSTEM_RUNNING; int ret = 0; - if (need_lockbreak(lock)) { + if (spin_needbreak(lock) || resched) { spin_unlock(lock); - cpu_relax(); - ret = 1; - spin_lock(lock); - } - if (need_resched() && system_state == SYSTEM_RUNNING) { - spin_release(&lock->dep_map, 1, _THIS_IP_); - _raw_spin_unlock(lock); - preempt_enable_no_resched(); - __cond_resched(); + if (resched && need_resched()) + __cond_resched(); + else + cpu_relax(); ret = 1; spin_lock(lock); } @@ -4915,8 +5188,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); - if (state != TASK_RUNNING) - show_stack(p, NULL); + show_stack(p, NULL); } void show_state_filter(unsigned long state_filter) @@ -4987,11 +5259,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ @@ -5076,7 +5345,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) p->sched_class->set_cpus_allowed(p, &new_mask); else { p->cpus_allowed = new_mask; - p->nr_cpus_allowed = cpus_weight(new_mask); + p->rt.nr_cpus_allowed = cpus_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ @@ -6853,6 +7122,76 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, + struct cfs_rq *cfs_rq, struct sched_entity *se, + int cpu, int add) +{ + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); + se->parent = NULL; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, + struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int cpu, int add) +{ + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + rt_se->rt_rq = &rq->rt; + rt_se->my_q = rt_rq; + rt_se->parent = NULL; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + void __init sched_init(void) { int highest_cpu = 0; @@ -6862,8 +7201,11 @@ void __init sched_init(void) init_defrootdomain(); #endif +#ifdef CONFIG_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); +#endif + for_each_possible_cpu(i) { - struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); @@ -6872,29 +7214,25 @@ void __init sched_init(void) rq->nr_running = 0; rq->clock = 1; init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_group; - list_add(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_group_load; - se->load.inv_weight = - div64_64(1ULL<<32, init_task_group_load); - se->parent = NULL; - } init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + init_tg_cfs_entry(rq, &init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1); + +#endif +#ifdef CONFIG_RT_GROUP_SCHED + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(rq, &init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1); #endif + rq->rt_period_expire = 0; + rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -6907,20 +7245,11 @@ void __init sched_init(void) rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->rt.highest_prio = MAX_RT_PRIO; - rq->rt.overloaded = 0; rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); - } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -7003,7 +7332,7 @@ void normalize_rt_tasks(void) unsigned long flags; struct rq *rq; - read_lock_irq(&tasklist_lock); + read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { /* * Only normalize user tasks: @@ -7029,16 +7358,16 @@ void normalize_rt_tasks(void) continue; } - spin_lock_irqsave(&p->pi_lock, flags); + spin_lock(&p->pi_lock); rq = __task_rq_lock(p); normalize_task(rq, p); __task_rq_unlock(rq); - spin_unlock_irqrestore(&p->pi_lock, flags); + spin_unlock(&p->pi_lock); } while_each_thread(g, p); - read_unlock_irq(&tasklist_lock); + read_unlock_irqrestore(&tasklist_lock, flags); } #endif /* CONFIG_MAGIC_SYSRQ */ @@ -7087,12 +7416,12 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_GROUP_SCHED -#ifdef CONFIG_SMP +#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, - * to reflect load distrbution across cpus. + * to reflect load distribution across cpus. */ static int rebalance_shares(struct sched_domain *sd, int this_cpu) { @@ -7159,7 +7488,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu) * sysctl_sched_max_bal_int_shares represents the maximum interval between * consecutive calls to rebalance_shares() in the same sched domain. * - * These settings allows for the appropriate tradeoff between accuracy of + * These settings allows for the appropriate trade-off between accuracy of * fairness and the associated overhead. * */ @@ -7240,19 +7569,29 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ -/* allocate runqueue etc for a new task group */ -struct task_group *sched_create_group(void) +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +static int alloc_fair_sched_group(struct task_group *tg) { - struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; struct rq *rq; int i; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); - if (!tg) - return ERR_PTR(-ENOMEM); - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * NR_CPUS, GFP_KERNEL); if (!tg->cfs_rq) goto err; @@ -7260,99 +7599,204 @@ struct task_group *sched_create_group(void) if (!tg->se) goto err; + tg->shares = NICE_0_LOAD; + for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, - cpu_to_node(i)); + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, - cpu_to_node(i)); + se = kmalloc_node(sizeof(struct sched_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!se) goto err; - memset(cfs_rq, 0, sizeof(struct cfs_rq)); - memset(se, 0, sizeof(struct sched_entity)); + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + } - tg->cfs_rq[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; + return 1; - tg->se[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); - se->parent = NULL; - } + err: + return 0; +} - tg->shares = NICE_0_LOAD; +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, + &cpu_rq(cpu)->leaf_cfs_rq_list); +} - lock_task_group_list(); - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } - unlock_task_group_list(); +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); +} +#else +static inline void free_fair_sched_group(struct task_group *tg) +{ +} - return tg; +static inline int alloc_fair_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_fair_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; -err: for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); } - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); - return ERR_PTR(-ENOMEM); + kfree(tg->rt_rq); + kfree(tg->rt_se); } -/* rcu callback to free various structures associated with a task group */ -static void free_sched_group(struct rcu_head *rhp) +static int alloc_rt_sched_group(struct task_group *tg) { - struct task_group *tg = container_of(rhp, struct task_group, rcu); - struct cfs_rq *cfs_rq; - struct sched_entity *se; + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + struct rq *rq; int i; - /* now it should be safe to free those cfs_rqs */ + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->rt_runtime = 0; + for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - kfree(cfs_rq); + rq = cpu_rq(i); + + rt_rq = kmalloc_node(sizeof(struct rt_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_rq) + goto err; - se = tg->se[i]; - kfree(se); + rt_se = kmalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_se) + goto err; + + init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - kfree(tg->cfs_rq); - kfree(tg->se); + return 1; + + err: + return 0; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ + list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, + &cpu_rq(cpu)->leaf_rt_rq_list); +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ + list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); +} +#else +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline int alloc_rt_sched_group(struct task_group *tg) +{ + return 1; +} + +static inline void register_rt_sched_group(struct task_group *tg, int cpu) +{ +} + +static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) +{ +} +#endif + +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); kfree(tg); } +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(void) +{ + struct task_group *tg; + unsigned long flags; + int i; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + + if (!alloc_fair_sched_group(tg)) + goto err; + + if (!alloc_rt_sched_group(tg)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) { + register_fair_sched_group(tg, i); + register_rt_sched_group(tg, i); + } + list_add_rcu(&tg->list, &task_groups); + spin_unlock_irqrestore(&task_group_lock, flags); + + return tg; + +err: + free_sched_group(tg); + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + free_sched_group(container_of(rhp, struct task_group, rcu)); +} + /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { - struct cfs_rq *cfs_rq = NULL; + unsigned long flags; int i; - lock_task_group_list(); + spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + unregister_fair_sched_group(tg, i); + unregister_rt_sched_group(tg, i); } - unlock_task_group_list(); - - BUG_ON(!cfs_rq); + list_del_rcu(&tg->list); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group); + call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7368,11 +7812,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - if (tsk->sched_class != &fair_sched_class) { - set_task_cfs_rq(tsk, task_cpu(tsk)); - goto done; - } - update_rq_clock(rq); running = task_current(rq, tsk); @@ -7384,7 +7823,7 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->put_prev_task(rq, tsk); } - set_task_cfs_rq(tsk, task_cpu(tsk)); + set_task_rq(tsk, task_cpu(tsk)); if (on_rq) { if (unlikely(running)) @@ -7392,10 +7831,10 @@ void sched_move_task(struct task_struct *tsk) enqueue_task(rq, tsk, 0); } -done: task_rq_unlock(rq, &flags); } +#ifdef CONFIG_FAIR_GROUP_SCHED /* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { @@ -7421,13 +7860,14 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) } } +static DEFINE_MUTEX(shares_mutex); + int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - struct cfs_rq *cfs_rq; - struct rq *rq; + unsigned long flags; - lock_task_group_list(); + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; @@ -7439,10 +7879,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * load_balance_fair) from referring to this group first, * by taking it off the rq->leaf_cfs_rq_list on each cpu. */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - list_del_rcu(&cfs_rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); /* wait for any ongoing reference to this group to finish */ synchronize_sched(); @@ -7462,13 +7902,12 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * Enable load balance activity on this group, by inserting it back on * each cpu's rq->leaf_cfs_rq_list. */ - for_each_possible_cpu(i) { - rq = cpu_rq(i); - cfs_rq = tg->cfs_rq[i]; - list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); - } + spin_lock_irqsave(&task_group_lock, flags); + for_each_possible_cpu(i) + register_fair_sched_group(tg, i); + spin_unlock_irqrestore(&task_group_lock, flags); done: - unlock_task_group_list(); + mutex_unlock(&shares_mutex); return 0; } @@ -7476,10 +7915,84 @@ unsigned long sched_group_shares(struct task_group *tg) { return tg->shares; } +#endif -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. + */ +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 16; + + runtime *= (1ULL << 16); + div64_64(runtime, period); + return runtime; +} -#ifdef CONFIG_FAIR_CGROUP_SCHED +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct task_group *tgi; + unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, + sysctl_sched_rt_runtime < 0 ? + RUNTIME_INF : sysctl_sched_rt_runtime); + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &task_groups, list) { + if (tgi == tg) + continue; + + total += to_ratio(period, tgi->rt_runtime); + } + rcu_read_unlock(); + + return total + to_ratio(period, runtime) < global_ratio; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + int err = 0; + + rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us == -1) + rt_runtime = rt_period; + + mutex_lock(&rt_constraints_mutex); + if (!__rt_schedulable(tg, rt_period, rt_runtime)) { + err = -EINVAL; + goto unlock; + } + if (rt_runtime_us == -1) + rt_runtime = RUNTIME_INF; + tg->rt_runtime = rt_runtime; + unlock: + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} +#endif +#endif /* CONFIG_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) @@ -7525,9 +8038,15 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk) { +#ifdef CONFIG_RT_GROUP_SCHED + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && cgroup_tg(cgrp)->rt_runtime == 0) + return -EINVAL; +#else /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) return -EINVAL; +#endif return 0; } @@ -7539,6 +8058,7 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, sched_move_task(tsk); } +#ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { @@ -7551,13 +8071,70 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char buffer[64]; + int retval = 0; + s64 val; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + + /* strip newline if necessary */ + if (nbytes && (buffer[nbytes-1] == '\n')) + buffer[nbytes-1] = 0; + val = simple_strtoll(buffer, &end, 0); + if (*end) + return -EINVAL; + + /* Pass to subsystem */ + retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + if (!retval) + retval = nbytes; + return retval; +} + +static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) +{ + char tmp[64]; + long val = sched_group_rt_runtime(cgroup_tg(cgrp)); + int len = sprintf(tmp, "%ld\n", val); + + return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); +} +#endif static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + { + .name = "rt_runtime_us", + .read = cpu_rt_runtime_read, + .write = cpu_rt_runtime_write, + }, +#endif }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -7576,7 +8153,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .early_init = 1, }; -#endif /* CONFIG_FAIR_CGROUP_SCHED */ +#endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_CGROUP_CPUACCT