4 * Kernel scheduler and related syscalls
6 * Copyright (C) 1991-2002 Linus Torvalds
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
22 #include <linux/module.h>
23 #include <linux/nmi.h>
24 #include <linux/init.h>
25 #include <asm/uaccess.h>
26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h>
30 #include <linux/capability.h>
31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h>
34 #include <linux/security.h>
35 #include <linux/notifier.h>
36 #include <linux/profile.h>
37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h>
40 #include <linux/delay.h>
41 #include <linux/smp.h>
42 #include <linux/threads.h>
43 #include <linux/timer.h>
44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h>
46 #include <linux/cpuset.h>
47 #include <linux/percpu.h>
48 #include <linux/kthread.h>
49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h>
51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h>
55 #include <linux/reciprocal_div.h>
58 #include <asm/unistd.h>
61 * Scheduler clock - returns current time in nanosec units.
62 * This is the default implementation.
63 * Architectures and sub-architectures can override this.
65 unsigned long long __attribute__((weak)) sched_clock(void)
67 return (unsigned long long)jiffies * (1000000000 / HZ);
71 * Convert user-nice values [ -20 ... 0 ... 19 ]
72 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
75 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
76 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
77 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
80 * 'User priority' is the nice value converted to something we
81 * can work with better when scaling various scheduler parameters,
82 * it's a [ 0 ... 39 ] range.
84 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
85 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
86 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
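/*
 * Illustrative sketch, not part of the original file: a throwaway helper
 * (hypothetical name) showing how the conversion macros above compose.
 * Assuming the usual MAX_RT_PRIO == 100 and MAX_PRIO == 140, nice 0 maps
 * to static priority 120 and user priority 20, and PRIO_TO_NICE() undoes
 * NICE_TO_PRIO() exactly.
 */
static inline int nice_prio_roundtrip_example(int nice)
{
	int prio = NICE_TO_PRIO(nice);	/* nice 0   -> prio 120 */
	int uprio = USER_PRIO(prio);	/* prio 120 -> user prio 20 */

	(void)uprio;
	return PRIO_TO_NICE(prio);	/* prio 120 -> nice 0 */
}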
89 * Some helpers for converting nanosecond timing to jiffy resolution
91 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
92 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
94 #define NICE_0_LOAD SCHED_LOAD_SCALE
95 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
98 * These are the 'tuning knobs' of the scheduler:
100 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
101 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
102 * Timeslices get refilled after they expire.
104 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
105 #define DEF_TIMESLICE (100 * HZ / 1000)
106 #define ON_RUNQUEUE_WEIGHT 30
107 #define CHILD_PENALTY 95
108 #define PARENT_PENALTY 100
109 #define EXIT_WEIGHT 3
110 #define PRIO_BONUS_RATIO 25
111 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
112 #define INTERACTIVE_DELTA 2
113 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
114 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
115 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
118 * If a task is 'interactive' then we reinsert it in the active
119 * array after it has expired its current timeslice. (it will not
120 * continue to run immediately, it will still round-robin with
121 * other interactive tasks.)
123 * This part scales the interactivity limit depending on niceness.
125 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
126 * Here are a few examples of different nice levels:
128 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
129 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
130 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
131 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
132 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
134 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
135 * priority range a task can explore, a value of '1' means the
136 * task is rated interactive.)
138 * I.e. nice +19 tasks can never get 'interactive' enough to be
139 * reinserted into the active array. And only heavy CPU-hog nice -20
140 * tasks will be expired. Default nice 0 tasks are somewhere between:
141 * it takes some effort for them to get interactive, but it's not
142 * too hard.
145 #define CURRENT_BONUS(p) \
146 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
147 MAX_SLEEP_AVG)
149 #define GRANULARITY (10 * HZ / 1000 ? : 1)
151 #ifdef CONFIG_SMP
152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
154 num_online_cpus())
155 #else
156 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
157 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
158 #endif
160 #define SCALE(v1,v1_max,v2_max) \
161 (v1) * (v2_max) / (v1_max)
163 #define DELTA(p) \
164 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
165 INTERACTIVE_DELTA)
167 #define TASK_INTERACTIVE(p) \
168 ((p)->prio <= (p)->static_prio - DELTA(p))
170 #define INTERACTIVE_SLEEP(p) \
171 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
172 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
174 #define TASK_PREEMPTS_CURR(p, rq) \
175 ((p)->prio < (rq)->curr->prio)
177 #define SCALE_PRIO(x, prio) \
178 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
180 static unsigned int static_prio_timeslice(int static_prio)
182 if (static_prio < NICE_TO_PRIO(0))
183 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
185 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
190 * Divide a load by a sched group cpu_power: (load / sg->__cpu_power)
191 * Since cpu_power is a 'constant', we can use a reciprocal divide.
193 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
195 return reciprocal_divide(load, sg->reciprocal_cpu_power);
199 * Each time a sched group cpu_power is changed,
200 * we must compute its reciprocal value
202 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
204 sg->__cpu_power += val;
205 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
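/*
 * Illustrative sketch, not part of the original file: the intended usage
 * pattern of the two helpers above. Any change to a group's __cpu_power
 * goes through sg_inc_cpu_power() so that reciprocal_cpu_power stays in
 * sync, after which dividing a load by the group power is a cheap
 * multiplication via sg_div_cpu_power(). Function name and arguments are
 * hypothetical.
 */
static inline u32 scaled_group_load_example(struct sched_group *sg,
					    u32 group_load)
{
	/* e.g. grow the group's power by one SCHED_LOAD_SCALE unit */
	sg_inc_cpu_power(sg, SCHED_LOAD_SCALE);

	/* group_load / sg->__cpu_power, without an actual divide */
	return sg_div_cpu_power(sg, group_load);
}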
210 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
211 * to time slice values: [800ms ... 100ms ... 5ms]
213 * The higher a thread's priority, the bigger timeslices
214 * it gets during one round of execution. But even the lowest
215 * priority thread gets MIN_TIMESLICE worth of execution time.
218 static inline unsigned int task_timeslice(struct task_struct *p)
220 return static_prio_timeslice(p->static_prio);
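/*
 * Illustrative worked example, not part of the original file: plugging
 * the boundary nice levels into static_prio_timeslice() (assuming
 * HZ == 1000, so a jiffy is 1 ms, DEF_TIMESLICE == 100 jiffies and
 * MIN_TIMESLICE == 5 jiffies) reproduces the [800ms ... 100ms ... 5ms]
 * range quoted above:
 *
 *	static_prio_timeslice(NICE_TO_PRIO(-20)) == 800	(800 ms)
 *	static_prio_timeslice(NICE_TO_PRIO(  0)) == 100	(100 ms)
 *	static_prio_timeslice(NICE_TO_PRIO( 19)) ==   5	(  5 ms)
 */
static inline unsigned int nice0_timeslice_example(void)
{
	/* 100 jiffies with HZ == 1000, i.e. the default 100 ms slice */
	return static_prio_timeslice(NICE_TO_PRIO(0));
}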
223 static inline int rt_policy(int policy)
225 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
230 static inline int task_has_rt_policy(struct task_struct *p)
232 return rt_policy(p->policy);
236 * This is the priority-queue data structure of the RT scheduling class:
238 struct rt_prio_array {
239 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
240 struct list_head queue[MAX_RT_PRIO];
244 struct load_weight load;
245 u64 load_update_start, load_update_last;
246 unsigned long delta_fair, delta_exec, delta_stat;
249 /* CFS-related fields in a runqueue */
251 struct load_weight load;
252 unsigned long nr_running;
258 unsigned long wait_runtime_overruns, wait_runtime_underruns;
260 struct rb_root tasks_timeline;
261 struct rb_node *rb_leftmost;
262 struct rb_node *rb_load_balance_curr;
263 #ifdef CONFIG_FAIR_GROUP_SCHED
264 /* 'curr' points to currently running entity on this cfs_rq.
265 * It is set to NULL otherwise (i.e when none are currently running).
267 struct sched_entity *curr;
268 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
270 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
271 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
272 * (like users, containers etc.)
274 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
275 * list is used during load balance.
277 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
281 /* Real-Time classes' related field in a runqueue: */
283 struct rt_prio_array active;
284 int rt_load_balance_idx;
285 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
289 * The prio-array type of the old scheduler:
292 unsigned int nr_active;
293 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
294 struct list_head queue[MAX_PRIO];
298 * This is the main, per-CPU runqueue data structure.
300 * Locking rule: in those places that want to lock multiple runqueues
301 * (such as the load balancing or the thread migration code), lock
302 * acquire operations must be ordered by ascending runqueue address.
305 spinlock_t lock; /* runqueue lock */
308 * nr_running and cpu_load should be in the same cacheline because
309 * remote CPUs use both these fields when doing load calculation.
311 unsigned long nr_running;
312 unsigned long raw_weighted_load;
313 #define CPU_LOAD_IDX_MAX 5
314 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
315 unsigned char idle_at_tick;
317 unsigned char in_nohz_recently;
319 struct load_stat ls; /* capture load from *all* tasks on this cpu */
320 unsigned long nr_load_updates;
324 #ifdef CONFIG_FAIR_GROUP_SCHED
325 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
330 * This is part of a global counter where only the total sum
331 * over all CPUs matters. A task can increase this counter on
332 * one CPU and if it got migrated afterwards it may decrease
333 * it on another CPU. Always updated under the runqueue lock:
335 unsigned long nr_uninterruptible;
337 unsigned long expired_timestamp;
338 unsigned long long most_recent_timestamp;
340 struct task_struct *curr, *idle;
341 unsigned long next_balance;
342 struct mm_struct *prev_mm;
344 struct prio_array *active, *expired, arrays[2];
345 int best_expired_prio;
347 u64 clock, prev_clock_raw;
350 unsigned int clock_warps, clock_overflows;
351 unsigned int clock_unstable_events;
353 struct sched_class *load_balance_class;
358 struct sched_domain *sd;
360 /* For active balancing */
363 int cpu; /* cpu of this runqueue */
365 struct task_struct *migration_thread;
366 struct list_head migration_queue;
369 #ifdef CONFIG_SCHEDSTATS
371 struct sched_info rq_sched_info;
373 /* sys_sched_yield() stats */
374 unsigned long yld_exp_empty;
375 unsigned long yld_act_empty;
376 unsigned long yld_both_empty;
377 unsigned long yld_cnt;
379 /* schedule() stats */
380 unsigned long sched_switch;
381 unsigned long sched_cnt;
382 unsigned long sched_goidle;
384 /* try_to_wake_up() stats */
385 unsigned long ttwu_cnt;
386 unsigned long ttwu_local;
388 struct lock_class_key rq_lock_key;
391 static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
392 static DEFINE_MUTEX(sched_hotcpu_mutex);
394 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
396 rq->curr->sched_class->check_preempt_curr(rq, p);
399 static inline int cpu_of(struct rq *rq)
409 * Per-runqueue clock, as fine-grained as the platform can give us:
411 static unsigned long long __rq_clock(struct rq *rq)
413 u64 prev_raw = rq->prev_clock_raw;
414 u64 now = sched_clock();
415 s64 delta = now - prev_raw;
416 u64 clock = rq->clock;
419 * Protect against sched_clock() occasionally going backwards:
421 if (unlikely(delta < 0)) {
426 * Catch too large forward jumps too:
428 if (unlikely(delta > 2*TICK_NSEC)) {
430 rq->clock_overflows++;
432 if (unlikely(delta > rq->clock_max_delta))
433 rq->clock_max_delta = delta;
438 rq->prev_clock_raw = now;
444 static inline unsigned long long rq_clock(struct rq *rq)
446 int this_cpu = smp_processor_id();
448 if (this_cpu == cpu_of(rq))
449 return __rq_clock(rq);
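/*
 * Illustrative sketch, not part of the original file: the clock
 * sanitizing idea behind __rq_clock() in isolation, with a hypothetical
 * helper name. A delta taken from a possibly unstable raw clock is only
 * trusted when it lies within [0, 2*TICK_NSEC]; negative deltas (the raw
 * clock went backwards) and overly large forward jumps are replaced by a
 * minimal forward step, so the per-runqueue clock stays monotonic. This
 * is a simplified sketch: the real __rq_clock() additionally maintains
 * the clock_warps/clock_overflows/clock_max_delta statistics seen above.
 */
static inline u64 sanitized_clock_advance(u64 clock, s64 delta)
{
	if (unlikely(delta < 0 || delta > 2*TICK_NSEC))
		return clock + 1;		/* minimal forward step */

	return clock + delta;
}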
455 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
456 * See detach_destroy_domains: synchronize_sched for details.
458 * The domain tree of any CPU may only be accessed from within
459 * preempt-disabled sections.
461 #define for_each_domain(cpu, __sd) \
462 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
464 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
465 #define this_rq() (&__get_cpu_var(runqueues))
466 #define task_rq(p) cpu_rq(task_cpu(p))
467 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
469 #ifdef CONFIG_FAIR_GROUP_SCHED
470 /* Change a task's ->cfs_rq if it moves across CPUs */
471 static inline void set_task_cfs_rq(struct task_struct *p)
473 p->se.cfs_rq = &task_rq(p)->cfs;
476 static inline void set_task_cfs_rq(struct task_struct *p)
481 #ifndef prepare_arch_switch
482 # define prepare_arch_switch(next) do { } while (0)
484 #ifndef finish_arch_switch
485 # define finish_arch_switch(prev) do { } while (0)
488 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
489 static inline int task_running(struct rq *rq, struct task_struct *p)
491 return rq->curr == p;
494 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
498 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
500 #ifdef CONFIG_DEBUG_SPINLOCK
501 /* this is a valid case when another task releases the spinlock */
502 rq->lock.owner = current;
505 * If we are tracking spinlock dependencies then we have to
506 * fix up the runqueue lock - which gets 'carried over' from
507 * the previous owner:
509 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
511 spin_unlock_irq(&rq->lock);
514 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
515 static inline int task_running(struct rq *rq, struct task_struct *p)
520 return rq->curr == p;
524 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
528 * We can optimise this out completely for !SMP, because the
529 * SMP rebalancing from interrupt is the only thing that cares
530 * here.
534 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
535 spin_unlock_irq(&rq->lock);
537 spin_unlock(&rq->lock);
541 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
545 * After ->oncpu is cleared, the task can be moved to a different CPU.
546 * We must ensure this doesn't happen until the switch is completely
547 * finished.
552 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
556 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
559 * __task_rq_lock - lock the runqueue a given task resides on.
560 * Must be called with interrupts disabled.
562 static inline struct rq *__task_rq_lock(struct task_struct *p)
569 spin_lock(&rq->lock);
570 if (unlikely(rq != task_rq(p))) {
571 spin_unlock(&rq->lock);
572 goto repeat_lock_task;
578 * task_rq_lock - lock the runqueue a given task resides on and disable
579 * interrupts. Note the ordering: we can safely lookup the task_rq without
580 * explicitly disabling preemption.
582 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
588 local_irq_save(*flags);
590 spin_lock(&rq->lock);
591 if (unlikely(rq != task_rq(p))) {
592 spin_unlock_irqrestore(&rq->lock, *flags);
593 goto repeat_lock_task;
598 static inline void __task_rq_unlock(struct rq *rq)
601 spin_unlock(&rq->lock);
604 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
607 spin_unlock_irqrestore(&rq->lock, *flags);
611 * this_rq_lock - lock this runqueue and disable interrupts.
613 static inline struct rq *this_rq_lock(void)
620 spin_lock(&rq->lock);
626 * resched_task - mark a task 'to be rescheduled now'.
628 * On UP this means the setting of the need_resched flag, on SMP it
629 * might also involve a cross-CPU call to trigger the scheduler on
630 * the target CPU.
634 #ifndef tsk_is_polling
635 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
638 static void resched_task(struct task_struct *p)
642 assert_spin_locked(&task_rq(p)->lock);
644 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
647 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
650 if (cpu == smp_processor_id())
653 /* NEED_RESCHED must be visible before we test polling */
655 if (!tsk_is_polling(p))
656 smp_send_reschedule(cpu);
659 static void resched_cpu(int cpu)
661 struct rq *rq = cpu_rq(cpu);
664 if (!spin_trylock_irqsave(&rq->lock, flags))
666 resched_task(cpu_curr(cpu));
667 spin_unlock_irqrestore(&rq->lock, flags);
670 static inline void resched_task(struct task_struct *p)
672 assert_spin_locked(&task_rq(p)->lock);
673 set_tsk_need_resched(p);
677 static u64 div64_likely32(u64 divident, unsigned long divisor)
679 #if BITS_PER_LONG == 32
680 if (likely(divident <= 0xffffffffULL))
681 return (u32)divident / divisor;
682 do_div(divident, divisor);
686 return divident / divisor;
690 #if BITS_PER_LONG == 32
691 # define WMULT_CONST (~0UL)
693 # define WMULT_CONST (1UL << 32)
696 #define WMULT_SHIFT 32
698 static inline unsigned long
699 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
700 struct load_weight *lw)
704 if (unlikely(!lw->inv_weight))
705 lw->inv_weight = WMULT_CONST / lw->weight;
707 tmp = (u64)delta_exec * weight;
709 * Check whether we'd overflow the 64-bit multiplication:
711 if (unlikely(tmp > WMULT_CONST)) {
712 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
713 >> (WMULT_SHIFT/2);
714 } else {
715 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
718 return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
721 static inline unsigned long
722 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
724 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
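/*
 * Illustrative worked example, not part of the original file: how
 * calc_delta_mine() weights execution time. With a (hypothetical)
 * runqueue load of 2*NICE_0_LOAD, a nice-0 task that ran for delta_exec
 * nanoseconds is credited roughly delta_exec/2 of fair clock, i.e.
 * delta * NICE_0_LOAD / load:
 */
static inline unsigned long half_speed_fair_delta_example(unsigned long delta_exec)
{
	struct load_weight lw = {
		.weight		= 2 * NICE_0_LOAD,	/* assumed total load */
		.inv_weight	= 0,			/* recomputed on first use */
	};

	/* ~delta_exec/2, modulo rounding and the runtime limit clamp */
	return calc_delta_fair(delta_exec, &lw);
}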
727 static void update_load_add(struct load_weight *lw, unsigned long inc)
733 static void update_load_sub(struct load_weight *lw, unsigned long dec)
739 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
741 if (rq->curr != rq->idle && ls->load.weight) {
742 ls->delta_exec += ls->delta_stat;
743 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
749 * Update delta_exec, delta_fair fields for rq.
751 * delta_fair clock advances at a rate inversely proportional to
752 * total load (rq->ls.load.weight) on the runqueue, while
753 * delta_exec advances at the same rate as wall-clock (provided
754 * cpu is not idle).
756 * delta_exec / delta_fair is a measure of the (smoothed) load on this
757 * runqueue over any given interval. This (smoothed) load is used
758 * during load balance.
760 * This function is called /before/ updating rq->ls.load
761 * and when switching tasks.
763 static void update_curr_load(struct rq *rq, u64 now)
765 struct load_stat *ls = &rq->ls;
768 start = ls->load_update_start;
769 ls->load_update_start = now;
770 ls->delta_stat += now - start;
772 * Stagger updates to ls->delta_fair. Very frequent updates
773 * can be expensive.
775 if (ls->delta_stat >= sysctl_sched_stat_granularity)
776 __update_curr_load(rq, ls);
780 * To aid in avoiding the subversion of "niceness" due to uneven distribution
781 * of tasks with abnormal "nice" values across CPUs the contribution that
782 * each task makes to its run queue's load is weighted according to its
783 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
784 * scaled version of the new time slice allocation that they receive on time
785 * slice expiry etc.
789 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
790 * If static_prio_timeslice() is ever changed to break this assumption then
791 * this code will need modification
793 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
794 #define load_weight(lp) \
795 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
796 #define PRIO_TO_LOAD_WEIGHT(prio) \
797 load_weight(static_prio_timeslice(prio))
798 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
799 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
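/*
 * Illustrative check, not part of the original file: under the stated
 * assumption that static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * == TIME_SLICE_NICE_ZERO, a nice-0 task contributes exactly
 * SCHED_LOAD_SCALE to its runqueue's weighted load:
 *
 *	PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0))
 *		== load_weight(DEF_TIMESLICE)
 *		== DEF_TIMESLICE * SCHED_LOAD_SCALE / TIME_SLICE_NICE_ZERO
 *		== SCHED_LOAD_SCALE
 */
static inline unsigned long nice0_load_weight_example(void)
{
	/* evaluates to SCHED_LOAD_SCALE (1024 with the default config) */
	return PRIO_TO_LOAD_WEIGHT(NICE_TO_PRIO(0));
}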
801 #define WEIGHT_IDLEPRIO 2
802 #define WMULT_IDLEPRIO (1 << 31)
805 * Nice levels are multiplicative, with a gentle 10% change for every
806 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
807 * nice 1, it will get ~10% less CPU time than another CPU-bound task
808 * that remained on nice 0.
810 * The "10% effect" is relative and cumulative: from _any_ nice level,
811 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
812 * it's +10% CPU usage.
814 static const int prio_to_weight[40] = {
815 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
816 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
817 /* 0 */ NICE_0_LOAD /* 1024 */,
818 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
822 static const u32 prio_to_wmult[40] = {
823 48356, 60446, 75558, 94446, 118058, 147573,
824 184467, 230589, 288233, 360285, 450347,
825 562979, 703746, 879575, 1099582, 1374389,
826 1717986, 2147483, 2684354, 3355443, 4194304,
827 5244160, 6557201, 8196502, 10250518, 12782640,
828 16025997, 19976592, 24970740, 31350126, 39045157,
829 49367440, 61356675, 76695844, 95443717, 119304647,
830 148102320, 186737708, 238609294, 286331153,
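/*
 * Illustrative worked example, not part of the original file: the
 * "10% effect" described above, read straight off prio_to_weight[].
 * Two CPU-bound tasks at nice 0 and nice 1 split the CPU in the ratio
 * 1024 : 819, i.e. about 55.6% vs 44.4%, so the task that moved to
 * nice 1 ends up with roughly 10% of the CPU less than the one that
 * stayed at nice 0. The helper name is hypothetical.
 */
static inline unsigned long nice_cpu_share_example(int nice_a, int nice_b)
{
	unsigned long wa = prio_to_weight[nice_a + 20];
	unsigned long wb = prio_to_weight[nice_b + 20];

	/* task A's share of the CPU, in tenths of a percent */
	return wa * 1000 / (wa + wb);
}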
834 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
836 update_curr_load(rq, now);
837 update_load_add(&rq->ls.load, p->se.load.weight);
841 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
843 update_curr_load(rq, now);
844 update_load_sub(&rq->ls.load, p->se.load.weight);
847 static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
850 inc_load(rq, p, now);
853 static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
856 dec_load(rq, p, now);
859 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
862 * runqueue iterator, to support SMP load-balancing between different
863 * scheduling classes, without having to expose their internal data
864 * structures to the load-balancing proper:
868 struct task_struct *(*start)(void *);
869 struct task_struct *(*next)(void *);
872 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
873 unsigned long max_nr_move, unsigned long max_load_move,
874 struct sched_domain *sd, enum cpu_idle_type idle,
875 int *all_pinned, unsigned long *load_moved,
876 int this_best_prio, int best_prio, int best_prio_seen,
877 struct rq_iterator *iterator);
879 #include "sched_stats.h"
880 #include "sched_rt.c"
881 #include "sched_fair.c"
882 #include "sched_idletask.c"
883 #ifdef CONFIG_SCHED_DEBUG
884 # include "sched_debug.c"
887 #define sched_class_highest (&rt_sched_class)
889 static void set_load_weight(struct task_struct *p)
891 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
892 p->se.wait_runtime = 0;
894 if (task_has_rt_policy(p)) {
895 p->se.load.weight = prio_to_weight[0] * 2;
896 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
901 * SCHED_IDLE tasks get minimal weight:
903 if (p->policy == SCHED_IDLE) {
904 p->se.load.weight = WEIGHT_IDLEPRIO;
905 p->se.load.inv_weight = WMULT_IDLEPRIO;
909 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
910 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
914 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
916 sched_info_queued(p);
917 p->sched_class->enqueue_task(rq, p, wakeup, now);
922 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
924 p->sched_class->dequeue_task(rq, p, sleep, now);
929 * __normal_prio - return the priority that is based on the static prio
931 static inline int __normal_prio(struct task_struct *p)
933 return p->static_prio;
937 * Calculate the expected normal priority: i.e. priority
938 * without taking RT-inheritance into account. Might be
939 * boosted by interactivity modifiers. Changes upon fork,
940 * setprio syscalls, and whenever the interactivity
941 * estimator recalculates.
943 static inline int normal_prio(struct task_struct *p)
947 if (task_has_rt_policy(p))
948 prio = MAX_RT_PRIO-1 - p->rt_priority;
950 prio = __normal_prio(p);
955 * Calculate the current priority, i.e. the priority
956 * taken into account by the scheduler. This value might
957 * be boosted by RT tasks, or might be boosted by
958 * interactivity modifiers. Will be RT if the task got
959 * RT-boosted. If not then it returns p->normal_prio.
961 static int effective_prio(struct task_struct *p)
963 p->normal_prio = normal_prio(p);
965 * If we are RT tasks or we were boosted to RT priority,
966 * keep the priority unchanged. Otherwise, update priority
967 * to the normal priority:
969 if (!rt_prio(p->prio))
970 return p->normal_prio;
975 * activate_task - move a task to the runqueue.
977 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
979 u64 now = rq_clock(rq);
981 if (p->state == TASK_UNINTERRUPTIBLE)
982 rq->nr_uninterruptible--;
984 enqueue_task(rq, p, wakeup, now);
985 inc_nr_running(p, rq, now);
989 * activate_idle_task - move idle task to the _front_ of runqueue.
991 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
993 u64 now = rq_clock(rq);
995 if (p->state == TASK_UNINTERRUPTIBLE)
996 rq->nr_uninterruptible--;
998 enqueue_task(rq, p, 0, now);
999 inc_nr_running(p, rq, now);
1003 * deactivate_task - remove a task from the runqueue.
1005 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1007 u64 now = rq_clock(rq);
1009 if (p->state == TASK_UNINTERRUPTIBLE)
1010 rq->nr_uninterruptible++;
1012 dequeue_task(rq, p, sleep, now);
1013 dec_nr_running(p, rq, now);
1017 * task_curr - is this task currently executing on a CPU?
1018 * @p: the task in question.
1020 inline int task_curr(const struct task_struct *p)
1022 return cpu_curr(task_cpu(p)) == p;
1025 /* Used instead of source_load when we know the type == 0 */
1026 unsigned long weighted_cpuload(const int cpu)
1028 return cpu_rq(cpu)->ls.load.weight;
1031 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1034 task_thread_info(p)->cpu = cpu;
1041 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1043 int old_cpu = task_cpu(p);
1044 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
1045 u64 clock_offset, fair_clock_offset;
1047 clock_offset = old_rq->clock - new_rq->clock;
1048 fair_clock_offset = old_rq->cfs.fair_clock -
1049 new_rq->cfs.fair_clock;
1050 if (p->se.wait_start)
1051 p->se.wait_start -= clock_offset;
1052 if (p->se.wait_start_fair)
1053 p->se.wait_start_fair -= fair_clock_offset;
1054 if (p->se.sleep_start)
1055 p->se.sleep_start -= clock_offset;
1056 if (p->se.block_start)
1057 p->se.block_start -= clock_offset;
1058 if (p->se.sleep_start_fair)
1059 p->se.sleep_start_fair -= fair_clock_offset;
1061 __set_task_cpu(p, new_cpu);
1064 struct migration_req {
1065 struct list_head list;
1067 struct task_struct *task;
1070 struct completion done;
1074 * The task's runqueue lock must be held.
1075 * Returns true if you have to wait for migration thread.
1078 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1080 struct rq *rq = task_rq(p);
1083 * If the task is not on a runqueue (and not running), then
1084 * it is sufficient to simply update the task's cpu field.
1086 if (!p->se.on_rq && !task_running(rq, p)) {
1087 set_task_cpu(p, dest_cpu);
1091 init_completion(&req->done);
1093 req->dest_cpu = dest_cpu;
1094 list_add(&req->list, &rq->migration_queue);
1100 * wait_task_inactive - wait for a thread to unschedule.
1102 * The caller must ensure that the task *will* unschedule sometime soon,
1103 * else this function might spin for a *long* time. This function can't
1104 * be called with interrupts off, or it may introduce deadlock with
1105 * smp_call_function() if an IPI is sent by the same process we are
1106 * waiting to become inactive.
1108 void wait_task_inactive(struct task_struct *p)
1110 unsigned long flags;
1116 * We do the initial early heuristics without holding
1117 * any task-queue locks at all. We'll only try to get
1118 * the runqueue lock when things look like they will
1119 * matter.
1124 * If the task is actively running on another CPU
1125 * still, just relax and busy-wait without holding
1126 * any locks.
1128 * NOTE! Since we don't hold any locks, it's not
1129 * even sure that "rq" stays as the right runqueue!
1130 * But we don't care, since "task_running()" will
1131 * return false if the runqueue has changed and p
1132 * is actually now running somewhere else!
1134 while (task_running(rq, p))
1138 * Ok, time to look more closely! We need the rq
1139 * lock now, to be *sure*. If we're wrong, we'll
1140 * just go back and repeat.
1142 rq = task_rq_lock(p, &flags);
1143 running = task_running(rq, p);
1144 on_rq = p->se.on_rq;
1145 task_rq_unlock(rq, &flags);
1148 * Was it really running after all now that we
1149 * checked with the proper locks actually held?
1151 * Oops. Go back and try again..
1153 if (unlikely(running)) {
1159 * It's not enough that it's not actively running,
1160 * it must be off the runqueue _entirely_, and not
1161 * preempted!
1163 * So if it was still runnable (but just not actively
1164 * running right now), it's preempted, and we should
1165 * yield - it could be a while.
1167 if (unlikely(on_rq)) {
1173 * Ahh, all good. It wasn't running, and it wasn't
1174 * runnable, which means that it will never become
1175 * running in the future either. We're all done!
1180 * kick_process - kick a running thread to enter/exit the kernel
1181 * @p: the to-be-kicked thread
1183 * Cause a process which is running on another CPU to enter
1184 * kernel-mode, without any delay. (to get signals handled.)
1186 * NOTE: this function doesn't have to take the runqueue lock,
1187 * because all it wants to ensure is that the remote task enters
1188 * the kernel. If the IPI races and the task has been migrated
1189 * to another CPU then no harm is done and the purpose has been
1190 * achieved as well.
1192 void kick_process(struct task_struct *p)
1198 if ((cpu != smp_processor_id()) && task_curr(p))
1199 smp_send_reschedule(cpu);
1204 * Return a low guess at the load of a migration-source cpu weighted
1205 * according to the scheduling class and "nice" value.
1207 * We want to under-estimate the load of migration sources, to
1208 * balance conservatively.
1210 static inline unsigned long source_load(int cpu, int type)
1212 struct rq *rq = cpu_rq(cpu);
1213 unsigned long total = weighted_cpuload(cpu);
1218 return min(rq->cpu_load[type-1], total);
1222 * Return a high guess at the load of a migration-target cpu weighted
1223 * according to the scheduling class and "nice" value.
1225 static inline unsigned long target_load(int cpu, int type)
1227 struct rq *rq = cpu_rq(cpu);
1228 unsigned long total = weighted_cpuload(cpu);
1233 return max(rq->cpu_load[type-1], total);
1237 * Return the average load per task on the cpu's run queue
1239 static inline unsigned long cpu_avg_load_per_task(int cpu)
1241 struct rq *rq = cpu_rq(cpu);
1242 unsigned long total = weighted_cpuload(cpu);
1243 unsigned long n = rq->nr_running;
1245 return n ? total / n : SCHED_LOAD_SCALE;
1249 * find_idlest_group finds and returns the least busy CPU group within the
1250 * domain.
1252 static struct sched_group *
1253 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1255 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1256 unsigned long min_load = ULONG_MAX, this_load = 0;
1257 int load_idx = sd->forkexec_idx;
1258 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1261 unsigned long load, avg_load;
1265 /* Skip over this group if it has no CPUs allowed */
1266 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1269 local_group = cpu_isset(this_cpu, group->cpumask);
1271 /* Tally up the load of all CPUs in the group */
1274 for_each_cpu_mask(i, group->cpumask) {
1275 /* Bias balancing toward cpus of our domain */
1277 load = source_load(i, load_idx);
1279 load = target_load(i, load_idx);
1284 /* Adjust by relative CPU power of the group */
1285 avg_load = sg_div_cpu_power(group,
1286 avg_load * SCHED_LOAD_SCALE);
1289 this_load = avg_load;
1291 } else if (avg_load < min_load) {
1292 min_load = avg_load;
1296 group = group->next;
1297 } while (group != sd->groups);
1299 if (!idlest || 100*this_load < imbalance*min_load)
1305 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1308 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1311 unsigned long load, min_load = ULONG_MAX;
1315 /* Traverse only the allowed CPUs */
1316 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1318 for_each_cpu_mask(i, tmp) {
1319 load = weighted_cpuload(i);
1321 if (load < min_load || (load == min_load && i == this_cpu)) {
1331 * sched_balance_self: balance the current task (running on cpu) in domains
1332 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1333 * SD_BALANCE_EXEC.
1335 * Balance, ie. select the least loaded group.
1337 * Returns the target CPU number, or the same CPU if no balancing is needed.
1339 * preempt must be disabled.
1341 static int sched_balance_self(int cpu, int flag)
1343 struct task_struct *t = current;
1344 struct sched_domain *tmp, *sd = NULL;
1346 for_each_domain(cpu, tmp) {
1348 * If power savings logic is enabled for a domain, stop there.
1350 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1352 if (tmp->flags & flag)
1358 struct sched_group *group;
1359 int new_cpu, weight;
1361 if (!(sd->flags & flag)) {
1367 group = find_idlest_group(sd, t, cpu);
1373 new_cpu = find_idlest_cpu(group, t, cpu);
1374 if (new_cpu == -1 || new_cpu == cpu) {
1375 /* Now try balancing at a lower domain level of cpu */
1380 /* Now try balancing at a lower domain level of new_cpu */
1383 weight = cpus_weight(span);
1384 for_each_domain(cpu, tmp) {
1385 if (weight <= cpus_weight(tmp->span))
1387 if (tmp->flags & flag)
1390 /* while loop will break here if sd == NULL */
1396 #endif /* CONFIG_SMP */
1399 * wake_idle() will wake a task on an idle cpu if task->cpu is
1400 * not idle and an idle cpu is available. The span of cpus to
1401 * search starts with cpus closest then further out as needed,
1402 * so we always favor a closer, idle cpu.
1404 * Returns the CPU we should wake onto.
1406 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1407 static int wake_idle(int cpu, struct task_struct *p)
1410 struct sched_domain *sd;
1414 * If it is idle, then it is the best cpu to run this task.
1416 * This cpu is also the best, if it has more than one task already.
1417 * Siblings must also be busy (in most cases) as they didn't already
1418 * pickup the extra load from this cpu and hence we need not check
1419 * sibling runqueue info. This will avoid the checks and cache miss
1420 * penalties associated with that.
1422 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1425 for_each_domain(cpu, sd) {
1426 if (sd->flags & SD_WAKE_IDLE) {
1427 cpus_and(tmp, sd->span, p->cpus_allowed);
1428 for_each_cpu_mask(i, tmp) {
1439 static inline int wake_idle(int cpu, struct task_struct *p)
1446 * try_to_wake_up - wake up a thread
1447 * @p: the to-be-woken-up thread
1448 * @state: the mask of task states that can be woken
1449 * @sync: do a synchronous wakeup?
1451 * Put it on the run-queue if it's not already there. The "current"
1452 * thread is always on the run-queue (except when the actual
1453 * re-schedule is in progress), and as such you're allowed to do
1454 * the simpler "current->state = TASK_RUNNING" to mark yourself
1455 * runnable without the overhead of this.
1457 * returns failure only if the task is already active.
1459 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1461 int cpu, this_cpu, success = 0;
1462 unsigned long flags;
1466 struct sched_domain *sd, *this_sd = NULL;
1467 unsigned long load, this_load;
1471 rq = task_rq_lock(p, &flags);
1472 old_state = p->state;
1473 if (!(old_state & state))
1480 this_cpu = smp_processor_id();
1483 if (unlikely(task_running(rq, p)))
1488 schedstat_inc(rq, ttwu_cnt);
1489 if (cpu == this_cpu) {
1490 schedstat_inc(rq, ttwu_local);
1494 for_each_domain(this_cpu, sd) {
1495 if (cpu_isset(cpu, sd->span)) {
1496 schedstat_inc(sd, ttwu_wake_remote);
1502 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1506 * Check for affine wakeup and passive balancing possibilities.
1509 int idx = this_sd->wake_idx;
1510 unsigned int imbalance;
1512 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1514 load = source_load(cpu, idx);
1515 this_load = target_load(this_cpu, idx);
1517 new_cpu = this_cpu; /* Wake to this CPU if we can */
1519 if (this_sd->flags & SD_WAKE_AFFINE) {
1520 unsigned long tl = this_load;
1521 unsigned long tl_per_task;
1523 tl_per_task = cpu_avg_load_per_task(this_cpu);
1526 * If sync wakeup then subtract the (maximum possible)
1527 * effect of the currently running task from the load
1528 * of the current CPU:
1531 tl -= current->se.load.weight;
1534 tl + target_load(cpu, idx) <= tl_per_task) ||
1535 100*(tl + p->se.load.weight) <= imbalance*load) {
1537 * This domain has SD_WAKE_AFFINE and
1538 * p is cache cold in this domain, and
1539 * there is no bad imbalance.
1541 schedstat_inc(this_sd, ttwu_move_affine);
1547 * Start passive balancing when half the imbalance_pct
1548 * limit is reached.
1550 if (this_sd->flags & SD_WAKE_BALANCE) {
1551 if (imbalance*this_load <= 100*load) {
1552 schedstat_inc(this_sd, ttwu_move_balance);
1558 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1560 new_cpu = wake_idle(new_cpu, p);
1561 if (new_cpu != cpu) {
1562 set_task_cpu(p, new_cpu);
1563 task_rq_unlock(rq, &flags);
1564 /* might preempt at this point */
1565 rq = task_rq_lock(p, &flags);
1566 old_state = p->state;
1567 if (!(old_state & state))
1572 this_cpu = smp_processor_id();
1577 #endif /* CONFIG_SMP */
1578 activate_task(rq, p, 1);
1580 * Sync wakeups (i.e. those types of wakeups where the waker
1581 * has indicated that it will leave the CPU in short order)
1582 * don't trigger a preemption, if the woken up task will run on
1583 * this cpu. (in this case the 'I will reschedule' promise of
1584 * the waker guarantees that the freshly woken up task is going
1585 * to be considered on this CPU.)
1587 if (!sync || cpu != this_cpu)
1588 check_preempt_curr(rq, p);
1592 p->state = TASK_RUNNING;
1594 task_rq_unlock(rq, &flags);
1599 int fastcall wake_up_process(struct task_struct *p)
1601 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1602 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1604 EXPORT_SYMBOL(wake_up_process);
1606 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1608 return try_to_wake_up(p, state, 0);
1612 * Perform scheduler related setup for a newly forked process p.
1613 * p is forked by current.
1615 * __sched_fork() is basic setup used by init_idle() too:
1617 static void __sched_fork(struct task_struct *p)
1619 p->se.wait_start_fair = 0;
1620 p->se.wait_start = 0;
1621 p->se.exec_start = 0;
1622 p->se.sum_exec_runtime = 0;
1623 p->se.delta_exec = 0;
1624 p->se.delta_fair_run = 0;
1625 p->se.delta_fair_sleep = 0;
1626 p->se.wait_runtime = 0;
1627 p->se.sum_wait_runtime = 0;
1628 p->se.sum_sleep_runtime = 0;
1629 p->se.sleep_start = 0;
1630 p->se.sleep_start_fair = 0;
1631 p->se.block_start = 0;
1632 p->se.sleep_max = 0;
1633 p->se.block_max = 0;
1636 p->se.wait_runtime_overruns = 0;
1637 p->se.wait_runtime_underruns = 0;
1639 INIT_LIST_HEAD(&p->run_list);
1643 * We mark the process as running here, but have not actually
1644 * inserted it onto the runqueue yet. This guarantees that
1645 * nobody will actually run it, and a signal or other external
1646 * event cannot wake it up and insert it on the runqueue either.
1648 p->state = TASK_RUNNING;
1652 * fork()/clone()-time setup:
1654 void sched_fork(struct task_struct *p, int clone_flags)
1656 int cpu = get_cpu();
1661 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1663 __set_task_cpu(p, cpu);
1666 * Make sure we do not leak PI boosting priority to the child:
1668 p->prio = current->normal_prio;
1670 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1671 if (likely(sched_info_on()))
1672 memset(&p->sched_info, 0, sizeof(p->sched_info));
1674 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1677 #ifdef CONFIG_PREEMPT
1678 /* Want to start with kernel preemption disabled. */
1679 task_thread_info(p)->preempt_count = 1;
1685 * After fork, the child runs first by default. If this is set to 0 then
1686 * the parent will (try to) run first.
1688 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1691 * wake_up_new_task - wake up a newly created task for the first time.
1693 * This function will do some initial scheduler statistics housekeeping
1694 * that must be done for every newly created context, then puts the task
1695 * on the runqueue and wakes it.
1697 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1699 unsigned long flags;
1703 rq = task_rq_lock(p, &flags);
1704 BUG_ON(p->state != TASK_RUNNING);
1705 this_cpu = smp_processor_id(); /* parent's CPU */
1707 p->prio = effective_prio(p);
1709 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1710 task_cpu(p) != this_cpu || !current->se.on_rq) {
1711 activate_task(rq, p, 0);
1714 * Let the scheduling class do new task startup
1715 * management (if any):
1717 p->sched_class->task_new(rq, p);
1719 check_preempt_curr(rq, p);
1720 task_rq_unlock(rq, &flags);
1724 * prepare_task_switch - prepare to switch tasks
1725 * @rq: the runqueue preparing to switch
1726 * @next: the task we are going to switch to.
1728 * This is called with the rq lock held and interrupts off. It must
1729 * be paired with a subsequent finish_task_switch after the context
1730 * switch.
1732 * prepare_task_switch sets up locking and calls architecture specific
1733 * hooks.
1735 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1737 prepare_lock_switch(rq, next);
1738 prepare_arch_switch(next);
1742 * finish_task_switch - clean up after a task-switch
1743 * @rq: runqueue associated with task-switch
1744 * @prev: the thread we just switched away from.
1746 * finish_task_switch must be called after the context switch, paired
1747 * with a prepare_task_switch call before the context switch.
1748 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1749 * and do any other architecture-specific cleanup actions.
1751 * Note that we may have delayed dropping an mm in context_switch(). If
1752 * so, we finish that here outside of the runqueue lock. (Doing it
1753 * with the lock held can cause deadlocks; see schedule() for
1754 * details.)
1756 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1757 __releases(rq->lock)
1759 struct mm_struct *mm = rq->prev_mm;
1765 * A task struct has one reference for the use as "current".
1766 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1767 * schedule one last time. The schedule call will never return, and
1768 * the scheduled task must drop that reference.
1769 * The test for TASK_DEAD must occur while the runqueue locks are
1770 * still held, otherwise prev could be scheduled on another cpu, die
1771 * there before we look at prev->state, and then the reference would
1772 * be dropped twice.
1773 * Manfred Spraul <manfred@colorfullife.com>
1775 prev_state = prev->state;
1776 finish_arch_switch(prev);
1777 finish_lock_switch(rq, prev);
1780 if (unlikely(prev_state == TASK_DEAD)) {
1782 * Remove function-return probe instances associated with this
1783 * task and put them back on the free list.
1785 kprobe_flush_task(prev);
1786 put_task_struct(prev);
1791 * schedule_tail - first thing a freshly forked thread must call.
1792 * @prev: the thread we just switched away from.
1794 asmlinkage void schedule_tail(struct task_struct *prev)
1795 __releases(rq->lock)
1797 struct rq *rq = this_rq();
1799 finish_task_switch(rq, prev);
1800 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1801 /* In this case, finish_task_switch does not reenable preemption */
1804 if (current->set_child_tid)
1805 put_user(current->pid, current->set_child_tid);
1809 * context_switch - switch to the new MM and the new
1810 * thread's register state.
1813 context_switch(struct rq *rq, struct task_struct *prev,
1814 struct task_struct *next)
1816 struct mm_struct *mm, *oldmm;
1818 prepare_task_switch(rq, next);
1820 oldmm = prev->active_mm;
1822 * For paravirt, this is coupled with an exit in switch_to to
1823 * combine the page table reload and the switch backend into
1824 * one hypercall.
1826 arch_enter_lazy_cpu_mode();
1828 if (unlikely(!mm)) {
1829 next->active_mm = oldmm;
1830 atomic_inc(&oldmm->mm_count);
1831 enter_lazy_tlb(oldmm, next);
1833 switch_mm(oldmm, mm, next);
1835 if (unlikely(!prev->mm)) {
1836 prev->active_mm = NULL;
1837 rq->prev_mm = oldmm;
1840 * The runqueue lock will be released by the next
1841 * task (which is an invalid locking op but in the case
1842 * of the scheduler it's an obvious special-case), so we
1843 * do an early lockdep release here:
1845 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1846 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1849 /* Here we just switch the register state and the stack. */
1850 switch_to(prev, next, prev);
1854 * this_rq must be evaluated again because prev may have moved
1855 * CPUs since it called schedule(), thus the 'rq' on its stack
1856 * frame will be invalid.
1858 finish_task_switch(this_rq(), prev);
1862 * nr_running, nr_uninterruptible and nr_context_switches:
1864 * externally visible scheduler statistics: current number of runnable
1865 * threads, current number of uninterruptible-sleeping threads, total
1866 * number of context switches performed since bootup.
1868 unsigned long nr_running(void)
1870 unsigned long i, sum = 0;
1872 for_each_online_cpu(i)
1873 sum += cpu_rq(i)->nr_running;
1878 unsigned long nr_uninterruptible(void)
1880 unsigned long i, sum = 0;
1882 for_each_possible_cpu(i)
1883 sum += cpu_rq(i)->nr_uninterruptible;
1886 * Since we read the counters lockless, it might be slightly
1887 * inaccurate. Do not allow it to go below zero though:
1889 if (unlikely((long)sum < 0))
1895 unsigned long long nr_context_switches(void)
1898 unsigned long long sum = 0;
1900 for_each_possible_cpu(i)
1901 sum += cpu_rq(i)->nr_switches;
1906 unsigned long nr_iowait(void)
1908 unsigned long i, sum = 0;
1910 for_each_possible_cpu(i)
1911 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1916 unsigned long nr_active(void)
1918 unsigned long i, running = 0, uninterruptible = 0;
1920 for_each_online_cpu(i) {
1921 running += cpu_rq(i)->nr_running;
1922 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1925 if (unlikely((long)uninterruptible < 0))
1926 uninterruptible = 0;
1928 return running + uninterruptible;
1932 * Update rq->cpu_load[] statistics. This function is usually called every
1933 * scheduler tick (TICK_NSEC).
1935 static void update_cpu_load(struct rq *this_rq)
1937 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1938 unsigned long total_load = this_rq->ls.load.weight;
1939 unsigned long this_load = total_load;
1940 struct load_stat *ls = &this_rq->ls;
1941 u64 now = __rq_clock(this_rq);
1944 this_rq->nr_load_updates++;
1945 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1948 /* Update delta_fair/delta_exec fields first */
1949 update_curr_load(this_rq, now);
1951 fair_delta64 = ls->delta_fair + 1;
1954 exec_delta64 = ls->delta_exec + 1;
1957 sample_interval64 = now - ls->load_update_last;
1958 ls->load_update_last = now;
1960 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1961 sample_interval64 = TICK_NSEC;
1963 if (exec_delta64 > sample_interval64)
1964 exec_delta64 = sample_interval64;
1966 idle_delta64 = sample_interval64 - exec_delta64;
1968 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1969 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1971 this_load = (unsigned long)tmp64;
1975 /* Update our load: */
1976 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1977 unsigned long old_load, new_load;
1979 /* scale is effectively 1 << i now, and >> i divides by scale */
1981 old_load = this_rq->cpu_load[i];
1982 new_load = this_load;
1984 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
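/*
 * Illustrative sketch, not part of the original file: the update in the
 * loop above, written as a standalone helper with a hypothetical name.
 * It is an exponential moving average whose decay depends on the index:
 * with scale == 2^idx the new sample gets weight 1/2^idx and the old
 * value keeps weight (2^idx - 1)/2^idx, so cpu_load[0] tracks the
 * instantaneous load, cpu_load[1] is a 50/50 blend, and higher indexes
 * react progressively more slowly.
 */
static inline unsigned long cpu_load_decay_example(unsigned long old_load,
						   unsigned long new_load,
						   int idx)
{
	unsigned long scale = 1UL << idx;

	return (old_load * (scale - 1) + new_load) >> idx;
}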
1991 * double_rq_lock - safely lock two runqueues
1993 * Note this does not disable interrupts like task_rq_lock,
1994 * you need to do so manually before calling.
1996 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1997 __acquires(rq1->lock)
1998 __acquires(rq2->lock)
2000 BUG_ON(!irqs_disabled());
2001 if (rq1 == rq2) {
2002 spin_lock(&rq1->lock);
2003 __acquire(rq2->lock); /* Fake it out ;) */
2004 } else {
2005 if (rq1 < rq2) {
2006 spin_lock(&rq1->lock);
2007 spin_lock(&rq2->lock);
2008 } else {
2009 spin_lock(&rq2->lock);
2010 spin_lock(&rq1->lock);
2011 }
2012 }
2016 * double_rq_unlock - safely unlock two runqueues
2018 * Note this does not restore interrupts like task_rq_unlock,
2019 * you need to do so manually after calling.
2021 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2022 __releases(rq1->lock)
2023 __releases(rq2->lock)
2025 spin_unlock(&rq1->lock);
2026 if (rq1 != rq2)
2027 spin_unlock(&rq2->lock);
2028 else
2029 __release(rq2->lock);
2033 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2035 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2036 __releases(this_rq->lock)
2037 __acquires(busiest->lock)
2038 __acquires(this_rq->lock)
2040 if (unlikely(!irqs_disabled())) {
2041 /* printk() doesn't work well under rq->lock */
2042 spin_unlock(&this_rq->lock);
2045 if (unlikely(!spin_trylock(&busiest->lock))) {
2046 if (busiest < this_rq) {
2047 spin_unlock(&this_rq->lock);
2048 spin_lock(&busiest->lock);
2049 spin_lock(&this_rq->lock);
2050 } else
2051 spin_lock(&busiest->lock);
2056 * If dest_cpu is allowed for this process, migrate the task to it.
2057 * This is accomplished by forcing the cpu_allowed mask to only
2058 * allow dest_cpu, which will force the task onto dest_cpu. Then
2059 * the cpu_allowed mask is restored.
2061 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2063 struct migration_req req;
2064 unsigned long flags;
2067 rq = task_rq_lock(p, &flags);
2068 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2069 || unlikely(cpu_is_offline(dest_cpu)))
2072 /* force the process onto the specified CPU */
2073 if (migrate_task(p, dest_cpu, &req)) {
2074 /* Need to wait for migration thread (might exit: take ref). */
2075 struct task_struct *mt = rq->migration_thread;
2077 get_task_struct(mt);
2078 task_rq_unlock(rq, &flags);
2079 wake_up_process(mt);
2080 put_task_struct(mt);
2081 wait_for_completion(&req.done);
2086 task_rq_unlock(rq, &flags);
2090 * sched_exec - execve() is a valuable balancing opportunity, because at
2091 * this point the task has the smallest effective memory and cache footprint.
2093 void sched_exec(void)
2095 int new_cpu, this_cpu = get_cpu();
2096 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2098 if (new_cpu != this_cpu)
2099 sched_migrate_task(current, new_cpu);
2103 * pull_task - move a task from a remote runqueue to the local runqueue.
2104 * Both runqueues must be locked.
2106 static void pull_task(struct rq *src_rq, struct task_struct *p,
2107 struct rq *this_rq, int this_cpu)
2109 deactivate_task(src_rq, p, 0);
2110 set_task_cpu(p, this_cpu);
2111 activate_task(this_rq, p, 0);
2113 * Note that idle threads have a prio of MAX_PRIO so that this test
2114 * is always true for them.
2116 check_preempt_curr(this_rq, p);
2120 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2123 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2124 struct sched_domain *sd, enum cpu_idle_type idle,
2128 * We do not migrate tasks that are:
2129 * 1) running (obviously), or
2130 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2131 * 3) are cache-hot on their current CPU.
2133 if (!cpu_isset(this_cpu, p->cpus_allowed))
2137 if (task_running(rq, p))
2141 * Aggressive migration if too many balance attempts have failed:
2143 if (sd->nr_balance_failed > sd->cache_nice_tries)
2149 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2150 unsigned long max_nr_move, unsigned long max_load_move,
2151 struct sched_domain *sd, enum cpu_idle_type idle,
2152 int *all_pinned, unsigned long *load_moved,
2153 int this_best_prio, int best_prio, int best_prio_seen,
2154 struct rq_iterator *iterator)
2156 int pulled = 0, pinned = 0, skip_for_load;
2157 struct task_struct *p;
2158 long rem_load_move = max_load_move;
2160 if (max_nr_move == 0 || max_load_move == 0)
2166 * Start the load-balancing iterator:
2168 p = iterator->start(iterator->arg);
2173 * To help distribute high priority tasks across CPUs we don't
2174 * skip a task if it will be the highest priority task (i.e. smallest
2175 * prio value) on its new queue regardless of its load weight
2177 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2178 SCHED_LOAD_SCALE_FUZZ;
2179 if (skip_for_load && p->prio < this_best_prio)
2180 skip_for_load = !best_prio_seen && p->prio == best_prio;
2181 if (skip_for_load ||
2182 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2184 best_prio_seen |= p->prio == best_prio;
2185 p = iterator->next(iterator->arg);
2189 pull_task(busiest, p, this_rq, this_cpu);
2191 rem_load_move -= p->se.load.weight;
2194 * We only want to steal up to the prescribed number of tasks
2195 * and the prescribed amount of weighted load.
2197 if (pulled < max_nr_move && rem_load_move > 0) {
2198 if (p->prio < this_best_prio)
2199 this_best_prio = p->prio;
2200 p = iterator->next(iterator->arg);
2205 * Right now, this is the only place pull_task() is called,
2206 * so we can safely collect pull_task() stats here rather than
2207 * inside pull_task().
2209 schedstat_add(sd, lb_gained[idle], pulled);
2212 *all_pinned = pinned;
2213 *load_moved = max_load_move - rem_load_move;
2218 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2219 * load from busiest to this_rq, as part of a balancing operation within
2220 * "domain". Returns the number of tasks moved.
2222 * Called with both runqueues locked.
2224 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2225 unsigned long max_nr_move, unsigned long max_load_move,
2226 struct sched_domain *sd, enum cpu_idle_type idle,
2229 struct sched_class *class = sched_class_highest;
2230 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2231 long rem_load_move = max_load_move;
2234 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2235 max_nr_move, (unsigned long)rem_load_move,
2236 sd, idle, all_pinned, &load_moved);
2237 total_nr_moved += nr_moved;
2238 max_nr_move -= nr_moved;
2239 rem_load_move -= load_moved;
2240 class = class->next;
2241 } while (class && max_nr_move && rem_load_move > 0);
2243 return total_nr_moved;
2247 * find_busiest_group finds and returns the busiest CPU group within the
2248 * domain. It calculates and returns the amount of weighted load which
2249 * should be moved to restore balance via the imbalance parameter.
2251 static struct sched_group *
2252 find_busiest_group(struct sched_domain *sd, int this_cpu,
2253 unsigned long *imbalance, enum cpu_idle_type idle,
2254 int *sd_idle, cpumask_t *cpus, int *balance)
2256 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2257 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2258 unsigned long max_pull;
2259 unsigned long busiest_load_per_task, busiest_nr_running;
2260 unsigned long this_load_per_task, this_nr_running;
2262 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2263 int power_savings_balance = 1;
2264 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2265 unsigned long min_nr_running = ULONG_MAX;
2266 struct sched_group *group_min = NULL, *group_leader = NULL;
2269 max_load = this_load = total_load = total_pwr = 0;
2270 busiest_load_per_task = busiest_nr_running = 0;
2271 this_load_per_task = this_nr_running = 0;
2272 if (idle == CPU_NOT_IDLE)
2273 load_idx = sd->busy_idx;
2274 else if (idle == CPU_NEWLY_IDLE)
2275 load_idx = sd->newidle_idx;
2277 load_idx = sd->idle_idx;
2280 unsigned long load, group_capacity;
2283 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2284 unsigned long sum_nr_running, sum_weighted_load;
2286 local_group = cpu_isset(this_cpu, group->cpumask);
2289 balance_cpu = first_cpu(group->cpumask);
2291 /* Tally up the load of all CPUs in the group */
2292 sum_weighted_load = sum_nr_running = avg_load = 0;
2294 for_each_cpu_mask(i, group->cpumask) {
2297 if (!cpu_isset(i, *cpus))
2302 if (*sd_idle && !idle_cpu(i))
2305 /* Bias balancing toward cpus of our domain */
2307 if (idle_cpu(i) && !first_idle_cpu) {
2312 load = target_load(i, load_idx);
2314 load = source_load(i, load_idx);
2317 sum_nr_running += rq->nr_running;
2318 sum_weighted_load += weighted_cpuload(i);
2322 * First idle cpu or the first cpu (busiest) in this sched group
2323 * is eligible for doing load balancing at this and above
2324 * domains.
2326 if (local_group && balance_cpu != this_cpu && balance) {
2331 total_load += avg_load;
2332 total_pwr += group->__cpu_power;
2334 /* Adjust by relative CPU power of the group */
2335 avg_load = sg_div_cpu_power(group,
2336 avg_load * SCHED_LOAD_SCALE);
2338 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2341 this_load = avg_load;
2343 this_nr_running = sum_nr_running;
2344 this_load_per_task = sum_weighted_load;
2345 } else if (avg_load > max_load &&
2346 sum_nr_running > group_capacity) {
2347 max_load = avg_load;
2349 busiest_nr_running = sum_nr_running;
2350 busiest_load_per_task = sum_weighted_load;
2353 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2355 * Busy processors will not participate in power savings balance.
2358 if (idle == CPU_NOT_IDLE ||
2359 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2363 * If the local group is idle or completely loaded
2364 * no need to do power savings balance at this domain
2366 if (local_group && (this_nr_running >= group_capacity ||
2368 power_savings_balance = 0;
2371 * If a group is already running at full capacity or idle,
2372 * don't include that group in power savings calculations
2374 if (!power_savings_balance || sum_nr_running >= group_capacity
2379 * Calculate the group which has the least non-idle load.
2380 * This is the group from where we need to pick up the load
2383 if ((sum_nr_running < min_nr_running) ||
2384 (sum_nr_running == min_nr_running &&
2385 first_cpu(group->cpumask) <
2386 first_cpu(group_min->cpumask))) {
2388 min_nr_running = sum_nr_running;
2389 min_load_per_task = sum_weighted_load /
2394 * Calculate the group which is almost at its
2395 * capacity but still has some room to pick up load
2396 * from other groups and save more power
2398 if (sum_nr_running <= group_capacity - 1) {
2399 if (sum_nr_running > leader_nr_running ||
2400 (sum_nr_running == leader_nr_running &&
2401 first_cpu(group->cpumask) >
2402 first_cpu(group_leader->cpumask))) {
2403 group_leader = group;
2404 leader_nr_running = sum_nr_running;
2409 group = group->next;
2410 } while (group != sd->groups);
2412 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2415 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2417 if (this_load >= avg_load ||
2418 100*max_load <= sd->imbalance_pct*this_load)
2421 busiest_load_per_task /= busiest_nr_running;
2423 * We're trying to get all the cpus to the average_load, so we don't
2424 * want to push ourselves above the average load, nor do we wish to
2425 * reduce the max loaded cpu below the average load, as either of these
2426 * actions would just result in more rebalancing later, and ping-pong
2427 * tasks around. Thus we look for the minimum possible imbalance.
2428 * Negative imbalances (*we* are more loaded than anyone else) will
2429 * be counted as no imbalance for these purposes -- we can't fix that
2430 * by pulling tasks to us. Be careful of negative numbers as they'll
2431 * appear as very large values with unsigned longs.
2433 if (max_load <= busiest_load_per_task)
2437 * In the presence of smp nice balancing, certain scenarios can have
2438 * max load less than avg load (as we skip groups at or below
2439 * their cpu_power while calculating max_load..)
2441 if (max_load < avg_load) {
2443 goto small_imbalance;
2446 /* Don't want to pull so many tasks that a group would go idle */
2447 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2449 /* How much load to actually move to equalise the imbalance */
2450 *imbalance = min(max_pull * busiest->__cpu_power,
2451 (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE;
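	/*
	 * Illustrative worked example (not part of the original source):
	 * assume two groups, each with __cpu_power == SCHED_LOAD_SCALE (1024).
	 * The local group carries this_load = 1024 and the busiest group
	 * carries max_load = 3072 from three nice-0 tasks, so
	 * busiest_load_per_task = 1024 and avg_load = 2048.  Then
	 *
	 *   max_pull   = min(3072 - 2048, 3072 - 1024) = 1024
	 *   *imbalance = min(1024 * 1024, (2048 - 1024) * 1024) / 1024 = 1024
	 *
	 * i.e. exactly one nice-0 task's worth of weighted load gets pulled,
	 * leaving both groups at the 2048 average.
	 */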
2455 * if *imbalance is less than the average load per runnable task
2456 * there is no guarantee that any tasks will be moved, so we
2457 * think about bumping its value to force at least one task to be moved.
2460 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2461 unsigned long tmp, pwr_now, pwr_move;
2465 pwr_move = pwr_now = 0;
2467 if (this_nr_running) {
2468 this_load_per_task /= this_nr_running;
2469 if (busiest_load_per_task > this_load_per_task)
2472 this_load_per_task = SCHED_LOAD_SCALE;
2474 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2475 busiest_load_per_task * imbn) {
2476 *imbalance = busiest_load_per_task;
2481 * OK, we don't have enough imbalance to justify moving tasks,
2482 * however we may be able to increase total CPU power used by moving them.
2486 pwr_now += busiest->__cpu_power *
2487 min(busiest_load_per_task, max_load);
2488 pwr_now += this->__cpu_power *
2489 min(this_load_per_task, this_load);
2490 pwr_now /= SCHED_LOAD_SCALE;
2492 /* Amount of load we'd subtract */
2493 tmp = sg_div_cpu_power(busiest,
2494 busiest_load_per_task * SCHED_LOAD_SCALE);
2496 pwr_move += busiest->__cpu_power *
2497 min(busiest_load_per_task, max_load - tmp);
2499 /* Amount of load we'd add */
2500 if (max_load * busiest->__cpu_power <
2501 busiest_load_per_task * SCHED_LOAD_SCALE)
2502 tmp = sg_div_cpu_power(this,
2503 max_load * busiest->__cpu_power);
2505 tmp = sg_div_cpu_power(this,
2506 busiest_load_per_task * SCHED_LOAD_SCALE);
2507 pwr_move += this->__cpu_power *
2508 min(this_load_per_task, this_load + tmp);
2509 pwr_move /= SCHED_LOAD_SCALE;
2511 /* Move if we gain throughput */
2512 if (pwr_move <= pwr_now)
2515 *imbalance = busiest_load_per_task;
2521 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2522 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2525 if (this == group_leader && group_leader != group_min) {
2526 *imbalance = min_load_per_task;
2536 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2539 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2540 unsigned long imbalance, cpumask_t *cpus)
2542 struct rq *busiest = NULL, *rq;
2543 unsigned long max_load = 0;
2546 for_each_cpu_mask(i, group->cpumask) {
2549 if (!cpu_isset(i, *cpus))
2553 wl = weighted_cpuload(i);
2555 if (rq->nr_running == 1 && wl > imbalance)
2558 if (wl > max_load) {
2568 * Max backoff if we encounter pinned tasks. The value is fairly
2569 * arbitrary; it just needs to be large enough.
2571 #define MAX_PINNED_INTERVAL 512
2573 static inline unsigned long minus_1_or_zero(unsigned long n)
2575 return n > 0 ? n - 1 : 0;
2579 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2580 * tasks if there is an imbalance.
2582 static int load_balance(int this_cpu, struct rq *this_rq,
2583 struct sched_domain *sd, enum cpu_idle_type idle,
2586 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2587 struct sched_group *group;
2588 unsigned long imbalance;
2590 cpumask_t cpus = CPU_MASK_ALL;
2591 unsigned long flags;
2594 * When power savings policy is enabled for the parent domain, idle
2595 * sibling can pick up load irrespective of busy siblings. In this case,
2596 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2597 * portraying it as CPU_NOT_IDLE.
2599 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2600 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2603 schedstat_inc(sd, lb_cnt[idle]);
2606 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2613 schedstat_inc(sd, lb_nobusyg[idle]);
2617 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2619 schedstat_inc(sd, lb_nobusyq[idle]);
2623 BUG_ON(busiest == this_rq);
2625 schedstat_add(sd, lb_imbalance[idle], imbalance);
2628 if (busiest->nr_running > 1) {
2630 * Attempt to move tasks. If find_busiest_group has found
2631 * an imbalance but busiest->nr_running <= 1, the group is
2632 * still unbalanced. nr_moved simply stays zero, so it is
2633 * correctly treated as an imbalance.
2635 local_irq_save(flags);
2636 double_rq_lock(this_rq, busiest);
2637 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2638 minus_1_or_zero(busiest->nr_running),
2639 imbalance, sd, idle, &all_pinned);
2640 double_rq_unlock(this_rq, busiest);
2641 local_irq_restore(flags);
2644 * some other cpu did the load balance for us.
2646 if (nr_moved && this_cpu != smp_processor_id())
2647 resched_cpu(this_cpu);
2649 /* All tasks on this runqueue were pinned by CPU affinity */
2650 if (unlikely(all_pinned)) {
2651 cpu_clear(cpu_of(busiest), cpus);
2652 if (!cpus_empty(cpus))
2659 schedstat_inc(sd, lb_failed[idle]);
2660 sd->nr_balance_failed++;
2662 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2664 spin_lock_irqsave(&busiest->lock, flags);
2666 /* don't kick the migration_thread if the curr
2667 * task on the busiest cpu can't be moved to this_cpu
2669 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2670 spin_unlock_irqrestore(&busiest->lock, flags);
2672 goto out_one_pinned;
2675 if (!busiest->active_balance) {
2676 busiest->active_balance = 1;
2677 busiest->push_cpu = this_cpu;
2680 spin_unlock_irqrestore(&busiest->lock, flags);
2682 wake_up_process(busiest->migration_thread);
2685 * We've kicked active balancing, reset the failure counter.
2688 sd->nr_balance_failed = sd->cache_nice_tries+1;
2691 sd->nr_balance_failed = 0;
2693 if (likely(!active_balance)) {
2694 /* We were unbalanced, so reset the balancing interval */
2695 sd->balance_interval = sd->min_interval;
2698 * If we've begun active balancing, start to back off. This
2699 * case may not be covered by the all_pinned logic if there
2700 * is only 1 task on the busy runqueue (because we don't call move_tasks).
2703 if (sd->balance_interval < sd->max_interval)
2704 sd->balance_interval *= 2;
2707 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2708 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2713 schedstat_inc(sd, lb_balanced[idle]);
2715 sd->nr_balance_failed = 0;
2718 /* tune up the balancing interval */
2719 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2720 (sd->balance_interval < sd->max_interval))
2721 sd->balance_interval *= 2;
2723 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2724 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2730 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2731 * tasks if there is an imbalance.
2733 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2734 * this_rq is locked.
2737 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2739 struct sched_group *group;
2740 struct rq *busiest = NULL;
2741 unsigned long imbalance;
2744 cpumask_t cpus = CPU_MASK_ALL;
2747 * When power savings policy is enabled for the parent domain, idle
2748 * sibling can pick up load irrespective of busy siblings. In this case,
2749 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2750 * portraying it as CPU_NOT_IDLE.
2752 if (sd->flags & SD_SHARE_CPUPOWER &&
2753 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2756 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2758 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2759 &sd_idle, &cpus, NULL);
2761 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2765 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2768 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2772 BUG_ON(busiest == this_rq);
2774 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2777 if (busiest->nr_running > 1) {
2778 /* Attempt to move tasks */
2779 double_lock_balance(this_rq, busiest);
2780 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2781 minus_1_or_zero(busiest->nr_running),
2782 imbalance, sd, CPU_NEWLY_IDLE, NULL);
2783 spin_unlock(&busiest->lock);
2786 cpu_clear(cpu_of(busiest), cpus);
2787 if (!cpus_empty(cpus))
2793 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2794 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2795 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2798 sd->nr_balance_failed = 0;
2803 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2804 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2805 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2807 sd->nr_balance_failed = 0;
2813 * idle_balance is called by schedule() if this_cpu is about to become
2814 * idle. Attempts to pull tasks from other CPUs.
2816 static void idle_balance(int this_cpu, struct rq *this_rq)
2818 struct sched_domain *sd;
2819 int pulled_task = -1;
2820 unsigned long next_balance = jiffies + HZ;
2822 for_each_domain(this_cpu, sd) {
2823 unsigned long interval;
2825 if (!(sd->flags & SD_LOAD_BALANCE))
2828 if (sd->flags & SD_BALANCE_NEWIDLE)
2829 /* If we've pulled tasks over stop searching: */
2830 pulled_task = load_balance_newidle(this_cpu,
2833 interval = msecs_to_jiffies(sd->balance_interval);
2834 if (time_after(next_balance, sd->last_balance + interval))
2835 next_balance = sd->last_balance + interval;
2839 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2841 * We are going idle. next_balance may be set based on
2842 * a busy processor. So reset next_balance.
2844 this_rq->next_balance = next_balance;
2849 * active_load_balance is run by migration threads. It pushes running tasks
2850 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2851 * running on each physical CPU where possible, and avoids physical /
2852 * logical imbalances.
2854 * Called with busiest_rq locked.
2856 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2858 int target_cpu = busiest_rq->push_cpu;
2859 struct sched_domain *sd;
2860 struct rq *target_rq;
2862 /* Is there any task to move? */
2863 if (busiest_rq->nr_running <= 1)
2866 target_rq = cpu_rq(target_cpu);
2869 * This condition is "impossible"; if it occurs
2870 * we need to fix it. Originally reported by
2871 * Bjorn Helgaas on a 128-cpu setup.
2873 BUG_ON(busiest_rq == target_rq);
2875 /* move a task from busiest_rq to target_rq */
2876 double_lock_balance(busiest_rq, target_rq);
2878 /* Search for an sd spanning us and the target CPU. */
2879 for_each_domain(target_cpu, sd) {
2880 if ((sd->flags & SD_LOAD_BALANCE) &&
2881 cpu_isset(busiest_cpu, sd->span))
2886 schedstat_inc(sd, alb_cnt);
2888 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2889 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
2891 schedstat_inc(sd, alb_pushed);
2893 schedstat_inc(sd, alb_failed);
2895 spin_unlock(&target_rq->lock);
2900 atomic_t load_balancer;
2902 } nohz ____cacheline_aligned = {
2903 .load_balancer = ATOMIC_INIT(-1),
2904 .cpu_mask = CPU_MASK_NONE,
2908 * This routine tries to nominate the ilb (idle load balancing)
2909 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
2910 * load balancing on behalf of all those cpus. If all the cpus in the system
2911 * go into this tickless mode, then there will be no ilb owner (as there is
2912 * no need for one) and all the cpus will sleep till the next wakeup event arrives.
2915 * For the ilb owner the tick is not stopped, and this tick is used
2916 * for idle load balancing. The ilb owner also remains part of nohz.cpu_mask.
2919 * While stopping the tick, this cpu becomes the ilb owner if there
2920 * is no other owner, and it remains the owner until that cpu becomes busy
2921 * or until all cpus in the system stop their ticks, at which point
2922 * there is no need for an ilb owner.
2924 * When the ilb owner becomes busy, it nominates another owner during the
2925 * next busy scheduler_tick().
2927 int select_nohz_load_balancer(int stop_tick)
2929 int cpu = smp_processor_id();
2932 cpu_set(cpu, nohz.cpu_mask);
2933 cpu_rq(cpu)->in_nohz_recently = 1;
2936 * If we are going offline and still the leader, give up!
2938 if (cpu_is_offline(cpu) &&
2939 atomic_read(&nohz.load_balancer) == cpu) {
2940 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2945 /* time for ilb owner also to sleep */
2946 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2947 if (atomic_read(&nohz.load_balancer) == cpu)
2948 atomic_set(&nohz.load_balancer, -1);
2952 if (atomic_read(&nohz.load_balancer) == -1) {
2953 /* make me the ilb owner */
2954 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2956 } else if (atomic_read(&nohz.load_balancer) == cpu)
2959 if (!cpu_isset(cpu, nohz.cpu_mask))
2962 cpu_clear(cpu, nohz.cpu_mask);
2964 if (atomic_read(&nohz.load_balancer) == cpu)
2965 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2972 static DEFINE_SPINLOCK(balancing);
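/*
 * Illustrative sketch (not part of this file): how a NO_HZ tick-stop path
 * might drive the nomination protocol implemented in
 * select_nohz_load_balancer() above.  The function name is made up, and it
 * assumes select_nohz_load_balancer(1) returns non-zero when this CPU ends
 * up nominated as the ilb owner.
 */
#if 0
static void example_tick_stop_path(int entering_idle)
{
	if (entering_idle) {
		/* Joining the tickless set: volunteer to be the ilb owner. */
		if (select_nohz_load_balancer(1)) {
			/* We are the ilb owner: keep our tick running. */
			return;
		}
		/* Not the owner: the tick can really be stopped here. */
	} else {
		/* Becoming busy again: leave the tickless set. */
		select_nohz_load_balancer(0);
	}
}
#endif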
2975 * It checks each scheduling domain to see if it is due to be balanced,
2976 * and initiates a balancing operation if so.
2978 * Balancing parameters are set up in arch_init_sched_domains.
2980 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
2983 struct rq *rq = cpu_rq(cpu);
2984 unsigned long interval;
2985 struct sched_domain *sd;
2986 /* Earliest time when we have to do rebalance again */
2987 unsigned long next_balance = jiffies + 60*HZ;
2989 for_each_domain(cpu, sd) {
2990 if (!(sd->flags & SD_LOAD_BALANCE))
2993 interval = sd->balance_interval;
2994 if (idle != CPU_IDLE)
2995 interval *= sd->busy_factor;
2997 /* scale ms to jiffies */
2998 interval = msecs_to_jiffies(interval);
2999 if (unlikely(!interval))
3001 if (interval > HZ*NR_CPUS/10)
3002 interval = HZ*NR_CPUS/10;
3005 if (sd->flags & SD_SERIALIZE) {
3006 if (!spin_trylock(&balancing))
3010 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3011 if (load_balance(cpu, rq, sd, idle, &balance)) {
3013 * We've pulled tasks over so either we're no
3014 * longer idle, or one of our SMT siblings is busy.
3017 idle = CPU_NOT_IDLE;
3019 sd->last_balance = jiffies;
3021 if (sd->flags & SD_SERIALIZE)
3022 spin_unlock(&balancing);
3024 if (time_after(next_balance, sd->last_balance + interval))
3025 next_balance = sd->last_balance + interval;
3028 * Stop the load balance at this level. There is another
3029 * CPU in our sched group which is doing load balancing more actively.
3035 rq->next_balance = next_balance;
3039 * run_rebalance_domains is triggered when needed from the scheduler tick.
3040 * In the CONFIG_NO_HZ case, the idle load balance owner will do the
3041 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3043 static void run_rebalance_domains(struct softirq_action *h)
3045 int this_cpu = smp_processor_id();
3046 struct rq *this_rq = cpu_rq(this_cpu);
3047 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3048 CPU_IDLE : CPU_NOT_IDLE;
3050 rebalance_domains(this_cpu, idle);
3054 * If this cpu is the owner for idle load balancing, then do the
3055 * balancing on behalf of the other idle cpus whose ticks are stopped.
3058 if (this_rq->idle_at_tick &&
3059 atomic_read(&nohz.load_balancer) == this_cpu) {
3060 cpumask_t cpus = nohz.cpu_mask;
3064 cpu_clear(this_cpu, cpus);
3065 for_each_cpu_mask(balance_cpu, cpus) {
3067 * If this cpu gets work to do, stop the load balancing
3068 * work being done for other cpus. Next load
3069 * balancing owner will pick it up.
3074 rebalance_domains(balance_cpu, CPU_IDLE);
3076 rq = cpu_rq(balance_cpu);
3077 if (time_after(this_rq->next_balance, rq->next_balance))
3078 this_rq->next_balance = rq->next_balance;
3085 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3087 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3088 * idle load balancing owner or decide to stop the periodic load balancing,
3089 * if the whole system is idle.
3091 static inline void trigger_load_balance(struct rq *rq, int cpu)
3095 * If we were in the nohz mode recently and busy at the current
3096 * scheduler tick, then check if we need to nominate a new idle load balancer.
3099 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3100 rq->in_nohz_recently = 0;
3102 if (atomic_read(&nohz.load_balancer) == cpu) {
3103 cpu_clear(cpu, nohz.cpu_mask);
3104 atomic_set(&nohz.load_balancer, -1);
3107 if (atomic_read(&nohz.load_balancer) == -1) {
3109 * simple selection for now: Nominate the
3110 * first cpu in the nohz list to be the next ilb owner.
3113 * TBD: Traverse the sched domains and nominate
3114 * the nearest cpu in the nohz.cpu_mask.
3116 int ilb = first_cpu(nohz.cpu_mask);
3124 * If this cpu is idle and doing idle load balancing for all the
3125 * cpus with ticks stopped, is it time for that to stop?
3127 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3128 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3134 * If this cpu is idle and the idle load balancing is done by
3135 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3137 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3138 cpu_isset(cpu, nohz.cpu_mask))
3141 if (time_after_eq(jiffies, rq->next_balance))
3142 raise_softirq(SCHED_SOFTIRQ);
3145 #else /* CONFIG_SMP */
3148 * on UP we do not need to balance between CPUs:
3150 static inline void idle_balance(int cpu, struct rq *rq)
3154 /* Avoid "used but not defined" warning on UP */
3155 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3156 unsigned long max_nr_move, unsigned long max_load_move,
3157 struct sched_domain *sd, enum cpu_idle_type idle,
3158 int *all_pinned, unsigned long *load_moved,
3159 int this_best_prio, int best_prio, int best_prio_seen,
3160 struct rq_iterator *iterator)
3169 DEFINE_PER_CPU(struct kernel_stat, kstat);
3171 EXPORT_PER_CPU_SYMBOL(kstat);
3174 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3175 * that have not yet been banked in case the task is currently running.
3177 unsigned long long task_sched_runtime(struct task_struct *p)
3179 unsigned long flags;
3183 rq = task_rq_lock(p, &flags);
3184 ns = p->se.sum_exec_runtime;
3185 if (rq->curr == p) {
3186 delta_exec = rq_clock(rq) - p->se.exec_start;
3187 if ((s64)delta_exec > 0)
3190 task_rq_unlock(rq, &flags);
3196 * Account user cpu time to a process.
3197 * @p: the process that the cpu time gets accounted to
3199 * @cputime: the cpu time spent in user space since the last update
3201 void account_user_time(struct task_struct *p, cputime_t cputime)
3203 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3206 p->utime = cputime_add(p->utime, cputime);
3208 /* Add user time to cpustat. */
3209 tmp = cputime_to_cputime64(cputime);
3210 if (TASK_NICE(p) > 0)
3211 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3213 cpustat->user = cputime64_add(cpustat->user, tmp);
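/*
 * Illustrative only (not part of this file): a tick-driven caller, such as
 * the architecture timer code, would typically account one whole tick of
 * user time roughly like this.  The helper name is made up for the example.
 */
#if 0
static void example_account_user_tick(struct task_struct *p)
{
	account_user_time(p, jiffies_to_cputime(1));
}
#endif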
3217 * Account system cpu time to a process.
3218 * @p: the process that the cpu time gets accounted to
3219 * @hardirq_offset: the offset to subtract from hardirq_count()
3220 * @cputime: the cpu time spent in kernel space since the last update
3222 void account_system_time(struct task_struct *p, int hardirq_offset,
3225 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3226 struct rq *rq = this_rq();
3229 p->stime = cputime_add(p->stime, cputime);
3231 /* Add system time to cpustat. */
3232 tmp = cputime_to_cputime64(cputime);
3233 if (hardirq_count() - hardirq_offset)
3234 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3235 else if (softirq_count())
3236 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3237 else if (p != rq->idle)
3238 cpustat->system = cputime64_add(cpustat->system, tmp);
3239 else if (atomic_read(&rq->nr_iowait) > 0)
3240 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3242 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3243 /* Account for system time used */
3244 acct_update_integrals(p);
3248 * Account for involuntary wait time.
3249 * @p: the process from which the cpu time has been stolen
3250 * @steal: the cpu time spent in involuntary wait
3252 void account_steal_time(struct task_struct *p, cputime_t steal)
3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3255 cputime64_t tmp = cputime_to_cputime64(steal);
3256 struct rq *rq = this_rq();
3258 if (p == rq->idle) {
3259 p->stime = cputime_add(p->stime, steal);
3260 if (atomic_read(&rq->nr_iowait) > 0)
3261 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3263 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3265 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3269 * This function gets called by the timer code, with HZ frequency.
3270 * We call it with interrupts disabled.
3272 * It also gets called by the fork code, when changing the parent's timeslices.
3275 void scheduler_tick(void)
3277 int cpu = smp_processor_id();
3278 struct rq *rq = cpu_rq(cpu);
3279 struct task_struct *curr = rq->curr;
3281 spin_lock(&rq->lock);
3282 if (curr != rq->idle) /* FIXME: needed? */
3283 curr->sched_class->task_tick(rq, curr);
3284 update_cpu_load(rq);
3285 spin_unlock(&rq->lock);
3288 rq->idle_at_tick = idle_cpu(cpu);
3289 trigger_load_balance(rq, cpu);
3293 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3295 void fastcall add_preempt_count(int val)
3300 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3302 preempt_count() += val;
3304 * Spinlock count overflowing soon?
3306 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3309 EXPORT_SYMBOL(add_preempt_count);
3311 void fastcall sub_preempt_count(int val)
3316 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3319 * Is the spinlock portion underflowing?
3321 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3322 !(preempt_count() & PREEMPT_MASK)))
3325 preempt_count() -= val;
3327 EXPORT_SYMBOL(sub_preempt_count);
3332 * Print scheduling while atomic bug:
3334 static noinline void __schedule_bug(struct task_struct *prev)
3336 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3337 prev->comm, preempt_count(), prev->pid);
3338 debug_show_held_locks(prev);
3339 if (irqs_disabled())
3340 print_irqtrace_events(prev);
3345 * Various schedule()-time debugging checks and statistics:
3347 static inline void schedule_debug(struct task_struct *prev)
3350 * Test if we are atomic. Since do_exit() needs to call into
3351 * schedule() atomically, we ignore that path for now.
3352 * Otherwise, whine if we are scheduling when we should not be.
3354 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3355 __schedule_bug(prev);
3357 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3359 schedstat_inc(this_rq(), sched_cnt);
3363 * Pick up the highest-prio task:
3365 static inline struct task_struct *
3366 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3368 struct sched_class *class;
3369 struct task_struct *p;
3372 * Optimization: we know that if all tasks are in
3373 * the fair class we can call that function directly:
3375 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3376 p = fair_sched_class.pick_next_task(rq, now);
3381 class = sched_class_highest;
3383 p = class->pick_next_task(rq, now);
3387 * Will never be NULL as the idle class always
3388 * returns a non-NULL p:
3390 class = class->next;
3395 * schedule() is the main scheduler function.
3397 asmlinkage void __sched schedule(void)
3399 struct task_struct *prev, *next;
3407 cpu = smp_processor_id();
3411 switch_count = &prev->nivcsw;
3413 release_kernel_lock(prev);
3414 need_resched_nonpreemptible:
3416 schedule_debug(prev);
3418 spin_lock_irq(&rq->lock);
3419 clear_tsk_need_resched(prev);
3421 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3422 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3423 unlikely(signal_pending(prev)))) {
3424 prev->state = TASK_RUNNING;
3426 deactivate_task(rq, prev, 1);
3428 switch_count = &prev->nvcsw;
3431 if (unlikely(!rq->nr_running))
3432 idle_balance(cpu, rq);
3434 now = __rq_clock(rq);
3435 prev->sched_class->put_prev_task(rq, prev, now);
3436 next = pick_next_task(rq, prev, now);
3438 sched_info_switch(prev, next);
3440 if (likely(prev != next)) {
3445 context_switch(rq, prev, next); /* unlocks the rq */
3447 spin_unlock_irq(&rq->lock);
3449 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3450 cpu = smp_processor_id();
3452 goto need_resched_nonpreemptible;
3454 preempt_enable_no_resched();
3455 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3458 EXPORT_SYMBOL(schedule);
3460 #ifdef CONFIG_PREEMPT
3462 * this is the entry point to schedule() from in-kernel preemption
3463 * off of preempt_enable. Kernel preemptions off of return-from-interrupt
3464 * go through preempt_schedule_irq() below, which calls schedule() directly.
3466 asmlinkage void __sched preempt_schedule(void)
3468 struct thread_info *ti = current_thread_info();
3469 #ifdef CONFIG_PREEMPT_BKL
3470 struct task_struct *task = current;
3471 int saved_lock_depth;
3474 * If there is a non-zero preempt_count or interrupts are disabled,
3475 * we do not want to preempt the current task. Just return.
3477 if (likely(ti->preempt_count || irqs_disabled()))
3481 add_preempt_count(PREEMPT_ACTIVE);
3483 * We keep the big kernel semaphore locked, but we
3484 * clear ->lock_depth so that schedule() doesn't
3485 * auto-release the semaphore:
3487 #ifdef CONFIG_PREEMPT_BKL
3488 saved_lock_depth = task->lock_depth;
3489 task->lock_depth = -1;
3492 #ifdef CONFIG_PREEMPT_BKL
3493 task->lock_depth = saved_lock_depth;
3495 sub_preempt_count(PREEMPT_ACTIVE);
3497 /* we could miss a preemption opportunity between schedule and now */
3499 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3502 EXPORT_SYMBOL(preempt_schedule);
3505 * this is the entry point to schedule() from kernel preemption
3506 * off of irq context.
3507 * Note that this is called and returns with irqs disabled. This will
3508 * protect us against recursive calling from irq.
3510 asmlinkage void __sched preempt_schedule_irq(void)
3512 struct thread_info *ti = current_thread_info();
3513 #ifdef CONFIG_PREEMPT_BKL
3514 struct task_struct *task = current;
3515 int saved_lock_depth;
3517 /* Catch callers which need to be fixed */
3518 BUG_ON(ti->preempt_count || !irqs_disabled());
3521 add_preempt_count(PREEMPT_ACTIVE);
3523 * We keep the big kernel semaphore locked, but we
3524 * clear ->lock_depth so that schedule() doesn't
3525 * auto-release the semaphore:
3527 #ifdef CONFIG_PREEMPT_BKL
3528 saved_lock_depth = task->lock_depth;
3529 task->lock_depth = -1;
3533 local_irq_disable();
3534 #ifdef CONFIG_PREEMPT_BKL
3535 task->lock_depth = saved_lock_depth;
3537 sub_preempt_count(PREEMPT_ACTIVE);
3539 /* we could miss a preemption opportunity between schedule and now */
3541 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3545 #endif /* CONFIG_PREEMPT */
3547 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3550 return try_to_wake_up(curr->private, mode, sync);
3552 EXPORT_SYMBOL(default_wake_function);
3555 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3556 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3557 * number) then we wake all the non-exclusive tasks and one exclusive task.
3559 * There are circumstances in which we can try to wake a task which has already
3560 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3561 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3563 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3564 int nr_exclusive, int sync, void *key)
3566 struct list_head *tmp, *next;
3568 list_for_each_safe(tmp, next, &q->task_list) {
3569 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3570 unsigned flags = curr->flags;
3572 if (curr->func(curr, mode, sync, key) &&
3573 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3579 * __wake_up - wake up threads blocked on a waitqueue.
3581 * @mode: which threads
3582 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3583 * @key: is directly passed to the wakeup function
3585 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3586 int nr_exclusive, void *key)
3588 unsigned long flags;
3590 spin_lock_irqsave(&q->lock, flags);
3591 __wake_up_common(q, mode, nr_exclusive, 0, key);
3592 spin_unlock_irqrestore(&q->lock, flags);
3594 EXPORT_SYMBOL(__wake_up);
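/*
 * Illustrative only (not part of this file): the usual pairing of a sleeper
 * and a waker on a waitqueue.  The identifiers are made up; wake_up() is the
 * common wrapper that ends up in __wake_up() above with nr_exclusive == 1.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_cond;

static void example_sleeper(void)
{
	/* Sleeps until example_cond becomes non-zero. */
	wait_event(example_wq, example_cond != 0);
}

static void example_waker(void)
{
	example_cond = 1;
	wake_up(&example_wq);
}
#endif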
3597 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3599 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3601 __wake_up_common(q, mode, 1, 0, NULL);
3605 * __wake_up_sync - wake up threads blocked on a waitqueue.
3607 * @mode: which threads
3608 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3610 * The sync wakeup differs in that the waker knows that it will schedule
3611 * away soon, so while the target thread will be woken up, it will not
3612 * be migrated to another CPU - ie. the two threads are 'synchronized'
3613 * with each other. This can prevent needless bouncing between CPUs.
3615 * On UP it can prevent extra preemption.
3618 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3620 unsigned long flags;
3626 if (unlikely(!nr_exclusive))
3629 spin_lock_irqsave(&q->lock, flags);
3630 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3631 spin_unlock_irqrestore(&q->lock, flags);
3633 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3635 void fastcall complete(struct completion *x)
3637 unsigned long flags;
3639 spin_lock_irqsave(&x->wait.lock, flags);
3641 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3643 spin_unlock_irqrestore(&x->wait.lock, flags);
3645 EXPORT_SYMBOL(complete);
3647 void fastcall complete_all(struct completion *x)
3649 unsigned long flags;
3651 spin_lock_irqsave(&x->wait.lock, flags);
3652 x->done += UINT_MAX/2;
3653 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3655 spin_unlock_irqrestore(&x->wait.lock, flags);
3657 EXPORT_SYMBOL(complete_all);
3659 void fastcall __sched wait_for_completion(struct completion *x)
3663 spin_lock_irq(&x->wait.lock);
3665 DECLARE_WAITQUEUE(wait, current);
3667 wait.flags |= WQ_FLAG_EXCLUSIVE;
3668 __add_wait_queue_tail(&x->wait, &wait);
3670 __set_current_state(TASK_UNINTERRUPTIBLE);
3671 spin_unlock_irq(&x->wait.lock);
3673 spin_lock_irq(&x->wait.lock);
3675 __remove_wait_queue(&x->wait, &wait);
3678 spin_unlock_irq(&x->wait.lock);
3680 EXPORT_SYMBOL(wait_for_completion);
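/*
 * Illustrative only (not part of this file): the typical producer/consumer
 * use of a completion.  The identifiers are made up for the example.
 */
#if 0
static DECLARE_COMPLETION(example_done);

static void example_consumer(void)
{
	/* Blocks (uninterruptibly) until example_producer() has run. */
	wait_for_completion(&example_done);
}

static void example_producer(void)
{
	/* Wakes exactly one waiter blocked in wait_for_completion(). */
	complete(&example_done);
}
#endif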
3682 unsigned long fastcall __sched
3683 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3687 spin_lock_irq(&x->wait.lock);
3689 DECLARE_WAITQUEUE(wait, current);
3691 wait.flags |= WQ_FLAG_EXCLUSIVE;
3692 __add_wait_queue_tail(&x->wait, &wait);
3694 __set_current_state(TASK_UNINTERRUPTIBLE);
3695 spin_unlock_irq(&x->wait.lock);
3696 timeout = schedule_timeout(timeout);
3697 spin_lock_irq(&x->wait.lock);
3699 __remove_wait_queue(&x->wait, &wait);
3703 __remove_wait_queue(&x->wait, &wait);
3707 spin_unlock_irq(&x->wait.lock);
3710 EXPORT_SYMBOL(wait_for_completion_timeout);
3712 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3718 spin_lock_irq(&x->wait.lock);
3720 DECLARE_WAITQUEUE(wait, current);
3722 wait.flags |= WQ_FLAG_EXCLUSIVE;
3723 __add_wait_queue_tail(&x->wait, &wait);
3725 if (signal_pending(current)) {
3727 __remove_wait_queue(&x->wait, &wait);
3730 __set_current_state(TASK_INTERRUPTIBLE);
3731 spin_unlock_irq(&x->wait.lock);
3733 spin_lock_irq(&x->wait.lock);
3735 __remove_wait_queue(&x->wait, &wait);
3739 spin_unlock_irq(&x->wait.lock);
3743 EXPORT_SYMBOL(wait_for_completion_interruptible);
3745 unsigned long fastcall __sched
3746 wait_for_completion_interruptible_timeout(struct completion *x,
3747 unsigned long timeout)
3751 spin_lock_irq(&x->wait.lock);
3753 DECLARE_WAITQUEUE(wait, current);
3755 wait.flags |= WQ_FLAG_EXCLUSIVE;
3756 __add_wait_queue_tail(&x->wait, &wait);
3758 if (signal_pending(current)) {
3759 timeout = -ERESTARTSYS;
3760 __remove_wait_queue(&x->wait, &wait);
3763 __set_current_state(TASK_INTERRUPTIBLE);
3764 spin_unlock_irq(&x->wait.lock);
3765 timeout = schedule_timeout(timeout);
3766 spin_lock_irq(&x->wait.lock);
3768 __remove_wait_queue(&x->wait, &wait);
3772 __remove_wait_queue(&x->wait, &wait);
3776 spin_unlock_irq(&x->wait.lock);
3779 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3782 #define SLEEP_ON_VAR \
3783 unsigned long flags; \
3784 wait_queue_t wait; \
3785 init_waitqueue_entry(&wait, current);
3787 #define SLEEP_ON_HEAD \
3788 spin_lock_irqsave(&q->lock,flags); \
3789 __add_wait_queue(q, &wait); \
3790 spin_unlock(&q->lock);
3792 #define SLEEP_ON_TAIL \
3793 spin_lock_irq(&q->lock); \
3794 __remove_wait_queue(q, &wait); \
3795 spin_unlock_irqrestore(&q->lock, flags);
3797 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3801 current->state = TASK_INTERRUPTIBLE;
3807 EXPORT_SYMBOL(interruptible_sleep_on);
3809 long fastcall __sched
3810 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3814 current->state = TASK_INTERRUPTIBLE;
3817 timeout = schedule_timeout(timeout);
3822 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3824 void fastcall __sched sleep_on(wait_queue_head_t *q)
3828 current->state = TASK_UNINTERRUPTIBLE;
3834 EXPORT_SYMBOL(sleep_on);
3836 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3840 current->state = TASK_UNINTERRUPTIBLE;
3843 timeout = schedule_timeout(timeout);
3849 EXPORT_SYMBOL(sleep_on_timeout);
3851 #ifdef CONFIG_RT_MUTEXES
3854 * rt_mutex_setprio - set the current priority of a task
3856 * @prio: prio value (kernel-internal form)
3858 * This function changes the 'effective' priority of a task. It does
3859 * not touch ->normal_prio like __setscheduler().
3861 * Used by the rt_mutex code to implement priority inheritance logic.
3863 void rt_mutex_setprio(struct task_struct *p, int prio)
3865 unsigned long flags;
3870 BUG_ON(prio < 0 || prio > MAX_PRIO);
3872 rq = task_rq_lock(p, &flags);
3876 on_rq = p->se.on_rq;
3878 dequeue_task(rq, p, 0, now);
3881 p->sched_class = &rt_sched_class;
3883 p->sched_class = &fair_sched_class;
3888 enqueue_task(rq, p, 0, now);
3890 * Reschedule if we are currently running on this runqueue and
3891 * our priority decreased, or if we are not currently running on
3892 * this runqueue and our priority is higher than the current's
3894 if (task_running(rq, p)) {
3895 if (p->prio > oldprio)
3896 resched_task(rq->curr);
3898 check_preempt_curr(rq, p);
3901 task_rq_unlock(rq, &flags);
3906 void set_user_nice(struct task_struct *p, long nice)
3908 int old_prio, delta, on_rq;
3909 unsigned long flags;
3913 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3916 * We have to be careful: if called from sys_setpriority(),
3917 * the task might be in the middle of scheduling on another CPU.
3919 rq = task_rq_lock(p, &flags);
3922 * The RT priorities are set via sched_setscheduler(), but we still
3923 * allow the 'normal' nice value to be set - but as expected
3924 * it won't have any effect on scheduling until the task drops its
3925 * SCHED_FIFO/SCHED_RR policy:
3927 if (task_has_rt_policy(p)) {
3928 p->static_prio = NICE_TO_PRIO(nice);
3931 on_rq = p->se.on_rq;
3933 dequeue_task(rq, p, 0, now);
3934 dec_load(rq, p, now);
3937 p->static_prio = NICE_TO_PRIO(nice);
3940 p->prio = effective_prio(p);
3941 delta = p->prio - old_prio;
3944 enqueue_task(rq, p, 0, now);
3945 inc_load(rq, p, now);
3947 * If the task increased its priority or is running and
3948 * lowered its priority, then reschedule its CPU:
3950 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3951 resched_task(rq->curr);
3954 task_rq_unlock(rq, &flags);
3956 EXPORT_SYMBOL(set_user_nice);
3959 * can_nice - check if a task can reduce its nice value
3963 int can_nice(const struct task_struct *p, const int nice)
3965 /* convert nice value [19,-20] to rlimit style value [1,40] */
3966 int nice_rlim = 20 - nice;
3968 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3969 capable(CAP_SYS_NICE));
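/*
 * Illustrative worked example (not part of the original source): a request
 * for nice -5 maps to nice_rlim = 20 - (-5) = 25, so it is allowed only if
 * the RLIMIT_NICE soft limit is at least 25 or the caller has CAP_SYS_NICE.
 * Moving to a larger (less favourable) nice value does not need this check,
 * as sys_nice() below shows.
 */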
3972 #ifdef __ARCH_WANT_SYS_NICE
3975 * sys_nice - change the priority of the current process.
3976 * @increment: priority increment
3978 * sys_setpriority is a more generic, but much slower function that
3979 * does similar things.
3981 asmlinkage long sys_nice(int increment)
3986 * Setpriority might change our priority at the same moment.
3987 * We don't have to worry. Conceptually one call occurs first
3988 * and we have a single winner.
3990 if (increment < -40)
3995 nice = PRIO_TO_NICE(current->static_prio) + increment;
4001 if (increment < 0 && !can_nice(current, nice))
4004 retval = security_task_setnice(current, nice);
4008 set_user_nice(current, nice);
4015 * task_prio - return the priority value of a given task.
4016 * @p: the task in question.
4018 * This is the priority value as seen by users in /proc.
4019 * RT tasks are offset by -200. Normal tasks are centered
4020 * around 0, value goes from -16 to +15.
4022 int task_prio(const struct task_struct *p)
4024 return p->prio - MAX_RT_PRIO;
4028 * task_nice - return the nice value of a given task.
4029 * @p: the task in question.
4031 int task_nice(const struct task_struct *p)
4033 return TASK_NICE(p);
4035 EXPORT_SYMBOL_GPL(task_nice);
4038 * idle_cpu - is a given cpu idle currently?
4039 * @cpu: the processor in question.
4041 int idle_cpu(int cpu)
4043 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4047 * idle_task - return the idle task for a given cpu.
4048 * @cpu: the processor in question.
4050 struct task_struct *idle_task(int cpu)
4052 return cpu_rq(cpu)->idle;
4056 * find_process_by_pid - find a process with a matching PID value.
4057 * @pid: the pid in question.
4059 static inline struct task_struct *find_process_by_pid(pid_t pid)
4061 return pid ? find_task_by_pid(pid) : current;
4064 /* Actually do priority change: must hold rq lock. */
4066 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4068 BUG_ON(p->se.on_rq);
4071 switch (p->policy) {
4075 p->sched_class = &fair_sched_class;
4079 p->sched_class = &rt_sched_class;
4083 p->rt_priority = prio;
4084 p->normal_prio = normal_prio(p);
4085 /* we are holding p->pi_lock already */
4086 p->prio = rt_mutex_getprio(p);
4091 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4092 * @p: the task in question.
4093 * @policy: new policy.
4094 * @param: structure containing the new RT priority.
4096 * NOTE that the task may already be dead.
4098 int sched_setscheduler(struct task_struct *p, int policy,
4099 struct sched_param *param)
4101 int retval, oldprio, oldpolicy = -1, on_rq;
4102 unsigned long flags;
4105 /* may grab non-irq protected spin_locks */
4106 BUG_ON(in_interrupt());
4108 /* double check policy once rq lock held */
4110 policy = oldpolicy = p->policy;
4111 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4112 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4113 policy != SCHED_IDLE)
4116 * Valid priorities for SCHED_FIFO and SCHED_RR are
4117 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4118 * SCHED_BATCH and SCHED_IDLE is 0.
4120 if (param->sched_priority < 0 ||
4121 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4122 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4124 if (rt_policy(policy) != (param->sched_priority != 0))
4128 * Allow unprivileged RT tasks to decrease priority:
4130 if (!capable(CAP_SYS_NICE)) {
4131 if (rt_policy(policy)) {
4132 unsigned long rlim_rtprio;
4134 if (!lock_task_sighand(p, &flags))
4136 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4137 unlock_task_sighand(p, &flags);
4139 /* can't set/change the rt policy */
4140 if (policy != p->policy && !rlim_rtprio)
4143 /* can't increase priority */
4144 if (param->sched_priority > p->rt_priority &&
4145 param->sched_priority > rlim_rtprio)
4149 * Like positive nice levels, don't allow tasks to
4150 * move out of SCHED_IDLE either:
4152 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4155 /* can't change other user's priorities */
4156 if ((current->euid != p->euid) &&
4157 (current->euid != p->uid))
4161 retval = security_task_setscheduler(p, policy, param);
4165 * make sure no PI-waiters arrive (or leave) while we are
4166 * changing the priority of the task:
4168 spin_lock_irqsave(&p->pi_lock, flags);
4170 * To be able to change p->policy safely, the appropriate
4171 * runqueue lock must be held.
4173 rq = __task_rq_lock(p);
4174 /* recheck policy now with rq lock held */
4175 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4176 policy = oldpolicy = -1;
4177 __task_rq_unlock(rq);
4178 spin_unlock_irqrestore(&p->pi_lock, flags);
4181 on_rq = p->se.on_rq;
4183 deactivate_task(rq, p, 0);
4185 __setscheduler(rq, p, policy, param->sched_priority);
4187 activate_task(rq, p, 0);
4189 * Reschedule if we are currently running on this runqueue and
4190 * our priority decreased, or if we are not currently running on
4191 * this runqueue and our priority is higher than the current's
4193 if (task_running(rq, p)) {
4194 if (p->prio > oldprio)
4195 resched_task(rq->curr);
4197 check_preempt_curr(rq, p);
4200 __task_rq_unlock(rq);
4201 spin_unlock_irqrestore(&p->pi_lock, flags);
4203 rt_mutex_adjust_pi(p);
4207 EXPORT_SYMBOL_GPL(sched_setscheduler);
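/*
 * Illustrative only (not part of this file): how kernel code, e.g. a driver
 * setting up its kthread, typically uses sched_setscheduler() to make a
 * thread SCHED_FIFO.  The function name and chosen priority are made up.
 */
#if 0
static void example_make_fifo(struct task_struct *tsk)
{
	struct sched_param param = { .sched_priority = 1 };

	sched_setscheduler(tsk, SCHED_FIFO, &param);
}
#endif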
4210 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4212 struct sched_param lparam;
4213 struct task_struct *p;
4216 if (!param || pid < 0)
4218 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4223 p = find_process_by_pid(pid);
4225 retval = sched_setscheduler(p, policy, &lparam);
4232 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4233 * @pid: the pid in question.
4234 * @policy: new policy.
4235 * @param: structure containing the new RT priority.
4237 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4238 struct sched_param __user *param)
4240 /* negative values for policy are not valid */
4244 return do_sched_setscheduler(pid, policy, param);
4248 * sys_sched_setparam - set/change the RT priority of a thread
4249 * @pid: the pid in question.
4250 * @param: structure containing the new RT priority.
4252 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4254 return do_sched_setscheduler(pid, -1, param);
4258 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4259 * @pid: the pid in question.
4261 asmlinkage long sys_sched_getscheduler(pid_t pid)
4263 struct task_struct *p;
4264 int retval = -EINVAL;
4270 read_lock(&tasklist_lock);
4271 p = find_process_by_pid(pid);
4273 retval = security_task_getscheduler(p);
4277 read_unlock(&tasklist_lock);
4284 * sys_sched_getparam - get the RT priority of a thread
4285 * @pid: the pid in question.
4286 * @param: structure containing the RT priority.
4288 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4290 struct sched_param lp;
4291 struct task_struct *p;
4292 int retval = -EINVAL;
4294 if (!param || pid < 0)
4297 read_lock(&tasklist_lock);
4298 p = find_process_by_pid(pid);
4303 retval = security_task_getscheduler(p);
4307 lp.sched_priority = p->rt_priority;
4308 read_unlock(&tasklist_lock);
4311 * This one might sleep; we cannot do it with a spinlock held ...
4313 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4319 read_unlock(&tasklist_lock);
4323 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4325 cpumask_t cpus_allowed;
4326 struct task_struct *p;
4329 mutex_lock(&sched_hotcpu_mutex);
4330 read_lock(&tasklist_lock);
4332 p = find_process_by_pid(pid);
4334 read_unlock(&tasklist_lock);
4335 mutex_unlock(&sched_hotcpu_mutex);
4340 * It is not safe to call set_cpus_allowed with the
4341 * tasklist_lock held. We will bump the task_struct's
4342 * usage count and then drop tasklist_lock.
4345 read_unlock(&tasklist_lock);
4348 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4349 !capable(CAP_SYS_NICE))
4352 retval = security_task_setscheduler(p, 0, NULL);
4356 cpus_allowed = cpuset_cpus_allowed(p);
4357 cpus_and(new_mask, new_mask, cpus_allowed);
4358 retval = set_cpus_allowed(p, new_mask);
4362 mutex_unlock(&sched_hotcpu_mutex);
4366 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4367 cpumask_t *new_mask)
4369 if (len < sizeof(cpumask_t)) {
4370 memset(new_mask, 0, sizeof(cpumask_t));
4371 } else if (len > sizeof(cpumask_t)) {
4372 len = sizeof(cpumask_t);
4374 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4378 * sys_sched_setaffinity - set the cpu affinity of a process
4379 * @pid: pid of the process
4380 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4381 * @user_mask_ptr: user-space pointer to the new cpu mask
4383 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4384 unsigned long __user *user_mask_ptr)
4389 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4393 return sched_setaffinity(pid, new_mask);
4397 * Represents all CPUs present in the system.
4398 * In systems capable of hotplug, this map could dynamically grow
4399 * as new CPUs are detected in the system via any platform-specific
4400 * method, such as ACPI, for example.
4403 cpumask_t cpu_present_map __read_mostly;
4404 EXPORT_SYMBOL(cpu_present_map);
4407 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4408 EXPORT_SYMBOL(cpu_online_map);
4410 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4411 EXPORT_SYMBOL(cpu_possible_map);
4414 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4416 struct task_struct *p;
4419 mutex_lock(&sched_hotcpu_mutex);
4420 read_lock(&tasklist_lock);
4423 p = find_process_by_pid(pid);
4427 retval = security_task_getscheduler(p);
4431 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4434 read_unlock(&tasklist_lock);
4435 mutex_unlock(&sched_hotcpu_mutex);
4443 * sys_sched_getaffinity - get the cpu affinity of a process
4444 * @pid: pid of the process
4445 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4446 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4448 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4449 unsigned long __user *user_mask_ptr)
4454 if (len < sizeof(cpumask_t))
4457 ret = sched_getaffinity(pid, &mask);
4461 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4464 return sizeof(cpumask_t);
4468 * sys_sched_yield - yield the current processor to other threads.
4470 * This function yields the current CPU to other tasks. If there are no
4471 * other threads running on this CPU then this function will return.
4473 asmlinkage long sys_sched_yield(void)
4475 struct rq *rq = this_rq_lock();
4477 schedstat_inc(rq, yld_cnt);
4478 if (unlikely(rq->nr_running == 1))
4479 schedstat_inc(rq, yld_act_empty);
4481 current->sched_class->yield_task(rq, current);
4484 * Since we are going to call schedule() anyway, there's
4485 * no need to preempt or enable interrupts:
4487 __release(rq->lock);
4488 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4489 _raw_spin_unlock(&rq->lock);
4490 preempt_enable_no_resched();
4497 static void __cond_resched(void)
4499 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4500 __might_sleep(__FILE__, __LINE__);
4503 * The BKS might be reacquired before we have dropped
4504 * PREEMPT_ACTIVE, which could trigger a second
4505 * cond_resched() call.
4508 add_preempt_count(PREEMPT_ACTIVE);
4510 sub_preempt_count(PREEMPT_ACTIVE);
4511 } while (need_resched());
4514 int __sched cond_resched(void)
4516 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4517 system_state == SYSTEM_RUNNING) {
4523 EXPORT_SYMBOL(cond_resched);
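/*
 * Illustrative only (not part of this file): the common pattern of
 * sprinkling cond_resched() into a long-running loop so that, on
 * non-preemptible kernels, other tasks still get a chance to run.  The
 * function name is made up for the example.
 */
#if 0
static void example_long_loop(unsigned long nr_items)
{
	unsigned long i;

	for (i = 0; i < nr_items; i++) {
		/* ... process one item ... */
		cond_resched();
	}
}
#endif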
4526 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4527 * call schedule, and on return reacquire the lock.
4529 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4530 * operations here to prevent schedule() from being called twice (once via
4531 * spin_unlock(), once by hand).
4533 int cond_resched_lock(spinlock_t *lock)
4537 if (need_lockbreak(lock)) {
4543 if (need_resched() && system_state == SYSTEM_RUNNING) {
4544 spin_release(&lock->dep_map, 1, _THIS_IP_);
4545 _raw_spin_unlock(lock);
4546 preempt_enable_no_resched();
4553 EXPORT_SYMBOL(cond_resched_lock);
4555 int __sched cond_resched_softirq(void)
4557 BUG_ON(!in_softirq());
4559 if (need_resched() && system_state == SYSTEM_RUNNING) {
4567 EXPORT_SYMBOL(cond_resched_softirq);
4570 * yield - yield the current processor to other threads.
4572 * This is a shortcut for kernel-space yielding - it marks the
4573 * thread runnable and calls sys_sched_yield().
4575 void __sched yield(void)
4577 set_current_state(TASK_RUNNING);
4580 EXPORT_SYMBOL(yield);
4583 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4584 * that process accounting knows that this is a task in IO wait state.
4586 * But don't do that if it is a deliberate, throttling IO wait (this task
4587 * has set its backing_dev_info: the queue against which it should throttle)
4589 void __sched io_schedule(void)
4591 struct rq *rq = &__raw_get_cpu_var(runqueues);
4593 delayacct_blkio_start();
4594 atomic_inc(&rq->nr_iowait);
4596 atomic_dec(&rq->nr_iowait);
4597 delayacct_blkio_end();
4599 EXPORT_SYMBOL(io_schedule);
4601 long __sched io_schedule_timeout(long timeout)
4603 struct rq *rq = &__raw_get_cpu_var(runqueues);
4606 delayacct_blkio_start();
4607 atomic_inc(&rq->nr_iowait);
4608 ret = schedule_timeout(timeout);
4609 atomic_dec(&rq->nr_iowait);
4610 delayacct_blkio_end();
4615 * sys_sched_get_priority_max - return maximum RT priority.
4616 * @policy: scheduling class.
4618 * this syscall returns the maximum rt_priority that can be used
4619 * by a given scheduling class.
4621 asmlinkage long sys_sched_get_priority_max(int policy)
4628 ret = MAX_USER_RT_PRIO-1;
4640 * sys_sched_get_priority_min - return minimum RT priority.
4641 * @policy: scheduling class.
4643 * this syscall returns the minimum rt_priority that can be used
4644 * by a given scheduling class.
4646 asmlinkage long sys_sched_get_priority_min(int policy)
4664 * sys_sched_rr_get_interval - return the default timeslice of a process.
4665 * @pid: pid of the process.
4666 * @interval: userspace pointer to the timeslice value.
4668 * this syscall writes the default timeslice value of a given process
4669 * into the user-space timespec buffer. A value of '0' means infinity.
4672 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4674 struct task_struct *p;
4675 int retval = -EINVAL;
4682 read_lock(&tasklist_lock);
4683 p = find_process_by_pid(pid);
4687 retval = security_task_getscheduler(p);
4691 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4692 0 : static_prio_timeslice(p->static_prio), &t);
4693 read_unlock(&tasklist_lock);
4694 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4698 read_unlock(&tasklist_lock);
4702 static const char stat_nam[] = "RSDTtZX";
4704 static void show_task(struct task_struct *p)
4706 unsigned long free = 0;
4709 state = p->state ? __ffs(p->state) + 1 : 0;
4710 printk("%-13.13s %c", p->comm,
4711 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4712 #if (BITS_PER_LONG == 32)
4713 if (state == TASK_RUNNING)
4714 printk(" running ");
4716 printk(" %08lX ", thread_saved_pc(p));
4718 if (state == TASK_RUNNING)
4719 printk(" running task ");
4721 printk(" %016lx ", thread_saved_pc(p));
4723 #ifdef CONFIG_DEBUG_STACK_USAGE
4725 unsigned long *n = end_of_stack(p);
4728 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4731 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4733 printk(" (L-TLB)\n");
4735 printk(" (NOTLB)\n");
4737 if (state != TASK_RUNNING)
4738 show_stack(p, NULL);
4741 void show_state_filter(unsigned long state_filter)
4743 struct task_struct *g, *p;
4745 #if (BITS_PER_LONG == 32)
4748 printk(" task PC stack pid father child younger older\n");
4752 printk(" task PC stack pid father child younger older\n");
4754 read_lock(&tasklist_lock);
4755 do_each_thread(g, p) {
4757 * reset the NMI-timeout; listing all tasks on a slow
4758 * console might take a lot of time:
4760 touch_nmi_watchdog();
4761 if (!state_filter || (p->state & state_filter))
4763 } while_each_thread(g, p);
4765 touch_all_softlockup_watchdogs();
4767 #ifdef CONFIG_SCHED_DEBUG
4768 sysrq_sched_debug_show();
4770 read_unlock(&tasklist_lock);
4772 * Only show locks if all tasks are dumped:
4774 if (state_filter == -1)
4775 debug_show_all_locks();
4778 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4780 idle->sched_class = &idle_sched_class;
4784 * init_idle - set up an idle thread for a given CPU
4785 * @idle: task in question
4786 * @cpu: cpu the idle task belongs to
4788 * NOTE: this function does not set the idle thread's NEED_RESCHED
4789 * flag, to make booting more robust.
4791 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4793 struct rq *rq = cpu_rq(cpu);
4794 unsigned long flags;
4797 idle->se.exec_start = sched_clock();
4799 idle->prio = idle->normal_prio = MAX_PRIO;
4800 idle->cpus_allowed = cpumask_of_cpu(cpu);
4801 __set_task_cpu(idle, cpu);
4803 spin_lock_irqsave(&rq->lock, flags);
4804 rq->curr = rq->idle = idle;
4805 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4808 spin_unlock_irqrestore(&rq->lock, flags);
4810 /* Set the preempt count _outside_ the spinlocks! */
4811 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4812 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4814 task_thread_info(idle)->preempt_count = 0;
4817 * The idle tasks have their own, simple scheduling class:
4819 idle->sched_class = &idle_sched_class;
4823 * In a system that switches off the HZ timer nohz_cpu_mask
4824 * indicates which cpus entered this state. This is used
4825 * in the rcu update to wait only for active cpus. For systems
4826 * which do not switch off the HZ timer nohz_cpu_mask should
4827 * always be CPU_MASK_NONE.
4829 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
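/*
 * Editor's sketch of the intended usage of nohz_cpu_mask (hedged: the real
 * calls live in the arch/nohz tick code, not in this file, and the function
 * names below are illustrative only):
 */
#if 0
static void example_enter_nohz(int cpu)
{
	/* flag this CPU as tickless so RCU need not wait for it */
	cpu_set(cpu, nohz_cpu_mask);
}

static void example_exit_nohz(int cpu)
{
	/* the periodic tick is back; clear the flag again */
	cpu_clear(cpu, nohz_cpu_mask);
}
#endif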
4832 * Increase the granularity value when there are more CPUs,
4833 * because with more CPUs the 'effective latency' as visible
4834 * to users decreases. But the relationship is not linear,
4835 * so pick a second-best guess by going with the log2 of the number of CPUs.
4838 * This idea comes from the SD scheduler of Con Kolivas:
4840 static inline void sched_init_granularity(void)
4842 unsigned int factor = 1 + ilog2(num_online_cpus());
4843 const unsigned long gran_limit = 10000000;
4845 sysctl_sched_granularity *= factor;
4846 if (sysctl_sched_granularity > gran_limit)
4847 sysctl_sched_granularity = gran_limit;
4849 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4850 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
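/*
 * Worked example of the scaling above (editor's note): with the 10ms cap in
 * gran_limit, the multiplier grows logarithmically with the number of online
 * CPUs:
 *
 *    1 CPU   ->  factor = 1 + ilog2(1)  = 1
 *    2 CPUs  ->  factor = 1 + ilog2(2)  = 2
 *    4 CPUs  ->  factor = 1 + ilog2(4)  = 3
 *   16 CPUs  ->  factor = 1 + ilog2(16) = 5
 *
 * runtime_limit and wakeup_granularity are then derived from the scaled
 * (and possibly clamped) granularity.
 */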
4855 * This is how migration works:
4857 * 1) we queue a struct migration_req structure in the source CPU's
4858 * runqueue and wake up that CPU's migration thread.
4859 * 2) we down() the locked semaphore => thread blocks.
4860 * 3) migration thread wakes up (implicitly it forces the migrated
4861 * thread off the CPU)
4862 * 4) it gets the migration request and checks whether the migrated
4863 * task is still in the wrong runqueue.
4864 * 5) if it's in the wrong runqueue then the migration thread removes
4865 * it and puts it into the right queue.
4866 * 6) migration thread up()s the semaphore.
4867 * 7) we wake up and the migration is done.
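/*
 * Editor's sketch of the request/completion handshake described above
 * (hedged: field names follow their use later in this file -- req->task,
 * req->dest_cpu, req->list, req->done -- and the real code queues the
 * request under the runqueue lock, as set_cpus_allowed() below shows; this
 * block is illustrative only):
 */
#if 0
static void example_request_migration(struct task_struct *p, int dest_cpu,
				      struct rq *rq)
{
	struct migration_req req = {
		.task		= p,
		.dest_cpu	= dest_cpu,
	};

	init_completion(&req.done);
	list_add(&req.list, &rq->migration_queue);	/* step 1: queue it    */
	wake_up_process(rq->migration_thread);		/* step 1: kick thread */
	wait_for_completion(&req.done);			/* steps 2-7: wait     */
}
#endif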
4871 * Change a given task's CPU affinity. Migrate the thread to a
4872 * proper CPU and schedule it away if the CPU it's executing on
4873 * is removed from the allowed bitmask.
4875 * NOTE: the caller must have a valid reference to the task, the
4876 * task must not exit() & deallocate itself prematurely. The
4877 * call is not atomic; no spinlocks may be held.
4879 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4881 struct migration_req req;
4882 unsigned long flags;
4886 rq = task_rq_lock(p, &flags);
4887 if (!cpus_intersects(new_mask, cpu_online_map)) {
4892 p->cpus_allowed = new_mask;
4893 /* Can the task run on the task's current CPU? If so, we're done */
4894 if (cpu_isset(task_cpu(p), new_mask))
4897 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4898 /* Need help from migration thread: drop lock and wait. */
4899 task_rq_unlock(rq, &flags);
4900 wake_up_process(rq->migration_thread);
4901 wait_for_completion(&req.done);
4902 tlb_migrate_finish(p->mm);
4906 task_rq_unlock(rq, &flags);
4910 EXPORT_SYMBOL_GPL(set_cpus_allowed);
4913 * Move (not current) task off this cpu, onto dest cpu. We're doing
4914 * this because either it can't run here any more (set_cpus_allowed()
4915 * away from this CPU, or CPU going down), or because we're
4916 * attempting to rebalance this task on exec (sched_exec).
4918 * So we race with normal scheduler movements, but that's OK, as long
4919 * as the task is no longer on this CPU.
4921 * Returns non-zero if task was successfully migrated.
4923 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4925 struct rq *rq_dest, *rq_src;
4928 if (unlikely(cpu_is_offline(dest_cpu)))
4931 rq_src = cpu_rq(src_cpu);
4932 rq_dest = cpu_rq(dest_cpu);
4934 double_rq_lock(rq_src, rq_dest);
4935 /* Already moved. */
4936 if (task_cpu(p) != src_cpu)
4938 /* Affinity changed (again). */
4939 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4942 on_rq = p->se.on_rq;
4944 deactivate_task(rq_src, p, 0);
4945 set_task_cpu(p, dest_cpu);
4947 activate_task(rq_dest, p, 0);
4948 check_preempt_curr(rq_dest, p);
4952 double_rq_unlock(rq_src, rq_dest);
4957 * migration_thread - this is a highprio system thread that performs
4958 * thread migration by bumping the thread off its CPU and then 'pushing' it onto another runqueue.
4961 static int migration_thread(void *data)
4963 int cpu = (long)data;
4967 BUG_ON(rq->migration_thread != current);
4969 set_current_state(TASK_INTERRUPTIBLE);
4970 while (!kthread_should_stop()) {
4971 struct migration_req *req;
4972 struct list_head *head;
4976 spin_lock_irq(&rq->lock);
4978 if (cpu_is_offline(cpu)) {
4979 spin_unlock_irq(&rq->lock);
4983 if (rq->active_balance) {
4984 active_load_balance(rq, cpu);
4985 rq->active_balance = 0;
4988 head = &rq->migration_queue;
4990 if (list_empty(head)) {
4991 spin_unlock_irq(&rq->lock);
4993 set_current_state(TASK_INTERRUPTIBLE);
4996 req = list_entry(head->next, struct migration_req, list);
4997 list_del_init(head->next);
4999 spin_unlock(&rq->lock);
5000 __migrate_task(req->task, cpu, req->dest_cpu);
5003 complete(&req->done);
5005 __set_current_state(TASK_RUNNING);
5009 /* Wait for kthread_stop */
5010 set_current_state(TASK_INTERRUPTIBLE);
5011 while (!kthread_should_stop()) {
5013 set_current_state(TASK_INTERRUPTIBLE);
5015 __set_current_state(TASK_RUNNING);
5019 #ifdef CONFIG_HOTPLUG_CPU
5021 * Figure out where a task on a dead CPU should go; use force if necessary.
5022 * NOTE: interrupts should be disabled by the caller
5024 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5026 unsigned long flags;
5033 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5034 cpus_and(mask, mask, p->cpus_allowed);
5035 dest_cpu = any_online_cpu(mask);
5037 /* On any allowed CPU? */
5038 if (dest_cpu == NR_CPUS)
5039 dest_cpu = any_online_cpu(p->cpus_allowed);
5041 /* No more Mr. Nice Guy. */
5042 if (dest_cpu == NR_CPUS) {
5043 rq = task_rq_lock(p, &flags);
5044 cpus_setall(p->cpus_allowed);
5045 dest_cpu = any_online_cpu(p->cpus_allowed);
5046 task_rq_unlock(rq, &flags);
5049 * Don't tell them about moving exiting tasks or
5050 * kernel threads (both mm NULL), since they never leave the kernel.
5053 if (p->mm && printk_ratelimit())
5054 printk(KERN_INFO "process %d (%s) no "
5055 "longer affine to cpu%d\n",
5056 p->pid, p->comm, dead_cpu);
5058 if (!__migrate_task(p, dead_cpu, dest_cpu))
5063 * While a dead CPU has no uninterruptible tasks queued at this point,
5064 * it might still have a nonzero ->nr_uninterruptible counter, because
5065 * for performance reasons the counter is not strictly tracking tasks to
5066 * their home CPUs. So we just add the counter to another CPU's counter,
5067 * to keep the global sum constant after CPU-down:
5069 static void migrate_nr_uninterruptible(struct rq *rq_src)
5071 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5072 unsigned long flags;
5074 local_irq_save(flags);
5075 double_rq_lock(rq_src, rq_dest);
5076 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5077 rq_src->nr_uninterruptible = 0;
5078 double_rq_unlock(rq_src, rq_dest);
5079 local_irq_restore(flags);
5082 /* Run through task list and migrate tasks from the dead cpu. */
5083 static void migrate_live_tasks(int src_cpu)
5085 struct task_struct *p, *t;
5087 write_lock_irq(&tasklist_lock);
5089 do_each_thread(t, p) {
5093 if (task_cpu(p) == src_cpu)
5094 move_task_off_dead_cpu(src_cpu, p);
5095 } while_each_thread(t, p);
5097 write_unlock_irq(&tasklist_lock);
5101 * Schedules the idle task to be the next runnable task on the current CPU.
5102 * It does so by boosting its priority to the highest possible and adding it to
5103 * the _front_ of the runqueue. Used by CPU offline code.
5105 void sched_idle_next(void)
5107 int this_cpu = smp_processor_id();
5108 struct rq *rq = cpu_rq(this_cpu);
5109 struct task_struct *p = rq->idle;
5110 unsigned long flags;
5112 /* cpu has to be offline */
5113 BUG_ON(cpu_online(this_cpu));
5116 * Strictly not necessary, since the rest of the CPUs are stopped by now
5117 * and interrupts are disabled on the current cpu.
5119 spin_lock_irqsave(&rq->lock, flags);
5121 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5123 /* Add idle task to the _front_ of its priority queue: */
5124 activate_idle_task(p, rq);
5126 spin_unlock_irqrestore(&rq->lock, flags);
5130 * Ensures that the idle task is using init_mm right before its cpu goes offline.
5133 void idle_task_exit(void)
5135 struct mm_struct *mm = current->active_mm;
5137 BUG_ON(cpu_online(smp_processor_id()));
5140 switch_mm(mm, &init_mm, current);
5144 /* called under rq->lock with disabled interrupts */
5145 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5147 struct rq *rq = cpu_rq(dead_cpu);
5149 /* Must be exiting, otherwise would be on tasklist. */
5150 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5152 /* Cannot have done final schedule yet: would have vanished. */
5153 BUG_ON(p->state == TASK_DEAD);
5158 * Drop lock around migration; if someone else moves it,
5159 * that's OK. No task can be added to this CPU, so iteration is safe.
5161 * NOTE: interrupts should be left disabled --dev@
5163 spin_unlock(&rq->lock);
5164 move_task_off_dead_cpu(dead_cpu, p);
5165 spin_lock(&rq->lock);
5170 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5171 static void migrate_dead_tasks(unsigned int dead_cpu)
5173 struct rq *rq = cpu_rq(dead_cpu);
5174 struct task_struct *next;
5177 if (!rq->nr_running)
5179 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5182 migrate_dead(dead_cpu, next);
5185 #endif /* CONFIG_HOTPLUG_CPU */
5188 * migration_call - callback that gets triggered when a CPU is added.
5189 * Here we can start up the necessary migration thread for the new CPU.
5191 static int __cpuinit
5192 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5194 struct task_struct *p;
5195 int cpu = (long)hcpu;
5196 unsigned long flags;
5200 case CPU_LOCK_ACQUIRE:
5201 mutex_lock(&sched_hotcpu_mutex);
5204 case CPU_UP_PREPARE:
5205 case CPU_UP_PREPARE_FROZEN:
5206 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5209 p->flags |= PF_NOFREEZE;
5210 kthread_bind(p, cpu);
5211 /* Must be high prio: stop_machine expects to yield to it. */
5212 rq = task_rq_lock(p, &flags);
5213 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5214 task_rq_unlock(rq, &flags);
5215 cpu_rq(cpu)->migration_thread = p;
5219 case CPU_ONLINE_FROZEN:
5220 /* Strictly unnecessary, as the first user will wake it. */
5221 wake_up_process(cpu_rq(cpu)->migration_thread);
5224 #ifdef CONFIG_HOTPLUG_CPU
5225 case CPU_UP_CANCELED:
5226 case CPU_UP_CANCELED_FROZEN:
5227 if (!cpu_rq(cpu)->migration_thread)
5229 /* Unbind it from the offline cpu so it can run. Fall through. */
5230 kthread_bind(cpu_rq(cpu)->migration_thread,
5231 any_online_cpu(cpu_online_map));
5232 kthread_stop(cpu_rq(cpu)->migration_thread);
5233 cpu_rq(cpu)->migration_thread = NULL;
5237 case CPU_DEAD_FROZEN:
5238 migrate_live_tasks(cpu);
5240 kthread_stop(rq->migration_thread);
5241 rq->migration_thread = NULL;
5242 /* Idle task back to normal (off runqueue, low prio) */
5243 rq = task_rq_lock(rq->idle, &flags);
5244 deactivate_task(rq, rq->idle, 0);
5245 rq->idle->static_prio = MAX_PRIO;
5246 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5247 rq->idle->sched_class = &idle_sched_class;
5248 migrate_dead_tasks(cpu);
5249 task_rq_unlock(rq, &flags);
5250 migrate_nr_uninterruptible(rq);
5251 BUG_ON(rq->nr_running != 0);
5253 /* No need to migrate the tasks: it was best-effort if
5254 * they didn't take sched_hotcpu_mutex. Just wake up
5255 * the requestors. */
5256 spin_lock_irq(&rq->lock);
5257 while (!list_empty(&rq->migration_queue)) {
5258 struct migration_req *req;
5260 req = list_entry(rq->migration_queue.next,
5261 struct migration_req, list);
5262 list_del_init(&req->list);
5263 complete(&req->done);
5265 spin_unlock_irq(&rq->lock);
5268 case CPU_LOCK_RELEASE:
5269 mutex_unlock(&sched_hotcpu_mutex);
5275 /* Register at highest priority so that task migration (migrate_all_tasks)
5276 * happens before everything else.
5278 static struct notifier_block __cpuinitdata migration_notifier = {
5279 .notifier_call = migration_call,
5283 int __init migration_init(void)
5285 void *cpu = (void *)(long)smp_processor_id();
5288 /* Start one for the boot CPU: */
5289 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5290 BUG_ON(err == NOTIFY_BAD);
5291 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5292 register_cpu_notifier(&migration_notifier);
5300 /* Number of possible processor ids */
5301 int nr_cpu_ids __read_mostly = NR_CPUS;
5302 EXPORT_SYMBOL(nr_cpu_ids);
5304 #undef SCHED_DOMAIN_DEBUG
5305 #ifdef SCHED_DOMAIN_DEBUG
5306 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5311 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5315 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5320 struct sched_group *group = sd->groups;
5321 cpumask_t groupmask;
5323 cpumask_scnprintf(str, NR_CPUS, sd->span);
5324 cpus_clear(groupmask);
5327 for (i = 0; i < level + 1; i++)
5329 printk("domain %d: ", level);
5331 if (!(sd->flags & SD_LOAD_BALANCE)) {
5332 printk("does not load-balance\n");
5334 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5339 printk("span %s\n", str);
5341 if (!cpu_isset(cpu, sd->span))
5342 printk(KERN_ERR "ERROR: domain->span does not contain "
5344 if (!cpu_isset(cpu, group->cpumask))
5345 printk(KERN_ERR "ERROR: domain->groups does not contain"
5349 for (i = 0; i < level + 2; i++)
5355 printk(KERN_ERR "ERROR: group is NULL\n");
5359 if (!group->__cpu_power) {
5361 printk(KERN_ERR "ERROR: domain->cpu_power not "
5365 if (!cpus_weight(group->cpumask)) {
5367 printk(KERN_ERR "ERROR: empty group\n");
5370 if (cpus_intersects(groupmask, group->cpumask)) {
5372 printk(KERN_ERR "ERROR: repeated CPUs\n");
5375 cpus_or(groupmask, groupmask, group->cpumask);
5377 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5380 group = group->next;
5381 } while (group != sd->groups);
5384 if (!cpus_equal(sd->span, groupmask))
5385 printk(KERN_ERR "ERROR: groups don't span "
5393 if (!cpus_subset(groupmask, sd->span))
5394 printk(KERN_ERR "ERROR: parent span is not a superset "
5395 "of domain->span\n");
5400 # define sched_domain_debug(sd, cpu) do { } while (0)
5403 static int sd_degenerate(struct sched_domain *sd)
5405 if (cpus_weight(sd->span) == 1)
5408 /* Following flags need at least 2 groups */
5409 if (sd->flags & (SD_LOAD_BALANCE |
5410 SD_BALANCE_NEWIDLE |
5414 SD_SHARE_PKG_RESOURCES)) {
5415 if (sd->groups != sd->groups->next)
5419 /* Following flags don't use groups */
5420 if (sd->flags & (SD_WAKE_IDLE |
5429 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5431 unsigned long cflags = sd->flags, pflags = parent->flags;
5433 if (sd_degenerate(parent))
5436 if (!cpus_equal(sd->span, parent->span))
5439 /* Does parent contain flags not in child? */
5440 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5441 if (cflags & SD_WAKE_AFFINE)
5442 pflags &= ~SD_WAKE_BALANCE;
5443 /* Flags needing groups don't count if only 1 group in parent */
5444 if (parent->groups == parent->groups->next) {
5445 pflags &= ~(SD_LOAD_BALANCE |
5446 SD_BALANCE_NEWIDLE |
5450 SD_SHARE_PKG_RESOURCES);
5452 if (~cflags & pflags)
5459 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5460 * hold the hotplug lock.
5462 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5464 struct rq *rq = cpu_rq(cpu);
5465 struct sched_domain *tmp;
5467 /* Remove the sched domains which do not contribute to scheduling. */
5468 for (tmp = sd; tmp; tmp = tmp->parent) {
5469 struct sched_domain *parent = tmp->parent;
5472 if (sd_parent_degenerate(tmp, parent)) {
5473 tmp->parent = parent->parent;
5475 parent->parent->child = tmp;
5479 if (sd && sd_degenerate(sd)) {
5485 sched_domain_debug(sd, cpu);
5487 rcu_assign_pointer(rq->sd, sd);
5490 /* cpus with isolated domains */
5491 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5493 /* Setup the mask of cpus configured for isolated domains */
5494 static int __init isolated_cpu_setup(char *str)
5496 int ints[NR_CPUS], i;
5498 str = get_options(str, ARRAY_SIZE(ints), ints);
5499 cpus_clear(cpu_isolated_map);
5500 for (i = 1; i <= ints[0]; i++)
5501 if (ints[i] < NR_CPUS)
5502 cpu_set(ints[i], cpu_isolated_map);
5506 __setup ("isolcpus=", isolated_cpu_setup);
5509 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5510 * to a function which identifies what group (along with the sched group) a CPU
5511 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5512 * (because we keep track of covered groups with a cpumask_t).
5514 * init_sched_build_groups will build a circular linked list of the groups
5515 * covered by the given span, and will set each group's ->cpumask correctly,
5516 * and ->cpu_power to 0.
5519 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5520 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5521 struct sched_group **sg))
5523 struct sched_group *first = NULL, *last = NULL;
5524 cpumask_t covered = CPU_MASK_NONE;
5527 for_each_cpu_mask(i, span) {
5528 struct sched_group *sg;
5529 int group = group_fn(i, cpu_map, &sg);
5532 if (cpu_isset(i, covered))
5535 sg->cpumask = CPU_MASK_NONE;
5536 sg->__cpu_power = 0;
5538 for_each_cpu_mask(j, span) {
5539 if (group_fn(j, cpu_map, NULL) != group)
5542 cpu_set(j, covered);
5543 cpu_set(j, sg->cpumask);
5554 #define SD_NODES_PER_DOMAIN 16
5559 * find_next_best_node - find the next node to include in a sched_domain
5560 * @node: node whose sched_domain we're building
5561 * @used_nodes: nodes already in the sched_domain
5563 * Find the next node to include in a given scheduling domain. Simply
5564 * finds the closest node not already in the @used_nodes map.
5566 * Should use nodemask_t.
5568 static int find_next_best_node(int node, unsigned long *used_nodes)
5570 int i, n, val, min_val, best_node = 0;
5574 for (i = 0; i < MAX_NUMNODES; i++) {
5575 /* Start at @node */
5576 n = (node + i) % MAX_NUMNODES;
5578 if (!nr_cpus_node(n))
5581 /* Skip already used nodes */
5582 if (test_bit(n, used_nodes))
5585 /* Simple min distance search */
5586 val = node_distance(node, n);
5588 if (val < min_val) {
5594 set_bit(best_node, used_nodes);
5599 * sched_domain_node_span - get a cpumask for a node's sched_domain
5600 * @node: node whose cpumask we're constructing
5601 * @size: number of nodes to include in this span
5603 * Given a node, construct a good cpumask for its sched_domain to span. It
5604 * should be one that prevents unnecessary balancing, but also spreads tasks out optimally.
5607 static cpumask_t sched_domain_node_span(int node)
5609 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5610 cpumask_t span, nodemask;
5614 bitmap_zero(used_nodes, MAX_NUMNODES);
5616 nodemask = node_to_cpumask(node);
5617 cpus_or(span, span, nodemask);
5618 set_bit(node, used_nodes);
5620 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5621 int next_node = find_next_best_node(node, used_nodes);
5623 nodemask = node_to_cpumask(next_node);
5624 cpus_or(span, span, nodemask);
5631 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5634 * SMT sched-domains:
5636 #ifdef CONFIG_SCHED_SMT
5637 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5638 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5640 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5641 struct sched_group **sg)
5644 *sg = &per_cpu(sched_group_cpus, cpu);
5650 * multi-core sched-domains:
5652 #ifdef CONFIG_SCHED_MC
5653 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5654 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5657 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5658 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5659 struct sched_group **sg)
5662 cpumask_t mask = cpu_sibling_map[cpu];
5663 cpus_and(mask, mask, *cpu_map);
5664 group = first_cpu(mask);
5666 *sg = &per_cpu(sched_group_core, group);
5669 #elif defined(CONFIG_SCHED_MC)
5670 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5671 struct sched_group **sg)
5674 *sg = &per_cpu(sched_group_core, cpu);
5679 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5680 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5682 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5683 struct sched_group **sg)
5686 #ifdef CONFIG_SCHED_MC
5687 cpumask_t mask = cpu_coregroup_map(cpu);
5688 cpus_and(mask, mask, *cpu_map);
5689 group = first_cpu(mask);
5690 #elif defined(CONFIG_SCHED_SMT)
5691 cpumask_t mask = cpu_sibling_map[cpu];
5692 cpus_and(mask, mask, *cpu_map);
5693 group = first_cpu(mask);
5698 *sg = &per_cpu(sched_group_phys, group);
5704 * The init_sched_build_groups can't handle what we want to do with node
5705 * groups, so roll our own. Now each node has its own list of groups which
5706 * gets dynamically allocated.
5708 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5709 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5711 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5712 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5714 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5715 struct sched_group **sg)
5717 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5720 cpus_and(nodemask, nodemask, *cpu_map);
5721 group = first_cpu(nodemask);
5724 *sg = &per_cpu(sched_group_allnodes, group);
5728 static void init_numa_sched_groups_power(struct sched_group *group_head)
5730 struct sched_group *sg = group_head;
5736 for_each_cpu_mask(j, sg->cpumask) {
5737 struct sched_domain *sd;
5739 sd = &per_cpu(phys_domains, j);
5740 if (j != first_cpu(sd->groups->cpumask)) {
5742 * Only add "power" once for each physical package.
5748 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5751 if (sg != group_head)
5757 /* Free memory allocated for various sched_group structures */
5758 static void free_sched_groups(const cpumask_t *cpu_map)
5762 for_each_cpu_mask(cpu, *cpu_map) {
5763 struct sched_group **sched_group_nodes
5764 = sched_group_nodes_bycpu[cpu];
5766 if (!sched_group_nodes)
5769 for (i = 0; i < MAX_NUMNODES; i++) {
5770 cpumask_t nodemask = node_to_cpumask(i);
5771 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5773 cpus_and(nodemask, nodemask, *cpu_map);
5774 if (cpus_empty(nodemask))
5784 if (oldsg != sched_group_nodes[i])
5787 kfree(sched_group_nodes);
5788 sched_group_nodes_bycpu[cpu] = NULL;
5792 static void free_sched_groups(const cpumask_t *cpu_map)
5798 * Initialize sched groups cpu_power.
5800 * cpu_power indicates the capacity of a sched group, which is used while
5801 * distributing the load between different sched groups in a sched domain.
5802 * Typically cpu_power for all the groups in a sched domain will be the same,
5803 * unless there are asymmetries in the topology. If there are asymmetries, the
5804 * group having more cpu_power will pick up more load than the group having less.
5807 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5808 * the maximum number of tasks a group can handle in the presence of other idle
5809 * or lightly loaded groups in the same sched domain.
5811 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5813 struct sched_domain *child;
5814 struct sched_group *group;
5816 WARN_ON(!sd || !sd->groups);
5818 if (cpu != first_cpu(sd->groups->cpumask))
5823 sd->groups->__cpu_power = 0;
5826 * For the perf policy, if the groups in the child domain share resources
5827 * (for example cores sharing some portions of the cache hierarchy
5828 * or SMT), then set this domain's groups' cpu_power such that each group
5829 * can handle only one task, when there are other idle groups in the
5830 * same sched domain.
5832 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5834 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5835 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5840 * add cpu_power of each child group to this groups cpu_power
5842 group = child->groups;
5844 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5845 group = group->next;
5846 } while (group != child->groups);
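/*
 * Worked example for the cpu_power rules above (editor's note, illustrative
 * topology): take one physical package with two SMT siblings.  Under the
 * default performance policy the SMT child domain has SD_SHARE_CPUPOWER set,
 * so the package-level group is given just SCHED_LOAD_SCALE -- it advertises
 * room for a single task while other groups are idle.  With
 * SD_POWERSAVINGS_BALANCE set, the else-branch runs instead and the group's
 * cpu_power becomes the sum of its children, i.e. 2 * SCHED_LOAD_SCALE.
 */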
5850 * Build sched domains for a given set of cpus and attach the sched domains
5851 * to the individual cpus
5853 static int build_sched_domains(const cpumask_t *cpu_map)
5857 struct sched_group **sched_group_nodes = NULL;
5858 int sd_allnodes = 0;
5861 * Allocate the per-node list of sched groups
5863 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
5865 if (!sched_group_nodes) {
5866 printk(KERN_WARNING "Can not alloc sched group node list\n");
5869 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5873 * Set up domains for cpus specified by the cpu_map.
5875 for_each_cpu_mask(i, *cpu_map) {
5876 struct sched_domain *sd = NULL, *p;
5877 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5879 cpus_and(nodemask, nodemask, *cpu_map);
5882 if (cpus_weight(*cpu_map) >
5883 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5884 sd = &per_cpu(allnodes_domains, i);
5885 *sd = SD_ALLNODES_INIT;
5886 sd->span = *cpu_map;
5887 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
5893 sd = &per_cpu(node_domains, i);
5895 sd->span = sched_domain_node_span(cpu_to_node(i));
5899 cpus_and(sd->span, sd->span, *cpu_map);
5903 sd = &per_cpu(phys_domains, i);
5905 sd->span = nodemask;
5909 cpu_to_phys_group(i, cpu_map, &sd->groups);
5911 #ifdef CONFIG_SCHED_MC
5913 sd = &per_cpu(core_domains, i);
5915 sd->span = cpu_coregroup_map(i);
5916 cpus_and(sd->span, sd->span, *cpu_map);
5919 cpu_to_core_group(i, cpu_map, &sd->groups);
5922 #ifdef CONFIG_SCHED_SMT
5924 sd = &per_cpu(cpu_domains, i);
5925 *sd = SD_SIBLING_INIT;
5926 sd->span = cpu_sibling_map[i];
5927 cpus_and(sd->span, sd->span, *cpu_map);
5930 cpu_to_cpu_group(i, cpu_map, &sd->groups);
5934 #ifdef CONFIG_SCHED_SMT
5935 /* Set up CPU (sibling) groups */
5936 for_each_cpu_mask(i, *cpu_map) {
5937 cpumask_t this_sibling_map = cpu_sibling_map[i];
5938 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
5939 if (i != first_cpu(this_sibling_map))
5942 init_sched_build_groups(this_sibling_map, cpu_map,
5947 #ifdef CONFIG_SCHED_MC
5948 /* Set up multi-core groups */
5949 for_each_cpu_mask(i, *cpu_map) {
5950 cpumask_t this_core_map = cpu_coregroup_map(i);
5951 cpus_and(this_core_map, this_core_map, *cpu_map);
5952 if (i != first_cpu(this_core_map))
5954 init_sched_build_groups(this_core_map, cpu_map,
5955 &cpu_to_core_group);
5959 /* Set up physical groups */
5960 for (i = 0; i < MAX_NUMNODES; i++) {
5961 cpumask_t nodemask = node_to_cpumask(i);
5963 cpus_and(nodemask, nodemask, *cpu_map);
5964 if (cpus_empty(nodemask))
5967 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
5971 /* Set up node groups */
5973 init_sched_build_groups(*cpu_map, cpu_map,
5974 &cpu_to_allnodes_group);
5976 for (i = 0; i < MAX_NUMNODES; i++) {
5977 /* Set up node groups */
5978 struct sched_group *sg, *prev;
5979 cpumask_t nodemask = node_to_cpumask(i);
5980 cpumask_t domainspan;
5981 cpumask_t covered = CPU_MASK_NONE;
5984 cpus_and(nodemask, nodemask, *cpu_map);
5985 if (cpus_empty(nodemask)) {
5986 sched_group_nodes[i] = NULL;
5990 domainspan = sched_domain_node_span(i);
5991 cpus_and(domainspan, domainspan, *cpu_map);
5993 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
5995 printk(KERN_WARNING "Can not alloc domain group for "
5999 sched_group_nodes[i] = sg;
6000 for_each_cpu_mask(j, nodemask) {
6001 struct sched_domain *sd;
6002 sd = &per_cpu(node_domains, j);
6005 sg->__cpu_power = 0;
6006 sg->cpumask = nodemask;
6008 cpus_or(covered, covered, nodemask);
6011 for (j = 0; j < MAX_NUMNODES; j++) {
6012 cpumask_t tmp, notcovered;
6013 int n = (i + j) % MAX_NUMNODES;
6015 cpus_complement(notcovered, covered);
6016 cpus_and(tmp, notcovered, *cpu_map);
6017 cpus_and(tmp, tmp, domainspan);
6018 if (cpus_empty(tmp))
6021 nodemask = node_to_cpumask(n);
6022 cpus_and(tmp, tmp, nodemask);
6023 if (cpus_empty(tmp))
6026 sg = kmalloc_node(sizeof(struct sched_group),
6030 "Can not alloc domain group for node %d\n", j);
6033 sg->__cpu_power = 0;
6035 sg->next = prev->next;
6036 cpus_or(covered, covered, tmp);
6043 /* Calculate CPU power for physical packages and nodes */
6044 #ifdef CONFIG_SCHED_SMT
6045 for_each_cpu_mask(i, *cpu_map) {
6046 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6048 init_sched_groups_power(i, sd);
6051 #ifdef CONFIG_SCHED_MC
6052 for_each_cpu_mask(i, *cpu_map) {
6053 struct sched_domain *sd = &per_cpu(core_domains, i);
6055 init_sched_groups_power(i, sd);
6059 for_each_cpu_mask(i, *cpu_map) {
6060 struct sched_domain *sd = &per_cpu(phys_domains, i);
6062 init_sched_groups_power(i, sd);
6066 for (i = 0; i < MAX_NUMNODES; i++)
6067 init_numa_sched_groups_power(sched_group_nodes[i]);
6070 struct sched_group *sg;
6072 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6073 init_numa_sched_groups_power(sg);
6077 /* Attach the domains */
6078 for_each_cpu_mask(i, *cpu_map) {
6079 struct sched_domain *sd;
6080 #ifdef CONFIG_SCHED_SMT
6081 sd = &per_cpu(cpu_domains, i);
6082 #elif defined(CONFIG_SCHED_MC)
6083 sd = &per_cpu(core_domains, i);
6085 sd = &per_cpu(phys_domains, i);
6087 cpu_attach_domain(sd, i);
6094 free_sched_groups(cpu_map);
6099 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6101 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6103 cpumask_t cpu_default_map;
6107 * Setup mask for cpus without special case scheduling requirements.
6108 * For now this just excludes isolated cpus, but could be used to
6109 * exclude other special cases in the future.
6111 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6113 err = build_sched_domains(&cpu_default_map);
6118 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6120 free_sched_groups(cpu_map);
6124 * Detach sched domains from a group of cpus specified in cpu_map.
6125 * These cpus will now be attached to the NULL domain.
6127 static void detach_destroy_domains(const cpumask_t *cpu_map)
6131 for_each_cpu_mask(i, *cpu_map)
6132 cpu_attach_domain(NULL, i);
6133 synchronize_sched();
6134 arch_destroy_sched_domains(cpu_map);
6138 * Partition sched domains as specified by the cpumasks below.
6139 * This attaches all cpus from the cpumasks to the NULL domain,
6140 * waits for an RCU quiescent period, recalculates sched
6141 * domain information and then attaches them back to the
6142 * correct sched domains.
6143 * Call with the hotplug lock held.
6145 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6147 cpumask_t change_map;
6150 cpus_and(*partition1, *partition1, cpu_online_map);
6151 cpus_and(*partition2, *partition2, cpu_online_map);
6152 cpus_or(change_map, *partition1, *partition2);
6154 /* Detach sched domains from all of the affected cpus */
6155 detach_destroy_domains(&change_map);
6156 if (!cpus_empty(*partition1))
6157 err = build_sched_domains(partition1);
6158 if (!err && !cpus_empty(*partition2))
6159 err = build_sched_domains(partition2);
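/*
 * Editor's usage sketch for partition_sched_domains() above (hedged: the
 * caller and mask names are illustrative; the hotplug lock must be held,
 * and real callers are e.g. the cpuset code):
 */
#if 0
static void example_partition(void)
{
	cpumask_t part_a = cpumask_of_cpu(0);
	cpumask_t part_b;

	/* everything online except CPU 0 goes into the second partition */
	cpus_andnot(part_b, cpu_online_map, part_a);
	partition_sched_domains(&part_a, &part_b);
}
#endif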
6164 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6165 int arch_reinit_sched_domains(void)
6169 mutex_lock(&sched_hotcpu_mutex);
6170 detach_destroy_domains(&cpu_online_map);
6171 err = arch_init_sched_domains(&cpu_online_map);
6172 mutex_unlock(&sched_hotcpu_mutex);
6177 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6181 if (buf[0] != '0' && buf[0] != '1')
6185 sched_smt_power_savings = (buf[0] == '1');
6187 sched_mc_power_savings = (buf[0] == '1');
6189 ret = arch_reinit_sched_domains();
6191 return ret ? ret : count;
6194 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6198 #ifdef CONFIG_SCHED_SMT
6200 err = sysfs_create_file(&cls->kset.kobj,
6201 &attr_sched_smt_power_savings.attr);
6203 #ifdef CONFIG_SCHED_MC
6204 if (!err && mc_capable())
6205 err = sysfs_create_file(&cls->kset.kobj,
6206 &attr_sched_mc_power_savings.attr);
6212 #ifdef CONFIG_SCHED_MC
6213 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6215 return sprintf(page, "%u\n", sched_mc_power_savings);
6217 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6218 const char *buf, size_t count)
6220 return sched_power_savings_store(buf, count, 0);
6222 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6223 sched_mc_power_savings_store);
6226 #ifdef CONFIG_SCHED_SMT
6227 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6229 return sprintf(page, "%u\n", sched_smt_power_savings);
6231 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6232 const char *buf, size_t count)
6234 return sched_power_savings_store(buf, count, 1);
6236 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6237 sched_smt_power_savings_store);
6241 * Force a reinitialization of the sched domains hierarchy. The domains
6242 * and groups cannot be updated in place without racing with the balancing
6243 * code, so we temporarily attach all running cpus to the NULL domain
6244 * which will prevent rebalancing while the sched domains are recalculated.
6246 static int update_sched_domains(struct notifier_block *nfb,
6247 unsigned long action, void *hcpu)
6250 case CPU_UP_PREPARE:
6251 case CPU_UP_PREPARE_FROZEN:
6252 case CPU_DOWN_PREPARE:
6253 case CPU_DOWN_PREPARE_FROZEN:
6254 detach_destroy_domains(&cpu_online_map);
6257 case CPU_UP_CANCELED:
6258 case CPU_UP_CANCELED_FROZEN:
6259 case CPU_DOWN_FAILED:
6260 case CPU_DOWN_FAILED_FROZEN:
6262 case CPU_ONLINE_FROZEN:
6264 case CPU_DEAD_FROZEN:
6266 * Fall through and re-initialise the domains.
6273 /* The hotplug lock is already held by cpu_up/cpu_down */
6274 arch_init_sched_domains(&cpu_online_map);
6279 void __init sched_init_smp(void)
6281 cpumask_t non_isolated_cpus;
6283 mutex_lock(&sched_hotcpu_mutex);
6284 arch_init_sched_domains(&cpu_online_map);
6285 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6286 if (cpus_empty(non_isolated_cpus))
6287 cpu_set(smp_processor_id(), non_isolated_cpus);
6288 mutex_unlock(&sched_hotcpu_mutex);
6289 /* XXX: Theoretical race here - CPU may be hotplugged now */
6290 hotcpu_notifier(update_sched_domains, 0);
6292 /* Move init over to a non-isolated CPU */
6293 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6295 sched_init_granularity();
6298 void __init sched_init_smp(void)
6300 sched_init_granularity();
6302 #endif /* CONFIG_SMP */
6304 int in_sched_functions(unsigned long addr)
6306 /* Linker adds these: start and end of __sched functions */
6307 extern char __sched_text_start[], __sched_text_end[];
6309 return in_lock_functions(addr) ||
6310 (addr >= (unsigned long)__sched_text_start
6311 && addr < (unsigned long)__sched_text_end);
6314 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6316 cfs_rq->tasks_timeline = RB_ROOT;
6317 cfs_rq->fair_clock = 1;
6318 #ifdef CONFIG_FAIR_GROUP_SCHED
6323 void __init sched_init(void)
6325 u64 now = sched_clock();
6326 int highest_cpu = 0;
6330 * Link up the scheduling class hierarchy:
6332 rt_sched_class.next = &fair_sched_class;
6333 fair_sched_class.next = &idle_sched_class;
6334 idle_sched_class.next = NULL;
6336 for_each_possible_cpu(i) {
6337 struct rt_prio_array *array;
6341 spin_lock_init(&rq->lock);
6342 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6345 init_cfs_rq(&rq->cfs, rq);
6346 #ifdef CONFIG_FAIR_GROUP_SCHED
6347 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6348 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6350 rq->ls.load_update_last = now;
6351 rq->ls.load_update_start = now;
6353 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6354 rq->cpu_load[j] = 0;
6357 rq->active_balance = 0;
6358 rq->next_balance = jiffies;
6361 rq->migration_thread = NULL;
6362 INIT_LIST_HEAD(&rq->migration_queue);
6364 atomic_set(&rq->nr_iowait, 0);
6366 array = &rq->rt.active;
6367 for (j = 0; j < MAX_RT_PRIO; j++) {
6368 INIT_LIST_HEAD(array->queue + j);
6369 __clear_bit(j, array->bitmap);
6372 /* delimiter for bitsearch: */
6373 __set_bit(MAX_RT_PRIO, array->bitmap);
6376 set_load_weight(&init_task);
6379 nr_cpu_ids = highest_cpu + 1;
6380 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6383 #ifdef CONFIG_RT_MUTEXES
6384 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6388 * The boot idle thread does lazy MMU switching as well:
6390 atomic_inc(&init_mm.mm_count);
6391 enter_lazy_tlb(&init_mm, current);
6394 * Make us the idle thread. Technically, schedule() should not be
6395 * called from this thread; however, somewhere below it might be,
6396 * but because we are the idle thread, we just pick up running again
6397 * when this runqueue becomes "idle".
6399 init_idle(current, smp_processor_id());
6401 * During early bootup we pretend to be a normal task:
6403 current->sched_class = &fair_sched_class;
6406 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6407 void __might_sleep(char *file, int line)
6410 static unsigned long prev_jiffy; /* ratelimiting */
6412 if ((in_atomic() || irqs_disabled()) &&
6413 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6414 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6416 prev_jiffy = jiffies;
6417 printk(KERN_ERR "BUG: sleeping function called from invalid"
6418 " context at %s:%d\n", file, line);
6419 printk("in_atomic():%d, irqs_disabled():%d\n",
6420 in_atomic(), irqs_disabled());
6421 debug_show_held_locks(current);
6422 if (irqs_disabled())
6423 print_irqtrace_events(current);
6428 EXPORT_SYMBOL(__might_sleep);
6431 #ifdef CONFIG_MAGIC_SYSRQ
6432 void normalize_rt_tasks(void)
6434 struct task_struct *g, *p;
6435 unsigned long flags;
6439 read_lock_irq(&tasklist_lock);
6440 do_each_thread(g, p) {
6442 p->se.wait_runtime = 0;
6443 p->se.wait_start_fair = 0;
6444 p->se.wait_start = 0;
6445 p->se.exec_start = 0;
6446 p->se.sleep_start = 0;
6447 p->se.sleep_start_fair = 0;
6448 p->se.block_start = 0;
6449 task_rq(p)->cfs.fair_clock = 0;
6450 task_rq(p)->clock = 0;
6454 * Renice negative nice level userspace tasks back to 0:
6457 if (TASK_NICE(p) < 0 && p->mm)
6458 set_user_nice(p, 0);
6462 spin_lock_irqsave(&p->pi_lock, flags);
6463 rq = __task_rq_lock(p);
6466 * Do not touch the migration thread:
6468 if (p == rq->migration_thread)
6472 on_rq = p->se.on_rq;
6474 deactivate_task(task_rq(p), p, 0);
6475 __setscheduler(rq, p, SCHED_NORMAL, 0);
6477 activate_task(task_rq(p), p, 0);
6478 resched_task(rq->curr);
6483 __task_rq_unlock(rq);
6484 spin_unlock_irqrestore(&p->pi_lock, flags);
6485 } while_each_thread(g, p);
6487 read_unlock_irq(&tasklist_lock);
6490 #endif /* CONFIG_MAGIC_SYSRQ */
6494 * These functions are only useful for the IA64 MCA handling.
6496 * They can only be called when the whole system has been
6497 * stopped - every CPU needs to be quiescent, and no scheduling
6498 * activity can take place. Using them for anything else would
6499 * be a serious bug, and as a result, they aren't even visible
6500 * under any other configuration.
6504 * curr_task - return the current task for a given cpu.
6505 * @cpu: the processor in question.
6507 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6509 struct task_struct *curr_task(int cpu)
6511 return cpu_curr(cpu);
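/*
 * Editor's sketch of the save/switch/restore pattern that curr_task() and
 * set_curr_task() (described below) are meant for (hedged: the function and
 * variable names are illustrative, and the whole system is assumed to be
 * stopped as the comments require):
 */
#if 0
static void example_mca_switch(int cpu, struct task_struct *mca_task)
{
	struct task_struct *orig = curr_task(cpu);

	set_curr_task(cpu, mca_task);
	/* ... run the MCA handler on its own stack ... */
	set_curr_task(cpu, orig);
}
#endif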
6515 * set_curr_task - set the current task for a given cpu.
6516 * @cpu: the processor in question.
6517 * @p: the task pointer to set.
6519 * Description: This function must only be used when non-maskable interrupts
6520 * are serviced on a separate stack. It allows the architecture to switch the
6521 * notion of the current task on a cpu in a non-blocking manner. This function
6522 * must be called with all CPUs synchronized and interrupts disabled, and the
6523 * caller must save the original value of the current task (see
6524 * curr_task() above) and restore that value before reenabling interrupts and
6525 * re-starting the system.
6527 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6529 void set_curr_task(int cpu, struct task_struct *p)