*/
static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_SMP
+/* Returns non-zero when root_task_group's children list is empty. */
+static int root_task_group_empty(void)
+{
+ return list_empty(&root_task_group.children);
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
#else
+#ifdef CONFIG_SMP
+/*
+ * Stub variant: unconditionally returns 1, so callers always see the
+ * root task group as empty in this configuration branch.
+ */
+static int root_task_group_empty(void)
+{
+ return 1;
+}
+#endif
+
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
/* sys_sched_yield() stats */
- unsigned int yld_exp_empty;
- unsigned int yld_act_empty;
- unsigned int yld_both_empty;
unsigned int yld_count;
/* schedule() stats */
assert_spin_locked(&task_rq(p)->lock);
- if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+ if (test_tsk_need_resched(p))
return;
- set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+ set_tsk_need_resched(p);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
* lockless. The worst case is that the other CPU runs the
* idle task through an additional NOOP schedule()
*/
- set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+ set_tsk_need_resched(rq->idle);
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
* it must be off the runqueue _entirely_, and not
* preempted!
*
- * So if it wa still runnable (but just not actively
+ * So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
sync = 0;
#ifdef CONFIG_SMP
- if (sched_feat(LB_WAKEUP_UPDATE)) {
+ if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
struct sched_domain *sd;
this_cpu = raw_smp_processor_id();
#ifdef CONFIG_PREEMPT_NOTIFIERS
/**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned)
{
+ int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) too many balance attempts have failed.
*/
- if (!task_hot(p, rq->clock, sd) ||
- sd->nr_balance_failed > sd->cache_nice_tries) {
+ tsk_cache_hot = task_hot(p, rq->clock, sd);
+ if (!tsk_cache_hot ||
+ sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(sd, lb_hot_gained[idle]);
schedstat_inc(p, se.nr_forced_migrations);
}
return 1;
}
- if (task_hot(p, rq->clock, sd)) {
+ if (tsk_cache_hot) {
schedstat_inc(p, se.nr_failed_migrations_hot);
return 0;
}
return 0;
}
+/********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /* Statistics of this (local) group */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+
+ /* Statistics of the busiest group */
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+
+ int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ int power_savings_balance; /* Is powersave balance needed for this sd */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /* Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_nr_running; /* Nr tasks running in the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long group_capacity; /* group->__cpu_power / SCHED_LOAD_SCALE */
+ int group_imb; /* Is there an imbalance in the group ? */
+};
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
*/
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline unsigned int group_first_cpu(struct sched_group *group)
{
- struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
- unsigned long max_load, avg_load, total_load, this_load, total_pwr;
- unsigned long max_pull;
- unsigned long busiest_load_per_task, busiest_nr_running;
- unsigned long this_load_per_task, this_nr_running;
- int load_idx, group_imb = 0;
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int power_savings_balance = 1;
- unsigned long leader_nr_running = 0, min_load_per_task = 0;
- unsigned long min_nr_running = ULONG_MAX;
- struct sched_group *group_min = NULL, *group_leader = NULL;
-#endif
+ return cpumask_first(sched_group_cpus(group));
+}
- max_load = this_load = total_load = total_pwr = 0;
- busiest_load_per_task = busiest_nr_running = 0;
- this_load_per_task = this_nr_running = 0;
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ int load_idx;
- if (idle == CPU_NOT_IDLE)
+ switch (idle) {
+ case CPU_NOT_IDLE:
load_idx = sd->busy_idx;
- else if (idle == CPU_NEWLY_IDLE)
+ break;
+
+ case CPU_NEWLY_IDLE:
load_idx = sd->newidle_idx;
- else
+ break;
+ default:
load_idx = sd->idle_idx;
+ break;
+ }
- do {
- unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
- int local_group;
- int i;
- int __group_imb = 0;
- unsigned int balance_cpu = -1, first_idle_cpu = 0;
- unsigned long sum_nr_running, sum_weighted_load;
- unsigned long sum_avg_load_per_task;
- unsigned long avg_load_per_task;
-
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_cpus(group));
+ return load_idx;
+}
- if (local_group)
- balance_cpu = cpumask_first(sched_group_cpus(group));
- /* Tally up the load of all CPUs in the group */
- sum_weighted_load = sum_nr_running = avg_load = 0;
- sum_avg_load_per_task = avg_load_per_task = 0;
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+ enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ int local_group, const struct cpumask *cpus,
+ int *balance, struct sg_lb_stats *sgs)
+{
+ unsigned long load, max_cpu_load, min_cpu_load;
+ int i;
+ unsigned int balance_cpu = -1, first_idle_cpu = 0;
+ unsigned long sum_avg_load_per_task;
+ unsigned long avg_load_per_task;
- max_cpu_load = 0;
- min_cpu_load = ~0UL;
+ if (local_group)
+ balance_cpu = group_first_cpu(group);
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
- struct rq *rq = cpu_rq(i);
+ /* Tally up the load of all CPUs in the group */
+ sum_avg_load_per_task = avg_load_per_task = 0;
+ max_cpu_load = 0;
+ min_cpu_load = ~0UL;
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
+ for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ struct rq *rq = cpu_rq(i);
- /* Bias balancing toward cpus of our domain */
- if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
- first_idle_cpu = 1;
- balance_cpu = i;
- }
+ if (*sd_idle && rq->nr_running)
+ *sd_idle = 0;
- load = target_load(i, load_idx);
- } else {
- load = source_load(i, load_idx);
- if (load > max_cpu_load)
- max_cpu_load = load;
- if (min_cpu_load > load)
- min_cpu_load = load;
+ /* Bias balancing toward cpus of our domain */
+ if (local_group) {
+ if (idle_cpu(i) && !first_idle_cpu) {
+ first_idle_cpu = 1;
+ balance_cpu = i;
}
- avg_load += load;
- sum_nr_running += rq->nr_running;
- sum_weighted_load += weighted_cpuload(i);
-
- sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ load = target_load(i, load_idx);
+ } else {
+ load = source_load(i, load_idx);
+ if (load > max_cpu_load)
+ max_cpu_load = load;
+ if (min_cpu_load > load)
+ min_cpu_load = load;
}
- /*
- * First idle cpu or the first cpu(busiest) in this sched group
- * is eligible for doing load balancing at this and above
- * domains. In the newly idle case, we will allow all the cpu's
- * to do the newly idle load balance.
- */
- if (idle != CPU_NEWLY_IDLE && local_group &&
- balance_cpu != this_cpu && balance) {
- *balance = 0;
- goto ret;
- }
+ sgs->group_load += load;
+ sgs->sum_nr_running += rq->nr_running;
+ sgs->sum_weighted_load += weighted_cpuload(i);
- total_load += avg_load;
- total_pwr += group->__cpu_power;
+ sum_avg_load_per_task += cpu_avg_load_per_task(i);
+ }
- /* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ /*
+ * First idle cpu or the first cpu(busiest) in this sched group
+ * is eligible for doing load balancing at this and above
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
+ */
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
+ *balance = 0;
+ return;
+ }
+ /* Adjust by relative CPU power of the group */
+ sgs->avg_load = sg_div_cpu_power(group,
+ sgs->group_load * SCHED_LOAD_SCALE);
- /*
- * Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
- *
- * APZ: with cgroup the avg task weight can vary wildly and
- * might not be a suitable number - should we keep a
- * normalized nr_running number somewhere that negates
- * the hierarchy?
- */
- avg_load_per_task = sg_div_cpu_power(group,
- sum_avg_load_per_task * SCHED_LOAD_SCALE);
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
- __group_imb = 1;
+ /*
+ * Consider the group unbalanced when the imbalance is larger
+ * than the average weight of two tasks.
+ *
+ * APZ: with cgroup the avg task weight can vary wildly and
+ * might not be a suitable number - should we keep a
+ * normalized nr_running number somewhere that negates
+ * the hierarchy?
+ */
+ avg_load_per_task = sg_div_cpu_power(group,
+ sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ sgs->group_imb = 1;
+
+ sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
+
+/**
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+ enum cpu_idle_type idle, int *sd_idle,
+ const struct cpumask *cpus, int *balance,
+ struct sd_lb_stats *sds)
+{
+ struct sched_group *group = sd->groups;
+ struct sg_lb_stats sgs;
+ int load_idx;
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+ sds->power_savings_balance = 1;
+ sds->min_nr_running = ULONG_MAX;
+#endif
+ load_idx = get_sd_load_idx(sd, idle);
+
+ do {
+ int local_group;
+
+ local_group = cpumask_test_cpu(this_cpu,
+ sched_group_cpus(group));
+ memset(&sgs, 0, sizeof(sgs));
+ update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+ local_group, cpus, balance, &sgs);
- group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+ if (local_group && balance && !(*balance))
+ return;
+
+ sds->total_load += sgs.group_load;
+ sds->total_pwr += group->__cpu_power;
if (local_group) {
- this_load = avg_load;
- this = group;
- this_nr_running = sum_nr_running;
- this_load_per_task = sum_weighted_load;
- } else if (avg_load > max_load &&
- (sum_nr_running > group_capacity || __group_imb)) {
- max_load = avg_load;
- busiest = group;
- busiest_nr_running = sum_nr_running;
- busiest_load_per_task = sum_weighted_load;
- group_imb = __group_imb;
+ sds->this_load = sgs.avg_load;
+ sds->this = group;
+ sds->this_nr_running = sgs.sum_nr_running;
+ sds->this_load_per_task = sgs.sum_weighted_load;
+ } else if (sgs.avg_load > sds->max_load &&
+ (sgs.sum_nr_running > sgs.group_capacity ||
+ sgs.group_imb)) {
+ sds->max_load = sgs.avg_load;
+ sds->busiest = group;
+ sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->group_imb = sgs.group_imb;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
* If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
- if (local_group && (this_nr_running >= group_capacity ||
- !this_nr_running))
- power_savings_balance = 0;
+ if (local_group &&
+ (sds->this_nr_running >= sgs.group_capacity ||
+ !sds->this_nr_running))
+ sds->power_savings_balance = 0;
/*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
- if (!power_savings_balance || sum_nr_running >= group_capacity
- || !sum_nr_running)
+ if (!sds->power_savings_balance ||
+ sgs.sum_nr_running >= sgs.group_capacity ||
+ !sgs.sum_nr_running)
goto group_next;
/*
* This is the group from where we need to pick up the load
* for saving power
*/
- if ((sum_nr_running < min_nr_running) ||
- (sum_nr_running == min_nr_running &&
- cpumask_first(sched_group_cpus(group)) >
- cpumask_first(sched_group_cpus(group_min)))) {
- group_min = group;
- min_nr_running = sum_nr_running;
- min_load_per_task = sum_weighted_load /
- sum_nr_running;
+ if ((sgs.sum_nr_running < sds->min_nr_running) ||
+ (sgs.sum_nr_running == sds->min_nr_running &&
+ group_first_cpu(group) >
+ group_first_cpu(sds->group_min))) {
+ sds->group_min = group;
+ sds->min_nr_running = sgs.sum_nr_running;
+ sds->min_load_per_task = sgs.sum_weighted_load /
+ sgs.sum_nr_running;
}
/*
* capacity but still has some space to pick up some load
* from other group and save more power
*/
- if (sum_nr_running <= group_capacity - 1) {
- if (sum_nr_running > leader_nr_running ||
- (sum_nr_running == leader_nr_running &&
- cpumask_first(sched_group_cpus(group)) <
- cpumask_first(sched_group_cpus(group_leader)))) {
- group_leader = group;
- leader_nr_running = sum_nr_running;
- }
+ if (sgs.sum_nr_running > sgs.group_capacity - 1)
+ goto group_next;
+
+ if (sgs.sum_nr_running > sds->leader_nr_running ||
+ (sgs.sum_nr_running == sds->leader_nr_running &&
+ group_first_cpu(group) <
+ group_first_cpu(sds->group_leader))) {
+ sds->group_leader = group;
+ sds->leader_nr_running = sgs.sum_nr_running;
}
group_next:
#endif
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+}
+/******* find_busiest_group() helpers end here *********************/
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+ unsigned long *imbalance, enum cpu_idle_type idle,
+ int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+ struct sd_lb_stats sds;
+ unsigned long max_pull;
+
+ memset(&sds, 0, sizeof(sds));
+
+ /*
+ * Compute the various statistics relevant for load balancing at
+ * this level.
+ */
+ update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+ balance, &sds);
+
+ if (balance && !(*balance))
+ goto ret;
+
+ if (!sds.busiest || sds.this_load >= sds.max_load
+ || sds.busiest_nr_running == 0)
goto out_balanced;
- avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+ sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
- if (this_load >= avg_load ||
- 100*max_load <= sd->imbalance_pct*this_load)
+ if (sds.this_load >= sds.avg_load ||
+ 100*sds.max_load <= sd->imbalance_pct * sds.this_load)
goto out_balanced;
- busiest_load_per_task /= busiest_nr_running;
- if (group_imb)
- busiest_load_per_task = min(busiest_load_per_task, avg_load);
+ sds.busiest_load_per_task /= sds.busiest_nr_running;
+ if (sds.group_imb)
+ sds.busiest_load_per_task =
+ min(sds.busiest_load_per_task, sds.avg_load);
/*
* We're trying to get all the cpus to the average_load, so we don't
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
- if (max_load <= busiest_load_per_task)
+ if (sds.max_load <= sds.busiest_load_per_task)
goto out_balanced;
/*
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
- if (max_load < avg_load) {
+ if (sds.max_load < sds.avg_load) {
*imbalance = 0;
goto small_imbalance;
}
/* Don't want to pull so many tasks that a group would go idle */
- max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+ max_pull = min(sds.max_load - sds.avg_load,
+ sds.max_load - sds.busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->__cpu_power,
- (avg_load - this_load) * this->__cpu_power)
+ *imbalance = min(max_pull * sds.busiest->__cpu_power,
+ (sds.avg_load - sds.this_load) * sds.this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
* a think about bumping its value to force at least one task to be
* moved
*/
- if (*imbalance < busiest_load_per_task) {
+ if (*imbalance < sds.busiest_load_per_task) {
unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
small_imbalance:
pwr_move = pwr_now = 0;
imbn = 2;
- if (this_nr_running) {
- this_load_per_task /= this_nr_running;
- if (busiest_load_per_task > this_load_per_task)
+ if (sds.this_nr_running) {
+ sds.this_load_per_task /= sds.this_nr_running;
+ if (sds.busiest_load_per_task >
+ sds.this_load_per_task)
imbn = 1;
} else
- this_load_per_task = cpu_avg_load_per_task(this_cpu);
-
- if (max_load - this_load + busiest_load_per_task >=
- busiest_load_per_task * imbn) {
- *imbalance = busiest_load_per_task;
- return busiest;
+ sds.this_load_per_task =
+ cpu_avg_load_per_task(this_cpu);
+
+ if (sds.max_load - sds.this_load +
+ sds.busiest_load_per_task >=
+ sds.busiest_load_per_task * imbn) {
+ *imbalance = sds.busiest_load_per_task;
+ return sds.busiest;
}
/*
* moving them.
*/
- pwr_now += busiest->__cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->__cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += sds.busiest->__cpu_power *
+ min(sds.busiest_load_per_task, sds.max_load);
+ pwr_now += sds.this->__cpu_power *
+ min(sds.this_load_per_task, sds.this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(busiest,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- if (max_load > tmp)
- pwr_move += busiest->__cpu_power *
- min(busiest_load_per_task, max_load - tmp);
+ tmp = sg_div_cpu_power(sds.busiest,
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+ if (sds.max_load > tmp)
+ pwr_move += sds.busiest->__cpu_power *
+ min(sds.busiest_load_per_task,
+ sds.max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->__cpu_power <
- busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(this,
- max_load * busiest->__cpu_power);
+ if (sds.max_load * sds.busiest->__cpu_power <
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE)
+ tmp = sg_div_cpu_power(sds.this,
+ sds.max_load * sds.busiest->__cpu_power);
else
- tmp = sg_div_cpu_power(this,
- busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += this->__cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(sds.this,
+ sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += sds.this->__cpu_power *
+ min(sds.this_load_per_task,
+ sds.this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
- *imbalance = busiest_load_per_task;
+ *imbalance = sds.busiest_load_per_task;
}
- return busiest;
+ return sds.busiest;
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
- if (this == group_leader && group_leader != group_min) {
- *imbalance = min_load_per_task;
- if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
- cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
- cpumask_first(sched_group_cpus(group_leader));
- }
- return group_min;
+ if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
+ goto ret;
+
+ *imbalance = sds.min_load_per_task;
+ if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+ cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+ group_first_cpu(sds.group_leader);
}
+ return sds.group_min;
+
#endif
ret:
*imbalance = 0;
#endif
}
+/*
+ * Returns non-zero when the runqueue of @cpu has no sched domain
+ * attached, i.e. its ->sd pointer (read via rcu_dereference) is NULL.
+ */
+static inline int on_null_domain(int cpu)
+{
+ return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*
cpumask_test_cpu(cpu, nohz.cpu_mask))
return;
#endif
- if (time_after_eq(jiffies, rq->next_balance))
+ /* Don't need to rebalance while attached to NULL domain */
+ if (time_after_eq(jiffies, rq->next_balance) &&
+ likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
}
#endif
}
+/*
+ * Hand @prev back to its scheduling class via ->put_prev_task().
+ *
+ * If @prev is still TASK_RUNNING, first feed the runtime it accumulated
+ * since se.prev_sum_exec_runtime (capped at 2*sysctl_sched_migration_cost)
+ * into se.avg_overlap.
+ */
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->state == TASK_RUNNING) {
+ u64 runtime = prev->se.sum_exec_runtime;
+
+ runtime -= prev->se.prev_sum_exec_runtime;
+ runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+ /*
+ * In order to avoid avg_overlap growing stale when we are
+ * indeed overlapping and hence not getting put to sleep, grow
+ * the avg_overlap on preemption.
+ *
+ * We use the average preemption runtime because that
+ * correlates to the amount of cache footprint a task can
+ * build up.
+ */
+ update_avg(&prev->se.avg_overlap, runtime);
+ }
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
{
const struct sched_class *class;
struct task_struct *p;
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
- prev->sched_class->put_prev_task(rq, prev);
- next = pick_next_task(rq, prev);
+ put_prev_task(rq, prev);
+ next = pick_next_task(rq);
if (likely(prev != next)) {
sched_info_switch(prev, next);
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
EXPORT_SYMBOL(preempt_schedule);
* between schedule and now.
*/
barrier();
- } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+ } while (need_resched());
}
#endif /* CONFIG_PREEMPT */
if (!rq->nr_running)
break;
update_rq_clock(rq);
- next = pick_next_task(rq, rq->curr);
+ next = pick_next_task(rq);
if (!next)
break;
next->sched_class->put_prev_task(rq, next);