X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=e76b11ca6df320648ae32eb122afa80fc4c4faca;hb=edfaa7c36574f1bf09c65ad602412db9da5f96bf;hp=98dcdf272db373c8459a552f1db46687a1600987;hpb=c99da91e7a12724127475a85cc7a38214b3504e2;p=linux-2.6-omap-h63xx.git diff --git a/kernel/sched.c b/kernel/sched.c index 98dcdf272db..e76b11ca6df 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -209,9 +209,8 @@ static inline struct task_group *task_group(struct task_struct *p) tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), struct task_group, css); #else - tg = &init_task_group; + tg = &init_task_group; #endif - return tg; } @@ -249,15 +248,16 @@ struct cfs_rq { #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in * a hierarchy). Non-leaf lrqs hold other higher schedulable entities * (like users, containers etc.) * * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This * list is used during load balance. */ - struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ - struct task_group *tg; /* group that "owns" this runqueue */ + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ #endif }; @@ -300,7 +300,7 @@ struct rq { /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; #endif - struct rt_rq rt; + struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -457,8 +457,8 @@ enum { SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, SCHED_FEAT_WAKEUP_PREEMPT = 2, SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_TREE_AVG = 8, - SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_TREE_AVG = 8, + SCHED_FEAT_APPROX_AVG = 16, }; const_debug unsigned int sysctl_sched_features = @@ -488,7 +488,12 @@ unsigned long long cpu_clock(int cpu) local_irq_save(flags); rq = cpu_rq(cpu); - update_rq_clock(rq); + /* + * Only call sched_clock() if the scheduler has already been + * initialized (some code might call cpu_clock() very early): + */ + if (rq->idle) + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); @@ -503,10 +508,15 @@ EXPORT_SYMBOL_GPL(cpu_clock); # define finish_arch_switch(prev) do { } while (0) #endif +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { - return rq->curr == p; + return task_current(rq, p); } static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) @@ -535,7 +545,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP return p->oncpu; #else - return rq->curr == p; + return task_current(rq, p); #endif } @@ -591,7 +601,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) /* * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without + * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) @@ -658,6 +668,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) struct rq *rq = cpu_rq(smp_processor_id()); u64 now = sched_clock(); + touch_softlockup_watchdog(); rq->idle_clock += delta_ns; /* * Override the previous timestamp and ignore all @@ -779,7 +790,7 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a * scaled version of the new time slice allocation that they receive on time * slice expiry etc. */ @@ -854,6 +865,12 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +#ifdef CONFIG_CGROUP_CPUACCT +static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1848,7 +1865,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * and do any other architecture-specific cleanup actions. * * Note that we may have delayed dropping an mm in context_switch(). If - * so, we finish that here outside of the runqueue lock. (Doing it + * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) */ @@ -2130,7 +2147,7 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only - * allow dest_cpu, which will force the cpu onto dest_cpu. Then + * allow dest_cpu, which will force the cpu onto dest_cpu. Then * the cpu_allowed mask is restored. */ static void sched_migrate_task(struct task_struct *p, int dest_cpu) @@ -2575,7 +2592,7 @@ group_next: * tasks around. Thus we look for the minimum possible imbalance. * Negative imbalances (*we* are more loaded than anyone else) will * be counted as no imbalance for these purposes -- we can't fix that - * by pulling tasks to us. Be careful of negative numbers as they'll + * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ if (max_load <= busiest_load_per_task) @@ -3010,7 +3027,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by + * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-cpu setup. */ BUG_ON(busiest_rq == target_rq); @@ -3042,7 +3059,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) #ifdef CONFIG_NO_HZ static struct { atomic_t load_balancer; - cpumask_t cpu_mask; + cpumask_t cpu_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), .cpu_mask = CPU_MASK_NONE, @@ -3323,7 +3340,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime; - if (rq->curr == p) { + if (task_current(rq, p)) { update_rq_clock(rq); delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) @@ -3546,7 +3563,7 @@ static noinline void __schedule_bug(struct task_struct *prev) static inline void schedule_debug(struct task_struct *prev) { /* - * Test if we are atomic. Since do_exit() needs to call into + * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ @@ -3668,7 +3685,7 @@ EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption - * off of preempt_enable. Kernel preemptions off return from interrupt + * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ asmlinkage void __sched preempt_schedule(void) @@ -3680,7 +3697,7 @@ asmlinkage void __sched preempt_schedule(void) #endif /* * If there is a non-zero preempt_count or interrupts are disabled, - * we do not want to preempt the current task. Just return.. + * we do not want to preempt the current task. Just return.. */ if (likely(ti->preempt_count || irqs_disabled())) return; @@ -3766,12 +3783,12 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, EXPORT_SYMBOL(default_wake_function); /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, @@ -4010,7 +4027,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; - running = task_running(rq, p); + running = task_current(rq, p); if (on_rq) { dequeue_task(rq, p, 0); if (running) @@ -4321,7 +4338,7 @@ recheck: } update_rq_clock(rq); on_rq = p->se.on_rq; - running = task_running(rq, p); + running = task_current(rq, p); if (on_rq) { deactivate_task(rq, p, 0); if (running) @@ -4384,8 +4401,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) * @policy: new policy. * @param: structure containing the new RT priority. */ -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, - struct sched_param __user *param) +asmlinkage long +sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { /* negative values for policy are not valid */ if (policy < 0) @@ -4485,7 +4502,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) /* * It is not safe to call set_cpus_allowed with the - * tasklist_lock held. We will bump the task_struct's + * tasklist_lock held. We will bump the task_struct's * usage count and then drop tasklist_lock. */ get_task_struct(p); @@ -4681,7 +4698,7 @@ EXPORT_SYMBOL(cond_resched); * cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * - * This works OK both with and without CONFIG_PREEMPT. We do strange low-level + * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ @@ -4735,7 +4752,7 @@ void __sched yield(void) EXPORT_SYMBOL(yield); /* - * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. * * But don't do that if it is a deliberate, throttling IO wait (this task @@ -4844,17 +4861,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - if (p->policy == SCHED_FIFO) - time_slice = 0; - else if (p->policy == SCHED_RR) + /* + * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER + * tasks that are on an otherwise idle runqueue: + */ + time_slice = 0; + if (p->policy == SCHED_RR) { time_slice = DEF_TIMESLICE; - else { + } else { struct sched_entity *se = &p->se; unsigned long flags; struct rq *rq; rq = task_rq_lock(p, &flags); - time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); + if (rq->cfs.load.weight) + time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); task_rq_unlock(rq, &flags); } read_unlock(&tasklist_lock); @@ -4897,7 +4918,7 @@ static void show_task(struct task_struct *p) } #endif printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->parent)); + task_pid_nr(p), task_pid_nr(p->real_parent)); if (state != TASK_RUNNING) show_stack(p, NULL); @@ -5040,7 +5061,7 @@ static inline void sched_init_granularity(void) * is removed from the allowed bitmask. * * NOTE: the caller must have a valid reference to the task, the - * task must not exit() & deallocate itself prematurely. The + * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) @@ -5077,7 +5098,7 @@ out: EXPORT_SYMBOL_GPL(set_cpus_allowed); /* - * Move (not current) task off this cpu, onto dest cpu. We're doing + * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). @@ -5222,7 +5243,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) * Try to stay on the same cpuset, where the * current cpuset may be a subset of all cpus. * The cpuset_cpus_allowed_locked() variant of - * cpuset_cpus_allowed() will not block. It must be + * cpuset_cpus_allowed() will not block. It must be * called within calls to cpuset_lock/cpuset_unlock. */ rq = task_rq_lock(p, &flags); @@ -5235,10 +5256,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) * kernel threads (both mm NULL), since they never * leave kernel. */ - if (p->mm && printk_ratelimit()) + if (p->mm && printk_ratelimit()) { printk(KERN_INFO "process %d (%s) no " "longer affine to cpu%d\n", - task_pid_nr(p), p->comm, dead_cpu); + task_pid_nr(p), p->comm, dead_cpu); + } } } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); } @@ -5340,7 +5362,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) /* * Drop lock around migration; if someone else moves it, - * that's OK. No task can be added to this CPU, so iteration is + * that's OK. No task can be added to this CPU, so iteration is * fine. */ spin_unlock_irq(&rq->lock); @@ -5404,7 +5426,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) /* * In the intermediate directories, both the child directory and * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are + * will always be set. In the lowest directory the names are * static strings and all have proc handlers. */ for (entry = *tablep; entry->mode; entry++) { @@ -5575,7 +5597,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; - /* Unbind it from offline cpu so it can run. Fall thru. */ + /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); kthread_stop(cpu_rq(cpu)->migration_thread); @@ -5602,9 +5624,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - /* No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. */ + /* + * No need to migrate the tasks: it was best-effort if + * they didn't take sched_hotcpu_mutex. Just wake up + * the requestors. + */ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { struct migration_req *req; @@ -5912,7 +5936,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, * @node: node whose sched_domain we're building * @used_nodes: nodes already in the sched_domain * - * Find the next node to include in a given scheduling domain. Simply + * Find the next node to include in a given scheduling domain. Simply * finds the closest node not already in the @used_nodes map. * * Should use nodemask_t. @@ -5952,7 +5976,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) * @node: node whose cpumask we're constructing * @size: number of nodes to include in this span * - * Given a node, construct a good cpumask for its sched_domain to span. It + * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ @@ -5989,8 +6013,8 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); -static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) +static int +cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) { if (sg) *sg = &per_cpu(sched_group_cpus, cpu); @@ -6007,8 +6031,8 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); #endif #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) +static int +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) { int group; cpumask_t mask = per_cpu(cpu_sibling_map, cpu); @@ -6019,8 +6043,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, return group; } #elif defined(CONFIG_SCHED_MC) -static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) +static int +cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) { if (sg) *sg = &per_cpu(sched_group_core, cpu); @@ -6031,8 +6055,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_group, sched_group_phys); -static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, - struct sched_group **sg) +static int +cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) { int group; #ifdef CONFIG_SCHED_MC @@ -6212,7 +6236,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) * Allocate the per-node list of sched groups */ sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), - GFP_KERNEL); + GFP_KERNEL); if (!sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); return -ENOMEM; @@ -6459,7 +6483,7 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ static cpumask_t fallback_doms; /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Set up scheduler domains and groups. Callers must hold the hotplug lock. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ @@ -6501,19 +6525,19 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) /* * Partition sched domains as specified by the 'ndoms_new' - * cpumasks in the array doms_new[] of cpumasks. This compares + * cpumasks in the array doms_new[] of cpumasks. This compares * doms_new[] to the current sched domain partitioning, doms_cur[]. * It destroys each deleted domain and builds each new domain. * * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. - * The masks don't intersect (don't overlap.) We should setup one - * sched domain for each mask. CPUs not in any of the cpumasks will - * not be load balanced. If the same cpumask appears both in the + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the * current 'doms_cur' domains and in the new 'doms_new', we can leave * it as it is. * - * The passed in 'doms_new' should be kmalloc'd. This routine takes - * ownership of it and will kfree it when done with it. If the caller + * The passed in 'doms_new' should be kmalloc'd. This routine takes + * ownership of it and will kfree it when done with it. If the caller * failed the kmalloc call, then it can pass in doms_new == NULL, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms'. @@ -6643,7 +6667,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) #endif /* - * Force a reinitialization of the sched domains hierarchy. The domains + * Force a reinitialization of the sched domains hierarchy. The domains * and groups cannot be updated in place without racing with the balancing * code, so we temporarily attach all running cpus to the NULL domain * which will prevent rebalancing while the sched domains are recalculated. @@ -6933,8 +6957,8 @@ struct task_struct *curr_task(int cpu) * @p: the task pointer to set. * * Description: This function must only be used when non-maskable interrupts - * are serviced on a separate stack. It allows the architecture to switch the - * notion of the current task on a cpu in a non-blocking manner. This function + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function * must be called with all CPU's synchronized, and interrupts disabled, the * and caller must save the original value of the current task (see * curr_task() above) and restore that value before reenabling interrupts and @@ -7083,7 +7107,7 @@ void sched_move_task(struct task_struct *tsk) update_rq_clock(rq); - running = task_running(rq, tsk); + running = task_current(rq, tsk); on_rq = tsk->se.on_rq; if (on_rq) { @@ -7129,6 +7153,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; + /* + * A weight of 0 or 1 can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ + if (shares < 2) + shares = 2; + spin_lock(&tg->lock); if (tg->shares == shares) goto done; @@ -7183,16 +7215,17 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void cpu_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgrp, struct task_struct *tsk) +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk) { /* We don't support RT-tasks being in separate groups */ if (tsk->sched_class != &fair_sched_class) @@ -7221,38 +7254,12 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } -static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct task_group *tg = cgroup_tg(cgrp); - unsigned long flags; - u64 res = 0; - int i; - - for_each_possible_cpu(i) { - /* - * Lock to prevent races with updating 64-bit counters - * on 32-bit arches. - */ - spin_lock_irqsave(&cpu_rq(i)->lock, flags); - res += tg->se[i]->sum_exec_runtime; - spin_unlock_irqrestore(&cpu_rq(i)->lock, flags); - } - /* Convert from ns to ms */ - do_div(res, NSEC_PER_MSEC); - - return res; -} - static struct cftype cpu_files[] = { { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, - { - .name = "usage", - .read_uint = cpu_usage_read, - }, }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) @@ -7272,3 +7279,126 @@ struct cgroup_subsys cpu_cgroup_subsys = { }; #endif /* CONFIG_FAIR_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* track cpu usage of a group of tasks */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 *cpuusage; +}; + +struct cgroup_subsys cpuacct_subsys; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) + return ERR_PTR(-ENOMEM); + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) { + kfree(ca); + return ERR_PTR(-ENOMEM); + } + + return &ca->css; +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct cpuacct *ca = cgroup_ca(cont); + + free_percpu(ca->cpuusage); + kfree(ca); +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cont); + u64 totalcpuusage = 0; + int i; + + for_each_possible_cpu(i) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, i); + + /* + * Take rq->lock to make 64-bit addition safe on 32-bit + * platforms. + */ + spin_lock_irq(&cpu_rq(i)->lock); + totalcpuusage += *cpuusage; + spin_unlock_irq(&cpu_rq(i)->lock); + } + + return totalcpuusage; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_uint = cpuusage_read, + }, +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + + if (!cpuacct_subsys.active) + return; + + ca = task_ca(tsk); + if (ca) { + u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); + + *cpuusage += cputime; + } +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = cpuacct_subsys_id, +}; +#endif /* CONFIG_CGROUP_CPUACCT */