X-Git-Url: http://pilppa.org/gitweb/?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=1535af3a912d9d7e6a21fb7d6c610e808c1cddda;hb=c32e066057fe0914da262c94e52cefb142f965b4;hp=44d13c246e5c7a86d0260030d81424a30ae686ac;hpb=8a39cc60bfa5a72f32d975729a354daca124f6de;p=linux-2.6-omap-h63xx.git
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 44d13c246e5..1535af3a912 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
  * Processor and Memory placement constraints for sets of tasks.
  *
  * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  * Portions derived from Patrick Mochel's sysfs code.
  * sysfs is Copyright (c) 2001-3 Patrick Mochel
- * Portions Copyright (c) 2004 Silicon Graphics, Inc.
  *
- * 2003-10-10 Written by Simon Derr
+ * 2003-10-10 Written by Simon Derr.
  * 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson
+ * 2004 May-July Rework by Paul Jackson.
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License. See the file COPYING in the main directory of the Linux
@@ -42,6 +41,7 @@
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
+#include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/spinlock.h>
@@ -108,7 +108,9 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
+	CS_NOTIFY_ON_RELEASE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
@@ -137,8 +139,18 @@ static inline int is_memory_migrate(const struct cpuset *cs)
 	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
 }
 
+static inline int is_spread_page(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_SLAB, &cs->flags);
+}
+
 /*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer every time any cpuset changes its
  * mems_allowed value. Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation.
@@ -152,8 +164,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
 * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
 */
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
@@ -378,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
 	return 0;
 }
 
-static struct super_block *cpuset_get_sb(struct file_system_type *fs_type,
-					int flags, const char *unused_dev_name,
-					void *data)
+static int cpuset_get_sb(struct file_system_type *fs_type,
+			 int flags, const char *unused_dev_name,
+			 void *data, struct vfsmount *mnt)
 {
-	return get_sb_single(fs_type, flags, data, cpuset_fill_super);
+	return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
 }
 
 static struct file_system_type cpuset_fs_type = {
@@ -602,12 +617,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 * current->cpuset if a task has its memory placement changed.
 * Do not call this routine if in_interrupt().
 *
- * Call without callback_mutex or task_lock() held. May be called
- * with or without manage_mutex held. Doesn't need task_lock to guard
- * against another task changing a non-NULL cpuset pointer to NULL,
- * as that is only done by a task on itself, and if the current task
- * is here, it is not simultaneously in the exit code NULL'ing its
- * cpuset pointer. This routine also might acquire callback_mutex and
+ * Call without callback_mutex or task_lock() held. May be
+ * called with or without manage_mutex held. Thanks in part to
+ * 'the_top_cpuset_hack', the tasks cpuset pointer will never
+ * be NULL. This routine also might acquire callback_mutex and
 * current->mm->mmap_sem during call.
 *
 * Reading current->cpuset->mems_generation doesn't need task_lock
@@ -657,6 +670,14 @@ void cpuset_update_task_memory_state(void)
 	cs = tsk->cpuset;	/* Maybe changed when task not locked */
 	guarantee_online_mems(cs, &tsk->mems_allowed);
 	tsk->cpuset_mems_generation = cs->mems_generation;
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
 	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 	mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -813,6 +834,55 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	return 0;
 }
 
+/*
+ * cpuset_migrate_mm
+ *
+ * Migrate memory region from one set of nodes to another.
+ *
+ * Temporarily set tasks mems_allowed to target nodes of migration,
+ * so that the migration code can allocate pages on these nodes.
+ *
+ * Call holding manage_mutex, so our current->cpuset won't change
+ * during this call, as manage_mutex holds off any attach_task()
+ * calls. Therefore we don't need to take task_lock around the
+ * call to guarantee_online_mems(), as we know no one is changing
+ * our tasks cpuset.
+ *
+ * Hold callback_mutex around the two modifications of our tasks
+ * mems_allowed to synchronize with cpuset_mems_allowed().
+ *
+ * While the mm_struct we are migrating is typically from some
+ * other task, the task_struct mems_allowed that we are hacking
+ * is for our current task, which must allocate new pages for that
+ * migrating memory region.
+ *
+ * We call cpuset_update_task_memory_state() before hacking
+ * our tasks mems_allowed, so that we are assured of being in
+ * sync with our tasks cpuset, and in particular, callbacks to
+ * cpuset_update_task_memory_state() from nested page allocations
+ * won't see any mismatch of our cpuset and task mems_generation
+ * values, so won't overwrite our hacked tasks mems_allowed
+ * nodemask.
+ */
+
+static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
+							const nodemask_t *to)
+{
+	struct task_struct *tsk = current;
+
+	cpuset_update_task_memory_state();
+
+	mutex_lock(&callback_mutex);
+	tsk->mems_allowed = *to;
+	mutex_unlock(&callback_mutex);
+
+	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+
+	mutex_lock(&callback_mutex);
+	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
+	mutex_unlock(&callback_mutex);
+}
+
 /*
  * Handle user request to change the 'mems' memory placement
  * of a cpuset. Needs to validate the request, update the
@@ -858,7 +928,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
-	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);	/* causes mpol_copy() rebind */
@@ -925,10 +995,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		struct mm_struct *mm = mmarray[i];
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate) {
-			do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
-							MPOL_MF_MOVE_ALL);
-		}
+		if (migrate)
+			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -956,7 +1024,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -1109,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	cpumask_t cpus;
 	nodemask_t from, to;
 	struct mm_struct *mm;
+	int retval;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -1137,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		get_task_struct(tsk);
 	}
 
+	retval = security_task_setscheduler(tsk, 0, NULL);
+	if (retval) {
+		put_task_struct(tsk);
+		return retval;
+	}
+
 	mutex_lock(&callback_mutex);
 
 	task_lock(tsk);
@@ -1162,11 +1238,11 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	mm = get_task_mm(tsk);
 	if (mm) {
 		mpol_rebind_mm(mm, &to);
+		if (is_memory_migrate(cs))
+			cpuset_migrate_mm(mm, &from, &to);
 		mmput(mm);
 	}
 
-	if (is_memory_migrate(cs))
-		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
 	synchronize_rcu();
 	if (atomic_dec_and_test(&oldcs->count))
@@ -1187,6 +1263,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -1246,6 +1324,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
+	case FILE_SPREAD_PAGE:
+		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
+	case FILE_SPREAD_SLAB:
+		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1355,6 +1441,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_SPREAD_PAGE:
+		*s++ = is_spread_page(cs) ? '1' : '0';
+		break;
+	case FILE_SPREAD_SLAB:
+		*s++ = is_spread_slab(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1718,6 +1810,16 @@ static struct cftype cft_memory_pressure = {
 	.private = FILE_MEMORY_PRESSURE,
 };
 
+static struct cftype cft_spread_page = {
+	.name = "memory_spread_page",
+	.private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+	.name = "memory_spread_slab",
+	.private = FILE_SPREAD_SLAB,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1736,6 +1838,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1764,12 +1870,16 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	if (is_spread_page(parent))
+		set_bit(CS_SPREAD_PAGE, &cs->flags);
+	if (is_spread_slab(parent))
+		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	atomic_set(&cs->count, 0);
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
-	cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1859,7 +1969,7 @@ int __init cpuset_init_early(void)
 	struct task_struct *tsk = current;
 
 	tsk->cpuset = &top_cpuset;
-	tsk->cpuset->mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	tsk->cpuset->mems_generation = cpuset_mems_generation++;
 	return 0;
 }
 
@@ -1878,7 +1988,7 @@ int __init cpuset_init(void)
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 	fmeter_init(&top_cpuset.fmeter);
-	top_cpuset.mems_generation = atomic_inc_return(&cpuset_mems_generation);
+	top_cpuset.mems_generation = cpuset_mems_generation++;
 
 	init_task.cpuset = &top_cpuset;
 
@@ -1969,7 +2079,7 @@ void cpuset_fork(struct task_struct *child)
 * because tsk is already marked PF_EXITING, so attach_task() won't
 * mess with it, or task is a failed fork, never visible to attach_task.
 *
- * Hack:
+ * the_top_cpuset_hack:
 *
 *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
 *
@@ -2008,7 +2118,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
+	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
@@ -2129,30 +2239,37 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
 * short of memory, might require taking the callback_mutex mutex.
 *
- * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
- * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
- * hardwall cpusets - no allocation on a node outside the cpuset is
- * allowed (unless in interrupt, of course).
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
+ * no allocation on a node outside the cpuset is allowed (unless in
+ * interrupt, of course).
 *
- * The second loop doesn't even call here for GFP_ATOMIC requests
- * (if the __alloc_pages() local variable 'wait' is set). That check
- * and the checks below have the combined affect in the second loop of
- * the __alloc_pages() routine that:
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
 **/
 
 int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-	int allowed = 1;		/* is allocation in zone z allowed? */
+	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt())
 		return 1;
 	node = z->zone_pgdat->node_id;
+	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
@@ -2200,6 +2317,44 @@ void cpuset_unlock(void)
 	mutex_unlock(&callback_mutex);
 }
 
+/**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+	int node;
+
+	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	if (node == MAX_NUMNODES)
+		node = first_node(current->mems_allowed);
+	current->cpuset_mem_spread_rotor = node;
+	return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
 /**
 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
 * @p: pointer to task_struct of some other task.
@@ -2281,43 +2436,49 @@ void __cpuset_memory_pressure_bump(void)
 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *   doesn't really matter if tsk->cpuset changes after we read it,
 *   and we take manage_mutex, keeping attach_task() from changing it
- *   anyway.
+ *   anyway. No need to check that tsk->cpuset != NULL, thanks to
+ *   the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
+ *   cpuset to top_cpuset.
 */
-
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
-	struct cpuset *cs;
+	struct pid *pid;
 	struct task_struct *tsk;
 	char *buf;
-	int retval = 0;
 
+	int retval;
+
+	retval = -ENOMEM;
 	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		goto out;
+
+	retval = -ESRCH;
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk)
+		goto out_free;
 
-	tsk = m->private;
+	retval = -EINVAL;
 	mutex_lock(&manage_mutex);
-	cs = tsk->cpuset;
-	if (!cs) {
-		retval = -EINVAL;
-		goto out;
-	}
-	retval = cpuset_path(cs, buf, PAGE_SIZE);
+
+	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
-		goto out;
+		goto out_unlock;
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
-out:
+out_unlock:
 	mutex_unlock(&manage_mutex);
+	put_task_struct(tsk);
+out_free:
 	kfree(buf);
+out:
 	return retval;
 }
 
 static int cpuset_open(struct inode *inode, struct file *file)
 {
-	struct task_struct *tsk = PROC_I(inode)->task;
-	return single_open(file, proc_cpuset_show, tsk);
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
 }
 
 struct file_operations proc_cpuset_operations = {
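
A quick illustration of the rotor logic this patch adds in cpuset_mem_spread_node(): next_node() returns the next node set in the mask after the given one, or MAX_NUMNODES when none remains, so the rotor walks the tasks mems_allowed round-robin and wraps back via first_node(). The sketch below mirrors that walk in plain userspace C; a 64-bit word stands in for nodemask_t, and the *_demo names and 64-node limit are inventions for the example, not kernel API.

/* Standalone sketch of the cpuset_mem_spread_node() rotor (not kernel code). */
#include <stdio.h>

#define MAX_NUMNODES_DEMO 64

/* like next_node(): lowest set bit above 'prev', or MAX_NUMNODES_DEMO */
static int next_node_demo(int prev, unsigned long mask)
{
	int n;

	for (n = prev + 1; n < MAX_NUMNODES_DEMO; n++)
		if (mask & (1UL << n))
			return n;
	return MAX_NUMNODES_DEMO;
}

/* like first_node(): lowest set bit in the mask */
static int first_node_demo(unsigned long mask)
{
	return next_node_demo(-1, mask);
}

static int rotor;	/* plays the role of current->cpuset_mem_spread_rotor */

static int mem_spread_node_demo(unsigned long mems_allowed)
{
	int node = next_node_demo(rotor, mems_allowed);

	if (node == MAX_NUMNODES_DEMO)	/* ran off the end: wrap around */
		node = first_node_demo(mems_allowed);
	rotor = node;
	return node;
}

int main(void)
{
	unsigned long mems_allowed = 0x2c;	/* nodes 2, 3 and 5 allowed */
	int i;

	for (i = 0; i < 6; i++)
		printf("%d ", mem_spread_node_demo(mems_allowed));
	printf("\n");	/* prints: 2 3 5 2 3 5 */
	return 0;
}

Run under these assumptions, successive spread allocations start on nodes 2, 3, 5, 2, 3, 5: the starting node rotates over the allowed set instead of always being the local node, which is exactly the behavior the memory_spread_page and memory_spread_slab flags buy for page cache and slab cache placement.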