2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
78 #include <linux/mempolicy.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
/*
 * Slab caches for mempolicy objects and shared-policy rb-tree nodes;
 * both are created in numa_policy_init().
 */
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
/* Debug print hook: compiled out (expands to nothing). */
85 #define PDprintk(fmt...)
87 /* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
/*
 * Fallback policy object (MPOL_DEFAULT) with a pinned refcount so it is
 * never freed; used whenever no task/VMA policy applies.
 */
91 struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Do sanity checking on a policy */
/*
 * Validate a (mode, nodemask) pair from userspace.
 * Returns 0 when acceptable, -EINVAL when the mask is not a subset of
 * the currently online nodes.
 * NOTE(review): this listing elides lines (original numbering jumps),
 * so the full switch body is not visible here.
 */
97 static int mpol_check_policy(int mode, nodemask_t *nodes)
99 int empty = nodes_empty(*nodes);
107 case MPOL_INTERLEAVE:
108 /* Preferred will only use the first bit, but allow
114 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
117 /* Copy a node mask from user space. */
/*
 * Copy a user-supplied node bitmap into *nodes, mask off bits beyond
 * maxnode, restrict it to the cpuset's mems_allowed, and finally
 * validate it against the requested mode via mpol_check_policy().
 * When the user passes more bits than MAX_NUMNODES, the excess part
 * must be all zero.
 */
118 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
119 unsigned long maxnode, int mode)
122 unsigned long nlongs;
123 unsigned long endmask;
127 if (maxnode == 0 || !nmask)
130 nlongs = BITS_TO_LONGS(maxnode);
131 if ((maxnode % BITS_PER_LONG) == 0)
/* Mask selecting the valid bits of the last long of the user mask. */
134 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
136 /* When the user specified more nodes than supported just check
137 if the non supported part is all zero. */
138 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
139 if (nlongs > PAGE_SIZE/sizeof(long))
141 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
143 if (get_user(t, nmask + k))
145 if (k == nlongs - 1) {
151 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
155 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
157 nodes_addr(*nodes)[nlongs-1] &= endmask;
158 /* Update current mems_allowed */
159 cpuset_update_current_mems_allowed();
160 /* Ignore nodes not set in current->mems_allowed */
161 /* AK: shouldn't this error out instead? */
162 cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
163 return mpol_check_policy(mode, nodes);
166 /* Generate a custom zonelist for the BIND policy. */
/*
 * Build a kmalloc'd, NULL-terminated zonelist containing, for each node
 * in *nodes, its zones with present pages, highest zone first.
 * Ownership transfers to the caller (freed via kfree in __mpol_free).
 */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
172 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
173 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
177 for_each_node_mask(nd, *nodes) {
179 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
180 struct zone *z = &NODE_DATA(nd)->node_zones[k];
/* Skip zones with no memory on this node. */
181 if (!z->present_pages)
183 zl->zones[num++] = z;
189 zl->zones[num] = NULL;
193 /* Create a new policy */
/*
 * Allocate and initialize a mempolicy for (mode, nodes).
 * Returns ERR_PTR(-ENOMEM) on allocation failure; MPOL_DEFAULT takes
 * an early path (body elided in this listing). Refcount starts at 1.
 */
194 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
196 struct mempolicy *policy;
198 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
199 if (mode == MPOL_DEFAULT)
201 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
203 return ERR_PTR(-ENOMEM);
204 atomic_set(&policy->refcnt, 1);
206 case MPOL_INTERLEAVE:
207 policy->v.nodes = *nodes;
/* Preferred: remember only the first node; -1 means "local node". */
210 policy->v.preferred_node = first_node(*nodes);
211 if (policy->v.preferred_node >= MAX_NUMNODES)
212 policy->v.preferred_node = -1;
/* Bind: build the custom zonelist; roll back on failure. */
215 policy->v.zonelist = bind_zonelist(nodes);
216 if (policy->v.zonelist == NULL) {
217 kmem_cache_free(policy_cache, policy);
218 return ERR_PTR(-ENOMEM);
222 policy->policy = mode;
226 /* Ensure all existing pages follow the policy. */
/*
 * Walk the PTEs of one pmd under mm->page_table_lock and check that
 * every present page lives on a node in *nodes.
 */
227 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
228 unsigned long addr, unsigned long end, nodemask_t *nodes)
233 spin_lock(&mm->page_table_lock);
234 orig_pte = pte = pte_offset_map(pmd, addr);
239 if (!pte_present(*pte))
244 nid = pfn_to_nid(pfn);
245 if (!node_isset(nid, *nodes))
247 } while (pte++, addr += PAGE_SIZE, addr != end);
249 spin_unlock(&mm->page_table_lock);
/*
 * Three parallel one-level page-table walkers: each iterates its level
 * of the page table over [addr, end), skips empty/bad entries, and
 * recurses into the next lower level, bubbling up a nonzero result.
 */
253 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
254 unsigned long addr, unsigned long end, nodemask_t *nodes)
259 pmd = pmd_offset(pud, addr);
261 next = pmd_addr_end(addr, end);
262 if (pmd_none_or_clear_bad(pmd))
264 if (check_pte_range(mm, pmd, addr, next, nodes))
266 } while (pmd++, addr = next, addr != end);
270 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
271 unsigned long addr, unsigned long end, nodemask_t *nodes)
276 pud = pud_offset(pgd, addr);
278 next = pud_addr_end(addr, end);
279 if (pud_none_or_clear_bad(pud))
281 if (check_pmd_range(mm, pud, addr, next, nodes))
283 } while (pud++, addr = next, addr != end);
/* Top-level entry: walk the whole range starting from the pgd. */
287 static inline int check_pgd_range(struct mm_struct *mm,
288 unsigned long addr, unsigned long end, nodemask_t *nodes)
293 pgd = pgd_offset(mm, addr);
295 next = pgd_addr_end(addr, end);
296 if (pgd_none_or_clear_bad(pgd))
298 if (check_pud_range(mm, pgd, addr, next, nodes))
300 } while (pgd++, addr = next, addr != end);
304 /* Step 1: check the range */
/*
 * Verify that [start, end) is fully covered by VMAs (any hole yields
 * ERR_PTR(-EFAULT)); with MPOL_MF_STRICT also verify that existing
 * pages in non-hugetlb VMAs already obey *nodes.
 * Returns the first VMA of the range, or an ERR_PTR on failure.
 */
305 static struct vm_area_struct *
306 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
307 nodemask_t *nodes, unsigned long flags)
310 struct vm_area_struct *first, *vma, *prev;
312 first = find_vma(mm, start);
314 return ERR_PTR(-EFAULT);
316 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
/* Range extends past the last VMA: hole at the end. */
317 if (!vma->vm_next && vma->vm_end < end)
318 return ERR_PTR(-EFAULT);
/* Gap between consecutive VMAs: hole in the middle. */
319 if (prev && prev->vm_end < vma->vm_start)
320 return ERR_PTR(-EFAULT);
321 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
322 unsigned long endvma = vma->vm_end;
325 if (vma->vm_start > start)
326 start = vma->vm_start;
327 err = check_pgd_range(vma->vm_mm,
328 start, endvma, nodes);
330 first = ERR_PTR(err);
339 /* Apply policy to a single VMA */
/*
 * Install `new` as the policy of one VMA, preferring the VMA's own
 * set_policy operation when present; otherwise store it directly in
 * vma->vm_policy (old policy saved first for release on success —
 * elided lines presumably handle that; verify against full source).
 */
340 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
343 struct mempolicy *old = vma->vm_policy;
345 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
346 vma->vm_start, vma->vm_end, vma->vm_pgoff,
347 vma->vm_ops, vma->vm_file,
348 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
350 if (vma->vm_ops && vma->vm_ops->set_policy)
351 err = vma->vm_ops->set_policy(vma, new);
354 vma->vm_policy = new;
360 /* Step 2: apply policy to a range and do splits. */
/*
 * Split VMAs that straddle the range boundaries, then apply `new` to
 * each VMA inside [start, end) via policy_vma().
 */
361 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
362 unsigned long end, struct mempolicy *new)
364 struct vm_area_struct *next;
368 for (; vma && vma->vm_start < end; vma = next) {
/* Split off the part below `start` / above `end` first. */
370 if (vma->vm_start < start)
371 err = split_vma(vma->vm_mm, vma, start, 1);
372 if (!err && vma->vm_end > end)
373 err = split_vma(vma->vm_mm, vma, end, 0);
375 err = policy_vma(vma, new);
382 /* Change policy for a memory range */
/*
 * mbind(2): set the NUMA policy `mode`/`nmask` for [start, start+len).
 * Validates flags and alignment, builds the policy, then under
 * mmap_sem (write) checks the range and rewrites the VMAs.
 */
383 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
385 unsigned long __user *nmask, unsigned long maxnode,
388 struct vm_area_struct *vma;
389 struct mm_struct *mm = current->mm;
390 struct mempolicy *new;
395 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
397 if (start & ~PAGE_MASK)
/* STRICT is meaningless for DEFAULT; quietly drop it. */
399 if (mode == MPOL_DEFAULT)
400 flags &= ~MPOL_MF_STRICT;
401 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
408 err = get_nodes(&nodes, nmask, maxnode, mode);
412 new = mpol_new(mode, &nodes);
416 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
417 mode,nodes_addr(nodes)[0]);
419 down_write(&mm->mmap_sem);
420 vma = check_range(mm, start, end, &nodes, flags);
423 err = mbind_range(vma, start, end, new);
424 up_write(&mm->mmap_sem);
429 /* Set the process memory policy */
/*
 * set_mempolicy(2): replace current->mempolicy with a newly built one,
 * freeing the old policy. For interleave, seed il_next with the first
 * node of the mask so round-robin allocation starts there.
 */
430 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
431 unsigned long maxnode)
434 struct mempolicy *new;
437 if (mode < 0 || mode > MPOL_MAX)
439 err = get_nodes(&nodes, nmask, maxnode, mode);
442 new = mpol_new(mode, &nodes);
445 mpol_free(current->mempolicy);
446 current->mempolicy = new;
447 if (new && new->policy == MPOL_INTERLEAVE)
448 current->il_next = first_node(new->v.nodes);
452 /* Fill a zone bitmap for a policy */
/*
 * Convert a mempolicy into a nodemask: BIND collects the nodes of its
 * zonelist, INTERLEAVE copies its nodemask (elided here), PREFERRED
 * sets its one node, or the whole online map when the node is -1
 * (local allocation).
 */
453 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
460 for (i = 0; p->v.zonelist->zones[i]; i++)
461 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
465 case MPOL_INTERLEAVE:
469 /* or use current node instead of online map? */
470 if (p->v.preferred_node < 0)
471 *nodes = node_online_map;
473 node_set(p->v.preferred_node, *nodes);
/*
 * Fault in the page at `addr` via get_user_pages() and return the id
 * of the node it resides on (or the error from get_user_pages).
 */
480 static int lookup_node(struct mm_struct *mm, unsigned long addr)
485 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
487 err = page_to_nid(p);
493 /* Copy a kernel node mask to user space */
/*
 * Copy *nodes out to the user's bitmap. The user buffer may be larger
 * than the kernel mask; the tail beyond `nbytes` is cleared so the
 * user never sees stale data. Returns 0 or -EFAULT.
 */
494 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
497 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
498 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
501 if (copy > PAGE_SIZE)
503 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
507 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
510 /* Retrieve NUMA policy */
/*
 * get_mempolicy(2): report either the task policy or (MPOL_F_ADDR) the
 * policy of the VMA containing `addr`. With MPOL_F_NODE the returned
 * value is a node number (the page's node for F_ADDR, or il_next for
 * an interleaving task policy). The nodemask, when requested, is
 * produced via get_zonemask()/copy_nodes_to_user().
 *
 * Fix: the two up_read() calls below had `&current` mangled into the
 * stray byte sequence "¤t" (an HTML-entity encoding artifact);
 * restored to `&current->mm->mmap_sem`, matching the down_read() on
 * the same semaphore above.
 */
511 asmlinkage long sys_get_mempolicy(int __user *policy,
512 unsigned long __user *nmask,
513 unsigned long maxnode,
514 unsigned long addr, unsigned long flags)
517 struct mm_struct *mm = current->mm;
518 struct vm_area_struct *vma = NULL;
519 struct mempolicy *pol = current->mempolicy;
521 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
523 if (nmask != NULL && maxnode < MAX_NUMNODES)
525 if (flags & MPOL_F_ADDR) {
526 down_read(&mm->mmap_sem);
527 vma = find_vma_intersection(mm, addr, addr+1);
529 up_read(&mm->mmap_sem);
532 if (vma->vm_ops && vma->vm_ops->get_policy)
533 pol = vma->vm_ops->get_policy(vma, addr);
535 pol = vma->vm_policy;
540 pol = &default_policy;
542 if (flags & MPOL_F_NODE) {
543 if (flags & MPOL_F_ADDR) {
544 err = lookup_node(mm, addr);
548 } else if (pol == current->mempolicy &&
549 pol->policy == MPOL_INTERLEAVE) {
550 pval = current->il_next;
559 up_read(&current->mm->mmap_sem);
563 if (policy && put_user(pval, policy))
569 get_zonemask(pol, &nodes);
570 err = copy_nodes_to_user(nmask, maxnode, &nodes);
575 up_read(&current->mm->mmap_sem);
/*
 * 32-bit compat wrapper for get_mempolicy(2): allocates a native-width
 * bitmap in compat user space, calls sys_get_mempolicy(), then copies
 * the result back as a compat bitmap, zeroing the user's buffer first.
 */
581 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
582 compat_ulong_t __user *nmask,
583 compat_ulong_t maxnode,
584 compat_ulong_t addr, compat_ulong_t flags)
587 unsigned long __user *nm = NULL;
588 unsigned long nr_bits, alloc_size;
589 DECLARE_BITMAP(bm, MAX_NUMNODES);
591 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
595 nm = compat_alloc_user_space(alloc_size);
597 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
600 err = copy_from_user(bm, nm, alloc_size);
601 /* ensure entire bitmap is zeroed */
602 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
603 err |= compat_put_bitmap(nmask, bm, nr_bits);
/*
 * 32-bit compat wrapper for set_mempolicy(2): read the compat bitmap
 * into a kernel buffer, stage it as a native-width bitmap in compat
 * user space, and forward to sys_set_mempolicy().
 */
609 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
610 compat_ulong_t maxnode)
613 unsigned long __user *nm = NULL;
614 unsigned long nr_bits, alloc_size;
615 DECLARE_BITMAP(bm, MAX_NUMNODES);
617 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
618 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
621 err = compat_get_bitmap(bm, nmask, nr_bits);
622 nm = compat_alloc_user_space(alloc_size);
623 err |= copy_to_user(nm, bm, alloc_size);
629 return sys_set_mempolicy(mode, nm, nr_bits+1);
/*
 * 32-bit compat wrapper for mbind(2): same bitmap-staging scheme as
 * compat_sys_set_mempolicy(), then forward to sys_mbind().
 */
632 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
633 compat_ulong_t mode, compat_ulong_t __user *nmask,
634 compat_ulong_t maxnode, compat_ulong_t flags)
637 unsigned long __user *nm = NULL;
638 unsigned long nr_bits, alloc_size;
641 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
642 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
645 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
646 nm = compat_alloc_user_space(alloc_size);
647 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
653 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
658 /* Return effective policy for a VMA */
/*
 * Resolve the effective policy with the documented precedence:
 * VMA get_policy op > non-default vma->vm_policy > task policy >
 * global default_policy.
 */
660 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
662 struct mempolicy *pol = task->mempolicy;
665 if (vma->vm_ops && vma->vm_ops->get_policy)
666 pol = vma->vm_ops->get_policy(vma, addr);
667 else if (vma->vm_policy &&
668 vma->vm_policy->policy != MPOL_DEFAULT)
669 pol = vma->vm_policy;
672 pol = &default_policy;
676 /* Return a zonelist representing a mempolicy */
/*
 * Map a policy to the zonelist to allocate from for `gfp`. BIND only
 * applies at or above policy_zone and when still valid for the cpuset;
 * otherwise fall through to the node's standard zonelist. INTERLEAVE
 * must be handled by the interleave allocation path, not here.
 */
677 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
681 switch (policy->policy) {
683 nd = policy->v.preferred_node;
688 /* Lower zones don't get a policy applied */
689 /* Careful: current->mems_allowed might have moved */
690 if (gfp_zone(gfp) >= policy_zone)
691 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
692 return policy->v.zonelist;
694 case MPOL_INTERLEAVE: /* should not happen */
702 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
705 /* Do dynamic interleaving for a process */
/*
 * Round-robin node selection for task-level interleave: advance to the
 * next node in the policy's mask, wrapping to the first node at the
 * end of the mask.
 */
706 static unsigned interleave_nodes(struct mempolicy *policy)
709 struct task_struct *me = current;
712 BUG_ON(nid >= MAX_NUMNODES);
713 next = next_node(nid, policy->v.nodes);
714 if (next >= MAX_NUMNODES)
715 next = first_node(policy->v.nodes);
720 /* Do static interleaving for a VMA with known offset. */
/*
 * Deterministic node choice for a VMA page: pick the (off mod weight)-th
 * set node of the policy's mask, so the same offset always maps to the
 * same node.
 */
721 static unsigned offset_il_node(struct mempolicy *pol,
722 struct vm_area_struct *vma, unsigned long off)
724 unsigned nnodes = nodes_weight(pol->v.nodes);
725 unsigned target = (unsigned)off % nnodes;
731 nid = next_node(nid, pol->v.nodes);
733 } while (c <= target);
734 BUG_ON(nid >= MAX_NUMNODES);
738 /* Allocate a page in interleaved policy.
739 Own path because it needs to do special accounting. */
/*
 * Allocate from node `nid`'s standard zonelist and, when the page came
 * from the first (preferred) zone, bump the per-cpu interleave_hit
 * statistic.
 */
740 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
745 BUG_ON(!node_online(nid));
746 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
747 page = __alloc_pages(gfp, order, zl);
748 if (page && page_zone(page) == zl->zones[0]) {
749 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
756 * alloc_page_vma - Allocate a page for a VMA.
759 * %GFP_USER user allocation.
760 * %GFP_KERNEL kernel allocations,
761 * %GFP_HIGHMEM highmem/user allocations,
762 * %GFP_FS allocation should not call back into a file system.
763 * %GFP_ATOMIC don't sleep.
765 * @vma: Pointer to VMA or NULL if not available.
766 * @addr: Virtual Address of the allocation. Must be inside the VMA.
768 * This function allocates a page from the kernel page pool and applies
769 * a NUMA policy associated with the VMA or the current process.
770 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
771 * mm_struct of the VMA to prevent it from going away. Should be used for
772 * all allocations for pages that will be mapped into
773 * user space. Returns NULL when no page can be allocated.
775 * Should be called with the mm_sem of the vma hold.
778 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
780 struct mempolicy *pol = get_vma_policy(current, vma, addr);
782 cpuset_update_current_mems_allowed();
/* Interleave takes its own path: pick the node by offset (VMA known)
   or fall back to the task's round-robin counter. */
784 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
788 BUG_ON(addr >= vma->vm_end);
789 BUG_ON(addr < vma->vm_start);
791 off += (addr - vma->vm_start) >> PAGE_SHIFT;
792 nid = offset_il_node(pol, vma, off);
794 /* fall back to process interleaving */
795 nid = interleave_nodes(pol);
797 return alloc_page_interleave(gfp, 0, nid);
799 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
803 * alloc_pages_current - Allocate pages.
806 * %GFP_USER user allocation,
807 * %GFP_KERNEL kernel allocation,
808 * %GFP_HIGHMEM highmem allocation,
809 * %GFP_FS don't call back into a file system.
810 * %GFP_ATOMIC don't sleep.
811 * @order: Power of two of allocation size in pages. 0 is a single page.
813 * Allocate a page from the kernel page pool. When not in
814 * interrupt context and apply the current process NUMA policy.
815 * Returns NULL when no page can be allocated.
817 * Don't call cpuset_update_current_mems_allowed() unless
818 * 1) it's ok to take cpuset_sem (can WAIT), and
819 * 2) allocating for current task (not interrupt).
821 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
823 struct mempolicy *pol = current->mempolicy;
825 if ((gfp & __GFP_WAIT) && !in_interrupt())
826 cpuset_update_current_mems_allowed();
/* Interrupts and tasks without a policy use the global default. */
827 if (!pol || in_interrupt())
828 pol = &default_policy;
829 if (pol->policy == MPOL_INTERLEAVE)
830 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
831 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
833 EXPORT_SYMBOL(alloc_pages_current);
835 /* Slow path of a mempolicy copy */
/*
 * Duplicate a mempolicy with a fresh refcount of 1. BIND policies also
 * deep-copy their kmalloc'd zonelist (sized via ksize of the original).
 * Returns ERR_PTR(-ENOMEM) when either allocation fails.
 */
836 struct mempolicy *__mpol_copy(struct mempolicy *old)
838 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
841 return ERR_PTR(-ENOMEM);
843 atomic_set(&new->refcnt, 1);
844 if (new->policy == MPOL_BIND) {
845 int sz = ksize(old->v.zonelist);
846 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
847 if (!new->v.zonelist) {
848 kmem_cache_free(policy_cache, new);
849 return ERR_PTR(-ENOMEM);
851 memcpy(new->v.zonelist, old->v.zonelist, sz);
856 /* Slow path of a mempolicy comparison */
/*
 * Compare two policies by mode and mode-specific payload: nodemask for
 * interleave, preferred node for preferred, and element-wise zonelist
 * comparison (including equal length) for bind.
 */
857 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
861 if (a->policy != b->policy)
866 case MPOL_INTERLEAVE:
867 return nodes_equal(a->v.nodes, b->v.nodes);
869 return a->v.preferred_node == b->v.preferred_node;
872 for (i = 0; a->v.zonelist->zones[i]; i++)
873 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
875 return b->v.zonelist->zones[i] == NULL;
883 /* Slow path of a mpol destructor. */
/*
 * Drop a reference; on the last one free the BIND zonelist (if any)
 * and return the object to the policy slab cache.
 */
884 void __mpol_free(struct mempolicy *p)
886 if (!atomic_dec_and_test(&p->refcnt))
888 if (p->policy == MPOL_BIND)
889 kfree(p->v.zonelist);
890 p->policy = MPOL_DEFAULT;
891 kmem_cache_free(policy_cache, p);
895 * Hugetlb policy. Same as above, just works with node numbers instead of
899 /* Find first node suitable for an allocation */
/*
 * Return the node id to try first under the effective policy for
 * (vma, addr): local node for default, first zonelist node for bind,
 * round-robin node for interleave, and the preferred node (or local
 * when it is -1) for preferred.
 */
900 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
902 struct mempolicy *pol = get_vma_policy(current, vma, addr);
904 switch (pol->policy) {
906 return numa_node_id();
908 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
909 case MPOL_INTERLEAVE:
910 return interleave_nodes(pol);
912 return pol->v.preferred_node >= 0 ?
913 pol->v.preferred_node : numa_node_id();
919 /* Find secondary valid nodes for an allocation */
/*
 * Decide whether node `nid` is an acceptable fallback under the
 * effective policy; for bind this scans the policy's zonelist for a
 * zone on that node.
 */
920 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
922 struct mempolicy *pol = get_vma_policy(current, vma, addr);
924 switch (pol->policy) {
927 case MPOL_INTERLEAVE:
931 for (z = pol->v.zonelist->zones; *z; z++)
932 if ((*z)->zone_pgdat->node_id == nid)
943 * Shared memory backing store policy support.
945 * Remember policies even when nobody has shared memory mapped.
946 * The policies are kept in Red-Black tree linked from the inode.
947 * They are protected by the sp->lock spinlock, which should be held
948 * for any accesses to the tree.
951 /* lookup first element intersecting start-end */
952 /* Caller holds sp->lock */
/*
 * Standard rb-tree descent keyed on [start, end) intervals, then a
 * backwards scan via rb_prev() to land on the first node that still
 * intersects the query range. Returns NULL when nothing intersects.
 */
953 static struct sp_node *
954 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
956 struct rb_node *n = sp->root.rb_node;
959 struct sp_node *p = rb_entry(n, struct sp_node, nd);
963 else if (end <= p->start)
971 struct sp_node *w = NULL;
972 struct rb_node *prev = rb_prev(n);
975 w = rb_entry(prev, struct sp_node, nd);
980 return rb_entry(n, struct sp_node, nd);
983 /* Insert a new shared policy into the list. */
984 /* Caller holds sp->lock */
/*
 * Standard rb-tree insertion ordered by the node's range; callers
 * guarantee the new range does not overlap an existing one.
 */
985 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
987 struct rb_node **p = &sp->root.rb_node;
988 struct rb_node *parent = NULL;
993 nd = rb_entry(parent, struct sp_node, nd);
994 if (new->start < nd->start)
996 else if (new->end > nd->end)
1001 rb_link_node(&new->nd, parent, p);
1002 rb_insert_color(&new->nd, &sp->root);
1003 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1004 new->policy ? new->policy->policy : 0);
1007 /* Find shared policy intersecting idx */
/*
 * Look up the shared policy covering page index `idx` under sp->lock,
 * taking a reference (mpol_get) on the found policy before returning.
 * Returns NULL when the tree is empty or nothing covers idx.
 */
1009 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1011 struct mempolicy *pol = NULL;
1014 if (!sp->root.rb_node)
1016 spin_lock(&sp->lock);
1017 sn = sp_lookup(sp, idx, idx+1);
1019 mpol_get(sn->policy);
1022 spin_unlock(&sp->lock);
/*
 * Unlink a node from the shared-policy tree, drop its policy reference
 * and return the node to the slab cache. Caller holds sp->lock.
 *
 * Fix: the debug format string read "%lx-l%x" (stray 'l' outside the
 * conversion), which would print a literal 'l' and format n->end with
 * %x instead of %lx; corrected to "%lx-%lx" to match the two unsigned
 * long arguments.
 */
1026 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1028 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1029 rb_erase(&n->nd, &sp->root);
1030 mpol_free(n->policy);
1031 kmem_cache_free(sn_cache, n);
/*
 * Allocate an sp_node from the slab cache for range [start, end) with
 * policy `pol` (initialization of the fields is elided in this listing).
 */
1035 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1037 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1048 /* Replace a policy range. */
/*
 * Install `new` over [start, end), removing or trimming any old nodes
 * it overlaps. An old node that spans the whole new range is split:
 * sp->lock must be dropped to allocate the tail piece (new2), hence
 * the unlock/alloc/re-insert dance; unused new2 is freed at the end.
 */
1049 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1050 unsigned long end, struct sp_node *new)
1052 struct sp_node *n, *new2 = NULL;
1055 spin_lock(&sp->lock);
1056 n = sp_lookup(sp, start, end);
1057 /* Take care of old policies in the same range. */
1058 while (n && n->start < end) {
1059 struct rb_node *next = rb_next(&n->nd);
1060 if (n->start >= start) {
1066 /* Old policy spanning whole new range. */
1069 spin_unlock(&sp->lock);
1070 new2 = sp_alloc(end, n->end, n->policy);
1076 sp_insert(sp, new2);
1084 n = rb_entry(next, struct sp_node, nd);
1088 spin_unlock(&sp->lock);
/* Allocated split node turned out unneeded: release it. */
1090 mpol_free(new2->policy);
1091 kmem_cache_free(sn_cache, new2);
/*
 * Record policy `npol` for the file range backing `vma` in the shared
 * policy tree `info`; a NULL npol clears the range. The freshly
 * allocated sp_node is released if the replace fails.
 */
1096 int mpol_set_shared_policy(struct shared_policy *info,
1097 struct vm_area_struct *vma, struct mempolicy *npol)
1100 struct sp_node *new = NULL;
1101 unsigned long sz = vma_pages(vma);
1103 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1105 sz, npol? npol->policy : -1,
1106 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1109 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1113 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1115 kmem_cache_free(sn_cache, new);
1119 /* Free a backing policy store on inode delete. */
/*
 * Tear down the whole shared-policy rb-tree under p->lock, releasing
 * each node's policy reference and returning the nodes to the slab.
 */
1120 void mpol_free_shared_policy(struct shared_policy *p)
1123 struct rb_node *next;
1125 if (!p->root.rb_node)
1127 spin_lock(&p->lock);
1128 next = rb_first(&p->root);
1130 n = rb_entry(next, struct sp_node, nd);
1131 next = rb_next(&n->nd);
1132 rb_erase(&n->nd, &p->root);
1133 mpol_free(n->policy);
1134 kmem_cache_free(sn_cache, n);
1136 spin_unlock(&p->lock);
1139 /* assumes fs == KERNEL_DS */
/*
 * Boot-time setup: create the two slab caches (SLAB_PANIC: boot fails
 * on OOM here) and switch init to interleave across all online nodes
 * so early allocations spread instead of piling onto node 0.
 */
1140 void __init numa_policy_init(void)
1142 policy_cache = kmem_cache_create("numa_policy",
1143 sizeof(struct mempolicy),
1144 0, SLAB_PANIC, NULL, NULL);
1146 sn_cache = kmem_cache_create("shared_policy_node",
1147 sizeof(struct sp_node),
1148 0, SLAB_PANIC, NULL, NULL);
1150 /* Set interleaving policy for system init. This way not all
1151 the data structures allocated at system boot end up in node zero. */
1153 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1155 printk("numa_policy_init: interleaving failed\n");
1158 /* Reset policy of current process to default.
1159 * Assumes fs == KERNEL_DS */
/* Thin wrapper: drop the caller's policy back to MPOL_DEFAULT. */
1160 void numa_default_policy(void)
1162 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);