2 * Simple NUMA memory policy for the Linux kernel.
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
7 * NUMA policy allows the user to give hints in which node(s) memory should
10 * Support four policies per VMA and per process:
12 * The VMA policy has priority over the process policy for a page fault.
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy a process counter
20 * bind Only allocate memory on a specific set of nodes,
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem, kernel lowmem allocations don't get policied.
43 * Same with GFP_DMA allocations.
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
50 fix mmap readahead to honour policy and enable policy for any page cache
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
55 handle mremap for shared memory (currently ignored for the policy)
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
62 #include <linux/mempolicy.h>
64 #include <linux/highmem.h>
65 #include <linux/hugetlb.h>
66 #include <linux/kernel.h>
67 #include <linux/sched.h>
69 #include <linux/nodemask.h>
70 #include <linux/cpuset.h>
71 #include <linux/gfp.h>
72 #include <linux/slab.h>
73 #include <linux/string.h>
74 #include <linux/module.h>
75 #include <linux/interrupt.h>
76 #include <linux/init.h>
77 #include <linux/compat.h>
78 #include <linux/mempolicy.h>
79 #include <asm/tlbflush.h>
80 #include <asm/uaccess.h>
/*
 * Slab caches for mempolicy objects and shared-policy rb-tree nodes;
 * both are created in numa_policy_init().
 */
82 static kmem_cache_t *policy_cache;
83 static kmem_cache_t *sn_cache;
/* Debug print hook: compiled out (expands to nothing). */
85 #define PDprintk(fmt...)
87 /* Highest zone. A specific allocation for a zone below that is not
89 static int policy_zone;
/*
 * Fallback policy object (MPOL_DEFAULT) with a pinned refcount so it is
 * never freed; used whenever no task/VMA policy applies.
 */
91 struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
96 /* Do sanity checking on a policy */
/*
 * Validate a (mode, nodemask) pair from userspace.
 * Returns 0 when acceptable, -EINVAL when the mask is not a subset of
 * the currently online nodes.
 * NOTE(review): this listing elides lines (original numbering jumps),
 * so the full switch body is not visible here.
 */
97 static int mpol_check_policy(int mode, nodemask_t *nodes)
99 int empty = nodes_empty(*nodes);
107 case MPOL_INTERLEAVE:
108 /* Preferred will only use the first bit, but allow
114 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
117 /* Copy a node mask from user space. */
/*
 * Copy a user-supplied node bitmap into *nodes, mask off bits beyond
 * maxnode, restrict it to the cpuset's mems_allowed, and finally
 * validate it against the requested mode via mpol_check_policy().
 * When the user passes more bits than MAX_NUMNODES, the excess part
 * must be all zero.
 */
118 static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
119 unsigned long maxnode, int mode)
122 unsigned long nlongs;
123 unsigned long endmask;
127 if (maxnode == 0 || !nmask)
130 nlongs = BITS_TO_LONGS(maxnode);
131 if ((maxnode % BITS_PER_LONG) == 0)
/* Mask selecting the valid bits of the last long of the user mask. */
134 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
136 /* When the user specified more nodes than supported just check
137 if the non supported part is all zero. */
138 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
139 if (nlongs > PAGE_SIZE/sizeof(long))
141 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
143 if (get_user(t, nmask + k))
145 if (k == nlongs - 1) {
151 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
155 if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
157 nodes_addr(*nodes)[nlongs-1] &= endmask;
158 /* Update current mems_allowed */
159 cpuset_update_current_mems_allowed();
160 /* Ignore nodes not set in current->mems_allowed */
161 /* AK: shouldn't this error out instead? */
162 cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
163 return mpol_check_policy(mode, nodes);
166 /* Generate a custom zonelist for the BIND policy. */
/*
 * Build a kmalloc'd, NULL-terminated zonelist containing, for each node
 * in *nodes, its zones with present pages, highest zone first.
 * Ownership transfers to the caller (freed via kfree in __mpol_free).
 */
167 static struct zonelist *bind_zonelist(nodemask_t *nodes)
172 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
173 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
177 for_each_node_mask(nd, *nodes) {
179 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
180 struct zone *z = &NODE_DATA(nd)->node_zones[k];
/* Skip zones with no memory on this node. */
181 if (!z->present_pages)
183 zl->zones[num++] = z;
189 zl->zones[num] = NULL;
193 /* Create a new policy */
/*
 * Allocate and initialize a mempolicy for (mode, nodes).
 * Returns ERR_PTR(-ENOMEM) on allocation failure; MPOL_DEFAULT takes
 * an early path (body elided in this listing). Refcount starts at 1.
 */
194 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
196 struct mempolicy *policy;
198 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
199 if (mode == MPOL_DEFAULT)
201 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
203 return ERR_PTR(-ENOMEM);
204 atomic_set(&policy->refcnt, 1);
206 case MPOL_INTERLEAVE:
207 policy->v.nodes = *nodes;
/* Preferred: remember only the first node; -1 means "local node". */
210 policy->v.preferred_node = first_node(*nodes);
211 if (policy->v.preferred_node >= MAX_NUMNODES)
212 policy->v.preferred_node = -1;
/* Bind: build the custom zonelist; roll back on failure. */
215 policy->v.zonelist = bind_zonelist(nodes);
216 if (policy->v.zonelist == NULL) {
217 kmem_cache_free(policy_cache, policy);
218 return ERR_PTR(-ENOMEM);
222 policy->policy = mode;
226 /* Ensure all existing pages follow the policy. */
/*
 * Walk the PTEs of one pmd under mm->page_table_lock and check that
 * every present page lives on a node in *nodes.
 */
227 static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
228 unsigned long addr, unsigned long end, nodemask_t *nodes)
233 spin_lock(&mm->page_table_lock);
234 orig_pte = pte = pte_offset_map(pmd, addr);
239 if (!pte_present(*pte))
244 nid = pfn_to_nid(pfn);
245 if (!node_isset(nid, *nodes))
247 } while (pte++, addr += PAGE_SIZE, addr != end);
249 spin_unlock(&mm->page_table_lock);
/*
 * Three parallel one-level page-table walkers: each iterates its level
 * of the page table over [addr, end), skips empty/bad entries, and
 * recurses into the next lower level, bubbling up a nonzero result.
 */
253 static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
254 unsigned long addr, unsigned long end, nodemask_t *nodes)
259 pmd = pmd_offset(pud, addr);
261 next = pmd_addr_end(addr, end);
262 if (pmd_none_or_clear_bad(pmd))
264 if (check_pte_range(mm, pmd, addr, next, nodes))
266 } while (pmd++, addr = next, addr != end);
270 static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
271 unsigned long addr, unsigned long end, nodemask_t *nodes)
276 pud = pud_offset(pgd, addr);
278 next = pud_addr_end(addr, end);
279 if (pud_none_or_clear_bad(pud))
281 if (check_pmd_range(mm, pud, addr, next, nodes))
283 } while (pud++, addr = next, addr != end);
/* Top-level entry: walk the whole range starting from the pgd. */
287 static inline int check_pgd_range(struct mm_struct *mm,
288 unsigned long addr, unsigned long end, nodemask_t *nodes)
293 pgd = pgd_offset(mm, addr);
295 next = pgd_addr_end(addr, end);
296 if (pgd_none_or_clear_bad(pgd))
298 if (check_pud_range(mm, pgd, addr, next, nodes))
300 } while (pgd++, addr = next, addr != end);
304 /* Step 1: check the range */
/*
 * Verify that [start, end) is fully covered by VMAs (any hole yields
 * ERR_PTR(-EFAULT)); with MPOL_MF_STRICT also verify that existing
 * pages in non-hugetlb VMAs already obey *nodes.
 * Returns the first VMA of the range, or an ERR_PTR on failure.
 */
305 static struct vm_area_struct *
306 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
307 nodemask_t *nodes, unsigned long flags)
310 struct vm_area_struct *first, *vma, *prev;
312 first = find_vma(mm, start);
314 return ERR_PTR(-EFAULT);
316 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
/* Range extends past the last VMA: hole at the end. */
317 if (!vma->vm_next && vma->vm_end < end)
318 return ERR_PTR(-EFAULT);
/* Gap between consecutive VMAs: hole in the middle. */
319 if (prev && prev->vm_end < vma->vm_start)
320 return ERR_PTR(-EFAULT);
321 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
322 unsigned long endvma = vma->vm_end;
325 if (vma->vm_start > start)
326 start = vma->vm_start;
327 err = check_pgd_range(vma->vm_mm,
328 start, endvma, nodes);
330 first = ERR_PTR(err);
339 /* Apply policy to a single VMA */
/*
 * Install `new` as the policy of one VMA, preferring the VMA's own
 * set_policy operation when present; otherwise store it directly in
 * vma->vm_policy (old policy saved first for release on success —
 * elided lines presumably handle that; verify against full source).
 */
340 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
343 struct mempolicy *old = vma->vm_policy;
345 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
346 vma->vm_start, vma->vm_end, vma->vm_pgoff,
347 vma->vm_ops, vma->vm_file,
348 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
350 if (vma->vm_ops && vma->vm_ops->set_policy)
351 err = vma->vm_ops->set_policy(vma, new);
354 vma->vm_policy = new;
360 /* Step 2: apply policy to a range and do splits. */
/*
 * Split VMAs that straddle the range boundaries, then apply `new` to
 * each VMA inside [start, end) via policy_vma().
 */
361 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
362 unsigned long end, struct mempolicy *new)
364 struct vm_area_struct *next;
368 for (; vma && vma->vm_start < end; vma = next) {
/* Split off the part below `start` / above `end` first. */
370 if (vma->vm_start < start)
371 err = split_vma(vma->vm_mm, vma, start, 1);
372 if (!err && vma->vm_end > end)
373 err = split_vma(vma->vm_mm, vma, end, 0);
375 err = policy_vma(vma, new);
382 /* Change policy for a memory range */
/*
 * mbind(2): set the NUMA policy `mode`/`nmask` for [start, start+len).
 * Validates flags and alignment, builds the policy, then under
 * mmap_sem (write) checks the range and rewrites the VMAs.
 */
383 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
385 unsigned long __user *nmask, unsigned long maxnode,
388 struct vm_area_struct *vma;
389 struct mm_struct *mm = current->mm;
390 struct mempolicy *new;
395 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
397 if (start & ~PAGE_MASK)
/* STRICT is meaningless for DEFAULT; quietly drop it. */
399 if (mode == MPOL_DEFAULT)
400 flags &= ~MPOL_MF_STRICT;
401 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
408 err = get_nodes(&nodes, nmask, maxnode, mode);
412 new = mpol_new(mode, &nodes);
416 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
417 mode,nodes_addr(nodes)[0]);
419 down_write(&mm->mmap_sem);
420 vma = check_range(mm, start, end, &nodes, flags);
423 err = mbind_range(vma, start, end, new);
424 up_write(&mm->mmap_sem);
429 /* Set the process memory policy */
/*
 * set_mempolicy(2): replace current->mempolicy with a newly built one,
 * freeing the old policy. For interleave, seed il_next with the first
 * node of the mask so round-robin allocation starts there.
 */
430 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
431 unsigned long maxnode)
434 struct mempolicy *new;
437 if (mode < 0 || mode > MPOL_MAX)
439 err = get_nodes(&nodes, nmask, maxnode, mode);
442 new = mpol_new(mode, &nodes);
445 mpol_free(current->mempolicy);
446 current->mempolicy = new;
447 if (new && new->policy == MPOL_INTERLEAVE)
448 current->il_next = first_node(new->v.nodes);
452 /* Fill a zone bitmap for a policy */
/*
 * Convert a mempolicy into a nodemask: BIND collects the nodes of its
 * zonelist, INTERLEAVE copies its nodemask (elided here), PREFERRED
 * sets its one node, or the whole online map when the node is -1
 * (local allocation).
 */
453 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
460 for (i = 0; p->v.zonelist->zones[i]; i++)
461 node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
465 case MPOL_INTERLEAVE:
469 /* or use current node instead of online map? */
470 if (p->v.preferred_node < 0)
471 *nodes = node_online_map;
473 node_set(p->v.preferred_node, *nodes);
/*
 * Fault in the page at `addr` via get_user_pages() and return the id
 * of the node it resides on (or the error from get_user_pages).
 */
480 static int lookup_node(struct mm_struct *mm, unsigned long addr)
485 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
487 err = page_to_nid(p);
493 /* Copy a kernel node mask to user space */
/*
 * Copy *nodes out to the user's bitmap. The user buffer may be larger
 * than the kernel mask; the tail beyond `nbytes` is cleared so the
 * user never sees stale data. Returns 0 or -EFAULT.
 */
494 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
497 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
498 const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
501 if (copy > PAGE_SIZE)
503 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
507 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
510 /* Retrieve NUMA policy */
/*
 * get_mempolicy(2): report either the task policy or (MPOL_F_ADDR) the
 * policy of the VMA containing `addr`. With MPOL_F_NODE the returned
 * value is a node number (the page's node for F_ADDR, or il_next for
 * an interleaving task policy). The nodemask, when requested, is
 * produced via get_zonemask()/copy_nodes_to_user().
 *
 * Fix: the two up_read() calls below had `&current` mangled into the
 * stray byte sequence "¤t" (an HTML-entity encoding artifact);
 * restored to `&current->mm->mmap_sem`, matching the down_read() on
 * the same semaphore above.
 */
511 asmlinkage long sys_get_mempolicy(int __user *policy,
512 unsigned long __user *nmask,
513 unsigned long maxnode,
514 unsigned long addr, unsigned long flags)
517 struct mm_struct *mm = current->mm;
518 struct vm_area_struct *vma = NULL;
519 struct mempolicy *pol = current->mempolicy;
521 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
523 if (nmask != NULL && maxnode < MAX_NUMNODES)
525 if (flags & MPOL_F_ADDR) {
526 down_read(&mm->mmap_sem);
527 vma = find_vma_intersection(mm, addr, addr+1);
529 up_read(&mm->mmap_sem);
532 if (vma->vm_ops && vma->vm_ops->get_policy)
533 pol = vma->vm_ops->get_policy(vma, addr);
535 pol = vma->vm_policy;
540 pol = &default_policy;
542 if (flags & MPOL_F_NODE) {
543 if (flags & MPOL_F_ADDR) {
544 err = lookup_node(mm, addr);
548 } else if (pol == current->mempolicy &&
549 pol->policy == MPOL_INTERLEAVE) {
550 pval = current->il_next;
559 up_read(&current->mm->mmap_sem);
563 if (policy && put_user(pval, policy))
569 get_zonemask(pol, &nodes);
570 err = copy_nodes_to_user(nmask, maxnode, &nodes);
575 up_read(&current->mm->mmap_sem);
/*
 * 32-bit compat wrapper for get_mempolicy(2): allocates a native-width
 * bitmap in compat user space, calls sys_get_mempolicy(), then copies
 * the result back as a compat bitmap, zeroing the user's buffer first.
 */
581 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
582 compat_ulong_t __user *nmask,
583 compat_ulong_t maxnode,
584 compat_ulong_t addr, compat_ulong_t flags)
587 unsigned long __user *nm = NULL;
588 unsigned long nr_bits, alloc_size;
589 DECLARE_BITMAP(bm, MAX_NUMNODES);
591 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
592 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
595 nm = compat_alloc_user_space(alloc_size);
597 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
600 err = copy_from_user(bm, nm, alloc_size);
601 /* ensure entire bitmap is zeroed */
602 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
603 err |= compat_put_bitmap(nmask, bm, nr_bits);
/*
 * 32-bit compat wrapper for set_mempolicy(2): read the compat bitmap
 * into a kernel buffer, stage it as a native-width bitmap in compat
 * user space, and forward to sys_set_mempolicy().
 */
609 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
610 compat_ulong_t maxnode)
613 unsigned long __user *nm = NULL;
614 unsigned long nr_bits, alloc_size;
615 DECLARE_BITMAP(bm, MAX_NUMNODES);
617 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
618 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
621 err = compat_get_bitmap(bm, nmask, nr_bits);
622 nm = compat_alloc_user_space(alloc_size);
623 err |= copy_to_user(nm, bm, alloc_size);
629 return sys_set_mempolicy(mode, nm, nr_bits+1);
/*
 * 32-bit compat wrapper for mbind(2): same bitmap-staging scheme as
 * compat_sys_set_mempolicy(), then forward to sys_mbind().
 */
632 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
633 compat_ulong_t mode, compat_ulong_t __user *nmask,
634 compat_ulong_t maxnode, compat_ulong_t flags)
637 unsigned long __user *nm = NULL;
638 unsigned long nr_bits, alloc_size;
641 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
642 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
645 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
646 nm = compat_alloc_user_space(alloc_size);
647 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
653 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
658 /* Return effective policy for a VMA */
/*
 * Resolve the effective policy with the documented precedence:
 * VMA get_policy op > non-default vma->vm_policy > task policy >
 * global default_policy.
 */
660 get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
662 struct mempolicy *pol = task->mempolicy;
665 if (vma->vm_ops && vma->vm_ops->get_policy)
666 pol = vma->vm_ops->get_policy(vma, addr);
667 else if (vma->vm_policy &&
668 vma->vm_policy->policy != MPOL_DEFAULT)
669 pol = vma->vm_policy;
672 pol = &default_policy;
676 /* Return a zonelist representing a mempolicy */
/*
 * Map a policy to the zonelist to allocate from for `gfp`. BIND only
 * applies at or above policy_zone and when still valid for the cpuset;
 * otherwise fall through to the node's standard zonelist. INTERLEAVE
 * must be handled by the interleave allocation path, not here.
 */
677 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
681 switch (policy->policy) {
683 nd = policy->v.preferred_node;
688 /* Lower zones don't get a policy applied */
689 /* Careful: current->mems_allowed might have moved */
690 if (gfp_zone(gfp) >= policy_zone)
691 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
692 return policy->v.zonelist;
694 case MPOL_INTERLEAVE: /* should not happen */
702 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
705 /* Do dynamic interleaving for a process */
/*
 * Round-robin node selection for task-level interleave: advance to the
 * next node in the policy's mask, wrapping to the first node at the
 * end of the mask.
 */
706 static unsigned interleave_nodes(struct mempolicy *policy)
709 struct task_struct *me = current;
712 BUG_ON(nid >= MAX_NUMNODES);
713 next = next_node(nid, policy->v.nodes);
714 if (next >= MAX_NUMNODES)
715 next = first_node(policy->v.nodes);
720 /* Do static interleaving for a VMA with known offset. */
/*
 * Deterministic node choice for a VMA page: pick the (off mod weight)-th
 * set node of the policy's mask, so the same offset always maps to the
 * same node.
 */
721 static unsigned offset_il_node(struct mempolicy *pol,
722 struct vm_area_struct *vma, unsigned long off)
724 unsigned nnodes = nodes_weight(pol->v.nodes);
725 unsigned target = (unsigned)off % nnodes;
731 nid = next_node(nid, pol->v.nodes);
733 } while (c <= target);
734 BUG_ON(nid >= MAX_NUMNODES);
738 /* Allocate a page in interleaved policy.
739 Own path because it needs to do special accounting. */
/*
 * Allocate from node `nid`'s standard zonelist and, when the page came
 * from the first (preferred) zone, bump the per-cpu interleave_hit
 * statistic.
 */
740 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
745 BUG_ON(!node_online(nid));
746 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
747 page = __alloc_pages(gfp, order, zl);
748 if (page && page_zone(page) == zl->zones[0]) {
749 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
756 * alloc_page_vma - Allocate a page for a VMA.
759 * %GFP_USER user allocation.
760 * %GFP_KERNEL kernel allocations,
761 * %GFP_HIGHMEM highmem/user allocations,
762 * %GFP_FS allocation should not call back into a file system.
763 * %GFP_ATOMIC don't sleep.
765 * @vma: Pointer to VMA or NULL if not available.
766 * @addr: Virtual Address of the allocation. Must be inside the VMA.
768 * This function allocates a page from the kernel page pool and applies
769 * a NUMA policy associated with the VMA or the current process.
770 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
771 * mm_struct of the VMA to prevent it from going away. Should be used for
772 * all allocations for pages that will be mapped into
773 * user space. Returns NULL when no page can be allocated.
775 * Should be called with the mm_sem of the vma hold.
778 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
780 struct mempolicy *pol = get_vma_policy(current, vma, addr);
782 cpuset_update_current_mems_allowed();
/* Interleave takes its own path: pick the node by offset (VMA known)
   or fall back to the task's round-robin counter. */
784 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
788 BUG_ON(addr >= vma->vm_end);
789 BUG_ON(addr < vma->vm_start);
791 off += (addr - vma->vm_start) >> PAGE_SHIFT;
792 nid = offset_il_node(pol, vma, off);
794 /* fall back to process interleaving */
795 nid = interleave_nodes(pol);
797 return alloc_page_interleave(gfp, 0, nid);
799 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
803 * alloc_pages_current - Allocate pages.
806 * %GFP_USER user allocation,
807 * %GFP_KERNEL kernel allocation,
808 * %GFP_HIGHMEM highmem allocation,
809 * %GFP_FS don't call back into a file system.
810 * %GFP_ATOMIC don't sleep.
811 * @order: Power of two of allocation size in pages. 0 is a single page.
813 * Allocate a page from the kernel page pool. When not in
814 * interrupt context and apply the current process NUMA policy.
815 * Returns NULL when no page can be allocated.
817 * Don't call cpuset_update_current_mems_allowed() unless
818 * 1) it's ok to take cpuset_sem (can WAIT), and
819 * 2) allocating for current task (not interrupt).
821 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
823 struct mempolicy *pol = current->mempolicy;
825 if ((gfp & __GFP_WAIT) && !in_interrupt())
826 cpuset_update_current_mems_allowed();
/* Interrupts and tasks without a policy use the global default. */
827 if (!pol || in_interrupt())
828 pol = &default_policy;
829 if (pol->policy == MPOL_INTERLEAVE)
830 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
831 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
833 EXPORT_SYMBOL(alloc_pages_current);
835 /* Slow path of a mempolicy copy */
/*
 * Duplicate a mempolicy with a fresh refcount of 1. BIND policies also
 * deep-copy their kmalloc'd zonelist (sized via ksize of the original).
 * Returns ERR_PTR(-ENOMEM) when either allocation fails.
 */
836 struct mempolicy *__mpol_copy(struct mempolicy *old)
838 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
841 return ERR_PTR(-ENOMEM);
843 atomic_set(&new->refcnt, 1);
844 if (new->policy == MPOL_BIND) {
845 int sz = ksize(old->v.zonelist);
846 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
847 if (!new->v.zonelist) {
848 kmem_cache_free(policy_cache, new);
849 return ERR_PTR(-ENOMEM);
851 memcpy(new->v.zonelist, old->v.zonelist, sz);
856 /* Slow path of a mempolicy comparison */
/*
 * Compare two policies by mode and mode-specific payload: nodemask for
 * interleave, preferred node for preferred, and element-wise zonelist
 * comparison (including equal length) for bind.
 */
857 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
861 if (a->policy != b->policy)
866 case MPOL_INTERLEAVE:
867 return nodes_equal(a->v.nodes, b->v.nodes);
869 return a->v.preferred_node == b->v.preferred_node;
872 for (i = 0; a->v.zonelist->zones[i]; i++)
873 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
875 return b->v.zonelist->zones[i] == NULL;
883 /* Slow path of a mpol destructor. */
/*
 * Drop a reference; on the last one free the BIND zonelist (if any)
 * and return the object to the policy slab cache.
 */
884 void __mpol_free(struct mempolicy *p)
886 if (!atomic_dec_and_test(&p->refcnt))
888 if (p->policy == MPOL_BIND)
889 kfree(p->v.zonelist);
890 p->policy = MPOL_DEFAULT;
891 kmem_cache_free(policy_cache, p);
895 * Hugetlb policy. Same as above, just works with node numbers instead of
899 /* Find first node suitable for an allocation */
/*
 * Return the node id to try first under the effective policy for
 * (vma, addr): local node for default, first zonelist node for bind,
 * round-robin node for interleave, and the preferred node (or local
 * when it is -1) for preferred.
 */
900 int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
902 struct mempolicy *pol = get_vma_policy(current, vma, addr);
904 switch (pol->policy) {
906 return numa_node_id();
908 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
909 case MPOL_INTERLEAVE:
910 return interleave_nodes(pol);
912 return pol->v.preferred_node >= 0 ?
913 pol->v.preferred_node : numa_node_id();
919 /* Find secondary valid nodes for an allocation */
/*
 * Decide whether node `nid` is an acceptable fallback under the
 * effective policy; for bind this scans the policy's zonelist for a
 * zone on that node.
 */
920 int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
922 struct mempolicy *pol = get_vma_policy(current, vma, addr);
924 switch (pol->policy) {
927 case MPOL_INTERLEAVE:
931 for (z = pol->v.zonelist->zones; *z; z++)
932 if ((*z)->zone_pgdat->node_id == nid)
943 * Shared memory backing store policy support.
945 * Remember policies even when nobody has shared memory mapped.
946 * The policies are kept in Red-Black tree linked from the inode.
947 * They are protected by the sp->lock spinlock, which should be held
948 * for any accesses to the tree.
951 /* lookup first element intersecting start-end */
952 /* Caller holds sp->lock */
/*
 * Standard rb-tree descent keyed on [start, end) intervals, then a
 * backwards scan via rb_prev() to land on the first node that still
 * intersects the query range. Returns NULL when nothing intersects.
 */
953 static struct sp_node *
954 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
956 struct rb_node *n = sp->root.rb_node;
959 struct sp_node *p = rb_entry(n, struct sp_node, nd);
963 else if (end <= p->start)
971 struct sp_node *w = NULL;
972 struct rb_node *prev = rb_prev(n);
975 w = rb_entry(prev, struct sp_node, nd);
980 return rb_entry(n, struct sp_node, nd);
983 /* Insert a new shared policy into the list. */
984 /* Caller holds sp->lock */
/*
 * Standard rb-tree insertion ordered by the node's range; callers
 * guarantee the new range does not overlap an existing one.
 */
985 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
987 struct rb_node **p = &sp->root.rb_node;
988 struct rb_node *parent = NULL;
993 nd = rb_entry(parent, struct sp_node, nd);
994 if (new->start < nd->start)
996 else if (new->end > nd->end)
1001 rb_link_node(&new->nd, parent, p);
1002 rb_insert_color(&new->nd, &sp->root);
1003 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1004 new->policy ? new->policy->policy : 0);
1007 /* Find shared policy intersecting idx */
/*
 * Look up the shared policy covering page index `idx` under sp->lock,
 * taking a reference (mpol_get) on the found policy before returning.
 * Returns NULL when the tree is empty or nothing covers idx.
 */
1009 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1011 struct mempolicy *pol = NULL;
1014 if (!sp->root.rb_node)
1016 spin_lock(&sp->lock);
1017 sn = sp_lookup(sp, idx, idx+1);
1019 mpol_get(sn->policy);
1022 spin_unlock(&sp->lock);
/*
 * Unlink a node from the shared-policy tree, drop its policy reference
 * and return the node to the slab cache. Caller holds sp->lock.
 *
 * Fix: the debug format string read "%lx-l%x" (stray 'l' outside the
 * conversion), which would print a literal 'l' and format n->end with
 * %x instead of %lx; corrected to "%lx-%lx" to match the two unsigned
 * long arguments.
 */
1026 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1028 PDprintk("deleting %lx-%lx\n", n->start, n->end);
1029 rb_erase(&n->nd, &sp->root);
1030 mpol_free(n->policy);
1031 kmem_cache_free(sn_cache, n);
/*
 * Allocate an sp_node from the slab cache for range [start, end) with
 * policy `pol` (initialization of the fields is elided in this listing).
 */
1035 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1037 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1048 /* Replace a policy range. */
/*
 * Install `new` over [start, end), removing or trimming any old nodes
 * it overlaps. An old node that spans the whole new range is split:
 * sp->lock must be dropped to allocate the tail piece (new2), hence
 * the unlock/alloc/re-insert dance; unused new2 is freed at the end.
 */
1049 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1050 unsigned long end, struct sp_node *new)
1052 struct sp_node *n, *new2 = NULL;
1055 spin_lock(&sp->lock);
1056 n = sp_lookup(sp, start, end);
1057 /* Take care of old policies in the same range. */
1058 while (n && n->start < end) {
1059 struct rb_node *next = rb_next(&n->nd);
1060 if (n->start >= start) {
1066 /* Old policy spanning whole new range. */
1069 spin_unlock(&sp->lock);
1070 new2 = sp_alloc(end, n->end, n->policy);
1076 sp_insert(sp, new2);
1084 n = rb_entry(next, struct sp_node, nd);
1088 spin_unlock(&sp->lock);
/* Allocated split node turned out unneeded: release it. */
1090 mpol_free(new2->policy);
1091 kmem_cache_free(sn_cache, new2);
/*
 * Record policy `npol` for the file range backing `vma` in the shared
 * policy tree `info`; a NULL npol clears the range. The freshly
 * allocated sp_node is released if the replace fails.
 */
1096 int mpol_set_shared_policy(struct shared_policy *info,
1097 struct vm_area_struct *vma, struct mempolicy *npol)
1100 struct sp_node *new = NULL;
1101 unsigned long sz = vma_pages(vma);
1103 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1105 sz, npol? npol->policy : -1,
1106 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1109 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1113 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1115 kmem_cache_free(sn_cache, new);
1119 /* Free a backing policy store on inode delete. */
/*
 * Tear down the whole shared-policy rb-tree under p->lock, releasing
 * each node's policy reference and returning the nodes to the slab.
 */
1120 void mpol_free_shared_policy(struct shared_policy *p)
1123 struct rb_node *next;
1125 if (!p->root.rb_node)
1127 spin_lock(&p->lock);
1128 next = rb_first(&p->root);
1130 n = rb_entry(next, struct sp_node, nd);
1131 next = rb_next(&n->nd);
1132 rb_erase(&n->nd, &p->root);
1133 mpol_free(n->policy);
1134 kmem_cache_free(sn_cache, n);
1136 spin_unlock(&p->lock);
1139 /* assumes fs == KERNEL_DS */
/*
 * Boot-time setup: create the two slab caches (SLAB_PANIC: boot fails
 * on OOM here) and switch init to interleave across all online nodes
 * so early allocations spread instead of piling onto node 0.
 */
1140 void __init numa_policy_init(void)
1142 policy_cache = kmem_cache_create("numa_policy",
1143 sizeof(struct mempolicy),
1144 0, SLAB_PANIC, NULL, NULL);
1146 sn_cache = kmem_cache_create("shared_policy_node",
1147 sizeof(struct sp_node),
1148 0, SLAB_PANIC, NULL, NULL);
1150 /* Set interleaving policy for system init. This way not all
1151 the data structures allocated at system boot end up in node zero. */
1153 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1155 printk("numa_policy_init: interleaving failed\n");
1158 /* Reset policy of current process to default.
1159 * Assumes fs == KERNEL_DS */
/* Thin wrapper: drop the caller's policy back to MPOL_DEFAULT. */
1160 void numa_default_policy(void)
1162 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);