#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
+
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd;
+ int num, max, nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes)
- zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+ /* First put in the highest zones from all nodes, then all the next
+ lower zones etc. Avoid empty zones because the memory allocator
+ doesn't like them. If you implement node hot removal you
+ have to fix that. */
+ for (k = policy_zone; k >= 0; k--) {
+ for_each_node_mask(nd, *nodes) {
+ struct zone *z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ zl->zones[num++] = z;
+ }
+ }
zl->zones[num] = NULL;
return zl;
}
}
static void gather_stats(struct page *, void *);
-static void migrate_page_add(struct vm_area_struct *vma,
- struct page *page, struct list_head *pagelist, unsigned long flags);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
if (PageReserved(page))
continue;
nid = page_to_nid(page);
if (flags & MPOL_MF_STATS)
gather_stats(page, private);
- else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- spin_unlock(ptl);
- migrate_page_add(vma, page, private, flags);
- spin_lock(ptl);
- }
+ else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ migrate_page_add(page, private, flags);
else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
int err;
struct vm_area_struct *first, *vma, *prev;
+ /* Clear the LRU lists so pages can be isolated */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ lru_add_drain_all();
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
* page migration
*/
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
- struct address_space *mapping)
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
- int rc = 1;
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (isolate_lru_page(page))
+ list_add_tail(&page->lru, pagelist);
+ }
+}
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- if (mm != vma->vm_mm) {
- rc = 0;
- goto out;
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+ struct vm_area_struct *vma, int dest)
+{
+ LIST_HEAD(newlist);
+ LIST_HEAD(moved);
+ LIST_HEAD(failed);
+ int err = 0;
+ unsigned long offset = 0;
+ int nr_pages;
+ struct page *page;
+ struct list_head *p;
+
+redo:
+ nr_pages = 0;
+ list_for_each(p, pagelist) {
+ if (vma) {
+ /*
+ * The address passed to alloc_page_vma is used to
+ * generate the proper interleave behavior. We fake
+ * the address here by an increasing offset in order
+ * to get the proper distribution of pages.
+ *
+ * No decision has been made as to which page
+ * a certain old page is moved to so we cannot
+ * specify the correct address.
+ */
+ page = alloc_page_vma(GFP_HIGHUSER, vma,
+ offset + vma->vm_start);
+ offset += PAGE_SIZE;
}
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- if (mm != vma->vm_mm) {
- rc = 0;
+ else
+ page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+ if (!page) {
+ err = -ENOMEM;
goto out;
}
+ list_add_tail(&page->lru, &newlist);
+ nr_pages++;
+ if (nr_pages > MIGRATE_CHUNK_SIZE)
+ break;
+ }
+ err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+ putback_lru_pages(&moved); /* Call release pages instead ?? */
+
+ if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+ goto redo;
out:
- spin_unlock(&mapping->i_mmap_lock);
- return rc;
+ /* Return leftover allocated pages */
+ while (!list_empty(&newlist)) {
+ page = list_entry(newlist.next, struct page, lru);
+ list_del(&page->lru);
+ __free_page(page);
+ }
+ list_splice(&failed, pagelist);
+ if (err < 0)
+ return err;
+
+ /* Calculate number of leftover pages */
+ nr_pages = 0;
+ list_for_each(p, pagelist)
+ nr_pages++;
+ return nr_pages;
}
/*
- * Add a page to be migrated to the pagelist
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
*/
-static void migrate_page_add(struct vm_area_struct *vma,
- struct page *page, struct list_head *pagelist, unsigned long flags)
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
{
- /*
- * Avoid migrating a page that is shared by others and not writable.
- */
- if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
- mapping_writably_mapped(page->mapping) ||
- single_mm_mapping(vma->vm_mm, page->mapping)) {
- int rc = isolate_lru_page(page);
-
- if (rc == 1)
- list_add(&page->lru, pagelist);
- /*
- * If the isolate attempt was not successful then we just
- * encountered an unswappable page. Something must be wrong.
- */
- WARN_ON(rc == 0);
- }
-}
+ nodemask_t nmask;
+ LIST_HEAD(pagelist);
+ int err = 0;
-static int swap_pages(struct list_head *pagelist)
-{
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int n;
+ nodes_clear(nmask);
+ node_set(source, nmask);
- n = migrate_pages(pagelist, NULL, &moved, &failed);
- putback_lru_pages(&failed);
- putback_lru_pages(&moved);
+ check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- return n;
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages_to(&pagelist, NULL, dest);
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+ }
+ return err;
}
/*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
*
* Returns the number of page that could not be moved.
*/
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
LIST_HEAD(pagelist);
- int count = 0;
- nodemask_t nodes;
+ int busy = 0;
+ int err = 0;
+ nodemask_t tmp;
- nodes_andnot(nodes, *from_nodes, *to_nodes);
+ down_read(&mm->mmap_sem);
- down_read(&mm->mmap_sem);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scannng from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
- if (!list_empty(&pagelist)) {
- count = swap_pages(&pagelist);
- putback_lru_pages(&pagelist);
+ tmp = *from_nodes;
+ while (!nodes_empty(tmp)) {
+ int s,d;
+ int source = -1;
+ int dest = 0;
+
+ for_each_node_mask(s, tmp) {
+ d = node_remap(s, *from_nodes, *to_nodes);
+ if (s == d)
+ continue;
+
+ source = s; /* Node moved. Memorize */
+ dest = d;
+
+ /* dest not in remaining from nodes? */
+ if (!node_isset(dest, tmp))
+ break;
+ }
+ if (source == -1)
+ break;
+
+ node_clear(source, tmp);
+ err = migrate_to_node(mm, source, dest, flags);
+ if (err > 0)
+ busy += err;
+ if (err < 0)
+ break;
}
up_read(&mm->mmap_sem);
- return count;
+ if (err < 0)
+ return err;
+ return busy;
}
long do_mbind(unsigned long start, unsigned long len,
int nr_failed = 0;
err = mbind_range(vma, start, end, new);
+
if (!list_empty(&pagelist))
- nr_failed = swap_pages(&pagelist);
+ nr_failed = migrate_pages_to(&pagelist, vma, -1);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
return nid;
}
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+ switch (policy->policy) {
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND:
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+ case MPOL_PREFERRED:
+ if (policy->v.preferred_node >= 0)
+ return policy->v.preferred_node;
+ /* Fall through */
+
+ default:
+ return numa_node_id();
+ }
+}
+
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
return interleave_nodes(pol);
}
+#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
+#endif
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
return 0;
}
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+ nodemask_t *policy_nodes)
+{
+ info->root = RB_ROOT;
+ spin_lock_init(&info->lock);
+
+ if (policy != MPOL_DEFAULT) {
+ struct mempolicy *newpol;
+
+ /* Falls back to MPOL_DEFAULT on any error */
+ newpol = mpol_new(policy, policy_nodes);
+ if (!IS_ERR(newpol)) {
+ /* Create pseudo-vma that contains just the policy */
+ struct vm_area_struct pvma;
+
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ /* Policy covers entire file */
+ pvma.vm_end = TASK_SIZE;
+ mpol_set_shared_policy(info, &pvma, newpol);
+ mpol_free(newpol);
+ }
+ }
+}
+
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{