} pageout_t;
struct scan_control {
- /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
- unsigned long nr_to_scan;
-
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
unsigned long nr_mapped; /* From page_state */
- /* Ask shrink_caches, or shrink_zone to scan at this priority */
- unsigned int priority;
-
/* This context's GFP mask */
gfp_t gfp_mask;
*
* Returns the number of slab objects which we shrunk.
*/
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+ unsigned long lru_pages)
{
struct shrinker *shrinker;
- int ret = 0;
+ unsigned long ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
/*
* shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
*/
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_list(struct list_head *page_list,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
- int reclaimed = 0;
+ unsigned long reclaimed = 0;
cond_resched();
*
* returns the number of pages put back.
*/
-int putback_lru_pages(struct list_head *l)
+unsigned long putback_lru_pages(struct list_head *l)
{
struct page *page;
struct page *page2;
- int count = 0;
+ unsigned long count = 0;
list_for_each_entry_safe(page, page2, l, lru) {
move_to_lru(page);
* the page.
*/
if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
- return 1;
+ return -EAGAIN;
/*
* Establish swap ptes for anonymous pages or destroy pte
* If the page was not migrated then the PageSwapCache bit
* is still set and the operation may continue.
*/
- try_to_unmap(page, 1);
+ if (try_to_unmap(page, 1) == SWAP_FAIL)
+ /* A vma has VM_LOCKED set -> Permanent failure */
+ return -EPERM;
/*
* Give up if we were unable to remove all mappings.
*/
if (page_mapcount(page))
- return 1;
+ return -EAGAIN;
write_lock_irq(&mapping->tree_lock);
if (!page_mapping(page) || page_count(page) != nr_refs ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
- return 1;
+ return -EAGAIN;
}
/*
*/
int migrate_page(struct page *newpage, struct page *page)
{
+ int rc;
+
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- if (migrate_page_remove_references(newpage, page, 2))
- return -EAGAIN;
+ rc = migrate_page_remove_references(newpage, page, 2);
+
+ if (rc)
+ return rc;
migrate_page_copy(newpage, page);
*
* Return: Number of pages not migrated when "to" ran empty.
*/
-int migrate_pages(struct list_head *from, struct list_head *to,
+unsigned long migrate_pages(struct list_head *from, struct list_head *to,
struct list_head *moved, struct list_head *failed)
{
- int retry;
- int nr_failed = 0;
+ unsigned long retry;
+ unsigned long nr_failed = 0;
int pass = 0;
struct page *page;
struct page *page2;
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
- if (TestClearPageLRU(page)) {
+ if (PageLRU(page)) {
ret = 1;
get_page(page);
+ ClearPageLRU(page);
if (PageActive(page))
del_page_from_active_list(zone, page);
else
*
* returns how many pages were moved onto *@dst.
*/
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct list_head *src, struct list_head *dst,
+ unsigned long *scanned)
{
- int nr_taken = 0;
+ unsigned long nr_taken = 0;
struct page *page;
- int scan = 0;
+ unsigned long scan = 0;
while (scan++ < nr_to_scan && !list_empty(src)) {
+ struct list_head *target;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- if (!TestClearPageLRU(page))
- BUG();
+ BUG_ON(!PageLRU(page));
+
list_del(&page->lru);
- if (get_page_testone(page)) {
+ target = src;
+ if (likely(get_page_unless_zero(page))) {
/*
- * It is being freed elsewhere
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
*/
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
+ ClearPageLRU(page);
+ target = dst;
nr_taken++;
- }
+ } /* else it is being freed elsewhere */
+
+ list_add(&page->lru, target);
}
*scanned = scan;
/*
* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
*/
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static void shrink_cache(unsigned long max_scan, struct zone *zone,
+ struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
+ unsigned long nr_scanned = 0;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- while (max_scan > 0) {
+ do {
struct page *page;
- int nr_taken;
- int nr_scan;
- int nr_freed;
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ unsigned long nr_freed;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
if (nr_taken == 0)
goto done;
- max_scan -= nr_scan;
+ nr_scanned += nr_scan;
nr_freed = shrink_list(&page_list, sc);
local_irq_disable();
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
spin_lock_irq(&zone->lru_lock);
}
}
- }
+ } while (nr_scanned < max_scan);
spin_unlock_irq(&zone->lru_lock);
done:
pagevec_release(&pvec);
* But we had to alter page->flags anyway.
*/
static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
+ struct scan_control *sc)
{
- int pgmoved;
+ unsigned long pgmoved;
int pgdeactivate = 0;
- int pgscanned;
- int nr_pages = sc->nr_to_scan;
+ unsigned long pgscanned;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
+ BUG_ON(PageLRU(page));
+ SetPageLRU(page);
BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
{
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_to_scan;
atomic_inc(&zone->reclaim_in_progress);
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+ zone->nr_scan_active += (zone->nr_active >> priority) + 1;
nr_active = zone->nr_scan_active;
if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+ zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
nr_inactive = zone->nr_scan_inactive;
if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
while (nr_active || nr_inactive) {
if (nr_active) {
- sc->nr_to_scan = min(nr_active,
+ nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
+ nr_active -= nr_to_scan;
+ refill_inactive_zone(nr_to_scan, zone, sc);
}
if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
+ nr_to_scan = min(nr_inactive,
(unsigned long)sc->swap_cluster_max);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
+ nr_inactive -= nr_to_scan;
+ shrink_cache(nr_to_scan, zone, sc);
}
}
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+static void shrink_caches(int priority, struct zone **zones,
+ struct scan_control *sc)
{
int i;
if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
continue;
- zone->temp_priority = sc->priority;
- if (zone->prev_priority > sc->priority)
- zone->prev_priority = sc->priority;
+ zone->temp_priority = priority;
+ if (zone->prev_priority > priority)
+ zone->prev_priority = priority;
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- shrink_zone(zone, sc);
+ shrink_zone(priority, zone, sc);
}
}
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
- int total_scanned = 0, total_reclaimed = 0;
+ unsigned long total_scanned = 0;
+ unsigned long total_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
unsigned long lru_pages = 0;
int i;
-
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = !laptop_mode;
- sc.may_swap = 1;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ };
inc_page_state(allocstall);
sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
if (!priority)
disable_swap_token();
- shrink_caches(zones, &sc);
+ shrink_caches(priority, zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
sc.nr_reclaimed += reclaim_state->reclaimed_slab;
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+ if (total_scanned > sc.swap_cluster_max +
+ sc.swap_cluster_max / 2) {
wakeup_pdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+ int order)
{
- int to_free = nr_pages;
+ unsigned long to_free = nr_pages;
int all_zones_ok;
int priority;
int i;
- int total_scanned, total_reclaimed;
+ unsigned long total_scanned;
+ unsigned long total_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 1,
+ .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+ };
loop_again:
total_scanned = 0;
total_reclaimed = 0;
- sc.gfp_mask = GFP_KERNEL;
- sc.may_writepage = !laptop_mode;
- sc.may_swap = 1;
+ sc.may_writepage = !laptop_mode,
sc.nr_mapped = read_page_state(nr_mapped);
inc_page_state(pageoutrun);
zone->prev_priority = priority;
sc.nr_scanned = 0;
sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- shrink_zone(zone, &sc);
+ shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
* Try to free `nr_pages' of memory, system-wide. Returns the number of freed
* pages.
*/
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
{
pg_data_t *pgdat;
- int nr_to_free = nr_pages;
- int ret = 0;
+ unsigned long nr_to_free = nr_pages;
+ unsigned long ret = 0;
struct reclaim_state reclaim_state = {
.reclaimed_slab = 0,
};
current->reclaim_state = &reclaim_state;
for_each_pgdat(pgdat) {
- int freed;
+ unsigned long freed;
+
freed = balance_pgdat(pgdat, nr_to_free, 0);
ret += freed;
nr_to_free -= freed;
- if (nr_to_free <= 0)
+ if ((long)nr_to_free <= 0)
break;
}
current->reclaim_state = NULL;
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action, void *hcpu)
{
pg_data_t *pgdat;
cpumask_t mask;
static int __init kswapd_init(void)
{
pg_data_t *pgdat;
+
swap_setup();
- for_each_pgdat(pgdat)
- pgdat->kswapd
- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+ for_each_pgdat(pgdat) {
+ pid_t pid;
+
+ pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+ BUG_ON(pid < 0);
+ pgdat->kswapd = find_task_by_pid(pid);
+ }
total_memory = nr_free_pagecache_pages();
hotcpu_notifier(cpu_callback, 0);
return 0;
/*
* Try to free up some pages from this zone through reclaim.
*/
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- int nr_pages;
+ const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
struct reclaim_state reclaim_state;
- struct scan_control sc;
- cpumask_t mask;
- int node_id;
-
- if (time_before(jiffies,
- zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
- return 0;
-
- if (!(gfp_mask & __GFP_WAIT) ||
- zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0)
- return 0;
-
- node_id = zone->zone_pgdat->node_id;
- mask = node_to_cpumask(node_id);
- if (!cpus_empty(mask) && node_id != numa_node_id())
- return 0;
-
- sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
- sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = ZONE_RECLAIM_PRIORITY + 1;
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.gfp_mask = gfp_mask;
+ int priority;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .nr_mapped = read_page_state(nr_mapped),
+ .swap_cluster_max = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ };
disable_swap_token();
-
- nr_pages = 1 << order;
- if (nr_pages > SWAP_CLUSTER_MAX)
- sc.swap_cluster_max = nr_pages;
- else
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
cond_resched();
/*
* We need to be able to allocate from the reserves for RECLAIM_SWAP
* Free memory by calling shrink zone with increasing priorities
* until we have enough memory freed.
*/
+ priority = ZONE_RECLAIM_PRIORITY;
do {
- sc.priority--;
- shrink_zone(zone, &sc);
-
- } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+ shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
/*
return sc.nr_reclaimed >= nr_pages;
}
-#endif
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ cpumask_t mask;
+ int node_id;
+
+ /*
+ * Do not reclaim if there was a recent unsuccessful attempt at zone
+ * reclaim. In that case we let allocations go off node for the
+ * zone_reclaim_interval. Otherwise we would scan for each off-node
+ * page allocation.
+ */
+ if (time_before(jiffies,
+ zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+ return 0;
+
+ /*
+ * Avoid concurrent zone reclaims, do not reclaim in a zone that does
+ * not have reclaimable pages and if we should not delay the allocation
+ * then do not scan.
+ */
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (current->flags & PF_MEMALLOC))
+ return 0;
+
+ /*
+ * Only run zone reclaim on the local zone or on zones that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off node memory allocations
+ * as wide as possible.
+ */
+ node_id = zone->zone_pgdat->node_id;
+ mask = node_to_cpumask(node_id);
+ if (!cpus_empty(mask) && node_id != numa_node_id())
+ return 0;
+ return __zone_reclaim(zone, gfp_mask, order);
+}
+#endif