X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=mm%2Fvmscan.c;h=b0af7593d01e315a83c79ec6841c9a4a3b91c1e1;hb=7bb3b137abf2b7073e683c14cfe062d811d35247;hp=bf903b2d198f0820a2d03041b06de25af7a4d1d7;hpb=05814450070f13b671fc9dbf89477677aa0258cb;p=linux-2.6-omap-h63xx.git diff --git a/mm/vmscan.c b/mm/vmscan.c index bf903b2d198..b0af7593d01 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,6 +71,9 @@ struct scan_control { int may_writepage; + /* Can pages be swapped as part of reclaim? */ + int may_swap; + /* This context's SWAP_CLUSTER_MAX. If freeing memory for * suspend, we effectively ignore SWAP_CLUSTER_MAX. * In this context, it doesn't matter that we scan the @@ -440,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) BUG_ON(PageActive(page)); sc->nr_scanned++; + + if (!sc->may_swap && page_mapped(page)) + goto keep_locked; + /* Double the slab pressure for mapped and swapcache pages */ if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; @@ -458,6 +465,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Try to allocate it some swap space here. */ if (PageAnon(page) && !PageSwapCache(page)) { + if (!sc->may_swap) + goto keep_locked; if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; } @@ -472,7 +481,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page)) { + /* + * No unmapping if we do not swap + */ + if (!sc->may_swap) + goto keep_locked; + + switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -487,7 +502,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) goto keep_locked; if (!may_enter_fs) goto keep_locked; - if (laptop_mode && !sc->may_writepage) + if (!sc->may_writepage) goto keep_locked; /* Page is dirty, try to write it out here */ @@ -586,7 +601,7 @@ static inline void move_to_lru(struct page *page) } /* - * Add isolated pages on the list back to the LRU + * Add isolated pages on the list back to the LRU. * * returns the number of pages put back. */ @@ -603,6 +618,15 @@ int putback_lru_pages(struct list_head *l) return count; } +/* + * Non migratable page + */ +int fail_migrate_page(struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); + /* * swapout a single page * page is locked upon entry, unlocked on exit @@ -612,7 +636,7 @@ static int swap_page(struct page *page) struct address_space *mapping = page_mapping(page); if (page_mapped(page) && mapping) - if (try_to_unmap(page) != SWAP_SUCCESS) + if (try_to_unmap(page, 1) != SWAP_SUCCESS) goto unlock_retry; if (PageDirty(page)) { @@ -648,6 +672,167 @@ unlock_retry: retry: return -EAGAIN; } +EXPORT_SYMBOL(swap_page); + +/* + * Page migration was first developed in the context of the memory hotplug + * project. The main authors of the migration code are: + * + * IWAMOTO Toshihiro + * Hirokazu Takahashi + * Dave Hansen + * Christoph Lameter + */ + +/* + * Remove references for a page and establish the new page with the correct + * basic settings to be able to stop accesses to the page. 
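+ *
+ * nr_refs is the number of references the caller expects to hold on the
+ * page beyond its mappings (migrate_page() below passes 2: the page
+ * cache reference plus the reference taken when the page was isolated
+ * from the LRU).  Returns 0 on success and 1 if the references could
+ * not be removed.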
+ */ +int migrate_page_remove_references(struct page *newpage, + struct page *page, int nr_refs) +{ + struct address_space *mapping = page_mapping(page); + struct page **radix_pointer; + + /* + * Avoid doing any of the following work if the page count + * indicates that the page is in use or truncate has removed + * the page. + */ + if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) + return 1; + + /* + * Establish swap ptes for anonymous pages or destroy pte + * maps for files. + * + * In order to reestablish file backed mappings the fault handlers + * will take the radix tree_lock which may then be used to stop + * processses from accessing this page until the new page is ready. + * + * A process accessing via a swap pte (an anonymous page) will take a + * page_lock on the old page which will block the process until the + * migration attempt is complete. At that time the PageSwapCache bit + * will be examined. If the page was migrated then the PageSwapCache + * bit will be clear and the operation to retrieve the page will be + * retried which will find the new page in the radix tree. Then a new + * direct mapping may be generated based on the radix tree contents. + * + * If the page was not migrated then the PageSwapCache bit + * is still set and the operation may continue. + */ + try_to_unmap(page, 1); + + /* + * Give up if we were unable to remove all mappings. + */ + if (page_mapcount(page)) + return 1; + + write_lock_irq(&mapping->tree_lock); + + radix_pointer = (struct page **)radix_tree_lookup_slot( + &mapping->page_tree, + page_index(page)); + + if (!page_mapping(page) || page_count(page) != nr_refs || + *radix_pointer != page) { + write_unlock_irq(&mapping->tree_lock); + return 1; + } + + /* + * Now we know that no one else is looking at the page. + * + * Certain minimal information about a page must be available + * in order for other subsystems to properly handle the page if they + * find it through the radix tree update before we are finished + * copying the page. + */ + get_page(newpage); + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapCache(page)) { + SetPageSwapCache(newpage); + set_page_private(newpage, page_private(page)); + } + + *radix_pointer = newpage; + __put_page(page); + write_unlock_irq(&mapping->tree_lock); + + return 0; +} +EXPORT_SYMBOL(migrate_page_remove_references); + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + copy_highpage(newpage, page); + + if (PageError(page)) + SetPageError(newpage); + if (PageReferenced(page)) + SetPageReferenced(newpage); + if (PageUptodate(page)) + SetPageUptodate(newpage); + if (PageActive(page)) + SetPageActive(newpage); + if (PageChecked(page)) + SetPageChecked(newpage); + if (PageMappedToDisk(page)) + SetPageMappedToDisk(newpage); + + if (PageDirty(page)) { + clear_page_dirty_for_io(page); + set_page_dirty(newpage); + } + + ClearPageSwapCache(page); + ClearPageActive(page); + ClearPagePrivate(page); + set_page_private(page, 0); + page->mapping = NULL; + + /* + * If any waiters have accumulated on the new page then + * wake them up. + */ + if (PageWriteback(newpage)) + end_page_writeback(newpage); +} +EXPORT_SYMBOL(migrate_page_copy); + +/* + * Common logic to directly migrate a single page suitable for + * pages that do not use PagePrivate. + * + * Pages are locked upon entry and exit. 
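+ *
+ * Returns 0 on success and -EAGAIN if the references on the page could
+ * not be removed; migrate_pages() treats -EAGAIN as retryable on a
+ * later pass.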
+ */ +int migrate_page(struct page *newpage, struct page *page) +{ + BUG_ON(PageWriteback(page)); /* Writeback must be complete */ + + if (migrate_page_remove_references(newpage, page, 2)) + return -EAGAIN; + + migrate_page_copy(newpage, page); + + /* + * Remove auxiliary swap entries and replace + * them with real ptes. + * + * Note that a real pte entry will allow processes that are not + * waiting on the page lock to use the new page via the page tables + * before the new page is unlocked. + */ + remove_from_swap(newpage); + return 0; +} +EXPORT_SYMBOL(migrate_page); + /* * migrate_pages * @@ -658,14 +843,9 @@ retry: * pages are swapped out. * * The function returns after 10 attempts or if no pages - * are movable anymore because t has become empty + * are movable anymore because to has become empty * or no retryable pages exist anymore. * - * SIMPLIFIED VERSION: This implementation of migrate_pages - * is only swapping out pages and never touches the second - * list. The direct migration patchset - * extends this function to avoid the use of swap. - * * Return: Number of pages not migrated when "to" ran empty. */ int migrate_pages(struct list_head *from, struct list_head *to, @@ -686,6 +866,9 @@ redo: retry = 0; list_for_each_entry_safe(page, page2, from, lru) { + struct page *newpage = NULL; + struct address_space *mapping; + cond_resched(); rc = 0; @@ -693,6 +876,9 @@ redo: /* page was freed from under us. So we are done. */ goto next; + if (to && list_empty(to)) + break; + /* * Skip locked pages during the first two passes to give the * functions holding the lock time to release the page. Later we @@ -729,12 +915,84 @@ redo: } } + if (!to) { + rc = swap_page(page); + goto next; + } + + newpage = lru_to_page(to); + lock_page(newpage); + /* - * Page is properly locked and writeback is complete. + * Pages are properly locked and writeback is complete. * Try to migrate the page. */ - rc = swap_page(page); - goto next; + mapping = page_mapping(page); + if (!mapping) + goto unlock_both; + + if (mapping->a_ops->migratepage) { + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(newpage, page); + goto unlock_both; + } + + /* + * Default handling if a filesystem does not provide + * a migration function. We can only migrate clean + * pages so try to write out any dirty pages first. + */ + if (PageDirty(page)) { + switch (pageout(page, mapping)) { + case PAGE_KEEP: + case PAGE_ACTIVATE: + goto unlock_both; + + case PAGE_SUCCESS: + unlock_page(newpage); + goto next; + + case PAGE_CLEAN: + ; /* try to migrate the page below */ + } + } + + /* + * Buffers are managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (!page_has_buffers(page) || + try_to_release_page(page, GFP_KERNEL)) { + rc = migrate_page(newpage, page); + goto unlock_both; + } + + /* + * On early passes with mapped pages simply + * retry. There may be a lock held for some + * buffers that may go away. Later + * swap them out. + */ + if (pass > 4) { + /* + * Persistently unable to drop buffers..... As a + * measure of last resort we fall back to + * swap_page(). 
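+ * The new page is unlocked and forgotten (newpage = NULL below)
+ * because swap_page() writes the old page out to its backing
+ * store instead of copying it into the new page.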
+ */ + unlock_page(newpage); + newpage = NULL; + rc = swap_page(page); + goto next; + } + +unlock_both: + unlock_page(newpage); unlock_page: unlock_page(page); @@ -747,7 +1005,10 @@ next: list_move(&page->lru, failed); nr_failed++; } else { - /* Success */ + if (newpage) { + /* Successful migration. Return page to LRU */ + move_to_lru(newpage); + } list_move(&page->lru, moved); } } @@ -760,46 +1021,33 @@ next: return nr_failed + retry; } -static void lru_add_drain_per_cpu(void *dummy) -{ - lru_add_drain(); -} - /* * Isolate one page from the LRU lists and put it on the - * indicated list. Do necessary cache draining if the - * page is not on the LRU lists yet. + * indicated list with elevated refcount. * * Result: * 0 = page not on LRU list * 1 = page removed from LRU list and added to the specified list. - * -ENOENT = page is being freed elsewhere. */ int isolate_lru_page(struct page *page) { - int rc = 0; - struct zone *zone = page_zone(page); + int ret = 0; -redo: - spin_lock_irq(&zone->lru_lock); - rc = __isolate_lru_page(page); - if (rc == 1) { - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - if (rc == 0) { - /* - * Maybe this page is still waiting for a cpu to drain it - * from one of the lru lists? - */ - rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); - if (rc == 0 && PageLRU(page)) - goto redo; + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (TestClearPageLRU(page)) { + ret = 1; + get_page(page); + if (PageActive(page)) + del_page_from_active_list(zone, page); + else + del_page_from_inactive_list(zone, page); + } + spin_unlock_irq(&zone->lru_lock); } - return rc; + + return ret; } #endif @@ -831,18 +1079,20 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - switch (__isolate_lru_page(page)) { - case 1: - /* Succeeded to isolate page */ - list_move(&page->lru, dst); - nr_taken++; - break; - case -ENOENT: - /* Not possible to isolate */ - list_move(&page->lru, src); - break; - default: + if (!TestClearPageLRU(page)) BUG(); + list_del(&page->lru); + if (get_page_testone(page)) { + /* + * It is being freed elsewhere + */ + __put_page(page); + SetPageLRU(page); + list_add(&page->lru, src); + continue; + } else { + list_add(&page->lru, dst); + nr_taken++; } } @@ -945,9 +1195,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct page *page; struct pagevec pvec; int reclaim_mapped = 0; - long mapped_ratio; - long distress; - long swap_tendency; + + if (unlikely(sc->may_swap)) { + long mapped_ratio; + long distress; + long swap_tendency; + + /* + * `distress' is a measure of how much trouble we're having + * reclaiming pages. 0 -> no problems. 100 -> great trouble. + */ + distress = 100 >> zone->prev_priority; + + /* + * The point of this algorithm is to decide when to start + * reclaiming mapped memory instead of just pagecache. Work out + * how much memory + * is mapped. + */ + mapped_ratio = (sc->nr_mapped * 100) / total_memory; + + /* + * Now decide how much we really want to unmap some pages. The + * mapped ratio is downgraded - just because there's a lot of + * mapped memory doesn't necessarily mean that page reclaim + * isn't succeeding. + * + * The distress ratio is important - we don't want to start + * going oom. + * + * A 100% value of vm_swappiness overrides this algorithm + * altogether. 
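+ *
+ * Worked example with made-up numbers: at the default vm_swappiness
+ * of 60, with 40% of memory mapped and no reclaim trouble
+ * (prev_priority = 12, so distress = 100 >> 12 = 0), swap_tendency
+ * is 40/2 + 0 + 60 = 80 < 100 and mapped pages are left alone.
+ * Under heavy pressure (prev_priority = 1, distress = 50) the same
+ * load gives 20 + 50 + 60 = 130 >= 100, so reclaim_mapped is set.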
+ */ + swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + + /* + * Now use this metric to decide whether to start moving mapped + * memory onto the inactive list. + */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + } lru_add_drain(); spin_lock_irq(&zone->lru_lock); @@ -957,37 +1245,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) zone->nr_active -= pgmoved; spin_unlock_irq(&zone->lru_lock); - /* - * `distress' is a measure of how much trouble we're having reclaiming - * pages. 0 -> no problems. 100 -> great trouble. - */ - distress = 100 >> zone->prev_priority; - - /* - * The point of this algorithm is to decide when to start reclaiming - * mapped memory instead of just pagecache. Work out how much memory - * is mapped. - */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; - - /* - * Now decide how much we really want to unmap some pages. The mapped - * ratio is downgraded - just because there's a lot of mapped memory - * doesn't necessarily mean that page reclaim isn't succeeding. - * - * The distress ratio is important - we don't want to start going oom. - * - * A 100% value of vm_swappiness overrides this algorithm altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; - - /* - * Now use this metric to decide whether to start moving mapped memory - * onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1176,7 +1433,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) int i; sc.gfp_mask = gfp_mask; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; + sc.may_swap = 1; inc_page_state(allocstall); @@ -1278,7 +1536,8 @@ loop_again: total_scanned = 0; total_reclaimed = 0; sc.gfp_mask = GFP_KERNEL; - sc.may_writepage = 0; + sc.may_writepage = !laptop_mode; + sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1362,9 +1621,7 @@ scan: sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; - atomic_inc(&zone->reclaim_in_progress); shrink_zone(zone, &sc); - atomic_dec(&zone->reclaim_in_progress); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); @@ -1576,3 +1833,119 @@ static int __init kswapd_init(void) } module_init(kswapd_init) + +#ifdef CONFIG_NUMA +/* + * Zone reclaim mode + * + * If non-zero call zone_reclaim when the number of free pages falls below + * the watermarks. + * + * In the future we may add flags to the mode. However, the page allocator + * should only have to check that zone_reclaim_mode != 0 before calling + * zone_reclaim(). + */ +int zone_reclaim_mode __read_mostly; + +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ + +/* + * Mininum time between zone reclaim scans + */ +int zone_reclaim_interval __read_mostly = 30*HZ; + +/* + * Priority for ZONE_RECLAIM. This determines the fraction of pages + * of a node considered for each zone_reclaim. 4 scans 1/16th of + * a zone. + */ +#define ZONE_RECLAIM_PRIORITY 4 + +/* + * Try to free up some pages from this zone through reclaim. 
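+ *
+ * Returns non-zero when at least 1 << order pages were reclaimed.
+ * The attempt is skipped when the previous unsuccessful reclaim of
+ * this zone was less than zone_reclaim_interval ago, when the
+ * allocation cannot wait (__GFP_WAIT clear), when the zone is
+ * unreclaimable or already being reclaimed, or when the zone sits
+ * on a remote node that has CPUs of its own to do the work.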
+ */ +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + int nr_pages; + struct task_struct *p = current; + struct reclaim_state reclaim_state; + struct scan_control sc; + cpumask_t mask; + int node_id; + + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0) + return 0; + + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); + sc.nr_scanned = 0; + sc.nr_reclaimed = 0; + sc.priority = ZONE_RECLAIM_PRIORITY + 1; + sc.nr_mapped = read_page_state(nr_mapped); + sc.gfp_mask = gfp_mask; + + disable_swap_token(); + + nr_pages = 1 << order; + if (nr_pages > SWAP_CLUSTER_MAX) + sc.swap_cluster_max = nr_pages; + else + sc.swap_cluster_max = SWAP_CLUSTER_MAX; + + cond_resched(); + /* + * We need to be able to allocate from the reserves for RECLAIM_SWAP + * and we also need to be able to write out pages for RECLAIM_WRITE + * and RECLAIM_SWAP. + */ + p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + /* + * Free memory by calling shrink zone with increasing priorities + * until we have enough memory freed. + */ + do { + sc.priority--; + shrink_zone(zone, &sc); + + } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); + + if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + /* + * shrink_slab does not currently allow us to determine + * how many pages were freed in the zone. So we just + * shake the slab and then go offnode for a single allocation. + * + * shrink_slab will free memory on all zones and may take + * a long time. + */ + shrink_slab(sc.nr_scanned, gfp_mask, order); + } + + p->reclaim_state = NULL; + current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + + if (sc.nr_reclaimed == 0) + zone->last_unsuccessful_zone_reclaim = jiffies; + + return sc.nr_reclaimed >= nr_pages; +} +#endif +
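
Below is a minimal, illustrative sketch of how a filesystem could opt in to the migration entry point consulted above (mapping->a_ops->migratepage). Only the .migratepage member and the exported migrate_page()/fail_migrate_page() helpers come from this patch; example_aops, example_readpage and the choice of headers are assumptions.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/swap.h>		/* assumed declaration site of migrate_page() */

/* Hypothetical readpage: pages of this filesystem are simply zero-filled. */
static int example_readpage(struct file *file, struct page *page)
{
	clear_highpage(page);
	SetPageUptodate(page);
	unlock_page(page);
	return 0;
}

static struct address_space_operations example_aops = {
	.readpage	= example_readpage,
	/*
	 * These pages carry no PagePrivate state, so the generic helper
	 * exported above can move them.  A filesystem whose pages must
	 * never move could point this at fail_migrate_page instead, so
	 * every migration attempt fails with -EIO.
	 */
	.migratepage	= migrate_page,
};

A filesystem would install example_aops as inode->i_mapping->a_ops when setting up its inodes. With no migratepage hook at all, migrate_pages() above falls back to writing dirty pages out and only calls migrate_page() directly for pages without buffers.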
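A rough sketch of the swap-only calling convention that migrate_pages() still supports when no target list is supplied (to == NULL, so each page is pushed out through swap_page()). isolate_lru_page(), migrate_pages() and putback_lru_pages() are the interfaces from this file; example_evict_page, the error codes and the list handling on the caller's side are assumptions.

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/swap.h>		/* assumed declaration site of the helpers */

static int example_evict_page(struct page *page)
{
	LIST_HEAD(pagelist);
	LIST_HEAD(moved);
	LIST_HEAD(failed);
	int nr_failed;

	/* Take the page off the LRU with an elevated refcount. */
	if (!isolate_lru_page(page))
		return -EBUSY;		/* not on the LRU */
	list_add_tail(&page->lru, &pagelist);

	/* to == NULL: no target pages, so migrate_pages() swaps them out. */
	nr_failed = migrate_pages(&pagelist, NULL, &moved, &failed);

	/* Hand whatever we still hold back to the LRU. */
	putback_lru_pages(&moved);
	putback_lru_pages(&failed);
	putback_lru_pages(&pagelist);	/* pages still retryable after 10 passes */

	return nr_failed ? -EAGAIN : 0;
}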