x86_64: mcelog tolerant level cleanup

[linux-2.6-omap-h63xx.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 56651a10c36645a7f58c87ac9b97599b72255b61..d419e10e3daa2dea26c6da35fb8e418b27629c6f 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,17 +66,8 @@ struct scan_control {
         int swappiness;
  
         int all_unreclaimable;
-};
  
-/*
- * The list of shrinker callbacks used by to apply pressure to
- * ageable caches.
- */
-struct shrinker {
-       shrinker_t              shrinker;
-       struct list_head        list;
-       int                     seeks;  /* seeks to recreate an obj */
-       long                    nr;     /* objs pending delete */
+       int order;
  };
  
  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -121,34 +112,25 @@ static DECLARE_RWSEM(shrinker_rwsem);
  /*
   * Add a shrinker callback to be called from the vm
   */
-struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
+void register_shrinker(struct shrinker *shrinker)
  {
-        struct shrinker *shrinker;
-
-        shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
-        if (shrinker) {
-               shrinker->shrinker = theshrinker;
-               shrinker->seeks = seeks;
-               shrinker->nr = 0;
-               down_write(&shrinker_rwsem);
-               list_add_tail(&shrinker->list, &shrinker_list);
-               up_write(&shrinker_rwsem);
-       }
-       return shrinker;
+       shrinker->nr = 0;
+       down_write(&shrinker_rwsem);
+       list_add_tail(&shrinker->list, &shrinker_list);
+       up_write(&shrinker_rwsem);
  }
-EXPORT_SYMBOL(set_shrinker);
+EXPORT_SYMBOL(register_shrinker);
  
  /*
   * Remove one
   */
-void remove_shrinker(struct shrinker *shrinker)
+void unregister_shrinker(struct shrinker *shrinker)
  {
         down_write(&shrinker_rwsem);
         list_del(&shrinker->list);
         up_write(&shrinker_rwsem);
-       kfree(shrinker);
  }
-EXPORT_SYMBOL(remove_shrinker);
+EXPORT_SYMBOL(unregister_shrinker);
  
  #define SHRINK_BATCH 128
  /*
@@ -185,7 +167,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
         list_for_each_entry(shrinker, &shrinker_list, list) {
                 unsigned long long delta;
                 unsigned long total_scan;
-               unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
+               unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
  
                 delta = (4 * scanned) / shrinker->seeks;
                 delta *= max_pass;
@@ -213,8 +195,8 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                         int shrink_ret;
                         int nr_before;
  
-                       nr_before = (*shrinker->shrinker)(0, gfp_mask);
-                       shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
+                       nr_before = (*shrinker->shrink)(0, gfp_mask);
+                       shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
                         if (shrink_ret == -1)
                                 break;
                         if (shrink_ret < nr_before)
@@ -284,12 +266,8 @@ static void handle_write_error(struct address_space *mapping,
                                 struct page *page, int error)
  {
         lock_page(page);
-       if (page_mapping(page) == mapping) {
-               if (error == -ENOSPC)
-                       set_bit(AS_ENOSPC, &mapping->flags);
-               else
-                       set_bit(AS_EIO, &mapping->flags);
-       }
+       if (page_mapping(page) == mapping)
+               mapping_set_error(mapping, error);
         unlock_page(page);
  }
  
@@ -485,7 +463,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  
                 referenced = page_referenced(page, 1);
                 /* In active use or really unfreeable?  Activate it. */
-               if (referenced && page_mapping_inuse(page))
+               if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+                                       referenced && page_mapping_inuse(page))
                         goto activate_locked;
  
  #ifdef CONFIG_SWAP
@@ -518,7 +497,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
  
                 if (PageDirty(page)) {
-                       if (referenced)
+                       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
                                 goto keep_locked;
                         if (!may_enter_fs)
                                 goto keep_locked;
@@ -602,6 +581,51 @@ keep:
         return nr_reclaimed;
  }
  
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0     /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1       /* Isolate active pages. */
+#define ISOLATE_BOTH 2         /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU.  Only take this page
+ * if it is of the appropriate PageActive status.  Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page:       page to consider
+ * mode:       one of the LRU isolation modes defined above
+ * returns 0 on success, -EINVAL if the page is not on the LRU or is of the
+ * wrong PageActive status for mode, -EBUSY if get_page_unless_zero() fails.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+       int ret = -EINVAL;
+
+       /* Only take pages on the LRU. */
+       if (!PageLRU(page))
+               return ret;
+
+       /*
+        * When checking the active state, we need to be sure we are
+        * dealing with comparable boolean values.  Take the logical not
+        * of each.
+        */
+       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+               return ret;
+
+       ret = -EBUSY;
+       if (likely(get_page_unless_zero(page))) {
+               /*
+                * Be careful not to clear PageLRU until after we're
+                * sure the page is not being freed elsewhere -- the
+                * page release code relies on it.
+                */
+               ClearPageLRU(page);
+               ret = 0;
+       }
+
+       return ret;
+}
+
  /*
   * zone->lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
@@ -616,44 +640,114 @@ keep:
   * @src:       The LRU list to pull pages off.
   * @dst:       The temp list to put pages on to.
   * @scanned:   The number of pages that were scanned.
+ * @order:     The caller's attempted allocation order
+ * @mode:      One of the LRU isolation modes
   *
   * returns how many pages were moved onto *@dst.
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 struct list_head *src, struct list_head *dst,
-               unsigned long *scanned)
+               unsigned long *scanned, int order, int mode)
  {
         unsigned long nr_taken = 0;
-       struct page *page;
         unsigned long scan;
  
         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-               struct list_head *target;
+               struct page *page;
+               unsigned long pfn;
+               unsigned long end_pfn;
+               unsigned long page_pfn;
+               int zone_id;
+
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
  
                 VM_BUG_ON(!PageLRU(page));
  
-               list_del(&page->lru);
-               target = src;
-               if (likely(get_page_unless_zero(page))) {
-                       /*
-                        * Be careful not to clear PageLRU until after we're
-                        * sure the page is not being freed elsewhere -- the
-                        * page release code relies on it.
-                        */
-                       ClearPageLRU(page);
-                       target = dst;
+               switch (__isolate_lru_page(page, mode)) {
+               case 0:
+                       list_move(&page->lru, dst);
                         nr_taken++;
-               } /* else it is being freed elsewhere */
+                       break;
+
+               case -EBUSY:
+                       /* else it is being freed elsewhere */
+                       list_move(&page->lru, src);
+                       continue;
+
+               default:
+                       BUG();
+               }
+
+               if (!order)
+                       continue;
  
-               list_add(&page->lru, target);
+               /*
+                * Attempt to take all pages in the order aligned region
+                * surrounding the tag page.  Only take those pages of
+                * the same active state as that tag page.  We may safely
+                * round the target page pfn down to the requested order
+                * as the mem_map is guaranteed valid out to MAX_ORDER,
+                * where that page is in a different zone we will detect
+                * it from its zone id and skip over that page.
+                */
+               zone_id = page_zone_id(page);
+               page_pfn = page_to_pfn(page);
+               pfn = page_pfn & ~((1 << order) - 1);
+               end_pfn = pfn + (1 << order);
+               for (; pfn < end_pfn; pfn++) {
+                       struct page *cursor_page;
+
+                       /* The target page is in the block, ignore it. */
+                       if (unlikely(pfn == page_pfn))
+                               continue;
+
+                       /* Avoid holes within the zone. */
+                       if (unlikely(!pfn_valid_within(pfn)))
+                               break;
+
+                       cursor_page = pfn_to_page(pfn);
+                       /* Check that we have not crossed a zone boundary. */
+                       if (unlikely(page_zone_id(cursor_page) != zone_id))
+                               continue;
+                       switch (__isolate_lru_page(cursor_page, mode)) {
+                       case 0:
+                               list_move(&cursor_page->lru, dst);
+                               nr_taken++;
+                               scan++;
+                               break;
+
+                       case -EBUSY:
+                               /* being freed elsewhere; fall through to break */
+                               list_move(&cursor_page->lru, src);
+                       default:
+                               break;
+                       }
+               }
         }
  
         *scanned = scan;
         return nr_taken;
  }
  
+/*
+ * clear_active_flags() is a helper used by shrink_inactive_list(), clearing
+ * any active bits from the pages in the list; returns how many it cleared.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+       int nr_active = 0;
+       struct page *page;
+
+       list_for_each_entry(page, page_list, lru)
+               if (PageActive(page)) {
+                       ClearPageActive(page);
+                       nr_active++;
+               }
+
+       return nr_active;
+}
+
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@ -675,11 +769,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 unsigned long nr_taken;
                 unsigned long nr_scan;
                 unsigned long nr_freed;
+               unsigned long nr_active;
  
                 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                                            &zone->inactive_list,
-                                            &page_list, &nr_scan);
-               __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+                            &zone->inactive_list,
+                            &page_list, &nr_scan, sc->order,
+                            (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+               nr_active = clear_active_flags(&page_list);
+
+               __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+               __mod_zone_page_state(zone, NR_INACTIVE,
+                                               -(nr_taken - nr_active));
                 zone->pages_scanned += nr_scan;
                 spin_unlock_irq(&zone->lru_lock);
  
@@ -824,7 +925,7 @@ force_reclaim_mapped:
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
         pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                                   &l_hold, &pgscanned);
+                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
         zone->pages_scanned += pgscanned;
         __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
@@ -1015,7 +1116,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
   * holds filesystem locks which prevent writeout this might not work, and the
   * allocation attempt will fail.
   */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
  {
         int priority;
         int ret = 0;
@@ -1030,6 +1131,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .may_swap = 1,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
  
         count_vm_event(ALLOCSTALL);
@@ -1135,6 +1237,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                 .may_swap = 1,
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
         /*
          * temp_priority is used to remember the scanning priority at which
@@ -1318,6 +1421,7 @@ static int kswapd(void *p)
          * trying to free the first piece of memory in the first place).
          */
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+       set_freezable();
  
         order = 0;
         for ( ; ; ) {
@@ -1532,7 +1636,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
         pg_data_t *pgdat;
         cpumask_t mask;
  
-       if (action == CPU_ONLINE) {
+       if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
                 for_each_online_pgdat(pgdat) {
                         mask = node_to_cpumask(pgdat->node_id);
                         if (any_online_cpu(mask) != NR_CPUS)