extern void page_assign_page_cgroup(struct page *page,
                                        struct page_cgroup *pc);
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
+extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
+extern void mem_cgroup_uncharge(struct page_cgroup *pc);
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+       mem_cgroup_uncharge(page_get_page_cgroup(page));
+}
 
 #else /* CONFIG_CGROUP_MEM_CONT */
 static inline void mm_init_cgroup(struct mm_struct *mm,
        return NULL;
 }
 
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+       return 0;
+}
+
+static inline void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+}
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
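The header is arranged so that call sites never need an #ifdef: with CONFIG_CGROUP_MEM_CONT enabled the real mem_cgroup_charge()/mem_cgroup_uncharge_page() are declared, and with it disabled the static inline stubs compile away to nothing. A minimal standalone sketch of that idiom, using a hypothetical FEATURE_TRACK macro and helper names rather than the kernel API:

/* sketch.c - builds with or without -DFEATURE_TRACK; the caller is unchanged */
#include <stdio.h>

#ifdef FEATURE_TRACK
static long tracked;                        /* hypothetical accounting state */

static inline int feature_charge(long n)    /* stands in for mem_cgroup_charge() */
{
	tracked += n;
	return 0;
}

static inline void feature_uncharge(long n) /* stands in for mem_cgroup_uncharge_page() */
{
	tracked -= n;
}
#else
/* feature disabled: the stubs compile away and callers stay #ifdef-free */
static inline int feature_charge(long n)    { (void)n; return 0; }
static inline void feature_uncharge(long n) { (void)n; }
#endif

int main(void)
{
	if (feature_charge(1))
		return 1;
	puts("charged one unit");
	feature_uncharge(1);
	return 0;
}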
 
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
 {
        struct address_space *mapping = page->mapping;
 
+       mem_cgroup_uncharge_page(page);
        radix_tree_delete(&mapping->page_tree, page->index);
        page->mapping = NULL;
        mapping->nrpages--;
        int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
        if (error == 0) {
+
+               error = mem_cgroup_charge(page, current->mm);
+               if (error)
+                       goto out;
+
                write_lock_irq(&mapping->tree_lock);
                error = radix_tree_insert(&mapping->page_tree, offset, page);
                if (!error) {
                        page->index = offset;
                        mapping->nrpages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
-               }
+               } else
+                       mem_cgroup_uncharge_page(page);
+
                write_unlock_irq(&mapping->tree_lock);
                radix_tree_preload_end();
        }
+out:
        return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
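The ordering in add_to_page_cache() is the important part: the charge is taken before tree_lock, a failed charge bails out before the radix tree is touched, and a failed radix_tree_insert() hands the charge back. A small userspace sketch of that reserve/commit/rollback shape, with hypothetical helpers rather than the kernel functions:

/*
 * Sketch of "charge first, insert second, roll back on failure".
 * budget plays the role of the res_counter, store the radix tree slot.
 */
#include <errno.h>
#include <stdio.h>

static long budget = 2;
static int store;

static int charge(void)                 /* like mem_cgroup_charge() */
{
	if (budget <= 0)
		return -ENOMEM;
	budget--;
	return 0;
}

static void uncharge(void)              /* like mem_cgroup_uncharge_page() */
{
	budget++;
}

static int insert(int value)            /* like radix_tree_insert() */
{
	if (store)
		return -EEXIST;
	store = value;
	return 0;
}

static int add_to_cache(int value)
{
	int error = charge();               /* reserve first ...            */

	if (error)
		goto out;
	error = insert(value);              /* ... then try to commit ...   */
	if (error)
		uncharge();                 /* ... and undo on failure      */
out:
	return error;
}

int main(void)
{
	printf("first add:   %d\n", add_to_cache(42));  /* 0              */
	printf("second add:  %d\n", add_to_cache(43));  /* -EEXIST        */
	printf("budget left: %ld\n", budget);           /* back to 1      */
	return 0;
}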
 
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; that is
+ * a feature to be implemented much later.
  */
 struct mem_cgroup {
        struct cgroup_subsys_state css;
        struct list_head inactive_list;
 };
 
+/*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two-byte
+ * aligned (based on comments from Nick Piggin).
+ */
+#define PAGE_CGROUP_LOCK_BIT   0x0
+#define PAGE_CGROUP_LOCK               (1 << PAGE_CGROUP_LOCK_BIT)
+
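As the comment says, bit 0 of page->page_cgroup doubles as a spinlock, which only works because a page_cgroup is at least two-byte aligned and so never needs that bit for the pointer itself. A small userspace sketch of the trick, with GCC/Clang atomic builtins standing in for bit_spin_lock()/bit_spin_unlock() and hypothetical names throughout:

/*
 * One word stores both a pointer and a lock bit.  This is a sketch of
 * the idea, not kernel code.
 */
#include <assert.h>
#include <stdio.h>

#define LOCK_BIT 0x1UL

static unsigned long slot;              /* plays the role of page->page_cgroup */

static void slot_lock(void)
{
	/* spin until we are the one that flips the bit from 0 to 1 */
	while (__atomic_fetch_or(&slot, LOCK_BIT, __ATOMIC_ACQUIRE) & LOCK_BIT)
		;
}

static void slot_unlock(void)
{
	__atomic_fetch_and(&slot, ~LOCK_BIT, __ATOMIC_RELEASE);
}

static void slot_assign(void *ptr)      /* caller must hold the lock */
{
	unsigned long locked = slot & LOCK_BIT;

	assert(((unsigned long)ptr & LOCK_BIT) == 0);   /* alignment requirement */
	slot = (unsigned long)ptr | locked;             /* keep the lock bit */
}

static void *slot_get(void)
{
	return (void *)(slot & ~LOCK_BIT);
}

int main(void)
{
	static int object;              /* stand-in for a page_cgroup */

	slot_lock();
	slot_assign(&object);
	printf("stored %p, read back %p\n", (void *)&object, slot_get());
	slot_unlock();
	return 0;
}

page_assign_page_cgroup() below does the same dance as slot_assign(): it preserves the lock bit while swapping the pointer bits, and page_get_page_cgroup() masks the bit off before handing the pointer out.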
 /*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;
        struct mem_cgroup *mem_cgroup;
+       atomic_t ref_cnt;               /* Helpful when pages move between */
+                                       /* mapped and cached states        */
 };
 
 
        css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+       return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+                                       &page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-       page->page_cgroup = (unsigned long)pc;
+       int locked;
+
+       /*
+        * While resetting the page_cgroup we might not hold the
+        * page_cgroup lock. free_hot_cold_page() is an example
+        * of such a scenario
+        */
+       if (pc)
+               VM_BUG_ON(!page_cgroup_locked(page));
+       locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+       page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-       return page->page_cgroup;
+       return (struct page_cgroup *)
+               (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+       bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+       VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+       bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Returns 0 if the charge was successful,
+ * < 0 if the cgroup is over its limit.
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+       struct mem_cgroup *mem;
+       struct page_cgroup *pc, *race_pc;
+
+       /*
+        * Should page_cgroups go to their own slab?
+        * One could optimize the performance of the charging routine
+        * by saving a bit in the page_flags and using it as a lock
+        * to see if the cgroup page already has a page_cgroup associated
+        * with it
+        */
+       lock_page_cgroup(page);
+       pc = page_get_page_cgroup(page);
+       /*
+        * The page_cgroup exists and the page has already been accounted
+        */
+       if (pc) {
+               atomic_inc(&pc->ref_cnt);
+               goto done;
+       }
+
+       unlock_page_cgroup(page);
+
+       pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+       if (pc == NULL)
+               goto err;
+
+       rcu_read_lock();
+       /*
+        * We always charge the cgroup the mm_struct belongs to;
+        * the mm_struct's mem_cgroup changes on task migration if the
+        * thread group leader migrates. It's possible that mm is not
+        * set; if so, charge the init_mm (this happens for pagecache usage).
+        */
+       if (!mm)
+               mm = &init_mm;
+
+       mem = rcu_dereference(mm->mem_cgroup);
+       /*
+        * For every charge from the cgroup, increment the reference
+        * count.
+        */
+       css_get(&mem->css);
+       rcu_read_unlock();
+
+       /*
+        * If we created the page_cgroup, we should free it on exceeding
+        * the cgroup limit.
+        */
+       if (res_counter_charge(&mem->res, 1)) {
+               css_put(&mem->css);
+               goto free_pc;
+       }
+
+       lock_page_cgroup(page);
+       /*
+        * Check if somebody else beat us to allocating the page_cgroup
+        */
+       race_pc = page_get_page_cgroup(page);
+       if (race_pc) {
+               kfree(pc);
+               pc = race_pc;
+               atomic_inc(&pc->ref_cnt);
+               res_counter_uncharge(&mem->res, 1);
+               css_put(&mem->css);
+               goto done;
+       }
+
+       atomic_set(&pc->ref_cnt, 1);
+       pc->mem_cgroup = mem;
+       pc->page = page;
+       page_assign_page_cgroup(page, pc);
+
+done:
+       unlock_page_cgroup(page);
+       return 0;
+free_pc:
+       kfree(pc);
+       return -ENOMEM;
+err:
+       unlock_page_cgroup(page);
+       return -ENOMEM;
+}
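Note the locking dance in mem_cgroup_charge(): the page_cgroup cannot be allocated while the bit spinlock is held, so the lock is dropped, the allocation and res_counter charge are done, and only then is the lock re-taken to check whether another task installed a page_cgroup in the meantime; the loser frees its copy and returns its charge. A sketch of that allocate-outside-the-lock, recheck-under-the-lock pattern, with hypothetical names and a plain pthread mutex in place of lock_page_cgroup():

/*
 * Two threads race to attach one tracker to the same slot; exactly one
 * allocation survives, the other is freed and its work undone.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct tracker {                        /* plays the role of struct page_cgroup */
	int refs;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct tracker *slot;            /* plays the role of page->page_cgroup */

static void *attach(void *unused)
{
	struct tracker *t, *race;

	(void)unused;

	t = calloc(1, sizeof(*t));      /* may sleep, so done outside the lock */
	if (!t)
		return NULL;

	pthread_mutex_lock(&lock);
	race = slot;
	if (race) {
		free(t);                /* somebody beat us to it */
		race->refs++;           /* share the winner's tracker */
	} else {
		t->refs = 1;
		slot = t;
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, attach, NULL);
	pthread_create(&b, NULL, attach, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	printf("one tracker, %d references\n", slot->refs);    /* prints 2 */
	return 0;
}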
+
+/*
+ * Uncharging is always a welcome operation; we never complain, we
+ * simply uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+       struct mem_cgroup *mem;
+       struct page *page;
+
+       if (!pc)
+               return;
+
+       if (atomic_dec_and_test(&pc->ref_cnt)) {
+               page = pc->page;
+               lock_page_cgroup(page);
+               mem = pc->mem_cgroup;
+               css_put(&mem->css);
+               page_assign_page_cgroup(page, NULL);
+               unlock_page_cgroup(page);
+               res_counter_uncharge(&mem->res, 1);
+               kfree(pc);
+       }
 }
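Uncharge is a last-reference teardown: only the caller that drops pc->ref_cnt to zero detaches the page_cgroup from the page, returns the res_counter charge, drops the css reference and frees the structure. A compact sketch of that shape, with hypothetical names and __atomic_sub_fetch() standing in for atomic_dec_and_test():

#include <stdio.h>
#include <stdlib.h>

struct tracker {
	int refs;
};

/* returns 1 when the count reaches zero, like atomic_dec_and_test() */
static int dec_and_test(int *count)
{
	return __atomic_sub_fetch(count, 1, __ATOMIC_ACQ_REL) == 0;
}

static void put_tracker(struct tracker *t)
{
	if (!t)
		return;
	if (dec_and_test(&t->refs)) {
		/* last user: release resources, then free the bookkeeping */
		printf("freeing tracker\n");
		free(t);
	}
}

int main(void)
{
	struct tracker *t = malloc(sizeof(*t));

	if (!t)
		return 1;
	t->refs = 2;            /* e.g. mapped and in the page cache */
	put_tracker(t);         /* still referenced, nothing happens */
	put_tracker(t);         /* prints "freeing tracker"          */
	return 0;
}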
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
                return NULL;
 
        res_counter_init(&mem->res);
+       INIT_LIST_HEAD(&mem->active_list);
+       INIT_LIST_HEAD(&mem->inactive_list);
        return &mem->css;
 }
 
 
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
 {
        int retval;
        pte_t *pte;
-       spinlock_t *ptl;  
+       spinlock_t *ptl;
+
+       retval = mem_cgroup_charge(page, mm);
+       if (retval)
+               goto out;
 
        retval = -EINVAL;
        if (PageAnon(page))
-               goto out;
+               goto out_uncharge;
        retval = -ENOMEM;
        flush_dcache_page(page);
        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
-               goto out;
+               goto out_uncharge;
        retval = -EBUSY;
        if (!pte_none(*pte))
                goto out_unlock;
        set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
        retval = 0;
+       pte_unmap_unlock(pte, ptl);
+       return retval;
 out_unlock:
        pte_unmap_unlock(pte, ptl);
+out_uncharge:
+       mem_cgroup_uncharge_page(page);
 out:
        return retval;
 }
        cow_user_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
 
+       if (mem_cgroup_charge(new_page, mm))
+               goto oom_free_new;
+
        /*
         * Re-check the pte - we dropped the lock
         */
                /* Free the old page.. */
                new_page = old_page;
                ret |= VM_FAULT_WRITE;
-       }
+       } else
+               mem_cgroup_uncharge_page(new_page);
+
        if (new_page)
                page_cache_release(new_page);
        if (old_page)
                put_page(dirty_page);
        }
        return ret;
+oom_free_new:
+       __free_page(new_page);
 oom:
        if (old_page)
                page_cache_release(old_page);
                count_vm_event(PGMAJFAULT);
        }
 
+       if (mem_cgroup_charge(page, mm)) {
+               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               ret = VM_FAULT_OOM;
+               goto out;
+       }
+
        mark_page_accessed(page);
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
        if (write_access) {
                /* XXX: We could OR the do_wp_page code with this one? */
                if (do_wp_page(mm, vma, address,
-                               page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+                               page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+                       mem_cgroup_uncharge_page(page);
                        ret = VM_FAULT_OOM;
+               }
                goto out;
        }
 
 out:
        return ret;
 out_nomap:
+       mem_cgroup_uncharge_page(page);
        pte_unmap_unlock(page_table, ptl);
        unlock_page(page);
        page_cache_release(page);
                goto oom;
        __SetPageUptodate(page);
 
+       if (mem_cgroup_charge(page, mm))
+               goto oom_free_page;
+
        entry = mk_pte(page, vma->vm_page_prot);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
        pte_unmap_unlock(page_table, ptl);
        return 0;
 release:
+       mem_cgroup_uncharge_page(page);
        page_cache_release(page);
        goto unlock;
+oom_free_page:
+       __free_page(page);
 oom:
        return VM_FAULT_OOM;
 }
 
        }
 
+       if (mem_cgroup_charge(page, mm)) {
+               ret = VM_FAULT_OOM;
+               goto out;
+       }
+
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
        /*
                /* no need to invalidate: a not-present page won't be cached */
                update_mmu_cache(vma, address, entry);
        } else {
+               mem_cgroup_uncharge_page(page);
                if (anon)
                        page_cache_release(page);
                else
 
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
                return;
        }
 
+       if (mem_cgroup_charge(new, mm)) {
+               pte_unmap(ptep);
+               return;
+       }
+
        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        pte = *ptep;
 
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 
        if (!PageHighMem(page))
                debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+       VM_BUG_ON(page_get_page_cgroup(page));
        arch_free_page(page, 0);
        kernel_map_pages(page, 1, 0);
 
                set_page_links(page, zone, nid, pfn);
                init_page_count(page);
                reset_page_mapcount(page);
+               page_assign_page_cgroup(page, NULL);
                SetPageReserved(page);
 
                /*
 
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        if (atomic_inc_and_test(&page->_mapcount))
                __page_set_anon_rmap(page, vma, address);
-       else
+       else {
                __page_check_anon_rmap(page, vma, address);
+               /*
+                * We unconditionally charged during prepare; we uncharge here.
+                * This takes care of balancing the reference counts.
+                */
+               mem_cgroup_uncharge_page(page);
+       }
 }
 
 /*
 {
        if (atomic_inc_and_test(&page->_mapcount))
                __inc_zone_page_state(page, NR_FILE_MAPPED);
+       else
+               /*
+                * We unconditionally charged during prepare; we uncharge here.
+                * This takes care of balancing the reference counts.
+                */
+               mem_cgroup_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
                        page_clear_dirty(page);
                        set_page_dirty(page);
                }
+               mem_cgroup_uncharge_page(page);
+
                __dec_zone_page_state(page,
                                PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
        }
 
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
        BUG_ON(PagePrivate(page));
        error = radix_tree_preload(gfp_mask);
        if (!error) {
+
+               error = mem_cgroup_charge(page, current->mm);
+               if (error)
+                       goto out;
+
                write_lock_irq(&swapper_space.tree_lock);
                error = radix_tree_insert(&swapper_space.page_tree,
                                                entry.val, page);
                        total_swapcache_pages++;
                        __inc_zone_page_state(page, NR_FILE_PAGES);
                        INC_CACHE_INFO(add_total);
+               } else {
+                       mem_cgroup_uncharge_page(page);
                }
                write_unlock_irq(&swapper_space.tree_lock);
                radix_tree_preload_end();
        }
+out:
        return error;
 }
 
        BUG_ON(PageWriteback(page));
        BUG_ON(PagePrivate(page));
 
+       mem_cgroup_uncharge_page(page);
        radix_tree_delete(&swapper_space.page_tree, page_private(page));
        set_page_private(page, 0);
        ClearPageSwapCache(page);
 
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
                unsigned long addr, swp_entry_t entry, struct page *page)
 {
+       if (mem_cgroup_charge(page, vma->vm_mm))
+               return -ENOMEM;
+
        inc_mm_counter(vma->vm_mm, anon_rss);
        get_page(page);
        set_pte_at(vma->vm_mm, addr, pte,
         * immediately swapped out again after swapon.
         */
        activate_page(page);
+       return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
        pte_t swp_pte = swp_entry_to_pte(entry);
        pte_t *pte;
        spinlock_t *ptl;
-       int found = 0;
+       int ret = 0;
 
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                 * Test inline before going to call unuse_pte.
                 */
                if (unlikely(pte_same(*pte, swp_pte))) {
-                       unuse_pte(vma, pte++, addr, entry, page);
-                       found = 1;
+                       ret = unuse_pte(vma, pte++, addr, entry, page);
                        break;
                }
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
-       return found;
+       return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
        pmd_t *pmd;
        unsigned long next;
+       int ret;
 
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-               if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-                       return 1;
+               ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+               if (ret)
+                       return ret;
        } while (pmd++, addr = next, addr != end);
        return 0;
 }
 {
        pud_t *pud;
        unsigned long next;
+       int ret;
 
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-               if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-                       return 1;
+               ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+               if (ret)
+                       return ret;
        } while (pud++, addr = next, addr != end);
        return 0;
 }
 {
        pgd_t *pgd;
        unsigned long addr, end, next;
+       int ret;
 
        if (page->mapping) {
                addr = page_address_in_vma(page, vma);
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-               if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-                       return 1;
+               ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+               if (ret)
+                       return ret;
        } while (pgd++, addr = next, addr != end);
        return 0;
 }
                                swp_entry_t entry, struct page *page)
 {
        struct vm_area_struct *vma;
+       int ret = 0;
 
        if (!down_read_trylock(&mm->mmap_sem)) {
                /*
                lock_page(page);
        }
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               if (vma->anon_vma && unuse_vma(vma, entry, page))
+               if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
                        break;
        }
        up_read(&mm->mmap_sem);
-       /*
-        * Currently unuse_mm cannot fail, but leave error handling
-        * at call sites for now, since we change it from time to time.
-        */
-       return 0;
+       return (ret < 0) ? ret : 0;
 }
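The swapoff rework replaces the old boolean "found" with a three-way return convention: 0 means keep searching, 1 means the swap entry was handled, and a negative value (here -ENOMEM from a failed charge) aborts the walk. Every level of the page-table walk simply forwards any non-zero result, and unuse_mm() maps a positive result back to 0 while letting real errors through. A toy sketch of the convention over a nested table, names hypothetical:

#include <errno.h>
#include <stdio.h>

/* 0 = not here, 1 = found and handled, <0 = hard error */
static int visit(int value, int target)
{
	if (value != target)
		return 0;
	if (value == 13)
		return -ENOMEM;         /* e.g. the charge failed */
	return 1;
}

static int walk_row(const int *row, int len, int target)
{
	int i, ret;

	for (i = 0; i < len; i++) {
		ret = visit(row[i], target);
		if (ret)
			return ret;     /* stop on success or error */
	}
	return 0;
}

static int walk_table(const int rows[][3], int nrows, int target)
{
	int i, ret;

	for (i = 0; i < nrows; i++) {
		ret = walk_row(rows[i], 3, target);
		if (ret)
			return ret;     /* forward non-zero results upward */
	}
	return 0;
}

int main(void)
{
	const int table[2][3] = { { 1, 2, 3 }, { 4, 13, 6 } };

	printf("find 4:  %d\n", walk_table(table, 2, 4));   /* 1       */
	printf("find 13: %d\n", walk_table(table, 2, 13));  /* -ENOMEM */
	printf("find 9:  %d\n", walk_table(table, 2, 9));   /* 0       */
	return 0;
}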
 
 /*