*/
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
+#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
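+/*
+ * These flags are stored in the top bits of vma->vm_private_data for
+ * MAP_PRIVATE mappings; the remaining bits hold the huge page reserve
+ * count manipulated by the helpers below.
+ */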
 /*
  * These helpers are used to track how many pages are reserved for
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 {
        VM_BUG_ON(!is_vm_hugetlb_page(vma));
        if (!(vma->vm_flags & VM_SHARED))
-               return (unsigned long)vma->vm_private_data;
+               return (unsigned long)vma->vm_private_data & ~HPAGE_RESV_MASK;
        return 0;
 }
 
 static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
                                                        unsigned long reserve)
 {
+       unsigned long flags;
        VM_BUG_ON(!is_vm_hugetlb_page(vma));
        VM_BUG_ON(vma->vm_flags & VM_SHARED);
 
-       vma->vm_private_data = (void *)reserve;
+       flags = (unsigned long)vma->vm_private_data & HPAGE_RESV_MASK;
+       vma->vm_private_data = (void *)(reserve | flags);
+}
+
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+{
+       unsigned long reserveflags = (unsigned long)vma->vm_private_data;
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       vma->vm_private_data = (void *)(reserveflags | flags);
+}
+
+static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
+{
+       VM_BUG_ON(!is_vm_hugetlb_page(vma));
+       return ((unsigned long)vma->vm_private_data & flag) != 0;
 }
 
 /* Decrement the reserved pages in the hugepage pool by one */
                 * Only the process that called mmap() has reserves for
                 * private mappings.
                 */
-               if (vma_resv_huge_pages(vma)) {
+               if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+                       unsigned long flags, reserve;
                        resv_huge_pages--;
+                       flags = (unsigned long)vma->vm_private_data &
+                                                       HPAGE_RESV_MASK;
                        reserve = (unsigned long)vma->vm_private_data - 1;
-                       vma->vm_private_data = (void *)reserve;
+                       vma->vm_private_data = (void *)(reserve | flags);
                }
        }
 }
 
+/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
 void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
 {
        VM_BUG_ON(!is_vm_hugetlb_page(vma));
 }
 
 static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
-                               unsigned long address)
+                               unsigned long address, int avoid_reserve)
 {
        int nid;
        struct page *page = NULL;
                        free_huge_pages - resv_huge_pages == 0)
                return NULL;
 
+       /* If reserves cannot be used, ensure enough pages are in the pool */
+       if (avoid_reserve && free_huge_pages - resv_huge_pages == 0)
+               return NULL;
+
        for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                                MAX_NR_ZONES - 1, nodemask) {
                nid = zone_to_nid(zone);
                        list_del(&page->lru);
                        free_huge_pages--;
                        free_huge_pages_node[nid]--;
-                       decrement_hugepage_resv_vma(vma);
+
+                       if (!avoid_reserve)
+                               decrement_hugepage_resv_vma(vma);
 
                        break;
                }
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
-                                   unsigned long addr)
+                                   unsigned long addr, int avoid_reserve)
 {
        struct page *page;
        struct address_space *mapping = vma->vm_file->f_mapping;
         * will not have accounted against quota. Check that the quota can be
         * made before satisfying the allocation
         */
-       if (!vma_has_private_reserves(vma)) {
+       if (!(vma->vm_flags & VM_SHARED) &&
+                       !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                chg = 1;
                if (hugetlb_get_quota(inode->i_mapping, chg))
                        return ERR_PTR(-ENOSPC);
        }
 
        spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(vma, addr);
+       page = dequeue_huge_page_vma(vma, addr, avoid_reserve);
        spin_unlock(&hugetlb_lock);
 
        if (!page) {
 }
 
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                           unsigned long end)
+                           unsigned long end, struct page *ref_page)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;
 
+               /*
+                * If a reference page is supplied, it is because a specific
+                * page is being unmapped, not a range. Ensure the page we
+                * are about to unmap is the actual page of interest.
+                */
+               if (ref_page) {
+                       pte = huge_ptep_get(ptep);
+                       if (huge_pte_none(pte))
+                               continue;
+                       page = pte_page(pte);
+                       if (page != ref_page)
+                               continue;
+
+                       /*
+                        * Mark the VMA as having unmapped its page so that
+                        * future faults in this VMA will fail rather than
+                        * looking like data was lost
+                        */
+                       set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
+               }
+
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (huge_pte_none(pte))
                        continue;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end)
+                         unsigned long end, struct page *ref_page)
 {
        /*
         * It is undesirable to test vma->vm_file as it should be non-null
         */
        if (vma->vm_file) {
                spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-               __unmap_hugepage_range(vma, start, end);
+               __unmap_hugepage_range(vma, start, end, ref_page);
                spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
        }
 }
 
+/*
+ * This is called when the original mapper fails to COW a MAP_PRIVATE
+ * mapping it owns the reserve page for. The intention is to unmap the page
+ * from other VMAs and let the children be SIGKILLed if they are faulting the
+ * same region.
+ */
+int unmap_ref_private(struct mm_struct *mm,
+                                       struct vm_area_struct *vma,
+                                       struct page *page,
+                                       unsigned long address)
+{
+       struct vm_area_struct *iter_vma;
+       struct address_space *mapping;
+       struct prio_tree_iter iter;
+       pgoff_t pgoff;
+
+       /*
+        * vm_pgoff is in PAGE_SIZE units, hence the different calculation
+        * from page cache lookup which is in HPAGE_SIZE units.
+        */
+       address = address & HPAGE_MASK;
+       pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
+               + vma->vm_pgoff;
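+       /*
+        * e.g. with 4K base pages and 2MB huge pages, a fault in the
+        * second huge page of a mapping with vm_pgoff 0 gives pgoff 512
+        */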
+       mapping = (struct address_space *)page_private(page);
+
+       vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+               /* Do not unmap the current VMA */
+               if (iter_vma == vma)
+                       continue;
+
+               /*
+                * Unmap the page from other VMAs without their own reserves.
+                * They get marked to be SIGKILLed if they fault in these
+                * areas. This is because a future no-page fault on this VMA
+                * could insert a zeroed page instead of the data existing
+                * from the time of fork. This would look like data corruption
+                */
+               if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
+                       unmap_hugepage_range(iter_vma,
+                               address, address + HPAGE_SIZE,
+                               page);
+       }
+
+       return 1;
+}
+
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep, pte_t pte)
+                       unsigned long address, pte_t *ptep, pte_t pte,
+                       struct page *pagecache_page)
 {
        struct page *old_page, *new_page;
        int avoidcopy;
+       int outside_reserve = 0;
 
        old_page = pte_page(pte);
 
+retry_avoidcopy:
        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
        avoidcopy = (page_count(old_page) == 1);
                return 0;
        }
 
+       /*
+        * If the process that created a MAP_PRIVATE mapping is about to
+        * perform a COW due to a shared page count, attempt to satisfy
+        * the allocation without using the existing reserves. The pagecache
+        * page is used to determine if the reserve at this address was
+        * consumed or not. If reserves were used, a mapping that was only
+        * partially faulted at the time of fork() could consume its reserves
+        * on COW instead of on faults over the full address range.
+        */
+       if (!(vma->vm_flags & VM_SHARED) &&
+                       is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
+                       old_page != pagecache_page)
+               outside_reserve = 1;
+
        page_cache_get(old_page);
-       new_page = alloc_huge_page(vma, address);
+       new_page = alloc_huge_page(vma, address, outside_reserve);
 
        if (IS_ERR(new_page)) {
                page_cache_release(old_page);
+
+               /*
+                * If a process owning a MAP_PRIVATE mapping fails to COW,
+                * it is due to references held by a child and an insufficient
+                * huge page pool. To guarantee the original mapper's
+                * reliability, unmap the page from child processes. The child
+                * may get SIGKILLed if it later faults.
+                */
+               if (outside_reserve) {
+                       BUG_ON(huge_pte_none(pte));
+                       if (unmap_ref_private(mm, vma, old_page, address)) {
+                               BUG_ON(page_count(old_page) != 1);
+                               BUG_ON(huge_pte_none(pte));
+                               goto retry_avoidcopy;
+                       }
+                       WARN_ON_ONCE(1);
+               }
+
                return -PTR_ERR(new_page);
        }
 
        return 0;
 }
 
+/* Return the pagecache page at a given address within a VMA */
+static struct page *hugetlbfs_pagecache_page(struct vm_area_struct *vma,
+                       unsigned long address)
+{
+       struct address_space *mapping;
+       unsigned long idx;
+
+       mapping = vma->vm_file->f_mapping;
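+       /* The hugetlbfs page cache is indexed in huge page units */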
+       idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+               + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+       return find_lock_page(mapping, idx);
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
 {
        struct address_space *mapping;
        pte_t new_pte;
 
+       /*
+        * Currently, we are forced to kill the process in the event the
+        * original mapper has unmapped pages from the child due to a failed
+                * COW. Warn that such a situation has occurred as it may not be obvious.
+        */
+       if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
+               printk(KERN_WARNING
+                       "PID %d killed due to inadequate hugepage pool\n",
+                       current->pid);
+               return ret;
+       }
+
        mapping = vma->vm_file->f_mapping;
        idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
                + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
                size = i_size_read(mapping->host) >> HPAGE_SHIFT;
                if (idx >= size)
                        goto out;
-               page = alloc_huge_page(vma, address);
+               page = alloc_huge_page(vma, address, 0);
                if (IS_ERR(page)) {
                        ret = -PTR_ERR(page);
                        goto out;
 
        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
+               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
        }
 
        spin_unlock(&mm->page_table_lock);
        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (likely(pte_same(entry, huge_ptep_get(ptep))))
-               if (write_access && !pte_write(entry))
-                       ret = hugetlb_cow(mm, vma, address, ptep, entry);
+               if (write_access && !pte_write(entry)) {
+                       struct page *page;
+                       page = hugetlbfs_pagecache_page(vma, address);
+                       ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
+                       if (page) {
+                               unlock_page(page);
+                               put_page(page);
+                       }
+               }
        spin_unlock(&mm->page_table_lock);
        mutex_unlock(&hugetlb_instantiation_mutex);
 
        else {
                chg = to - from;
                set_vma_resv_huge_pages(vma, chg);
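+               /*
+                * Only the process that called mmap() has reserves for
+                * private mappings; mark this VMA as owning the reserve.
+                */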
+               set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }
 
        if (chg < 0)