kernel-doc: allow unnamed bit-fields

[linux-2.6-omap-h63xx.git] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index ce3c9e4492d803b011f50ea8641e477056116ddd..48c122d42ed743dcc90178170b4aff380ec3ad8c 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
  }
  
  /*
- * This function gets the "struct page" associated with a pte.
+ * vm_normal_page -- This function gets the "struct page" associated with a pte.
   *
- * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
- * will have each page table entry just pointing to a raw page frame
- * number, and as far as the VM layer is concerned, those do not have
- * pages associated with them - even if the PFN might point to memory
- * that otherwise is perfectly fine and has a "struct page".
+ * "Special" mappings do not wish to be associated with a "struct page" (either
+ * it doesn't exist, or it exists but they don't want to touch it). In this
+ * case, NULL is returned here. "Normal" mappings do have a struct page.
   *
- * The way we recognize those mappings is through the rules set up
- * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
- * and the vm_pgoff will point to the first PFN mapped: thus every
- * page that is a raw mapping will always honor the rule
+ * There are 2 broad cases. Firstly, an architecture may define a pte_special()
+ * pte bit, in which case this function is trivial. Secondly, an architecture
+ * may not have a spare pte bit, which requires a more complicated scheme,
+ * described below.
+ *
+ * A raw VM_PFNMAP mapping (i.e. one that is not COWed) is always considered a
+ * special mapping (even if there are underlying and valid "struct pages").
+ * COWed pages of a VM_PFNMAP are always normal.
+ *
+ * The way we recognize COWed pages within VM_PFNMAP mappings is through the
+ * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
+ * set, and the vm_pgoff will point to the first PFN mapped: thus every special
+ * mapping will always honor the rule
   *
   *     pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
   *
- * and if that isn't true, the page has been COW'ed (in which case it
- * _does_ have a "struct page" associated with it even if it is in a
- * VM_PFNMAP range).
+ * And for normal mappings this is false.
+ *
+ * This restricts such mappings to be a linear translation from virtual address
+ * to pfn. To get around this restriction, we allow arbitrary mappings so long
+ * as the vma is not a COW mapping; in that case, we know that all ptes are
+ * special (because none can have been COWed).
+ *
+ *
+ * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
+ *
+ * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
+ * page" backing; however, the difference is that _all_ pages with a struct
+ * page (that is, those where pfn_valid is true) are refcounted and considered
+ * normal pages by the VM. The disadvantage is that pages are refcounted
+ * (which can be slower and simply not an option for some PFNMAP users). The
+ * advantage is that we don't have to follow the strict linearity rule of
+ * PFNMAP mappings in order to support COWable mappings.
+ *
   */
-struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+# define HAVE_PTE_SPECIAL 1
+#else
+# define HAVE_PTE_SPECIAL 0
+#endif
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+                               pte_t pte)
  {
-       unsigned long pfn = pte_pfn(pte);
+       unsigned long pfn;
  
-       if (unlikely(vma->vm_flags & VM_PFNMAP)) {
-               unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
-               if (pfn == vma->vm_pgoff + off)
-                       return NULL;
-               if (!is_cow_mapping(vma->vm_flags))
-                       return NULL;
+       if (HAVE_PTE_SPECIAL) {
+               if (likely(!pte_special(pte))) {
+                       VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+                       return pte_page(pte);
+               }
+               VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
+               return NULL;
         }
  
-#ifdef CONFIG_DEBUG_VM
-       /*
-        * Add some anal sanity checks for now. Eventually,
-        * we should just do "return pfn_to_page(pfn)", but
-        * in the meantime we check that we get a valid pfn,
-        * and that the resulting page looks ok.
-        */
-       if (unlikely(!pfn_valid(pfn))) {
-               print_bad_pte(vma, pte, addr);
-               return NULL;
+       /* !HAVE_PTE_SPECIAL case follows: */
+
+       pfn = pte_pfn(pte);
+
+       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+               if (vma->vm_flags & VM_MIXEDMAP) {
+                       if (!pfn_valid(pfn))
+                               return NULL;
+                       goto out;
+               } else {
+                       unsigned long off;
+                       off = (addr - vma->vm_start) >> PAGE_SHIFT;
+                       if (pfn == vma->vm_pgoff + off)
+                               return NULL;
+                       if (!is_cow_mapping(vma->vm_flags))
+                               return NULL;
+               }
         }
-#endif
+
+       VM_BUG_ON(!pfn_valid(pfn));
  
         /*
-        * NOTE! We still have PageReserved() pages in the page 
-        * tables. 
+        * NOTE! We still have PageReserved() pages in the page tables.
          *
-        * The PAGE_ZERO() pages and various VDSO mappings can
-        * cause them to exist.
+        * e.g. VDSO mappings can cause them to exist.
          */
+out:
         return pfn_to_page(pfn);
  }
  
@@ -933,7 +969,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 goto no_page_table;
         
         pmd = pmd_offset(pud, address);
-       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+       if (pmd_none(*pmd))
                 goto no_page_table;
  
         if (pmd_huge(*pmd)) {
@@ -942,6 +978,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
                 goto out;
         }
  
+       if (unlikely(pmd_bad(*pmd)))
+               goto no_page_table;
+
         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
         if (!ptep)
                 goto out;
@@ -1057,8 +1096,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                 if (pages)
                         foll_flags |= FOLL_GET;
                 if (!write && !(vma->vm_flags & VM_LOCKED) &&
-                   (!vma->vm_ops || (!vma->vm_ops->nopage &&
-                                       !vma->vm_ops->fault)))
+                   (!vma->vm_ops || !vma->vm_ops->fault))
                         foll_flags |= FOLL_ANON;
  
                 do {
@@ -1141,8 +1179,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
   * old drivers should use this, and they needed to mark their
   * pages reserved for the old functions anyway.
   */
-static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+static int insert_page(struct vm_area_struct *vma, unsigned long addr,
+                       struct page *page, pgprot_t prot)
  {
+       struct mm_struct *mm = vma->vm_mm;
         int retval;
         pte_t *pte;
         spinlock_t *ptl;
@@ -1202,40 +1242,26 @@ out:
   *
   * The page does not need to be reserved.
   */
-int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
+                       struct page *page)
  {
         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
         if (!page_count(page))
                 return -EINVAL;
         vma->vm_flags |= VM_INSERTPAGE;
-       return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+       return insert_page(vma, addr, page, vma->vm_page_prot);
  }
  EXPORT_SYMBOL(vm_insert_page);
  
-/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_inert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-               unsigned long pfn)
+static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn, pgprot_t prot)
  {
         struct mm_struct *mm = vma->vm_mm;
         int retval;
         pte_t *pte, entry;
         spinlock_t *ptl;
  
-       BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-       BUG_ON(is_cow_mapping(vma->vm_flags));
-
         retval = -ENOMEM;
         pte = get_locked_pte(mm, addr, &ptl);
         if (!pte)
@@ -1245,19 +1271,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                 goto out_unlock;
  
         /* Ok, finally just insert the thing.. */
-       entry = pfn_pte(pfn, vma->vm_page_prot);
+       entry = pte_mkspecial(pfn_pte(pfn, prot));
         set_pte_at(mm, addr, pte, entry);
-       update_mmu_cache(vma, addr, entry);
+       update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
  
         retval = 0;
  out_unlock:
         pte_unmap_unlock(pte, ptl);
-
  out:
         return retval;
  }
+
+/**
+ * vm_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return NULL.
+ */
+int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn)
+{
+       /*
+        * Technically, architectures with pte_special can avoid all these
+        * restrictions (same for remap_pfn_range).  However we would like
+        * consistency in testing and feature parity among all, so we should
+        * try to keep these invariants in place for everybody.
+        */
+       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+       BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
+                                               (VM_PFNMAP|VM_MIXEDMAP));
+       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
+       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
+       if (addr < vma->vm_start || addr >= vma->vm_end)
+               return -EFAULT;
+       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
  EXPORT_SYMBOL(vm_insert_pfn);
  
+int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+                       unsigned long pfn)
+{
+       BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
+
+       if (addr < vma->vm_start || addr >= vma->vm_end)
+               return -EFAULT;
+
+       /*
+        * If we don't have pte special, then we have to use the pfn_valid()
+        * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
+        * refcount the page if pfn_valid is true (hence insert_page rather
+        * than insert_pfn).
+        */
+       if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+               struct page *page;
+
+               page = pfn_to_page(pfn);
+               return insert_page(vma, addr, page, vma->vm_page_prot);
+       }
+       return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_mixed);
+
  /*
   * maps a range of physical memory into the requested pages. the old
   * mappings are removed. any references to nonexistent pages results
@@ -1276,7 +1357,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
         arch_enter_lazy_mmu_mode();
         do {
                 BUG_ON(!pte_none(*pte));
-               set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+               set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                 pfn++;
         } while (pte++, addr += PAGE_SIZE, addr != end);
         arch_leave_lazy_mmu_mode();
@@ -1711,7 +1792,7 @@ unlock:
         }
         return ret;
  oom_free_new:
-       __free_page(new_page);
+       page_cache_release(new_page);
  oom:
         if (old_page)
                 page_cache_release(old_page);
@@ -2093,12 +2174,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         unlock_page(page);
  
         if (write_access) {
-               /* XXX: We could OR the do_wp_page code with this one? */
-               if (do_wp_page(mm, vma, address,
-                               page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
-                       mem_cgroup_uncharge_page(page);
-                       ret = VM_FAULT_OOM;
-               }
+               ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
+               if (ret & VM_FAULT_ERROR)
+                       ret &= VM_FAULT_ERROR;
                 goto out;
         }
  
@@ -2163,7 +2241,7 @@ release:
         page_cache_release(page);
         goto unlock;
  oom_free_page:
-       __free_page(page);
+       page_cache_release(page);
  oom:
         return VM_FAULT_OOM;
  }
@@ -2202,20 +2280,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         BUG_ON(vma->vm_flags & VM_PFNMAP);
  
-       if (likely(vma->vm_ops->fault)) {
-               ret = vma->vm_ops->fault(vma, &vmf);
-               if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
-                       return ret;
-       } else {
-               /* Legacy ->nopage path */
-               ret = 0;
-               vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
-               /* no page was available -- either SIGBUS or OOM */
-               if (unlikely(vmf.page == NOPAGE_SIGBUS))
-                       return VM_FAULT_SIGBUS;
-               else if (unlikely(vmf.page == NOPAGE_OOM))
-                       return VM_FAULT_OOM;
-       }
+       ret = vma->vm_ops->fault(vma, &vmf);
+       if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+               return ret;
  
         /*
          * For consistency in subsequent calls, make the faulted page always
@@ -2380,10 +2447,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
         unsigned long pfn;
  
         pte_unmap(page_table);
-       BUG_ON(!(vma->vm_flags & VM_PFNMAP));
-       BUG_ON(is_cow_mapping(vma->vm_flags));
+       BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
+       BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
  
         pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+
+       BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
+
         if (unlikely(pfn == NOPFN_OOM))
                 return VM_FAULT_OOM;
         else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2461,7 +2531,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
         if (!pte_present(entry)) {
                 if (pte_none(entry)) {
                         if (vma->vm_ops) {
-                               if (vma->vm_ops->fault || vma->vm_ops->nopage)
+                               if (likely(vma->vm_ops->fault))
                                         return do_linear_fault(mm, vma, address,
                                                 pte, pmd, write_access, entry);
                                 if (unlikely(vma->vm_ops->nopfn))