        pte = pmd_page(*pmd);
        pmd_clear(pmd);
        dec_page_state(nr_page_table_pages);
+       pte_lock_deinit(pte);
        pte_free(pte);
        pmd_free(pmd);
 free:
 
        if (pgd_list)
-               pgd_list->private = (unsigned long) &page->index;
+               set_page_private(pgd_list, (unsigned long)&page->index);
        pgd_list = page;
-       page->private = (unsigned long) &pgd_list;
+       set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *) page->index;
-       pprev = (struct page **) page->private;
+       pprev = (struct page **)page_private(page);
        *pprev = next;
        if (next)
-               next->private = (unsigned long) pprev;
+               set_page_private(next, (unsigned long)pprev);
 
        struct page *page = virt_to_page(pgd);
        page->index = (unsigned long)pgd_list;
        if (pgd_list)
-               pgd_list->private = (unsigned long)&page->index;
+               set_page_private(pgd_list, (unsigned long)&page->index);
        pgd_list = page;
-       page->private = (unsigned long)&pgd_list;
+       set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
        struct page *next, **pprev, *page = virt_to_page(pgd);
        next = (struct page *)page->index;
-       pprev = (struct page **)page->private;
+       pprev = (struct page **)page_private(page);
        *pprev = next;
        if (next)
-               next->private = (unsigned long)pprev;
+               set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 
 
        if(!proc_mm || !ptrace_faultinfo){
                free_page(mmu->id.stack);
+               pte_lock_deinit(virt_to_page(mmu->last_page_table));
                pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
 
                cachefs_uncache_page(vnode->cache, page);
 #endif
 
-               pageio = (struct cachefs_page *) page->private;
-               page->private = 0;
+               pageio = (struct cachefs_page *) page_private(page);
+               set_page_private(page, 0);
                ClearPagePrivate(page);
 
                if (pageio)
 
 __clear_page_buffers(struct page *page)
 {
        ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
        page_cache_release(page);
 }
 
 
        atomic_t io_count;
        struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
                if (!a)
                        return -ENOMEM;
                memset(a, 0, sizeof(struct meta_anchor));
-               page->private = (unsigned long)a;
+               set_page_private(page, (unsigned long)a);
                SetPagePrivate(page);
                kmap(page);
        }
        a->mp[index] = NULL;
        if (--a->mp_count == 0) {
                kfree(a);
-               page->private = 0;
+               set_page_private(page, 0);
                ClearPagePrivate(page);
                kunmap(page);
        }
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-       return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+       return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
        if (mp) {
-               page->private = (unsigned long)mp;
+               set_page_private(page, (unsigned long)mp);
                SetPagePrivate(page);
                kmap(page);
        }
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-       page->private = 0;
+       set_page_private(page, 0);
        ClearPagePrivate(page);
        kunmap(page);
 }
 
        size_t          offset,
        size_t          length)
 {
-       page->private |= page_region_mask(offset, length);
-       if (page->private == ~0UL)
+       set_page_private(page,
+               page_private(page) | page_region_mask(offset, length));
+       if (page_private(page) == ~0UL)
                SetPageUptodate(page);
 }
 
 {
        unsigned long   mask = page_region_mask(offset, length);
 
-       return (mask && (page->private & mask) == mask);
+       return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
 
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)                                     \
        ({                                                      \
-               BUG_ON(!PagePrivate(page));             \
-               ((struct buffer_head *)(page)->private);        \
+               BUG_ON(!PagePrivate(page));                     \
+               ((struct buffer_head *)page_private(page));     \
        })
 #define page_has_buffers(page) PagePrivate(page)
 
 {
        page_cache_get(page);
        SetPagePrivate(page);
-       page->private = (unsigned long)head;
+       set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
 
                                         * to show when page is mapped
                                         * & limit reverse map searches.
                                         */
-       unsigned long private;          /* Mapping-private opaque data:
+       union {
+               unsigned long private;  /* Mapping-private opaque data:
                                         * usually used for buffer_heads
                                         * if PagePrivate set; used for
                                         * swp_entry_t if PageSwapCache
                                         * When page is free, this indicates
                                         * order in the buddy system.
                                         */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+               spinlock_t ptl;
+#endif
+       } u;
        struct address_space *mapping;  /* If low bit clear, points to
                                         * inode address_space, or NULL.
                                         * If page mapped as anonymous
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)             ((page)->u.private)
+#define set_page_private(page, v)      ((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
 
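/*
 * Illustrative sketch, not part of the patch (helper name is made up):
 * every former direct page->private access now goes through the
 * page_private()/set_page_private() accessors defined above, including
 * read-modify-write sequences like the xfs and shmem hunks elsewhere
 * in this patch.
 */
static inline void example_or_into_page_private(struct page *page,
                                                unsigned long bits)
{
        set_page_private(page, page_private(page) | bits);
}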
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-       if (PageCompound(p))
-               p = (struct page *)p->private;
-       return atomic_read(&(p)->_count) + 1;
+       if (PageCompound(page))
+               page = (struct page *)page_private(page);
+       return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
        if (unlikely(PageCompound(page)))
-               page = (struct page *)page->private;
+               page = (struct page *)page_private(page);
        atomic_inc(&page->_count);
 }
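/*
 * Illustrative sketch, not part of the patch (helper name is made up):
 * a compound tail page keeps a pointer to its head page in
 * page_private(), which is why page_count() and get_page() above
 * redirect to the head page before touching _count.
 */
static inline struct page *example_compound_head(struct page *page)
{
        if (unlikely(PageCompound(page)))
                page = (struct page *)page_private(page);
        return page;
}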
 
 static inline pgoff_t page_index(struct page *page)
 {
        if (unlikely(PageSwapCache(page)))
-               return page->private;
+               return page_private(page);
        return page->index;
 }
 
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)    &((page)->u.ptl)
+#define pte_lock_init(_page)   do {                                    \
+       spin_lock_init(__pte_lockptr(_page));                           \
+} while (0)
+#define pte_lock_deinit(page)  ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)   ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)    do {} while (0)
+#define pte_lock_deinit(page)  do {} while (0)
+#define pte_lockptr(mm, pmd)   ({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)    \
 ({                                                     \
-       spinlock_t *__ptl = &(mm)->page_table_lock;     \
+       spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
        pte_t *__pte = pte_offset_map(pmd, address);    \
        *(ptlp) = __ptl;                                \
        spin_lock(__ptl);                               \
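/*
 * Illustrative sketch, not part of the patch: a typical caller of the
 * helpers above (function name is made up; assumes <linux/mm.h>).
 * With split ptlocks, pte_lockptr() resolves to the spinlock embedded
 * in the page table page's struct page; otherwise it falls back to
 * mm->page_table_lock.
 */
static int example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *pte;

        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte_present(*pte)) {
                pte_unmap_unlock(pte, ptl);
                return 0;
        }
        /* ... operate on *pte while its page table page is locked ... */
        pte_unmap_unlock(pte, ptl);
        return 1;
}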
 
        if (pages) {
                unsigned int count, i;
                pages->mapping = NULL;
-               pages->private = order;
+               set_page_private(pages, order);
                count = 1 << order;
                for (i = 0; i < count; i++)
                        SetPageReserved(pages + i);
 {
        unsigned int order, count, i;
 
-       order = page->private;
+       order = page_private(page);
        count = 1 << order;
        for (i = 0; i < count; i++)
                ClearPageReserved(page + i);
 
 config SPARSEMEM_EXTREME
        def_bool y
        depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+       int
+       default "4096" if ARM && !CPU_CACHE_VIPT
+       default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+       default "4"
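/*
 * Illustrative sketch, not part of the patch (macro name is made up):
 * the value chosen above is only compared against NR_CPUS at compile
 * time, so e.g. NR_CPUS=2 with the default of 4 keeps the single
 * mm->page_table_lock, while an NR_CPUS=8 build gets per-page pte locks.
 */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
#define EXAMPLE_SPLIT_PTLOCKS 1
#else
#define EXAMPLE_SPLIT_PTLOCKS 0
#endif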
 
         * in the ->sync_page() methods make essential use of the
         * page_mapping(), merely passing the page down to the backing
         * device's unplug functions when it's non-NULL, which in turn
-        * ignore it for all cases but swap, where only page->private is
+        * ignore it for all cases but swap, where only page_private(page) is
         * of interest. When page_mapping() does go NULL, the entire
         * call stack gracefully ignores the page and returns.
         * -- wli
 
 {
        struct page *page = pmd_page(*pmd);
        pmd_clear(pmd);
+       pte_lock_deinit(page);
        pte_free_tlb(tlb, page);
        dec_page_state(nr_page_table_pages);
        tlb->mm->nr_ptes--;
        if (!new)
                return -ENOMEM;
 
+       pte_lock_init(new);
        spin_lock(&mm->page_table_lock);
-       if (pmd_present(*pmd))          /* Another has populated it */
+       if (pmd_present(*pmd)) {        /* Another has populated it */
+               pte_lock_deinit(new);
                pte_free(new);
-       else {
+       } else {
                mm->nr_ptes++;
                inc_page_state(nr_page_table_pages);
                pmd_populate(mm, pmd, new);
        if (!dst_pte)
                return -ENOMEM;
        src_pte = pte_offset_map_nested(src_pmd, addr);
-       src_ptl = &src_mm->page_table_lock;
+       src_ptl = pte_lockptr(src_mm, src_pmd);
        spin_lock(src_ptl);
 
        do {
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                pte_t *page_table, pte_t orig_pte)
 {
        int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spin_lock(&mm->page_table_lock);
+               spinlock_t *ptl = pte_lockptr(mm, pmd);
+               spin_lock(ptl);
                same = pte_same(*page_table, orig_pte);
-               spin_unlock(&mm->page_table_lock);
+               spin_unlock(ptl);
        }
 #endif
        pte_unmap(page_table);
        pte_t pte;
        int ret = VM_FAULT_MINOR;
 
-       if (!pte_unmap_same(mm, page_table, orig_pte))
+       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                goto out;
 
        entry = pte_to_swp_entry(orig_pte);
                page_cache_get(page);
                entry = mk_pte(page, vma->vm_page_prot);
 
-               ptl = &mm->page_table_lock;
+               ptl = pte_lockptr(mm, pmd);
                spin_lock(ptl);
                if (!pte_none(*page_table))
                        goto release;
        pgoff_t pgoff;
        int err;
 
-       if (!pte_unmap_same(mm, page_table, orig_pte))
+       if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                return VM_FAULT_MINOR;
 
        if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
                                        pte, pmd, write_access, entry);
        }
 
-       ptl = &mm->page_table_lock;
+       ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (unlikely(!pte_same(*pte, entry)))
                goto unlock;
 
        struct address_space *mapping = NULL;
        struct mm_struct *mm = vma->vm_mm;
        pte_t *old_pte, *new_pte, pte;
-       spinlock_t *old_ptl;
+       spinlock_t *old_ptl, *new_ptl;
 
        if (vma->vm_file) {
                /*
                        new_vma->vm_truncate_count = 0;
        }
 
+       /*
+        * We don't have to worry about the ordering of src and dst
+        * pte locks because exclusive mmap_sem prevents deadlock.
+        */
        old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
        new_pte = pte_offset_map_nested(new_pmd, new_addr);
+       new_ptl = pte_lockptr(mm, new_pmd);
+       if (new_ptl != old_ptl)
+               spin_lock(new_ptl);
 
        for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
                                   new_pte++, new_addr += PAGE_SIZE) {
                set_pte_at(mm, new_addr, new_pte, pte);
        }
 
+       if (new_ptl != old_ptl)
+               spin_unlock(new_ptl);
        pte_unmap_nested(new_pte - 1);
        pte_unmap_unlock(old_pte - 1, old_ptl);
        if (mapping)
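/*
 * Illustrative sketch, not part of the patch (function name is made up):
 * the idiom used above -- when source and destination ptes may share a
 * lock (same page table page, or the non-split fallback), take and
 * release the second lock only if it differs from the first.
 */
static void example_lock_two_ptls(spinlock_t *old_ptl, spinlock_t *new_ptl)
{
        spin_lock(old_ptl);
        if (new_ptl != old_ptl)
                spin_lock(new_ptl);
        /* ... move entries ... */
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        spin_unlock(old_ptl);
}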
 
                struct page *p = page + i;
 
                SetPageCompound(p);
-               p->private = (unsigned long)page;
+               set_page_private(p, (unsigned long)page);
        }
 }
 
 
                if (!PageCompound(p))
                        bad_page(__FUNCTION__, page);
-               if (p->private != (unsigned long)page)
+               if (page_private(p) != (unsigned long)page)
                        bad_page(__FUNCTION__, page);
                ClearPageCompound(p);
        }
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-       return page->private;
+       return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-       page->private = order;
+       set_page_private(page, order);
        __SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
        __ClearPagePrivate(page);
-       page->private = 0;
+       set_page_private(page, 0);
 }
 
 /*
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
  * free, the remainder of the region must be split into blocks.   
        page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
                        1 << PG_referenced | 1 << PG_arch_1 |
                        1 << PG_checked | 1 << PG_mappedtodisk);
-       page->private = 0;
+       set_page_private(page, 0);
        set_page_refs(page, order);
        kernel_map_pages(page, 1 << order, 1);
 }
 
                unlock_page(page);
                goto out;
        }
-       bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+       bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+                               end_swap_bio_write);
        if (bio == NULL) {
                set_page_dirty(page);
                unlock_page(page);
 
        BUG_ON(!PageLocked(page));
        ClearPageUptodate(page);
-       bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+       bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+                               end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
                ret = -ENOMEM;
 
                return NULL;
        }
 
-       ptl = &mm->page_table_lock;
+       ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
                *ptlp = ptl;
        update_hiwater_rss(mm);
 
        if (PageAnon(page)) {
-               swp_entry_t entry = { .val = page->private };
+               swp_entry_t entry = { .val = page_private(page) };
                /*
                 * Store the swap location in the pte.
                 * See handle_pte_fault() ...
 
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped             private
-
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
        SGP_QUICK,      /* don't try more than file page cache lookup */
 
        entry->val = value;
        info->swapped += incdec;
-       if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
-               kmap_atomic_to_page(entry)->nr_swapped += incdec;
+       if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+               struct page *page = kmap_atomic_to_page(entry);
+               set_page_private(page, page_private(page) + incdec);
+       }
 }
 
 /*
 
                spin_unlock(&info->lock);
                page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-               if (page) {
-                       page->nr_swapped = 0;
-               }
+               if (page)
+                       set_page_private(page, 0);
                spin_lock(&info->lock);
 
                if (!page) {
                        diroff = 0;
                }
                subdir = dir[diroff];
-               if (subdir && subdir->nr_swapped) {
+               if (subdir && page_private(subdir)) {
                        size = limit - idx;
                        if (size > ENTRIES_PER_PAGE)
                                size = ENTRIES_PER_PAGE;
                        nr_swaps_freed += freed;
                        if (offset)
                                spin_lock(&info->lock);
-                       subdir->nr_swapped -= freed;
+                       set_page_private(subdir, page_private(subdir) - freed);
                        if (offset)
                                spin_unlock(&info->lock);
-                       BUG_ON(subdir->nr_swapped > offset);
+                       BUG_ON(page_private(subdir) > offset);
                }
                if (offset)
                        offset = 0;
                        dir = shmem_dir_map(subdir);
                }
                subdir = *dir;
-               if (subdir && subdir->nr_swapped) {
+               if (subdir && page_private(subdir)) {
                        ptr = shmem_swp_map(subdir);
                        size = limit - idx;
                        if (size > ENTRIES_PER_PAGE)
 
 void put_page(struct page *page)
 {
        if (unlikely(PageCompound(page))) {
-               page = (struct page *)page->private;
+               page = (struct page *)page_private(page);
                if (put_page_testzero(page)) {
                        void (*dtor)(struct page *page);
 
 
                        page_cache_get(page);
                        SetPageLocked(page);
                        SetPageSwapCache(page);
-                       page->private = entry.val;
+                       set_page_private(page, entry.val);
                        total_swapcache_pages++;
                        pagecache_acct(1);
                }
        BUG_ON(PageWriteback(page));
        BUG_ON(PagePrivate(page));
 
-       radix_tree_delete(&swapper_space.page_tree, page->private);
-       page->private = 0;
+       radix_tree_delete(&swapper_space.page_tree, page_private(page));
+       set_page_private(page, 0);
        ClearPageSwapCache(page);
        total_swapcache_pages--;
        pagecache_acct(-1);
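/*
 * Illustrative sketch, not part of the patch (helper name is made up;
 * assumes <linux/swap.h>): for a PageSwapCache page, page_private()
 * holds the swp_entry_t value, as the add/delete paths above and the
 * swapfile hunks below rely on.
 */
static inline swp_entry_t example_page_swp_entry(struct page *page)
{
        swp_entry_t entry = { .val = page_private(page) };
        return entry;
}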
 {
        swp_entry_t entry;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
 
        write_lock_irq(&swapper_space.tree_lock);
        __delete_from_swap_cache(page);
 
        swp_entry_t entry;
 
        down_read(&swap_unplug_sem);
-       entry.val = page->private;
+       entry.val = page_private(page);
        if (PageSwapCache(page)) {
                struct block_device *bdev = swap_info[swp_type(entry)].bdev;
                struct backing_dev_info *bdi;
                /*
                 * If the page is removed from swapcache from under us (with a
                 * racy try_to_unuse/swapoff) we need an additional reference
-                * count to avoid reading garbage from page->private above. If
-                * the WARN_ON triggers during a swapoff it maybe the race
+                * count to avoid reading garbage from page_private(page) above.
+                * If the WARN_ON triggers during a swapoff it may be the race
                 * condition and it's harmless. However if it triggers without
                 * swapoff it signals a problem.
                 */
        struct swap_info_struct *p;
        swp_entry_t entry;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
        p = swap_info_get(entry);
        if (p) {
                /* Subtract the 1 for the swap cache itself */
        if (page_count(page) != 2) /* 2: us + cache */
                return 0;
 
-       entry.val = page->private;
+       entry.val = page_private(page);
        p = swap_info_get(entry);
        if (!p)
                return 0;
        BUG_ON(!PageLocked(page));      /* It pins the swap_info_struct */
 
        if (PageSwapCache(page)) {
-               swp_entry_t entry = { .val = page->private };
+               swp_entry_t entry = { .val = page_private(page) };
                struct swap_info_struct *sis;
 
                sis = get_swap_info_struct(swp_type(entry));
 
 
 #ifdef CONFIG_SWAP
                if (PageSwapCache(page)) {
-                       swp_entry_t swap = { .val = page->private };
+                       swp_entry_t swap = { .val = page_private(page) };
                        __delete_from_swap_cache(page);
                        write_unlock_irq(&mapping->tree_lock);
                        swap_free(swap);