Merge source.mvista.com:/home/git/linux-omap-2.6
index 71370256a7eb11e8b6eeb0c88cbd0e23955ab163..d0bd7f07ab041d3d7da06c5cec610035c3058b33 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -294,6 +294,7 @@ struct kmem_list3 {
        unsigned long next_reap;
        int free_touched;
        unsigned int free_limit;
+       unsigned int colour_next;       /* Per-node cache coloring */
        spinlock_t list_lock;
        struct array_cache *shared;     /* shared per node */
        struct array_cache **alien;     /* on other nodes */
@@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
        INIT_LIST_HEAD(&parent->slabs_free);
        parent->shared = NULL;
        parent->alien = NULL;
+       parent->colour_next = 0;
        spin_lock_init(&parent->list_lock);
        parent->free_objects = 0;
        parent->free_touched = 0;
@@ -390,7 +392,6 @@ struct kmem_cache {
 
        size_t colour;          /* cache colouring range */
        unsigned int colour_off;        /* colour offset */
-       unsigned int colour_next;       /* cache colouring */
        struct kmem_cache *slabp_cache;
        unsigned int slab_size;
        unsigned int dflags;    /* dynamic flags */
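
The three hunks above move the slab colour cursor out of the global struct kmem_cache and into the per-node struct kmem_list3, so each NUMA node advances its own colour_next (under its l3->list_lock, as the cache_grow() hunk further down shows) instead of every node serialising on one cache-wide counter. A minimal user-space sketch of that per-node cycling; the structure and helper names are stand-ins, not the kernel's:

        #include <stdio.h>

        /* Hypothetical stand-ins for the structures touched by this patch. */
        struct list3_model { unsigned int colour_next; };            /* per node  */
        struct cache_model { unsigned int colour, colour_off; };     /* per cache */

        /* Pick the colour offset for the next slab grown on one node. */
        static unsigned int next_colour_offset(struct cache_model *c,
                                               struct list3_model *l3)
        {
                unsigned int offset = l3->colour_next++;

                if (l3->colour_next >= c->colour)       /* wrap the per-node cursor */
                        l3->colour_next = 0;
                return offset * c->colour_off;          /* bytes of padding at slab start */
        }

        int main(void)
        {
                struct cache_model c = { .colour = 4, .colour_off = 64 };
                struct list3_model node0 = { 0 }, node1 = { 0 };

                for (int i = 0; i < 6; i++)             /* each node cycles independently */
                        printf("node0: %3u  node1: %3u\n",
                               next_colour_offset(&c, &node0),
                               next_colour_offset(&c, &node1));
                return 0;
        }
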
@@ -788,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
        dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+       int node;
+
+       node = next_node(cpu_to_node(cpu), node_online_map);
+       if (node == MAX_NUMNODES)
+               node = 0;
+
+       __get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+       int node = __get_cpu_var(reap_node);
+
+       /*
+        * Also drain per cpu pages on remote zones
+        */
+       if (node != numa_node_id())
+               drain_node_pages(node);
+
+       node = next_node(node, node_online_map);
+       if (unlikely(node >= MAX_NUMNODES))
+               node = first_node(node_online_map);
+       __get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
  * via the workqueue/eventd.
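
The per-CPU reap_node introduced above lets cache_reap() drain the alien cache (and, via drain_node_pages(), the remote per-cpu pages) of just one remote node per pass, walking the online nodes round robin. A user-space sketch of the same wrap-around walk, with node_online_map modelled as a plain bitmask and the helper names invented for the example:

        #include <stdio.h>

        #define MAX_NUMNODES 8

        static const unsigned int node_online_mask = 0x0b;     /* nodes 0, 1 and 3 online */

        /* Model of next_node(): first online node strictly after 'n', else MAX_NUMNODES. */
        static int next_node_model(int n)
        {
                for (int i = n + 1; i < MAX_NUMNODES; i++)
                        if (node_online_mask & (1u << i))
                                return i;
                return MAX_NUMNODES;
        }

        static int first_node_model(void)
        {
                return next_node_model(-1);
        }

        int main(void)
        {
                /* init_reap_node() actually seeds from the node after the CPU's own
                 * node; starting at the first online node keeps the example simple. */
                int reap_node = first_node_model();

                for (int pass = 0; pass < 6; pass++) {
                        printf("reap pass %d drains node %d\n", pass, reap_node);
                        reap_node = next_node_model(reap_node);
                        if (reap_node >= MAX_NUMNODES)  /* wrap, as next_reap_node() does */
                                reap_node = first_node_model();
                }
                return 0;
        }
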
@@ -805,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
         * at that time.
         */
        if (keventd_up() && reap_work->func == NULL) {
+               init_reap_node(cpu);
                INIT_WORK(reap_work, cache_reap, NULL);
                schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
        }
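
init_reap_node() is now called once per CPU before the reap work is queued, so each CPU starts its round-robin walk just past its own node, and the pre-existing HZ + 3 * cpu delay staggers the first run of each CPU's reaper. A trivial sketch of that stagger; the HZ value is purely illustrative, the real one is a kernel configuration choice:

        #include <stdio.h>

        #define HZ 250          /* illustrative; not necessarily the kernel's value */

        int main(void)
        {
                /* schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu) spreads the
                 * per-CPU reapers so they do not all fire in the same timer tick. */
                for (int cpu = 0; cpu < 4; cpu++)
                        printf("cpu %d: first cache_reap() after %d ticks\n",
                               cpu, HZ + 3 * cpu);
                return 0;
        }
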
@@ -883,14 +926,31 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
        }
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+       int node = __get_cpu_var(reap_node);
+
+       if (l3->alien) {
+               struct array_cache *ac = l3->alien[node];
+               if (ac && ac->avail) {
+                       spin_lock_irq(&ac->lock);
+                       __drain_alien_cache(cachep, ac, node);
+                       spin_unlock_irq(&ac->lock);
+               }
+       }
+}
+
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
        int i = 0;
        struct array_cache *ac;
        unsigned long flags;
 
        for_each_online_node(i) {
-               ac = l3->alien[i];
+               ac = alien[i];
                if (ac) {
                        spin_lock_irqsave(&ac->lock, flags);
                        __drain_alien_cache(cachep, ac, i);
@@ -899,9 +959,19 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
        }
 }
 #else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+       return (struct array_cache **) 0x01020304ul;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
 #endif
 
 static int __devinit cpuup_callback(struct notifier_block *nfb,
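
On !CONFIG_NUMA builds, alloc_alien_cache() is now a static inline returning a non-NULL poison pointer rather than a do-nothing macro: the CPU-up code below checks its result against NULL, and on non-NUMA configurations the value is never dereferenced, only handed back to the empty free_alien_cache(). A user-space sketch of that sentinel-pointer idiom, with illustrative names:

        #include <stdio.h>

        #define SENTINEL ((void *)0x01020304ul) /* never dereferenced, only compared */

        /* Stub "allocator" for a feature that is compiled out. */
        static void *alloc_optional_table(void)
        {
                return SENTINEL;        /* non-NULL, so the caller's error check still passes */
        }

        static void free_optional_table(void *p)
        {
                (void)p;                /* nothing to release in the stubbed-out build */
        }

        int main(void)
        {
                void *table = alloc_optional_table();

                if (!table) {           /* shared error path keeps working unchanged */
                        fprintf(stderr, "allocation failed\n");
                        return 1;
                }
                printf("got sentinel %p; never touching it\n", table);
                free_optional_table(table);
                return 0;
        }
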
@@ -935,6 +1005,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
+                               /*
+                                * The l3s don't come and go as CPUs come and
+                                * go.  cache_chain_mutex is sufficient
+                                * protection here.
+                                */
                                cachep->nodelists[node] = l3;
                        }
 
@@ -949,26 +1024,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                   & array cache's */
                list_for_each_entry(cachep, &cache_chain, next) {
                        struct array_cache *nc;
+                       struct array_cache *shared;
+                       struct array_cache **alien;
 
                        nc = alloc_arraycache(node, cachep->limit,
-                                             cachep->batchcount);
+                                               cachep->batchcount);
                        if (!nc)
                                goto bad;
+                       shared = alloc_arraycache(node,
+                                       cachep->shared * cachep->batchcount,
+                                       0xbaadf00d);
+                       if (!shared)
+                               goto bad;
+
+                       alien = alloc_alien_cache(node, cachep->limit);
+                       if (!alien)
+                               goto bad;
                        cachep->array[cpu] = nc;
 
                        l3 = cachep->nodelists[node];
                        BUG_ON(!l3);
-                       if (!l3->shared) {
-                               if (!(nc = alloc_arraycache(node,
-                                                           cachep->shared *
-                                                           cachep->batchcount,
-                                                           0xbaadf00d)))
-                                       goto bad;
 
-                               /* we are serialised from CPU_DEAD or
-                                  CPU_UP_CANCELLED by the cpucontrol lock */
-                               l3->shared = nc;
+                       spin_lock_irq(&l3->list_lock);
+                       if (!l3->shared) {
+                               /*
+                                * We are serialised from CPU_DEAD or
+                                * CPU_UP_CANCELLED by the cpucontrol lock
+                                */
+                               l3->shared = shared;
+                               shared = NULL;
                        }
+#ifdef CONFIG_NUMA
+                       if (!l3->alien) {
+                               l3->alien = alien;
+                               alien = NULL;
+                       }
+#endif
+                       spin_unlock_irq(&l3->list_lock);
+
+                       kfree(shared);
+                       free_alien_cache(alien);
                }
                mutex_unlock(&cache_chain_mutex);
                break;
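
The reworked CPU_UP_PREPARE path above allocates the per-CPU, shared and alien caches with no spinlock held, takes l3->list_lock only long enough to install whichever of them the node still lacks, and then releases the candidates that lost the race. A generic user-space sketch of that allocate-outside-the-lock / install-under-the-lock / free-the-loser pattern, with a pthread mutex standing in for the kernel spinlock:

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        struct node_state {
                pthread_mutex_t lock;
                int *shared;                    /* installed at most once */
        };

        static struct node_state node = { PTHREAD_MUTEX_INITIALIZER, NULL };

        static void bring_up(void)
        {
                /* 1. Allocate with the lock dropped - may sleep, may fail. */
                int *candidate = malloc(sizeof(*candidate));
                if (!candidate)
                        return;
                *candidate = 42;

                /* 2. Install under the lock only if nobody beat us to it. */
                pthread_mutex_lock(&node.lock);
                if (!node.shared) {
                        node.shared = candidate;
                        candidate = NULL;       /* ownership transferred */
                }
                pthread_mutex_unlock(&node.lock);

                /* 3. Free the loser; free(NULL) is a no-op, like kfree(NULL). */
                free(candidate);
        }

        int main(void)
        {
                bring_up();
                bring_up();                     /* the second call frees its candidate */
                printf("shared value: %d\n", *node.shared);
                return 0;
        }
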
@@ -977,25 +1072,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                break;
 #ifdef CONFIG_HOTPLUG_CPU
        case CPU_DEAD:
+               /*
+                * Even if all the cpus of a node are down, we don't free the
+                * kmem_list3 of any cache. This to avoid a race between
+                * cpu_down, and a kmalloc allocation from another cpu for
+                * memory from the node of the cpu going down.  The list3
+                * structure is usually allocated from kmem_cache_create() and
+                * gets destroyed at kmem_cache_destroy().
+                */
                /* fall thru */
        case CPU_UP_CANCELED:
                mutex_lock(&cache_chain_mutex);
 
                list_for_each_entry(cachep, &cache_chain, next) {
                        struct array_cache *nc;
+                       struct array_cache *shared;
+                       struct array_cache **alien;
                        cpumask_t mask;
 
                        mask = node_to_cpumask(node);
-                       spin_lock_irq(&cachep->spinlock);
                        /* cpu is dead; no one can alloc from it. */
                        nc = cachep->array[cpu];
                        cachep->array[cpu] = NULL;
                        l3 = cachep->nodelists[node];
 
                        if (!l3)
-                               goto unlock_cache;
+                               goto free_array_cache;
 
-                       spin_lock(&l3->list_lock);
+                       spin_lock_irq(&l3->list_lock);
 
                        /* Free limit for this kmem_list3 */
                        l3->free_limit -= cachep->batchcount;
@@ -1003,34 +1107,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                                free_block(cachep, nc->entry, nc->avail, node);
 
                        if (!cpus_empty(mask)) {
-                               spin_unlock(&l3->list_lock);
-                               goto unlock_cache;
+                               spin_unlock_irq(&l3->list_lock);
+                               goto free_array_cache;
                        }
 
-                       if (l3->shared) {
+                       shared = l3->shared;
+                       if (shared) {
                                free_block(cachep, l3->shared->entry,
                                           l3->shared->avail, node);
-                               kfree(l3->shared);
                                l3->shared = NULL;
                        }
-                       if (l3->alien) {
-                               drain_alien_cache(cachep, l3);
-                               free_alien_cache(l3->alien);
-                               l3->alien = NULL;
-                       }
 
-                       /* free slabs belonging to this node */
-                       if (__node_shrink(cachep, node)) {
-                               cachep->nodelists[node] = NULL;
-                               spin_unlock(&l3->list_lock);
-                               kfree(l3);
-                       } else {
-                               spin_unlock(&l3->list_lock);
+                       alien = l3->alien;
+                       l3->alien = NULL;
+
+                       spin_unlock_irq(&l3->list_lock);
+
+                       kfree(shared);
+                       if (alien) {
+                               drain_alien_cache(cachep, alien);
+                               free_alien_cache(alien);
                        }
-                     unlock_cache:
-                       spin_unlock_irq(&cachep->spinlock);
+free_array_cache:
                        kfree(nc);
                }
+               /*
+                * In the previous loop, all the objects were freed to
+                * the respective cache's slabs,  now we can go ahead and
+                * shrink each nodelist to its limit.
+                */
+               list_for_each_entry(cachep, &cache_chain, next) {
+                       l3 = cachep->nodelists[node];
+                       if (!l3)
+                               continue;
+                       spin_lock_irq(&l3->list_lock);
+                       /* free slabs belonging to this node */
+                       __node_shrink(cachep, node);
+                       spin_unlock_irq(&l3->list_lock);
+               }
                mutex_unlock(&cache_chain_mutex);
                break;
 #endif
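
CPU_DEAD now tears down in two passes: the first loop detaches the per-CPU, shared and alien caches under l3->list_lock and frees or drains them only after the lock is dropped (drain_alien_cache() takes locks of its own, so it is kept outside list_lock), and the second loop shrinks each node list with __node_shrink() once every object has been returned. A compact user-space sketch of that detach-then-free discipline, again with stand-in types:

        #include <pthread.h>
        #include <stdio.h>
        #include <stdlib.h>

        static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
        static int *shared_cache;               /* protected by list_lock */

        static void tear_down(void)
        {
                int *victim;

                /* Pass 1: detach under the lock, free outside it, so nothing that
                 * sleeps or takes further locks runs with list_lock held. */
                pthread_mutex_lock(&list_lock);
                victim = shared_cache;
                shared_cache = NULL;
                pthread_mutex_unlock(&list_lock);

                free(victim);                   /* safe: no one can reach it any more */

                /* Pass 2 (in the patch: __node_shrink() per cache) would run here,
                 * re-taking the lock only for the list manipulation itself. */
        }

        int main(void)
        {
                shared_cache = malloc(sizeof(*shared_cache));
                tear_down();
                printf("shared_cache is now %p\n", (void *)shared_cache);
                return 0;
        }
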
@@ -1070,6 +1184,7 @@ void __init kmem_cache_init(void)
        struct cache_sizes *sizes;
        struct cache_names *names;
        int i;
+       int order;
 
        for (i = 0; i < NUM_INIT_LISTS; i++) {
                kmem_list3_init(&initkmem_list3[i]);
@@ -1113,13 +1228,16 @@ void __init kmem_cache_init(void)
 
        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-       cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-                      &left_over, &cache_cache.num);
+       for (order = 0; order < MAX_ORDER; order++) {
+               cache_estimate(order, cache_cache.buffer_size,
+                       cache_line_size(), 0, &left_over, &cache_cache.num);
+               if (cache_cache.num)
+                       break;
+       }
        if (!cache_cache.num)
                BUG();
-
+       cache_cache.gfporder = order;
        cache_cache.colour = left_over / cache_cache.colour_off;
-       cache_cache.colour_next = 0;
        cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
                                      sizeof(struct slab), cache_line_size());
 
@@ -1575,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
                        size_t size, size_t align, unsigned long flags)
 {
        size_t left_over = 0;
+       int gfporder;
 
-       for (;; cachep->gfporder++) {
+       for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
                unsigned int num;
                size_t remainder;
 
-               if (cachep->gfporder > MAX_GFP_ORDER) {
-                       cachep->num = 0;
-                       break;
-               }
-
-               cache_estimate(cachep->gfporder, size, align, flags,
-                              &remainder, &num);
+               cache_estimate(gfporder, size, align, flags, &remainder, &num);
                if (!num)
                        continue;
+
                /* More than offslab_limit objects will cause problems */
-               if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+               if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
                        break;
 
+               /* Found something acceptable - save it away */
                cachep->num = num;
+               cachep->gfporder = gfporder;
                left_over = remainder;
 
+               /*
+                * A VFS-reclaimable slab tends to have most allocations
+                * as GFP_NOFS and we really don't want to have to be allocating
+                * higher-order pages when we are unable to shrink dcache.
+                */
+               if (flags & SLAB_RECLAIM_ACCOUNT)
+                       break;
+
                /*
                 * Large number of objects is good, but very large slabs are
                 * currently bad for the gfp()s.
                 */
-               if (cachep->gfporder >= slab_break_gfp_order)
+               if (gfporder >= slab_break_gfp_order)
                        break;
 
-               if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-                       /* Acceptable internal fragmentation */
+               /*
+                * Acceptable internal fragmentation?
+                */
+               if ((left_over * 8) <= (PAGE_SIZE << gfporder))
                        break;
        }
        return left_over;
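
calculate_slab_order() now walks gfporder upward itself, saving each order at which at least one object fits (and bailing out before an off-slab cache would exceed offslab_limit), then accepts the saved order early for SLAB_RECLAIM_ACCOUNT caches, once slab_break_gfp_order is reached, or as soon as internal fragmentation falls to an eighth of the slab or less. A worked user-space example of that last test, (left_over * 8) <= (PAGE_SIZE << gfporder), with a simplified object count that ignores management overhead:

        #include <stdio.h>

        #define PAGE_SIZE 4096u

        /* Does a slab of 2^order pages waste at most 1/8 of its space on objects
         * of 'size' bytes?  Mirrors the (left_over * 8) <= (PAGE_SIZE << gfporder)
         * check; the real code gets num and left_over from cache_estimate(). */
        static int fragmentation_ok(int order, unsigned int size, unsigned int *left_over)
        {
                unsigned int slab_bytes = PAGE_SIZE << order;
                unsigned int num = slab_bytes / size;

                *left_over = slab_bytes - num * size;
                return num && (*left_over * 8 <= slab_bytes);
        }

        int main(void)
        {
                unsigned int size = 1100, left;

                for (int order = 0; order <= 3; order++) {
                        int ok = fragmentation_ok(order, size, &left);

                        printf("order %d: %4u bytes wasted -> %s\n",
                               order, left, ok ? "acceptable" : "too much");
                }
                return 0;
        }
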
@@ -1664,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                BUG();
        }
 
+       /*
+        * Prevent CPUs from coming and going.
+        * lock_cpu_hotplug() nests outside cache_chain_mutex
+        */
+       lock_cpu_hotplug();
+
        mutex_lock(&cache_chain_mutex);
 
        list_for_each(p, &cache_chain) {
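
lock_cpu_hotplug() is now taken at the top of kmem_cache_create() and, as the later hunks show, released only after cache_chain_mutex is dropped, so the two locks always nest in the order stated in the new comment: hotplug lock outside, cache_chain_mutex inside. A minimal sketch of that fixed nesting with pthread mutexes standing in for both locks (names are illustrative):

        #include <pthread.h>
        #include <stdio.h>

        /* Outer lock: the cpu-hotplug stand-in.  Inner lock: the cache_chain_mutex
         * stand-in.  Every path takes them in this order and releases in reverse,
         * which is what rules out an AB-BA deadlock between two such paths. */
        static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t chain_mutex  = PTHREAD_MUTEX_INITIALIZER;

        static void create_cache(void)
        {
                pthread_mutex_lock(&hotplug_lock);      /* lock_cpu_hotplug()             */
                pthread_mutex_lock(&chain_mutex);       /* mutex_lock(&cache_chain_mutex) */

                printf("cache linked into the chain\n");

                pthread_mutex_unlock(&chain_mutex);     /* reverse order on the way out   */
                pthread_mutex_unlock(&hotplug_lock);
        }

        int main(void)
        {
                create_cache();
                return 0;
        }
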
@@ -1810,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
        size = ALIGN(size, align);
 
-       if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-               /*
-                * A VFS-reclaimable slab tends to have most allocations
-                * as GFP_NOFS and we really don't want to have to be allocating
-                * higher-order pages when we are unable to shrink dcache.
-                */
-               cachep->gfporder = 0;
-               cache_estimate(cachep->gfporder, size, align, flags,
-                              &left_over, &cachep->num);
-       } else
-               left_over = calculate_slab_order(cachep, size, align, flags);
+       left_over = calculate_slab_order(cachep, size, align, flags);
 
        if (!cachep->num) {
                printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1865,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        cachep->dtor = dtor;
        cachep->name = name;
 
-       /* Don't let CPUs to come and go */
-       lock_cpu_hotplug();
 
        if (g_cpucache_up == FULL) {
                enable_cpucache(cachep);
@@ -1925,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
-       unlock_cpu_hotplug();
       oops:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
        mutex_unlock(&cache_chain_mutex);
+       unlock_cpu_hotplug();
        return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2011,18 +2131,16 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 
        smp_call_function_all_cpus(do_drain, cachep);
        check_irq_on();
-       spin_lock_irq(&cachep->spinlock);
        for_each_online_node(node) {
                l3 = cachep->nodelists[node];
                if (l3) {
-                       spin_lock(&l3->list_lock);
+                       spin_lock_irq(&l3->list_lock);
                        drain_array_locked(cachep, l3->shared, 1, node);
-                       spin_unlock(&l3->list_lock);
+                       spin_unlock_irq(&l3->list_lock);
                        if (l3->alien)
-                               drain_alien_cache(cachep, l3);
+                               drain_alien_cache(cachep, l3->alien);
                }
        }
-       spin_unlock_irq(&cachep->spinlock);
 }
 
 static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -2324,20 +2442,20 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
                 */
                ctor_flags |= SLAB_CTOR_ATOMIC;
 
-       /* About to mess with non-constant members - lock. */
+       /* Take the l3 list lock to change the colour_next on this node */
        check_irq_off();
-       spin_lock(&cachep->spinlock);
+       l3 = cachep->nodelists[nodeid];
+       spin_lock(&l3->list_lock);
 
        /* Get colour for the slab, and cal the next value. */
-       offset = cachep->colour_next;
-       cachep->colour_next++;
-       if (cachep->colour_next >= cachep->colour)
-               cachep->colour_next = 0;
-       offset *= cachep->colour_off;
+       offset = l3->colour_next;
+       l3->colour_next++;
+       if (l3->colour_next >= cachep->colour)
+               l3->colour_next = 0;
+       spin_unlock(&l3->list_lock);
 
-       spin_unlock(&cachep->spinlock);
+       offset *= cachep->colour_off;
 
-       check_irq_off();
        if (local_flags & __GFP_WAIT)
                local_irq_enable();
 
@@ -2367,7 +2485,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
        if (local_flags & __GFP_WAIT)
                local_irq_disable();
        check_irq_off();
-       l3 = cachep->nodelists[nodeid];
        spin_lock(&l3->list_lock);
 
        /* Make slab active. */
@@ -2500,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
                       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
                       cachep->name, cachep->num, slabp, slabp->inuse);
                for (i = 0;
-                    i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+                    i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
                     i++) {
                        if ((i % 16) == 0)
                                printk("\n%03x:", i);
@@ -2725,6 +2842,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
        BUG_ON(!l3);
 
       retry:
+       check_irq_off();
        spin_lock(&l3->list_lock);
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
@@ -3304,11 +3422,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
        smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
 
        check_irq_on();
-       spin_lock_irq(&cachep->spinlock);
+       spin_lock(&cachep->spinlock);
        cachep->batchcount = batchcount;
        cachep->limit = limit;
        cachep->shared = shared;
-       spin_unlock_irq(&cachep->spinlock);
+       spin_unlock(&cachep->spinlock);
 
        for_each_online_cpu(i) {
                struct array_cache *ccold = new.new[i];
@@ -3439,8 +3557,7 @@ static void cache_reap(void *unused)
                check_irq_on();
 
                l3 = searchp->nodelists[numa_node_id()];
-               if (l3->alien)
-                       drain_alien_cache(searchp, l3);
+               reap_alien(searchp, l3);
                spin_lock_irq(&l3->list_lock);
 
                drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3490,7 +3607,7 @@ static void cache_reap(void *unused)
        }
        check_irq_on();
        mutex_unlock(&cache_chain_mutex);
-       drain_remote_pages();
+       next_reap_node();
        /* Setup the next iteration */
        schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
@@ -3564,8 +3681,7 @@ static int s_show(struct seq_file *m, void *p)
        int node;
        struct kmem_list3 *l3;
 
-       check_irq_on();
-       spin_lock_irq(&cachep->spinlock);
+       spin_lock(&cachep->spinlock);
        active_objs = 0;
        num_slabs = 0;
        for_each_online_node(node) {
@@ -3573,7 +3689,8 @@ static int s_show(struct seq_file *m, void *p)
                if (!l3)
                        continue;
 
-               spin_lock(&l3->list_lock);
+               check_irq_on();
+               spin_lock_irq(&l3->list_lock);
 
                list_for_each(q, &l3->slabs_full) {
                        slabp = list_entry(q, struct slab, list);
@@ -3598,9 +3715,10 @@ static int s_show(struct seq_file *m, void *p)
                        num_slabs++;
                }
                free_objects += l3->free_objects;
-               shared_avail += l3->shared->avail;
+               if (l3->shared)
+                       shared_avail += l3->shared->avail;
 
-               spin_unlock(&l3->list_lock);
+               spin_unlock_irq(&l3->list_lock);
        }
        num_slabs += active_slabs;
        num_objs = num_slabs * cachep->num;
@@ -3644,7 +3762,7 @@ static int s_show(struct seq_file *m, void *p)
        }
 #endif
        seq_putc(m, '\n');
-       spin_unlock_irq(&cachep->spinlock);
+       spin_unlock(&cachep->spinlock);
        return 0;
 }