percpu: finer grained locking to break deadlock and allow atomic free

author Tejun Heo <tj@kernel.org>

Fri, 6 Mar 2009 15:44:13 +0000 (00:44 +0900)

committer Tejun Heo <tj@kernel.org>

Sat, 7 Mar 2009 05:46:35 +0000 (14:46 +0900)
author Tejun Heo <tj@kernel.org>
Fri, 6 Mar 2009 15:44:13 +0000 (00:44 +0900)
committer Tejun Heo <tj@kernel.org>
Sat, 7 Mar 2009 05:46:35 +0000 (14:46 +0900)
diff --git a/mm/percpu.c b/mm/percpu.c

index 4c8a419119dae00a6957001cc5587228d64c3b14..bfe6a3afaf45e4be409d8231434447735d40de9c 100644 (file)
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
  #include <linux/pfn.h>
  #include <linux/rbtree.h>
  #include <linux/slab.h>
+#include <linux/spinlock.h>
  #include <linux/vmalloc.h>
  #include <linux/workqueue.h>
  
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
  static int pcpu_reserved_chunk_limit;
  
  /*
- * One mutex to rule them all.
- *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
- *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * Synchronization rules.
+ *
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
+ *
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, releases both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
   */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);     /* protects index data structures */
  
  static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
  static struct rb_root pcpu_addr_root = RB_ROOT;        /* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
   * kzalloc() is used; otherwise, vmalloc() is used.  The returned
   * memory is always zeroed.
   *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
   * RETURNS:
   * Pointer to the allocated area on success, NULL on failure.
   */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
   * New slot according to the changed state is determined and @chunk is
   * moved to the slot.  Note that the reserved chunk is never put on
   * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
   */
  static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
  {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
   * searchs for the chunk with the highest start address which isn't
   * beyond @addr.
   *
+ * CONTEXT:
+ * pcpu_lock.
+ *
   * RETURNS:
   * The address of the found chunk.
   */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
   * @new: chunk to insert
   *
   * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
   */
  static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
   * A single allocation can split an area into three areas, so this
   * function makes sure that @chunk->map has at least two extra slots.
   *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
   * RETURNS:
   * 0 if noop, 1 if successfully extended, -errno on failure.
   */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
         if (chunk->map_alloc >= chunk->map_used + 2)
                 return 0;
  
+       spin_unlock_irq(&pcpu_lock);
+
         new_alloc = PCPU_DFL_MAP_ALLOC;
         while (new_alloc < chunk->map_used + 2)
                 new_alloc *= 2;
  
         new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-       if (!new)
+       if (!new) {
+               spin_lock_irq(&pcpu_lock);
                 return -ENOMEM;
+       }
+
+       /*
+        * Acquire pcpu_lock and switch to new area map.  Only free
+        * could have happened in between, so map_used couldn't have
+        * grown.
+        */
+       spin_lock_irq(&pcpu_lock);
+       BUG_ON(new_alloc < chunk->map_used + 2);
  
         size = chunk->map_alloc * sizeof(chunk->map[0]);
         memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
   * is inserted after the target block.
   *
   * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
   */
  static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
                              int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
   *
   * @chunk->map must have at least two free slots.
   *
+ * CONTEXT:
+ * pcpu_lock.
+ *
   * RETURNS:
   * Allocated offset in @chunk on success, -1 if no matching area is
   * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
   * Free area starting from @freeme to @chunk.  Note that this function
   * only modifies the allocation map.  It doesn't depopulate or unmap
   * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
   */
  static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
  {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
   * For each cpu, depopulate and unmap pages [@page_start,@page_end)
   * from @chunk.  If @flush is true, vcache is flushed before unmapping
   * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
   */
  static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
                                   bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
   *
   * For each cpu, populate and map pages [@page_start,@page_end) into
   * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
   */
  static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
  {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
   * @align: alignment of area (max PAGE_SIZE)
   * @reserved: allocate from the reserved chunk if available
   *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
   *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
  static void *pcpu_alloc(size_t size, size_t align, bool reserved)
  {
-       void *ptr = NULL;
         struct pcpu_chunk *chunk;
         int slot, off;
  
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
                 return NULL;
         }
  
-       mutex_lock(&pcpu_mutex);
+       mutex_lock(&pcpu_alloc_mutex);
+       spin_lock_irq(&pcpu_lock);
  
         /* serve reserved allocations from the reserved chunk if available */
         if (reserved && pcpu_reserved_chunk) {
                 chunk = pcpu_reserved_chunk;
                 if (size > chunk->contig_hint ||
                     pcpu_extend_area_map(chunk) < 0)
-                       goto out_unlock;
+                       goto fail_unlock;
                 off = pcpu_alloc_area(chunk, size, align);
                 if (off >= 0)
                         goto area_found;
-               goto out_unlock;
+               goto fail_unlock;
         }
  
+restart:
         /* search through normal chunks */
         for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                         if (size > chunk->contig_hint)
                                 continue;
-                       if (pcpu_extend_area_map(chunk) < 0)
-                               goto out_unlock;
+
+                       switch (pcpu_extend_area_map(chunk)) {
+                       case 0:
+                               break;
+                       case 1:
+                               goto restart;   /* pcpu_lock dropped, restart */
+                       default:
+                               goto fail_unlock;
+                       }
+
                         off = pcpu_alloc_area(chunk, size, align);
                         if (off >= 0)
                                 goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
         }
  
         /* hmmm... no space left, create a new chunk */
+       spin_unlock_irq(&pcpu_lock);
+
         chunk = alloc_pcpu_chunk();
         if (!chunk)
-               goto out_unlock;
+               goto fail_unlock_mutex;
+
+       spin_lock_irq(&pcpu_lock);
         pcpu_chunk_relocate(chunk, -1);
         pcpu_chunk_addr_insert(chunk);
-
-       off = pcpu_alloc_area(chunk, size, align);
-       if (off < 0)
-               goto out_unlock;
+       goto restart;
  
  area_found:
+       spin_unlock_irq(&pcpu_lock);
+
         /* populate, map and clear the area */
         if (pcpu_populate_chunk(chunk, off, size)) {
+               spin_lock_irq(&pcpu_lock);
                 pcpu_free_area(chunk, off);
-               goto out_unlock;
+               goto fail_unlock;
         }
  
-       ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-       mutex_unlock(&pcpu_mutex);
-       return ptr;
+       mutex_unlock(&pcpu_alloc_mutex);
+
+       return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+       spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+       mutex_unlock(&pcpu_alloc_mutex);
+       return NULL;
  }
  
  /**
@@ -825,6 +897,9 @@ out_unlock:
   * Allocate percpu area of @size bytes aligned at @align.  Might
   * sleep.  Might trigger writeouts.
   *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
   * percpu area if arch has set it up; otherwise, allocation is served
   * from the same dynamic area.  Might sleep.  Might trigger writeouts.
   *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
   * RETURNS:
   * Percpu pointer to the allocated area on success, NULL on failure.
   */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
   * @work: unused
   *
   * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
   */
  static void pcpu_reclaim(struct work_struct *work)
  {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
         struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
  
-       mutex_lock(&pcpu_mutex);
+       mutex_lock(&pcpu_alloc_mutex);
+       spin_lock_irq(&pcpu_lock);
  
         list_for_each_entry_safe(chunk, next, head, list) {
                 WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
                 list_move(&chunk->list, &todo);
         }
  
-       mutex_unlock(&pcpu_mutex);
+       spin_unlock_irq(&pcpu_lock);
+       mutex_unlock(&pcpu_alloc_mutex);
  
         list_for_each_entry_safe(chunk, next, &todo, list) {
                 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
   * free_percpu - free percpu area
   * @ptr: pointer to area to free
   *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
   */
  void free_percpu(void *ptr)
  {
         void *addr = __pcpu_ptr_to_addr(ptr);
         struct pcpu_chunk *chunk;
+       unsigned long flags;
         int off;
  
         if (!ptr)
                 return;
  
-       mutex_lock(&pcpu_mutex);
+       spin_lock_irqsave(&pcpu_lock, flags);
  
         chunk = pcpu_chunk_addr_search(addr);
         off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
                         }
         }
  
-       mutex_unlock(&pcpu_mutex);
+       spin_unlock_irqrestore(&pcpu_lock, flags);
  }
  EXPORT_SYMBOL_GPL(free_percpu);
author	Tejun Heo <tj@kernel.org>
	Fri, 6 Mar 2009 15:44:13 +0000 (00:44 +0900)
committer	Tejun Heo <tj@kernel.org>
	Sat, 7 Mar 2009 05:46:35 +0000 (14:46 +0900)