Btrfs: leave btree locks spinning more often
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 293da650873f5193c726c4883bc6ecc5114b0a1a..8933d15a240fa0bdc079598d99e7ee739e12728d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -19,7 +19,8 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
-#include <linux/version.h>
+#include <linux/sort.h>
+#include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
 #include "crc32c.h"
@@ -30,7 +31,6 @@
 #include "volumes.h"
 #include "locking.h"
 #include "ref-cache.h"
-#include "compat.h"
 
 #define PENDING_EXTENT_INSERT 0
 #define PENDING_EXTENT_DELETE 1
@@ -49,17 +49,27 @@ struct pending_extent_op {
        int del;
 };
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root, u64 parent,
+                                        u64 root_objectid, u64 ref_generation,
+                                        u64 owner, struct btrfs_key *ins,
+                                        int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+                                  u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       u64 bytenr, u64 num_bytes, u64 parent,
+                                       u64 root_objectid, u64 ref_generation,
+                                       u64 owner_objectid, int pin,
+                                       int ref_to_drop);
+
+static int do_chunk_alloc(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *extent_root, u64 alloc_bytes,
+                         u64 flags, int force);
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
 {
@@ -326,16 +336,34 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
                                                  u64 flags)
 {
        struct list_head *head = &info->space_info;
-       struct list_head *cur;
        struct btrfs_space_info *found;
-       list_for_each(cur, head) {
-               found = list_entry(cur, struct btrfs_space_info, list);
-               if (found->flags == flags)
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(found, head, list) {
+               if (found->flags == flags) {
+                       rcu_read_unlock();
                        return found;
+               }
        }
+       rcu_read_unlock();
        return NULL;
 }
 
+/*
+ * after adding space to the filesystem, we need to clear the full flags
+ * on all the space infos.
+ */
+void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
+{
+       struct list_head *head = &info->space_info;
+       struct btrfs_space_info *found;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(found, head, list)
+               found->full = 0;
+       rcu_read_unlock();
+}
+
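The lookup above is now lock-free on the read side: readers walk fs_info->space_info under rcu_read_lock() while writers publish new entries with list_add_rcu() (see update_space_info() later in this patch). That is safe here because space_info entries are never freed for the lifetime of the filesystem. A minimal sketch of the pattern, with hypothetical demo_ names, not part of this patch:

#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct demo_space_info {
	u64 flags;
	int full;
	struct list_head list;
};

/* reader side: no lock taken; entries must never be freed while
 * readers might still hold a pointer (true for btrfs space_info) */
static struct demo_space_info *demo_find(struct list_head *head, u64 flags)
{
	struct demo_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags == flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/* writer side: initialize every field before the entry becomes
 * visible to concurrent readers */
static void demo_publish(struct list_head *head, struct demo_space_info *info)
{
	info->full = 0;
	list_add_rcu(&info->list, head);
}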
 static u64 div_factor(u64 num, int factor)
 {
        if (factor == 10)
@@ -532,262 +560,13 @@ out:
        return ret;
 }
 
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *extent_root,
-                                   struct btrfs_path *path,
-                                   struct list_head *update_list)
-{
-       struct btrfs_key key;
-       struct btrfs_extent_ref *ref;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct pending_extent_op *op;
-       struct extent_buffer *leaf;
-       int ret = 0;
-       struct list_head *cur = update_list->next;
-       u64 ref_objectid;
-       u64 ref_root = extent_root->root_key.objectid;
-
-       op = list_entry(cur, struct pending_extent_op, list);
-
-search:
-       key.objectid = op->bytenr;
-       key.type = BTRFS_EXTENT_REF_KEY;
-       key.offset = op->orig_parent;
-
-       ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-       BUG_ON(ret);
-
-       leaf = path->nodes[0];
-
-loop:
-       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
-       ref_objectid = btrfs_ref_objectid(leaf, ref);
-
-       if (btrfs_ref_root(leaf, ref) != ref_root ||
-           btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-           (ref_objectid != op->level &&
-            ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-               printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-                      "root %llu, owner %u\n",
-                      (unsigned long long)op->bytenr,
-                      (unsigned long long)op->orig_parent,
-                      (unsigned long long)ref_root, op->level);
-               btrfs_print_leaf(extent_root, leaf);
-               BUG();
-       }
-
-       key.objectid = op->bytenr;
-       key.offset = op->parent;
-       key.type = BTRFS_EXTENT_REF_KEY;
-       ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-       BUG_ON(ret);
-       ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-       btrfs_set_ref_generation(leaf, ref, op->generation);
-
-       cur = cur->next;
-
-       list_del_init(&op->list);
-       unlock_extent(&info->extent_ins, op->bytenr,
-                     op->bytenr + op->num_bytes - 1, GFP_NOFS);
-       kfree(op);
-
-       if (cur == update_list) {
-               btrfs_mark_buffer_dirty(path->nodes[0]);
-               btrfs_release_path(extent_root, path);
-               goto out;
-       }
-
-       op = list_entry(cur, struct pending_extent_op, list);
-
-       path->slots[0]++;
-       while (path->slots[0] < btrfs_header_nritems(leaf)) {
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               if (key.objectid == op->bytenr &&
-                   key.type == BTRFS_EXTENT_REF_KEY)
-                       goto loop;
-               path->slots[0]++;
-       }
-
-       btrfs_mark_buffer_dirty(path->nodes[0]);
-       btrfs_release_path(extent_root, path);
-       goto search;
-
-out:
-       return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-                                  struct btrfs_root *extent_root,
-                                  struct btrfs_path *path,
-                                  struct list_head *insert_list, int nr)
-{
-       struct btrfs_key *keys;
-       u32 *data_size;
-       struct pending_extent_op *op;
-       struct extent_buffer *leaf;
-       struct list_head *cur = insert_list->next;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       u64 ref_root = extent_root->root_key.objectid;
-       int i = 0, last = 0, ret;
-       int total = nr * 2;
-
-       if (!nr)
-               return 0;
-
-       keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-       if (!keys)
-               return -ENOMEM;
-
-       data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-       if (!data_size) {
-               kfree(keys);
-               return -ENOMEM;
-       }
-
-       list_for_each_entry(op, insert_list, list) {
-               keys[i].objectid = op->bytenr;
-               keys[i].offset = op->num_bytes;
-               keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-               data_size[i] = sizeof(struct btrfs_extent_item);
-               i++;
-
-               keys[i].objectid = op->bytenr;
-               keys[i].offset = op->parent;
-               keys[i].type = BTRFS_EXTENT_REF_KEY;
-               data_size[i] = sizeof(struct btrfs_extent_ref);
-               i++;
-       }
-
-       op = list_entry(cur, struct pending_extent_op, list);
-       i = 0;
-       while (i < total) {
-               int c;
-               ret = btrfs_insert_some_items(trans, extent_root, path,
-                                             keys+i, data_size+i, total-i);
-               BUG_ON(ret < 0);
-
-               if (last && ret > 1)
-                       BUG();
-
-               leaf = path->nodes[0];
-               for (c = 0; c < ret; c++) {
-                       int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
-                       /*
-                        * if the first item we inserted was a backref, then
-                        * the EXTENT_ITEM will be the odd c's, else it will
-                        * be the even c's
-                        */
-                       if ((ref_first && (c % 2)) ||
-                           (!ref_first && !(c % 2))) {
-                               struct btrfs_extent_item *itm;
-
-                               itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                    struct btrfs_extent_item);
-                               btrfs_set_extent_refs(path->nodes[0], itm, 1);
-                               op->del++;
-                       } else {
-                               struct btrfs_extent_ref *ref;
-
-                               ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                    struct btrfs_extent_ref);
-                               btrfs_set_ref_root(leaf, ref, ref_root);
-                               btrfs_set_ref_generation(leaf, ref,
-                                                        op->generation);
-                               btrfs_set_ref_objectid(leaf, ref, op->level);
-                               btrfs_set_ref_num_refs(leaf, ref, 1);
-                               op->del++;
-                       }
-
-                       /*
-                        * using del to see when its ok to free up the
-                        * pending_extent_op.  In the case where we insert the
-                        * last item on the list in order to help do batching
-                        * we need to not free the extent op until we actually
-                        * insert the extent_item
-                        */
-                       if (op->del == 2) {
-                               unlock_extent(&info->extent_ins, op->bytenr,
-                                             op->bytenr + op->num_bytes - 1,
-                                             GFP_NOFS);
-                               cur = cur->next;
-                               list_del_init(&op->list);
-                               kfree(op);
-                               if (cur != insert_list)
-                                       op = list_entry(cur,
-                                               struct pending_extent_op,
-                                               list);
-                       }
-               }
-               btrfs_mark_buffer_dirty(leaf);
-               btrfs_release_path(extent_root, path);
-
-               /*
-                * Ok backref's and items usually go right next to eachother,
-                * but if we could only insert 1 item that means that we
-                * inserted on the end of a leaf, and we have no idea what may
-                * be on the next leaf so we just play it safe.  In order to
-                * try and help this case we insert the last thing on our
-                * insert list so hopefully it will end up being the last
-                * thing on the leaf and everything else will be before it,
-                * which will let us insert a whole bunch of items at the same
-                * time.
-                */
-               if (ret == 1 && !last && (i + ret < total)) {
-                       /*
-                        * last: where we will pick up the next time around
-                        * i: our current key to insert, will be total - 1
-                        * cur: the current op we are screwing with
-                        * op: duh
-                        */
-                       last = i + ret;
-                       i = total - 1;
-                       cur = insert_list->prev;
-                       op = list_entry(cur, struct pending_extent_op, list);
-               } else if (last) {
-                       /*
-                        * ok we successfully inserted the last item on the
-                        * list, lets reset everything
-                        *
-                        * i: our current key to insert, so where we left off
-                        *    last time
-                        * last: done with this
-                        * cur: the op we are messing with
-                        * op: duh
-                        * total: since we inserted the last key, we need to
-                        *        decrement total so we dont overflow
-                        */
-                       i = last;
-                       last = 0;
-                       total--;
-                       if (i < total) {
-                               cur = insert_list->next;
-                               op = list_entry(cur, struct pending_extent_op,
-                                               list);
-                       }
-               } else {
-                       i += ret;
-               }
-
-               cond_resched();
-       }
-       ret = 0;
-       kfree(keys);
-       kfree(data_size);
-       return ret;
-}
-
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 ref_root, u64 ref_generation,
-                                         u64 owner_objectid)
+                                         u64 owner_objectid,
+                                         int refs_to_add)
 {
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -807,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                btrfs_set_ref_root(leaf, ref, ref_root);
                btrfs_set_ref_generation(leaf, ref, ref_generation);
                btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-               btrfs_set_ref_num_refs(leaf, ref, 1);
+               btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
        } else if (ret == -EEXIST) {
                u64 existing_owner;
+
                BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
                leaf = path->nodes[0];
                ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -823,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
 
                num_refs = btrfs_ref_num_refs(leaf, ref);
                BUG_ON(num_refs == 0);
-               btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+               btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
 
                existing_owner = btrfs_ref_objectid(leaf, ref);
                if (existing_owner != owner_objectid &&
@@ -835,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
        } else {
                goto out;
        }
+       btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
        btrfs_release_path(root, path);
@@ -843,7 +624,8 @@ out:
 
 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
-                                         struct btrfs_path *path)
+                                         struct btrfs_path *path,
+                                         int refs_to_drop)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_ref *ref;
@@ -853,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
        num_refs = btrfs_ref_num_refs(leaf, ref);
-       BUG_ON(num_refs == 0);
-       num_refs -= 1;
+       BUG_ON(num_refs < refs_to_drop);
+       num_refs -= refs_to_drop;
        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
@@ -905,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
 
-static noinline int free_extents(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root,
-                                struct list_head *del_list)
-{
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_path *path;
-       struct btrfs_key key, found_key;
-       struct extent_buffer *leaf;
-       struct list_head *cur;
-       struct pending_extent_op *op;
-       struct btrfs_extent_item *ei;
-       int ret, num_to_del, extent_slot = 0, found_extent = 0;
-       u32 refs;
-       u64 bytes_freed = 0;
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       path->reada = 1;
-
-search:
-       /* search for the backref for the current ref we want to delete */
-       cur = del_list->next;
-       op = list_entry(cur, struct pending_extent_op, list);
-       ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-                                   op->orig_parent,
-                                   extent_root->root_key.objectid,
-                                   op->orig_generation, op->level, 1);
-       if (ret) {
-               printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-                      "root %llu gen %llu owner %u\n",
-                      (unsigned long long)op->bytenr,
-                      (unsigned long long)extent_root->root_key.objectid,
-                      (unsigned long long)op->orig_generation, op->level);
-               btrfs_print_leaf(extent_root, path->nodes[0]);
-               WARN_ON(1);
-               goto out;
-       }
-
-       extent_slot = path->slots[0];
-       num_to_del = 1;
-       found_extent = 0;
-
-       /*
-        * if we aren't the first item on the leaf we can move back one and see
-        * if our ref is right next to our extent item
-        */
-       if (likely(extent_slot)) {
-               extent_slot--;
-               btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                     extent_slot);
-               if (found_key.objectid == op->bytenr &&
-                   found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-                   found_key.offset == op->num_bytes) {
-                       num_to_del++;
-                       found_extent = 1;
-               }
-       }
-
-       /*
-        * if we didn't find the extent we need to delete the backref and then
-        * search for the extent item key so we can update its ref count
-        */
-       if (!found_extent) {
-               key.objectid = op->bytenr;
-               key.type = BTRFS_EXTENT_ITEM_KEY;
-               key.offset = op->num_bytes;
-
-               ret = remove_extent_backref(trans, extent_root, path);
-               BUG_ON(ret);
-               btrfs_release_path(extent_root, path);
-               ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-               BUG_ON(ret);
-               extent_slot = path->slots[0];
-       }
-
-       /* this is where we update the ref count for the extent */
-       leaf = path->nodes[0];
-       ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-       refs = btrfs_extent_refs(leaf, ei);
-       BUG_ON(refs == 0);
-       refs--;
-       btrfs_set_extent_refs(leaf, ei, refs);
-
-       btrfs_mark_buffer_dirty(leaf);
-
-       /*
-        * This extent needs deleting.  The reason cur_slot is extent_slot +
-        * num_to_del is because extent_slot points to the slot where the extent
-        * is, and if the backref was not right next to the extent we will be
-        * deleting at least 1 item, and will want to start searching at the
-        * slot directly next to extent_slot.  However if we did find the
-        * backref next to the extent item them we will be deleting at least 2
-        * items and will want to start searching directly after the ref slot
-        */
-       if (!refs) {
-               struct list_head *pos, *n, *end;
-               int cur_slot = extent_slot+num_to_del;
-               u64 super_used;
-               u64 root_used;
-
-               path->slots[0] = extent_slot;
-               bytes_freed = op->num_bytes;
-
-               mutex_lock(&info->pinned_mutex);
-               ret = pin_down_bytes(trans, extent_root, op->bytenr,
-                                    op->num_bytes, op->level >=
-                                    BTRFS_FIRST_FREE_OBJECTID);
-               mutex_unlock(&info->pinned_mutex);
-               BUG_ON(ret < 0);
-               op->del = ret;
-
-               /*
-                * we need to see if we can delete multiple things at once, so
-                * start looping through the list of extents we are wanting to
-                * delete and see if their extent/backref's are right next to
-                * eachother and the extents only have 1 ref
-                */
-               for (pos = cur->next; pos != del_list; pos = pos->next) {
-                       struct pending_extent_op *tmp;
-
-                       tmp = list_entry(pos, struct pending_extent_op, list);
-
-                       /* we only want to delete extent+ref at this stage */
-                       if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-                               break;
-
-                       btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-                       if (found_key.objectid != tmp->bytenr ||
-                           found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-                           found_key.offset != tmp->num_bytes)
-                               break;
-
-                       /* check to make sure this extent only has one ref */
-                       ei = btrfs_item_ptr(leaf, cur_slot,
-                                           struct btrfs_extent_item);
-                       if (btrfs_extent_refs(leaf, ei) != 1)
-                               break;
-
-                       btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-                       if (found_key.objectid != tmp->bytenr ||
-                           found_key.type != BTRFS_EXTENT_REF_KEY ||
-                           found_key.offset != tmp->orig_parent)
-                               break;
-
-                       /*
-                        * the ref is right next to the extent, we can set the
-                        * ref count to 0 since we will delete them both now
-                        */
-                       btrfs_set_extent_refs(leaf, ei, 0);
-
-                       /* pin down the bytes for this extent */
-                       mutex_lock(&info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-                                            tmp->num_bytes, tmp->level >=
-                                            BTRFS_FIRST_FREE_OBJECTID);
-                       mutex_unlock(&info->pinned_mutex);
-                       BUG_ON(ret < 0);
-
-                       /*
-                        * use the del field to tell if we need to go ahead and
-                        * free up the extent when we delete the item or not.
-                        */
-                       tmp->del = ret;
-                       bytes_freed += tmp->num_bytes;
-
-                       num_to_del += 2;
-                       cur_slot += 2;
-               }
-               end = pos;
-
-               /* update the free space counters */
-               spin_lock(&info->delalloc_lock);
-               super_used = btrfs_super_bytes_used(&info->super_copy);
-               btrfs_set_super_bytes_used(&info->super_copy,
-                                          super_used - bytes_freed);
-
-               root_used = btrfs_root_used(&extent_root->root_item);
-               btrfs_set_root_used(&extent_root->root_item,
-                                   root_used - bytes_freed);
-               spin_unlock(&info->delalloc_lock);
-
-               /* delete the items */
-               ret = btrfs_del_items(trans, extent_root, path,
-                                     path->slots[0], num_to_del);
-               BUG_ON(ret);
-
-               /*
-                * loop through the extents we deleted and do the cleanup work
-                * on them
-                */
-               for (pos = cur, n = pos->next; pos != end;
-                    pos = n, n = pos->next) {
-                       struct pending_extent_op *tmp;
-                       tmp = list_entry(pos, struct pending_extent_op, list);
-
-                       /*
-                        * remember tmp->del tells us wether or not we pinned
-                        * down the extent
-                        */
-                       ret = update_block_group(trans, extent_root,
-                                                tmp->bytenr, tmp->num_bytes, 0,
-                                                tmp->del);
-                       BUG_ON(ret);
-
-                       list_del_init(&tmp->list);
-                       unlock_extent(&info->extent_ins, tmp->bytenr,
-                                     tmp->bytenr + tmp->num_bytes - 1,
-                                     GFP_NOFS);
-                       kfree(tmp);
-               }
-       } else if (refs && found_extent) {
-               /*
-                * the ref and extent were right next to eachother, but the
-                * extent still has a ref, so just free the backref and keep
-                * going
-                */
-               ret = remove_extent_backref(trans, extent_root, path);
-               BUG_ON(ret);
-
-               list_del_init(&op->list);
-               unlock_extent(&info->extent_ins, op->bytenr,
-                             op->bytenr + op->num_bytes - 1, GFP_NOFS);
-               kfree(op);
-       } else {
-               /*
-                * the extent has multiple refs and the backref we were looking
-                * for was not right next to it, so just unlock and go next,
-                * we're good to go
-                */
-               list_del_init(&op->list);
-               unlock_extent(&info->extent_ins, op->bytenr,
-                             op->bytenr + op->num_bytes - 1, GFP_NOFS);
-               kfree(op);
-       }
-
-       btrfs_release_path(extent_root, path);
-       if (!list_empty(del_list))
-               goto search;
-
-out:
-       btrfs_free_path(path);
-       return ret;
-}
-
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root, u64 bytenr,
+                                    u64 num_bytes,
                                     u64 orig_parent, u64 parent,
                                     u64 orig_root, u64 ref_root,
                                     u64 orig_generation, u64 ref_generation,
                                     u64 owner_objectid)
 {
        int ret;
-       struct btrfs_root *extent_root = root->fs_info->extent_root;
-       struct btrfs_path *path;
+       int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-       if (root == root->fs_info->extent_root) {
-               struct pending_extent_op *extent_op;
-               u64 num_bytes;
-
-               BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-               num_bytes = btrfs_level_size(root, (int)owner_objectid);
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                               bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                       u64 priv;
-                       ret = get_state_private(&root->fs_info->extent_ins,
-                                               bytenr, &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                                       (unsigned long)priv;
-                       BUG_ON(extent_op->parent != orig_parent);
-                       BUG_ON(extent_op->generation != orig_generation);
-
-                       extent_op->parent = parent;
-                       extent_op->generation = ref_generation;
-               } else {
-                       extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-                       BUG_ON(!extent_op);
-
-                       extent_op->type = PENDING_BACKREF_UPDATE;
-                       extent_op->bytenr = bytenr;
-                       extent_op->num_bytes = num_bytes;
-                       extent_op->parent = parent;
-                       extent_op->orig_parent = orig_parent;
-                       extent_op->generation = ref_generation;
-                       extent_op->orig_generation = orig_generation;
-                       extent_op->level = (int)owner_objectid;
-                       INIT_LIST_HEAD(&extent_op->list);
-                       extent_op->del = 0;
-
-                       set_extent_bits(&root->fs_info->extent_ins,
-                                       bytenr, bytenr + num_bytes - 1,
-                                       EXTENT_WRITEBACK, GFP_NOFS);
-                       set_state_private(&root->fs_info->extent_ins,
-                                         bytenr, (unsigned long)extent_op);
-               }
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               return 0;
-       }
-
-       path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
-       ret = lookup_extent_backref(trans, extent_root, path,
-                                   bytenr, orig_parent, orig_root,
-                                   orig_generation, owner_objectid, 1);
-       if (ret)
-               goto out;
-       ret = remove_extent_backref(trans, extent_root, path);
-       if (ret)
-               goto out;
-       ret = insert_extent_backref(trans, extent_root, path, bytenr,
-                                   parent, ref_root, ref_generation,
-                                   owner_objectid);
+       ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+                                      orig_parent, parent, orig_root,
+                                      ref_root, orig_generation,
+                                      ref_generation, owner_objectid, pin);
        BUG_ON(ret);
-       finish_current_insert(trans, extent_root, 0);
-       del_pending_extents(trans, extent_root, 0);
-out:
-       btrfs_free_path(path);
        return ret;
 }
 
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
-                           u64 orig_parent, u64 parent,
+                           u64 num_bytes, u64 orig_parent, u64 parent,
                            u64 ref_root, u64 ref_generation,
                            u64 owner_objectid)
 {
@@ -1238,19 +716,35 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-       ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-                                       parent, ref_root, ref_root,
-                                       ref_generation, ref_generation,
-                                       owner_objectid);
+
+       ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+                                       orig_parent, parent, ref_root,
+                                       ref_root, ref_generation,
+                                       ref_generation, owner_objectid);
        return ret;
 }
-
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root, u64 bytenr,
+                                 u64 num_bytes,
                                  u64 orig_parent, u64 parent,
                                  u64 orig_root, u64 ref_root,
                                  u64 orig_generation, u64 ref_generation,
                                  u64 owner_objectid)
+{
+       int ret;
+
+       ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+                                   ref_generation, owner_objectid,
+                                   BTRFS_ADD_DELAYED_REF, 0);
+       BUG_ON(ret);
+       return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+                         struct btrfs_root *root, u64 bytenr,
+                         u64 num_bytes, u64 parent, u64 ref_root,
+                         u64 ref_generation, u64 owner_objectid,
+                         int refs_to_add)
 {
        struct btrfs_path *path;
        int ret;
@@ -1264,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
-       key.offset = (u64)-1;
+       key.offset = num_bytes;
 
-       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-                               0, 1);
-       if (ret < 0)
+       /* first find the extent item and update its reference count */
+       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+                               path, 0, 1);
+       if (ret < 0) {
+               btrfs_set_path_blocking(path);
                return ret;
-       BUG_ON(ret == 0 || path->slots[0] == 0);
+       }
 
-       path->slots[0]--;
+       if (ret > 0) {
+               WARN_ON(1);
+               btrfs_free_path(path);
+               return -EIO;
+       }
        l = path->nodes[0];
 
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1288,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
 
        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
        refs = btrfs_extent_refs(l, item);
-       btrfs_set_extent_refs(l, item, refs + 1);
+       btrfs_set_extent_refs(l, item, refs + refs_to_add);
+       btrfs_unlock_up_safe(path, 1);
+
        btrfs_mark_buffer_dirty(path->nodes[0]);
 
        btrfs_release_path(root->fs_info->extent_root, path);
 
        path->reada = 1;
+       path->leave_spinning = 1;
+
+       /* now insert the actual backref */
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent,
                                    ref_root, ref_generation,
-                                   owner_objectid);
+                                   owner_objectid, refs_to_add);
        BUG_ON(ret);
-       finish_current_insert(trans, root->fs_info->extent_root, 0);
-       del_pending_extents(trans, root->fs_info->extent_root, 0);
-
        btrfs_free_path(path);
        return 0;
 }
@@ -1317,51 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-       ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+       ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
                                     0, ref_root, 0, ref_generation,
                                     owner_objectid);
        return ret;
 }
 
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-                        struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_delayed_ref_node *node)
 {
-       finish_current_insert(trans, root->fs_info->extent_root, 1);
-       del_pending_extents(trans, root->fs_info->extent_root, 1);
+       int ret = 0;
+       struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+       BUG_ON(node->ref_mod == 0);
+       ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+                                 node->parent, ref->root, ref->generation,
+                                 ref->owner_objectid, ref->pin, node->ref_mod);
+
+       return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_delayed_ref_node *node,
+                                       int insert_reserved)
+{
+       int ret;
+       struct btrfs_delayed_ref *ref;
+
+       if (node->parent == (u64)-1) {
+               struct btrfs_delayed_ref_head *head;
+               /*
+                * we've hit the end of the chain and we were supposed
+                * to insert this extent into the tree.  But, it got
+                * deleted before we ever needed to insert it, so all
+                * we have to do is clean up the accounting
+                */
+               if (insert_reserved) {
+                       update_reserved_extents(root, node->bytenr,
+                                               node->num_bytes, 0);
+               }
+               head = btrfs_delayed_node_to_head(node);
+               mutex_unlock(&head->mutex);
+               return 0;
+       }
+
+       ref = btrfs_delayed_node_to_ref(node);
+       if (ref->action == BTRFS_ADD_DELAYED_REF) {
+               if (insert_reserved) {
+                       struct btrfs_key ins;
+
+                       ins.objectid = node->bytenr;
+                       ins.offset = node->num_bytes;
+                       ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+                       /* record the full extent allocation */
+                       ret = __btrfs_alloc_reserved_extent(trans, root,
+                                       node->parent, ref->root,
+                                       ref->generation, ref->owner_objectid,
+                                       &ins, node->ref_mod);
+                       update_reserved_extents(root, node->bytenr,
+                                               node->num_bytes, 0);
+               } else {
+                       /* just add one backref */
+                       ret = add_extent_ref(trans, root, node->bytenr,
+                                    node->num_bytes,
+                                    node->parent, ref->root, ref->generation,
+                                    ref->owner_objectid, node->ref_mod);
+               }
+               BUG_ON(ret);
+       } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+               WARN_ON(insert_reserved);
+               ret = drop_delayed_ref(trans, root, node);
+       }
        return 0;
 }
 
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_root *root, u64 bytenr,
-                           u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-       struct btrfs_path *path;
+       struct rb_node *node;
+       struct btrfs_delayed_ref_node *ref;
+       int action = BTRFS_ADD_DELAYED_REF;
+again:
+       /*
+        * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+        * this prevents ref count from going down to zero when
+        * there are still pending delayed refs.
+        */
+       node = rb_prev(&head->node.rb_node);
+       while (1) {
+               if (!node)
+                       break;
+               ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                               rb_node);
+               if (ref->bytenr != head->node.bytenr)
+                       break;
+               if (btrfs_delayed_node_to_ref(ref)->action == action)
+                       return ref;
+               node = rb_prev(node);
+       }
+       if (action == BTRFS_ADD_DELAYED_REF) {
+               action = BTRFS_DROP_DELAYED_REF;
+               goto again;
+       }
+       return NULL;
+}
+
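Running every BTRFS_ADD_DELAYED_REF before any BTRFS_DROP_DELAYED_REF means the extent's reference count can never touch zero (and free the extent) while queued additions remain. A standalone toy illustration of the ordering invariant, plain userspace C rather than kernel code:

#include <assert.h>

/* apply queued ref changes in the given order and return the
 * lowest reference count seen along the way */
static int lowest_refcount(int refs, const int *mods, int n)
{
	int low = refs;
	int i;

	for (i = 0; i < n; i++) {
		refs += mods[i];
		if (refs < low)
			low = refs;
	}
	return low;
}

int main(void)
{
	/* on-disk count 1 with one queued drop and one queued add */
	const int drop_first[] = { -1, +1 };
	const int add_first[] = { +1, -1 };

	/* drop-first dips to zero: the extent would be freed while
	 * an addition is still pending */
	assert(lowest_refcount(1, drop_first, 2) == 0);

	/* the ADD-before-DROP order chosen above never touches zero */
	assert(lowest_refcount(1, add_first, 2) == 1);
	return 0;
}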
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                      struct btrfs_root *root,
+                                      struct list_head *cluster)
+{
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
-       struct btrfs_key key;
-       struct extent_buffer *l;
-       struct btrfs_extent_item *item;
+       int count = 0;
+       int must_insert_reserved = 0;
 
-       WARN_ON(num_bytes < root->sectorsize);
-       path = btrfs_alloc_path();
-       path->reada = 1;
-       key.objectid = bytenr;
-       key.offset = num_bytes;
-       btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-       ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-                               0, 0);
-       if (ret < 0)
-               goto out;
-       if (ret != 0) {
-               btrfs_print_leaf(root, path->nodes[0]);
-               printk(KERN_INFO "btrfs failed to find block number %llu\n",
-                      (unsigned long long)bytenr);
-               BUG();
+       delayed_refs = &trans->transaction->delayed_refs;
+       while (1) {
+               if (!locked_ref) {
+                       /* pick a new head ref from the cluster list */
+                       if (list_empty(cluster))
+                               break;
+
+                       locked_ref = list_entry(cluster->next,
+                                    struct btrfs_delayed_ref_head, cluster);
+
+                       /* grab the lock that says we are going to process
+                        * all the refs for this head */
+                       ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+                       /*
+                        * we may have dropped the spin lock to get the head
+                        * mutex lock, and that might have given someone else
+                        * time to free the head.  If that's true, it has been
+                        * removed from our list and we can move on.
+                        */
+                       if (ret == -EAGAIN) {
+                               locked_ref = NULL;
+                               count++;
+                               continue;
+                       }
+               }
+
+               /*
+                * record the must insert reserved flag before we
+                * drop the spin lock.
+                */
+               must_insert_reserved = locked_ref->must_insert_reserved;
+               locked_ref->must_insert_reserved = 0;
+
+               /*
+                * locked_ref is the head node, so we have to go one
+                * node back for any delayed ref updates
+                */
+               ref = select_delayed_ref(locked_ref);
+               if (!ref) {
+                       /* All delayed refs have been processed, go ahead
+                        * and send the head node to run_one_delayed_ref,
+                        * so that any accounting fixes can happen
+                        */
+                       ref = &locked_ref->node;
+                       list_del_init(&locked_ref->cluster);
+                       locked_ref = NULL;
+               }
+
+               ref->in_tree = 0;
+               rb_erase(&ref->rb_node, &delayed_refs->root);
+               delayed_refs->num_entries--;
+               spin_unlock(&delayed_refs->lock);
+
+               ret = run_one_delayed_ref(trans, root, ref,
+                                         must_insert_reserved);
+               BUG_ON(ret);
+               btrfs_put_delayed_ref(ref);
+
+               count++;
+               cond_resched();
+               spin_lock(&delayed_refs->lock);
+       }
+       return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.  count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, unsigned long count)
+{
+       struct rb_node *node;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct list_head cluster;
+       int ret;
+       int run_all = count == (unsigned long)-1;
+       int run_most = 0;
+
+       if (root == root->fs_info->extent_root)
+               root = root->fs_info->tree_root;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       INIT_LIST_HEAD(&cluster);
+again:
+       spin_lock(&delayed_refs->lock);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+       while (1) {
+               if (!(run_all || run_most) &&
+                   delayed_refs->num_heads_ready < 64)
+                       break;
+
+               /*
+                * go find something we can process in the rbtree.  We start at
+                * the beginning of the tree, and then build a cluster
+                * of refs to process starting at the first one we are able to
+                * lock
+                */
+               ret = btrfs_find_ref_cluster(trans, &cluster,
+                                            delayed_refs->run_delayed_start);
+               if (ret)
+                       break;
+
+               ret = run_clustered_refs(trans, root, &cluster);
+               BUG_ON(ret < 0);
+
+               count -= min_t(unsigned long, ret, count);
+
+               if (count == 0)
+                       break;
+       }
+
+       if (run_all) {
+               node = rb_first(&delayed_refs->root);
+               if (!node)
+                       goto out;
+               count = (unsigned long)-1;
+
+               while (node) {
+                       ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                                      rb_node);
+                       if (btrfs_delayed_ref_is_head(ref)) {
+                               struct btrfs_delayed_ref_head *head;
+
+                               head = btrfs_delayed_node_to_head(ref);
+                               atomic_inc(&ref->refs);
+
+                               spin_unlock(&delayed_refs->lock);
+                               mutex_lock(&head->mutex);
+                               mutex_unlock(&head->mutex);
+
+                               btrfs_put_delayed_ref(ref);
+                               cond_resched();
+                               goto again;
+                       }
+                       node = rb_next(node);
+               }
+               spin_unlock(&delayed_refs->lock);
+               schedule_timeout(1);
+               goto again;
        }
-       l = path->nodes[0];
-       item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-       *refs = btrfs_extent_refs(l, item);
 out:
-       btrfs_free_path(path);
+       spin_unlock(&delayed_refs->lock);
        return 0;
 }
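To illustrate the count contract described in the comment above: 0 works off roughly the backlog present when the run began, and (unsigned long)-1 loops until the whole tree is drained, including refs queued while running. Hypothetical callers, not part of this patch:

/* throttling path: just work off the current backlog */
ret = btrfs_run_delayed_refs(trans, root, 0);

/* commit path: drain everything, including newly queued refs */
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);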
 
@@ -1525,15 +1256,55 @@ out:
        return ret;
 }
 
-int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                 struct extent_buffer *orig_buf, struct extent_buffer *buf,
-                 u32 *nr_extents)
+/* when a block goes through cow, we update the reference counts of
+ * everything that block points to.  The internal pointers of the block
+ * can be in just about any order, and it is likely to have clusters of
+ * things that are close together and clusters of things that are not.
+ *
+ * To help reduce the seeks that come with updating all of these reference
+ * counts, sort them by byte number before actual updates are done.
+ *
+ * struct refsort is used to match byte number to slot in the btree block.
+ * we sort based on the byte number and then use the slot to actually
+ * find the item.
+ *
+ * struct refsort is smaller than struct btrfs_item and smaller than
+ * struct btrfs_key_ptr.  Since we're currently limited to the page size
+ * for a btree block, there's no way for a kmalloc of refsorts for a
+ * single node to be bigger than a page.
+ */
+struct refsort {
+       u64 bytenr;
+       u32 slot;
+};
+
+/*
+ * for passing into sort()
+ */
+static int refsort_cmp(const void *a_void, const void *b_void)
+{
+       const struct refsort *a = a_void;
+       const struct refsort *b = b_void;
+
+       if (a->bytenr < b->bytenr)
+               return -1;
+       if (a->bytenr > b->bytenr)
+               return 1;
+       return 0;
+}
+
+
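Putting struct refsort and refsort_cmp() together, the intended shape (condensed from btrfs_inc_ref() below; the demo_ helper is hypothetical) is gather, sort by disk byte, then update in sorted order:

#include <linux/slab.h>
#include <linux/sort.h>

static int demo_sorted_ref_updates(struct extent_buffer *buf, u32 nritems)
{
	struct refsort *sorted;
	u32 i;

	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
	if (!sorted)
		return -ENOMEM;

	/* pass 1: record (bytenr, slot) pairs in tree order */
	for (i = 0; i < nritems; i++) {
		sorted[i].bytenr = btrfs_node_blockptr(buf, i);
		sorted[i].slot = i;
	}

	/* reorder by byte number so the extent tree is walked mostly
	 * front to back instead of seeking for every reference */
	sort(sorted, nritems, sizeof(struct refsort), refsort_cmp, NULL);

	/* pass 2: update the ref for each sorted[i].bytenr, using
	 * sorted[i].slot to locate the item in the block */

	kfree(sorted);
	return 0;
}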
+noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct extent_buffer *orig_buf,
+                          struct extent_buffer *buf, u32 *nr_extents)
 {
        u64 bytenr;
        u64 ref_root;
        u64 orig_root;
        u64 ref_generation;
        u64 orig_generation;
+       struct refsort *sorted;
        u32 nritems;
        u32 nr_file_extents = 0;
        struct btrfs_key key;
@@ -1542,8 +1313,10 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        int level;
        int ret = 0;
        int faili = 0;
+       int refi = 0;
+       int slot;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64, u64, u64);
+                           u64, u64, u64, u64, u64, u64, u64, u64, u64);
 
        ref_root = btrfs_header_owner(buf);
        ref_generation = btrfs_header_generation(buf);
@@ -1553,6 +1326,9 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        nritems = btrfs_header_nritems(buf);
        level = btrfs_header_level(buf);
 
+       sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
+       BUG_ON(!sorted);
+
        if (root->ref_cows) {
                process_func = __btrfs_inc_extent_ref;
        } else {
@@ -1565,49 +1341,87 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                process_func = __btrfs_update_extent_ref;
        }
 
-       for (i = 0; i < nritems; i++) {
+       /*
+        * we make two passes through the items.  In the first pass we
+        * only record the byte number and slot.  Then we sort based on
+        * byte number and do the actual work based on the sorted results
+        */
+       for (i = 0; i < nritems; i++) {
+               cond_resched();
+               if (level == 0) {
+                       btrfs_item_key_to_cpu(buf, &key, i);
+                       if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                               continue;
+                       fi = btrfs_item_ptr(buf, i,
+                                           struct btrfs_file_extent_item);
+                       if (btrfs_file_extent_type(buf, fi) ==
+                           BTRFS_FILE_EXTENT_INLINE)
+                               continue;
+                       bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                       if (bytenr == 0)
+                               continue;
+
+                       nr_file_extents++;
+                       sorted[refi].bytenr = bytenr;
+                       sorted[refi].slot = i;
+                       refi++;
+               } else {
+                       bytenr = btrfs_node_blockptr(buf, i);
+                       sorted[refi].bytenr = bytenr;
+                       sorted[refi].slot = i;
+                       refi++;
+               }
+       }
+       /*
+        * if refi == 0, we didn't actually put anything into the sorted
+        * array and we're done
+        */
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       for (i = 0; i < refi; i++) {
                cond_resched();
+               slot = sorted[i].slot;
+               bytenr = sorted[i].bytenr;
+
                if (level == 0) {
-                       btrfs_item_key_to_cpu(buf, &key, i);
-                       if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-                               continue;
-                       fi = btrfs_item_ptr(buf, i,
+                       btrfs_item_key_to_cpu(buf, &key, slot);
+                       fi = btrfs_item_ptr(buf, slot,
                                            struct btrfs_file_extent_item);
-                       if (btrfs_file_extent_type(buf, fi) ==
-                           BTRFS_FILE_EXTENT_INLINE)
-                               continue;
+
                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
                        if (bytenr == 0)
                                continue;
 
-                       nr_file_extents++;
-
                        ret = process_func(trans, root, bytenr,
-                                          orig_buf->start, buf->start,
-                                          orig_root, ref_root,
-                                          orig_generation, ref_generation,
-                                          key.objectid);
+                                  btrfs_file_extent_disk_num_bytes(buf, fi),
+                                  orig_buf->start, buf->start,
+                                  orig_root, ref_root,
+                                  orig_generation, ref_generation,
+                                  key.objectid);
 
                        if (ret) {
-                               faili = i;
+                               faili = slot;
                                WARN_ON(1);
                                goto fail;
                        }
                } else {
-                       bytenr = btrfs_node_blockptr(buf, i);
-                       ret = process_func(trans, root, bytenr,
+                       ret = process_func(trans, root, bytenr, buf->len,
                                           orig_buf->start, buf->start,
                                           orig_root, ref_root,
                                           orig_generation, ref_generation,
                                           level - 1);
                        if (ret) {
-                               faili = i;
+                               faili = slot;
                                WARN_ON(1);
                                goto fail;
                        }
                }
        }
 out:
+       kfree(sorted);
        if (nr_extents) {
                if (level == 0)
                        *nr_extents = nr_file_extents;
@@ -1616,6 +1430,7 @@ out:
        }
        return 0;
 fail:
+       kfree(sorted);
        WARN_ON(1);
        return ret;
 }
@@ -1670,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
                        if (bytenr == 0)
                                continue;
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                           orig_buf->start, buf->start,
-                                           orig_root, ref_root,
-                                           orig_generation, ref_generation,
-                                           key.objectid);
+                                   btrfs_file_extent_disk_num_bytes(buf, fi),
+                                   orig_buf->start, buf->start,
+                                   orig_root, ref_root, orig_generation,
+                                   ref_generation, key.objectid);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, slot);
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                           orig_buf->start, buf->start,
-                                           orig_root, ref_root,
+                                           buf->len, orig_buf->start,
+                                           buf->start, orig_root, ref_root,
                                            orig_generation, ref_generation,
                                            level - 1);
                        if (ret)
@@ -1699,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group_cache *cache)
 {
        int ret;
-       int pending_ret;
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        unsigned long bi;
        struct extent_buffer *leaf;
@@ -1715,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(extent_root, path);
 fail:
-       finish_current_insert(trans, extent_root, 0);
-       pending_ret = del_pending_extents(trans, extent_root, 0);
        if (ret)
                return ret;
-       if (pending_ret)
-               return pending_ret;
        return 0;
 
 }
@@ -1808,7 +1618,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (!found)
                return -ENOMEM;
 
-       list_add(&found->list, &info->space_info);
        INIT_LIST_HEAD(&found->block_groups);
        init_rwsem(&found->groups_sem);
        spin_lock_init(&found->lock);
@@ -1818,9 +1627,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_pinned = 0;
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
+       found->bytes_delalloc = 0;
        found->full = 0;
        found->force_alloc = 0;
        *space_info = found;
+       list_add_rcu(&found->list, &info->space_info);
        return 0;
 }
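
The hunk above delays list_add_rcu() until every field of the new
space_info is initialized, so lockless readers walking the list never
observe a half-built entry.  A minimal userspace sketch of that
publish-after-init ordering, using C11 release/acquire atomics in place of
the kernel's RCU primitives (illustrative names, not btrfs API):

    #include <stdatomic.h>
    #include <stdio.h>

    struct space_info {
            unsigned long long flags;
            unsigned long long total_bytes;
    };

    /* readers follow this pointer without taking any lock */
    static _Atomic(struct space_info *) published;

    static void publish(struct space_info *si)
    {
            si->flags = 1;                  /* fully initialize first ... */
            si->total_bytes = 0;
            /* ... then make it visible; release pairs with acquire below */
            atomic_store_explicit(&published, si, memory_order_release);
    }

    static void reader(void)
    {
            struct space_info *si =
                    atomic_load_explicit(&published, memory_order_acquire);
            if (si)
                    printf("flags=%llu\n", si->flags);
    }

    int main(void)
    {
            static struct space_info si;
            publish(&si);
            reader();
            return 0;
    }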
 
@@ -1881,6 +1692,233 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        return flags;
 }
 
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+{
+       struct btrfs_fs_info *info = root->fs_info;
+       u64 alloc_profile;
+
+       if (data) {
+               alloc_profile = info->avail_data_alloc_bits &
+                       info->data_alloc_profile;
+               data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
+       } else if (root == root->fs_info->chunk_root) {
+               alloc_profile = info->avail_system_alloc_bits &
+                       info->system_alloc_profile;
+               data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
+       } else {
+               alloc_profile = info->avail_metadata_alloc_bits &
+                       info->metadata_alloc_profile;
+               data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
+       }
+
+       return btrfs_reduce_alloc_profile(root, data);
+}
+
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
+{
+       u64 alloc_target;
+
+       alloc_target = btrfs_get_alloc_profile(root, 1);
+       BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
+                                                      alloc_target);
+}
+
+/*
+ * for now this just makes sure we keep at least 5% of our metadata space
+ * free for use, and forces a new chunk allocation once usage passes 80%
+ * while chunks can still be allocated.
+ */
+int btrfs_check_metadata_free_space(struct btrfs_root *root)
+{
+       struct btrfs_fs_info *info = root->fs_info;
+       struct btrfs_space_info *meta_sinfo;
+       u64 alloc_target, thresh;
+       int committed = 0, ret;
+
+       /* get the space info for where the metadata will live */
+       alloc_target = btrfs_get_alloc_profile(root, 0);
+       meta_sinfo = __find_space_info(info, alloc_target);
+
+again:
+       spin_lock(&meta_sinfo->lock);
+       if (!meta_sinfo->full)
+               thresh = meta_sinfo->total_bytes * 80;
+       else
+               thresh = meta_sinfo->total_bytes * 95;
+
+       do_div(thresh, 100);
+
+       if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+           meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
+               struct btrfs_trans_handle *trans;
+               if (!meta_sinfo->full) {
+                       meta_sinfo->force_alloc = 1;
+                       spin_unlock(&meta_sinfo->lock);
+
+                       trans = btrfs_start_transaction(root, 1);
+                       if (!trans)
+                               return -ENOMEM;
+
+                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                            2 * 1024 * 1024, alloc_target, 0);
+                       btrfs_end_transaction(trans, root);
+                       goto again;
+               }
+               spin_unlock(&meta_sinfo->lock);
+
+               if (!committed) {
+                       committed = 1;
+                       trans = btrfs_join_transaction(root, 1);
+                       if (!trans)
+                               return -ENOMEM;
+                       ret = btrfs_commit_transaction(trans, root);
+                       if (ret)
+                               return ret;
+                       goto again;
+               }
+               return -ENOSPC;
+       }
+       spin_unlock(&meta_sinfo->lock);
+
+       return 0;
+}
+
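The 80% (room to grow) and 95% (filesystem full) thresholds above are
computed with truncating integer math, do_div() being the kernel's 64-bit
divide.  A small standalone sketch of the same check with made-up numbers
(not btrfs API):

    #include <stdio.h>

    /* mirrors the 80% / 95% thresholds used for metadata space */
    static unsigned long long thresh(unsigned long long total, int full)
    {
            unsigned long long t = total * (full ? 95 : 80);
            return t / 100;     /* do_div() analogue: truncating divide */
    }

    int main(void)
    {
            unsigned long long total = 1024ULL << 20;  /* 1GiB pool */
            unsigned long long used  = 900ULL << 20;   /* used+reserved+pinned+ro */

            if (used > thresh(total, 0))
                    printf("over 80%%: would force a chunk allocation\n");
            if (used > thresh(total, 1))
                    printf("over 95%%: would return -ENOSPC\n");
            return 0;
    }
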
+/*
+ * This will check the space info that the inode allocates from, to make
+ * sure we have enough room for 'bytes' worth of new data.
+ */
+int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+                               u64 bytes)
+{
+       struct btrfs_space_info *data_sinfo;
+       int ret = 0, committed = 0;
+
+       /* make sure bytes are sectorsize aligned */
+       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+       data_sinfo = BTRFS_I(inode)->space_info;
+again:
+       /* make sure we have enough space to handle the data first */
+       spin_lock(&data_sinfo->lock);
+       if (data_sinfo->total_bytes - data_sinfo->bytes_used -
+           data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
+           data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
+           data_sinfo->bytes_may_use < bytes) {
+               struct btrfs_trans_handle *trans;
+
+               /*
+                * if we don't have enough free bytes in this space then we need
+                * to alloc a new chunk.
+                */
+               if (!data_sinfo->full) {
+                       u64 alloc_target;
+
+                       data_sinfo->force_alloc = 1;
+                       spin_unlock(&data_sinfo->lock);
+
+                       alloc_target = btrfs_get_alloc_profile(root, 1);
+                       trans = btrfs_start_transaction(root, 1);
+                       if (!trans)
+                               return -ENOMEM;
+
+                       ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+                                            bytes + 2 * 1024 * 1024,
+                                            alloc_target, 0);
+                       btrfs_end_transaction(trans, root);
+                       if (ret)
+                               return ret;
+                       goto again;
+               }
+               spin_unlock(&data_sinfo->lock);
+
+               /* commit the current transaction and try again */
+               if (!committed) {
+                       committed = 1;
+                       trans = btrfs_join_transaction(root, 1);
+                       if (!trans)
+                               return -ENOMEM;
+                       ret = btrfs_commit_transaction(trans, root);
+                       if (ret)
+                               return ret;
+                       goto again;
+               }
+
+               printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+                      ", %llu bytes_used, %llu bytes_reserved, "
+                      "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
+                      "%llu total\n", bytes, data_sinfo->bytes_delalloc,
+                      data_sinfo->bytes_used, data_sinfo->bytes_reserved,
+                      data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
+                      data_sinfo->bytes_may_use, data_sinfo->total_bytes);
+               return -ENOSPC;
+       }
+       data_sinfo->bytes_may_use += bytes;
+       BTRFS_I(inode)->reserved_bytes += bytes;
+       spin_unlock(&data_sinfo->lock);
+
+       return btrfs_check_metadata_free_space(root);
+}
+
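The "sectorsize aligned" round-up above is the standard power-of-two
alignment idiom: add sectorsize - 1, then mask off the low bits.  A
minimal sketch (assumes sectorsize is a power of two, which btrfs
guarantees elsewhere):

    #include <assert.h>
    #include <stdio.h>

    /* round bytes up to the next multiple of a power-of-two sectorsize */
    static unsigned long long align_up(unsigned long long bytes,
                                       unsigned long long sectorsize)
    {
            assert(sectorsize && (sectorsize & (sectorsize - 1)) == 0);
            return (bytes + sectorsize - 1) & ~(sectorsize - 1);
    }

    int main(void)
    {
            printf("%llu\n", align_up(1, 4096));    /* 4096 */
            printf("%llu\n", align_up(4096, 4096)); /* 4096 */
            printf("%llu\n", align_up(4097, 4096)); /* 8192 */
            return 0;
    }
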
+/*
+ * if there was an error for whatever reason after calling
+ * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ */
+void btrfs_free_reserved_data_space(struct btrfs_root *root,
+                                   struct inode *inode, u64 bytes)
+{
+       struct btrfs_space_info *data_sinfo;
+
+       /* make sure bytes are sectorsize aligned */
+       bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+
+       data_sinfo = BTRFS_I(inode)->space_info;
+       spin_lock(&data_sinfo->lock);
+       data_sinfo->bytes_may_use -= bytes;
+       BTRFS_I(inode)->reserved_bytes -= bytes;
+       spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are adding a delalloc extent to the inode's io_tree */
+void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+                                 u64 bytes)
+{
+       struct btrfs_space_info *data_sinfo;
+
+       /* get the space info for where this inode will be storing its data */
+       data_sinfo = BTRFS_I(inode)->space_info;
+
+       /* make sure we have enough space to handle the data first */
+       spin_lock(&data_sinfo->lock);
+       data_sinfo->bytes_delalloc += bytes;
+
+       /*
+        * we are adding a delalloc extent without calling
+        * btrfs_check_data_free_space first.  This happens on a weird
+        * writepage condition, but shouldn't hurt our accounting
+        */
+       if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
+               data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
+               BTRFS_I(inode)->reserved_bytes = 0;
+       } else {
+               data_sinfo->bytes_may_use -= bytes;
+               BTRFS_I(inode)->reserved_bytes -= bytes;
+       }
+
+       spin_unlock(&data_sinfo->lock);
+}
+
+/* called when we are clearing a delalloc extent from the inode's io_tree */
+void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+                             u64 bytes)
+{
+       struct btrfs_space_info *info;
+
+       info = BTRFS_I(inode)->space_info;
+
+       spin_lock(&info->lock);
+       info->bytes_delalloc -= bytes;
+       spin_unlock(&info->lock);
+}
+
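Taken together, the three helpers above move bytes between two counters: a
successful check moves them into bytes_may_use, setting a delalloc extent
moves them from bytes_may_use into bytes_delalloc (or takes them from
nowhere in the unreserved writepage case), and clearing the extent drops
them from bytes_delalloc.  A single-threaded userspace model of that
lifecycle (locks omitted; names mirror the counters, not btrfs API):

    #include <stdio.h>

    struct space {
            unsigned long long bytes_may_use;  /* reserved, not yet delalloc */
            unsigned long long bytes_delalloc; /* dirty, awaiting writeback */
    };

    /* btrfs_check_data_free_space succeeded */
    static void reserve(struct space *s, unsigned long long *inode_res,
                        unsigned long long b)
    {
            s->bytes_may_use += b;
            *inode_res += b;
    }

    /* btrfs_delalloc_reserve_space: reservation becomes delalloc */
    static void set_delalloc(struct space *s, unsigned long long *inode_res,
                             unsigned long long b)
    {
            s->bytes_delalloc += b;
            if (b > *inode_res) {          /* writepage with no prior reserve */
                    s->bytes_may_use -= *inode_res;
                    *inode_res = 0;
            } else {
                    s->bytes_may_use -= b;
                    *inode_res -= b;
            }
    }

    /* btrfs_delalloc_free_space: writeback finished */
    static void clear_delalloc(struct space *s, unsigned long long b)
    {
            s->bytes_delalloc -= b;
    }

    int main(void)
    {
            struct space s = { 0, 0 };
            unsigned long long inode_res = 0;

            reserve(&s, &inode_res, 8192);
            set_delalloc(&s, &inode_res, 8192);
            clear_delalloc(&s, 8192);
            printf("may_use=%llu delalloc=%llu\n",
                   s.bytes_may_use, s.bytes_delalloc);
            return 0;
    }
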
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force)
@@ -2017,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
+       mutex_unlock(&root->fs_info->pinned_mutex);
+
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
                BUG_ON(!cache);
@@ -2108,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        u64 end;
        int ret;
 
-       mutex_lock(&root->fs_info->pinned_mutex);
        while (1) {
+               mutex_lock(&root->fs_info->pinned_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
@@ -2117,214 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 
                ret = btrfs_discard_extent(root, start, end + 1 - start);
 
+               /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
 
-               if (need_resched()) {
-                       mutex_unlock(&root->fs_info->pinned_mutex);
-                       cond_resched();
-                       mutex_lock(&root->fs_info->pinned_mutex);
-               }
+               cond_resched();
        }
        mutex_unlock(&root->fs_info->pinned_mutex);
        return ret;
 }
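
The rework above takes the pinned mutex at the top of each iteration and
relies on btrfs_update_pinned_extents() to drop it, so cond_resched() runs
with the lock released.  A pthread sketch of that lock hand-off shape
(illustrative only, not btrfs API):

    #include <pthread.h>
    #include <sched.h>

    static pthread_mutex_t pinned_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* like btrfs_update_pinned_extents: enters locked, returns unlocked */
    static void update_pinned(int *work)
    {
            --*work;
            pthread_mutex_unlock(&pinned_mutex);
    }

    static void finish_commit(int work)
    {
            while (1) {
                    pthread_mutex_lock(&pinned_mutex);
                    if (work == 0)
                            break;          /* leave holding the lock */
                    update_pinned(&work);   /* unlocks the mutex */
                    sched_yield();          /* cond_resched() analogue */
            }
            pthread_mutex_unlock(&pinned_mutex);
    }

    int main(void)
    {
            finish_commit(3);
            return 0;
    }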
 
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *extent_root, int all)
-{
-       u64 start;
-       u64 end;
-       u64 priv;
-       u64 search = 0;
-       u64 skipped = 0;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct btrfs_path *path;
-       struct pending_extent_op *extent_op, *tmp;
-       struct list_head insert_list, update_list;
-       int ret;
-       int num_inserts = 0, max_inserts;
-
-       path = btrfs_alloc_path();
-       INIT_LIST_HEAD(&insert_list);
-       INIT_LIST_HEAD(&update_list);
-
-       max_inserts = extent_root->leafsize /
-               (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
-                sizeof(struct btrfs_extent_ref) +
-                sizeof(struct btrfs_extent_item));
-again:
-       mutex_lock(&info->extent_ins_mutex);
-       while (1) {
-               ret = find_first_extent_bit(&info->extent_ins, search, &start,
-                                           &end, EXTENT_WRITEBACK);
-               if (ret) {
-                       if (skipped && all && !num_inserts) {
-                               skipped = 0;
-                               search = 0;
-                               continue;
-                       }
-                       mutex_unlock(&info->extent_ins_mutex);
-                       break;
-               }
-
-               ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
-               if (!ret) {
-                       skipped = 1;
-                       search = end + 1;
-                       if (need_resched()) {
-                               mutex_unlock(&info->extent_ins_mutex);
-                               cond_resched();
-                               mutex_lock(&info->extent_ins_mutex);
-                       }
-                       continue;
-               }
-
-               ret = get_state_private(&info->extent_ins, start, &priv);
-               BUG_ON(ret);
-               extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
-               if (extent_op->type == PENDING_EXTENT_INSERT) {
-                       num_inserts++;
-                       list_add_tail(&extent_op->list, &insert_list);
-                       search = end + 1;
-                       if (num_inserts == max_inserts) {
-                               mutex_unlock(&info->extent_ins_mutex);
-                               break;
-                       }
-               } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                       list_add_tail(&extent_op->list, &update_list);
-                       search = end + 1;
-               } else {
-                       BUG();
-               }
-       }
-
-       /*
-        * process the update list, clear the writeback bit for it, and if
-        * somebody marked this thing for deletion then just unlock it and be
-        * done, the free_extents will handle it
-        */
-       mutex_lock(&info->extent_ins_mutex);
-       list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
-               clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                 extent_op->bytenr + extent_op->num_bytes - 1,
-                                 EXTENT_WRITEBACK, GFP_NOFS);
-               if (extent_op->del) {
-                       list_del_init(&extent_op->list);
-                       unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                     extent_op->bytenr + extent_op->num_bytes
-                                     - 1, GFP_NOFS);
-                       kfree(extent_op);
-               }
-       }
-       mutex_unlock(&info->extent_ins_mutex);
-
-       /*
-        * still have things left on the update list, go ahead an update
-        * everything
-        */
-       if (!list_empty(&update_list)) {
-               ret = update_backrefs(trans, extent_root, path, &update_list);
-               BUG_ON(ret);
-       }
-
-       /*
-        * if no inserts need to be done, but we skipped some extents and we
-        * need to make sure everything is cleaned then reset everything and
-        * go back to the beginning
-        */
-       if (!num_inserts && all && skipped) {
-               search = 0;
-               skipped = 0;
-               INIT_LIST_HEAD(&update_list);
-               INIT_LIST_HEAD(&insert_list);
-               goto again;
-       } else if (!num_inserts) {
-               goto out;
-       }
-
-       /*
-        * process the insert extents list.  Again if we are deleting this
-        * extent, then just unlock it, pin down the bytes if need be, and be
-        * done with it.  Saves us from having to actually insert the extent
-        * into the tree and then subsequently come along and delete it
-        */
-       mutex_lock(&info->extent_ins_mutex);
-       list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
-               clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                 extent_op->bytenr + extent_op->num_bytes - 1,
-                                 EXTENT_WRITEBACK, GFP_NOFS);
-               if (extent_op->del) {
-                       u64 used;
-                       list_del_init(&extent_op->list);
-                       unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                     extent_op->bytenr + extent_op->num_bytes
-                                     - 1, GFP_NOFS);
-
-                       mutex_lock(&extent_root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root,
-                                            extent_op->bytenr,
-                                            extent_op->num_bytes, 0);
-                       mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-                       spin_lock(&info->delalloc_lock);
-                       used = btrfs_super_bytes_used(&info->super_copy);
-                       btrfs_set_super_bytes_used(&info->super_copy,
-                                       used - extent_op->num_bytes);
-                       used = btrfs_root_used(&extent_root->root_item);
-                       btrfs_set_root_used(&extent_root->root_item,
-                                       used - extent_op->num_bytes);
-                       spin_unlock(&info->delalloc_lock);
-
-                       ret = update_block_group(trans, extent_root,
-                                                extent_op->bytenr,
-                                                extent_op->num_bytes,
-                                                0, ret > 0);
-                       BUG_ON(ret);
-                       kfree(extent_op);
-                       num_inserts--;
-               }
-       }
-       mutex_unlock(&info->extent_ins_mutex);
-
-       ret = insert_extents(trans, extent_root, path, &insert_list,
-                            num_inserts);
-       BUG_ON(ret);
-
-       /*
-        * if we broke out of the loop in order to insert stuff because we hit
-        * the maximum number of inserts at a time we can handle, then loop
-        * back and pick up where we left off
-        */
-       if (num_inserts == max_inserts) {
-               INIT_LIST_HEAD(&insert_list);
-               INIT_LIST_HEAD(&update_list);
-               num_inserts = 0;
-               goto again;
-       }
-
-       /*
-        * again, if we need to make absolutely sure there are no more pending
-        * extent operations left and we know that we skipped some, go back to
-        * the beginning and do it all again
-        */
-       if (all && skipped) {
-               INIT_LIST_HEAD(&insert_list);
-               INIT_LIST_HEAD(&update_list);
-               search = 0;
-               skipped = 0;
-               num_inserts = 0;
-               goto again;
-       }
-out:
-       btrfs_free_path(path);
-       return 0;
-}
-
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
-                         u64 bytenr, u64 num_bytes, int is_data)
+                         struct btrfs_path *path,
+                         u64 bytenr, u64 num_bytes, int is_data,
+                         struct extent_buffer **must_clean)
 {
        int err = 0;
        struct extent_buffer *buf;
@@ -2347,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
                u64 header_transid = btrfs_header_generation(buf);
                if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
                    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+                   header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
                    header_transid == trans->transid &&
                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                       clean_tree_block(NULL, root, buf);
-                       btrfs_tree_unlock(buf);
-                       free_extent_buffer(buf);
+                       *must_clean = buf;
                        return 1;
                }
                btrfs_tree_unlock(buf);
        }
        free_extent_buffer(buf);
 pinit:
+       btrfs_set_path_blocking(path);
+       mutex_lock(&root->fs_info->pinned_mutex);
+       /* unlocks the pinned mutex */
        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 
        BUG_ON(err < 0);
@@ -2371,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 ref_generation,
-                        u64 owner_objectid, int pin, int mark_free)
+                        u64 owner_objectid, int pin, int mark_free,
+                        int refs_to_drop)
 {
        struct btrfs_path *path;
        struct btrfs_key key;
@@ -2393,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
 
        path->reada = 1;
+       path->leave_spinning = 1;
        ret = lookup_extent_backref(trans, extent_root, path,
                                    bytenr, parent, root_objectid,
                                    ref_generation, owner_objectid, 1);
@@ -2414,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                                break;
                }
                if (!found_extent) {
-                       ret = remove_extent_backref(trans, extent_root, path);
+                       ret = remove_extent_backref(trans, extent_root, path,
+                                                   refs_to_drop);
                        BUG_ON(ret);
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret) {
@@ -2432,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_print_leaf(extent_root, path->nodes[0]);
                WARN_ON(1);
                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-                      "root %llu gen %llu owner %llu\n",
+                      "parent %llu root %llu gen %llu owner %llu\n",
                       (unsigned long long)bytenr,
+                      (unsigned long long)parent,
                       (unsigned long long)root_objectid,
                       (unsigned long long)ref_generation,
                       (unsigned long long)owner_objectid);
@@ -2443,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        ei = btrfs_item_ptr(leaf, extent_slot,
                            struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
-       BUG_ON(refs == 0);
-       refs -= 1;
-       btrfs_set_extent_refs(leaf, ei, refs);
 
+       /*
+        * we're not allowed to delete the extent item if there
+        * are other delayed ref updates pending
+        */
+
+       BUG_ON(refs < refs_to_drop);
+       refs -= refs_to_drop;
+       btrfs_set_extent_refs(leaf, ei, refs);
        btrfs_mark_buffer_dirty(leaf);
 
-       if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+       if (refs == 0 && found_extent &&
+           path->slots[0] == extent_slot + 1) {
                struct btrfs_extent_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_ref);
-               BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+               BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
                /* if the back ref and the extent are next to each other
                 * they get deleted below in one shot
                 */
@@ -2461,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                num_to_del = 2;
        } else if (found_extent) {
                /* otherwise delete the extent back ref */
-               ret = remove_extent_backref(trans, extent_root, path);
+               ret = remove_extent_backref(trans, extent_root, path,
+                                           refs_to_drop);
                BUG_ON(ret);
                /* if refs are 0, we need to setup the path for deletion */
                if (refs == 0) {
                        btrfs_release_path(extent_root, path);
+                       path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root, &key, path,
                                                -1, 1);
                        BUG_ON(ret);
@@ -2475,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        if (refs == 0) {
                u64 super_used;
                u64 root_used;
+               struct extent_buffer *must_clean = NULL;
 
                if (pin) {
-                       mutex_lock(&root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, root, bytenr, num_bytes,
-                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
-                       mutex_unlock(&root->fs_info->pinned_mutex);
+                       ret = pin_down_bytes(trans, root, path,
+                               bytenr, num_bytes,
+                               owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+                               &must_clean);
                        if (ret > 0)
                                mark_free = 1;
                        BUG_ON(ret < 0);
                }
+
                /* block accounting for super block */
                spin_lock(&info->delalloc_lock);
                super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2496,234 +2360,134 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_set_root_used(&root->root_item,
                                           root_used - num_bytes);
                spin_unlock(&info->delalloc_lock);
+
+               /*
+                * it is going to be very rare for someone to be waiting
+                * on the block we're freeing.  del_items might need to
+                * schedule, so rather than get fancy, just force it
+                * to blocking here
+                */
+               if (must_clean)
+                       btrfs_set_lock_blocking(must_clean);
+
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
 
-               if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
-                       ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-                       BUG_ON(ret);
-               }
-
-               ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-                                        mark_free);
-               BUG_ON(ret);
-       }
-       btrfs_free_path(path);
-       finish_current_insert(trans, extent_root, 0);
-       return ret;
-}
-
-/*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
- */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *extent_root, int all)
-{
-       int ret;
-       int err = 0;
-       u64 start;
-       u64 end;
-       u64 priv;
-       u64 search = 0;
-       int nr = 0, skipped = 0;
-       struct extent_io_tree *pending_del;
-       struct extent_io_tree *extent_ins;
-       struct pending_extent_op *extent_op;
-       struct btrfs_fs_info *info = extent_root->fs_info;
-       struct list_head delete_list;
-
-       INIT_LIST_HEAD(&delete_list);
-       extent_ins = &extent_root->fs_info->extent_ins;
-       pending_del = &extent_root->fs_info->pending_del;
-
-again:
-       mutex_lock(&info->extent_ins_mutex);
-       while (1) {
-               ret = find_first_extent_bit(pending_del, search, &start, &end,
-                                           EXTENT_WRITEBACK);
-               if (ret) {
-                       if (all && skipped && !nr) {
-                               search = 0;
-                               continue;
-                       }
-                       mutex_unlock(&info->extent_ins_mutex);
-                       break;
-               }
-
-               ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-               if (!ret) {
-                       search = end+1;
-                       skipped = 1;
-
-                       if (need_resched()) {
-                               mutex_unlock(&info->extent_ins_mutex);
-                               cond_resched();
-                               mutex_lock(&info->extent_ins_mutex);
-                       }
-
-                       continue;
-               }
-               BUG_ON(ret < 0);
-
-               ret = get_state_private(pending_del, start, &priv);
-               BUG_ON(ret);
-               extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
-               clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-                                 GFP_NOFS);
-               if (!test_range_bit(extent_ins, start, end,
-                                   EXTENT_WRITEBACK, 0)) {
-                       list_add_tail(&extent_op->list, &delete_list);
-                       nr++;
-               } else {
-                       kfree(extent_op);
-
-                       ret = get_state_private(&info->extent_ins, start,
-                                               &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                               (unsigned long)priv;
-
-                       clear_extent_bits(&info->extent_ins, start, end,
-                                         EXTENT_WRITEBACK, GFP_NOFS);
-
-                       if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                               list_add_tail(&extent_op->list, &delete_list);
-                               search = end + 1;
-                               nr++;
-                               continue;
-                       }
-
-                       mutex_lock(&extent_root->fs_info->pinned_mutex);
-                       ret = pin_down_bytes(trans, extent_root, start,
-                                            end + 1 - start, 0);
-                       mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
-                       ret = update_block_group(trans, extent_root, start,
-                                               end + 1 - start, 0, ret > 0);
-
-                       unlock_extent(extent_ins, start, end, GFP_NOFS);
-                       BUG_ON(ret);
-                       kfree(extent_op);
+               if (must_clean) {
+                       clean_tree_block(NULL, root, must_clean);
+                       btrfs_tree_unlock(must_clean);
+                       free_extent_buffer(must_clean);
                }
-               if (ret)
-                       err = ret;
-
-               search = end + 1;
 
-               if (need_resched()) {
-                       mutex_unlock(&info->extent_ins_mutex);
-                       cond_resched();
-                       mutex_lock(&info->extent_ins_mutex);
+               if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+                       ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
+                       BUG_ON(ret);
                }
-       }
 
-       if (nr) {
-               ret = free_extents(trans, extent_root, &delete_list);
+               ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+                                        mark_free);
                BUG_ON(ret);
        }
-
-       if (all && skipped) {
-               INIT_LIST_HEAD(&delete_list);
-               search = 0;
-               nr = 0;
-               goto again;
-       }
-
-       return err;
+       btrfs_free_path(path);
+       return ret;
 }
 
 /*
  * remove an extent from the root, returns 0 on success
  */
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, u64 parent,
-                              u64 root_objectid, u64 ref_generation,
-                              u64 owner_objectid, int pin)
+                                       struct btrfs_root *root,
+                                       u64 bytenr, u64 num_bytes, u64 parent,
+                                       u64 root_objectid, u64 ref_generation,
+                                       u64 owner_objectid, int pin,
+                                       int refs_to_drop)
 {
-       struct btrfs_root *extent_root = root->fs_info->extent_root;
-       int pending_ret;
-       int ret;
-
        WARN_ON(num_bytes < root->sectorsize);
-       if (root == extent_root) {
-               struct pending_extent_op *extent_op = NULL;
-
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                               bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                       u64 priv;
-                       ret = get_state_private(&root->fs_info->extent_ins,
-                                               bytenr, &priv);
-                       BUG_ON(ret);
-                       extent_op = (struct pending_extent_op *)
-                                               (unsigned long)priv;
-
-                       extent_op->del = 1;
-                       if (extent_op->type == PENDING_EXTENT_INSERT) {
-                               mutex_unlock(&root->fs_info->extent_ins_mutex);
-                               return 0;
-                       }
-               }
 
-               if (extent_op) {
-                       ref_generation = extent_op->orig_generation;
-                       parent = extent_op->orig_parent;
-               }
-
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
-
-               extent_op->type = PENDING_EXTENT_DELETE;
-               extent_op->bytenr = bytenr;
-               extent_op->num_bytes = num_bytes;
-               extent_op->parent = parent;
-               extent_op->orig_parent = parent;
-               extent_op->generation = ref_generation;
-               extent_op->orig_generation = ref_generation;
-               extent_op->level = (int)owner_objectid;
-               INIT_LIST_HEAD(&extent_op->list);
-               extent_op->del = 0;
-
-               set_extent_bits(&root->fs_info->pending_del,
-                               bytenr, bytenr + num_bytes - 1,
-                               EXTENT_WRITEBACK, GFP_NOFS);
-               set_state_private(&root->fs_info->pending_del,
-                                 bytenr, (unsigned long)extent_op);
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               return 0;
-       }
-       /* if metadata always pin */
-       if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-               if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-                       struct btrfs_block_group_cache *cache;
-
-                       /* btrfs_free_reserved_extent */
-                       cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-                       BUG_ON(!cache);
-                       btrfs_add_free_space(cache, bytenr, num_bytes);
-                       put_block_group(cache);
-                       update_reserved_extents(root, bytenr, num_bytes, 0);
-                       return 0;
-               }
+       /*
+        * metadata is always pinned; data is pinned when its ref was
+        * created in an earlier, already committed transaction
+        */
+       if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+           ref_generation != trans->transid)
                pin = 1;
-       }
 
-       /* if data pin when any transaction has committed this */
-       if (ref_generation != trans->transid)
-               pin = 1;
 
-       ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+       return __free_extent(trans, root, bytenr, num_bytes, parent,
                            root_objectid, ref_generation,
-                           owner_objectid, pin, pin == 0);
+                           owner_objectid, pin, pin == 0, refs_to_drop);
+}
+
+/*
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs left to process, it
+ * removes the ref head from the tree and runs it.
+ */
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root, u64 bytenr)
+{
+       struct btrfs_delayed_ref_head *head;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct btrfs_delayed_ref_node *ref;
+       struct rb_node *node;
+       int ret;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       head = btrfs_find_delayed_ref_head(trans, bytenr);
+       if (!head)
+               goto out;
+
+       node = rb_prev(&head->node.rb_node);
+       if (!node)
+               goto out;
+
+       ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+       /* there are still entries for this ref, we can't drop it */
+       if (ref->bytenr == bytenr)
+               goto out;
+
+       /*
+        * waiting for the lock here would deadlock.  If someone else has it
+        * locked they are already in the process of dropping it anyway
+        */
+       if (!mutex_trylock(&head->mutex))
+               goto out;
+
+       /*
+        * at this point we have a head with no other entries.  Go
+        * ahead and process it.
+        */
+       head->node.in_tree = 0;
+       rb_erase(&head->node.rb_node, &delayed_refs->root);
+
+       delayed_refs->num_entries--;
+
+       /*
+        * we don't take a ref on the node because we're removing it from the
+        * tree, so we just steal the ref the tree was holding.
+        */
+       delayed_refs->num_heads--;
+       if (list_empty(&head->cluster))
+               delayed_refs->num_heads_ready--;
+
+       list_del_init(&head->cluster);
+       spin_unlock(&delayed_refs->lock);
 
-       finish_current_insert(trans, root->fs_info->extent_root, 0);
-       pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-       return ret ? ret : pending_ret;
+       ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+                                 &head->node, head->must_insert_reserved);
+       BUG_ON(ret);
+       btrfs_put_delayed_ref(&head->node);
+       return 0;
+out:
+       spin_unlock(&delayed_refs->lock);
+       return 0;
 }
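
check_ref_cleanup() above uses mutex_trylock() because sleeping on
head->mutex while holding delayed_refs->lock could deadlock against the
thread that owns the head.  A pthread sketch of the bail-out-on-contention
idiom (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t head_lock = PTHREAD_MUTEX_INITIALIZER;

    /* returns 1 if we cleaned the entry up, 0 if someone else owns it */
    static int try_cleanup(void)
    {
            pthread_mutex_lock(&tree_lock);
            if (pthread_mutex_trylock(&head_lock) != 0) {
                    /*
                     * the holder is already tearing the entry down;
                     * blocking here with tree_lock held could deadlock
                     */
                    pthread_mutex_unlock(&tree_lock);
                    return 0;
            }
            /* ... detach the entry from the tree ... */
            pthread_mutex_unlock(&head_lock);
            pthread_mutex_unlock(&tree_lock);
            return 1;
    }

    int main(void)
    {
            printf("cleaned=%d\n", try_cleanup());
            return 0;
    }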
 
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -2734,9 +2498,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
 
-       ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
-                                 root_objectid, ref_generation,
-                                 owner_objectid, pin);
+       /*
+        * tree log blocks never actually go into the extent allocation
+        * tree, just update pinning info and exit early.
+        *
+        * data extents referenced by the tree log do need to have
+        * their reference counts bumped.
+        */
+       if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+           owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+               mutex_lock(&root->fs_info->pinned_mutex);
+
+               /* unlocks the pinned mutex */
+               btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+               update_reserved_extents(root, bytenr, num_bytes, 0);
+               ret = 0;
+       } else {
+               ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+                                      root_objectid, ref_generation,
+                                      owner_objectid,
+                                      BTRFS_DROP_DELAYED_REF, 1);
+               BUG_ON(ret);
+               ret = check_ref_cleanup(trans, root, bytenr);
+               BUG_ON(ret);
+       }
        return ret;
 }
 
@@ -2787,7 +2572,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 
        if (data & BTRFS_BLOCK_GROUP_METADATA) {
                last_ptr = &root->fs_info->last_alloc;
-               empty_cluster = 64 * 1024;
+               if (!btrfs_test_opt(root, SSD))
+                       empty_cluster = 64 * 1024;
        }
 
        if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
@@ -3014,16 +2800,18 @@ loop_check:
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
 {
        struct btrfs_block_group_cache *cache;
-       struct list_head *l;
 
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
               (unsigned long long)(info->total_bytes - info->bytes_used -
                                    info->bytes_pinned - info->bytes_reserved),
               (info->full) ? "" : "not ");
+       printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+              " may_use=%llu, used=%llu\n", info->total_bytes,
+              info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
+              info->bytes_used);
 
        down_read(&info->groups_sem);
-       list_for_each(l, &info->block_groups) {
-               cache = list_entry(l, struct btrfs_block_group_cache, list);
+       list_for_each_entry(cache, &info->block_groups, list) {
                spin_lock(&cache->lock);
                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
                       "%llu pinned %llu reserved\n",
@@ -3047,24 +2835,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
        u64 search_start = 0;
-       u64 alloc_profile;
        struct btrfs_fs_info *info = root->fs_info;
 
-       if (data) {
-               alloc_profile = info->avail_data_alloc_bits &
-                       info->data_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-       } else if (root == root->fs_info->chunk_root) {
-               alloc_profile = info->avail_system_alloc_bits &
-                       info->system_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-       } else {
-               alloc_profile = info->avail_metadata_alloc_bits &
-                       info->metadata_alloc_profile;
-               data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-       }
+       data = btrfs_get_alloc_profile(root, data);
 again:
-       data = btrfs_reduce_alloc_profile(root, data);
        /*
         * the only place that sets empty_size is btrfs_realloc_node, which
         * is not called recursively on allocations
@@ -3148,10 +2922,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u64 parent,
                                         u64 root_objectid, u64 ref_generation,
-                                        u64 owner, struct btrfs_key *ins)
+                                        u64 owner, struct btrfs_key *ins,
+                                        int ref_mod)
 {
        int ret;
-       int pending_ret;
        u64 super_used;
        u64 root_used;
        u64 num_bytes = ins->offset;
@@ -3176,33 +2950,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        btrfs_set_root_used(&root->root_item, root_used + num_bytes);
        spin_unlock(&info->delalloc_lock);
 
-       if (root == extent_root) {
-               struct pending_extent_op *extent_op;
-
-               extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-               BUG_ON(!extent_op);
-
-               extent_op->type = PENDING_EXTENT_INSERT;
-               extent_op->bytenr = ins->objectid;
-               extent_op->num_bytes = ins->offset;
-               extent_op->parent = parent;
-               extent_op->orig_parent = 0;
-               extent_op->generation = ref_generation;
-               extent_op->orig_generation = 0;
-               extent_op->level = (int)owner;
-               INIT_LIST_HEAD(&extent_op->list);
-               extent_op->del = 0;
-
-               mutex_lock(&root->fs_info->extent_ins_mutex);
-               set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-                               ins->objectid + ins->offset - 1,
-                               EXTENT_WRITEBACK, GFP_NOFS);
-               set_state_private(&root->fs_info->extent_ins,
-                                 ins->objectid, (unsigned long)extent_op);
-               mutex_unlock(&root->fs_info->extent_ins_mutex);
-               goto update_block;
-       }
-
        memcpy(&keys[0], ins, sizeof(*ins));
        keys[1].objectid = ins->objectid;
        keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3213,37 +2960,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
                                       sizes, 2);
        BUG_ON(ret);
 
        extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_extent_item);
-       btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+       btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
                             struct btrfs_extent_ref);
 
        btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
        btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
        btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-       btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+       btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
 
        btrfs_mark_buffer_dirty(path->nodes[0]);
 
        trans->alloc_exclude_start = 0;
        trans->alloc_exclude_nr = 0;
        btrfs_free_path(path);
-       finish_current_insert(trans, extent_root, 0);
-       pending_ret = del_pending_extents(trans, extent_root, 0);
 
        if (ret)
                goto out;
-       if (pending_ret) {
-               ret = pending_ret;
-               goto out;
-       }
 
-update_block:
        ret = update_block_group(trans, root, ins->objectid,
                                 ins->offset, 1, 0);
        if (ret) {
@@ -3265,9 +3006,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
 
        if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
                return 0;
-       ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                           ref_generation, owner, ins);
-       update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+       ret = btrfs_add_delayed_ref(trans, ins->objectid,
+                                   ins->offset, parent, root_objectid,
+                                   ref_generation, owner,
+                                   BTRFS_ADD_DELAYED_EXTENT, 0);
+       BUG_ON(ret);
        return ret;
 }
 
@@ -3294,7 +3038,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        put_block_group(block_group);
        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                           ref_generation, owner, ins);
+                                           ref_generation, owner, ins, 1);
        return ret;
 }
 
@@ -3313,26 +3057,25 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       u64 search_end, struct btrfs_key *ins, u64 data)
 {
        int ret;
-
        ret = __btrfs_reserve_extent(trans, root, num_bytes,
                                     min_alloc_size, empty_size, hint_byte,
                                     search_end, ins, data);
        BUG_ON(ret);
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-               ret = __btrfs_alloc_reserved_extent(trans, root, parent,
-                                       root_objectid, ref_generation,
-                                       owner_objectid, ins);
+               ret = btrfs_add_delayed_ref(trans, ins->objectid,
+                                           ins->offset, parent, root_objectid,
+                                           ref_generation, owner_objectid,
+                                           BTRFS_ADD_DELAYED_EXTENT, 0);
                BUG_ON(ret);
-
-       } else {
-               update_reserved_extents(root, ins->objectid, ins->offset, 1);
        }
+       update_reserved_extents(root, ins->objectid, ins->offset, 1);
        return ret;
 }
 
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
-                                           u64 bytenr, u32 blocksize)
+                                           u64 bytenr, u32 blocksize,
+                                           int level)
 {
        struct extent_buffer *buf;
 
@@ -3340,9 +3083,13 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        if (!buf)
                return ERR_PTR(-ENOMEM);
        btrfs_set_header_generation(buf, trans->transid);
+       btrfs_set_buffer_lockdep_class(buf, level);
        btrfs_tree_lock(buf);
        clean_tree_block(trans, root, buf);
+
+       btrfs_set_lock_blocking(buf);
        btrfs_set_buffer_uptodate(buf);
+
        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
                set_extent_dirty(&root->dirty_log_pages, buf->start,
                         buf->start + buf->len - 1, GFP_NOFS);
@@ -3351,6 +3098,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                         buf->start + buf->len - 1, GFP_NOFS);
        }
        trans->blocks_used++;
+       /* this returns a buffer locked for blocking */
        return buf;
 }
 
@@ -3379,7 +3127,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                return ERR_PTR(ret);
        }
 
-       buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize);
+       buf = btrfs_init_new_buffer(trans, root, ins.objectid,
+                                   blocksize, level);
        return buf;
 }
 
@@ -3388,37 +3137,74 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
        u64 leaf_owner;
        u64 leaf_generation;
+       struct refsort *sorted;
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        int i;
        int nritems;
        int ret;
+       int refi = 0;
+       int slot;
 
        BUG_ON(!btrfs_is_leaf(leaf));
        nritems = btrfs_header_nritems(leaf);
        leaf_owner = btrfs_header_owner(leaf);
        leaf_generation = btrfs_header_generation(leaf);
 
+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+       /* we do this loop twice.  The first time we build a list
+        * of the extents we have a reference on, then we sort the list
+        * by bytenr.  The second time around we actually do the
+        * extent freeing.
+        */
        for (i = 0; i < nritems; i++) {
                u64 disk_bytenr;
                cond_resched();
 
                btrfs_item_key_to_cpu(leaf, &key, i);
+
+               /* only extents have references, skip everything else */
                if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
                        continue;
+
                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+
+               /* inline extents live in the btree, they don't have refs */
                if (btrfs_file_extent_type(leaf, fi) ==
                    BTRFS_FILE_EXTENT_INLINE)
                        continue;
-               /*
-                * FIXME make sure to insert a trans record that
-                * repeats the snapshot del on crash
-                */
+
                disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+
+               /* holes don't have refs */
                if (disk_bytenr == 0)
                        continue;
 
-               ret = __btrfs_free_extent(trans, root, disk_bytenr,
+               sorted[refi].bytenr = disk_bytenr;
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       for (i = 0; i < refi; i++) {
+               u64 disk_bytenr;
+
+               disk_bytenr = sorted[i].bytenr;
+               slot = sorted[i].slot;
+
+               cond_resched();
+
+               btrfs_item_key_to_cpu(leaf, &key, slot);
+               if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+
+               fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+               ret = btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf->start, leaf_owner, leaf_generation,
                                key.objectid, 0);
@@ -3428,6 +3214,8 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                wake_up(&root->fs_info->transaction_throttle);
                cond_resched();
        }
+out:
+       kfree(sorted);
        return 0;
 }
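
refsort_cmp() is used above but defined outside this hunk.  A plausible
comparator for sorting {bytenr, slot} pairs by disk byte number, sketched
here with libc qsort (the struct refsort layout is assumed, not taken from
this diff):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct refsort {
            uint64_t bytenr;
            uint32_t slot;
    };

    /* compare, don't subtract: a u64 difference can't be returned as int */
    static int refsort_cmp(const void *a, const void *b)
    {
            const struct refsort *ra = a, *rb = b;

            if (ra->bytenr < rb->bytenr)
                    return -1;
            if (ra->bytenr > rb->bytenr)
                    return 1;
            return 0;
    }

    int main(void)
    {
            struct refsort refs[] = {
                    { 3 << 20, 0 }, { 1 << 20, 1 }, { 2 << 20, 2 },
            };
            int i;

            qsort(refs, 3, sizeof(refs[0]), refsort_cmp);
            for (i = 0; i < 3; i++)
                    printf("bytenr=%llu slot=%u\n",
                           (unsigned long long)refs[i].bytenr,
                           (unsigned)refs[i].slot);
            return 0;
    }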
 
@@ -3437,10 +3225,26 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 {
        int i;
        int ret;
-       struct btrfs_extent_info *info = ref->extents;
+       struct btrfs_extent_info *info;
+       struct refsort *sorted;
+
+       if (ref->nritems == 0)
+               return 0;
 
+       sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
        for (i = 0; i < ref->nritems; i++) {
-               ret = __btrfs_free_extent(trans, root, info->bytenr,
+               sorted[i].bytenr = ref->extents[i].bytenr;
+               sorted[i].slot = i;
+       }
+       sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * now that the extents are sorted by bytenr, freeing them walks
+        * the extent allocation tree in order
+        */
+       for (i = 0; i < ref->nritems; i++) {
+               info = ref->extents + sorted[i].slot;
+               ret = btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
                                          info->objectid, 0);
@@ -3453,15 +3257,17 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
                info++;
        }
 
+       kfree(sorted);
        return 0;
 }
 
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root, u64 start,
                                     u64 len, u32 *refs)
 {
        int ret;
 
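+       /*
+        * pass the transaction in so the lookup accounts for any
+        * delayed ref updates still queued against this extent
+        */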
-       ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+       ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
        BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3496,6 +3302,153 @@ static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
        return ret;
 }
 
+/*
+ * this is used while deleting old snapshots, and it drops the refs
+ * on a whole subtree starting from a level 1 node.
+ *
+ * The idea is to sort all the leaf pointers, and then drop the
+ * ref on all the leaves in order.  Most of the time the leaves
+ * will have ref cache entries, so no leaf IOs will be required to
+ * find the extents they have references on.
+ *
+ * For each leaf, any references it has are also dropped in order.
+ *
+ * This ends up dropping the references in something close to optimal
+ * order for reading and modifying the extent allocation tree.
+ */
+static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_path *path)
+{
+       u64 bytenr;
+       u64 root_owner;
+       u64 root_gen;
+       struct extent_buffer *eb = path->nodes[1];
+       struct extent_buffer *leaf;
+       struct btrfs_leaf_ref *ref;
+       struct refsort *sorted = NULL;
+       int nritems = btrfs_header_nritems(eb);
+       int ret;
+       int i;
+       int refi = 0;
+       int slot = path->slots[1];
+       u32 blocksize = btrfs_level_size(root, 0);
+       u32 refs;
+
+       if (nritems == 0)
+               goto out;
+
+       root_owner = btrfs_header_owner(eb);
+       root_gen = btrfs_header_generation(eb);
+       sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
+       if (!sorted)
+               return -ENOMEM;
+
+       /*
+        * step one, sort all the leaf pointers so we don't scribble
+        * randomly into the extent allocation tree
+        */
+       for (i = slot; i < nritems; i++) {
+               sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
+               sorted[refi].slot = i;
+               refi++;
+       }
+
+       /*
+        * nritems won't be zero, but if we're picking up drop_snapshot
+        * after a crash, slot might be > 0, so double check things
+        * just in case.
+        */
+       if (refi == 0)
+               goto out;
+
+       sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
+
+       /*
+        * the first loop frees everything the leaves point to
+        */
+       for (i = 0; i < refi; i++) {
+               u64 ptr_gen;
+
+               bytenr = sorted[i].bytenr;
+
+               /*
+                * check the reference count on this leaf.  If it is > 1,
+                * our ref is dropped in the second loop below and we
+                * don't touch any of the refs the leaf points to.
+                */
+               ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                               blocksize, &refs);
+               BUG_ON(ret);
+               if (refs != 1)
+                       continue;
+
+               ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
+
+               /*
+                * the leaf only had one reference, which means the
+                * only thing pointing to this leaf is the snapshot
+                * we're deleting.  It isn't possible for the reference
+                * count to increase again later.
+                *
+                * The reference cache is checked for the leaf,
+                * and if found we'll be able to drop any refs held by
+                * the leaf without needing to read it in.
+                */
+               ref = btrfs_lookup_leaf_ref(root, bytenr);
+               if (ref && ref->generation != ptr_gen) {
+                       btrfs_free_leaf_ref(root, ref);
+                       ref = NULL;
+               }
+               if (ref) {
+                       ret = cache_drop_leaf_ref(trans, root, ref);
+                       BUG_ON(ret);
+                       btrfs_remove_leaf_ref(root, ref);
+                       btrfs_free_leaf_ref(root, ref);
+               } else {
+                       /*
+                        * the leaf wasn't in the reference cache, so
+                        * we have to read it.
+                        */
+                       leaf = read_tree_block(root, bytenr, blocksize,
+                                              ptr_gen);
+                       ret = btrfs_drop_leaf_ref(trans, root, leaf);
+                       BUG_ON(ret);
+                       free_extent_buffer(leaf);
+               }
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+
+       /*
+        * run through the loop again to free the refs on the leaves.
+        * This is faster than doing it in the loop above because
+        * the leaves are likely to be clustered together.  We end up
+        * working in nice chunks on the extent allocation tree.
+        */
+       for (i = 0; i < refi; i++) {
+               bytenr = sorted[i].bytenr;
+               ret = btrfs_free_extent(trans, root, bytenr,
+                                       blocksize, eb->start,
+                                       root_owner, root_gen, 0, 1);
+               BUG_ON(ret);
+
+               atomic_inc(&root->fs_info->throttle_gen);
+               wake_up(&root->fs_info->transaction_throttle);
+               cond_resched();
+       }
+out:
+       kfree(sorted);
+
+       /*
+        * update the path to show we've processed the entire level 1
+        * node.  This will get saved into the root's drop_progress
+        * field so these drops are not repeated if this transaction
+        * commits.
+        */
+       path->slots[1] = nritems;
+       return 0;
+}
+
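When a later transaction resumes the walk, btrfs_drop_snapshot reads the saved marker back before calling walk_down_tree. A sketch of that resume step as a hypothetical helper (the field names match struct btrfs_root_item):

static void load_drop_progress(struct btrfs_root_item *root_item,
			       struct btrfs_key *key, int *level)
{
	/* restart from the level and key recorded by the last drop */
	*level = root_item->drop_level;
	btrfs_disk_key_to_cpu(key, &root_item->drop_progress);
}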
 /*
  * helper function for drop_snapshot, this walks down the tree dropping ref
  * counts as it goes.
@@ -3511,14 +3464,13 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
        struct extent_buffer *next;
        struct extent_buffer *cur;
        struct extent_buffer *parent;
-       struct btrfs_leaf_ref *ref;
        u32 blocksize;
        int ret;
        u32 refs;
 
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
-       ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+       ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
                                path->nodes[*level]->len, &refs);
        BUG_ON(ret);
        if (refs > 1)
@@ -3538,24 +3490,54 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                if (path->slots[*level] >=
                    btrfs_header_nritems(cur))
                        break;
+
+               /* the new code goes down to level 1 and does all the
+                * leaves pointed to by that node in bulk.  So, this check
+                * for level 0 will always be false.
+                *
+                * But, the disk format allows the drop_progress
+                * field in the root to leave things in a state where
+                * a leaf will need cleaning up here.  If someone crashes
+                * with the old code and then boots with the new code,
+                * we might find a leaf here.
+                */
                if (*level == 0) {
                        ret = btrfs_drop_leaf_ref(trans, root, cur);
                        BUG_ON(ret);
                        break;
                }
+
+               /*
+                * once we get to level one, process the whole node
+                * at once, including everything below it.
+                */
+               if (*level == 1) {
+                       ret = drop_level_one_refs(trans, root, path);
+                       BUG_ON(ret);
+                       break;
+               }
+
                bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);
 
-               ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+               ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                               blocksize, &refs);
                BUG_ON(ret);
+
+               /*
+                * if there is more than one reference, we don't need
+                * to read that node to drop any references it has.  We
+                * just drop the ref we hold on that node and move on to the
+                * next slot in this level.
+                */
                if (refs != 1) {
                        parent = path->nodes[*level];
                        root_owner = btrfs_header_owner(parent);
                        root_gen = btrfs_header_generation(parent);
                        path->slots[*level]++;
 
-                       ret = __btrfs_free_extent(trans, root, bytenr,
+                       ret = btrfs_free_extent(trans, root, bytenr,
                                                blocksize, parent->start,
                                                root_owner, root_gen,
                                                *level - 1, 1);
@@ -3567,46 +3549,12 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 
                        continue;
                }
+
                /*
-                * at this point, we have a single ref, and since the
-                * only place referencing this extent is a dead root
-                * the reference count should never go higher.
-                * So, we don't need to check it again
+                * we need to keep freeing things in the next level down.
+                * read the block and loop around to process it
                 */
-               if (*level == 1) {
-                       ref = btrfs_lookup_leaf_ref(root, bytenr);
-                       if (ref && ref->generation != ptr_gen) {
-                               btrfs_free_leaf_ref(root, ref);
-                               ref = NULL;
-                       }
-                       if (ref) {
-                               ret = cache_drop_leaf_ref(trans, root, ref);
-                               BUG_ON(ret);
-                               btrfs_remove_leaf_ref(root, ref);
-                               btrfs_free_leaf_ref(root, ref);
-                               *level = 0;
-                               break;
-                       }
-               }
-               next = btrfs_find_tree_block(root, bytenr, blocksize);
-               if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
-                       free_extent_buffer(next);
-
-                       next = read_tree_block(root, bytenr, blocksize,
-                                              ptr_gen);
-                       cond_resched();
-#if 0
-                       /*
-                        * this is a debugging check and can go away
-                        * the ref should never go all the way down to 1
-                        * at this point
-                        */
-                       ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
-                                               &refs);
-                       BUG_ON(ret);
-                       WARN_ON(refs != 1);
-#endif
-               }
+               next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
                        free_extent_buffer(path->nodes[*level-1]);
@@ -3631,11 +3579,16 @@ out:
        root_owner = btrfs_header_owner(parent);
        root_gen = btrfs_header_generation(parent);
 
-       ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+       /*
+        * cleanup and free the reference on the last node
+        * we processed
+        */
+       ret = btrfs_free_extent(trans, root, bytenr, blocksize,
                                  parent->start, root_owner, root_gen,
                                  *level, 1);
        free_extent_buffer(path->nodes[*level]);
        path->nodes[*level] = NULL;
+
        *level += 1;
        BUG_ON(ret);
 
@@ -3687,6 +3640,7 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 
                next = read_tree_block(root, bytenr, blocksize, ptr_gen);
                btrfs_tree_lock(next);
+               btrfs_set_lock_blocking(next);
 
                ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
                                              &refs);
@@ -3754,6 +3708,13 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
                        struct extent_buffer *node;
                        struct btrfs_disk_key disk_key;
+
+                       /*
+                        * there is more work to do at this level.
+                        * Bump the slot number, and update the
+                        * drop_progress marker to reflect how far
+                        * we've gotten
+                        */
                        node = path->nodes[i];
                        path->slots[i]++;
                        *level = i;
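The marker write itself sits just past this hunk. Folded into a hypothetical helper, it amounts to the following (field names per struct btrfs_root_item):

static void record_drop_progress(struct btrfs_root_item *root_item,
				 struct extent_buffer *node,
				 int slot, int level)
{
	struct btrfs_disk_key disk_key;

	/* remember the first key we have not finished dropping */
	btrfs_node_key(node, &disk_key, slot);
	memcpy(&root_item->drop_progress, &disk_key, sizeof(disk_key));
	root_item->drop_level = level;
}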
@@ -3765,6 +3726,11 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
                        return 0;
                } else {
                        struct extent_buffer *parent;
+
+                       /*
+                        * this whole node is done, free our reference
+                        * on it and go up one level
+                        */
                        if (path->nodes[*level] == root->node)
                                parent = path->nodes[*level];
                        else
@@ -3806,6 +3772,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_path *path;
        int i;
        int orig_level;
+       int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
 
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -3847,6 +3814,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
        }
        while (1) {
+               unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
@@ -3859,12 +3827,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        break;
                if (wret < 0)
                        ret = wret;
-               if (trans->transaction->in_commit) {
+               if (trans->transaction->in_commit ||
+                   trans->transaction->delayed_refs.flushing) {
                        ret = -EAGAIN;
                        break;
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
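+               /*
+                * the frees above only queue delayed ref updates;
+                * drain what has built up, in at most 16 rounds, so
+                * the backlog stays bounded while we loop
+                */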
+               for (update_count = 0; update_count < 16; update_count++) {
+                       update = trans->delayed_ref_updates;
+                       trans->delayed_ref_updates = 0;
+                       if (update)
+                               btrfs_run_delayed_refs(trans, root, update);
+                       else
+                               break;
+               }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
@@ -3891,13 +3868,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
-       BUG_ON(!btrfs_tree_locked(parent));
+       btrfs_assert_tree_locked(parent);
        parent_level = btrfs_header_level(parent);
        extent_buffer_get(parent);
        path->nodes[parent_level] = parent;
        path->slots[parent_level] = btrfs_header_nritems(parent);
 
-       BUG_ON(!btrfs_tree_locked(node));
+       btrfs_assert_tree_locked(node);
        level = btrfs_header_level(node);
        extent_buffer_get(node);
        path->nodes[level] = node;
@@ -4444,7 +4421,7 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
        u64 lock_end = 0;
        u64 num_bytes;
        u64 ext_offset;
-       u64 first_pos;
+       u64 search_end = (u64)-1;
        u32 nritems;
        int nr_scaned = 0;
        int extent_locked = 0;
@@ -4452,7 +4429,6 @@ static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
        int ret;
 
        memcpy(&key, leaf_key, sizeof(key));
-       first_pos = INT_LIMIT(loff_t) - extent_key->offset;
        if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
                if (key.objectid < ref_path->owner_objectid ||
                    (key.objectid == ref_path->owner_objectid &&
@@ -4501,7 +4477,7 @@ next:
                        if ((key.objectid > ref_path->owner_objectid) ||
                            (key.objectid == ref_path->owner_objectid &&
                             key.type > BTRFS_EXTENT_DATA_KEY) ||
-                           (key.offset >= first_pos + extent_key->offset))
+                           key.offset >= search_end)
                                break;
                }
 
@@ -4534,8 +4510,10 @@ next:
                num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
                ext_offset = btrfs_file_extent_offset(leaf, fi);
 
-               if (first_pos > key.offset - ext_offset)
-                       first_pos = key.offset - ext_offset;
+               if (search_end == (u64)-1) {
+                       search_end = key.offset - ext_offset +
+                               btrfs_file_extent_ram_bytes(leaf, fi);
+               }
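+               /*
+                * example with made up numbers: a file extent item at
+                * key.offset 1M that starts 256k into an extent backed
+                * by 1M of ram bytes gives search_end = 1M - 256k + 1M.
+                * No extent item at or beyond that file offset can
+                * still point into this extent.
+                */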
 
                if (!extent_locked) {
                        lock_start = key.offset;
@@ -4724,7 +4702,7 @@ next:
                }
 skip:
                if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
-                   key.offset >= first_pos + extent_key->offset)
+                   key.offset >= search_end)
                        break;
 
                cond_resched();
@@ -4778,6 +4756,7 @@ int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
                ref->bytenr = buf->start;
                ref->owner = btrfs_header_owner(buf);
                ref->generation = btrfs_header_generation(buf);
+
                ret = btrfs_add_leaf_ref(root, ref, 0);
                WARN_ON(ret);
                btrfs_free_leaf_ref(root, ref);
@@ -4907,6 +4886,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
                                        root->root_key.objectid,
                                        trans->transid, key.objectid);
                BUG_ON(ret);
+
                ret = btrfs_free_extent(trans, root,
                                        bytenr, num_bytes, leaf->start,
                                        btrfs_header_owner(leaf),
@@ -5218,9 +5198,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
                                ref_path, NULL, NULL);
        BUG_ON(ret);
 
-       if (root == root->fs_info->extent_root)
-               btrfs_extent_post_op(trans, root);
-
        return 0;
 }
 
@@ -5351,7 +5328,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
                        prev_block = block_start;
                }
 
+               mutex_lock(&extent_root->fs_info->trans_mutex);
                btrfs_record_root_in_trans(found_root);
+               mutex_unlock(&extent_root->fs_info->trans_mutex);
                if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        /*
                         * try to update data extent references while
@@ -5486,6 +5465,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
 
+       path->leave_spinning = 1;
        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (ret)
                goto out;
@@ -5656,6 +5636,9 @@ again:
        btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
        mutex_unlock(&root->fs_info->cleaner_mutex);
 
+       trans = btrfs_start_transaction(info->tree_root, 1);
+       btrfs_commit_transaction(trans, info->tree_root);
+
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
@@ -5789,6 +5772,7 @@ out:
 int btrfs_free_block_groups(struct btrfs_fs_info *info)
 {
        struct btrfs_block_group_cache *block_group;
+       struct btrfs_space_info *space_info;
        struct rb_node *n;
 
        spin_lock(&info->block_group_cache_lock);
@@ -5810,6 +5794,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                spin_lock(&info->block_group_cache_lock);
        }
        spin_unlock(&info->block_group_cache_lock);
+
+       /* now that all the block groups are freed, go through and
+        * free all the space_info structs.  This is only called during
+        * the final stages of unmount, and so we know nobody is
+        * using them.  We call synchronize_rcu() once before we start,
+        * just to be on the safe side.
+        */
+       synchronize_rcu();
+
+       while (!list_empty(&info->space_info)) {
+               space_info = list_entry(info->space_info.next,
+                                       struct btrfs_space_info,
+                                       list);
+
+               list_del(&space_info->list);
+               kfree(space_info);
+       }
        return 0;
 }
 
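For context, the lookup path walks info->space_info under rcu_read_lock(), so whoever creates a space_info is expected to publish it with the RCU list helpers. A sketch of that publish side as a hypothetical helper (the real code lives in update_space_info, outside this hunk):

static void publish_space_info(struct btrfs_fs_info *info,
			       struct btrfs_space_info *found)
{
	/*
	 * pairs with the rcu_read_lock() walk in the lookup path; once
	 * published, an entry may only be freed after synchronize_rcu()
	 */
	list_add_rcu(&found->list, &info->space_info);
}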
@@ -5930,9 +5931,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                sizeof(cache->item));
        BUG_ON(ret);
 
-       finish_current_insert(trans, extent_root, 0);
-       ret = del_pending_extents(trans, extent_root, 0);
-       BUG_ON(ret);
        set_avail_alloc_bits(extent_root->fs_info, type);
 
        return 0;
@@ -5957,9 +5955,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
 
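+       /*
+        * unlink the block group from the cache tree under the lock
+        * before tearing down its free space cache, so lookups can't
+        * find a group that is partially freed
+        */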
-       btrfs_remove_free_space_cache(block_group);
+       spin_lock(&root->fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &root->fs_info->block_group_cache_tree);
+       spin_unlock(&root->fs_info->block_group_cache_lock);
+       btrfs_remove_free_space_cache(block_group);
        down_write(&block_group->space_info->groups_sem);
        list_del(&block_group->list);
        up_write(&block_group->space_info->groups_sem);