Btrfs: Btree defrag on the extent-mapping tree as well

[linux-2.6-omap-h63xx.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 46f869d6d0fb9d4a5312ddd6d1d88569c0493286..5c05ecbc5726e79e2716faddd8aa35d31534f6b2 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -16,7 +16,6 @@
   * Boston, MA 021110-1307, USA.
   */
  
-#include <linux/module.h>
  #include <linux/buffer_head.h>
  #include <linux/fs.h>
  #include <linux/pagemap.h>
@@ -31,6 +30,8 @@
  #include <linux/writeback.h>
  #include <linux/statfs.h>
  #include <linux/compat.h>
+#include <linux/bit_spinlock.h>
+#include <linux/version.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@ -46,6 +47,7 @@ struct btrfs_iget_args {
  static struct inode_operations btrfs_dir_inode_operations;
  static struct inode_operations btrfs_symlink_inode_operations;
  static struct inode_operations btrfs_dir_ro_inode_operations;
+static struct inode_operations btrfs_special_inode_operations;
  static struct inode_operations btrfs_file_inode_operations;
  static struct address_space_operations btrfs_aops;
  static struct address_space_operations btrfs_symlink_aops;
@@ -75,11 +77,11 @@ void btrfs_read_locked_inode(struct inode *inode)
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_key location;
         u64 alloc_group_block;
+       u32 rdev;
         int ret;
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         mutex_lock(&root->fs_info->fs_mutex);
  
         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
@@ -105,6 +107,8 @@ void btrfs_read_locked_inode(struct inode *inode)
         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
         inode->i_blocks = btrfs_inode_nblocks(inode_item);
         inode->i_generation = btrfs_inode_generation(inode_item);
+       inode->i_rdev = 0;
+       rdev = btrfs_inode_rdev(inode_item);
         alloc_group_block = btrfs_inode_block_group(inode_item);
         BTRFS_I(inode)->block_group = btrfs_lookup_block_group(root->fs_info,
                                                        alloc_group_block);
@@ -115,12 +119,6 @@ void btrfs_read_locked_inode(struct inode *inode)
         mutex_unlock(&root->fs_info->fs_mutex);
  
         switch (inode->i_mode & S_IFMT) {
-#if 0
-       default:
-               init_special_inode(inode, inode->i_mode,
-                                  btrfs_inode_rdev(inode_item));
-               break;
-#endif
         case S_IFREG:
                 inode->i_mapping->a_ops = &btrfs_aops;
                 inode->i_fop = &btrfs_file_operations;
@@ -137,6 +135,9 @@ void btrfs_read_locked_inode(struct inode *inode)
                 inode->i_op = &btrfs_symlink_inode_operations;
                 inode->i_mapping->a_ops = &btrfs_symlink_aops;
                 break;
+       default:
+               init_special_inode(inode, inode->i_mode, rdev);
+               break;
         }
         return;
  
@@ -163,6 +164,7 @@ static void fill_inode_item(struct btrfs_inode_item *item,
         btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
         btrfs_set_inode_nblocks(item, inode->i_blocks);
         btrfs_set_inode_generation(item, inode->i_generation);
+       btrfs_set_inode_rdev(item, inode->i_rdev);
         btrfs_set_inode_block_group(item,
                                     BTRFS_I(inode)->block_group->key.objectid);
  }
@@ -177,7 +179,6 @@ static int btrfs_update_inode(struct btrfs_trans_handle *trans,
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         ret = btrfs_lookup_inode(trans, root, path,
                                  &BTRFS_I(inode)->location, 1);
         if (ret) {
@@ -213,8 +214,11 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
         struct btrfs_dir_item *di;
  
         path = btrfs_alloc_path();
-       BUG_ON(!path);
-       btrfs_init_path(path);
+       if (!path) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
                                     name, name_len, -1);
         if (IS_ERR(di)) {
@@ -227,7 +231,8 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
         }
         objectid = btrfs_disk_key_objectid(&di->location);
         ret = btrfs_delete_one_dir_name(trans, root, path, di);
-       BUG_ON(ret);
+       if (ret)
+               goto err;
         btrfs_release_path(root, path);
  
         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
@@ -241,16 +246,16 @@ static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
                 goto err;
         }
         ret = btrfs_delete_one_dir_name(trans, root, path, di);
-       BUG_ON(ret);
  
         dentry->d_inode->i_ctime = dir->i_ctime;
  err:
         btrfs_free_path(path);
         if (!ret) {
                 dir->i_size -= name_len * 2;
+               dir->i_mtime = dir->i_ctime = CURRENT_TIME;
                 btrfs_update_inode(trans, root, dir);
                 drop_nlink(dentry->d_inode);
-               btrfs_update_inode(trans, root, dentry->d_inode);
+               ret = btrfs_update_inode(trans, root, dentry->d_inode);
                 dir->i_sb->s_dirt = 1;
         }
         return ret;
@@ -289,7 +294,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         mutex_lock(&root->fs_info->fs_mutex);
         trans = btrfs_start_transaction(root, 1);
         btrfs_set_trans_block_group(trans, dir);
@@ -360,50 +364,16 @@ static int btrfs_free_inode(struct btrfs_trans_handle *trans,
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         ret = btrfs_lookup_inode(trans, root, path,
                                  &BTRFS_I(inode)->location, -1);
-       BUG_ON(ret);
-       ret = btrfs_del_item(trans, root, path);
-       BUG_ON(ret);
+       if (ret > 0)
+               ret = -ENOENT;
+       if (!ret)
+               ret = btrfs_del_item(trans, root, path);
         btrfs_free_path(path);
         return ret;
  }
  
-/*
- * truncates go from a high offset to a low offset.  So, walk
- * from hi to lo in the node and issue readas.  Stop when you find
- * keys from a different objectid
- */
-static void reada_truncate(struct btrfs_root *root, struct btrfs_path *path,
-                          u64 objectid)
-{
-       struct btrfs_node *node;
-       int i;
-       int nritems;
-       u64 item_objectid;
-       u64 blocknr;
-       int slot;
-       int ret;
-
-       if (!path->nodes[1])
-               return;
-       node = btrfs_buffer_node(path->nodes[1]);
-       slot = path->slots[1];
-       if (slot == 0)
-               return;
-       nritems = btrfs_header_nritems(&node->header);
-       for (i = slot - 1; i >= 0; i--) {
-               item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-               if (item_objectid != objectid)
-                       break;
-               blocknr = btrfs_node_blockptr(node, i);
-               ret = readahead_tree_block(root, blocknr);
-               if (ret)
-                       break;
-       }
-}
-
  /*
   * this can truncate away extent items, csum items and directory items.
   * It starts at a high offset and removes keys until it can't find
@@ -430,6 +400,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
         int del_item;
  
         path = btrfs_alloc_path();
+       path->reada = -1;
         BUG_ON(!path);
         /* FIXME, add redo link to tree so we don't leak on crash */
         key.objectid = inode->i_ino;
@@ -446,7 +417,6 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
                         BUG_ON(path->slots[0] == 0);
                         path->slots[0]--;
                 }
-               reada_truncate(root, path, inode->i_ino);
                 leaf = btrfs_buffer_leaf(path->nodes[0]);
                 found_key = &leaf->items[path->slots[0]].key;
                 found_type = btrfs_disk_key_type(found_key);
@@ -504,7 +474,7 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
                                                          extent_num_blocks);
                                 inode->i_blocks -= (orig_num_blocks -
                                         extent_num_blocks) << 3;
-                               mark_buffer_dirty(path->nodes[0]);
+                               btrfs_mark_buffer_dirty(path->nodes[0]);
                         } else {
                                 extent_start =
                                         btrfs_file_extent_disk_blocknr(fi);
@@ -520,7 +490,8 @@ static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
                 }
                 if (del_item) {
                         ret = btrfs_del_item(trans, root, path);
-                       BUG_ON(ret);
+                       if (ret)
+                               goto error;
                 } else {
                         break;
                 }
@@ -566,7 +537,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
                 goto out;
  
         if (!PageUptodate(page)) {
-               ret = mpage_readpage(page, btrfs_get_block);
+               ret = btrfs_readpage(NULL, page);
                 lock_page(page);
                 if (!PageUptodate(page)) {
                         ret = -EIO;
@@ -581,19 +552,22 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
                                  page->index << PAGE_CACHE_SHIFT,
                                  (page->index + 1) << PAGE_CACHE_SHIFT,
                                  &alloc_hint);
-       BUG_ON(ret);
-       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1,
+       if (ret)
+               goto out;
+       ret = btrfs_alloc_extent(trans, root, inode->i_ino, 1, 0,
                                  alloc_hint, (u64)-1, &ins, 1);
-       BUG_ON(ret);
+       if (ret)
+               goto out;
         ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
                                        page->index << PAGE_CACHE_SHIFT,
                                        ins.objectid, 1, 1);
-       BUG_ON(ret);
+       if (ret)
+               goto out;
         SetPageChecked(page);
         kaddr = kmap(page);
         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
         flush_dcache_page(page);
-       btrfs_csum_file_block(trans, root, inode->i_ino,
+       ret = btrfs_csum_file_block(trans, root, inode->i_ino,
                               page->index << PAGE_CACHE_SHIFT,
                               kaddr, PAGE_CACHE_SIZE);
         kunmap(page);
@@ -637,9 +611,10 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                 btrfs_set_trans_block_group(trans, inode);
                 err = btrfs_insert_file_extent(trans, root, inode->i_ino,
                                                pos, 0, 0, hole_size);
-               BUG_ON(err);
                 btrfs_end_transaction(trans, root);
                 mutex_unlock(&root->fs_info->fs_mutex);
+               if (err)
+                       return err;
         }
  out:
         err = inode_setattr(inode, attr);
@@ -661,12 +636,20 @@ void btrfs_delete_inode(struct inode *inode)
         trans = btrfs_start_transaction(root, 1);
         btrfs_set_trans_block_group(trans, inode);
         ret = btrfs_truncate_in_trans(trans, root, inode);
-       BUG_ON(ret);
-       btrfs_free_inode(trans, root, inode);
+       if (ret)
+               goto no_delete_lock;
+       ret = btrfs_free_inode(trans, root, inode);
+       if (ret)
+               goto no_delete_lock;
         btrfs_end_transaction(trans, root);
         mutex_unlock(&root->fs_info->fs_mutex);
         btrfs_btree_balance_dirty(root);
         return;
+
+no_delete_lock:
+       btrfs_end_transaction(trans, root);
+       mutex_unlock(&root->fs_info->fs_mutex);
+       btrfs_btree_balance_dirty(root);
  no_delete:
         clear_inode(inode);
  }
@@ -687,7 +670,6 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
                                     namelen, 0);
         if (!di || IS_ERR(di)) {
@@ -811,36 +793,6 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
         return d_splice_alias(inode, dentry);
  }
  
-/*
- * readahead one full node of leaves as long as their keys include
- * the objectid supplied
- */
-static void reada_leaves(struct btrfs_root *root, struct btrfs_path *path,
-                        u64 objectid)
-{
-       struct btrfs_node *node;
-       int i;
-       u32 nritems;
-       u64 item_objectid;
-       u64 blocknr;
-       int slot;
-       int ret;
-
-       if (!path->nodes[1])
-               return;
-       node = btrfs_buffer_node(path->nodes[1]);
-       slot = path->slots[1];
-       nritems = btrfs_header_nritems(&node->header);
-       for (i = slot + 1; i < nritems; i++) {
-               item_objectid = btrfs_disk_key_objectid(&node->ptrs[i].key);
-               if (item_objectid != objectid)
-                       break;
-               blocknr = btrfs_node_blockptr(node, i);
-               ret = readahead_tree_block(root, blocknr);
-               if (ret)
-                       break;
-       }
-}
  static unsigned char btrfs_filetype_table[] = {
         DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
  };
@@ -874,19 +826,17 @@ static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
         btrfs_set_key_type(&key, key_type);
         key.offset = filp->f_pos;
         path = btrfs_alloc_path();
-       btrfs_init_path(path);
+       path->reada = 1;
         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
         if (ret < 0)
                 goto err;
         advance = 0;
-       reada_leaves(root, path, inode->i_ino);
         while(1) {
                 leaf = btrfs_buffer_leaf(path->nodes[0]);
                 nritems = btrfs_header_nritems(&leaf->header);
                 slot = path->slots[0];
                 if (advance || slot >= nritems) {
                         if (slot >= nritems -1) {
-                               reada_leaves(root, path, inode->i_ino);
                                 ret = btrfs_next_leaf(root, path);
                                 if (ret)
                                         break;
@@ -952,7 +902,7 @@ int btrfs_write_inode(struct inode *inode, int wait)
  }
  
  /*
- * This is somewhat expense, updating the tree every time the
+ * This is somewhat expensive, updating the tree every time the
   * inode changes.  But, it is most likely to find the inode in cache.
   * FIXME, needs more benchmarking...there are no reasons other than performance
   * to keep or drop this code.
@@ -968,7 +918,6 @@ void btrfs_dirty_inode(struct inode *inode)
         btrfs_update_inode(trans, root, inode);
         btrfs_end_transaction(trans, root);
         mutex_unlock(&root->fs_info->fs_mutex);
-       btrfs_btree_balance_dirty(root);
  }
  
  static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
@@ -1009,8 +958,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
  
         ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
-       BUG_ON(ret);
-
+       if (ret)
+               return ERR_PTR(ret);
         insert_inode_hash(inode);
         return inode;
  }
@@ -1026,6 +975,7 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
         int ret;
         struct btrfs_key key;
         struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
+       struct inode *parent_inode;
         key.objectid = inode->i_ino;
         key.flags = 0;
         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
@@ -1036,7 +986,9 @@ static int btrfs_add_link(struct btrfs_trans_handle *trans,
                                     dentry->d_parent->d_inode->i_ino,
                                     &key, btrfs_inode_type(inode));
         if (ret == 0) {
-               dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
+               parent_inode = dentry->d_parent->d_inode;
+               parent_inode->i_size += dentry->d_name.len * 2;
+               parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
                 ret = btrfs_update_inode(trans, root,
                                          dentry->d_parent->d_inode);
         }
@@ -1056,6 +1008,58 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
         return err;
  }
  
+/*
+ * Create a special inode for dentry inside dir.
+ *
+ * Returns 0 on success or a negative errno: -EINVAL when rdev fails
+ * new_valid_dev(), -ENOSPC when no free objectid is found, otherwise
+ * whatever btrfs_new_inode(), btrfs_add_nondir() or btrfs_update_inode()
+ * report.
+ *
+ * The whole operation runs inside one transaction under fs_mutex.
+ */
+static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
+                       int mode, dev_t rdev)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(dir)->root;
+       struct inode *inode;
+       int err;
+       int drop_inode = 0;
+       u64 objectid;
+
+       if (!new_valid_dev(rdev))
+               return -EINVAL;
+
+       mutex_lock(&root->fs_info->fs_mutex);
+       trans = btrfs_start_transaction(root, 1);
+       btrfs_set_trans_block_group(trans, dir);
+
+       err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
+       if (err) {
+               err = -ENOSPC;
+               goto out_unlock;
+       }
+
+       inode = btrfs_new_inode(trans, root, objectid,
+                               BTRFS_I(dir)->block_group, mode);
+       err = PTR_ERR(inode);
+       if (IS_ERR(inode))
+               goto out_unlock;
+
+       btrfs_set_trans_block_group(trans, inode);
+       err = btrfs_add_nondir(trans, dentry, inode);
+       if (err)
+               drop_inode = 1;
+       else {
+               inode->i_op = &btrfs_special_inode_operations;
+               init_special_inode(inode, inode->i_mode, rdev);
+               /*
+                * btrfs_new_inode() inserted the inode item before i_rdev
+                * was assigned above, so write the inode again or the
+                * device number never reaches the tree.
+                *
+                * NOTE(review): the inode is deliberately not dropped on
+                * failure here; btrfs_add_nondir() has presumably handed
+                * our reference to the dentry already -- confirm.
+                */
+               err = btrfs_update_inode(trans, root, inode);
+       }
+       dir->i_sb->s_dirt = 1;
+       btrfs_update_inode_block_group(trans, inode);
+       btrfs_update_inode_block_group(trans, dir);
+out_unlock:
+       btrfs_end_transaction(trans, root);
+       mutex_unlock(&root->fs_info->fs_mutex);
+
+       /* only set when the link step failed, so inode is valid here */
+       if (drop_inode) {
+               inode_dec_link_count(inode);
+               iput(inode);
+       }
+       btrfs_btree_balance_dirty(root);
+       return err;
+}
+
  static int btrfs_create(struct inode *dir, struct dentry *dentry,
                         int mode, struct nameidata *nd)
  {
@@ -1128,7 +1132,9 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                 drop_inode = 1;
         dir->i_sb->s_dirt = 1;
         btrfs_update_inode_block_group(trans, dir);
-       btrfs_update_inode(trans, root, inode);
+       err = btrfs_update_inode(trans, root, inode);
+       if (err)
+               drop_inode = 1;
  
         btrfs_end_transaction(trans, root);
         mutex_unlock(&root->fs_info->fs_mutex);
@@ -1263,10 +1269,11 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
  
         path = btrfs_alloc_path();
         BUG_ON(!path);
-       btrfs_init_path(path);
         if (create & BTRFS_GET_BLOCK_CREATE) {
-               WARN_ON(1);
-               /* this almost but not quite works */
+               /*
+                * danger!, this only works if the page is properly up
+                * to date somehow
+                */
                 trans = btrfs_start_transaction(root, 1);
                 if (!trans) {
                         err = -ENOMEM;
@@ -1280,7 +1287,7 @@ static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
         }
  
         ret = btrfs_lookup_file_extent(NULL, root, path,
-                                      inode->i_ino,
+                                      objectid,
                                        iblock << inode->i_blkbits, 0);
         if (ret < 0) {
                 err = ret;
@@ -1353,20 +1360,28 @@ not_found:
         if (create & BTRFS_GET_BLOCK_CREATE) {
                 struct btrfs_key ins;
                 ret = btrfs_alloc_extent(trans, root, inode->i_ino,
-                                        1, alloc_hint, (u64)-1,
+                                        1, 0, alloc_hint, (u64)-1,
                                          &ins, 1);
-               BUG_ON(ret);
+               if (ret) {
+                       err = ret;
+                       goto out;
+               }
                 ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
                                                iblock << inode->i_blkbits,
                                                ins.objectid, ins.offset,
                                                ins.offset);
-               BUG_ON(ret);
-               SetPageChecked(result->b_page);
+               if (ret) {
+                       err = ret;
+                       goto out;
+               }
                 btrfs_map_bh_to_logical(root, result, ins.objectid);
         }
  out:
-       if (trans)
-               err = btrfs_end_transaction(trans, root);
+       if (trans) {
+               ret = btrfs_end_transaction(trans, root);
+               if (!err)
+                       err = ret;
+       }
         btrfs_free_path(path);
         return err;
  }
@@ -1382,6 +1397,39 @@ int btrfs_get_block(struct inode *inode, sector_t iblock,
         return err;
  }
  
+/*
+ * Map one block of a file for reading and remember its checksum.
+ *
+ * Calls btrfs_get_block_lock() under fs_mutex to map result, then looks up
+ * the csum item for the block's byte offset in the file.  When one is found,
+ * BTRFS_CRC32_SIZE bytes of it are copied into the storage of
+ * result->b_private itself (the pointer field is used as a small value
+ * buffer, it is not a pointer to anything).  When no csum item exists
+ * (-ENOENT or -EFBIG from the lookup) b_private is cleared and 0 is
+ * returned.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int btrfs_get_block_csum(struct inode *inode, sector_t iblock,
+                               struct buffer_head *result, int create)
+{
+       int ret;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct page *page = result->b_page;
+       /* widen before shifting so 32-bit page indexes do not wrap */
+       u64 offset = ((u64)page->index << PAGE_CACHE_SHIFT) +
+                    bh_offset(result);
+       struct btrfs_csum_item *item;
+       struct btrfs_path *path = NULL;
+
+       mutex_lock(&root->fs_info->fs_mutex);
+       ret = btrfs_get_block_lock(inode, iblock, result, create);
+       if (ret)
+               goto out;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               /* no csum could be looked up, do not leave a stale one */
+               result->b_private = NULL;
+               ret = -ENOMEM;
+               goto out;
+       }
+       item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, offset, 0);
+       if (IS_ERR(item)) {
+               ret = PTR_ERR(item);
+               /* a csum that isn't present is a preallocated region. */
+               if (ret == -ENOENT || ret == -EFBIG)
+                       ret = 0;
+               result->b_private = NULL;
+               goto out;
+       }
+       memcpy((char *)&result->b_private, &item->csum, BTRFS_CRC32_SIZE);
+out:
+       if (path)
+               btrfs_free_path(path);
+       mutex_unlock(&root->fs_info->fs_mutex);
+       return ret;
+}
+
  static int btrfs_get_block_bmap(struct inode *inode, sector_t iblock,
                            struct buffer_head *result, int create)
  {
@@ -1403,9 +1451,197 @@ static int btrfs_prepare_write(struct file *file, struct page *page,
         return block_prepare_write(page, from, to, btrfs_get_block);
  }
  
-static int btrfs_readpage(struct file *file, struct page *page)
+/*
+ * Log a failed buffer read: prints the name of the buffer's block device
+ * and the block number stored in the buffer head.
+ */
+static void buffer_io_error(struct buffer_head *bh)
+{
+       char devname[BDEVNAME_SIZE];
+       const char *dev = bdevname(bh->b_bdev, devname);
+       unsigned long long blocknr = bh->b_blocknr;
+
+       printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
+              dev, blocknr);
+}
+
+/*
+ * I/O completion handler for buffers submitted by btrfs_readpage() - pages
+ * which come unlocked at the end of I/O.
+ *
+ * bh:       the buffer whose read finished; must have the async_read bit set.
+ * uptodate: non-zero when the read succeeded.
+ *
+ * On a successful read with a non-zero bh->b_private, the data is
+ * checksummed with btrfs_csum_data() and compared against the bytes held in
+ * the b_private field itself.  On mismatch a message is printed and the
+ * buffer's data is overwritten with 0x01 bytes; the buffer is still marked
+ * uptodate.  A zero b_private skips verification.
+ *
+ * Once no buffer on the page has async_read set any more, the page is
+ * marked uptodate (if every buffer is uptodate and no error was flagged)
+ * and unlocked.
+ */
+static void btrfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+       unsigned long flags;
+       struct buffer_head *first;
+       struct buffer_head *tmp;
+       struct page *page;
+       int page_uptodate = 1;
+       struct inode *inode;
+       int ret;
+
+       BUG_ON(!buffer_async_read(bh));
+
+       page = bh->b_page;
+       inode = page->mapping->host;
+       if (uptodate) {
+               void *kaddr;
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+               if (bh->b_private) {
+                       char csum[BTRFS_CRC32_SIZE];
+                       kaddr = kmap_atomic(page, KM_IRQ0);
+                       ret = btrfs_csum_data(root, kaddr + bh_offset(bh),
+                                             bh->b_size, csum);
+                       BUG_ON(ret);
+                       /* the expected csum lives in the pointer field itself */
+                       if (memcmp(csum, &bh->b_private, BTRFS_CRC32_SIZE)) {
+                               u64 offset;
+                               /* widen first: index << shift wraps on 32-bit */
+                               offset = ((u64)page->index << PAGE_CACHE_SHIFT) +
+                                       bh_offset(bh);
+                               printk("btrfs csum failed ino %lu off %llu\n",
+                                      inode->i_ino,
+                                      (unsigned long long)offset);
+                               /* poison the data so it is not mistaken for valid */
+                               memset(kaddr + bh_offset(bh), 1, bh->b_size);
+                               flush_dcache_page(page);
+                       }
+                       kunmap_atomic(kaddr, KM_IRQ0);
+               }
+               set_buffer_uptodate(bh);
+       } else {
+               clear_buffer_uptodate(bh);
+               if (printk_ratelimit())
+                       buffer_io_error(bh);
+               SetPageError(page);
+       }
+
+       /*
+        * Be _very_ careful from here on. Bad things can happen if
+        * two buffer heads end IO at almost the same time and both
+        * decide that the page is now completely done.
+        */
+       first = page_buffers(page);
+       local_irq_save(flags);
+       bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+       clear_buffer_async_read(bh);
+       unlock_buffer(bh);
+       tmp = bh;
+       do {
+               if (!buffer_uptodate(tmp))
+                       page_uptodate = 0;
+               /* another buffer on this page is still in flight */
+               if (buffer_async_read(tmp)) {
+                       BUG_ON(!buffer_locked(tmp));
+                       goto still_busy;
+               }
+               tmp = tmp->b_this_page;
+       } while (tmp != bh);
+       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+       local_irq_restore(flags);
+
+       /*
+        * If none of the buffers had errors and they are all
+        * uptodate then we can set the page uptodate.
+        */
+       if (page_uptodate && !PageError(page))
+               SetPageUptodate(page);
+       unlock_page(page);
+       return;
+
+still_busy:
+       bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+       local_irq_restore(flags);
+       return;
+}
+
+/*
+ * Read one locked page of a file, buffer by buffer, with checksums.
+ *
+ * The structure follows the generic "read page" helper for filesystems
+ * with normal get_block functionality, but buffers are mapped with
+ * btrfs_get_block_csum() and completed by btrfs_end_buffer_async_read().
+ * Reads the page asynchronously --- the unlock_buffer() and
+ * set/clear_buffer_uptodate() functions propagate buffer state into the
+ * page struct once IO has completed.
+ *
+ * file is not referenced by this function.  page must be locked on entry.
+ *
+ * Always returns 0.  Mapping errors are reported by setting the page's
+ * error flag, not through the return value.  When no buffer needs IO the
+ * page is unlocked here; otherwise unlocking is left to the completion
+ * handler.
+ */
+int btrfs_readpage(struct file *file, struct page *page)
  {
-       return mpage_readpage(page, btrfs_get_block);
+       struct inode *inode = page->mapping->host;
+       sector_t iblock, lblock;
+       struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+       unsigned int blocksize;
+       int nr, i;
+       int fully_mapped = 1;
+
+       BUG_ON(!PageLocked(page));
+       blocksize = 1 << inode->i_blkbits;
+       if (!page_has_buffers(page))
+               create_empty_buffers(page, blocksize, 0);
+       head = page_buffers(page);
+
+       iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); /* first block of this page */
+       lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits; /* i_size in blocks, rounded up */
+       bh = head;
+       nr = 0; /* number of buffers collected in arr[] for IO */
+       i = 0; /* index of bh within the page */
+
+       do {
+               if (buffer_uptodate(bh))
+                       continue;
+
+               if (!buffer_mapped(bh)) {
+                       int err = 0;
+
+                       fully_mapped = 0;
+                       if (iblock < lblock) {
+                               WARN_ON(bh->b_size != blocksize);
+                               err = btrfs_get_block_csum(inode, iblock,
+                                                          bh, 0);
+                               if (err)
+                                       SetPageError(page);
+                       }
+                       if (!buffer_mapped(bh)) { /* still unmapped: zero-fill instead of reading */
+                               void *kaddr = kmap_atomic(page, KM_USER0);
+                               memset(kaddr + i * blocksize, 0, blocksize);
+                               flush_dcache_page(page);
+                               kunmap_atomic(kaddr, KM_USER0);
+                               if (!err)
+                                       set_buffer_uptodate(bh);
+                               continue;
+                       }
+                       /*
+                        * get_block() might have updated the buffer
+                        * synchronously
+                        */
+                       if (buffer_uptodate(bh))
+                               continue;
+               }
+               arr[nr++] = bh;
+       } while (i++, iblock++, (bh = bh->b_this_page) != head); /* continue lands here, so the counters still advance */
+
+       if (fully_mapped)
+               SetPageMappedToDisk(page);
+
+       if (!nr) {
+               /*
+                * All buffers are uptodate - we can set the page uptodate
+                * as well. But not if get_block() returned an error.
+                */
+               if (!PageError(page))
+                       SetPageUptodate(page);
+               unlock_page(page);
+               return 0;
+       }
+
+       /*
+        * Stage two: lock the buffers and install the checksum-verifying
+        * completion handler on each of them.
+        */
+       for (i = 0; i < nr; i++) {
+               bh = arr[i];
+               lock_buffer(bh);
+               bh->b_end_io = btrfs_end_buffer_async_read;
+               set_buffer_async_read(bh);
+       }
+
+       /*
+        * Stage 3: start the IO.  Check for uptodateness
+        * inside the buffer lock in case another process reading
+        * the underlying blockdev brought it uptodate (the sct fix).
+        * Buffers that are already uptodate get the completion handler
+        * called directly instead of being submitted.
+        */
+       for (i = 0; i < nr; i++) {
+               bh = arr[i];
+               if (buffer_uptodate(bh))
+                       btrfs_end_buffer_async_read(bh, 1);
+               else
+                       submit_bh(READ, bh);
+       }
+       return 0;
  }
  
  /*
@@ -1432,11 +1668,19 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
         struct buffer_head *bh, *head;
         const unsigned blocksize = 1 << inode->i_blkbits;
         int nr_underway = 0;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
  
         BUG_ON(!PageLocked(page));
  
         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
  
+       /* no csumming allowed when from PF_MEMALLOC */
+       if (current->flags & PF_MEMALLOC) {
+               redirty_page_for_writepage(wbc, page);
+               unlock_page(page);
+               return 0;
+       }
+
         if (!page_has_buffers(page)) {
                 create_empty_buffers(page, blocksize,
                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1504,6 +1748,23 @@ static int __btrfs_write_full_page(struct inode *inode, struct page *page,
                         continue;
                 }
                 if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
+                       struct btrfs_trans_handle *trans;
+                       int ret;
+                       u64 off = page->index << PAGE_CACHE_SHIFT;
+                       char *kaddr;
+
+                       off += bh_offset(bh);
+                       mutex_lock(&root->fs_info->fs_mutex);
+                       trans = btrfs_start_transaction(root, 1);
+                       btrfs_set_trans_block_group(trans, inode);
+                       kaddr = kmap(page);
+                       btrfs_csum_file_block(trans, root, inode->i_ino,
+                                                   off, kaddr + bh_offset(bh),
+                                                   bh->b_size);
+                       kunmap(page);
+                       ret = btrfs_end_transaction(trans, root);
+                       BUG_ON(ret);
+                       mutex_unlock(&root->fs_info->fs_mutex);
                         mark_buffer_async_write(bh);
                 } else {
                         unlock_buffer(bh);
@@ -1625,6 +1886,52 @@ static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
         return __btrfs_write_full_page(inode, page, wbc);
  }
  
+/*
+ * btrfs_page_mkwrite() is not allowed to change the file size as it gets
+ * called from a page fault handler when a page is first dirtied. Hence we must
+ * be careful to check for EOF conditions here. We set the page up correctly
+ * for a written page which means we get ENOSPC checking when writing into
+ * holes and correct delalloc and unwritten extent mapping on filesystems that
+ * support these features.
+ *
+ * We are not allowed to take the i_mutex here so we have to play games to
+ * protect against truncate races as the page could now be beyond EOF.  Because
+ * vmtruncate() writes the inode size before removing pages, once we have the
+ * page lock we can determine safely if the page is beyond EOF. If it is not
+ * beyond EOF, then the page is guaranteed safe against truncation until we
+ * unlock the page.
+ */
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+       struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+       unsigned long end;      /* end offset within the page, capped at EOF */
+       loff_t size;
+       int ret = -EINVAL;      /* returned when the page was truncated away */
+
+       lock_page(page);
+       wait_on_page_writeback(page);
+       size = i_size_read(inode);
+       if ((page->mapping != inode->i_mapping) ||
+           (((loff_t)page->index << PAGE_CACHE_SHIFT) > size)) {
+               /* page got truncated out from underneath us */
+               goto out_unlock;
+       }
+
+       /* page is wholly or partially inside EOF (loff_t math: no 32-bit wrap) */
+       if ((((loff_t)page->index + 1) << PAGE_CACHE_SHIFT) > size)
+               end = size & ~PAGE_CACHE_MASK;
+       else
+               end = PAGE_CACHE_SIZE;
+       /* set the page up as written: prepare, then commit, bytes 0..end */
+       ret = btrfs_prepare_write(NULL, page, 0, end);
+       if (!ret)
+               ret = btrfs_commit_write(NULL, page, 0, end);
+
+out_unlock:
+       unlock_page(page);
+       return ret;
+}
+
  static void btrfs_truncate(struct inode *inode)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -1644,7 +1951,6 @@ static void btrfs_truncate(struct inode *inode)
  
         /* FIXME, add redo link to tree so we don't leak on crash */
         ret = btrfs_truncate_in_trans(trans, root, inode);
-       BUG_ON(ret);
         btrfs_update_inode(trans, root, inode);
         ret = btrfs_end_transaction(trans, root);
         BUG_ON(ret);
@@ -1684,6 +1990,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
         struct inode *inode;
         struct inode *dir;
         int ret;
+       int err;
         u64 objectid;
         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
  
@@ -1691,9 +1998,9 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
         trans = btrfs_start_transaction(root, 1);
         BUG_ON(!trans);
  
-       subvol = btrfs_alloc_free_block(trans, root, 0);
-       if (subvol == NULL)
-               return -ENOSPC;
+       subvol = btrfs_alloc_free_block(trans, root, 0, 0);
+       if (IS_ERR(subvol))
+               return PTR_ERR(subvol);
         leaf = btrfs_buffer_leaf(subvol);
         btrfs_set_header_nritems(&leaf->header, 0);
         btrfs_set_header_level(&leaf->header, 0);
@@ -1702,7 +2009,7 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
         btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
         memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
                sizeof(leaf->header.fsid));
-       mark_buffer_dirty(subvol);
+       btrfs_mark_buffer_dirty(subvol);
  
         inode_item = &root_item.inode;
         memset(inode_item, 0, sizeof(*inode_item));
@@ -1714,12 +2021,15 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
  
         btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
         btrfs_set_root_refs(&root_item, 1);
+       memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
+       root_item.drop_level = 0;
         brelse(subvol);
         subvol = NULL;
  
         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
                                        0, &objectid);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         btrfs_set_root_dirid(&root_item, new_dirid);
  
@@ -1729,7 +2039,8 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                 &root_item);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         /*
          * insert the directory item
@@ -1739,10 +2050,12 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
                                     name, namelen, dir->i_ino, &key,
                                     BTRFS_FT_DIR);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         ret = btrfs_commit_transaction(trans, root);
-       BUG_ON(ret);
+       if (ret)
+               goto fail_commit;
  
         new_root = btrfs_read_fs_root(root->fs_info, &key);
         BUG_ON(!new_root);
@@ -1752,25 +2065,29 @@ static int create_subvol(struct btrfs_root *root, char *name, int namelen)
  
         inode = btrfs_new_inode(trans, new_root, new_dirid,
                                 BTRFS_I(dir)->block_group, S_IFDIR | 0700);
+       if (IS_ERR(inode))
+               goto fail;
         inode->i_op = &btrfs_dir_inode_operations;
         inode->i_fop = &btrfs_dir_file_operations;
+       new_root->inode = inode;
  
         ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         inode->i_nlink = 1;
         inode->i_size = 6;
         ret = btrfs_update_inode(trans, new_root, inode);
-       BUG_ON(ret);
-
-       ret = btrfs_commit_transaction(trans, new_root);
-       BUG_ON(ret);
-
-       iput(inode);
-
+       if (ret)
+               goto fail;
+fail:
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
+fail_commit:
         mutex_unlock(&root->fs_info->fs_mutex);
         btrfs_btree_balance_dirty(root);
-       return 0;
+       return ret;
  }
  
  static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
@@ -1779,6 +2096,7 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
         struct btrfs_key key;
         struct btrfs_root_item new_root_item;
         int ret;
+       int err;
         u64 objectid;
  
         if (!root->ref_cows)
@@ -1789,11 +2107,13 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
         BUG_ON(!trans);
  
         ret = btrfs_update_inode(trans, root, root->inode);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
                                        0, &objectid);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         memcpy(&new_root_item, &root->root_item,
                sizeof(new_root_item));
@@ -1806,7 +2126,8 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
  
         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                 &new_root_item);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         /*
          * insert the directory item
@@ -1817,16 +2138,20 @@ static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
                                     root->fs_info->sb->s_root->d_inode->i_ino,
                                     &key, BTRFS_FT_DIR);
  
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
         ret = btrfs_inc_root_ref(trans, root);
-       BUG_ON(ret);
+       if (ret)
+               goto fail;
  
-       ret = btrfs_commit_transaction(trans, root);
-       BUG_ON(ret);
+fail:
+       err = btrfs_commit_transaction(trans, root);
+       if (err && !ret)
+               ret = err;
         mutex_unlock(&root->fs_info->fs_mutex);
         btrfs_btree_balance_dirty(root);
-       return 0;
+       return ret;
  }
  
  int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
@@ -1863,12 +2188,21 @@ int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
                 btrfs_free_path(path);
                 if (di && !IS_ERR(di))
                         return -EEXIST;
+               if (IS_ERR(di))
+                       return PTR_ERR(di);
  
                 if (root == root->fs_info->tree_root)
                         ret = create_subvol(root, vol_args.name, namelen);
                 else
                         ret = create_snapshot(root, vol_args.name, namelen);
-               WARN_ON(ret);
+               break;
+
+       case BTRFS_IOC_DEFRAG:
+               mutex_lock(&root->fs_info->fs_mutex);
+               btrfs_defrag_root(root, 0);
+               btrfs_defrag_root(root->fs_info->extent_root, 0);
+               mutex_unlock(&root->fs_info->fs_mutex);
+               ret = 0;
                 break;
         default:
                 return -ENOTTY;
@@ -1933,42 +2267,43 @@ void btrfs_destroy_cachep(void)
                 kmem_cache_destroy(btrfs_path_cachep);
  }
  
+static struct kmem_cache *cache_create(const char *name, size_t size,
+                                      unsigned long extra_flags,
+                                      void (*ctor)(void *, struct kmem_cache *,
+                                                   unsigned long))
+{      /* create a slab cache with RECLAIM_ACCOUNT|MEM_SPREAD always set */
+       return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
+                                SLAB_MEM_SPREAD | extra_flags), ctor
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+                                ,NULL  /* extra trailing arg before 2.6.23 */
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) */
+                               );
+}
+
  int btrfs_init_cachep(void)
  {
-       btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
-                                            sizeof(struct btrfs_inode),
-                                            0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD),
-                                            init_once, NULL);
+       btrfs_inode_cachep = cache_create("btrfs_inode_cache",
+                                         sizeof(struct btrfs_inode),
+                                         0, init_once);
         if (!btrfs_inode_cachep)
                 goto fail;
-       btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+       btrfs_trans_handle_cachep = cache_create("btrfs_trans_handle_cache",
                                              sizeof(struct btrfs_trans_handle),
-                                            0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD),
-                                            NULL, NULL);
+                                            0, NULL);
         if (!btrfs_trans_handle_cachep)
                 goto fail;
-       btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+       btrfs_transaction_cachep = cache_create("btrfs_transaction_cache",
                                              sizeof(struct btrfs_transaction),
-                                            0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD),
-                                            NULL, NULL);
+                                            0, NULL);
         if (!btrfs_transaction_cachep)
                 goto fail;
-       btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
-                                            sizeof(struct btrfs_transaction),
-                                            0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD),
-                                            NULL, NULL);
+       btrfs_path_cachep = cache_create("btrfs_path_cache",
+                                        sizeof(struct btrfs_transaction),
+                                        0, NULL);
         if (!btrfs_path_cachep)
                 goto fail;
-       btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
-                                            256,
-                                            0, (SLAB_RECLAIM_ACCOUNT|
-                                               SLAB_MEM_SPREAD |
-                                               SLAB_DESTROY_BY_RCU),
-                                            NULL, NULL);
+       btrfs_bit_radix_cachep = cache_create("btrfs_radix", 256,
+                                             SLAB_DESTROY_BY_RCU, NULL);
         if (!btrfs_bit_radix_cachep)
                 goto fail;
         return 0;
@@ -2031,7 +2366,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
                 old_parent_oid = btrfs_disk_key_objectid(&di->location);
                 ret = btrfs_del_item(trans, root, path);
                 if (ret) {
-                       ret = -EIO;
                         goto out_fail;
                 }
                 btrfs_release_path(root, path);
@@ -2050,7 +2384,6 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
                 }
                 ret = btrfs_del_item(trans, root, path);
                 if (ret) {
-                       ret = -EIO;
                         goto out_fail;
                 }
                 btrfs_release_path(root, path);
@@ -2076,7 +2409,9 @@ static int btrfs_rename(struct inode * old_dir, struct dentry *old_dentry,
                         clear_nlink(new_inode);
                 else
                         drop_nlink(new_inode);
-               btrfs_update_inode(trans, root, new_inode);
+               ret = btrfs_update_inode(trans, root, new_inode);
+               if (ret)
+                       goto out_fail;
         }
         ret = btrfs_add_link(trans, new_dentry, old_inode);
         if (ret)
@@ -2148,7 +2483,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
         datasize = btrfs_file_extent_calc_inline_size(name_len);
         err = btrfs_insert_empty_item(trans, root, path, &key,
                                       datasize);
-       BUG_ON(err);
+       if (err) {
+               drop_inode = 1;
+               goto out_unlock;
+       }
         ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
                path->slots[0], struct btrfs_file_extent_item);
         btrfs_set_file_extent_generation(ei, trans->transid);
@@ -2157,18 +2495,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
         ptr = btrfs_file_extent_inline_start(ei);
         btrfs_memcpy(root, path->nodes[0]->b_data,
                      ptr, symname, name_len);
-       mark_buffer_dirty(path->nodes[0]);
+       btrfs_mark_buffer_dirty(path->nodes[0]);
         btrfs_free_path(path);
         inode->i_op = &btrfs_symlink_inode_operations;
         inode->i_mapping->a_ops = &btrfs_symlink_aops;
         inode->i_size = name_len - 1;
-       btrfs_update_inode(trans, root, inode);
-       err = 0;
+       err = btrfs_update_inode(trans, root, inode);
+       if (err)
+               drop_inode = 1;
  
  out_unlock:
         btrfs_end_transaction(trans, root);
         mutex_unlock(&root->fs_info->fs_mutex);
-
         if (drop_inode) {
                 inode_dec_link_count(inode);
                 iput(inode);
@@ -2187,6 +2525,7 @@ static struct inode_operations btrfs_dir_inode_operations = {
         .rename         = btrfs_rename,
         .symlink        = btrfs_symlink,
         .setattr        = btrfs_setattr,
+       .mknod          = btrfs_mknod,
  };
  
  static struct inode_operations btrfs_dir_ro_inode_operations = {
@@ -2223,6 +2562,11 @@ static struct inode_operations btrfs_file_inode_operations = {
         .setattr        = btrfs_setattr,
  };
  
+static struct inode_operations btrfs_special_inode_operations = {
+       .getattr        = btrfs_getattr,
+       .setattr        = btrfs_setattr,  /* only attr hooks are set here */
+};
+
  static struct inode_operations btrfs_symlink_inode_operations = {
         .readlink       = generic_readlink,
         .follow_link    = page_follow_link_light,