Btrfs: fix multi-device code to use raid policies set by mkfs

[linux-2.6-omap-h63xx.git] / fs / btrfs / inode.c
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index 65b4f864b0df5ab23024bdd61c337712d9bc78d8..f3abecc2d14c941da0ae39d643c08fb78225fbc3 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -83,6 +83,10 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
  
  static void btrfs_truncate(struct inode *inode);
  
+/*
+ * a very lame attempt at stopping writes when the FS is 85% full.  There
+ * are countless ways this is incorrect, but it is better than nothing.
+ */
  int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
                            int for_del)
  {
@@ -108,6 +112,12 @@ int btrfs_check_free_space(struct btrfs_root *root, u64 num_required,
         return ret;
  }
  
+/*
+ * when extent_io.c finds a delayed allocation range in the file,
+ * the call backs end up in this code.  The basic idea is to
+ * allocate extents on disk for the range, and create ordered data structs
+ * in ram to track those extents.
+ */
  static int cow_file_range(struct inode *inode, u64 start, u64 end)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -135,7 +145,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
  
         BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
         mutex_lock(&BTRFS_I(inode)->extent_mutex);
-       btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
+       btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
         while(num_bytes > 0) {
@@ -163,7 +173,7 @@ static int cow_file_range(struct inode *inode, u64 start, u64 end)
                                 break;
                         }
                         btrfs_drop_extent_cache(inode, start,
-                                               start + ins.offset - 1);
+                                               start + ins.offset - 1, 0);
                 }
                 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
@@ -185,6 +195,13 @@ out:
         return ret;
  }
  
+/*
+ * nocow writeback call back.  This checks for snapshots or COW copies
+ * of the extents that exist in the file, and COWs the file as required.
+ *
+ * If no cow copies or snapshots exist, we write directly to the existing
+ * blocks on disk
+ */
  static int run_delalloc_nocow(struct inode *inode, u64 start, u64 end)
  {
         u64 extent_start;
@@ -291,6 +308,9 @@ out:
         return err;
  }
  
+/*
+ * extent_io.c call back to do delayed allocation processing
+ */
  static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -305,6 +325,11 @@ static int run_delalloc_range(struct inode *inode, u64 start, u64 end)
         return ret;
  }
  
+/*
+ * extent_io.c set_bit_hook, used to track delayed allocation
+ * bytes in this file, and to maintain the list of inodes that
+ * have pending delalloc work to be done.
+ */
  int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
                        unsigned long old, unsigned long bits)
  {
@@ -323,6 +348,9 @@ int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
         return 0;
  }
  
+/*
+ * extent_io.c clear_bit_hook, see set_bit_hook for why
+ */
  int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
                          unsigned long old, unsigned long bits)
  {
@@ -349,6 +377,10 @@ int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
         return 0;
  }
  
+/*
+ * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
+ * we don't create bios that span stripes or chunks
+ */
  int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
                          size_t size, struct bio *bio)
  {
@@ -371,6 +403,14 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
         return 0;
  }
  
+/*
+ * in order to insert checksums into the metadata in large chunks,
+ * we wait until bio submission time.   All the pages in the bio are
+ * checksummed and sums are attached onto the ordered extent record.
+ *
+ * At IO completion time the csums attached on the ordered extent record
+ * are inserted into the btree
+ */
  int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                           int mirror_num)
  {
@@ -383,6 +423,10 @@ int __btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
         return btrfs_map_bio(root, rw, bio, mirror_num, 1);
  }
  
+/*
+ * extent_io.c submission hook. This does the right thing for csum calculation on write,
+ * or reading the csums from the tree before a read
+ */
  int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                           int mirror_num)
  {
@@ -408,6 +452,10 @@ mapit:
         return btrfs_map_bio(root, rw, bio, mirror_num, 0);
  }
  
+/*
+ * given a list of ordered sums record them in the inode.  This happens
+ * at IO completion time based on sums calculated at bio submission time.
+ */
  static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
                              struct inode *inode, u64 file_offset,
                              struct list_head *list)
@@ -430,12 +478,12 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
                                    GFP_NOFS);
  }
  
+/* see btrfs_writepage_start_hook for details on why this is required */
  struct btrfs_writepage_fixup {
         struct page *page;
         struct btrfs_work work;
  };
  
-/* see btrfs_writepage_start_hook for details on why this is required */
  void btrfs_writepage_fixup_worker(struct btrfs_work *work)
  {
         struct btrfs_writepage_fixup *fixup;
@@ -522,12 +570,19 @@ int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
         return -EAGAIN;
  }
  
+/* as ordered data IO finishes, this gets called so we can finish
+ * an ordered extent if the range of bytes in the file it covers are
+ * fully written.
+ */
  static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_trans_handle *trans;
         struct btrfs_ordered_extent *ordered_extent;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+       struct btrfs_file_extent_item *extent_item;
+       struct btrfs_path *path = NULL;
+       struct extent_buffer *leaf;
         u64 alloc_hint = 0;
         struct list_head list;
         struct btrfs_key ins;
@@ -544,20 +599,15 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
                 goto nocow;
  
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+
         lock_extent(io_tree, ordered_extent->file_offset,
                     ordered_extent->file_offset + ordered_extent->len - 1,
                     GFP_NOFS);
  
         INIT_LIST_HEAD(&list);
  
-       ins.objectid = ordered_extent->start;
-       ins.offset = ordered_extent->len;
-       ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-       ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
-                                         trans->transid, inode->i_ino,
-                                         ordered_extent->file_offset, &ins);
-       BUG_ON(ret);
         mutex_lock(&BTRFS_I(inode)->extent_mutex);
  
         ret = btrfs_drop_extents(trans, root, inode,
@@ -566,18 +616,42 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                  ordered_extent->len,
                                  ordered_extent->file_offset, &alloc_hint);
         BUG_ON(ret);
-       ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-                                      ordered_extent->file_offset,
-                                      ordered_extent->start,
-                                      ordered_extent->len,
-                                      ordered_extent->len, 0);
+
+       ins.objectid = inode->i_ino;
+       ins.offset = ordered_extent->file_offset;
+       ins.type = BTRFS_EXTENT_DATA_KEY;
+       ret = btrfs_insert_empty_item(trans, root, path, &ins,
+                                     sizeof(*extent_item));
         BUG_ON(ret);
+       leaf = path->nodes[0];
+       extent_item = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_file_extent_item);
+       btrfs_set_file_extent_generation(leaf, extent_item, trans->transid);
+       btrfs_set_file_extent_type(leaf, extent_item, BTRFS_FILE_EXTENT_REG);
+       btrfs_set_file_extent_disk_bytenr(leaf, extent_item,
+                                         ordered_extent->start);
+       btrfs_set_file_extent_disk_num_bytes(leaf, extent_item,
+                                            ordered_extent->len);
+       btrfs_set_file_extent_offset(leaf, extent_item, 0);
+       btrfs_set_file_extent_num_bytes(leaf, extent_item,
+                                       ordered_extent->len);
+       btrfs_mark_buffer_dirty(leaf);
  
         btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
                                 ordered_extent->file_offset +
-                               ordered_extent->len - 1);
+                               ordered_extent->len - 1, 0);
         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
+       ins.objectid = ordered_extent->start;
+       ins.offset = ordered_extent->len;
+       ins.type = BTRFS_EXTENT_ITEM_KEY;
+       ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
+                                         root->root_key.objectid,
+                                         trans->transid, inode->i_ino,
+                                         ordered_extent->file_offset, &ins);
+       BUG_ON(ret);
+       btrfs_release_path(root, path);
+
         inode->i_blocks += ordered_extent->len >> 9;
         unlock_extent(io_tree, ordered_extent->file_offset,
                     ordered_extent->file_offset + ordered_extent->len - 1,
@@ -586,9 +660,11 @@ nocow:
         add_pending_csums(trans, inode, ordered_extent->file_offset,
                           &ordered_extent->list);
  
+       mutex_lock(&BTRFS_I(inode)->extent_mutex);
         btrfs_ordered_update_i_size(inode, ordered_extent);
         btrfs_update_inode(trans, root, inode);
         btrfs_remove_ordered_extent(inode, ordered_extent);
+       mutex_unlock(&BTRFS_I(inode)->extent_mutex);
  
         /* once for us */
         btrfs_put_ordered_extent(ordered_extent);
@@ -596,6 +672,8 @@ nocow:
         btrfs_put_ordered_extent(ordered_extent);
  
         btrfs_end_transaction(trans, root);
+       if (path)
+               btrfs_free_path(path);
         return 0;
  }
  
@@ -605,6 +683,14 @@ int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
         return btrfs_finish_ordered_io(page->mapping->host, start, end);
  }
  
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
  struct io_failure_record {
         struct page *page;
         u64 start;
@@ -699,6 +785,10 @@ int btrfs_io_failed_hook(struct bio *failed_bio,
         return 0;
  }
  
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
  int btrfs_clean_io_failures(struct inode *inode, u64 start)
  {
         u64 private;
@@ -727,6 +817,11 @@ int btrfs_clean_io_failures(struct inode *inode, u64 start)
         return 0;
  }
  
+/*
+ * when reads are done, we need to check csums to verify the data is correct
+ * if there's a match, we allow the bio to finish.  If not, we go through
+ * the io_failure_record routines to find good copies
+ */
  int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
                                struct extent_state *state)
  {
@@ -854,7 +949,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
         int ret = 0, nr_unlink = 0, nr_truncate = 0;
  
         /* don't do orphan cleanup if the fs is readonly. */
-       if (root->inode->i_sb->s_flags & MS_RDONLY)
+       if (root->fs_info->sb->s_flags & MS_RDONLY)
                 return;
  
         path = btrfs_alloc_path();
@@ -866,8 +961,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
         key.offset = (u64)-1;
  
-       trans = btrfs_start_transaction(root, 1);
-       btrfs_set_trans_block_group(trans, root->inode);
  
         while (1) {
                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -907,7 +1000,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                  * crossing root thing.  we store the inode number in the
                  * offset of the orphan item.
                  */
-               inode = btrfs_iget_locked(root->inode->i_sb,
+               inode = btrfs_iget_locked(root->fs_info->sb,
                                           found_key.offset, root);
                 if (!inode)
                         break;
@@ -939,7 +1032,9 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                  * do a destroy_inode
                  */
                 if (is_bad_inode(inode)) {
+                       trans = btrfs_start_transaction(root, 1);
                         btrfs_orphan_del(trans, inode);
+                       btrfs_end_transaction(trans, root);
                         iput(inode);
                         continue;
                 }
@@ -962,9 +1057,11 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
  
         btrfs_free_path(path);
-       btrfs_end_transaction(trans, root);
  }
  
+/*
+ * read an inode from the btree into the in-memory inode
+ */
  void btrfs_read_locked_inode(struct inode *inode)
  {
         struct btrfs_path *path;
@@ -1058,6 +1155,9 @@ make_bad:
         make_bad_inode(inode);
  }
  
+/*
+ * given a leaf and an inode, copy the inode fields into the leaf
+ */
  static void fill_inode_item(struct btrfs_trans_handle *trans,
                             struct extent_buffer *leaf,
                             struct btrfs_inode_item *item,
@@ -1093,6 +1193,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
                                     BTRFS_I(inode)->block_group->key.objectid);
  }
  
+/*
+ * copy everything in the in-memory inode into the btree.
+ */
  int noinline btrfs_update_inode(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               struct inode *inode)
@@ -1126,6 +1229,11 @@ failed:
  }
  
  
+/*
+ * unlink helper that gets used here in inode.c and in the tree logging
+ * recovery code.  It removes a link in a directory with a given name, and
+ * also drops the back refs in the inode to the directory
+ */
  int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct inode *dir, struct inode *inode,
@@ -1284,7 +1392,7 @@ fail:
  /*
   * this can truncate away extent items, csum items and directory items.
   * It starts at a high offset and removes keys until it can't find
- * any higher than i_size.
+ * any higher than new_size
   *
   * csum items that cross the new i_size are truncated to the new size
   * as well.
@@ -1317,8 +1425,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
         u64 mask = root->sectorsize - 1;
  
         if (root->ref_cows)
-               btrfs_drop_extent_cache(inode,
-                                       new_size & (~mask), (u64)-1);
+               btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
         path = btrfs_alloc_path();
         path->reada = -1;
         BUG_ON(!path);
@@ -1433,10 +1540,7 @@ search_again:
                                         if (root->ref_cows)
                                                 dec_i_blocks(inode, num_dec);
                                 }
-                               if (root->ref_cows) {
-                                       root_gen =
-                                               btrfs_header_generation(leaf);
-                               }
+                               root_gen = btrfs_header_generation(leaf);
                                 root_owner = btrfs_header_owner(leaf);
                         }
                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
@@ -1477,7 +1581,7 @@ delete:
                 if (found_extent) {
                         ret = btrfs_free_extent(trans, root, extent_start,
                                                 extent_num_bytes,
-                                               root_owner,
+                                               leaf->start, root_owner,
                                                 root_gen, inode->i_ino,
                                                 found_key.offset, 0);
                         BUG_ON(ret);
@@ -1654,7 +1758,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
                                                        hole_start, 0, 0,
                                                        hole_size, 0);
                         btrfs_drop_extent_cache(inode, hole_start,
-                                               (u64)-1);
+                                               (u64)-1, 0);
                         btrfs_check_file(root, inode);
                 }
                 mutex_unlock(&BTRFS_I(inode)->extent_mutex);
@@ -1820,6 +1924,24 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
                 args->root == BTRFS_I(inode)->root);
  }
  
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                           struct btrfs_root *root, int wait)
+{
+       struct inode *inode;
+       struct btrfs_iget_args args;
+       args.ino = objectid;
+       args.root = root;
+
+       if (wait) {
+               inode = ilookup5(s, objectid, btrfs_find_actor,
+                                (void *)&args);
+       } else {
+               inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
+                                       (void *)&args);
+       }
+       return inode;
+}
+
  struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
                                 struct btrfs_root *root)
  {
@@ -2050,104 +2172,6 @@ err:
         return ret;
  }
  
-/* Kernels earlier than 2.6.28 still have the NFS deadlock where nfsd
-   will call the file system's ->lookup() method from within its
-   filldir callback, which in turn was called from the file system's
-   ->readdir() method. And will deadlock for many file systems. */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-
-struct nfshack_dirent {
-       u64             ino;
-       loff_t          offset;
-       int             namlen;
-       unsigned int    d_type;
-       char            name[];
-};
-
-struct nfshack_readdir {
-       char            *dirent;
-       size_t          used;
-       int             full;
-};
-
-
-
-static int btrfs_nfshack_filldir(void *__buf, const char *name, int namlen,
-                             loff_t offset, u64 ino, unsigned int d_type)
-{
-       struct nfshack_readdir *buf = __buf;
-       struct nfshack_dirent *de = (void *)(buf->dirent + buf->used);
-       unsigned int reclen;
-
-       reclen = ALIGN(sizeof(struct nfshack_dirent) + namlen, sizeof(u64));
-       if (buf->used + reclen > PAGE_SIZE) {
-               buf->full = 1;
-               return -EINVAL;
-       }
-
-       de->namlen = namlen;
-       de->offset = offset;
-       de->ino = ino;
-       de->d_type = d_type;
-       memcpy(de->name, name, namlen);
-       buf->used += reclen;
-
-       return 0;
-}
-
-static int btrfs_nfshack_readdir(struct file *file, void *dirent,
-                                filldir_t filldir)
-{
-       struct nfshack_readdir buf;
-       struct nfshack_dirent *de;
-       int err;
-       int size;
-       loff_t offset;
-
-       buf.dirent = (void *)__get_free_page(GFP_KERNEL);
-       if (!buf.dirent)
-               return -ENOMEM;
-
-       offset = file->f_pos;
-
-       do {
-               unsigned int reclen;
-
-               buf.used = 0;
-               buf.full = 0;
-               err = btrfs_real_readdir(file, &buf, btrfs_nfshack_filldir);
-               if (err)
-                       break;
-
-               size = buf.used;
-
-               if (!size)
-                       break;
-
-               de = (struct nfshack_dirent *)buf.dirent;
-               while (size > 0) {
-                       offset = de->offset;
-
-                       if (filldir(dirent, de->name, de->namlen, de->offset,
-                                   de->ino, de->d_type))
-                               goto done;
-                       offset = file->f_pos;
-
-                       reclen = ALIGN(sizeof(*de) + de->namlen,
-                                      sizeof(u64));
-                       size -= reclen;
-                       de = (struct nfshack_dirent *)((char *)de + reclen);
-               }
-       } while (buf.full);
-
- done:
-       free_page((unsigned long)buf.dirent);
-       file->f_pos = offset;
-
-       return err;
-}
-#endif
-
  int btrfs_write_inode(struct inode *inode, int wait)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2182,6 +2206,11 @@ void btrfs_dirty_inode(struct inode *inode)
         btrfs_end_transaction(trans, root);
  }
  
+/*
+ * find the highest existing sequence number in a directory
+ * and then set the in-memory index_cnt variable to reflect
+ * free sequence numbers
+ */
  static int btrfs_set_inode_index_count(struct inode *inode)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2234,6 +2263,10 @@ out:
         return ret;
  }
  
+/*
+ * helper to find a free sequence number in a given directory.  This current
+ * code is very simple, later versions will do smarter things in the btree
+ */
  static int btrfs_set_inode_index(struct inode *dir, struct inode *inode,
                                  u64 *index)
  {
@@ -2364,6 +2397,12 @@ static inline u8 btrfs_inode_type(struct inode *inode)
         return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
  }
  
+/*
+ * utility function to add 'inode' into 'parent_inode' with
+ * a given name and a given sequence number.
+ * if 'add_backref' is true, also insert a backref from the
+ * inode to the parent directory.
+ */
  int btrfs_add_link(struct btrfs_trans_handle *trans,
                    struct inode *parent_inode, struct inode *inode,
                    const char *name, int name_len, int add_backref, u64 index)
@@ -2670,6 +2709,10 @@ out_unlock:
         return err;
  }
  
+/* helper for btrfs_get_extent.  Given an existing extent in the tree,
+ * and an extent that you want to insert, deal with overlap and insert
+ * the new extent into the tree.
+ */
  static int merge_extent_mapping(struct extent_map_tree *em_tree,
                                 struct extent_map *existing,
                                 struct extent_map *em,
@@ -2686,6 +2729,14 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
         return add_extent_mapping(em_tree, em);
  }
  
+/*
+ * a bit scary, this does extent mapping from logical file offset to the disk.
+ * the ugly parts come from merging extents from the disk with the
+ * in-ram representation.  This gets more complex because of the data=ordered code,
+ * where the in-ram extents might be locked pending data=ordered completion.
+ *
+ * This also copies inline extents directly into the page.
+ */
  struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
                                     size_t pg_offset, u64 start, u64 len,
                                     int create)
@@ -2928,76 +2979,11 @@ out:
         return em;
  }
  
-#if 0 /* waiting for O_DIRECT reads */
-static int btrfs_get_block(struct inode *inode, sector_t iblock,
-                       struct buffer_head *bh_result, int create)
-{
-       struct extent_map *em;
-       u64 start = (u64)iblock << inode->i_blkbits;
-       struct btrfs_multi_bio *multi = NULL;
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       u64 len;
-       u64 logical;
-       u64 map_length;
-       int ret = 0;
-
-       em = btrfs_get_extent(inode, NULL, 0, start, bh_result->b_size, 0);
-
-       if (!em || IS_ERR(em))
-               goto out;
-
-       if (em->start > start || em->start + em->len <= start) {
-           goto out;
-       }
-
-       if (em->block_start == EXTENT_MAP_INLINE) {
-               ret = -EINVAL;
-               goto out;
-       }
-
-       len = em->start + em->len - start;
-       len = min_t(u64, len, INT_LIMIT(typeof(bh_result->b_size)));
-
-       if (em->block_start == EXTENT_MAP_HOLE ||
-           em->block_start == EXTENT_MAP_DELALLOC) {
-               bh_result->b_size = len;
-               goto out;
-       }
-
-       logical = start - em->start;
-       logical = em->block_start + logical;
-
-       map_length = len;
-       ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-                             logical, &map_length, &multi, 0);
-       BUG_ON(ret);
-       bh_result->b_blocknr = multi->stripes[0].physical >> inode->i_blkbits;
-       bh_result->b_size = min(map_length, len);
-
-       bh_result->b_bdev = multi->stripes[0].dev->bdev;
-       set_buffer_mapped(bh_result);
-       kfree(multi);
-out:
-       free_extent_map(em);
-       return ret;
-}
-#endif
-
  static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                         const struct iovec *iov, loff_t offset,
                         unsigned long nr_segs)
  {
         return -EINVAL;
-#if 0
-       struct file *file = iocb->ki_filp;
-       struct inode *inode = file->f_mapping->host;
-
-       if (rw == WRITE)
-               return -EINVAL;
-
-       return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-                                 offset, nr_segs, btrfs_get_block, NULL);
-#endif
  }
  
  static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock)
@@ -3261,6 +3247,9 @@ void btrfs_invalidate_dcache_root(struct btrfs_root *root, char *name,
         }
  }
  
+/*
+ * create a new subvolume directory/inode (helper for the ioctl).
+ */
  int btrfs_create_subvol_root(struct btrfs_root *new_root,
                 struct btrfs_trans_handle *trans, u64 new_dirid,
                 struct btrfs_block_group_cache *block_group)
@@ -3282,19 +3271,17 @@ int btrfs_create_subvol_root(struct btrfs_root *new_root,
         return btrfs_update_inode(trans, new_root, inode);
  }
  
+/* helper function for file defrag and space balancing.  This
+ * forces readahead on a given range of bytes in an inode
+ */
  unsigned long btrfs_force_ra(struct address_space *mapping,
                               struct file_ra_state *ra, struct file *file,
                               pgoff_t offset, pgoff_t last_index)
  {
         pgoff_t req_size = last_index - offset + 1;
  
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-       offset = page_cache_readahead(mapping, ra, file, offset, req_size);
-       return offset;
-#else
         page_cache_sync_readahead(mapping, ra, file, offset, req_size);
         return offset + req_size;
-#endif
  }
  
  struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -3346,18 +3333,11 @@ void btrfs_destroy_inode(struct inode *inode)
                         btrfs_put_ordered_extent(ordered);
                 }
         }
-       btrfs_drop_extent_cache(inode, 0, (u64)-1);
+       btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
  }
  
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
  static void init_once(void *foo)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-static void init_once(struct kmem_cache * cachep, void *foo)
-#else
-static void init_once(void * foo, struct kmem_cache * cachep,
-                     unsigned long flags)
-#endif
  {
         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
  
@@ -3380,22 +3360,10 @@ void btrfs_destroy_cachep(void)
  
  struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
                                        unsigned long extra_flags,
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
-                                      void (*ctor)(void *)
-#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
-                                      void (*ctor)(struct kmem_cache *, void *)
-#else
-                                      void (*ctor)(void *, struct kmem_cache *,
-                                                   unsigned long)
-#endif
-                                    )
+                                      void (*ctor)(void *))
  {
         return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT |
-                                SLAB_MEM_SPREAD | extra_flags), ctor
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
-                                ,NULL
-#endif
-                               );
+                                SLAB_MEM_SPREAD | extra_flags), ctor);
  }
  
  int btrfs_init_cachep(void)
@@ -3507,23 +3475,44 @@ out_unlock:
         return ret;
  }
  
+/*
+ * some fairly slow code that needs optimization. This walks the list
+ * of all the inodes with pending delalloc and forces them to disk.
+ */
  int btrfs_start_delalloc_inodes(struct btrfs_root *root)
  {
         struct list_head *head = &root->fs_info->delalloc_inodes;
         struct btrfs_inode *binode;
+       struct inode *inode;
         unsigned long flags;
  
         spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
         while(!list_empty(head)) {
                 binode = list_entry(head->next, struct btrfs_inode,
                                     delalloc_inodes);
-               atomic_inc(&binode->vfs_inode.i_count);
+               inode = igrab(&binode->vfs_inode);
+               if (!inode)
+                       list_del_init(&binode->delalloc_inodes);
                 spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
-               filemap_write_and_wait(binode->vfs_inode.i_mapping);
-               iput(&binode->vfs_inode);
+               if (inode) {
+                       filemap_flush(inode->i_mapping);
+                       iput(inode);
+               }
+               cond_resched();
                 spin_lock_irqsave(&root->fs_info->delalloc_lock, flags);
         }
         spin_unlock_irqrestore(&root->fs_info->delalloc_lock, flags);
+
+       /* the filemap_flush will queue IO into the worker threads, but
+        * we have to make sure the IO is actually started and that
+        * ordered extents get created before we return
+        */
+       atomic_inc(&root->fs_info->async_submit_draining);
+       while(atomic_read(&root->fs_info->nr_async_submits)) {
+               wait_event(root->fs_info->async_submit_wait,
+                  (atomic_read(&root->fs_info->nr_async_submits) == 0));
+       }
+       atomic_dec(&root->fs_info->async_submit_draining);
         return 0;
  }
  
@@ -3643,12 +3632,7 @@ static int btrfs_set_page_dirty(struct page *page)
         return __set_page_dirty_nobuffers(page);
  }
  
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,26)
  static int btrfs_permission(struct inode *inode, int mask)
-#else
-static int btrfs_permission(struct inode *inode, int mask,
-                           struct nameidata *nd)
-#endif
  {
         if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
                 return -EACCES;
@@ -3679,11 +3663,7 @@ static struct inode_operations btrfs_dir_ro_inode_operations = {
  static struct file_operations btrfs_dir_file_operations = {
         .llseek         = generic_file_llseek,
         .read           = generic_read_dir,
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
-       .readdir        = btrfs_nfshack_readdir,
-#else /* NFSd readdir/lookup deadlock is fixed */
         .readdir        = btrfs_real_readdir,
-#endif
         .unlocked_ioctl = btrfs_ioctl,
  #ifdef CONFIG_COMPAT
         .compat_ioctl   = btrfs_ioctl,