btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           hash.o file-item.o inode-item.o inode-map.o disk-io.o \
           transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-          extent_map.o sysfs.o struct-funcs.o xattr.o acl.o
+          extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #        root-tree.o dir-item.o hash.o file-item.o inode-item.o \
 
        struct extent_map_tree extent_tree;
        struct inode vfs_inode;
 
+       u64 ordered_trans;
        /*
         * transid of the trans_handle that last modified this inode
         */
 
                    struct extent_buffer **cow_ret)
 {
        u64 search_start;
+       u64 header_trans;
        int ret;
+
        if (trans->transaction != root->fs_info->running_transaction) {
                printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
                       root->fs_info->running_transaction->transid);
                       root->fs_info->generation);
                WARN_ON(1);
        }
-       if (btrfs_header_generation(buf) == trans->transid) {
+
+       header_trans = btrfs_header_generation(buf);
+       if (header_trans == trans->transid) {
                *cow_ret = buf;
                return 0;
        }
 
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __BTRFS__
-#define __BTRFS__
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
 
 #include <linux/version.h>
 #include <linux/mm.h>
        struct inode *inode;
        struct kobject root_kobj;
        struct completion kobj_unregister;
-       struct rw_semaphore snap_sem;
        u64 objectid;
        u64 last_trans;
 
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
                                struct btrfs_root *root);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                           u64 root_objectid);
 int btrfs_commit_write(struct file *file, struct page *page,
                       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 
        memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
        memset(&root->root_kobj, 0, sizeof(root->root_kobj));
        init_completion(&root->kobj_unregister);
-       init_rwsem(&root->snap_sem);
        root->defrag_running = 0;
        root->defrag_level = 0;
        root->root_key.objectid = objectid;
        return root;
 }
 
+/*
+ * Map a root objectid to its in-memory struct btrfs_root.
+ *
+ * The tree root and the extent root come straight from fs_info.  Any other
+ * objectid is looked up in fs_info->fs_roots_radix and the result of that
+ * lookup is returned unchanged.
+ */
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                       u64 root_objectid)
+{
+       switch (root_objectid) {
+       case BTRFS_ROOT_TREE_OBJECTID:
+               return fs_info->tree_root;
+       case BTRFS_EXTENT_TREE_OBJECTID:
+               return fs_info->extent_root;
+       default:
+               break;
+       }
+
+       return radix_tree_lookup(&fs_info->fs_roots_radix,
+                                (unsigned long)root_objectid);
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
                                              struct btrfs_key *location)
 {
 
                      struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+                                       u64 root_objectid);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
                                      struct btrfs_key *location,
                                      const char *name, int namelen);
 
                        if (btrfs_buffer_uptodate(buf)) {
                                u64 transid =
                                    root->fs_info->running_transaction->transid;
-                               if (btrfs_header_generation(buf) == transid) {
+                               u64 header_transid =
+                                       btrfs_header_generation(buf);
+                               if (header_transid == transid) {
                                        free_extent_buffer(buf);
                                        return 1;
                                }
 
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 
                root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
                                          start_pos) - existing_delalloc;
                spin_unlock(&root->fs_info->delalloc_lock);
+               btrfs_add_ordered_inode(inode);
        } else {
                u64 aligned_end;
                /* step one, delete the existing extents in this range */
 
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
-       down_read(&BTRFS_I(inode)->root->snap_sem);
-
        mutex_lock(&inode->i_mutex);
        first_index = pos >> PAGE_CACHE_SHIFT;
        last_index = (pos + count) >> PAGE_CACHE_SHIFT;
        }
 out:
        mutex_unlock(&inode->i_mutex);
-       up_read(&BTRFS_I(inode)->root->snap_sem);
 
 out_nolock:
        kfree(pages);
 
                alloc_hint = ins.objectid + ins.offset;
                start += cur_alloc_size;
        }
+       btrfs_add_ordered_inode(inode);
 out:
        btrfs_end_transaction(trans, root);
        return ret;
        path = btrfs_alloc_path();
        BUG_ON(!path);
        mutex_lock(&root->fs_info->fs_mutex);
-
        memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
        ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
        if (ret)
                goto make_bad;
        if ((offset & (blocksize - 1)) == 0)
                goto out;
 
-       down_read(&root->snap_sem);
        ret = -ENOMEM;
        page = grab_cache_page(mapping, index);
        if (!page)
 
        unlock_page(page);
        page_cache_release(page);
-       up_read(&BTRFS_I(inode)->root->snap_sem);
 out:
        return ret;
 }
                args->root == BTRFS_I(inode)->root);
 }
 
+/*
+ * Look up the inode numbered objectid that belongs to the root identified
+ * by root_objectid.
+ *
+ * Returns NULL without calling ilookup5() when btrfs_lookup_fs_root() has
+ * no root for root_objectid; otherwise returns whatever ilookup5() returns
+ * using btrfs_find_actor as the match function.
+ */
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+                           u64 root_objectid)
+{
+       struct btrfs_root *root;
+       struct btrfs_iget_args args;
+
+       root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid);
+       if (!root)
+               return NULL;
+
+       args.ino = objectid;
+       args.root = root;
+       return ilookup5(s, objectid, btrfs_find_actor, &args);
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
                                struct btrfs_root *root)
 {
 
                        d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
                        btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
                        over = filldir(dirent, name_ptr, name_len,
                                       found_key.offset,
                                       location.objectid,
 
        ret = -EINVAL;
 
-       down_read(&BTRFS_I(inode)->root->snap_sem);
        lock_page(page);
        wait_on_page_writeback(page);
        size = i_size_read(inode);
        ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
-       up_read(&BTRFS_I(inode)->root->snap_sem);
        unlock_page(page);
 out:
        return ret;
        struct btrfs_root_item root_item;
        struct btrfs_inode_item *inode_item;
        struct extent_buffer *leaf;
-       struct btrfs_root *new_root;
+       struct btrfs_root *new_root = root;
        struct inode *inode;
        struct inode *dir;
        int ret;
                goto fail;
 fail:
        nr = trans->blocks_used;
-       err = btrfs_commit_transaction(trans, root);
+       err = btrfs_commit_transaction(trans, new_root);
        if (err && !ret)
                ret = err;
 fail_commit:
        if (!root->ref_cows)
                return -EINVAL;
 
-       down_write(&root->snap_sem);
-       freeze_bdev(root->fs_info->sb->s_bdev);
-       thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
-
        mutex_lock(&root->fs_info->fs_mutex);
        ret = btrfs_check_free_space(root, 1, 0);
        if (ret)
 
        trans = btrfs_start_transaction(root, 1);
        BUG_ON(!trans);
+       err = btrfs_commit_transaction(trans, root);
+
+       trans = btrfs_start_transaction(root, 1);
 
        ret = btrfs_update_inode(trans, root, root->inode);
        if (ret)
        ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
                                       0, &objectid);
        if (ret)
-               goto fail;
-
-       memcpy(&new_root_item, &root->root_item,
+               goto fail; memcpy(&new_root_item, &root->root_item,
               sizeof(new_root_item));
 
        key.objectid = objectid;
        btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
        free_extent_buffer(tmp);
 
+       /* write the ordered inodes to force all delayed allocations to
+        * be filled.  Once this is done, we can copy the root
+        */
+       mutex_lock(&root->fs_info->trans_mutex);
+       btrfs_write_ordered_inodes(trans, root);
+       mutex_unlock(&root->fs_info->trans_mutex);
+
        btrfs_copy_root(trans, root, root->node, &tmp, objectid);
 
        btrfs_set_root_bytenr(&new_root_item, tmp->start);
        btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
                                &new_root_item);
+printk("new root %Lu node %Lu\n", objectid, tmp->start);
        free_extent_buffer(tmp);
        if (ret)
                goto fail;
                ret = err;
 fail_unlock:
        mutex_unlock(&root->fs_info->fs_mutex);
-       up_write(&root->snap_sem);
        btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
        if (!ei)
                return NULL;
        ei->last_trans = 0;
+       ei->ordered_trans = 0;
        return &ei->vfs_inode;
 }
 
 
--- /dev/null
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+
+/*
+ * One node of the ordered inode rbtree.  comp_entry() orders nodes by
+ * root_objectid first and by objectid second.
+ */
+struct tree_entry {
+       /* root_key.objectid of the root the inode belongs to */
+       u64 root_objectid;
+       /* inode number (inode->i_ino) */
+       u64 objectid;
+       /* linkage into the rb_root this entry is inserted in */
+       struct rb_node rb_node;
+};
+
+/*
+ * Three-way comparison of the key (root_objectid, objectid) against entry.
+ * Keys are ordered by root_objectid first and by objectid second.
+ *
+ * Returns -1 when the key sorts before entry, 1 when it sorts after entry
+ * and 0 when both fields are equal.
+ */
+static int comp_entry(struct tree_entry *entry, u64 root_objectid,
+                     u64 objectid)
+{
+       if (root_objectid != entry->root_objectid)
+               return root_objectid < entry->root_objectid ? -1 : 1;
+       if (objectid != entry->objectid)
+               return objectid < entry->objectid ? -1 : 1;
+       return 0;
+}
+
+/*
+ * Link node into the rbtree at the position for (root_objectid, objectid).
+ *
+ * Returns NULL after a successful insert.  When an entry with the same key
+ * is already present the tree is left untouched and the rb_node of that
+ * existing entry is returned instead.
+ */
+static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
+                                  u64 objectid, struct rb_node *node)
+{
+       struct rb_node **link = &root->rb_node;
+       struct rb_node *parent = NULL;
+
+       while (*link) {
+               struct tree_entry *cur;
+               int cmp;
+
+               parent = *link;
+               cur = rb_entry(parent, struct tree_entry, rb_node);
+               cmp = comp_entry(cur, root_objectid, objectid);
+               if (cmp == 0)
+                       return parent;
+
+               link = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+       }
+
+       rb_link_node(node, parent, link);
+       rb_insert_color(node, root);
+       return NULL;
+}
+
+/*
+ * Search the rbtree for the key (root_objectid, objectid).
+ *
+ * Returns the matching rb_node on an exact hit; *prev_ret is not written in
+ * that case.  On a miss NULL is returned and, if prev_ret is not NULL,
+ * *prev_ret is set to the first node that sorts after the key (NULL when
+ * there is none).
+ */
+static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
+                                    u64 objectid, struct rb_node **prev_ret)
+{
+       struct rb_node *cur = root->rb_node;
+       struct rb_node *last = NULL;
+       struct tree_entry *entry;
+       int cmp;
+
+       while (cur) {
+               entry = rb_entry(cur, struct tree_entry, rb_node);
+               cmp = comp_entry(entry, root_objectid, objectid);
+               if (cmp == 0)
+                       return cur;
+
+               last = cur;
+               cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+       }
+
+       if (!prev_ret)
+               return NULL;
+
+       /* walk forward from the last node visited to one past the key */
+       for (; last; last = rb_next(last)) {
+               entry = rb_entry(last, struct tree_entry, rb_node);
+               if (comp_entry(entry, root_objectid, objectid) < 0)
+                       break;
+       }
+       *prev_ret = last;
+       return NULL;
+}
+
+/*
+ * Return the node matching (root_objectid, objectid) or, when there is no
+ * exact match, the first node that sorts after it.  NULL means neither
+ * exists.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root,
+                                         u64 root_objectid, u64 objectid)
+{
+       struct rb_node *next;
+       struct rb_node *match;
+
+       match = __tree_search(root, root_objectid, objectid, &next);
+       return match ? match : next;
+}
+
+/*
+ * Record inode in the ordered inode tree of the running transaction.
+ *
+ * Nothing is done when the inode's ordered_trans is already at or past the
+ * running transid, or when a read-locked search finds its key in the tree.
+ * Otherwise an entry is allocated and inserted under the write lock and
+ * ordered_trans is set to the running transid.
+ *
+ * Returns 0, or -ENOMEM when the entry cannot be allocated.
+ */
+int btrfs_add_ordered_inode(struct inode *inode)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       u64 root_objectid = root->root_key.objectid;
+       u64 transid = root->fs_info->running_transaction->transid;
+       struct btrfs_ordered_inode_tree *tree;
+       struct tree_entry *new_entry;
+       struct rb_node *found;
+
+       if (BTRFS_I(inode)->ordered_trans >= transid)
+               return 0;
+
+       tree = &root->fs_info->running_transaction->ordered_inode_tree;
+
+       read_lock(&tree->lock);
+       found = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
+       read_unlock(&tree->lock);
+       if (found)
+               return 0;
+
+       new_entry = kmalloc(sizeof(*new_entry), GFP_NOFS);
+       if (!new_entry)
+               return -ENOMEM;
+       new_entry->root_objectid = root_objectid;
+       new_entry->objectid = inode->i_ino;
+
+       write_lock(&tree->lock);
+       found = tree_insert(&tree->tree, root_objectid, inode->i_ino,
+                           &new_entry->rb_node);
+       BTRFS_I(inode)->ordered_trans = transid;
+       write_unlock(&tree->lock);
+
+       /* the key showed up after the read-locked search; drop our copy */
+       if (found)
+               kfree(new_entry);
+       return 0;
+}
+
+/*
+ * Find the first entry in tree that sorts strictly after the key
+ * (*root_objectid, *objectid).  The tree is not modified.
+ *
+ * Returns 1 and stores the key of that entry in *root_objectid and
+ * *objectid when such an entry exists.  Returns 0 and leaves both values
+ * untouched when it does not.
+ *
+ * Only the read side of tree->lock is taken: nothing here changes the
+ * tree, so the walk does not have to exclude other readers.
+ */
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+                                      u64 *root_objectid, u64 *objectid)
+{
+       struct tree_entry *entry;
+       struct rb_node *node;
+       int found = 0;
+
+       read_lock(&tree->lock);
+       node = tree_search(&tree->tree, *root_objectid, *objectid);
+       while (node) {
+               entry = rb_entry(node, struct tree_entry, rb_node);
+               /* tree_search() may hand back an exact match; step past it */
+               if (comp_entry(entry, *root_objectid, *objectid) < 0) {
+                       *root_objectid = entry->root_objectid;
+                       *objectid = entry->objectid;
+                       found = 1;
+                       break;
+               }
+               node = rb_next(node);
+       }
+       read_unlock(&tree->lock);
+       return found;
+}
+
+/*
+ * Find the first entry in tree that sorts strictly after the key
+ * (*root_objectid, *objectid), remove it from the tree and free it.
+ *
+ * Returns 1 and stores the key of the removed entry in *root_objectid and
+ * *objectid.  Returns 0 and changes nothing when no such entry exists.
+ */
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+                                      u64 *root_objectid, u64 *objectid)
+{
+       struct tree_entry *victim = NULL;
+       struct tree_entry *cur_entry;
+       struct rb_node *cur;
+
+       write_lock(&tree->lock);
+       for (cur = tree_search(&tree->tree, *root_objectid, *objectid);
+            cur; cur = rb_next(cur)) {
+               cur_entry = rb_entry(cur, struct tree_entry, rb_node);
+               if (comp_entry(cur_entry, *root_objectid, *objectid) < 0) {
+                       victim = cur_entry;
+                       break;
+               }
+       }
+       if (victim) {
+               *root_objectid = victim->root_objectid;
+               *objectid = victim->objectid;
+               rb_erase(cur, &tree->tree);
+       }
+       write_unlock(&tree->lock);
+
+       if (!victim)
+               return 0;
+       kfree(victim);
+       return 1;
+}
 
--- /dev/null
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+/*
+ * An rbtree of ordered inode entries together with the rwlock that
+ * protects it.
+ */
+struct btrfs_ordered_inode_tree {
+       /* rwlock protecting tree */
+       rwlock_t lock;
+       /* root of the rbtree holding the entries */
+       struct rb_root tree;
+};
+
+/* Prepare t for use: start with an empty rbtree and initialize its rwlock. */
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+       t->tree.rb_node = NULL;
+       rwlock_init(&t->lock);
+}
+
+/*
+ * Record inode in the running transaction's ordered inode tree.  Returns 0,
+ * or -ENOMEM when the tree entry cannot be allocated.
+ */
+int btrfs_add_ordered_inode(struct inode *inode);
+
+/*
+ * Both lookups find the first entry sorting after the key (*root_objectid,
+ * *objectid), store that entry's key back through the two pointers and
+ * return 1, or return 0 when there is no such entry.  The _del_ variant
+ * also removes the entry from the tree and frees it.
+ */
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+                                  u64 *root_objectid, u64 *objectid);
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+                                      u64 *root_objectid, u64 *objectid);
+#endif
 
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+               btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
                extent_map_tree_init(&cur_trans->dirty_pages,
                                     root->fs_info->btree_inode->i_mapping,
                                     GFP_NOFS);
        return ret;
 }
 
+/*
+ * Look up one ordered inode and write out its dirty pages.
+ *
+ * trans_mutex and fs_mutex are dropped before the lookup and retaken
+ * (fs_mutex first) before returning, so the caller must hold both.  Inodes
+ * that btrfs_ilookup() does not find and inodes that are not regular files
+ * are skipped.  With wait set filemap_write_and_wait() is used, otherwise
+ * filemap_fdatawrite().
+ *
+ * Returns 0 or the error reported by the writeback call.
+ */
+static int write_one_ordered_inode(struct btrfs_root *root,
+                                  u64 root_objectid, u64 objectid, int wait)
+{
+       struct inode *inode;
+       int ret = 0;
+
+       mutex_unlock(&root->fs_info->trans_mutex);
+       mutex_unlock(&root->fs_info->fs_mutex);
+       inode = btrfs_ilookup(root->fs_info->sb, objectid, root_objectid);
+       if (inode) {
+               if (S_ISREG(inode->i_mode)) {
+                       if (wait)
+                               ret = filemap_write_and_wait(inode->i_mapping);
+                       else
+                               ret = filemap_fdatawrite(inode->i_mapping);
+               }
+               iput(inode);
+       }
+       mutex_lock(&root->fs_info->fs_mutex);
+       mutex_lock(&root->fs_info->trans_mutex);
+       return ret;
+}
+
+/*
+ * Write out every inode recorded in the ordered inode tree of the
+ * transaction that trans belongs to.
+ *
+ * The first pass walks the tree in key order and starts writeback on each
+ * inode, leaving the entries in place.  The second pass repeatedly removes
+ * the first entry and writes and waits on its inode until the tree is
+ * empty.  fs_mutex and trans_mutex must be held on entry; they are dropped
+ * and retaken around each inode.
+ *
+ * Returns 0, or the first error reported by a writeback call.  A failure
+ * does not stop the walk, so the tree is always drained.
+ */
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
+{
+       struct btrfs_transaction *cur_trans = trans->transaction;
+       u64 root_objectid = 0;
+       u64 objectid = 0;
+       int err = 0;
+       int ret;
+
+       /* pass one: the lookup advances the (root_objectid, objectid) cursor */
+       while (btrfs_find_first_ordered_inode(&cur_trans->ordered_inode_tree,
+                                             &root_objectid, &objectid)) {
+               ret = write_one_ordered_inode(root, root_objectid, objectid, 0);
+               if (ret && !err)
+                       err = ret;
+       }
+
+       /* pass two: always restart from the lowest key, entries are deleted */
+       while (1) {
+               root_objectid = 0;
+               objectid = 0;
+               if (!btrfs_find_del_first_ordered_inode(
+                               &cur_trans->ordered_inode_tree,
+                               &root_objectid, &objectid))
+                       break;
+               ret = write_one_ordered_inode(root, root_objectid, objectid, 1);
+               if (ret && !err)
+                       err = ret;
+       }
+       return err;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
                mutex_lock(&root->fs_info->fs_mutex);
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
+               ret = btrfs_write_ordered_inodes(trans, root);
+
        } while (cur_trans->num_writers > 1 ||
                 (cur_trans->num_joined != joined));
 
        WARN_ON(cur_trans != trans->transaction);
+
        ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
                              &dirty_fs_roots);
        BUG_ON(ret);
 
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __TRANSACTION__
-#define __TRANSACTION__
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 
 struct btrfs_transaction {
        u64 transid;
        struct list_head list;
        struct extent_map_tree dirty_pages;
        unsigned long start_time;
+       struct btrfs_ordered_inode_tree ordered_inode_tree;
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
 };
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root);
 #endif