]> pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/btrfs/tree-log.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable
[linux-2.6-omap-h63xx.git] / fs / btrfs / tree-log.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "locking.h"
24 #include "print-tree.h"
25 #include "compat.h"
26 #include "tree-log.h"
27
28 /* magic values for the inode_only field in btrfs_log_inode:
29  *
30  * LOG_INODE_ALL means to log everything
31  * LOG_INODE_EXISTS means to log just enough to recreate the inode
32  * during log replay
33  */
34 #define LOG_INODE_ALL 0
35 #define LOG_INODE_EXISTS 1
36
37 /*
38  * stages for the tree walking.  The first
39  * stage (0) is to only pin down the blocks we find
40  * the second stage (1) is to make sure that all the inodes
41  * we find in the log are created in the subvolume.
42  *
43  * The last stage is to deal with directories and links and extents
44  * and all the other fun semantics
45  */
46 #define LOG_WALK_PIN_ONLY 0
47 #define LOG_WALK_REPLAY_INODES 1
48 #define LOG_WALK_REPLAY_ALL 2
49
50 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
51                              struct btrfs_root *root, struct inode *inode,
52                              int inode_only);
53 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
54                              struct btrfs_root *root,
55                              struct btrfs_path *path, u64 objectid);
56
57 /*
58  * tree logging is a special write ahead log used to make sure that
59  * fsyncs and O_SYNCs can happen without doing full tree commits.
60  *
61  * Full tree commits are expensive because they require commonly
62  * modified blocks to be recowed, creating many dirty pages in the
63  * extent tree an 4x-6x higher write load than ext3.
64  *
65  * Instead of doing a tree commit on every fsync, we use the
66  * key ranges and transaction ids to find items for a given file or directory
67  * that have changed in this transaction.  Those items are copied into
68  * a special tree (one per subvolume root), that tree is written to disk
69  * and then the fsync is considered complete.
70  *
71  * After a crash, items are copied out of the log-tree back into the
72  * subvolume tree.  Any file data extents found are recorded in the extent
73  * allocation tree, and the log-tree freed.
74  *
75  * The log tree is read three times, once to pin down all the extents it is
76  * using in ram and once, once to create all the inodes logged in the tree
77  * and once to do all the other items.
78  */
79
80 /*
81  * btrfs_add_log_tree adds a new per-subvolume log tree into the
82  * tree of log tree roots.  This must be called with a tree log transaction
83  * running (see start_log_trans).
84  */
85 static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
86                       struct btrfs_root *root)
87 {
88         struct btrfs_key key;
89         struct btrfs_root_item root_item;
90         struct btrfs_inode_item *inode_item;
91         struct extent_buffer *leaf;
92         struct btrfs_root *new_root = root;
93         int ret;
94         u64 objectid = root->root_key.objectid;
95
96         leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
97                                       BTRFS_TREE_LOG_OBJECTID,
98                                       trans->transid, 0, 0, 0);
99         if (IS_ERR(leaf)) {
100                 ret = PTR_ERR(leaf);
101                 return ret;
102         }
103
104         btrfs_set_header_nritems(leaf, 0);
105         btrfs_set_header_level(leaf, 0);
106         btrfs_set_header_bytenr(leaf, leaf->start);
107         btrfs_set_header_generation(leaf, trans->transid);
108         btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
109
110         write_extent_buffer(leaf, root->fs_info->fsid,
111                             (unsigned long)btrfs_header_fsid(leaf),
112                             BTRFS_FSID_SIZE);
113         btrfs_mark_buffer_dirty(leaf);
114
115         inode_item = &root_item.inode;
116         memset(inode_item, 0, sizeof(*inode_item));
117         inode_item->generation = cpu_to_le64(1);
118         inode_item->size = cpu_to_le64(3);
119         inode_item->nlink = cpu_to_le32(1);
120         inode_item->nbytes = cpu_to_le64(root->leafsize);
121         inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
122
123         btrfs_set_root_bytenr(&root_item, leaf->start);
124         btrfs_set_root_generation(&root_item, trans->transid);
125         btrfs_set_root_level(&root_item, 0);
126         btrfs_set_root_refs(&root_item, 0);
127         btrfs_set_root_used(&root_item, 0);
128
129         memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
130         root_item.drop_level = 0;
131
132         btrfs_tree_unlock(leaf);
133         free_extent_buffer(leaf);
134         leaf = NULL;
135
136         btrfs_set_root_dirid(&root_item, 0);
137
138         key.objectid = BTRFS_TREE_LOG_OBJECTID;
139         key.offset = objectid;
140         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
141         ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
142                                 &root_item);
143         if (ret)
144                 goto fail;
145
146         new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
147                                                &key);
148         BUG_ON(!new_root);
149
150         WARN_ON(root->log_root);
151         root->log_root = new_root;
152
153         /*
154          * log trees do not get reference counted because they go away
155          * before a real commit is actually done.  They do store pointers
156          * to file data extents, and those reference counts still get
157          * updated (along with back refs to the log tree).
158          */
159         new_root->ref_cows = 0;
160         new_root->last_trans = trans->transid;
161
162         /*
163          * we need to make sure the root block for this new tree
164          * is marked as dirty in the dirty_log_pages tree.  This
165          * is how it gets flushed down to disk at tree log commit time.
166          *
167          * the tree logging mutex keeps others from coming in and changing
168          * the new_root->node, so we can safely access it here
169          */
170         set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start,
171                          new_root->node->start + new_root->node->len - 1,
172                          GFP_NOFS);
173
174 fail:
175         return ret;
176 }
177
178 /*
179  * start a sub transaction and setup the log tree
180  * this increments the log tree writer count to make the people
181  * syncing the tree wait for us to finish
182  */
183 static int start_log_trans(struct btrfs_trans_handle *trans,
184                            struct btrfs_root *root)
185 {
186         int ret;
187         mutex_lock(&root->fs_info->tree_log_mutex);
188         if (!root->fs_info->log_root_tree) {
189                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
190                 BUG_ON(ret);
191         }
192         if (!root->log_root) {
193                 ret = btrfs_add_log_tree(trans, root);
194                 BUG_ON(ret);
195         }
196         atomic_inc(&root->fs_info->tree_log_writers);
197         root->fs_info->tree_log_batch++;
198         mutex_unlock(&root->fs_info->tree_log_mutex);
199         return 0;
200 }
201
202 /*
203  * returns 0 if there was a log transaction running and we were able
204  * to join, or returns -ENOENT if there were not transactions
205  * in progress
206  */
207 static int join_running_log_trans(struct btrfs_root *root)
208 {
209         int ret = -ENOENT;
210
211         smp_mb();
212         if (!root->log_root)
213                 return -ENOENT;
214
215         mutex_lock(&root->fs_info->tree_log_mutex);
216         if (root->log_root) {
217                 ret = 0;
218                 atomic_inc(&root->fs_info->tree_log_writers);
219                 root->fs_info->tree_log_batch++;
220         }
221         mutex_unlock(&root->fs_info->tree_log_mutex);
222         return ret;
223 }
224
225 /*
226  * indicate we're done making changes to the log tree
227  * and wake up anyone waiting to do a sync
228  */
229 static int end_log_trans(struct btrfs_root *root)
230 {
231         atomic_dec(&root->fs_info->tree_log_writers);
232         smp_mb();
233         if (waitqueue_active(&root->fs_info->tree_log_wait))
234                 wake_up(&root->fs_info->tree_log_wait);
235         return 0;
236 }
237
238
239 /*
240  * the walk control struct is used to pass state down the chain when
241  * processing the log tree.  The stage field tells us which part
242  * of the log tree processing we are currently doing.  The others
243  * are state fields used for that specific part
244  */
245 struct walk_control {
246         /* should we free the extent on disk when done?  This is used
247          * at transaction commit time while freeing a log tree
248          */
249         int free;
250
251         /* should we write out the extent buffer?  This is used
252          * while flushing the log tree to disk during a sync
253          */
254         int write;
255
256         /* should we wait for the extent buffer io to finish?  Also used
257          * while flushing the log tree to disk for a sync
258          */
259         int wait;
260
261         /* pin only walk, we record which extents on disk belong to the
262          * log trees
263          */
264         int pin;
265
266         /* what stage of the replay code we're currently in */
267         int stage;
268
269         /* the root we are currently replaying */
270         struct btrfs_root *replay_dest;
271
272         /* the trans handle for the current replay */
273         struct btrfs_trans_handle *trans;
274
275         /* the function that gets used to process blocks we find in the
276          * tree.  Note the extent_buffer might not be up to date when it is
277          * passed in, and it must be checked or read if you need the data
278          * inside it
279          */
280         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
281                             struct walk_control *wc, u64 gen);
282 };
283
284 /*
285  * process_func used to pin down extents, write them or wait on them
286  */
287 static int process_one_buffer(struct btrfs_root *log,
288                               struct extent_buffer *eb,
289                               struct walk_control *wc, u64 gen)
290 {
291         if (wc->pin) {
292                 mutex_lock(&log->fs_info->pinned_mutex);
293                 btrfs_update_pinned_extents(log->fs_info->extent_root,
294                                             eb->start, eb->len, 1);
295                 mutex_unlock(&log->fs_info->pinned_mutex);
296         }
297
298         if (btrfs_buffer_uptodate(eb, gen)) {
299                 if (wc->write)
300                         btrfs_write_tree_block(eb);
301                 if (wc->wait)
302                         btrfs_wait_tree_block_writeback(eb);
303         }
304         return 0;
305 }
306
307 /*
308  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
309  * to the src data we are copying out.
310  *
311  * root is the tree we are copying into, and path is a scratch
312  * path for use in this function (it should be released on entry and
313  * will be released on exit).
314  *
315  * If the key is already in the destination tree the existing item is
316  * overwritten.  If the existing item isn't big enough, it is extended.
317  * If it is too large, it is truncated.
318  *
319  * If the key isn't in the destination yet, a new item is inserted.
320  */
321 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
322                                    struct btrfs_root *root,
323                                    struct btrfs_path *path,
324                                    struct extent_buffer *eb, int slot,
325                                    struct btrfs_key *key)
326 {
327         int ret;
328         u32 item_size;
329         u64 saved_i_size = 0;
330         int save_old_i_size = 0;
331         unsigned long src_ptr;
332         unsigned long dst_ptr;
333         int overwrite_root = 0;
334
335         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
336                 overwrite_root = 1;
337
338         item_size = btrfs_item_size_nr(eb, slot);
339         src_ptr = btrfs_item_ptr_offset(eb, slot);
340
341         /* look for the key in the destination tree */
342         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
343         if (ret == 0) {
344                 char *src_copy;
345                 char *dst_copy;
346                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
347                                                   path->slots[0]);
348                 if (dst_size != item_size)
349                         goto insert;
350
351                 if (item_size == 0) {
352                         btrfs_release_path(root, path);
353                         return 0;
354                 }
355                 dst_copy = kmalloc(item_size, GFP_NOFS);
356                 src_copy = kmalloc(item_size, GFP_NOFS);
357
358                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
359
360                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
361                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
362                                    item_size);
363                 ret = memcmp(dst_copy, src_copy, item_size);
364
365                 kfree(dst_copy);
366                 kfree(src_copy);
367                 /*
368                  * they have the same contents, just return, this saves
369                  * us from cowing blocks in the destination tree and doing
370                  * extra writes that may not have been done by a previous
371                  * sync
372                  */
373                 if (ret == 0) {
374                         btrfs_release_path(root, path);
375                         return 0;
376                 }
377
378         }
379 insert:
380         btrfs_release_path(root, path);
381         /* try to insert the key into the destination tree */
382         ret = btrfs_insert_empty_item(trans, root, path,
383                                       key, item_size);
384
385         /* make sure any existing item is the correct size */
386         if (ret == -EEXIST) {
387                 u32 found_size;
388                 found_size = btrfs_item_size_nr(path->nodes[0],
389                                                 path->slots[0]);
390                 if (found_size > item_size) {
391                         btrfs_truncate_item(trans, root, path, item_size, 1);
392                 } else if (found_size < item_size) {
393                         ret = btrfs_extend_item(trans, root, path,
394                                                 item_size - found_size);
395                         BUG_ON(ret);
396                 }
397         } else if (ret) {
398                 BUG();
399         }
400         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
401                                         path->slots[0]);
402
403         /* don't overwrite an existing inode if the generation number
404          * was logged as zero.  This is done when the tree logging code
405          * is just logging an inode to make sure it exists after recovery.
406          *
407          * Also, don't overwrite i_size on directories during replay.
408          * log replay inserts and removes directory items based on the
409          * state of the tree found in the subvolume, and i_size is modified
410          * as it goes
411          */
412         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
413                 struct btrfs_inode_item *src_item;
414                 struct btrfs_inode_item *dst_item;
415
416                 src_item = (struct btrfs_inode_item *)src_ptr;
417                 dst_item = (struct btrfs_inode_item *)dst_ptr;
418
419                 if (btrfs_inode_generation(eb, src_item) == 0)
420                         goto no_copy;
421
422                 if (overwrite_root &&
423                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
424                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
425                         save_old_i_size = 1;
426                         saved_i_size = btrfs_inode_size(path->nodes[0],
427                                                         dst_item);
428                 }
429         }
430
431         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
432                            src_ptr, item_size);
433
434         if (save_old_i_size) {
435                 struct btrfs_inode_item *dst_item;
436                 dst_item = (struct btrfs_inode_item *)dst_ptr;
437                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
438         }
439
440         /* make sure the generation is filled in */
441         if (key->type == BTRFS_INODE_ITEM_KEY) {
442                 struct btrfs_inode_item *dst_item;
443                 dst_item = (struct btrfs_inode_item *)dst_ptr;
444                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
445                         btrfs_set_inode_generation(path->nodes[0], dst_item,
446                                                    trans->transid);
447                 }
448         }
449 no_copy:
450         btrfs_mark_buffer_dirty(path->nodes[0]);
451         btrfs_release_path(root, path);
452         return 0;
453 }
454
455 /*
456  * simple helper to read an inode off the disk from a given root
457  * This can only be called for subvolume roots and not for the log
458  */
459 static noinline struct inode *read_one_inode(struct btrfs_root *root,
460                                              u64 objectid)
461 {
462         struct inode *inode;
463         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
464         if (inode->i_state & I_NEW) {
465                 BTRFS_I(inode)->root = root;
466                 BTRFS_I(inode)->location.objectid = objectid;
467                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
468                 BTRFS_I(inode)->location.offset = 0;
469                 btrfs_read_locked_inode(inode);
470                 unlock_new_inode(inode);
471
472         }
473         if (is_bad_inode(inode)) {
474                 iput(inode);
475                 inode = NULL;
476         }
477         return inode;
478 }
479
480 /* replays a single extent in 'eb' at 'slot' with 'key' into the
481  * subvolume 'root'.  path is released on entry and should be released
482  * on exit.
483  *
484  * extents in the log tree have not been allocated out of the extent
485  * tree yet.  So, this completes the allocation, taking a reference
486  * as required if the extent already exists or creating a new extent
487  * if it isn't in the extent allocation tree yet.
488  *
489  * The extent is inserted into the file, dropping any existing extents
490  * from the file that overlap the new one.
491  */
492 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
493                                       struct btrfs_root *root,
494                                       struct btrfs_path *path,
495                                       struct extent_buffer *eb, int slot,
496                                       struct btrfs_key *key)
497 {
498         int found_type;
499         u64 mask = root->sectorsize - 1;
500         u64 extent_end;
501         u64 alloc_hint;
502         u64 start = key->offset;
503         u64 saved_nbytes;
504         struct btrfs_file_extent_item *item;
505         struct inode *inode = NULL;
506         unsigned long size;
507         int ret = 0;
508
509         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
510         found_type = btrfs_file_extent_type(eb, item);
511
512         if (found_type == BTRFS_FILE_EXTENT_REG ||
513             found_type == BTRFS_FILE_EXTENT_PREALLOC)
514                 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
515         else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
516                 size = btrfs_file_extent_inline_len(eb, item);
517                 extent_end = (start + size + mask) & ~mask;
518         } else {
519                 ret = 0;
520                 goto out;
521         }
522
523         inode = read_one_inode(root, key->objectid);
524         if (!inode) {
525                 ret = -EIO;
526                 goto out;
527         }
528
529         /*
530          * first check to see if we already have this extent in the
531          * file.  This must be done before the btrfs_drop_extents run
532          * so we don't try to drop this extent.
533          */
534         ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
535                                        start, 0);
536
537         if (ret == 0 &&
538             (found_type == BTRFS_FILE_EXTENT_REG ||
539              found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
540                 struct btrfs_file_extent_item cmp1;
541                 struct btrfs_file_extent_item cmp2;
542                 struct btrfs_file_extent_item *existing;
543                 struct extent_buffer *leaf;
544
545                 leaf = path->nodes[0];
546                 existing = btrfs_item_ptr(leaf, path->slots[0],
547                                           struct btrfs_file_extent_item);
548
549                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
550                                    sizeof(cmp1));
551                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
552                                    sizeof(cmp2));
553
554                 /*
555                  * we already have a pointer to this exact extent,
556                  * we don't have to do anything
557                  */
558                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
559                         btrfs_release_path(root, path);
560                         goto out;
561                 }
562         }
563         btrfs_release_path(root, path);
564
565         saved_nbytes = inode_get_bytes(inode);
566         /* drop any overlapping extents */
567         ret = btrfs_drop_extents(trans, root, inode,
568                          start, extent_end, start, &alloc_hint);
569         BUG_ON(ret);
570
571         if (found_type == BTRFS_FILE_EXTENT_REG ||
572             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
573                 unsigned long dest_offset;
574                 struct btrfs_key ins;
575
576                 ret = btrfs_insert_empty_item(trans, root, path, key,
577                                               sizeof(*item));
578                 BUG_ON(ret);
579                 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
580                                                     path->slots[0]);
581                 copy_extent_buffer(path->nodes[0], eb, dest_offset,
582                                 (unsigned long)item,  sizeof(*item));
583
584                 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
585                 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
586                 ins.type = BTRFS_EXTENT_ITEM_KEY;
587
588                 if (ins.objectid > 0) {
589                         u64 csum_start;
590                         u64 csum_end;
591                         LIST_HEAD(ordered_sums);
592                         /*
593                          * is this extent already allocated in the extent
594                          * allocation tree?  If so, just add a reference
595                          */
596                         ret = btrfs_lookup_extent(root, ins.objectid,
597                                                 ins.offset);
598                         if (ret == 0) {
599                                 ret = btrfs_inc_extent_ref(trans, root,
600                                                 ins.objectid, ins.offset,
601                                                 path->nodes[0]->start,
602                                                 root->root_key.objectid,
603                                                 trans->transid, key->objectid);
604                         } else {
605                                 /*
606                                  * insert the extent pointer in the extent
607                                  * allocation tree
608                                  */
609                                 ret = btrfs_alloc_logged_extent(trans, root,
610                                                 path->nodes[0]->start,
611                                                 root->root_key.objectid,
612                                                 trans->transid, key->objectid,
613                                                 &ins);
614                                 BUG_ON(ret);
615                         }
616                         btrfs_release_path(root, path);
617
618                         if (btrfs_file_extent_compression(eb, item)) {
619                                 csum_start = ins.objectid;
620                                 csum_end = csum_start + ins.offset;
621                         } else {
622                                 csum_start = ins.objectid +
623                                         btrfs_file_extent_offset(eb, item);
624                                 csum_end = csum_start +
625                                         btrfs_file_extent_num_bytes(eb, item);
626                         }
627
628                         ret = btrfs_lookup_csums_range(root->log_root,
629                                                 csum_start, csum_end - 1,
630                                                 &ordered_sums);
631                         BUG_ON(ret);
632                         while (!list_empty(&ordered_sums)) {
633                                 struct btrfs_ordered_sum *sums;
634                                 sums = list_entry(ordered_sums.next,
635                                                 struct btrfs_ordered_sum,
636                                                 list);
637                                 ret = btrfs_csum_file_blocks(trans,
638                                                 root->fs_info->csum_root,
639                                                 sums);
640                                 BUG_ON(ret);
641                                 list_del(&sums->list);
642                                 kfree(sums);
643                         }
644                 } else {
645                         btrfs_release_path(root, path);
646                 }
647         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
648                 /* inline extents are easy, we just overwrite them */
649                 ret = overwrite_item(trans, root, path, eb, slot, key);
650                 BUG_ON(ret);
651         }
652
653         inode_set_bytes(inode, saved_nbytes);
654         btrfs_update_inode(trans, root, inode);
655 out:
656         if (inode)
657                 iput(inode);
658         return ret;
659 }
660
661 /*
662  * when cleaning up conflicts between the directory names in the
663  * subvolume, directory names in the log and directory names in the
664  * inode back references, we may have to unlink inodes from directories.
665  *
666  * This is a helper function to do the unlink of a specific directory
667  * item
668  */
669 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
670                                       struct btrfs_root *root,
671                                       struct btrfs_path *path,
672                                       struct inode *dir,
673                                       struct btrfs_dir_item *di)
674 {
675         struct inode *inode;
676         char *name;
677         int name_len;
678         struct extent_buffer *leaf;
679         struct btrfs_key location;
680         int ret;
681
682         leaf = path->nodes[0];
683
684         btrfs_dir_item_key_to_cpu(leaf, di, &location);
685         name_len = btrfs_dir_name_len(leaf, di);
686         name = kmalloc(name_len, GFP_NOFS);
687         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
688         btrfs_release_path(root, path);
689
690         inode = read_one_inode(root, location.objectid);
691         BUG_ON(!inode);
692
693         ret = link_to_fixup_dir(trans, root, path, location.objectid);
694         BUG_ON(ret);
695         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
696         BUG_ON(ret);
697         kfree(name);
698
699         iput(inode);
700         return ret;
701 }
702
703 /*
704  * helper function to see if a given name and sequence number found
705  * in an inode back reference are already in a directory and correctly
706  * point to this inode
707  */
708 static noinline int inode_in_dir(struct btrfs_root *root,
709                                  struct btrfs_path *path,
710                                  u64 dirid, u64 objectid, u64 index,
711                                  const char *name, int name_len)
712 {
713         struct btrfs_dir_item *di;
714         struct btrfs_key location;
715         int match = 0;
716
717         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
718                                          index, name, name_len, 0);
719         if (di && !IS_ERR(di)) {
720                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
721                 if (location.objectid != objectid)
722                         goto out;
723         } else
724                 goto out;
725         btrfs_release_path(root, path);
726
727         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
728         if (di && !IS_ERR(di)) {
729                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
730                 if (location.objectid != objectid)
731                         goto out;
732         } else
733                 goto out;
734         match = 1;
735 out:
736         btrfs_release_path(root, path);
737         return match;
738 }
739
740 /*
741  * helper function to check a log tree for a named back reference in
742  * an inode.  This is used to decide if a back reference that is
743  * found in the subvolume conflicts with what we find in the log.
744  *
745  * inode backreferences may have multiple refs in a single item,
746  * during replay we process one reference at a time, and we don't
747  * want to delete valid links to a file from the subvolume if that
748  * link is also in the log.
749  */
750 static noinline int backref_in_log(struct btrfs_root *log,
751                                    struct btrfs_key *key,
752                                    char *name, int namelen)
753 {
754         struct btrfs_path *path;
755         struct btrfs_inode_ref *ref;
756         unsigned long ptr;
757         unsigned long ptr_end;
758         unsigned long name_ptr;
759         int found_name_len;
760         int item_size;
761         int ret;
762         int match = 0;
763
764         path = btrfs_alloc_path();
765         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
766         if (ret != 0)
767                 goto out;
768
769         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
770         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
771         ptr_end = ptr + item_size;
772         while (ptr < ptr_end) {
773                 ref = (struct btrfs_inode_ref *)ptr;
774                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
775                 if (found_name_len == namelen) {
776                         name_ptr = (unsigned long)(ref + 1);
777                         ret = memcmp_extent_buffer(path->nodes[0], name,
778                                                    name_ptr, namelen);
779                         if (ret == 0) {
780                                 match = 1;
781                                 goto out;
782                         }
783                 }
784                 ptr = (unsigned long)(ref + 1) + found_name_len;
785         }
786 out:
787         btrfs_free_path(path);
788         return match;
789 }
790
791
792 /*
793  * replay one inode back reference item found in the log tree.
794  * eb, slot and key refer to the buffer and key found in the log tree.
795  * root is the destination we are replaying into, and path is for temp
796  * use by this function.  (it should be released on return).
797  */
798 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799                                   struct btrfs_root *root,
800                                   struct btrfs_root *log,
801                                   struct btrfs_path *path,
802                                   struct extent_buffer *eb, int slot,
803                                   struct btrfs_key *key)
804 {
805         struct inode *dir;
806         int ret;
807         struct btrfs_key location;
808         struct btrfs_inode_ref *ref;
809         struct btrfs_dir_item *di;
810         struct inode *inode;
811         char *name;
812         int namelen;
813         unsigned long ref_ptr;
814         unsigned long ref_end;
815
816         location.objectid = key->objectid;
817         location.type = BTRFS_INODE_ITEM_KEY;
818         location.offset = 0;
819
820         /*
821          * it is possible that we didn't log all the parent directories
822          * for a given inode.  If we don't find the dir, just don't
823          * copy the back ref in.  The link count fixup code will take
824          * care of the rest
825          */
826         dir = read_one_inode(root, key->offset);
827         if (!dir)
828                 return -ENOENT;
829
830         inode = read_one_inode(root, key->objectid);
831         BUG_ON(!dir);
832
833         ref_ptr = btrfs_item_ptr_offset(eb, slot);
834         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
835
836 again:
837         ref = (struct btrfs_inode_ref *)ref_ptr;
838
839         namelen = btrfs_inode_ref_name_len(eb, ref);
840         name = kmalloc(namelen, GFP_NOFS);
841         BUG_ON(!name);
842
843         read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
844
845         /* if we already have a perfect match, we're done */
846         if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
847                          btrfs_inode_ref_index(eb, ref),
848                          name, namelen)) {
849                 goto out;
850         }
851
852         /*
853          * look for a conflicting back reference in the metadata.
854          * if we find one we have to unlink that name of the file
855          * before we add our new link.  Later on, we overwrite any
856          * existing back reference, and we don't want to create
857          * dangling pointers in the directory.
858          */
859 conflict_again:
860         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
861         if (ret == 0) {
862                 char *victim_name;
863                 int victim_name_len;
864                 struct btrfs_inode_ref *victim_ref;
865                 unsigned long ptr;
866                 unsigned long ptr_end;
867                 struct extent_buffer *leaf = path->nodes[0];
868
869                 /* are we trying to overwrite a back ref for the root directory
870                  * if so, just jump out, we're done
871                  */
872                 if (key->objectid == key->offset)
873                         goto out_nowrite;
874
875                 /* check all the names in this back reference to see
876                  * if they are in the log.  if so, we allow them to stay
877                  * otherwise they must be unlinked as a conflict
878                  */
879                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
880                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
881                 while (ptr < ptr_end) {
882                         victim_ref = (struct btrfs_inode_ref *)ptr;
883                         victim_name_len = btrfs_inode_ref_name_len(leaf,
884                                                                    victim_ref);
885                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
886                         BUG_ON(!victim_name);
887
888                         read_extent_buffer(leaf, victim_name,
889                                            (unsigned long)(victim_ref + 1),
890                                            victim_name_len);
891
892                         if (!backref_in_log(log, key, victim_name,
893                                             victim_name_len)) {
894                                 btrfs_inc_nlink(inode);
895                                 btrfs_release_path(root, path);
896                                 ret = btrfs_unlink_inode(trans, root, dir,
897                                                          inode, victim_name,
898                                                          victim_name_len);
899                                 kfree(victim_name);
900                                 btrfs_release_path(root, path);
901                                 goto conflict_again;
902                         }
903                         kfree(victim_name);
904                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
905                 }
906                 BUG_ON(ret);
907         }
908         btrfs_release_path(root, path);
909
910         /* look for a conflicting sequence number */
911         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
912                                          btrfs_inode_ref_index(eb, ref),
913                                          name, namelen, 0);
914         if (di && !IS_ERR(di)) {
915                 ret = drop_one_dir_item(trans, root, path, dir, di);
916                 BUG_ON(ret);
917         }
918         btrfs_release_path(root, path);
919
920
921         /* look for a conflicting name */
922         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
923                                    name, namelen, 0);
924         if (di && !IS_ERR(di)) {
925                 ret = drop_one_dir_item(trans, root, path, dir, di);
926                 BUG_ON(ret);
927         }
928         btrfs_release_path(root, path);
929
930         /* insert our name */
931         ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
932                              btrfs_inode_ref_index(eb, ref));
933         BUG_ON(ret);
934
935         btrfs_update_inode(trans, root, inode);
936
937 out:
938         ref_ptr = (unsigned long)(ref + 1) + namelen;
939         kfree(name);
940         if (ref_ptr < ref_end)
941                 goto again;
942
943         /* finally write the back reference in the inode */
944         ret = overwrite_item(trans, root, path, eb, slot, key);
945         BUG_ON(ret);
946
947 out_nowrite:
948         btrfs_release_path(root, path);
949         iput(dir);
950         iput(inode);
951         return 0;
952 }
953
954 /*
955  * There are a few corners where the link count of the file can't
956  * be properly maintained during replay.  So, instead of adding
957  * lots of complexity to the log code, we just scan the backrefs
958  * for any file that has been through replay.
959  *
960  * The scan will update the link count on the inode to reflect the
961  * number of back refs found.  If it goes down to zero, the iput
962  * will free the inode.
963  */
964 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
965                                            struct btrfs_root *root,
966                                            struct inode *inode)
967 {
968         struct btrfs_path *path;
969         int ret;
970         struct btrfs_key key;
971         u64 nlink = 0;
972         unsigned long ptr;
973         unsigned long ptr_end;
974         int name_len;
975
976         key.objectid = inode->i_ino;
977         key.type = BTRFS_INODE_REF_KEY;
978         key.offset = (u64)-1;
979
980         path = btrfs_alloc_path();
981
982         while (1) {
983                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
984                 if (ret < 0)
985                         break;
986                 if (ret > 0) {
987                         if (path->slots[0] == 0)
988                                 break;
989                         path->slots[0]--;
990                 }
991                 btrfs_item_key_to_cpu(path->nodes[0], &key,
992                                       path->slots[0]);
993                 if (key.objectid != inode->i_ino ||
994                     key.type != BTRFS_INODE_REF_KEY)
995                         break;
996                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
997                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
998                                                    path->slots[0]);
999                 while (ptr < ptr_end) {
1000                         struct btrfs_inode_ref *ref;
1001
1002                         ref = (struct btrfs_inode_ref *)ptr;
1003                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1004                                                             ref);
1005                         ptr = (unsigned long)(ref + 1) + name_len;
1006                         nlink++;
1007                 }
1008
1009                 if (key.offset == 0)
1010                         break;
1011                 key.offset--;
1012                 btrfs_release_path(root, path);
1013         }
1014         btrfs_free_path(path);
1015         if (nlink != inode->i_nlink) {
1016                 inode->i_nlink = nlink;
1017                 btrfs_update_inode(trans, root, inode);
1018         }
1019         BTRFS_I(inode)->index_cnt = (u64)-1;
1020
1021         return 0;
1022 }
1023
1024 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1025                                             struct btrfs_root *root,
1026                                             struct btrfs_path *path)
1027 {
1028         int ret;
1029         struct btrfs_key key;
1030         struct inode *inode;
1031
1032         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1033         key.type = BTRFS_ORPHAN_ITEM_KEY;
1034         key.offset = (u64)-1;
1035         while (1) {
1036                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1037                 if (ret < 0)
1038                         break;
1039
1040                 if (ret == 1) {
1041                         if (path->slots[0] == 0)
1042                                 break;
1043                         path->slots[0]--;
1044                 }
1045
1046                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1047                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1048                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1049                         break;
1050
1051                 ret = btrfs_del_item(trans, root, path);
1052                 BUG_ON(ret);
1053
1054                 btrfs_release_path(root, path);
1055                 inode = read_one_inode(root, key.offset);
1056                 BUG_ON(!inode);
1057
1058                 ret = fixup_inode_link_count(trans, root, inode);
1059                 BUG_ON(ret);
1060
1061                 iput(inode);
1062
1063                 if (key.offset == 0)
1064                         break;
1065                 key.offset--;
1066         }
1067         btrfs_release_path(root, path);
1068         return 0;
1069 }
1070
1071
1072 /*
1073  * record a given inode in the fixup dir so we can check its link
1074  * count when replay is done.  The link count is incremented here
1075  * so the inode won't go away until we check it
1076  */
1077 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1078                                       struct btrfs_root *root,
1079                                       struct btrfs_path *path,
1080                                       u64 objectid)
1081 {
1082         struct btrfs_key key;
1083         int ret = 0;
1084         struct inode *inode;
1085
1086         inode = read_one_inode(root, objectid);
1087         BUG_ON(!inode);
1088
1089         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1090         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1091         key.offset = objectid;
1092
1093         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1094
1095         btrfs_release_path(root, path);
1096         if (ret == 0) {
1097                 btrfs_inc_nlink(inode);
1098                 btrfs_update_inode(trans, root, inode);
1099         } else if (ret == -EEXIST) {
1100                 ret = 0;
1101         } else {
1102                 BUG();
1103         }
1104         iput(inode);
1105
1106         return ret;
1107 }
1108
1109 /*
1110  * when replaying the log for a directory, we only insert names
1111  * for inodes that actually exist.  This means an fsync on a directory
1112  * does not implicitly fsync all the new files in it
1113  */
1114 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1115                                     struct btrfs_root *root,
1116                                     struct btrfs_path *path,
1117                                     u64 dirid, u64 index,
1118                                     char *name, int name_len, u8 type,
1119                                     struct btrfs_key *location)
1120 {
1121         struct inode *inode;
1122         struct inode *dir;
1123         int ret;
1124
1125         inode = read_one_inode(root, location->objectid);
1126         if (!inode)
1127                 return -ENOENT;
1128
1129         dir = read_one_inode(root, dirid);
1130         if (!dir) {
1131                 iput(inode);
1132                 return -EIO;
1133         }
1134         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1135
1136         /* FIXME, put inode into FIXUP list */
1137
1138         iput(inode);
1139         iput(dir);
1140         return ret;
1141 }
1142
1143 /*
1144  * take a single entry in a log directory item and replay it into
1145  * the subvolume.
1146  *
1147  * if a conflicting item exists in the subdirectory already,
1148  * the inode it points to is unlinked and put into the link count
1149  * fix up tree.
1150  *
1151  * If a name from the log points to a file or directory that does
1152  * not exist in the FS, it is skipped.  fsyncs on directories
1153  * do not force down inodes inside that directory, just changes to the
1154  * names or unlinks in a directory.
1155  */
1156 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1157                                     struct btrfs_root *root,
1158                                     struct btrfs_path *path,
1159                                     struct extent_buffer *eb,
1160                                     struct btrfs_dir_item *di,
1161                                     struct btrfs_key *key)
1162 {
1163         char *name;
1164         int name_len;
1165         struct btrfs_dir_item *dst_di;
1166         struct btrfs_key found_key;
1167         struct btrfs_key log_key;
1168         struct inode *dir;
1169         u8 log_type;
1170         int exists;
1171         int ret;
1172
1173         dir = read_one_inode(root, key->objectid);
1174         BUG_ON(!dir);
1175
1176         name_len = btrfs_dir_name_len(eb, di);
1177         name = kmalloc(name_len, GFP_NOFS);
1178         log_type = btrfs_dir_type(eb, di);
1179         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1180                    name_len);
1181
1182         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1183         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1184         if (exists == 0)
1185                 exists = 1;
1186         else
1187                 exists = 0;
1188         btrfs_release_path(root, path);
1189
1190         if (key->type == BTRFS_DIR_ITEM_KEY) {
1191                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1192                                        name, name_len, 1);
1193         } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1194                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1195                                                      key->objectid,
1196                                                      key->offset, name,
1197                                                      name_len, 1);
1198         } else {
1199                 BUG();
1200         }
1201         if (!dst_di || IS_ERR(dst_di)) {
1202                 /* we need a sequence number to insert, so we only
1203                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1204                  */
1205                 if (key->type != BTRFS_DIR_INDEX_KEY)
1206                         goto out;
1207                 goto insert;
1208         }
1209
1210         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1211         /* the existing item matches the logged item */
1212         if (found_key.objectid == log_key.objectid &&
1213             found_key.type == log_key.type &&
1214             found_key.offset == log_key.offset &&
1215             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1216                 goto out;
1217         }
1218
1219         /*
1220          * don't drop the conflicting directory entry if the inode
1221          * for the new entry doesn't exist
1222          */
1223         if (!exists)
1224                 goto out;
1225
1226         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1227         BUG_ON(ret);
1228
1229         if (key->type == BTRFS_DIR_INDEX_KEY)
1230                 goto insert;
1231 out:
1232         btrfs_release_path(root, path);
1233         kfree(name);
1234         iput(dir);
1235         return 0;
1236
1237 insert:
1238         btrfs_release_path(root, path);
1239         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1240                               name, name_len, log_type, &log_key);
1241
1242         if (ret && ret != -ENOENT)
1243                 BUG();
1244         goto out;
1245 }
1246
1247 /*
1248  * find all the names in a directory item and reconcile them into
1249  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1250  * one name in a directory item, but the same code gets used for
1251  * both directory index types
1252  */
1253 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1254                                         struct btrfs_root *root,
1255                                         struct btrfs_path *path,
1256                                         struct extent_buffer *eb, int slot,
1257                                         struct btrfs_key *key)
1258 {
1259         int ret;
1260         u32 item_size = btrfs_item_size_nr(eb, slot);
1261         struct btrfs_dir_item *di;
1262         int name_len;
1263         unsigned long ptr;
1264         unsigned long ptr_end;
1265
1266         ptr = btrfs_item_ptr_offset(eb, slot);
1267         ptr_end = ptr + item_size;
1268         while (ptr < ptr_end) {
1269                 di = (struct btrfs_dir_item *)ptr;
1270                 name_len = btrfs_dir_name_len(eb, di);
1271                 ret = replay_one_name(trans, root, path, eb, di, key);
1272                 BUG_ON(ret);
1273                 ptr = (unsigned long)(di + 1);
1274                 ptr += name_len;
1275         }
1276         return 0;
1277 }
1278
1279 /*
1280  * directory replay has two parts.  There are the standard directory
1281  * items in the log copied from the subvolume, and range items
1282  * created in the log while the subvolume was logged.
1283  *
1284  * The range items tell us which parts of the key space the log
1285  * is authoritative for.  During replay, if a key in the subvolume
1286  * directory is in a logged range item, but not actually in the log
1287  * that means it was deleted from the directory before the fsync
1288  * and should be removed.
1289  */
1290 static noinline int find_dir_range(struct btrfs_root *root,
1291                                    struct btrfs_path *path,
1292                                    u64 dirid, int key_type,
1293                                    u64 *start_ret, u64 *end_ret)
1294 {
1295         struct btrfs_key key;
1296         u64 found_end;
1297         struct btrfs_dir_log_item *item;
1298         int ret;
1299         int nritems;
1300
1301         if (*start_ret == (u64)-1)
1302                 return 1;
1303
1304         key.objectid = dirid;
1305         key.type = key_type;
1306         key.offset = *start_ret;
1307
1308         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1309         if (ret < 0)
1310                 goto out;
1311         if (ret > 0) {
1312                 if (path->slots[0] == 0)
1313                         goto out;
1314                 path->slots[0]--;
1315         }
1316         if (ret != 0)
1317                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1318
1319         if (key.type != key_type || key.objectid != dirid) {
1320                 ret = 1;
1321                 goto next;
1322         }
1323         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1324                               struct btrfs_dir_log_item);
1325         found_end = btrfs_dir_log_end(path->nodes[0], item);
1326
1327         if (*start_ret >= key.offset && *start_ret <= found_end) {
1328                 ret = 0;
1329                 *start_ret = key.offset;
1330                 *end_ret = found_end;
1331                 goto out;
1332         }
1333         ret = 1;
1334 next:
1335         /* check the next slot in the tree to see if it is a valid item */
1336         nritems = btrfs_header_nritems(path->nodes[0]);
1337         if (path->slots[0] >= nritems) {
1338                 ret = btrfs_next_leaf(root, path);
1339                 if (ret)
1340                         goto out;
1341         } else {
1342                 path->slots[0]++;
1343         }
1344
1345         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1346
1347         if (key.type != key_type || key.objectid != dirid) {
1348                 ret = 1;
1349                 goto out;
1350         }
1351         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1352                               struct btrfs_dir_log_item);
1353         found_end = btrfs_dir_log_end(path->nodes[0], item);
1354         *start_ret = key.offset;
1355         *end_ret = found_end;
1356         ret = 0;
1357 out:
1358         btrfs_release_path(root, path);
1359         return ret;
1360 }
1361
1362 /*
1363  * this looks for a given directory item in the log.  If the directory
1364  * item is not in the log, the item is removed and the inode it points
1365  * to is unlinked
1366  */
1367 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1368                                       struct btrfs_root *root,
1369                                       struct btrfs_root *log,
1370                                       struct btrfs_path *path,
1371                                       struct btrfs_path *log_path,
1372                                       struct inode *dir,
1373                                       struct btrfs_key *dir_key)
1374 {
1375         int ret;
1376         struct extent_buffer *eb;
1377         int slot;
1378         u32 item_size;
1379         struct btrfs_dir_item *di;
1380         struct btrfs_dir_item *log_di;
1381         int name_len;
1382         unsigned long ptr;
1383         unsigned long ptr_end;
1384         char *name;
1385         struct inode *inode;
1386         struct btrfs_key location;
1387
1388 again:
1389         eb = path->nodes[0];
1390         slot = path->slots[0];
1391         item_size = btrfs_item_size_nr(eb, slot);
1392         ptr = btrfs_item_ptr_offset(eb, slot);
1393         ptr_end = ptr + item_size;
1394         while (ptr < ptr_end) {
1395                 di = (struct btrfs_dir_item *)ptr;
1396                 name_len = btrfs_dir_name_len(eb, di);
1397                 name = kmalloc(name_len, GFP_NOFS);
1398                 if (!name) {
1399                         ret = -ENOMEM;
1400                         goto out;
1401                 }
1402                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1403                                   name_len);
1404                 log_di = NULL;
1405                 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1406                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
1407                                                        dir_key->objectid,
1408                                                        name, name_len, 0);
1409                 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1410                         log_di = btrfs_lookup_dir_index_item(trans, log,
1411                                                      log_path,
1412                                                      dir_key->objectid,
1413                                                      dir_key->offset,
1414                                                      name, name_len, 0);
1415                 }
1416                 if (!log_di || IS_ERR(log_di)) {
1417                         btrfs_dir_item_key_to_cpu(eb, di, &location);
1418                         btrfs_release_path(root, path);
1419                         btrfs_release_path(log, log_path);
1420                         inode = read_one_inode(root, location.objectid);
1421                         BUG_ON(!inode);
1422
1423                         ret = link_to_fixup_dir(trans, root,
1424                                                 path, location.objectid);
1425                         BUG_ON(ret);
1426                         btrfs_inc_nlink(inode);
1427                         ret = btrfs_unlink_inode(trans, root, dir, inode,
1428                                                  name, name_len);
1429                         BUG_ON(ret);
1430                         kfree(name);
1431                         iput(inode);
1432
1433                         /* there might still be more names under this key
1434                          * check and repeat if required
1435                          */
1436                         ret = btrfs_search_slot(NULL, root, dir_key, path,
1437                                                 0, 0);
1438                         if (ret == 0)
1439                                 goto again;
1440                         ret = 0;
1441                         goto out;
1442                 }
1443                 btrfs_release_path(log, log_path);
1444                 kfree(name);
1445
1446                 ptr = (unsigned long)(di + 1);
1447                 ptr += name_len;
1448         }
1449         ret = 0;
1450 out:
1451         btrfs_release_path(root, path);
1452         btrfs_release_path(log, log_path);
1453         return ret;
1454 }
1455
1456 /*
1457  * deletion replay happens before we copy any new directory items
1458  * out of the log or out of backreferences from inodes.  It
1459  * scans the log to find ranges of keys that log is authoritative for,
1460  * and then scans the directory to find items in those ranges that are
1461  * not present in the log.
1462  *
1463  * Anything we don't find in the log is unlinked and removed from the
1464  * directory.
1465  */
1466 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1467                                        struct btrfs_root *root,
1468                                        struct btrfs_root *log,
1469                                        struct btrfs_path *path,
1470                                        u64 dirid)
1471 {
1472         u64 range_start;
1473         u64 range_end;
1474         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1475         int ret = 0;
1476         struct btrfs_key dir_key;
1477         struct btrfs_key found_key;
1478         struct btrfs_path *log_path;
1479         struct inode *dir;
1480
1481         dir_key.objectid = dirid;
1482         dir_key.type = BTRFS_DIR_ITEM_KEY;
1483         log_path = btrfs_alloc_path();
1484         if (!log_path)
1485                 return -ENOMEM;
1486
1487         dir = read_one_inode(root, dirid);
1488         /* it isn't an error if the inode isn't there, that can happen
1489          * because we replay the deletes before we copy in the inode item
1490          * from the log
1491          */
1492         if (!dir) {
1493                 btrfs_free_path(log_path);
1494                 return 0;
1495         }
1496 again:
1497         range_start = 0;
1498         range_end = 0;
1499         while (1) {
1500                 ret = find_dir_range(log, path, dirid, key_type,
1501                                      &range_start, &range_end);
1502                 if (ret != 0)
1503                         break;
1504
1505                 dir_key.offset = range_start;
1506                 while (1) {
1507                         int nritems;
1508                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
1509                                                 0, 0);
1510                         if (ret < 0)
1511                                 goto out;
1512
1513                         nritems = btrfs_header_nritems(path->nodes[0]);
1514                         if (path->slots[0] >= nritems) {
1515                                 ret = btrfs_next_leaf(root, path);
1516                                 if (ret)
1517                                         break;
1518                         }
1519                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1520                                               path->slots[0]);
1521                         if (found_key.objectid != dirid ||
1522                             found_key.type != dir_key.type)
1523                                 goto next_type;
1524
1525                         if (found_key.offset > range_end)
1526                                 break;
1527
1528                         ret = check_item_in_log(trans, root, log, path,
1529                                                 log_path, dir, &found_key);
1530                         BUG_ON(ret);
1531                         if (found_key.offset == (u64)-1)
1532                                 break;
1533                         dir_key.offset = found_key.offset + 1;
1534                 }
1535                 btrfs_release_path(root, path);
1536                 if (range_end == (u64)-1)
1537                         break;
1538                 range_start = range_end + 1;
1539         }
1540
1541 next_type:
1542         ret = 0;
1543         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1544                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1545                 dir_key.type = BTRFS_DIR_INDEX_KEY;
1546                 btrfs_release_path(root, path);
1547                 goto again;
1548         }
1549 out:
1550         btrfs_release_path(root, path);
1551         btrfs_free_path(log_path);
1552         iput(dir);
1553         return ret;
1554 }
1555
1556 /*
1557  * the process_func used to replay items from the log tree.  This
1558  * gets called in two different stages.  The first stage just looks
1559  * for inodes and makes sure they are all copied into the subvolume.
1560  *
1561  * The second stage copies all the other item types from the log into
1562  * the subvolume.  The two stage approach is slower, but gets rid of
1563  * lots of complexity around inodes referencing other inodes that exist
1564  * only in the log (references come from either directory items or inode
1565  * back refs).
1566  */
1567 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1568                              struct walk_control *wc, u64 gen)
1569 {
1570         int nritems;
1571         struct btrfs_path *path;
1572         struct btrfs_root *root = wc->replay_dest;
1573         struct btrfs_key key;
1574         u32 item_size;
1575         int level;
1576         int i;
1577         int ret;
1578
1579         btrfs_read_buffer(eb, gen);
1580
1581         level = btrfs_header_level(eb);
1582
1583         if (level != 0)
1584                 return 0;
1585
1586         path = btrfs_alloc_path();
1587         BUG_ON(!path);
1588
1589         nritems = btrfs_header_nritems(eb);
1590         for (i = 0; i < nritems; i++) {
1591                 btrfs_item_key_to_cpu(eb, &key, i);
1592                 item_size = btrfs_item_size_nr(eb, i);
1593
1594                 /* inode keys are done during the first stage */
1595                 if (key.type == BTRFS_INODE_ITEM_KEY &&
1596                     wc->stage == LOG_WALK_REPLAY_INODES) {
1597                         struct inode *inode;
1598                         struct btrfs_inode_item *inode_item;
1599                         u32 mode;
1600
1601                         inode_item = btrfs_item_ptr(eb, i,
1602                                             struct btrfs_inode_item);
1603                         mode = btrfs_inode_mode(eb, inode_item);
1604                         if (S_ISDIR(mode)) {
1605                                 ret = replay_dir_deletes(wc->trans,
1606                                          root, log, path, key.objectid);
1607                                 BUG_ON(ret);
1608                         }
1609                         ret = overwrite_item(wc->trans, root, path,
1610                                              eb, i, &key);
1611                         BUG_ON(ret);
1612
1613                         /* for regular files, truncate away
1614                          * extents past the new EOF
1615                          */
1616                         if (S_ISREG(mode)) {
1617                                 inode = read_one_inode(root,
1618                                                        key.objectid);
1619                                 BUG_ON(!inode);
1620
1621                                 ret = btrfs_truncate_inode_items(wc->trans,
1622                                         root, inode, inode->i_size,
1623                                         BTRFS_EXTENT_DATA_KEY);
1624                                 BUG_ON(ret);
1625                                 iput(inode);
1626                         }
1627                         ret = link_to_fixup_dir(wc->trans, root,
1628                                                 path, key.objectid);
1629                         BUG_ON(ret);
1630                 }
1631                 if (wc->stage < LOG_WALK_REPLAY_ALL)
1632                         continue;
1633
1634                 /* these keys are simply copied */
1635                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1636                         ret = overwrite_item(wc->trans, root, path,
1637                                              eb, i, &key);
1638                         BUG_ON(ret);
1639                 } else if (key.type == BTRFS_INODE_REF_KEY) {
1640                         ret = add_inode_ref(wc->trans, root, log, path,
1641                                             eb, i, &key);
1642                         BUG_ON(ret && ret != -ENOENT);
1643                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1644                         ret = replay_one_extent(wc->trans, root, path,
1645                                                 eb, i, &key);
1646                         BUG_ON(ret);
1647                 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1648                            key.type == BTRFS_DIR_INDEX_KEY) {
1649                         ret = replay_one_dir_item(wc->trans, root, path,
1650                                                   eb, i, &key);
1651                         BUG_ON(ret);
1652                 }
1653         }
1654         btrfs_free_path(path);
1655         return 0;
1656 }
1657
1658 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1659                                    struct btrfs_root *root,
1660                                    struct btrfs_path *path, int *level,
1661                                    struct walk_control *wc)
1662 {
1663         u64 root_owner;
1664         u64 root_gen;
1665         u64 bytenr;
1666         u64 ptr_gen;
1667         struct extent_buffer *next;
1668         struct extent_buffer *cur;
1669         struct extent_buffer *parent;
1670         u32 blocksize;
1671         int ret = 0;
1672
1673         WARN_ON(*level < 0);
1674         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1675
1676         while (*level > 0) {
1677                 WARN_ON(*level < 0);
1678                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1679                 cur = path->nodes[*level];
1680
1681                 if (btrfs_header_level(cur) != *level)
1682                         WARN_ON(1);
1683
1684                 if (path->slots[*level] >=
1685                     btrfs_header_nritems(cur))
1686                         break;
1687
1688                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1689                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1690                 blocksize = btrfs_level_size(root, *level - 1);
1691
1692                 parent = path->nodes[*level];
1693                 root_owner = btrfs_header_owner(parent);
1694                 root_gen = btrfs_header_generation(parent);
1695
1696                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1697
1698                 wc->process_func(root, next, wc, ptr_gen);
1699
1700                 if (*level == 1) {
1701                         path->slots[*level]++;
1702                         if (wc->free) {
1703                                 btrfs_read_buffer(next, ptr_gen);
1704
1705                                 btrfs_tree_lock(next);
1706                                 clean_tree_block(trans, root, next);
1707                                 btrfs_wait_tree_block_writeback(next);
1708                                 btrfs_tree_unlock(next);
1709
1710                                 ret = btrfs_drop_leaf_ref(trans, root, next);
1711                                 BUG_ON(ret);
1712
1713                                 WARN_ON(root_owner !=
1714                                         BTRFS_TREE_LOG_OBJECTID);
1715                                 ret = btrfs_free_reserved_extent(root,
1716                                                          bytenr, blocksize);
1717                                 BUG_ON(ret);
1718                         }
1719                         free_extent_buffer(next);
1720                         continue;
1721                 }
1722                 btrfs_read_buffer(next, ptr_gen);
1723
1724                 WARN_ON(*level <= 0);
1725                 if (path->nodes[*level-1])
1726                         free_extent_buffer(path->nodes[*level-1]);
1727                 path->nodes[*level-1] = next;
1728                 *level = btrfs_header_level(next);
1729                 path->slots[*level] = 0;
1730                 cond_resched();
1731         }
1732         WARN_ON(*level < 0);
1733         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1734
1735         if (path->nodes[*level] == root->node)
1736                 parent = path->nodes[*level];
1737         else
1738                 parent = path->nodes[*level + 1];
1739
1740         bytenr = path->nodes[*level]->start;
1741
1742         blocksize = btrfs_level_size(root, *level);
1743         root_owner = btrfs_header_owner(parent);
1744         root_gen = btrfs_header_generation(parent);
1745
1746         wc->process_func(root, path->nodes[*level], wc,
1747                          btrfs_header_generation(path->nodes[*level]));
1748
1749         if (wc->free) {
1750                 next = path->nodes[*level];
1751                 btrfs_tree_lock(next);
1752                 clean_tree_block(trans, root, next);
1753                 btrfs_wait_tree_block_writeback(next);
1754                 btrfs_tree_unlock(next);
1755
1756                 if (*level == 0) {
1757                         ret = btrfs_drop_leaf_ref(trans, root, next);
1758                         BUG_ON(ret);
1759                 }
1760                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1761                 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1762                 BUG_ON(ret);
1763         }
1764         free_extent_buffer(path->nodes[*level]);
1765         path->nodes[*level] = NULL;
1766         *level += 1;
1767
1768         cond_resched();
1769         return 0;
1770 }
1771
1772 static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773                                  struct btrfs_root *root,
1774                                  struct btrfs_path *path, int *level,
1775                                  struct walk_control *wc)
1776 {
1777         u64 root_owner;
1778         u64 root_gen;
1779         int i;
1780         int slot;
1781         int ret;
1782
1783         for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1784                 slot = path->slots[i];
1785                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1786                         struct extent_buffer *node;
1787                         node = path->nodes[i];
1788                         path->slots[i]++;
1789                         *level = i;
1790                         WARN_ON(*level == 0);
1791                         return 0;
1792                 } else {
1793                         struct extent_buffer *parent;
1794                         if (path->nodes[*level] == root->node)
1795                                 parent = path->nodes[*level];
1796                         else
1797                                 parent = path->nodes[*level + 1];
1798
1799                         root_owner = btrfs_header_owner(parent);
1800                         root_gen = btrfs_header_generation(parent);
1801                         wc->process_func(root, path->nodes[*level], wc,
1802                                  btrfs_header_generation(path->nodes[*level]));
1803                         if (wc->free) {
1804                                 struct extent_buffer *next;
1805
1806                                 next = path->nodes[*level];
1807
1808                                 btrfs_tree_lock(next);
1809                                 clean_tree_block(trans, root, next);
1810                                 btrfs_wait_tree_block_writeback(next);
1811                                 btrfs_tree_unlock(next);
1812
1813                                 if (*level == 0) {
1814                                         ret = btrfs_drop_leaf_ref(trans, root,
1815                                                                   next);
1816                                         BUG_ON(ret);
1817                                 }
1818
1819                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1820                                 ret = btrfs_free_reserved_extent(root,
1821                                                 path->nodes[*level]->start,
1822                                                 path->nodes[*level]->len);
1823                                 BUG_ON(ret);
1824                         }
1825                         free_extent_buffer(path->nodes[*level]);
1826                         path->nodes[*level] = NULL;
1827                         *level = i + 1;
1828                 }
1829         }
1830         return 1;
1831 }
1832
1833 /*
1834  * drop the reference count on the tree rooted at 'snap'.  This traverses
1835  * the tree freeing any blocks that have a ref count of zero after being
1836  * decremented.
1837  */
1838 static int walk_log_tree(struct btrfs_trans_handle *trans,
1839                          struct btrfs_root *log, struct walk_control *wc)
1840 {
1841         int ret = 0;
1842         int wret;
1843         int level;
1844         struct btrfs_path *path;
1845         int i;
1846         int orig_level;
1847
1848         path = btrfs_alloc_path();
1849         BUG_ON(!path);
1850
1851         level = btrfs_header_level(log->node);
1852         orig_level = level;
1853         path->nodes[level] = log->node;
1854         extent_buffer_get(log->node);
1855         path->slots[level] = 0;
1856
1857         while (1) {
1858                 wret = walk_down_log_tree(trans, log, path, &level, wc);
1859                 if (wret > 0)
1860                         break;
1861                 if (wret < 0)
1862                         ret = wret;
1863
1864                 wret = walk_up_log_tree(trans, log, path, &level, wc);
1865                 if (wret > 0)
1866                         break;
1867                 if (wret < 0)
1868                         ret = wret;
1869         }
1870
1871         /* was the root node processed? if not, catch it here */
1872         if (path->nodes[orig_level]) {
1873                 wc->process_func(log, path->nodes[orig_level], wc,
1874                          btrfs_header_generation(path->nodes[orig_level]));
1875                 if (wc->free) {
1876                         struct extent_buffer *next;
1877
1878                         next = path->nodes[orig_level];
1879
1880                         btrfs_tree_lock(next);
1881                         clean_tree_block(trans, log, next);
1882                         btrfs_wait_tree_block_writeback(next);
1883                         btrfs_tree_unlock(next);
1884
1885                         if (orig_level == 0) {
1886                                 ret = btrfs_drop_leaf_ref(trans, log,
1887                                                           next);
1888                                 BUG_ON(ret);
1889                         }
1890                         WARN_ON(log->root_key.objectid !=
1891                                 BTRFS_TREE_LOG_OBJECTID);
1892                         ret = btrfs_free_reserved_extent(log, next->start,
1893                                                          next->len);
1894                         BUG_ON(ret);
1895                 }
1896         }
1897
1898         for (i = 0; i <= orig_level; i++) {
1899                 if (path->nodes[i]) {
1900                         free_extent_buffer(path->nodes[i]);
1901                         path->nodes[i] = NULL;
1902                 }
1903         }
1904         btrfs_free_path(path);
1905         if (wc->free)
1906                 free_extent_buffer(log->node);
1907         return ret;
1908 }
1909
1910 static int wait_log_commit(struct btrfs_root *log)
1911 {
1912         DEFINE_WAIT(wait);
1913         u64 transid = log->fs_info->tree_log_transid;
1914
1915         do {
1916                 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1917                                 TASK_UNINTERRUPTIBLE);
1918                 mutex_unlock(&log->fs_info->tree_log_mutex);
1919                 if (atomic_read(&log->fs_info->tree_log_commit))
1920                         schedule();
1921                 finish_wait(&log->fs_info->tree_log_wait, &wait);
1922                 mutex_lock(&log->fs_info->tree_log_mutex);
1923         } while (transid == log->fs_info->tree_log_transid &&
1924                 atomic_read(&log->fs_info->tree_log_commit));
1925         return 0;
1926 }
1927
1928 /*
1929  * btrfs_sync_log does sends a given tree log down to the disk and
1930  * updates the super blocks to record it.  When this call is done,
1931  * you know that any inodes previously logged are safely on disk
1932  */
1933 int btrfs_sync_log(struct btrfs_trans_handle *trans,
1934                    struct btrfs_root *root)
1935 {
1936         int ret;
1937         unsigned long batch;
1938         struct btrfs_root *log = root->log_root;
1939
1940         mutex_lock(&log->fs_info->tree_log_mutex);
1941         if (atomic_read(&log->fs_info->tree_log_commit)) {
1942                 wait_log_commit(log);
1943                 goto out;
1944         }
1945         atomic_set(&log->fs_info->tree_log_commit, 1);
1946
1947         while (1) {
1948                 batch = log->fs_info->tree_log_batch;
1949                 mutex_unlock(&log->fs_info->tree_log_mutex);
1950                 schedule_timeout_uninterruptible(1);
1951                 mutex_lock(&log->fs_info->tree_log_mutex);
1952
1953                 while (atomic_read(&log->fs_info->tree_log_writers)) {
1954                         DEFINE_WAIT(wait);
1955                         prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1956                                         TASK_UNINTERRUPTIBLE);
1957                         mutex_unlock(&log->fs_info->tree_log_mutex);
1958                         if (atomic_read(&log->fs_info->tree_log_writers))
1959                                 schedule();
1960                         mutex_lock(&log->fs_info->tree_log_mutex);
1961                         finish_wait(&log->fs_info->tree_log_wait, &wait);
1962                 }
1963                 if (batch == log->fs_info->tree_log_batch)
1964                         break;
1965         }
1966
1967         ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1968         BUG_ON(ret);
1969         ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1970                                &root->fs_info->log_root_tree->dirty_log_pages);
1971         BUG_ON(ret);
1972
1973         btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1974                                  log->fs_info->log_root_tree->node->start);
1975         btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
1976                        btrfs_header_level(log->fs_info->log_root_tree->node));
1977
1978         write_ctree_super(trans, log->fs_info->tree_root, 2);
1979         log->fs_info->tree_log_transid++;
1980         log->fs_info->tree_log_batch = 0;
1981         atomic_set(&log->fs_info->tree_log_commit, 0);
1982         smp_mb();
1983         if (waitqueue_active(&log->fs_info->tree_log_wait))
1984                 wake_up(&log->fs_info->tree_log_wait);
1985 out:
1986         mutex_unlock(&log->fs_info->tree_log_mutex);
1987         return 0;
1988 }
1989
1990 /* * free all the extents used by the tree log.  This should be called
1991  * at commit time of the full transaction
1992  */
1993 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
1994 {
1995         int ret;
1996         struct btrfs_root *log;
1997         struct key;
1998         u64 start;
1999         u64 end;
2000         struct walk_control wc = {
2001                 .free = 1,
2002                 .process_func = process_one_buffer
2003         };
2004
2005         if (!root->log_root || root->fs_info->log_root_recovering)
2006                 return 0;
2007
2008         log = root->log_root;
2009         ret = walk_log_tree(trans, log, &wc);
2010         BUG_ON(ret);
2011
2012         while (1) {
2013                 ret = find_first_extent_bit(&log->dirty_log_pages,
2014                                     0, &start, &end, EXTENT_DIRTY);
2015                 if (ret)
2016                         break;
2017
2018                 clear_extent_dirty(&log->dirty_log_pages,
2019                                    start, end, GFP_NOFS);
2020         }
2021
2022         log = root->log_root;
2023         ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2024                              &log->root_key);
2025         BUG_ON(ret);
2026         root->log_root = NULL;
2027         kfree(root->log_root);
2028         return 0;
2029 }
2030
2031 /*
2032  * helper function to update the item for a given subvolumes log root
2033  * in the tree of log roots
2034  */
2035 static int update_log_root(struct btrfs_trans_handle *trans,
2036                            struct btrfs_root *log)
2037 {
2038         u64 bytenr = btrfs_root_bytenr(&log->root_item);
2039         int ret;
2040
2041         if (log->node->start == bytenr)
2042                 return 0;
2043
2044         btrfs_set_root_bytenr(&log->root_item, log->node->start);
2045         btrfs_set_root_generation(&log->root_item, trans->transid);
2046         btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2047         ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2048                                 &log->root_key, &log->root_item);
2049         BUG_ON(ret);
2050         return ret;
2051 }
2052
2053 /*
2054  * If both a file and directory are logged, and unlinks or renames are
2055  * mixed in, we have a few interesting corners:
2056  *
2057  * create file X in dir Y
2058  * link file X to X.link in dir Y
2059  * fsync file X
2060  * unlink file X but leave X.link
2061  * fsync dir Y
2062  *
2063  * After a crash we would expect only X.link to exist.  But file X
2064  * didn't get fsync'd again so the log has back refs for X and X.link.
2065  *
2066  * We solve this by removing directory entries and inode backrefs from the
2067  * log when a file that was logged in the current transaction is
2068  * unlinked.  Any later fsync will include the updated log entries, and
2069  * we'll be able to reconstruct the proper directory items from backrefs.
2070  *
2071  * This optimizations allows us to avoid relogging the entire inode
2072  * or the entire directory.
2073  */
2074 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2075                                  struct btrfs_root *root,
2076                                  const char *name, int name_len,
2077                                  struct inode *dir, u64 index)
2078 {
2079         struct btrfs_root *log;
2080         struct btrfs_dir_item *di;
2081         struct btrfs_path *path;
2082         int ret;
2083         int bytes_del = 0;
2084
2085         if (BTRFS_I(dir)->logged_trans < trans->transid)
2086                 return 0;
2087
2088         ret = join_running_log_trans(root);
2089         if (ret)
2090                 return 0;
2091
2092         mutex_lock(&BTRFS_I(dir)->log_mutex);
2093
2094         log = root->log_root;
2095         path = btrfs_alloc_path();
2096         di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2097                                    name, name_len, -1);
2098         if (di && !IS_ERR(di)) {
2099                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2100                 bytes_del += name_len;
2101                 BUG_ON(ret);
2102         }
2103         btrfs_release_path(log, path);
2104         di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2105                                          index, name, name_len, -1);
2106         if (di && !IS_ERR(di)) {
2107                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2108                 bytes_del += name_len;
2109                 BUG_ON(ret);
2110         }
2111
2112         /* update the directory size in the log to reflect the names
2113          * we have removed
2114          */
2115         if (bytes_del) {
2116                 struct btrfs_key key;
2117
2118                 key.objectid = dir->i_ino;
2119                 key.offset = 0;
2120                 key.type = BTRFS_INODE_ITEM_KEY;
2121                 btrfs_release_path(log, path);
2122
2123                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2124                 if (ret == 0) {
2125                         struct btrfs_inode_item *item;
2126                         u64 i_size;
2127
2128                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2129                                               struct btrfs_inode_item);
2130                         i_size = btrfs_inode_size(path->nodes[0], item);
2131                         if (i_size > bytes_del)
2132                                 i_size -= bytes_del;
2133                         else
2134                                 i_size = 0;
2135                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2136                         btrfs_mark_buffer_dirty(path->nodes[0]);
2137                 } else
2138                         ret = 0;
2139                 btrfs_release_path(log, path);
2140         }
2141
2142         btrfs_free_path(path);
2143         mutex_unlock(&BTRFS_I(dir)->log_mutex);
2144         end_log_trans(root);
2145
2146         return 0;
2147 }
2148
2149 /* see comments for btrfs_del_dir_entries_in_log */
2150 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2151                                struct btrfs_root *root,
2152                                const char *name, int name_len,
2153                                struct inode *inode, u64 dirid)
2154 {
2155         struct btrfs_root *log;
2156         u64 index;
2157         int ret;
2158
2159         if (BTRFS_I(inode)->logged_trans < trans->transid)
2160                 return 0;
2161
2162         ret = join_running_log_trans(root);
2163         if (ret)
2164                 return 0;
2165         log = root->log_root;
2166         mutex_lock(&BTRFS_I(inode)->log_mutex);
2167
2168         ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2169                                   dirid, &index);
2170         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2171         end_log_trans(root);
2172
2173         return ret;
2174 }
2175
2176 /*
2177  * creates a range item in the log for 'dirid'.  first_offset and
2178  * last_offset tell us which parts of the key space the log should
2179  * be considered authoritative for.
2180  */
2181 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2182                                        struct btrfs_root *log,
2183                                        struct btrfs_path *path,
2184                                        int key_type, u64 dirid,
2185                                        u64 first_offset, u64 last_offset)
2186 {
2187         int ret;
2188         struct btrfs_key key;
2189         struct btrfs_dir_log_item *item;
2190
2191         key.objectid = dirid;
2192         key.offset = first_offset;
2193         if (key_type == BTRFS_DIR_ITEM_KEY)
2194                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2195         else
2196                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2197         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2198         BUG_ON(ret);
2199
2200         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201                               struct btrfs_dir_log_item);
2202         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2203         btrfs_mark_buffer_dirty(path->nodes[0]);
2204         btrfs_release_path(log, path);
2205         return 0;
2206 }
2207
2208 /*
2209  * log all the items included in the current transaction for a given
2210  * directory.  This also creates the range items in the log tree required
2211  * to replay anything deleted before the fsync
2212  */
2213 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2214                           struct btrfs_root *root, struct inode *inode,
2215                           struct btrfs_path *path,
2216                           struct btrfs_path *dst_path, int key_type,
2217                           u64 min_offset, u64 *last_offset_ret)
2218 {
2219         struct btrfs_key min_key;
2220         struct btrfs_key max_key;
2221         struct btrfs_root *log = root->log_root;
2222         struct extent_buffer *src;
2223         int ret;
2224         int i;
2225         int nritems;
2226         u64 first_offset = min_offset;
2227         u64 last_offset = (u64)-1;
2228
2229         log = root->log_root;
2230         max_key.objectid = inode->i_ino;
2231         max_key.offset = (u64)-1;
2232         max_key.type = key_type;
2233
2234         min_key.objectid = inode->i_ino;
2235         min_key.type = key_type;
2236         min_key.offset = min_offset;
2237
2238         path->keep_locks = 1;
2239
2240         ret = btrfs_search_forward(root, &min_key, &max_key,
2241                                    path, 0, trans->transid);
2242
2243         /*
2244          * we didn't find anything from this transaction, see if there
2245          * is anything at all
2246          */
2247         if (ret != 0 || min_key.objectid != inode->i_ino ||
2248             min_key.type != key_type) {
2249                 min_key.objectid = inode->i_ino;
2250                 min_key.type = key_type;
2251                 min_key.offset = (u64)-1;
2252                 btrfs_release_path(root, path);
2253                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2254                 if (ret < 0) {
2255                         btrfs_release_path(root, path);
2256                         return ret;
2257                 }
2258                 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2259
2260                 /* if ret == 0 there are items for this type,
2261                  * create a range to tell us the last key of this type.
2262                  * otherwise, there are no items in this directory after
2263                  * *min_offset, and we create a range to indicate that.
2264                  */
2265                 if (ret == 0) {
2266                         struct btrfs_key tmp;
2267                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2268                                               path->slots[0]);
2269                         if (key_type == tmp.type)
2270                                 first_offset = max(min_offset, tmp.offset) + 1;
2271                 }
2272                 goto done;
2273         }
2274
2275         /* go backward to find any previous key */
2276         ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2277         if (ret == 0) {
2278                 struct btrfs_key tmp;
2279                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2280                 if (key_type == tmp.type) {
2281                         first_offset = tmp.offset;
2282                         ret = overwrite_item(trans, log, dst_path,
2283                                              path->nodes[0], path->slots[0],
2284                                              &tmp);
2285                 }
2286         }
2287         btrfs_release_path(root, path);
2288
2289         /* find the first key from this transaction again */
2290         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2291         if (ret != 0) {
2292                 WARN_ON(1);
2293                 goto done;
2294         }
2295
2296         /*
2297          * we have a block from this transaction, log every item in it
2298          * from our directory
2299          */
2300         while (1) {
2301                 struct btrfs_key tmp;
2302                 src = path->nodes[0];
2303                 nritems = btrfs_header_nritems(src);
2304                 for (i = path->slots[0]; i < nritems; i++) {
2305                         btrfs_item_key_to_cpu(src, &min_key, i);
2306
2307                         if (min_key.objectid != inode->i_ino ||
2308                             min_key.type != key_type)
2309                                 goto done;
2310                         ret = overwrite_item(trans, log, dst_path, src, i,
2311                                              &min_key);
2312                         BUG_ON(ret);
2313                 }
2314                 path->slots[0] = nritems;
2315
2316                 /*
2317                  * look ahead to the next item and see if it is also
2318                  * from this directory and from this transaction
2319                  */
2320                 ret = btrfs_next_leaf(root, path);
2321                 if (ret == 1) {
2322                         last_offset = (u64)-1;
2323                         goto done;
2324                 }
2325                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2326                 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2327                         last_offset = (u64)-1;
2328                         goto done;
2329                 }
2330                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2331                         ret = overwrite_item(trans, log, dst_path,
2332                                              path->nodes[0], path->slots[0],
2333                                              &tmp);
2334
2335                         BUG_ON(ret);
2336                         last_offset = tmp.offset;
2337                         goto done;
2338                 }
2339         }
2340 done:
2341         *last_offset_ret = last_offset;
2342         btrfs_release_path(root, path);
2343         btrfs_release_path(log, dst_path);
2344
2345         /* insert the log range keys to indicate where the log is valid */
2346         ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2347                                  first_offset, last_offset);
2348         BUG_ON(ret);
2349         return 0;
2350 }
2351
2352 /*
2353  * logging directories is very similar to logging inodes, We find all the items
2354  * from the current transaction and write them to the log.
2355  *
2356  * The recovery code scans the directory in the subvolume, and if it finds a
2357  * key in the range logged that is not present in the log tree, then it means
2358  * that dir entry was unlinked during the transaction.
2359  *
2360  * In order for that scan to work, we must include one key smaller than
2361  * the smallest logged by this transaction and one key larger than the largest
2362  * key logged by this transaction.
2363  */
2364 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2365                           struct btrfs_root *root, struct inode *inode,
2366                           struct btrfs_path *path,
2367                           struct btrfs_path *dst_path)
2368 {
2369         u64 min_key;
2370         u64 max_key;
2371         int ret;
2372         int key_type = BTRFS_DIR_ITEM_KEY;
2373
2374 again:
2375         min_key = 0;
2376         max_key = 0;
2377         while (1) {
2378                 ret = log_dir_items(trans, root, inode, path,
2379                                     dst_path, key_type, min_key,
2380                                     &max_key);
2381                 BUG_ON(ret);
2382                 if (max_key == (u64)-1)
2383                         break;
2384                 min_key = max_key + 1;
2385         }
2386
2387         if (key_type == BTRFS_DIR_ITEM_KEY) {
2388                 key_type = BTRFS_DIR_INDEX_KEY;
2389                 goto again;
2390         }
2391         return 0;
2392 }
2393
2394 /*
2395  * a helper function to drop items from the log before we relog an
2396  * inode.  max_key_type indicates the highest item type to remove.
2397  * This cannot be run for file data extents because it does not
2398  * free the extents they point to.
2399  */
2400 static int drop_objectid_items(struct btrfs_trans_handle *trans,
2401                                   struct btrfs_root *log,
2402                                   struct btrfs_path *path,
2403                                   u64 objectid, int max_key_type)
2404 {
2405         int ret;
2406         struct btrfs_key key;
2407         struct btrfs_key found_key;
2408
2409         key.objectid = objectid;
2410         key.type = max_key_type;
2411         key.offset = (u64)-1;
2412
2413         while (1) {
2414                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2415
2416                 if (ret != 1)
2417                         break;
2418
2419                 if (path->slots[0] == 0)
2420                         break;
2421
2422                 path->slots[0]--;
2423                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2424                                       path->slots[0]);
2425
2426                 if (found_key.objectid != objectid)
2427                         break;
2428
2429                 ret = btrfs_del_item(trans, log, path);
2430                 BUG_ON(ret);
2431                 btrfs_release_path(log, path);
2432         }
2433         btrfs_release_path(log, path);
2434         return 0;
2435 }
2436
2437 static noinline int copy_items(struct btrfs_trans_handle *trans,
2438                                struct btrfs_root *log,
2439                                struct btrfs_path *dst_path,
2440                                struct extent_buffer *src,
2441                                int start_slot, int nr, int inode_only)
2442 {
2443         unsigned long src_offset;
2444         unsigned long dst_offset;
2445         struct btrfs_file_extent_item *extent;
2446         struct btrfs_inode_item *inode_item;
2447         int ret;
2448         struct btrfs_key *ins_keys;
2449         u32 *ins_sizes;
2450         char *ins_data;
2451         int i;
2452         struct list_head ordered_sums;
2453
2454         INIT_LIST_HEAD(&ordered_sums);
2455
2456         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2457                            nr * sizeof(u32), GFP_NOFS);
2458         ins_sizes = (u32 *)ins_data;
2459         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2460
2461         for (i = 0; i < nr; i++) {
2462                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2463                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2464         }
2465         ret = btrfs_insert_empty_items(trans, log, dst_path,
2466                                        ins_keys, ins_sizes, nr);
2467         BUG_ON(ret);
2468
2469         for (i = 0; i < nr; i++) {
2470                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2471                                                    dst_path->slots[0]);
2472
2473                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2474
2475                 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2476                                    src_offset, ins_sizes[i]);
2477
2478                 if (inode_only == LOG_INODE_EXISTS &&
2479                     ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2480                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
2481                                                     dst_path->slots[0],
2482                                                     struct btrfs_inode_item);
2483                         btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2484
2485                         /* set the generation to zero so the recover code
2486                          * can tell the difference between an logging
2487                          * just to say 'this inode exists' and a logging
2488                          * to say 'update this inode with these values'
2489                          */
2490                         btrfs_set_inode_generation(dst_path->nodes[0],
2491                                                    inode_item, 0);
2492                 }
2493                 /* take a reference on file data extents so that truncates
2494                  * or deletes of this inode don't have to relog the inode
2495                  * again
2496                  */
2497                 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2498                         int found_type;
2499                         extent = btrfs_item_ptr(src, start_slot + i,
2500                                                 struct btrfs_file_extent_item);
2501
2502                         found_type = btrfs_file_extent_type(src, extent);
2503                         if (found_type == BTRFS_FILE_EXTENT_REG ||
2504                             found_type == BTRFS_FILE_EXTENT_PREALLOC) {
2505                                 u64 ds = btrfs_file_extent_disk_bytenr(src,
2506                                                                    extent);
2507                                 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2508                                                                       extent);
2509                                 u64 cs = btrfs_file_extent_offset(src, extent);
2510                                 u64 cl = btrfs_file_extent_num_bytes(src,
2511                                                                      extent);;
2512                                 if (btrfs_file_extent_compression(src,
2513                                                                   extent)) {
2514                                         cs = 0;
2515                                         cl = dl;
2516                                 }
2517                                 /* ds == 0 is a hole */
2518                                 if (ds != 0) {
2519                                         ret = btrfs_inc_extent_ref(trans, log,
2520                                                    ds, dl,
2521                                                    dst_path->nodes[0]->start,
2522                                                    BTRFS_TREE_LOG_OBJECTID,
2523                                                    trans->transid,
2524                                                    ins_keys[i].objectid);
2525                                         BUG_ON(ret);
2526                                         ret = btrfs_lookup_csums_range(
2527                                                    log->fs_info->csum_root,
2528                                                    ds + cs, ds + cs + cl - 1,
2529                                                    &ordered_sums);
2530                                         BUG_ON(ret);
2531                                 }
2532                         }
2533                 }
2534                 dst_path->slots[0]++;
2535         }
2536
2537         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2538         btrfs_release_path(log, dst_path);
2539         kfree(ins_data);
2540
2541         /*
2542          * we have to do this after the loop above to avoid changing the
2543          * log tree while trying to change the log tree.
2544          */
2545         while (!list_empty(&ordered_sums)) {
2546                 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2547                                                    struct btrfs_ordered_sum,
2548                                                    list);
2549                 ret = btrfs_csum_file_blocks(trans, log, sums);
2550                 BUG_ON(ret);
2551                 list_del(&sums->list);
2552                 kfree(sums);
2553         }
2554         return 0;
2555 }
2556
2557 /* log a single inode in the tree log.
2558  * At least one parent directory for this inode must exist in the tree
2559  * or be logged already.
2560  *
2561  * Any items from this inode changed by the current transaction are copied
2562  * to the log tree.  An extra reference is taken on any extents in this
2563  * file, allowing us to avoid a whole pile of corner cases around logging
2564  * blocks that have been removed from the tree.
2565  *
2566  * See LOG_INODE_ALL and related defines for a description of what inode_only
2567  * does.
2568  *
2569  * This handles both files and directories.
2570  */
2571 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2572                              struct btrfs_root *root, struct inode *inode,
2573                              int inode_only)
2574 {
2575         struct btrfs_path *path;
2576         struct btrfs_path *dst_path;
2577         struct btrfs_key min_key;
2578         struct btrfs_key max_key;
2579         struct btrfs_root *log = root->log_root;
2580         struct extent_buffer *src = NULL;
2581         u32 size;
2582         int ret;
2583         int nritems;
2584         int ins_start_slot = 0;
2585         int ins_nr;
2586
2587         log = root->log_root;
2588
2589         path = btrfs_alloc_path();
2590         dst_path = btrfs_alloc_path();
2591
2592         min_key.objectid = inode->i_ino;
2593         min_key.type = BTRFS_INODE_ITEM_KEY;
2594         min_key.offset = 0;
2595
2596         max_key.objectid = inode->i_ino;
2597         if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2598                 max_key.type = BTRFS_XATTR_ITEM_KEY;
2599         else
2600                 max_key.type = (u8)-1;
2601         max_key.offset = (u64)-1;
2602
2603         /*
2604          * if this inode has already been logged and we're in inode_only
2605          * mode, we don't want to delete the things that have already
2606          * been written to the log.
2607          *
2608          * But, if the inode has been through an inode_only log,
2609          * the logged_trans field is not set.  This allows us to catch
2610          * any new names for this inode in the backrefs by logging it
2611          * again
2612          */
2613         if (inode_only == LOG_INODE_EXISTS &&
2614             BTRFS_I(inode)->logged_trans == trans->transid) {
2615                 btrfs_free_path(path);
2616                 btrfs_free_path(dst_path);
2617                 goto out;
2618         }
2619         mutex_lock(&BTRFS_I(inode)->log_mutex);
2620
2621         /*
2622          * a brute force approach to making sure we get the most uptodate
2623          * copies of everything.
2624          */
2625         if (S_ISDIR(inode->i_mode)) {
2626                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2627
2628                 if (inode_only == LOG_INODE_EXISTS)
2629                         max_key_type = BTRFS_XATTR_ITEM_KEY;
2630                 ret = drop_objectid_items(trans, log, path,
2631                                           inode->i_ino, max_key_type);
2632         } else {
2633                 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2634         }
2635         BUG_ON(ret);
2636         path->keep_locks = 1;
2637
2638         while (1) {
2639                 ins_nr = 0;
2640                 ret = btrfs_search_forward(root, &min_key, &max_key,
2641                                            path, 0, trans->transid);
2642                 if (ret != 0)
2643                         break;
2644 again:
2645                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2646                 if (min_key.objectid != inode->i_ino)
2647                         break;
2648                 if (min_key.type > max_key.type)
2649                         break;
2650
2651                 src = path->nodes[0];
2652                 size = btrfs_item_size_nr(src, path->slots[0]);
2653                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2654                         ins_nr++;
2655                         goto next_slot;
2656                 } else if (!ins_nr) {
2657                         ins_start_slot = path->slots[0];
2658                         ins_nr = 1;
2659                         goto next_slot;
2660                 }
2661
2662                 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2663                                  ins_nr, inode_only);
2664                 BUG_ON(ret);
2665                 ins_nr = 1;
2666                 ins_start_slot = path->slots[0];
2667 next_slot:
2668
2669                 nritems = btrfs_header_nritems(path->nodes[0]);
2670                 path->slots[0]++;
2671                 if (path->slots[0] < nritems) {
2672                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2673                                               path->slots[0]);
2674                         goto again;
2675                 }
2676                 if (ins_nr) {
2677                         ret = copy_items(trans, log, dst_path, src,
2678                                          ins_start_slot,
2679                                          ins_nr, inode_only);
2680                         BUG_ON(ret);
2681                         ins_nr = 0;
2682                 }
2683                 btrfs_release_path(root, path);
2684
2685                 if (min_key.offset < (u64)-1)
2686                         min_key.offset++;
2687                 else if (min_key.type < (u8)-1)
2688                         min_key.type++;
2689                 else if (min_key.objectid < (u64)-1)
2690                         min_key.objectid++;
2691                 else
2692                         break;
2693         }
2694         if (ins_nr) {
2695                 ret = copy_items(trans, log, dst_path, src,
2696                                  ins_start_slot,
2697                                  ins_nr, inode_only);
2698                 BUG_ON(ret);
2699                 ins_nr = 0;
2700         }
2701         WARN_ON(ins_nr);
2702         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2703                 btrfs_release_path(root, path);
2704                 btrfs_release_path(log, dst_path);
2705                 BTRFS_I(inode)->log_dirty_trans = 0;
2706                 ret = log_directory_changes(trans, root, inode, path, dst_path);
2707                 BUG_ON(ret);
2708         }
2709         BTRFS_I(inode)->logged_trans = trans->transid;
2710         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2711
2712         btrfs_free_path(path);
2713         btrfs_free_path(dst_path);
2714
2715         mutex_lock(&root->fs_info->tree_log_mutex);
2716         ret = update_log_root(trans, log);
2717         BUG_ON(ret);
2718         mutex_unlock(&root->fs_info->tree_log_mutex);
2719 out:
2720         return 0;
2721 }
2722
2723 int btrfs_log_inode(struct btrfs_trans_handle *trans,
2724                     struct btrfs_root *root, struct inode *inode,
2725                     int inode_only)
2726 {
2727         int ret;
2728
2729         start_log_trans(trans, root);
2730         ret = __btrfs_log_inode(trans, root, inode, inode_only);
2731         end_log_trans(root);
2732         return ret;
2733 }
2734
2735 /*
2736  * helper function around btrfs_log_inode to make sure newly created
2737  * parent directories also end up in the log.  A minimal inode and backref
2738  * only logging is done of any parent directories that are older than
2739  * the last committed transaction
2740  */
2741 int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2742                     struct btrfs_root *root, struct dentry *dentry)
2743 {
2744         int inode_only = LOG_INODE_ALL;
2745         struct super_block *sb;
2746         int ret;
2747
2748         start_log_trans(trans, root);
2749         sb = dentry->d_inode->i_sb;
2750         while (1) {
2751                 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2752                                         inode_only);
2753                 BUG_ON(ret);
2754                 inode_only = LOG_INODE_EXISTS;
2755
2756                 dentry = dentry->d_parent;
2757                 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2758                         break;
2759
2760                 if (BTRFS_I(dentry->d_inode)->generation <=
2761                     root->fs_info->last_trans_committed)
2762                         break;
2763         }
2764         end_log_trans(root);
2765         return 0;
2766 }
2767
2768 /*
2769  * it is not safe to log dentry if the chunk root has added new
2770  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
2771  * If this returns 1, you must commit the transaction to safely get your
2772  * data on disk.
2773  */
2774 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2775                           struct btrfs_root *root, struct dentry *dentry)
2776 {
2777         u64 gen;
2778         gen = root->fs_info->last_trans_new_blockgroup;
2779         if (gen > root->fs_info->last_trans_committed)
2780                 return 1;
2781         else
2782                 return btrfs_log_dentry(trans, root, dentry);
2783 }
2784
2785 /*
2786  * should be called during mount to recover any replay any log trees
2787  * from the FS
2788  */
2789 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2790 {
2791         int ret;
2792         struct btrfs_path *path;
2793         struct btrfs_trans_handle *trans;
2794         struct btrfs_key key;
2795         struct btrfs_key found_key;
2796         struct btrfs_key tmp_key;
2797         struct btrfs_root *log;
2798         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2799         u64 highest_inode;
2800         struct walk_control wc = {
2801                 .process_func = process_one_buffer,
2802                 .stage = 0,
2803         };
2804
2805         fs_info->log_root_recovering = 1;
2806         path = btrfs_alloc_path();
2807         BUG_ON(!path);
2808
2809         trans = btrfs_start_transaction(fs_info->tree_root, 1);
2810
2811         wc.trans = trans;
2812         wc.pin = 1;
2813
2814         walk_log_tree(trans, log_root_tree, &wc);
2815
2816 again:
2817         key.objectid = BTRFS_TREE_LOG_OBJECTID;
2818         key.offset = (u64)-1;
2819         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2820
2821         while (1) {
2822                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2823                 if (ret < 0)
2824                         break;
2825                 if (ret > 0) {
2826                         if (path->slots[0] == 0)
2827                                 break;
2828                         path->slots[0]--;
2829                 }
2830                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2831                                       path->slots[0]);
2832                 btrfs_release_path(log_root_tree, path);
2833                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2834                         break;
2835
2836                 log = btrfs_read_fs_root_no_radix(log_root_tree,
2837                                                   &found_key);
2838                 BUG_ON(!log);
2839
2840
2841                 tmp_key.objectid = found_key.offset;
2842                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2843                 tmp_key.offset = (u64)-1;
2844
2845                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2846                 BUG_ON(!wc.replay_dest);
2847
2848                 wc.replay_dest->log_root = log;
2849                 btrfs_record_root_in_trans(wc.replay_dest);
2850                 ret = walk_log_tree(trans, log, &wc);
2851                 BUG_ON(ret);
2852
2853                 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2854                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
2855                                                       path);
2856                         BUG_ON(ret);
2857                 }
2858                 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2859                 if (ret == 0) {
2860                         wc.replay_dest->highest_inode = highest_inode;
2861                         wc.replay_dest->last_inode_alloc = highest_inode;
2862                 }
2863
2864                 key.offset = found_key.offset - 1;
2865                 wc.replay_dest->log_root = NULL;
2866                 free_extent_buffer(log->node);
2867                 kfree(log);
2868
2869                 if (found_key.offset == 0)
2870                         break;
2871         }
2872         btrfs_release_path(log_root_tree, path);
2873
2874         /* step one is to pin it all, step two is to replay just inodes */
2875         if (wc.pin) {
2876                 wc.pin = 0;
2877                 wc.process_func = replay_one_buffer;
2878                 wc.stage = LOG_WALK_REPLAY_INODES;
2879                 goto again;
2880         }
2881         /* step three is to replay everything */
2882         if (wc.stage < LOG_WALK_REPLAY_ALL) {
2883                 wc.stage++;
2884                 goto again;
2885         }
2886
2887         btrfs_free_path(path);
2888
2889         free_extent_buffer(log_root_tree->node);
2890         log_root_tree->log_root = NULL;
2891         fs_info->log_root_recovering = 0;
2892
2893         /* step 4: commit the transaction, which also unpins the blocks */
2894         btrfs_commit_transaction(trans, fs_info->tree_root);
2895
2896         kfree(log_root_tree);
2897         return 0;
2898 }