]> pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/ext3/inode.c
[PATCH] ext3: remove unnecessary race then retry in ext3_get_block
[linux-2.6-omap-h63xx.git] / fs / ext3 / inode.c
1 /*
2  *  linux/fs/ext3/inode.c
3  *
4  * Copyright (C) 1992, 1993, 1994, 1995
5  * Remy Card (card@masi.ibp.fr)
6  * Laboratoire MASI - Institut Blaise Pascal
7  * Universite Pierre et Marie Curie (Paris VI)
8  *
9  *  from
10  *
11  *  linux/fs/minix/inode.c
12  *
13  *  Copyright (C) 1991, 1992  Linus Torvalds
14  *
15  *  Goal-directed block allocation by Stephen Tweedie
16  *      (sct@redhat.com), 1993, 1998
17  *  Big-endian to little-endian byte-swapping/bitmaps by
18  *        David S. Miller (davem@caip.rutgers.edu), 1995
19  *  64-bit file support on 64-bit platforms by Jakub Jelinek
20  *      (jj@sunsite.ms.mff.cuni.cz)
21  *
22  *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
23  */
24
25 #include <linux/module.h>
26 #include <linux/fs.h>
27 #include <linux/time.h>
28 #include <linux/ext3_jbd.h>
29 #include <linux/jbd.h>
30 #include <linux/smp_lock.h>
31 #include <linux/highuid.h>
32 #include <linux/pagemap.h>
33 #include <linux/quotaops.h>
34 #include <linux/string.h>
35 #include <linux/buffer_head.h>
36 #include <linux/writeback.h>
37 #include <linux/mpage.h>
38 #include <linux/uio.h>
39 #include "xattr.h"
40 #include "acl.h"
41
42 static int ext3_writepage_trans_blocks(struct inode *inode);
43
44 /*
45  * Test whether an inode is a fast symlink.
46  */
47 static inline int ext3_inode_is_fast_symlink(struct inode *inode)
48 {
49         int ea_blocks = EXT3_I(inode)->i_file_acl ?
50                 (inode->i_sb->s_blocksize >> 9) : 0;
51
52         return (S_ISLNK(inode->i_mode) &&
53                 inode->i_blocks - ea_blocks == 0);
54 }
55
56 /* The ext3 forget function must perform a revoke if we are freeing data
57  * which has been journaled.  Metadata (eg. indirect blocks) must be
58  * revoked in all cases. 
59  *
60  * "bh" may be NULL: a metadata block may have been freed from memory
61  * but there may still be a record of it in the journal, and that record
62  * still needs to be revoked.
63  */
64
65 int ext3_forget(handle_t *handle, int is_metadata,
66                        struct inode *inode, struct buffer_head *bh,
67                        int blocknr)
68 {
69         int err;
70
71         might_sleep();
72
73         BUFFER_TRACE(bh, "enter");
74
75         jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
76                   "data mode %lx\n",
77                   bh, is_metadata, inode->i_mode,
78                   test_opt(inode->i_sb, DATA_FLAGS));
79
80         /* Never use the revoke function if we are doing full data
81          * journaling: there is no need to, and a V1 superblock won't
82          * support it.  Otherwise, only skip the revoke on un-journaled
83          * data blocks. */
84
85         if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
86             (!is_metadata && !ext3_should_journal_data(inode))) {
87                 if (bh) {
88                         BUFFER_TRACE(bh, "call journal_forget");
89                         return ext3_journal_forget(handle, bh);
90                 }
91                 return 0;
92         }
93
94         /*
95          * data!=journal && (is_metadata || should_journal_data(inode))
96          */
97         BUFFER_TRACE(bh, "call ext3_journal_revoke");
98         err = ext3_journal_revoke(handle, blocknr, bh);
99         if (err)
100                 ext3_abort(inode->i_sb, __FUNCTION__,
101                            "error %d when attempting revoke", err);
102         BUFFER_TRACE(bh, "exit");
103         return err;
104 }
105
106 /*
107  * Work out how many blocks we need to progress with the next chunk of a
108  * truncate transaction.
109  */
110
111 static unsigned long blocks_for_truncate(struct inode *inode) 
112 {
113         unsigned long needed;
114
115         needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
116
117         /* Give ourselves just enough room to cope with inodes in which
118          * i_blocks is corrupt: we've seen disk corruptions in the past
119          * which resulted in random data in an inode which looked enough
120          * like a regular file for ext3 to try to delete it.  Things
121          * will go a bit crazy if that happens, but at least we should
122          * try not to panic the whole kernel. */
123         if (needed < 2)
124                 needed = 2;
125
126         /* But we need to bound the transaction so we don't overflow the
127          * journal. */
128         if (needed > EXT3_MAX_TRANS_DATA) 
129                 needed = EXT3_MAX_TRANS_DATA;
130
131         return EXT3_DATA_TRANS_BLOCKS + needed;
132 }
133
134 /* 
135  * Truncate transactions can be complex and absolutely huge.  So we need to
136  * be able to restart the transaction at a conventient checkpoint to make
137  * sure we don't overflow the journal.
138  *
139  * start_transaction gets us a new handle for a truncate transaction,
140  * and extend_transaction tries to extend the existing one a bit.  If
141  * extend fails, we need to propagate the failure up and restart the
142  * transaction in the top-level truncate loop. --sct 
143  */
144
145 static handle_t *start_transaction(struct inode *inode) 
146 {
147         handle_t *result;
148
149         result = ext3_journal_start(inode, blocks_for_truncate(inode));
150         if (!IS_ERR(result))
151                 return result;
152
153         ext3_std_error(inode->i_sb, PTR_ERR(result));
154         return result;
155 }
156
157 /*
158  * Try to extend this transaction for the purposes of truncation.
159  *
160  * Returns 0 if we managed to create more room.  If we can't create more
161  * room, and the transaction must be restarted we return 1.
162  */
163 static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
164 {
165         if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
166                 return 0;
167         if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
168                 return 0;
169         return 1;
170 }
171
172 /*
173  * Restart the transaction associated with *handle.  This does a commit,
174  * so before we call here everything must be consistently dirtied against
175  * this transaction.
176  */
177 static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
178 {
179         jbd_debug(2, "restarting handle %p\n", handle);
180         return ext3_journal_restart(handle, blocks_for_truncate(inode));
181 }
182
183 /*
184  * Called at the last iput() if i_nlink is zero.
185  */
186 void ext3_delete_inode (struct inode * inode)
187 {
188         handle_t *handle;
189
190         if (is_bad_inode(inode))
191                 goto no_delete;
192
193         handle = start_transaction(inode);
194         if (IS_ERR(handle)) {
195                 /* If we're going to skip the normal cleanup, we still
196                  * need to make sure that the in-core orphan linked list
197                  * is properly cleaned up. */
198                 ext3_orphan_del(NULL, inode);
199                 goto no_delete;
200         }
201
202         if (IS_SYNC(inode))
203                 handle->h_sync = 1;
204         inode->i_size = 0;
205         if (inode->i_blocks)
206                 ext3_truncate(inode);
207         /*
208          * Kill off the orphan record which ext3_truncate created.
209          * AKPM: I think this can be inside the above `if'.
210          * Note that ext3_orphan_del() has to be able to cope with the
211          * deletion of a non-existent orphan - this is because we don't
212          * know if ext3_truncate() actually created an orphan record.
213          * (Well, we could do this if we need to, but heck - it works)
214          */
215         ext3_orphan_del(handle, inode);
216         EXT3_I(inode)->i_dtime  = get_seconds();
217
218         /* 
219          * One subtle ordering requirement: if anything has gone wrong
220          * (transaction abort, IO errors, whatever), then we can still
221          * do these next steps (the fs will already have been marked as
222          * having errors), but we can't free the inode if the mark_dirty
223          * fails.  
224          */
225         if (ext3_mark_inode_dirty(handle, inode))
226                 /* If that failed, just do the required in-core inode clear. */
227                 clear_inode(inode);
228         else
229                 ext3_free_inode(handle, inode);
230         ext3_journal_stop(handle);
231         return;
232 no_delete:
233         clear_inode(inode);     /* We must guarantee clearing of inode... */
234 }
235
236 static int ext3_alloc_block (handle_t *handle,
237                         struct inode * inode, unsigned long goal, int *err)
238 {
239         unsigned long result;
240
241         result = ext3_new_block(handle, inode, goal, err);
242         return result;
243 }
244
245
246 typedef struct {
247         __le32  *p;
248         __le32  key;
249         struct buffer_head *bh;
250 } Indirect;
251
252 static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
253 {
254         p->key = *(p->p = v);
255         p->bh = bh;
256 }
257
258 static inline int verify_chain(Indirect *from, Indirect *to)
259 {
260         while (from <= to && from->key == *from->p)
261                 from++;
262         return (from > to);
263 }
264
265 /**
266  *      ext3_block_to_path - parse the block number into array of offsets
267  *      @inode: inode in question (we are only interested in its superblock)
268  *      @i_block: block number to be parsed
269  *      @offsets: array to store the offsets in
270  *      @boundary: set this non-zero if the referred-to block is likely to be
271  *             followed (on disk) by an indirect block.
272  *
273  *      To store the locations of file's data ext3 uses a data structure common
274  *      for UNIX filesystems - tree of pointers anchored in the inode, with
275  *      data blocks at leaves and indirect blocks in intermediate nodes.
276  *      This function translates the block number into path in that tree -
277  *      return value is the path length and @offsets[n] is the offset of
278  *      pointer to (n+1)th node in the nth one. If @block is out of range
279  *      (negative or too large) warning is printed and zero returned.
280  *
281  *      Note: function doesn't find node addresses, so no IO is needed. All
282  *      we need to know is the capacity of indirect blocks (taken from the
283  *      inode->i_sb).
284  */
285
286 /*
287  * Portability note: the last comparison (check that we fit into triple
288  * indirect block) is spelled differently, because otherwise on an
289  * architecture with 32-bit longs and 8Kb pages we might get into trouble
290  * if our filesystem had 8Kb blocks. We might use long long, but that would
291  * kill us on x86. Oh, well, at least the sign propagation does not matter -
292  * i_block would have to be negative in the very beginning, so we would not
293  * get there at all.
294  */
295
296 static int ext3_block_to_path(struct inode *inode,
297                         long i_block, int offsets[4], int *boundary)
298 {
299         int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
300         int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
301         const long direct_blocks = EXT3_NDIR_BLOCKS,
302                 indirect_blocks = ptrs,
303                 double_blocks = (1 << (ptrs_bits * 2));
304         int n = 0;
305         int final = 0;
306
307         if (i_block < 0) {
308                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
309         } else if (i_block < direct_blocks) {
310                 offsets[n++] = i_block;
311                 final = direct_blocks;
312         } else if ( (i_block -= direct_blocks) < indirect_blocks) {
313                 offsets[n++] = EXT3_IND_BLOCK;
314                 offsets[n++] = i_block;
315                 final = ptrs;
316         } else if ((i_block -= indirect_blocks) < double_blocks) {
317                 offsets[n++] = EXT3_DIND_BLOCK;
318                 offsets[n++] = i_block >> ptrs_bits;
319                 offsets[n++] = i_block & (ptrs - 1);
320                 final = ptrs;
321         } else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
322                 offsets[n++] = EXT3_TIND_BLOCK;
323                 offsets[n++] = i_block >> (ptrs_bits * 2);
324                 offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
325                 offsets[n++] = i_block & (ptrs - 1);
326                 final = ptrs;
327         } else {
328                 ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
329         }
330         if (boundary)
331                 *boundary = (i_block & (ptrs - 1)) == (final - 1);
332         return n;
333 }
334
335 /**
336  *      ext3_get_branch - read the chain of indirect blocks leading to data
337  *      @inode: inode in question
338  *      @depth: depth of the chain (1 - direct pointer, etc.)
339  *      @offsets: offsets of pointers in inode/indirect blocks
340  *      @chain: place to store the result
341  *      @err: here we store the error value
342  *
343  *      Function fills the array of triples <key, p, bh> and returns %NULL
344  *      if everything went OK or the pointer to the last filled triple
345  *      (incomplete one) otherwise. Upon the return chain[i].key contains
346  *      the number of (i+1)-th block in the chain (as it is stored in memory,
347  *      i.e. little-endian 32-bit), chain[i].p contains the address of that
348  *      number (it points into struct inode for i==0 and into the bh->b_data
349  *      for i>0) and chain[i].bh points to the buffer_head of i-th indirect
350  *      block for i>0 and NULL for i==0. In other words, it holds the block
351  *      numbers of the chain, addresses they were taken from (and where we can
352  *      verify that chain did not change) and buffer_heads hosting these
353  *      numbers.
354  *
355  *      Function stops when it stumbles upon zero pointer (absent block)
356  *              (pointer to last triple returned, *@err == 0)
357  *      or when it gets an IO error reading an indirect block
358  *              (ditto, *@err == -EIO)
359  *      or when it notices that chain had been changed while it was reading
360  *              (ditto, *@err == -EAGAIN)
361  *      or when it reads all @depth-1 indirect blocks successfully and finds
362  *      the whole chain, all way to the data (returns %NULL, *err == 0).
363  */
364 static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
365                                  Indirect chain[4], int *err)
366 {
367         struct super_block *sb = inode->i_sb;
368         Indirect *p = chain;
369         struct buffer_head *bh;
370
371         *err = 0;
372         /* i_data is not going away, no lock needed */
373         add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
374         if (!p->key)
375                 goto no_block;
376         while (--depth) {
377                 bh = sb_bread(sb, le32_to_cpu(p->key));
378                 if (!bh)
379                         goto failure;
380                 /* Reader: pointers */
381                 if (!verify_chain(chain, p))
382                         goto changed;
383                 add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
384                 /* Reader: end */
385                 if (!p->key)
386                         goto no_block;
387         }
388         return NULL;
389
390 changed:
391         brelse(bh);
392         *err = -EAGAIN;
393         goto no_block;
394 failure:
395         *err = -EIO;
396 no_block:
397         return p;
398 }
399
400 /**
401  *      ext3_find_near - find a place for allocation with sufficient locality
402  *      @inode: owner
403  *      @ind: descriptor of indirect block.
404  *
405  *      This function returns the prefered place for block allocation.
406  *      It is used when heuristic for sequential allocation fails.
407  *      Rules are:
408  *        + if there is a block to the left of our position - allocate near it.
409  *        + if pointer will live in indirect block - allocate near that block.
410  *        + if pointer will live in inode - allocate in the same
411  *          cylinder group. 
412  *
413  * In the latter case we colour the starting block by the callers PID to
414  * prevent it from clashing with concurrent allocations for a different inode
415  * in the same block group.   The PID is used here so that functionally related
416  * files will be close-by on-disk.
417  *
418  *      Caller must make sure that @ind is valid and will stay that way.
419  */
420
421 static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
422 {
423         struct ext3_inode_info *ei = EXT3_I(inode);
424         __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
425         __le32 *p;
426         unsigned long bg_start;
427         unsigned long colour;
428
429         /* Try to find previous block */
430         for (p = ind->p - 1; p >= start; p--)
431                 if (*p)
432                         return le32_to_cpu(*p);
433
434         /* No such thing, so let's try location of indirect block */
435         if (ind->bh)
436                 return ind->bh->b_blocknr;
437
438         /*
439          * It is going to be refered from inode itself? OK, just put it into
440          * the same cylinder group then.
441          */
442         bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
443                 le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
444         colour = (current->pid % 16) *
445                         (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
446         return bg_start + colour;
447 }
448
449 /**
450  *      ext3_find_goal - find a prefered place for allocation.
451  *      @inode: owner
452  *      @block:  block we want
453  *      @chain:  chain of indirect blocks
454  *      @partial: pointer to the last triple within a chain
455  *      @goal:  place to store the result.
456  *
457  *      Normally this function find the prefered place for block allocation,
458  *      stores it in *@goal and returns zero.
459  */
460
461 static unsigned long ext3_find_goal(struct inode *inode, long block,
462                 Indirect chain[4], Indirect *partial)
463 {
464         struct ext3_block_alloc_info *block_i =  EXT3_I(inode)->i_block_alloc_info;
465
466         /*
467          * try the heuristic for sequential allocation,
468          * failing that at least try to get decent locality.
469          */
470         if (block_i && (block == block_i->last_alloc_logical_block + 1)
471                 && (block_i->last_alloc_physical_block != 0)) {
472                 return block_i->last_alloc_physical_block + 1;
473         }
474
475         return ext3_find_near(inode, partial);
476 }
477
478 /**
479  *      ext3_alloc_branch - allocate and set up a chain of blocks.
480  *      @inode: owner
481  *      @num: depth of the chain (number of blocks to allocate)
482  *      @offsets: offsets (in the blocks) to store the pointers to next.
483  *      @branch: place to store the chain in.
484  *
485  *      This function allocates @num blocks, zeroes out all but the last one,
486  *      links them into chain and (if we are synchronous) writes them to disk.
487  *      In other words, it prepares a branch that can be spliced onto the
488  *      inode. It stores the information about that chain in the branch[], in
489  *      the same format as ext3_get_branch() would do. We are calling it after
490  *      we had read the existing part of chain and partial points to the last
491  *      triple of that (one with zero ->key). Upon the exit we have the same
492  *      picture as after the successful ext3_get_block(), excpet that in one
493  *      place chain is disconnected - *branch->p is still zero (we did not
494  *      set the last link), but branch->key contains the number that should
495  *      be placed into *branch->p to fill that gap.
496  *
497  *      If allocation fails we free all blocks we've allocated (and forget
498  *      their buffer_heads) and return the error value the from failed
499  *      ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
500  *      as described above and return 0.
501  */
502
503 static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
504                              int num,
505                              unsigned long goal,
506                              int *offsets,
507                              Indirect *branch)
508 {
509         int blocksize = inode->i_sb->s_blocksize;
510         int n = 0, keys = 0;
511         int err = 0;
512         int i;
513         int parent = ext3_alloc_block(handle, inode, goal, &err);
514
515         branch[0].key = cpu_to_le32(parent);
516         if (parent) {
517                 for (n = 1; n < num; n++) {
518                         struct buffer_head *bh;
519                         /* Allocate the next block */
520                         int nr = ext3_alloc_block(handle, inode, parent, &err);
521                         if (!nr)
522                                 break;
523                         branch[n].key = cpu_to_le32(nr);
524                         keys = n+1;
525
526                         /*
527                          * Get buffer_head for parent block, zero it out
528                          * and set the pointer to new one, then send
529                          * parent to disk.  
530                          */
531                         bh = sb_getblk(inode->i_sb, parent);
532                         branch[n].bh = bh;
533                         lock_buffer(bh);
534                         BUFFER_TRACE(bh, "call get_create_access");
535                         err = ext3_journal_get_create_access(handle, bh);
536                         if (err) {
537                                 unlock_buffer(bh);
538                                 brelse(bh);
539                                 break;
540                         }
541
542                         memset(bh->b_data, 0, blocksize);
543                         branch[n].p = (__le32*) bh->b_data + offsets[n];
544                         *branch[n].p = branch[n].key;
545                         BUFFER_TRACE(bh, "marking uptodate");
546                         set_buffer_uptodate(bh);
547                         unlock_buffer(bh);
548
549                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
550                         err = ext3_journal_dirty_metadata(handle, bh);
551                         if (err)
552                                 break;
553
554                         parent = nr;
555                 }
556         }
557         if (n == num)
558                 return 0;
559
560         /* Allocation failed, free what we already allocated */
561         for (i = 1; i < keys; i++) {
562                 BUFFER_TRACE(branch[i].bh, "call journal_forget");
563                 ext3_journal_forget(handle, branch[i].bh);
564         }
565         for (i = 0; i < keys; i++)
566                 ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
567         return err;
568 }
569
570 /**
571  *      ext3_splice_branch - splice the allocated branch onto inode.
572  *      @inode: owner
573  *      @block: (logical) number of block we are adding
574  *      @chain: chain of indirect blocks (with a missing link - see
575  *              ext3_alloc_branch)
576  *      @where: location of missing link
577  *      @num:   number of blocks we are adding
578  *
579  *      This function fills the missing link and does all housekeeping needed in
580  *      inode (->i_blocks, etc.). In case of success we end up with the full
581  *      chain to new block and return 0.
582  */
583
584 static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
585                               Indirect chain[4], Indirect *where, int num)
586 {
587         int i;
588         int err = 0;
589         struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;
590
591         /*
592          * If we're splicing into a [td]indirect block (as opposed to the
593          * inode) then we need to get write access to the [td]indirect block
594          * before the splice.
595          */
596         if (where->bh) {
597                 BUFFER_TRACE(where->bh, "get_write_access");
598                 err = ext3_journal_get_write_access(handle, where->bh);
599                 if (err)
600                         goto err_out;
601         }
602         /* That's it */
603
604         *where->p = where->key;
605
606         /*
607          * update the most recently allocated logical & physical block
608          * in i_block_alloc_info, to assist find the proper goal block for next
609          * allocation
610          */
611         if (block_i) {
612                 block_i->last_alloc_logical_block = block;
613                 block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
614         }
615
616         /* We are done with atomic stuff, now do the rest of housekeeping */
617
618         inode->i_ctime = CURRENT_TIME_SEC;
619         ext3_mark_inode_dirty(handle, inode);
620
621         /* had we spliced it onto indirect block? */
622         if (where->bh) {
623                 /*
624                  * akpm: If we spliced it onto an indirect block, we haven't
625                  * altered the inode.  Note however that if it is being spliced
626                  * onto an indirect block at the very end of the file (the
627                  * file is growing) then we *will* alter the inode to reflect
628                  * the new i_size.  But that is not done here - it is done in
629                  * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
630                  */
631                 jbd_debug(5, "splicing indirect only\n");
632                 BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
633                 err = ext3_journal_dirty_metadata(handle, where->bh);
634                 if (err) 
635                         goto err_out;
636         } else {
637                 /*
638                  * OK, we spliced it into the inode itself on a direct block.
639                  * Inode was dirtied above.
640                  */
641                 jbd_debug(5, "splicing direct\n");
642         }
643         return err;
644
645 err_out:
646         for (i = 1; i < num; i++) {
647                 BUFFER_TRACE(where[i].bh, "call journal_forget");
648                 ext3_journal_forget(handle, where[i].bh);
649         }
650         return err;
651 }
652
653 /*
654  * Allocation strategy is simple: if we have to allocate something, we will
655  * have to go the whole way to leaf. So let's do it before attaching anything
656  * to tree, set linkage between the newborn blocks, write them if sync is
657  * required, recheck the path, free and repeat if check fails, otherwise
658  * set the last missing link (that will protect us from any truncate-generated
659  * removals - all blocks on the path are immune now) and possibly force the
660  * write on the parent block.
661  * That has a nice additional property: no special recovery from the failed
662  * allocations is needed - we simply release blocks and do not touch anything
663  * reachable from inode.
664  *
665  * akpm: `handle' can be NULL if create == 0.
666  *
667  * The BKL may not be held on entry here.  Be sure to take it early.
668  */
669
670 static int
671 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
672                 struct buffer_head *bh_result, int create, int extend_disksize)
673 {
674         int err = -EIO;
675         int offsets[4];
676         Indirect chain[4];
677         Indirect *partial;
678         unsigned long goal;
679         int left;
680         int boundary = 0;
681         const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
682         struct ext3_inode_info *ei = EXT3_I(inode);
683
684         J_ASSERT(handle != NULL || create == 0);
685
686         if (depth == 0)
687                 goto out;
688
689         partial = ext3_get_branch(inode, depth, offsets, chain, &err);
690
691         /* Simplest case - block found, no allocation needed */
692         if (!partial) {
693                 clear_buffer_new(bh_result);
694                 goto got_it;
695         }
696
697         /* Next simple case - plain lookup or failed read of indirect block */
698         if (!create || err == -EIO)
699                 goto cleanup;
700
701         down(&ei->truncate_sem);
702
703         /*
704          * If the indirect block is missing while we are reading
705          * the chain(ext3_get_branch() returns -EAGAIN err), or
706          * if the chain has been changed after we grab the semaphore,
707          * (either because another process truncated this branch, or
708          * another get_block allocated this branch) re-grab the chain to see if
709          * the request block has been allocated or not.
710          *
711          * Since we already block the truncate/other get_block
712          * at this point, we will have the current copy of the chain when we
713          * splice the branch into the tree.
714          */
715         if (err == -EAGAIN || !verify_chain(chain, partial)) {
716                 while (partial > chain) {
717                         brelse(partial->bh);
718                         partial--;
719                 }
720                 partial = ext3_get_branch(inode, depth, offsets, chain, &err);
721                 if (!partial) {
722                         up(&ei->truncate_sem);
723                         if (err)
724                                 goto cleanup;
725                         clear_buffer_new(bh_result);
726                         goto got_it;
727                 }
728         }
729
730         /*
731          * Okay, we need to do block allocation.  Lazily initialize the block
732          * allocation info here if necessary
733         */
734         if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
735                 ext3_init_block_alloc_info(inode);
736
737         goal = ext3_find_goal(inode, iblock, chain, partial);
738
739         left = (chain + depth) - partial;
740
741         /*
742          * Block out ext3_truncate while we alter the tree
743          */
744         err = ext3_alloc_branch(handle, inode, left, goal,
745                                 offsets + (partial - chain), partial);
746
747         /*
748          * The ext3_splice_branch call will free and forget any buffers
749          * on the new chain if there is a failure, but that risks using
750          * up transaction credits, especially for bitmaps where the
751          * credits cannot be returned.  Can we handle this somehow?  We
752          * may need to return -EAGAIN upwards in the worst case.  --sct
753          */
754         if (!err)
755                 err = ext3_splice_branch(handle, inode, iblock, chain,
756                                          partial, left);
757         /*
758          * i_disksize growing is protected by truncate_sem.  Don't forget to
759          * protect it if you're about to implement concurrent
760          * ext3_get_block() -bzzz
761         */
762         if (!err && extend_disksize && inode->i_size > ei->i_disksize)
763                 ei->i_disksize = inode->i_size;
764         up(&ei->truncate_sem);
765         if (err)
766                 goto cleanup;
767
768         set_buffer_new(bh_result);
769 got_it:
770         map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
771         if (boundary)
772                 set_buffer_boundary(bh_result);
773         /* Clean up and exit */
774         partial = chain + depth - 1;    /* the whole chain */
775 cleanup:
776         while (partial > chain) {
777                 BUFFER_TRACE(partial->bh, "call brelse");
778                 brelse(partial->bh);
779                 partial--;
780         }
781         BUFFER_TRACE(bh_result, "returned");
782 out:
783         return err;
784 }
785
786 static int ext3_get_block(struct inode *inode, sector_t iblock,
787                         struct buffer_head *bh_result, int create)
788 {
789         handle_t *handle = NULL;
790         int ret;
791
792         if (create) {
793                 handle = ext3_journal_current_handle();
794                 J_ASSERT(handle != 0);
795         }
796         ret = ext3_get_block_handle(handle, inode, iblock,
797                                 bh_result, create, 1);
798         return ret;
799 }
800
801 #define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
802
803 static int
804 ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
805                 unsigned long max_blocks, struct buffer_head *bh_result,
806                 int create)
807 {
808         handle_t *handle = journal_current_handle();
809         int ret = 0;
810
811         if (!handle)
812                 goto get_block;         /* A read */
813
814         if (handle->h_transaction->t_state == T_LOCKED) {
815                 /*
816                  * Huge direct-io writes can hold off commits for long
817                  * periods of time.  Let this commit run.
818                  */
819                 ext3_journal_stop(handle);
820                 handle = ext3_journal_start(inode, DIO_CREDITS);
821                 if (IS_ERR(handle))
822                         ret = PTR_ERR(handle);
823                 goto get_block;
824         }
825
826         if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
827                 /*
828                  * Getting low on buffer credits...
829                  */
830                 ret = ext3_journal_extend(handle, DIO_CREDITS);
831                 if (ret > 0) {
832                         /*
833                          * Couldn't extend the transaction.  Start a new one.
834                          */
835                         ret = ext3_journal_restart(handle, DIO_CREDITS);
836                 }
837         }
838
839 get_block:
840         if (ret == 0)
841                 ret = ext3_get_block_handle(handle, inode, iblock,
842                                         bh_result, create, 0);
843         bh_result->b_size = (1 << inode->i_blkbits);
844         return ret;
845 }
846
847 static int ext3_writepages_get_block(struct inode *inode, sector_t iblock,
848                         struct buffer_head *bh, int create)
849 {
850         return ext3_direct_io_get_blocks(inode, iblock, 1, bh, create);
851 }
852
853 /*
854  * `handle' can be NULL if create is zero
855  */
856 struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
857                                 long block, int create, int * errp)
858 {
859         struct buffer_head dummy;
860         int fatal = 0, err;
861
862         J_ASSERT(handle != NULL || create == 0);
863
864         dummy.b_state = 0;
865         dummy.b_blocknr = -1000;
866         buffer_trace_init(&dummy.b_history);
867         *errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
868         if (!*errp && buffer_mapped(&dummy)) {
869                 struct buffer_head *bh;
870                 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
871                 if (buffer_new(&dummy)) {
872                         J_ASSERT(create != 0);
873                         J_ASSERT(handle != 0);
874
875                         /* Now that we do not always journal data, we
876                            should keep in mind whether this should
877                            always journal the new buffer as metadata.
878                            For now, regular file writes use
879                            ext3_get_block instead, so it's not a
880                            problem. */
881                         lock_buffer(bh);
882                         BUFFER_TRACE(bh, "call get_create_access");
883                         fatal = ext3_journal_get_create_access(handle, bh);
884                         if (!fatal && !buffer_uptodate(bh)) {
885                                 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
886                                 set_buffer_uptodate(bh);
887                         }
888                         unlock_buffer(bh);
889                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
890                         err = ext3_journal_dirty_metadata(handle, bh);
891                         if (!fatal)
892                                 fatal = err;
893                 } else {
894                         BUFFER_TRACE(bh, "not a new buffer");
895                 }
896                 if (fatal) {
897                         *errp = fatal;
898                         brelse(bh);
899                         bh = NULL;
900                 }
901                 return bh;
902         }
903         return NULL;
904 }
905
906 struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
907                                int block, int create, int *err)
908 {
909         struct buffer_head * bh;
910
911         bh = ext3_getblk(handle, inode, block, create, err);
912         if (!bh)
913                 return bh;
914         if (buffer_uptodate(bh))
915                 return bh;
916         ll_rw_block(READ, 1, &bh);
917         wait_on_buffer(bh);
918         if (buffer_uptodate(bh))
919                 return bh;
920         put_bh(bh);
921         *err = -EIO;
922         return NULL;
923 }
924
925 static int walk_page_buffers(   handle_t *handle,
926                                 struct buffer_head *head,
927                                 unsigned from,
928                                 unsigned to,
929                                 int *partial,
930                                 int (*fn)(      handle_t *handle,
931                                                 struct buffer_head *bh))
932 {
933         struct buffer_head *bh;
934         unsigned block_start, block_end;
935         unsigned blocksize = head->b_size;
936         int err, ret = 0;
937         struct buffer_head *next;
938
939         for (   bh = head, block_start = 0;
940                 ret == 0 && (bh != head || !block_start);
941                 block_start = block_end, bh = next)
942         {
943                 next = bh->b_this_page;
944                 block_end = block_start + blocksize;
945                 if (block_end <= from || block_start >= to) {
946                         if (partial && !buffer_uptodate(bh))
947                                 *partial = 1;
948                         continue;
949                 }
950                 err = (*fn)(handle, bh);
951                 if (!ret)
952                         ret = err;
953         }
954         return ret;
955 }
956
957 /*
958  * To preserve ordering, it is essential that the hole instantiation and
959  * the data write be encapsulated in a single transaction.  We cannot
960  * close off a transaction and start a new one between the ext3_get_block()
961  * and the commit_write().  So doing the journal_start at the start of
962  * prepare_write() is the right place.
963  *
964  * Also, this function can nest inside ext3_writepage() ->
965  * block_write_full_page(). In that case, we *know* that ext3_writepage()
966  * has generated enough buffer credits to do the whole page.  So we won't
967  * block on the journal in that case, which is good, because the caller may
968  * be PF_MEMALLOC.
969  *
970  * By accident, ext3 can be reentered when a transaction is open via
971  * quota file writes.  If we were to commit the transaction while thus
972  * reentered, there can be a deadlock - we would be holding a quota
973  * lock, and the commit would never complete if another thread had a
974  * transaction open and was blocking on the quota lock - a ranking
975  * violation.
976  *
977  * So what we do is to rely on the fact that journal_stop/journal_start
978  * will _not_ run commit under these circumstances because handle->h_ref
979  * is elevated.  We'll still have enough credits for the tiny quotafile
980  * write.  
981  */
982
983 static int do_journal_get_write_access(handle_t *handle, 
984                                        struct buffer_head *bh)
985 {
986         if (!buffer_mapped(bh) || buffer_freed(bh))
987                 return 0;
988         return ext3_journal_get_write_access(handle, bh);
989 }
990
991 static int ext3_prepare_write(struct file *file, struct page *page,
992                               unsigned from, unsigned to)
993 {
994         struct inode *inode = page->mapping->host;
995         int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
996         handle_t *handle;
997         int retries = 0;
998
999 retry:
1000         handle = ext3_journal_start(inode, needed_blocks);
1001         if (IS_ERR(handle)) {
1002                 ret = PTR_ERR(handle);
1003                 goto out;
1004         }
1005         if (test_opt(inode->i_sb, NOBH))
1006                 ret = nobh_prepare_write(page, from, to, ext3_get_block);
1007         else
1008                 ret = block_prepare_write(page, from, to, ext3_get_block);
1009         if (ret)
1010                 goto prepare_write_failed;
1011
1012         if (ext3_should_journal_data(inode)) {
1013                 ret = walk_page_buffers(handle, page_buffers(page),
1014                                 from, to, NULL, do_journal_get_write_access);
1015         }
1016 prepare_write_failed:
1017         if (ret)
1018                 ext3_journal_stop(handle);
1019         if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
1020                 goto retry;
1021 out:
1022         return ret;
1023 }
1024
1025 int
1026 ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
1027 {
1028         int err = journal_dirty_data(handle, bh);
1029         if (err)
1030                 ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
1031                                                 bh, handle,err);
1032         return err;
1033 }
1034
1035 /* For commit_write() in data=journal mode */
1036 static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
1037 {
1038         if (!buffer_mapped(bh) || buffer_freed(bh))
1039                 return 0;
1040         set_buffer_uptodate(bh);
1041         return ext3_journal_dirty_metadata(handle, bh);
1042 }
1043
1044 /*
1045  * We need to pick up the new inode size which generic_commit_write gave us
1046  * `file' can be NULL - eg, when called from page_symlink().
1047  *
1048  * ext3 never places buffers on inode->i_mapping->private_list.  metadata
1049  * buffers are managed internally.
1050  */
1051
1052 static int ext3_ordered_commit_write(struct file *file, struct page *page,
1053                              unsigned from, unsigned to)
1054 {
1055         handle_t *handle = ext3_journal_current_handle();
1056         struct inode *inode = page->mapping->host;
1057         int ret = 0, ret2;
1058
1059         ret = walk_page_buffers(handle, page_buffers(page),
1060                 from, to, NULL, ext3_journal_dirty_data);
1061
1062         if (ret == 0) {
1063                 /*
1064                  * generic_commit_write() will run mark_inode_dirty() if i_size
1065                  * changes.  So let's piggyback the i_disksize mark_inode_dirty
1066                  * into that.
1067                  */
1068                 loff_t new_i_size;
1069
1070                 new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1071                 if (new_i_size > EXT3_I(inode)->i_disksize)
1072                         EXT3_I(inode)->i_disksize = new_i_size;
1073                 ret = generic_commit_write(file, page, from, to);
1074         }
1075         ret2 = ext3_journal_stop(handle);
1076         if (!ret)
1077                 ret = ret2;
1078         return ret;
1079 }
1080
1081 static int ext3_writeback_commit_write(struct file *file, struct page *page,
1082                              unsigned from, unsigned to)
1083 {
1084         handle_t *handle = ext3_journal_current_handle();
1085         struct inode *inode = page->mapping->host;
1086         int ret = 0, ret2;
1087         loff_t new_i_size;
1088
1089         new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1090         if (new_i_size > EXT3_I(inode)->i_disksize)
1091                 EXT3_I(inode)->i_disksize = new_i_size;
1092
1093         if (test_opt(inode->i_sb, NOBH))
1094                 ret = nobh_commit_write(file, page, from, to);
1095         else
1096                 ret = generic_commit_write(file, page, from, to);
1097
1098         ret2 = ext3_journal_stop(handle);
1099         if (!ret)
1100                 ret = ret2;
1101         return ret;
1102 }
1103
1104 static int ext3_journalled_commit_write(struct file *file,
1105                         struct page *page, unsigned from, unsigned to)
1106 {
1107         handle_t *handle = ext3_journal_current_handle();
1108         struct inode *inode = page->mapping->host;
1109         int ret = 0, ret2;
1110         int partial = 0;
1111         loff_t pos;
1112
1113         /*
1114          * Here we duplicate the generic_commit_write() functionality
1115          */
1116         pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1117
1118         ret = walk_page_buffers(handle, page_buffers(page), from,
1119                                 to, &partial, commit_write_fn);
1120         if (!partial)
1121                 SetPageUptodate(page);
1122         if (pos > inode->i_size)
1123                 i_size_write(inode, pos);
1124         EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1125         if (inode->i_size > EXT3_I(inode)->i_disksize) {
1126                 EXT3_I(inode)->i_disksize = inode->i_size;
1127                 ret2 = ext3_mark_inode_dirty(handle, inode);
1128                 if (!ret) 
1129                         ret = ret2;
1130         }
1131         ret2 = ext3_journal_stop(handle);
1132         if (!ret)
1133                 ret = ret2;
1134         return ret;
1135 }
1136
1137 /* 
1138  * bmap() is special.  It gets used by applications such as lilo and by
1139  * the swapper to find the on-disk block of a specific piece of data.
1140  *
1141  * Naturally, this is dangerous if the block concerned is still in the
1142  * journal.  If somebody makes a swapfile on an ext3 data-journaling
1143  * filesystem and enables swap, then they may get a nasty shock when the
1144  * data getting swapped to that swapfile suddenly gets overwritten by
1145  * the original zero's written out previously to the journal and
1146  * awaiting writeback in the kernel's buffer cache. 
1147  *
1148  * So, if we see any bmap calls here on a modified, data-journaled file,
1149  * take extra steps to flush any blocks which might be in the cache. 
1150  */
1151 static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
1152 {
1153         struct inode *inode = mapping->host;
1154         journal_t *journal;
1155         int err;
1156
1157         if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
1158                 /* 
1159                  * This is a REALLY heavyweight approach, but the use of
1160                  * bmap on dirty files is expected to be extremely rare:
1161                  * only if we run lilo or swapon on a freshly made file
1162                  * do we expect this to happen. 
1163                  *
1164                  * (bmap requires CAP_SYS_RAWIO so this does not
1165                  * represent an unprivileged user DOS attack --- we'd be
1166                  * in trouble if mortal users could trigger this path at
1167                  * will.) 
1168                  *
1169                  * NB. EXT3_STATE_JDATA is not set on files other than
1170                  * regular files.  If somebody wants to bmap a directory
1171                  * or symlink and gets confused because the buffer
1172                  * hasn't yet been flushed to disk, they deserve
1173                  * everything they get.
1174                  */
1175
1176                 EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
1177                 journal = EXT3_JOURNAL(inode);
1178                 journal_lock_updates(journal);
1179                 err = journal_flush(journal);
1180                 journal_unlock_updates(journal);
1181
1182                 if (err)
1183                         return 0;
1184         }
1185
1186         return generic_block_bmap(mapping,block,ext3_get_block);
1187 }
1188
1189 static int bget_one(handle_t *handle, struct buffer_head *bh)
1190 {
1191         get_bh(bh);
1192         return 0;
1193 }
1194
1195 static int bput_one(handle_t *handle, struct buffer_head *bh)
1196 {
1197         put_bh(bh);
1198         return 0;
1199 }
1200
1201 static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
1202 {
1203         if (buffer_mapped(bh))
1204                 return ext3_journal_dirty_data(handle, bh);
1205         return 0;
1206 }
1207
1208 /*
1209  * Note that we always start a transaction even if we're not journalling
1210  * data.  This is to preserve ordering: any hole instantiation within
1211  * __block_write_full_page -> ext3_get_block() should be journalled
1212  * along with the data so we don't crash and then get metadata which
1213  * refers to old data.
1214  *
1215  * In all journalling modes block_write_full_page() will start the I/O.
1216  *
1217  * Problem:
1218  *
1219  *      ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
1220  *              ext3_writepage()
1221  *
1222  * Similar for:
1223  *
1224  *      ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
1225  *
1226  * Same applies to ext3_get_block().  We will deadlock on various things like
1227  * lock_journal and i_truncate_sem.
1228  *
1229  * Setting PF_MEMALLOC here doesn't work - too many internal memory
1230  * allocations fail.
1231  *
1232  * 16May01: If we're reentered then journal_current_handle() will be
1233  *          non-zero. We simply *return*.
1234  *
1235  * 1 July 2001: @@@ FIXME:
1236  *   In journalled data mode, a data buffer may be metadata against the
1237  *   current transaction.  But the same file is part of a shared mapping
1238  *   and someone does a writepage() on it.
1239  *
1240  *   We will move the buffer onto the async_data list, but *after* it has
1241  *   been dirtied. So there's a small window where we have dirty data on
1242  *   BJ_Metadata.
1243  *
1244  *   Note that this only applies to the last partial page in the file.  The
1245  *   bit which block_write_full_page() uses prepare/commit for.  (That's
1246  *   broken code anyway: it's wrong for msync()).
1247  *
1248  *   It's a rare case: affects the final partial page, for journalled data
1249  *   where the file is subject to bith write() and writepage() in the same
1250  *   transction.  To fix it we'll need a custom block_write_full_page().
1251  *   We'll probably need that anyway for journalling writepage() output.
1252  *
1253  * We don't honour synchronous mounts for writepage().  That would be
1254  * disastrous.  Any write() or metadata operation will sync the fs for
1255  * us.
1256  *
1257  * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
1258  * we don't need to open a transaction here.
1259  */
1260 static int ext3_ordered_writepage(struct page *page,
1261                         struct writeback_control *wbc)
1262 {
1263         struct inode *inode = page->mapping->host;
1264         struct buffer_head *page_bufs;
1265         handle_t *handle = NULL;
1266         int ret = 0;
1267         int err;
1268
1269         J_ASSERT(PageLocked(page));
1270
1271         /*
1272          * We give up here if we're reentered, because it might be for a
1273          * different filesystem.
1274          */
1275         if (ext3_journal_current_handle())
1276                 goto out_fail;
1277
1278         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1279
1280         if (IS_ERR(handle)) {
1281                 ret = PTR_ERR(handle);
1282                 goto out_fail;
1283         }
1284
1285         if (!page_has_buffers(page)) {
1286                 create_empty_buffers(page, inode->i_sb->s_blocksize,
1287                                 (1 << BH_Dirty)|(1 << BH_Uptodate));
1288         }
1289         page_bufs = page_buffers(page);
1290         walk_page_buffers(handle, page_bufs, 0,
1291                         PAGE_CACHE_SIZE, NULL, bget_one);
1292
1293         ret = block_write_full_page(page, ext3_get_block, wbc);
1294
1295         /*
1296          * The page can become unlocked at any point now, and
1297          * truncate can then come in and change things.  So we
1298          * can't touch *page from now on.  But *page_bufs is
1299          * safe due to elevated refcount.
1300          */
1301
1302         /*
1303          * And attach them to the current transaction.  But only if 
1304          * block_write_full_page() succeeded.  Otherwise they are unmapped,
1305          * and generally junk.
1306          */
1307         if (ret == 0) {
1308                 err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
1309                                         NULL, journal_dirty_data_fn);
1310                 if (!ret)
1311                         ret = err;
1312         }
1313         walk_page_buffers(handle, page_bufs, 0,
1314                         PAGE_CACHE_SIZE, NULL, bput_one);
1315         err = ext3_journal_stop(handle);
1316         if (!ret)
1317                 ret = err;
1318         return ret;
1319
1320 out_fail:
1321         redirty_page_for_writepage(wbc, page);
1322         unlock_page(page);
1323         return ret;
1324 }
1325
1326 static int
1327 ext3_writeback_writepage_helper(struct page *page,
1328                                 struct writeback_control *wbc)
1329 {
1330         return block_write_full_page(page, ext3_get_block, wbc);
1331 }
1332
1333 static int
1334 ext3_writeback_writepages(struct address_space *mapping,
1335                                 struct writeback_control *wbc)
1336 {
1337         struct inode *inode = mapping->host;
1338         handle_t *handle = NULL;
1339         int err, ret = 0;
1340
1341         if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
1342                 return ret;
1343
1344         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1345         if (IS_ERR(handle)) {
1346                 ret = PTR_ERR(handle);
1347                 return ret;
1348         }
1349
1350         ret = __mpage_writepages(mapping, wbc, ext3_writepages_get_block,
1351                                         ext3_writeback_writepage_helper);
1352
1353         /*
1354          * Need to reaquire the handle since ext3_writepages_get_block()
1355          * can restart the handle
1356          */
1357         handle = journal_current_handle();
1358
1359         err = ext3_journal_stop(handle);
1360         if (!ret)
1361                 ret = err;
1362         return ret;
1363 }
1364
1365 static int ext3_writeback_writepage(struct page *page,
1366                                 struct writeback_control *wbc)
1367 {
1368         struct inode *inode = page->mapping->host;
1369         handle_t *handle = NULL;
1370         int ret = 0;
1371         int err;
1372
1373         if (ext3_journal_current_handle())
1374                 goto out_fail;
1375
1376         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1377         if (IS_ERR(handle)) {
1378                 ret = PTR_ERR(handle);
1379                 goto out_fail;
1380         }
1381
1382         if (test_opt(inode->i_sb, NOBH))
1383                 ret = nobh_writepage(page, ext3_get_block, wbc);
1384         else
1385                 ret = block_write_full_page(page, ext3_get_block, wbc);
1386
1387         err = ext3_journal_stop(handle);
1388         if (!ret)
1389                 ret = err;
1390         return ret;
1391
1392 out_fail:
1393         redirty_page_for_writepage(wbc, page);
1394         unlock_page(page);
1395         return ret;
1396 }
1397
1398 static int ext3_journalled_writepage(struct page *page,
1399                                 struct writeback_control *wbc)
1400 {
1401         struct inode *inode = page->mapping->host;
1402         handle_t *handle = NULL;
1403         int ret = 0;
1404         int err;
1405
1406         if (ext3_journal_current_handle())
1407                 goto no_write;
1408
1409         handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
1410         if (IS_ERR(handle)) {
1411                 ret = PTR_ERR(handle);
1412                 goto no_write;
1413         }
1414
1415         if (!page_has_buffers(page) || PageChecked(page)) {
1416                 /*
1417                  * It's mmapped pagecache.  Add buffers and journal it.  There
1418                  * doesn't seem much point in redirtying the page here.
1419                  */
1420                 ClearPageChecked(page);
1421                 ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
1422                                         ext3_get_block);
1423                 if (ret != 0)
1424                         goto out_unlock;
1425                 ret = walk_page_buffers(handle, page_buffers(page), 0,
1426                         PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
1427
1428                 err = walk_page_buffers(handle, page_buffers(page), 0,
1429                                 PAGE_CACHE_SIZE, NULL, commit_write_fn);
1430                 if (ret == 0)
1431                         ret = err;
1432                 EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
1433                 unlock_page(page);
1434         } else {
1435                 /*
1436                  * It may be a page full of checkpoint-mode buffers.  We don't
1437                  * really know unless we go poke around in the buffer_heads.
1438                  * But block_write_full_page will do the right thing.
1439                  */
1440                 ret = block_write_full_page(page, ext3_get_block, wbc);
1441         }
1442         err = ext3_journal_stop(handle);
1443         if (!ret)
1444                 ret = err;
1445 out:
1446         return ret;
1447
1448 no_write:
1449         redirty_page_for_writepage(wbc, page);
1450 out_unlock:
1451         unlock_page(page);
1452         goto out;
1453 }
1454
1455 static int ext3_readpage(struct file *file, struct page *page)
1456 {
1457         return mpage_readpage(page, ext3_get_block);
1458 }
1459
1460 static int
1461 ext3_readpages(struct file *file, struct address_space *mapping,
1462                 struct list_head *pages, unsigned nr_pages)
1463 {
1464         return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1465 }
1466
1467 static int ext3_invalidatepage(struct page *page, unsigned long offset)
1468 {
1469         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1470
1471         /*
1472          * If it's a full truncate we just forget about the pending dirtying
1473          */
1474         if (offset == 0)
1475                 ClearPageChecked(page);
1476
1477         return journal_invalidatepage(journal, page, offset);
1478 }
1479
1480 static int ext3_releasepage(struct page *page, int wait)
1481 {
1482         journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1483
1484         WARN_ON(PageChecked(page));
1485         if (!page_has_buffers(page))
1486                 return 0;
1487         return journal_try_to_free_buffers(journal, page, wait);
1488 }
1489
1490 /*
1491  * If the O_DIRECT write will extend the file then add this inode to the
1492  * orphan list.  So recovery will truncate it back to the original size
1493  * if the machine crashes during the write.
1494  *
1495  * If the O_DIRECT write is intantiating holes inside i_size and the machine
1496  * crashes then stale disk data _may_ be exposed inside the file.
1497  */
1498 static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1499                         const struct iovec *iov, loff_t offset,
1500                         unsigned long nr_segs)
1501 {
1502         struct file *file = iocb->ki_filp;
1503         struct inode *inode = file->f_mapping->host;
1504         struct ext3_inode_info *ei = EXT3_I(inode);
1505         handle_t *handle = NULL;
1506         ssize_t ret;
1507         int orphan = 0;
1508         size_t count = iov_length(iov, nr_segs);
1509
1510         if (rw == WRITE) {
1511                 loff_t final_size = offset + count;
1512
1513                 handle = ext3_journal_start(inode, DIO_CREDITS);
1514                 if (IS_ERR(handle)) {
1515                         ret = PTR_ERR(handle);
1516                         goto out;
1517                 }
1518                 if (final_size > inode->i_size) {
1519                         ret = ext3_orphan_add(handle, inode);
1520                         if (ret)
1521                                 goto out_stop;
1522                         orphan = 1;
1523                         ei->i_disksize = inode->i_size;
1524                 }
1525         }
1526
1527         ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 
1528                                  offset, nr_segs,
1529                                  ext3_direct_io_get_blocks, NULL);
1530
1531         /*
1532          * Reacquire the handle: ext3_direct_io_get_block() can restart the
1533          * transaction
1534          */
1535         handle = journal_current_handle();
1536
1537 out_stop:
1538         if (handle) {
1539                 int err;
1540
1541                 if (orphan && inode->i_nlink)
1542                         ext3_orphan_del(handle, inode);
1543                 if (orphan && ret > 0) {
1544                         loff_t end = offset + ret;
1545                         if (end > inode->i_size) {
1546                                 ei->i_disksize = end;
1547                                 i_size_write(inode, end);
1548                                 /*
1549                                  * We're going to return a positive `ret'
1550                                  * here due to non-zero-length I/O, so there's
1551                                  * no way of reporting error returns from
1552                                  * ext3_mark_inode_dirty() to userspace.  So
1553                                  * ignore it.
1554                                  */
1555                                 ext3_mark_inode_dirty(handle, inode);
1556                         }
1557                 }
1558                 err = ext3_journal_stop(handle);
1559                 if (ret == 0)
1560                         ret = err;
1561         }
1562 out:
1563         return ret;
1564 }
1565
1566 /*
1567  * Pages can be marked dirty completely asynchronously from ext3's journalling
1568  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
1569  * much here because ->set_page_dirty is called under VFS locks.  The page is
1570  * not necessarily locked.
1571  *
1572  * We cannot just dirty the page and leave attached buffers clean, because the
1573  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
1574  * or jbddirty because all the journalling code will explode.
1575  *
1576  * So what we do is to mark the page "pending dirty" and next time writepage
1577  * is called, propagate that into the buffers appropriately.
1578  */
1579 static int ext3_journalled_set_page_dirty(struct page *page)
1580 {
1581         SetPageChecked(page);
1582         return __set_page_dirty_nobuffers(page);
1583 }
1584
1585 static struct address_space_operations ext3_ordered_aops = {
1586         .readpage       = ext3_readpage,
1587         .readpages      = ext3_readpages,
1588         .writepage      = ext3_ordered_writepage,
1589         .sync_page      = block_sync_page,
1590         .prepare_write  = ext3_prepare_write,
1591         .commit_write   = ext3_ordered_commit_write,
1592         .bmap           = ext3_bmap,
1593         .invalidatepage = ext3_invalidatepage,
1594         .releasepage    = ext3_releasepage,
1595         .direct_IO      = ext3_direct_IO,
1596 };
1597
1598 static struct address_space_operations ext3_writeback_aops = {
1599         .readpage       = ext3_readpage,
1600         .readpages      = ext3_readpages,
1601         .writepage      = ext3_writeback_writepage,
1602         .writepages     = ext3_writeback_writepages,
1603         .sync_page      = block_sync_page,
1604         .prepare_write  = ext3_prepare_write,
1605         .commit_write   = ext3_writeback_commit_write,
1606         .bmap           = ext3_bmap,
1607         .invalidatepage = ext3_invalidatepage,
1608         .releasepage    = ext3_releasepage,
1609         .direct_IO      = ext3_direct_IO,
1610 };
1611
1612 static struct address_space_operations ext3_journalled_aops = {
1613         .readpage       = ext3_readpage,
1614         .readpages      = ext3_readpages,
1615         .writepage      = ext3_journalled_writepage,
1616         .sync_page      = block_sync_page,
1617         .prepare_write  = ext3_prepare_write,
1618         .commit_write   = ext3_journalled_commit_write,
1619         .set_page_dirty = ext3_journalled_set_page_dirty,
1620         .bmap           = ext3_bmap,
1621         .invalidatepage = ext3_invalidatepage,
1622         .releasepage    = ext3_releasepage,
1623 };
1624
1625 void ext3_set_aops(struct inode *inode)
1626 {
1627         if (ext3_should_order_data(inode))
1628                 inode->i_mapping->a_ops = &ext3_ordered_aops;
1629         else if (ext3_should_writeback_data(inode))
1630                 inode->i_mapping->a_ops = &ext3_writeback_aops;
1631         else
1632                 inode->i_mapping->a_ops = &ext3_journalled_aops;
1633 }
1634
1635 /*
1636  * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1637  * up to the end of the block which corresponds to `from'.
1638  * This required during truncate. We need to physically zero the tail end
1639  * of that block so it doesn't yield old data if the file is later grown.
1640  */
1641 static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1642                 struct address_space *mapping, loff_t from)
1643 {
1644         unsigned long index = from >> PAGE_CACHE_SHIFT;
1645         unsigned offset = from & (PAGE_CACHE_SIZE-1);
1646         unsigned blocksize, iblock, length, pos;
1647         struct inode *inode = mapping->host;
1648         struct buffer_head *bh;
1649         int err = 0;
1650         void *kaddr;
1651
1652         blocksize = inode->i_sb->s_blocksize;
1653         length = blocksize - (offset & (blocksize - 1));
1654         iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1655
1656         /*
1657          * For "nobh" option,  we can only work if we don't need to
1658          * read-in the page - otherwise we create buffers to do the IO.
1659          */
1660         if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) {
1661                 if (PageUptodate(page)) {
1662                         kaddr = kmap_atomic(page, KM_USER0);
1663                         memset(kaddr + offset, 0, length);
1664                         flush_dcache_page(page);
1665                         kunmap_atomic(kaddr, KM_USER0);
1666                         set_page_dirty(page);
1667                         goto unlock;
1668                 }
1669         }
1670
1671         if (!page_has_buffers(page))
1672                 create_empty_buffers(page, blocksize, 0);
1673
1674         /* Find the buffer that contains "offset" */
1675         bh = page_buffers(page);
1676         pos = blocksize;
1677         while (offset >= pos) {
1678                 bh = bh->b_this_page;
1679                 iblock++;
1680                 pos += blocksize;
1681         }
1682
1683         err = 0;
1684         if (buffer_freed(bh)) {
1685                 BUFFER_TRACE(bh, "freed: skip");
1686                 goto unlock;
1687         }
1688
1689         if (!buffer_mapped(bh)) {
1690                 BUFFER_TRACE(bh, "unmapped");
1691                 ext3_get_block(inode, iblock, bh, 0);
1692                 /* unmapped? It's a hole - nothing to do */
1693                 if (!buffer_mapped(bh)) {
1694                         BUFFER_TRACE(bh, "still unmapped");
1695                         goto unlock;
1696                 }
1697         }
1698
1699         /* Ok, it's mapped. Make sure it's up-to-date */
1700         if (PageUptodate(page))
1701                 set_buffer_uptodate(bh);
1702
1703         if (!buffer_uptodate(bh)) {
1704                 err = -EIO;
1705                 ll_rw_block(READ, 1, &bh);
1706                 wait_on_buffer(bh);
1707                 /* Uhhuh. Read error. Complain and punt. */
1708                 if (!buffer_uptodate(bh))
1709                         goto unlock;
1710         }
1711
1712         if (ext3_should_journal_data(inode)) {
1713                 BUFFER_TRACE(bh, "get write access");
1714                 err = ext3_journal_get_write_access(handle, bh);
1715                 if (err)
1716                         goto unlock;
1717         }
1718
1719         kaddr = kmap_atomic(page, KM_USER0);
1720         memset(kaddr + offset, 0, length);
1721         flush_dcache_page(page);
1722         kunmap_atomic(kaddr, KM_USER0);
1723
1724         BUFFER_TRACE(bh, "zeroed end of block");
1725
1726         err = 0;
1727         if (ext3_should_journal_data(inode)) {
1728                 err = ext3_journal_dirty_metadata(handle, bh);
1729         } else {
1730                 if (ext3_should_order_data(inode))
1731                         err = ext3_journal_dirty_data(handle, bh);
1732                 mark_buffer_dirty(bh);
1733         }
1734
1735 unlock:
1736         unlock_page(page);
1737         page_cache_release(page);
1738         return err;
1739 }
1740
1741 /*
1742  * Probably it should be a library function... search for first non-zero word
1743  * or memcmp with zero_page, whatever is better for particular architecture.
1744  * Linus?
1745  */
1746 static inline int all_zeroes(__le32 *p, __le32 *q)
1747 {
1748         while (p < q)
1749                 if (*p++)
1750                         return 0;
1751         return 1;
1752 }
1753
1754 /**
1755  *      ext3_find_shared - find the indirect blocks for partial truncation.
1756  *      @inode:   inode in question
1757  *      @depth:   depth of the affected branch
1758  *      @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1759  *      @chain:   place to store the pointers to partial indirect blocks
1760  *      @top:     place to the (detached) top of branch
1761  *
1762  *      This is a helper function used by ext3_truncate().
1763  *
1764  *      When we do truncate() we may have to clean the ends of several
1765  *      indirect blocks but leave the blocks themselves alive. Block is
1766  *      partially truncated if some data below the new i_size is refered
1767  *      from it (and it is on the path to the first completely truncated
1768  *      data block, indeed).  We have to free the top of that path along
1769  *      with everything to the right of the path. Since no allocation
1770  *      past the truncation point is possible until ext3_truncate()
1771  *      finishes, we may safely do the latter, but top of branch may
1772  *      require special attention - pageout below the truncation point
1773  *      might try to populate it.
1774  *
1775  *      We atomically detach the top of branch from the tree, store the
1776  *      block number of its root in *@top, pointers to buffer_heads of
1777  *      partially truncated blocks - in @chain[].bh and pointers to
1778  *      their last elements that should not be removed - in
1779  *      @chain[].p. Return value is the pointer to last filled element
1780  *      of @chain.
1781  *
1782  *      The work left to caller to do the actual freeing of subtrees:
1783  *              a) free the subtree starting from *@top
1784  *              b) free the subtrees whose roots are stored in
1785  *                      (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1786  *              c) free the subtrees growing from the inode past the @chain[0].
1787  *                      (no partially truncated stuff there).  */
1788
1789 static Indirect *ext3_find_shared(struct inode *inode,
1790                                 int depth,
1791                                 int offsets[4],
1792                                 Indirect chain[4],
1793                                 __le32 *top)
1794 {
1795         Indirect *partial, *p;
1796         int k, err;
1797
1798         *top = 0;
1799         /* Make k index the deepest non-null offest + 1 */
1800         for (k = depth; k > 1 && !offsets[k-1]; k--)
1801                 ;
1802         partial = ext3_get_branch(inode, k, offsets, chain, &err);
1803         /* Writer: pointers */
1804         if (!partial)
1805                 partial = chain + k-1;
1806         /*
1807          * If the branch acquired continuation since we've looked at it -
1808          * fine, it should all survive and (new) top doesn't belong to us.
1809          */
1810         if (!partial->key && *partial->p)
1811                 /* Writer: end */
1812                 goto no_top;
1813         for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1814                 ;
1815         /*
1816          * OK, we've found the last block that must survive. The rest of our
1817          * branch should be detached before unlocking. However, if that rest
1818          * of branch is all ours and does not grow immediately from the inode
1819          * it's easier to cheat and just decrement partial->p.
1820          */
1821         if (p == chain + k - 1 && p > chain) {
1822                 p->p--;
1823         } else {
1824                 *top = *p->p;
1825                 /* Nope, don't do this in ext3.  Must leave the tree intact */
1826 #if 0
1827                 *p->p = 0;
1828 #endif
1829         }
1830         /* Writer: end */
1831
1832         while(partial > p)
1833         {
1834                 brelse(partial->bh);
1835                 partial--;
1836         }
1837 no_top:
1838         return partial;
1839 }
1840
1841 /*
1842  * Zero a number of block pointers in either an inode or an indirect block.
1843  * If we restart the transaction we must again get write access to the
1844  * indirect block for further modification.
1845  *
1846  * We release `count' blocks on disk, but (last - first) may be greater
1847  * than `count' because there can be holes in there.
1848  */
1849 static void
1850 ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1851                 unsigned long block_to_free, unsigned long count,
1852                 __le32 *first, __le32 *last)
1853 {
1854         __le32 *p;
1855         if (try_to_extend_transaction(handle, inode)) {
1856                 if (bh) {
1857                         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1858                         ext3_journal_dirty_metadata(handle, bh);
1859                 }
1860                 ext3_mark_inode_dirty(handle, inode);
1861                 ext3_journal_test_restart(handle, inode);
1862                 if (bh) {
1863                         BUFFER_TRACE(bh, "retaking write access");
1864                         ext3_journal_get_write_access(handle, bh);
1865                 }
1866         }
1867
1868         /*
1869          * Any buffers which are on the journal will be in memory. We find
1870          * them on the hash table so journal_revoke() will run journal_forget()
1871          * on them.  We've already detached each block from the file, so
1872          * bforget() in journal_forget() should be safe.
1873          *
1874          * AKPM: turn on bforget in journal_forget()!!!
1875          */
1876         for (p = first; p < last; p++) {
1877                 u32 nr = le32_to_cpu(*p);
1878                 if (nr) {
1879                         struct buffer_head *bh;
1880
1881                         *p = 0;
1882                         bh = sb_find_get_block(inode->i_sb, nr);
1883                         ext3_forget(handle, 0, inode, bh, nr);
1884                 }
1885         }
1886
1887         ext3_free_blocks(handle, inode, block_to_free, count);
1888 }
1889
1890 /**
1891  * ext3_free_data - free a list of data blocks
1892  * @handle:     handle for this transaction
1893  * @inode:      inode we are dealing with
1894  * @this_bh:    indirect buffer_head which contains *@first and *@last
1895  * @first:      array of block numbers
1896  * @last:       points immediately past the end of array
1897  *
1898  * We are freeing all blocks refered from that array (numbers are stored as
1899  * little-endian 32-bit) and updating @inode->i_blocks appropriately.
1900  *
1901  * We accumulate contiguous runs of blocks to free.  Conveniently, if these
1902  * blocks are contiguous then releasing them at one time will only affect one
1903  * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
1904  * actually use a lot of journal space.
1905  *
1906  * @this_bh will be %NULL if @first and @last point into the inode's direct
1907  * block pointers.
1908  */
1909 static void ext3_free_data(handle_t *handle, struct inode *inode,
1910                            struct buffer_head *this_bh,
1911                            __le32 *first, __le32 *last)
1912 {
1913         unsigned long block_to_free = 0;    /* Starting block # of a run */
1914         unsigned long count = 0;            /* Number of blocks in the run */ 
1915         __le32 *block_to_free_p = NULL;     /* Pointer into inode/ind
1916                                                corresponding to
1917                                                block_to_free */
1918         unsigned long nr;                   /* Current block # */
1919         __le32 *p;                          /* Pointer into inode/ind
1920                                                for current block */
1921         int err;
1922
1923         if (this_bh) {                          /* For indirect block */
1924                 BUFFER_TRACE(this_bh, "get_write_access");
1925                 err = ext3_journal_get_write_access(handle, this_bh);
1926                 /* Important: if we can't update the indirect pointers
1927                  * to the blocks, we can't free them. */
1928                 if (err)
1929                         return;
1930         }
1931
1932         for (p = first; p < last; p++) {
1933                 nr = le32_to_cpu(*p);
1934                 if (nr) {
1935                         /* accumulate blocks to free if they're contiguous */
1936                         if (count == 0) {
1937                                 block_to_free = nr;
1938                                 block_to_free_p = p;
1939                                 count = 1;
1940                         } else if (nr == block_to_free + count) {
1941                                 count++;
1942                         } else {
1943                                 ext3_clear_blocks(handle, inode, this_bh, 
1944                                                   block_to_free,
1945                                                   count, block_to_free_p, p);
1946                                 block_to_free = nr;
1947                                 block_to_free_p = p;
1948                                 count = 1;
1949                         }
1950                 }
1951         }
1952
1953         if (count > 0)
1954                 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
1955                                   count, block_to_free_p, p);
1956
1957         if (this_bh) {
1958                 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
1959                 ext3_journal_dirty_metadata(handle, this_bh);
1960         }
1961 }
1962
1963 /**
1964  *      ext3_free_branches - free an array of branches
1965  *      @handle: JBD handle for this transaction
1966  *      @inode: inode we are dealing with
1967  *      @parent_bh: the buffer_head which contains *@first and *@last
1968  *      @first: array of block numbers
1969  *      @last:  pointer immediately past the end of array
1970  *      @depth: depth of the branches to free
1971  *
1972  *      We are freeing all blocks refered from these branches (numbers are
1973  *      stored as little-endian 32-bit) and updating @inode->i_blocks
1974  *      appropriately.
1975  */
1976 static void ext3_free_branches(handle_t *handle, struct inode *inode,
1977                                struct buffer_head *parent_bh,
1978                                __le32 *first, __le32 *last, int depth)
1979 {
1980         unsigned long nr;
1981         __le32 *p;
1982
1983         if (is_handle_aborted(handle))
1984                 return;
1985
1986         if (depth--) {
1987                 struct buffer_head *bh;
1988                 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
1989                 p = last;
1990                 while (--p >= first) {
1991                         nr = le32_to_cpu(*p);
1992                         if (!nr)
1993                                 continue;               /* A hole */
1994
1995                         /* Go read the buffer for the next level down */
1996                         bh = sb_bread(inode->i_sb, nr);
1997
1998                         /*
1999                          * A read failure? Report error and clear slot
2000                          * (should be rare).
2001                          */
2002                         if (!bh) {
2003                                 ext3_error(inode->i_sb, "ext3_free_branches",
2004                                            "Read failure, inode=%ld, block=%ld",
2005                                            inode->i_ino, nr);
2006                                 continue;
2007                         }
2008
2009                         /* This zaps the entire block.  Bottom up. */
2010                         BUFFER_TRACE(bh, "free child branches");
2011                         ext3_free_branches(handle, inode, bh,
2012                                            (__le32*)bh->b_data,
2013                                            (__le32*)bh->b_data + addr_per_block,
2014                                            depth);
2015
2016                         /*
2017                          * We've probably journalled the indirect block several
2018                          * times during the truncate.  But it's no longer
2019                          * needed and we now drop it from the transaction via
2020                          * journal_revoke().
2021                          *
2022                          * That's easy if it's exclusively part of this
2023                          * transaction.  But if it's part of the committing
2024                          * transaction then journal_forget() will simply
2025                          * brelse() it.  That means that if the underlying
2026                          * block is reallocated in ext3_get_block(),
2027                          * unmap_underlying_metadata() will find this block
2028                          * and will try to get rid of it.  damn, damn.
2029                          *
2030                          * If this block has already been committed to the
2031                          * journal, a revoke record will be written.  And
2032                          * revoke records must be emitted *before* clearing
2033                          * this block's bit in the bitmaps.
2034                          */
2035                         ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2036
2037                         /*
2038                          * Everything below this this pointer has been
2039                          * released.  Now let this top-of-subtree go.
2040                          *
2041                          * We want the freeing of this indirect block to be
2042                          * atomic in the journal with the updating of the
2043                          * bitmap block which owns it.  So make some room in
2044                          * the journal.
2045                          *
2046                          * We zero the parent pointer *after* freeing its
2047                          * pointee in the bitmaps, so if extend_transaction()
2048                          * for some reason fails to put the bitmap changes and
2049                          * the release into the same transaction, recovery
2050                          * will merely complain about releasing a free block,
2051                          * rather than leaking blocks.
2052                          */
2053                         if (is_handle_aborted(handle))
2054                                 return;
2055                         if (try_to_extend_transaction(handle, inode)) {
2056                                 ext3_mark_inode_dirty(handle, inode);
2057                                 ext3_journal_test_restart(handle, inode);
2058                         }
2059
2060                         ext3_free_blocks(handle, inode, nr, 1);
2061
2062                         if (parent_bh) {
2063                                 /*
2064                                  * The block which we have just freed is
2065                                  * pointed to by an indirect block: journal it
2066                                  */
2067                                 BUFFER_TRACE(parent_bh, "get_write_access");
2068                                 if (!ext3_journal_get_write_access(handle,
2069                                                                    parent_bh)){
2070                                         *p = 0;
2071                                         BUFFER_TRACE(parent_bh,
2072                                         "call ext3_journal_dirty_metadata");
2073                                         ext3_journal_dirty_metadata(handle, 
2074                                                                     parent_bh);
2075                                 }
2076                         }
2077                 }
2078         } else {
2079                 /* We have reached the bottom of the tree. */
2080                 BUFFER_TRACE(parent_bh, "free data blocks");
2081                 ext3_free_data(handle, inode, parent_bh, first, last);
2082         }
2083 }
2084
2085 /*
2086  * ext3_truncate()
2087  *
2088  * We block out ext3_get_block() block instantiations across the entire
2089  * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2090  * simultaneously on behalf of the same inode.
2091  *
2092  * As we work through the truncate and commmit bits of it to the journal there
2093  * is one core, guiding principle: the file's tree must always be consistent on
2094  * disk.  We must be able to restart the truncate after a crash.
2095  *
2096  * The file's tree may be transiently inconsistent in memory (although it
2097  * probably isn't), but whenever we close off and commit a journal transaction,
2098  * the contents of (the filesystem + the journal) must be consistent and
2099  * restartable.  It's pretty simple, really: bottom up, right to left (although
2100  * left-to-right works OK too).
2101  *
2102  * Note that at recovery time, journal replay occurs *before* the restart of
2103  * truncate against the orphan inode list.
2104  *
2105  * The committed inode has the new, desired i_size (which is the same as
2106  * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see
2107  * that this inode's truncate did not complete and it will again call
2108  * ext3_truncate() to have another go.  So there will be instantiated blocks
2109  * to the right of the truncation point in a crashed ext3 filesystem.  But
2110  * that's fine - as long as they are linked from the inode, the post-crash
2111  * ext3_truncate() run will find them and release them.
2112  */
2113
2114 void ext3_truncate(struct inode * inode)
2115 {
2116         handle_t *handle;
2117         struct ext3_inode_info *ei = EXT3_I(inode);
2118         __le32 *i_data = ei->i_data;
2119         int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2120         struct address_space *mapping = inode->i_mapping;
2121         int offsets[4];
2122         Indirect chain[4];
2123         Indirect *partial;
2124         __le32 nr = 0;
2125         int n;
2126         long last_block;
2127         unsigned blocksize = inode->i_sb->s_blocksize;
2128         struct page *page;
2129
2130         if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2131             S_ISLNK(inode->i_mode)))
2132                 return;
2133         if (ext3_inode_is_fast_symlink(inode))
2134                 return;
2135         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2136                 return;
2137
2138         /*
2139          * We have to lock the EOF page here, because lock_page() nests
2140          * outside journal_start().
2141          */
2142         if ((inode->i_size & (blocksize - 1)) == 0) {
2143                 /* Block boundary? Nothing to do */
2144                 page = NULL;
2145         } else {
2146                 page = grab_cache_page(mapping,
2147                                 inode->i_size >> PAGE_CACHE_SHIFT);
2148                 if (!page)
2149                         return;
2150         }
2151
2152         handle = start_transaction(inode);
2153         if (IS_ERR(handle)) {
2154                 if (page) {
2155                         clear_highpage(page);
2156                         flush_dcache_page(page);
2157                         unlock_page(page);
2158                         page_cache_release(page);
2159                 }
2160                 return;         /* AKPM: return what? */
2161         }
2162
2163         last_block = (inode->i_size + blocksize-1)
2164                                         >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2165
2166         if (page)
2167                 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2168
2169         n = ext3_block_to_path(inode, last_block, offsets, NULL);
2170         if (n == 0)
2171                 goto out_stop;  /* error */
2172
2173         /*
2174          * OK.  This truncate is going to happen.  We add the inode to the
2175          * orphan list, so that if this truncate spans multiple transactions,
2176          * and we crash, we will resume the truncate when the filesystem
2177          * recovers.  It also marks the inode dirty, to catch the new size.
2178          *
2179          * Implication: the file must always be in a sane, consistent
2180          * truncatable state while each transaction commits.
2181          */
2182         if (ext3_orphan_add(handle, inode))
2183                 goto out_stop;
2184
2185         /*
2186          * The orphan list entry will now protect us from any crash which
2187          * occurs before the truncate completes, so it is now safe to propagate
2188          * the new, shorter inode size (held for now in i_size) into the
2189          * on-disk inode. We do this via i_disksize, which is the value which
2190          * ext3 *really* writes onto the disk inode.
2191          */
2192         ei->i_disksize = inode->i_size;
2193
2194         /*
2195          * From here we block out all ext3_get_block() callers who want to
2196          * modify the block allocation tree.
2197          */
2198         down(&ei->truncate_sem);
2199
2200         if (n == 1) {           /* direct blocks */
2201                 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2202                                i_data + EXT3_NDIR_BLOCKS);
2203                 goto do_indirects;
2204         }
2205
2206         partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2207         /* Kill the top of shared branch (not detached) */
2208         if (nr) {
2209                 if (partial == chain) {
2210                         /* Shared branch grows from the inode */
2211                         ext3_free_branches(handle, inode, NULL,
2212                                            &nr, &nr+1, (chain+n-1) - partial);
2213                         *partial->p = 0;
2214                         /*
2215                          * We mark the inode dirty prior to restart,
2216                          * and prior to stop.  No need for it here.
2217                          */
2218                 } else {
2219                         /* Shared branch grows from an indirect block */
2220                         BUFFER_TRACE(partial->bh, "get_write_access");
2221                         ext3_free_branches(handle, inode, partial->bh,
2222                                         partial->p,
2223                                         partial->p+1, (chain+n-1) - partial);
2224                 }
2225         }
2226         /* Clear the ends of indirect blocks on the shared branch */
2227         while (partial > chain) {
2228                 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2229                                    (__le32*)partial->bh->b_data+addr_per_block,
2230                                    (chain+n-1) - partial);
2231                 BUFFER_TRACE(partial->bh, "call brelse");
2232                 brelse (partial->bh);
2233                 partial--;
2234         }
2235 do_indirects:
2236         /* Kill the remaining (whole) subtrees */
2237         switch (offsets[0]) {
2238                 default:
2239                         nr = i_data[EXT3_IND_BLOCK];
2240                         if (nr) {
2241                                 ext3_free_branches(handle, inode, NULL,
2242                                                    &nr, &nr+1, 1);
2243                                 i_data[EXT3_IND_BLOCK] = 0;
2244                         }
2245                 case EXT3_IND_BLOCK:
2246                         nr = i_data[EXT3_DIND_BLOCK];
2247                         if (nr) {
2248                                 ext3_free_branches(handle, inode, NULL,
2249                                                    &nr, &nr+1, 2);
2250                                 i_data[EXT3_DIND_BLOCK] = 0;
2251                         }
2252                 case EXT3_DIND_BLOCK:
2253                         nr = i_data[EXT3_TIND_BLOCK];
2254                         if (nr) {
2255                                 ext3_free_branches(handle, inode, NULL,
2256                                                    &nr, &nr+1, 3);
2257                                 i_data[EXT3_TIND_BLOCK] = 0;
2258                         }
2259                 case EXT3_TIND_BLOCK:
2260                         ;
2261         }
2262
2263         ext3_discard_reservation(inode);
2264
2265         up(&ei->truncate_sem);
2266         inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2267         ext3_mark_inode_dirty(handle, inode);
2268
2269         /* In a multi-transaction truncate, we only make the final
2270          * transaction synchronous */
2271         if (IS_SYNC(inode))
2272                 handle->h_sync = 1;
2273 out_stop:
2274         /*
2275          * If this was a simple ftruncate(), and the file will remain alive
2276          * then we need to clear up the orphan record which we created above.
2277          * However, if this was a real unlink then we were called by
2278          * ext3_delete_inode(), and we allow that function to clean up the
2279          * orphan info for us.
2280          */
2281         if (inode->i_nlink)
2282                 ext3_orphan_del(handle, inode);
2283
2284         ext3_journal_stop(handle);
2285 }
2286
2287 static unsigned long ext3_get_inode_block(struct super_block *sb,
2288                 unsigned long ino, struct ext3_iloc *iloc)
2289 {
2290         unsigned long desc, group_desc, block_group;
2291         unsigned long offset, block;
2292         struct buffer_head *bh;
2293         struct ext3_group_desc * gdp;
2294
2295
2296         if ((ino != EXT3_ROOT_INO &&
2297                 ino != EXT3_JOURNAL_INO &&
2298                 ino != EXT3_RESIZE_INO &&
2299                 ino < EXT3_FIRST_INO(sb)) ||
2300                 ino > le32_to_cpu(
2301                         EXT3_SB(sb)->s_es->s_inodes_count)) {
2302                 ext3_error (sb, "ext3_get_inode_block",
2303                             "bad inode number: %lu", ino);
2304                 return 0;
2305         }
2306         block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2307         if (block_group >= EXT3_SB(sb)->s_groups_count) {
2308                 ext3_error (sb, "ext3_get_inode_block",
2309                             "group >= groups count");
2310                 return 0;
2311         }
2312         smp_rmb();
2313         group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2314         desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2315         bh = EXT3_SB(sb)->s_group_desc[group_desc];
2316         if (!bh) {
2317                 ext3_error (sb, "ext3_get_inode_block",
2318                             "Descriptor not loaded");
2319                 return 0;
2320         }
2321
2322         gdp = (struct ext3_group_desc *) bh->b_data;
2323         /*
2324          * Figure out the offset within the block group inode table
2325          */
2326         offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2327                 EXT3_INODE_SIZE(sb);
2328         block = le32_to_cpu(gdp[desc].bg_inode_table) +
2329                 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2330
2331         iloc->block_group = block_group;
2332         iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2333         return block;
2334 }
2335
2336 /*
2337  * ext3_get_inode_loc returns with an extra refcount against the inode's
2338  * underlying buffer_head on success. If 'in_mem' is true, we have all
2339  * data in memory that is needed to recreate the on-disk version of this
2340  * inode.
2341  */
2342 static int __ext3_get_inode_loc(struct inode *inode,
2343                                 struct ext3_iloc *iloc, int in_mem)
2344 {
2345         unsigned long block;
2346         struct buffer_head *bh;
2347
2348         block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2349         if (!block)
2350                 return -EIO;
2351
2352         bh = sb_getblk(inode->i_sb, block);
2353         if (!bh) {
2354                 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2355                                 "unable to read inode block - "
2356                                 "inode=%lu, block=%lu", inode->i_ino, block);
2357                 return -EIO;
2358         }
2359         if (!buffer_uptodate(bh)) {
2360                 lock_buffer(bh);
2361                 if (buffer_uptodate(bh)) {
2362                         /* someone brought it uptodate while we waited */
2363                         unlock_buffer(bh);
2364                         goto has_buffer;
2365                 }
2366
2367                 /*
2368                  * If we have all information of the inode in memory and this
2369                  * is the only valid inode in the block, we need not read the
2370                  * block.
2371                  */
2372                 if (in_mem) {
2373                         struct buffer_head *bitmap_bh;
2374                         struct ext3_group_desc *desc;
2375                         int inodes_per_buffer;
2376                         int inode_offset, i;
2377                         int block_group;
2378                         int start;
2379
2380                         block_group = (inode->i_ino - 1) /
2381                                         EXT3_INODES_PER_GROUP(inode->i_sb);
2382                         inodes_per_buffer = bh->b_size /
2383                                 EXT3_INODE_SIZE(inode->i_sb);
2384                         inode_offset = ((inode->i_ino - 1) %
2385                                         EXT3_INODES_PER_GROUP(inode->i_sb));
2386                         start = inode_offset & ~(inodes_per_buffer - 1);
2387
2388                         /* Is the inode bitmap in cache? */
2389                         desc = ext3_get_group_desc(inode->i_sb,
2390                                                 block_group, NULL);
2391                         if (!desc)
2392                                 goto make_io;
2393
2394                         bitmap_bh = sb_getblk(inode->i_sb,
2395                                         le32_to_cpu(desc->bg_inode_bitmap));
2396                         if (!bitmap_bh)
2397                                 goto make_io;
2398
2399                         /*
2400                          * If the inode bitmap isn't in cache then the
2401                          * optimisation may end up performing two reads instead
2402                          * of one, so skip it.
2403                          */
2404                         if (!buffer_uptodate(bitmap_bh)) {
2405                                 brelse(bitmap_bh);
2406                                 goto make_io;
2407                         }
2408                         for (i = start; i < start + inodes_per_buffer; i++) {
2409                                 if (i == inode_offset)
2410                                         continue;
2411                                 if (ext3_test_bit(i, bitmap_bh->b_data))
2412                                         break;
2413                         }
2414                         brelse(bitmap_bh);
2415                         if (i == start + inodes_per_buffer) {
2416                                 /* all other inodes are free, so skip I/O */
2417                                 memset(bh->b_data, 0, bh->b_size);
2418                                 set_buffer_uptodate(bh);
2419                                 unlock_buffer(bh);
2420                                 goto has_buffer;
2421                         }
2422                 }
2423
2424 make_io:
2425                 /*
2426                  * There are other valid inodes in the buffer, this inode
2427                  * has in-inode xattrs, or we don't have this inode in memory.
2428                  * Read the block from disk.
2429                  */
2430                 get_bh(bh);
2431                 bh->b_end_io = end_buffer_read_sync;
2432                 submit_bh(READ, bh);
2433                 wait_on_buffer(bh);
2434                 if (!buffer_uptodate(bh)) {
2435                         ext3_error(inode->i_sb, "ext3_get_inode_loc",
2436                                         "unable to read inode block - "
2437                                         "inode=%lu, block=%lu",
2438                                         inode->i_ino, block);
2439                         brelse(bh);
2440                         return -EIO;
2441                 }
2442         }
2443 has_buffer:
2444         iloc->bh = bh;
2445         return 0;
2446 }
2447
2448 int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2449 {
2450         /* We have all inode data except xattrs in memory here. */
2451         return __ext3_get_inode_loc(inode, iloc,
2452                 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2453 }
2454
2455 void ext3_set_inode_flags(struct inode *inode)
2456 {
2457         unsigned int flags = EXT3_I(inode)->i_flags;
2458
2459         inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2460         if (flags & EXT3_SYNC_FL)
2461                 inode->i_flags |= S_SYNC;
2462         if (flags & EXT3_APPEND_FL)
2463                 inode->i_flags |= S_APPEND;
2464         if (flags & EXT3_IMMUTABLE_FL)
2465                 inode->i_flags |= S_IMMUTABLE;
2466         if (flags & EXT3_NOATIME_FL)
2467                 inode->i_flags |= S_NOATIME;
2468         if (flags & EXT3_DIRSYNC_FL)
2469                 inode->i_flags |= S_DIRSYNC;
2470 }
2471
2472 void ext3_read_inode(struct inode * inode)
2473 {
2474         struct ext3_iloc iloc;
2475         struct ext3_inode *raw_inode;
2476         struct ext3_inode_info *ei = EXT3_I(inode);
2477         struct buffer_head *bh;
2478         int block;
2479
2480 #ifdef CONFIG_EXT3_FS_POSIX_ACL
2481         ei->i_acl = EXT3_ACL_NOT_CACHED;
2482         ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2483 #endif
2484         ei->i_block_alloc_info = NULL;
2485
2486         if (__ext3_get_inode_loc(inode, &iloc, 0))
2487                 goto bad_inode;
2488         bh = iloc.bh;
2489         raw_inode = ext3_raw_inode(&iloc);
2490         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2491         inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2492         inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2493         if(!(test_opt (inode->i_sb, NO_UID32))) {
2494                 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2495                 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2496         }
2497         inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2498         inode->i_size = le32_to_cpu(raw_inode->i_size);
2499         inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2500         inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2501         inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2502         inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2503
2504         ei->i_state = 0;
2505         ei->i_dir_start_lookup = 0;
2506         ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2507         /* We now have enough fields to check if the inode was active or not.
2508          * This is needed because nfsd might try to access dead inodes
2509          * the test is that same one that e2fsck uses
2510          * NeilBrown 1999oct15
2511          */
2512         if (inode->i_nlink == 0) {
2513                 if (inode->i_mode == 0 ||
2514                     !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2515                         /* this inode is deleted */
2516                         brelse (bh);
2517                         goto bad_inode;
2518                 }
2519                 /* The only unlinked inodes we let through here have
2520                  * valid i_mode and are being read by the orphan
2521                  * recovery code: that's fine, we're about to complete
2522                  * the process of deleting those. */
2523         }
2524         inode->i_blksize = PAGE_SIZE;   /* This is the optimal IO size
2525                                          * (for stat), not the fs block
2526                                          * size */  
2527         inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2528         ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2529 #ifdef EXT3_FRAGMENTS
2530         ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2531         ei->i_frag_no = raw_inode->i_frag;
2532         ei->i_frag_size = raw_inode->i_fsize;
2533 #endif
2534         ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2535         if (!S_ISREG(inode->i_mode)) {
2536                 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2537         } else {
2538                 inode->i_size |=
2539                         ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2540         }
2541         ei->i_disksize = inode->i_size;
2542         inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2543         ei->i_block_group = iloc.block_group;
2544         /*
2545          * NOTE! The in-memory inode i_data array is in little-endian order
2546          * even on big-endian machines: we do NOT byteswap the block numbers!
2547          */
2548         for (block = 0; block < EXT3_N_BLOCKS; block++)
2549                 ei->i_data[block] = raw_inode->i_block[block];
2550         INIT_LIST_HEAD(&ei->i_orphan);
2551
2552         if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2553             EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2554                 /*
2555                  * When mke2fs creates big inodes it does not zero out
2556                  * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2557                  * so ignore those first few inodes.
2558                  */
2559                 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2560                 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2561                     EXT3_INODE_SIZE(inode->i_sb))
2562                         goto bad_inode;
2563                 if (ei->i_extra_isize == 0) {
2564                         /* The extra space is currently unused. Use it. */
2565                         ei->i_extra_isize = sizeof(struct ext3_inode) -
2566                                             EXT3_GOOD_OLD_INODE_SIZE;
2567                 } else {
2568                         __le32 *magic = (void *)raw_inode +
2569                                         EXT3_GOOD_OLD_INODE_SIZE +
2570                                         ei->i_extra_isize;
2571                         if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2572                                  ei->i_state |= EXT3_STATE_XATTR;
2573                 }
2574         } else
2575                 ei->i_extra_isize = 0;
2576
2577         if (S_ISREG(inode->i_mode)) {
2578                 inode->i_op = &ext3_file_inode_operations;
2579                 inode->i_fop = &ext3_file_operations;
2580                 ext3_set_aops(inode);
2581         } else if (S_ISDIR(inode->i_mode)) {
2582                 inode->i_op = &ext3_dir_inode_operations;
2583                 inode->i_fop = &ext3_dir_operations;
2584         } else if (S_ISLNK(inode->i_mode)) {
2585                 if (ext3_inode_is_fast_symlink(inode))
2586                         inode->i_op = &ext3_fast_symlink_inode_operations;
2587                 else {
2588                         inode->i_op = &ext3_symlink_inode_operations;
2589                         ext3_set_aops(inode);
2590                 }
2591         } else {
2592                 inode->i_op = &ext3_special_inode_operations;
2593                 if (raw_inode->i_block[0])
2594                         init_special_inode(inode, inode->i_mode,
2595                            old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2596                 else 
2597                         init_special_inode(inode, inode->i_mode,
2598                            new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2599         }
2600         brelse (iloc.bh);
2601         ext3_set_inode_flags(inode);
2602         return;
2603
2604 bad_inode:
2605         make_bad_inode(inode);
2606         return;
2607 }
2608
2609 /*
2610  * Post the struct inode info into an on-disk inode location in the
2611  * buffer-cache.  This gobbles the caller's reference to the
2612  * buffer_head in the inode location struct.
2613  *
2614  * The caller must have write access to iloc->bh.
2615  */
2616 static int ext3_do_update_inode(handle_t *handle, 
2617                                 struct inode *inode, 
2618                                 struct ext3_iloc *iloc)
2619 {
2620         struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2621         struct ext3_inode_info *ei = EXT3_I(inode);
2622         struct buffer_head *bh = iloc->bh;
2623         int err = 0, rc, block;
2624
2625         /* For fields not not tracking in the in-memory inode,
2626          * initialise them to zero for new inodes. */
2627         if (ei->i_state & EXT3_STATE_NEW)
2628                 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2629
2630         raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2631         if(!(test_opt(inode->i_sb, NO_UID32))) {
2632                 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2633                 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2634 /*
2635  * Fix up interoperability with old kernels. Otherwise, old inodes get
2636  * re-used with the upper 16 bits of the uid/gid intact
2637  */
2638                 if(!ei->i_dtime) {
2639                         raw_inode->i_uid_high =
2640                                 cpu_to_le16(high_16_bits(inode->i_uid));
2641                         raw_inode->i_gid_high =
2642                                 cpu_to_le16(high_16_bits(inode->i_gid));
2643                 } else {
2644                         raw_inode->i_uid_high = 0;
2645                         raw_inode->i_gid_high = 0;
2646                 }
2647         } else {
2648                 raw_inode->i_uid_low =
2649                         cpu_to_le16(fs_high2lowuid(inode->i_uid));
2650                 raw_inode->i_gid_low =
2651                         cpu_to_le16(fs_high2lowgid(inode->i_gid));
2652                 raw_inode->i_uid_high = 0;
2653                 raw_inode->i_gid_high = 0;
2654         }
2655         raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2656         raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2657         raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2658         raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2659         raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2660         raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2661         raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2662         raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2663 #ifdef EXT3_FRAGMENTS
2664         raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2665         raw_inode->i_frag = ei->i_frag_no;
2666         raw_inode->i_fsize = ei->i_frag_size;
2667 #endif
2668         raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2669         if (!S_ISREG(inode->i_mode)) {
2670                 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2671         } else {
2672                 raw_inode->i_size_high =
2673                         cpu_to_le32(ei->i_disksize >> 32);
2674                 if (ei->i_disksize > 0x7fffffffULL) {
2675                         struct super_block *sb = inode->i_sb;
2676                         if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2677                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2678                             EXT3_SB(sb)->s_es->s_rev_level ==
2679                                         cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2680                                /* If this is the first large file
2681                                 * created, add a flag to the superblock.
2682                                 */
2683                                 err = ext3_journal_get_write_access(handle,
2684                                                 EXT3_SB(sb)->s_sbh);
2685                                 if (err)
2686                                         goto out_brelse;
2687                                 ext3_update_dynamic_rev(sb);
2688                                 EXT3_SET_RO_COMPAT_FEATURE(sb,
2689                                         EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2690                                 sb->s_dirt = 1;
2691                                 handle->h_sync = 1;
2692                                 err = ext3_journal_dirty_metadata(handle,
2693                                                 EXT3_SB(sb)->s_sbh);
2694                         }
2695                 }
2696         }
2697         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2698         if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2699                 if (old_valid_dev(inode->i_rdev)) {
2700                         raw_inode->i_block[0] =
2701                                 cpu_to_le32(old_encode_dev(inode->i_rdev));
2702                         raw_inode->i_block[1] = 0;
2703                 } else {
2704                         raw_inode->i_block[0] = 0;
2705                         raw_inode->i_block[1] =
2706                                 cpu_to_le32(new_encode_dev(inode->i_rdev));
2707                         raw_inode->i_block[2] = 0;
2708                 }
2709         } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2710                 raw_inode->i_block[block] = ei->i_data[block];
2711
2712         if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
2713                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2714
2715         BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2716         rc = ext3_journal_dirty_metadata(handle, bh);
2717         if (!err)
2718                 err = rc;
2719         ei->i_state &= ~EXT3_STATE_NEW;
2720
2721 out_brelse:
2722         brelse (bh);
2723         ext3_std_error(inode->i_sb, err);
2724         return err;
2725 }
2726
2727 /*
2728  * ext3_write_inode()
2729  *
2730  * We are called from a few places:
2731  *
2732  * - Within generic_file_write() for O_SYNC files.
2733  *   Here, there will be no transaction running. We wait for any running
2734  *   trasnaction to commit.
2735  *
2736  * - Within sys_sync(), kupdate and such.
2737  *   We wait on commit, if tol to.
2738  *
2739  * - Within prune_icache() (PF_MEMALLOC == true)
2740  *   Here we simply return.  We can't afford to block kswapd on the
2741  *   journal commit.
2742  *
2743  * In all cases it is actually safe for us to return without doing anything,
2744  * because the inode has been copied into a raw inode buffer in
2745  * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for
2746  * knfsd.
2747  *
2748  * Note that we are absolutely dependent upon all inode dirtiers doing the
2749  * right thing: they *must* call mark_inode_dirty() after dirtying info in
2750  * which we are interested.
2751  *
2752  * It would be a bug for them to not do this.  The code:
2753  *
2754  *      mark_inode_dirty(inode)
2755  *      stuff();
2756  *      inode->i_size = expr;
2757  *
2758  * is in error because a kswapd-driven write_inode() could occur while
2759  * `stuff()' is running, and the new i_size will be lost.  Plus the inode
2760  * will no longer be on the superblock's dirty inode list.
2761  */
2762 int ext3_write_inode(struct inode *inode, int wait)
2763 {
2764         if (current->flags & PF_MEMALLOC)
2765                 return 0;
2766
2767         if (ext3_journal_current_handle()) {
2768                 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2769                 dump_stack();
2770                 return -EIO;
2771         }
2772
2773         if (!wait)
2774                 return 0;
2775
2776         return ext3_force_commit(inode->i_sb);
2777 }
2778
2779 /*
2780  * ext3_setattr()
2781  *
2782  * Called from notify_change.
2783  *
2784  * We want to trap VFS attempts to truncate the file as soon as
2785  * possible.  In particular, we want to make sure that when the VFS
2786  * shrinks i_size, we put the inode on the orphan list and modify
2787  * i_disksize immediately, so that during the subsequent flushing of
2788  * dirty pages and freeing of disk blocks, we can guarantee that any
2789  * commit will leave the blocks being flushed in an unused state on
2790  * disk.  (On recovery, the inode will get truncated and the blocks will
2791  * be freed, so we have a strong guarantee that no future commit will
2792  * leave these blocks visible to the user.)  
2793  *
2794  * Called with inode->sem down.
2795  */
2796 int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2797 {
2798         struct inode *inode = dentry->d_inode;
2799         int error, rc = 0;
2800         const unsigned int ia_valid = attr->ia_valid;
2801
2802         error = inode_change_ok(inode, attr);
2803         if (error)
2804                 return error;
2805
2806         if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2807                 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2808                 handle_t *handle;
2809
2810                 /* (user+group)*(old+new) structure, inode write (sb,
2811                  * inode block, ? - but truncate inode update has it) */
2812                 handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
2813                 if (IS_ERR(handle)) {
2814                         error = PTR_ERR(handle);
2815                         goto err_out;
2816                 }
2817                 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2818                 if (error) {
2819                         ext3_journal_stop(handle);
2820                         return error;
2821                 }
2822                 /* Update corresponding info in inode so that everything is in
2823                  * one transaction */
2824                 if (attr->ia_valid & ATTR_UID)
2825                         inode->i_uid = attr->ia_uid;
2826                 if (attr->ia_valid & ATTR_GID)
2827                         inode->i_gid = attr->ia_gid;
2828                 error = ext3_mark_inode_dirty(handle, inode);
2829                 ext3_journal_stop(handle);
2830         }
2831
2832         if (S_ISREG(inode->i_mode) &&
2833             attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2834                 handle_t *handle;
2835
2836                 handle = ext3_journal_start(inode, 3);
2837                 if (IS_ERR(handle)) {
2838                         error = PTR_ERR(handle);
2839                         goto err_out;
2840                 }
2841
2842                 error = ext3_orphan_add(handle, inode);
2843                 EXT3_I(inode)->i_disksize = attr->ia_size;
2844                 rc = ext3_mark_inode_dirty(handle, inode);
2845                 if (!error)
2846                         error = rc;
2847                 ext3_journal_stop(handle);
2848         }
2849
2850         rc = inode_setattr(inode, attr);
2851
2852         /* If inode_setattr's call to ext3_truncate failed to get a
2853          * transaction handle at all, we need to clean up the in-core
2854          * orphan list manually. */
2855         if (inode->i_nlink)
2856                 ext3_orphan_del(NULL, inode);
2857
2858         if (!rc && (ia_valid & ATTR_MODE))
2859                 rc = ext3_acl_chmod(inode);
2860
2861 err_out:
2862         ext3_std_error(inode->i_sb, error);
2863         if (!error)
2864                 error = rc;
2865         return error;
2866 }
2867
2868
2869 /*
2870  * akpm: how many blocks doth make a writepage()?
2871  *
2872  * With N blocks per page, it may be:
2873  * N data blocks
2874  * 2 indirect block
2875  * 2 dindirect
2876  * 1 tindirect
2877  * N+5 bitmap blocks (from the above)
2878  * N+5 group descriptor summary blocks
2879  * 1 inode block
2880  * 1 superblock.
2881  * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
2882  *
2883  * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
2884  *
2885  * With ordered or writeback data it's the same, less the N data blocks.
2886  *
2887  * If the inode's direct blocks can hold an integral number of pages then a
2888  * page cannot straddle two indirect blocks, and we can only touch one indirect
2889  * and dindirect block, and the "5" above becomes "3".
2890  *
2891  * This still overestimates under most circumstances.  If we were to pass the
2892  * start and end offsets in here as well we could do block_to_path() on each
2893  * block and work out the exact number of indirects which are touched.  Pah.
2894  */
2895
2896 static int ext3_writepage_trans_blocks(struct inode *inode)
2897 {
2898         int bpp = ext3_journal_blocks_per_page(inode);
2899         int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
2900         int ret;
2901
2902         if (ext3_should_journal_data(inode))
2903                 ret = 3 * (bpp + indirects) + 2;
2904         else
2905                 ret = 2 * (bpp + indirects) + 2;
2906
2907 #ifdef CONFIG_QUOTA
2908         /* We know that structure was already allocated during DQUOT_INIT so
2909          * we will be updating only the data blocks + inodes */
2910         ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
2911 #endif
2912
2913         return ret;
2914 }
2915
2916 /*
2917  * The caller must have previously called ext3_reserve_inode_write().
2918  * Give this, we know that the caller already has write access to iloc->bh.
2919  */
2920 int ext3_mark_iloc_dirty(handle_t *handle,
2921                 struct inode *inode, struct ext3_iloc *iloc)
2922 {
2923         int err = 0;
2924
2925         /* the do_update_inode consumes one bh->b_count */
2926         get_bh(iloc->bh);
2927
2928         /* ext3_do_update_inode() does journal_dirty_metadata */
2929         err = ext3_do_update_inode(handle, inode, iloc);
2930         put_bh(iloc->bh);
2931         return err;
2932 }
2933
2934 /* 
2935  * On success, We end up with an outstanding reference count against
2936  * iloc->bh.  This _must_ be cleaned up later. 
2937  */
2938
2939 int
2940 ext3_reserve_inode_write(handle_t *handle, struct inode *inode, 
2941                          struct ext3_iloc *iloc)
2942 {
2943         int err = 0;
2944         if (handle) {
2945                 err = ext3_get_inode_loc(inode, iloc);
2946                 if (!err) {
2947                         BUFFER_TRACE(iloc->bh, "get_write_access");
2948                         err = ext3_journal_get_write_access(handle, iloc->bh);
2949                         if (err) {
2950                                 brelse(iloc->bh);
2951                                 iloc->bh = NULL;
2952                         }
2953                 }
2954         }
2955         ext3_std_error(inode->i_sb, err);
2956         return err;
2957 }
2958
2959 /*
2960  * akpm: What we do here is to mark the in-core inode as clean
2961  * with respect to inode dirtiness (it may still be data-dirty).
2962  * This means that the in-core inode may be reaped by prune_icache
2963  * without having to perform any I/O.  This is a very good thing,
2964  * because *any* task may call prune_icache - even ones which
2965  * have a transaction open against a different journal.
2966  *
2967  * Is this cheating?  Not really.  Sure, we haven't written the
2968  * inode out, but prune_icache isn't a user-visible syncing function.
2969  * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
2970  * we start and wait on commits.
2971  *
2972  * Is this efficient/effective?  Well, we're being nice to the system
2973  * by cleaning up our inodes proactively so they can be reaped
2974  * without I/O.  But we are potentially leaving up to five seconds'
2975  * worth of inodes floating about which prune_icache wants us to
2976  * write out.  One way to fix that would be to get prune_icache()
2977  * to do a write_super() to free up some memory.  It has the desired
2978  * effect.
2979  */
2980 int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
2981 {
2982         struct ext3_iloc iloc;
2983         int err;
2984
2985         might_sleep();
2986         err = ext3_reserve_inode_write(handle, inode, &iloc);
2987         if (!err)
2988                 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
2989         return err;
2990 }
2991
2992 /*
2993  * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
2994  *
2995  * We're really interested in the case where a file is being extended.
2996  * i_size has been changed by generic_commit_write() and we thus need
2997  * to include the updated inode in the current transaction.
2998  *
2999  * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3000  * are allocated to the file.
3001  *
3002  * If the inode is marked synchronous, we don't honour that here - doing
3003  * so would cause a commit on atime updates, which we don't bother doing.
3004  * We handle synchronous inodes at the highest possible level.
3005  */
3006 void ext3_dirty_inode(struct inode *inode)
3007 {
3008         handle_t *current_handle = ext3_journal_current_handle();
3009         handle_t *handle;
3010
3011         handle = ext3_journal_start(inode, 2);
3012         if (IS_ERR(handle))
3013                 goto out;
3014         if (current_handle &&
3015                 current_handle->h_transaction != handle->h_transaction) {
3016                 /* This task has a transaction open against a different fs */
3017                 printk(KERN_EMERG "%s: transactions do not match!\n",
3018                        __FUNCTION__);
3019         } else {
3020                 jbd_debug(5, "marking dirty.  outer handle=%p\n",
3021                                 current_handle);
3022                 ext3_mark_inode_dirty(handle, inode);
3023         }
3024         ext3_journal_stop(handle);
3025 out:
3026         return;
3027 }
3028
3029 #ifdef AKPM
3030 /* 
3031  * Bind an inode's backing buffer_head into this transaction, to prevent
3032  * it from being flushed to disk early.  Unlike
3033  * ext3_reserve_inode_write, this leaves behind no bh reference and
3034  * returns no iloc structure, so the caller needs to repeat the iloc
3035  * lookup to mark the inode dirty later.
3036  */
3037 static inline int
3038 ext3_pin_inode(handle_t *handle, struct inode *inode)
3039 {
3040         struct ext3_iloc iloc;
3041
3042         int err = 0;
3043         if (handle) {
3044                 err = ext3_get_inode_loc(inode, &iloc);
3045                 if (!err) {
3046                         BUFFER_TRACE(iloc.bh, "get_write_access");
3047                         err = journal_get_write_access(handle, iloc.bh);
3048                         if (!err)
3049                                 err = ext3_journal_dirty_metadata(handle, 
3050                                                                   iloc.bh);
3051                         brelse(iloc.bh);
3052                 }
3053         }
3054         ext3_std_error(inode->i_sb, err);
3055         return err;
3056 }
3057 #endif
3058
3059 int ext3_change_inode_journal_flag(struct inode *inode, int val)
3060 {
3061         journal_t *journal;
3062         handle_t *handle;
3063         int err;
3064
3065         /*
3066          * We have to be very careful here: changing a data block's
3067          * journaling status dynamically is dangerous.  If we write a
3068          * data block to the journal, change the status and then delete
3069          * that block, we risk forgetting to revoke the old log record
3070          * from the journal and so a subsequent replay can corrupt data.
3071          * So, first we make sure that the journal is empty and that
3072          * nobody is changing anything.
3073          */
3074
3075         journal = EXT3_JOURNAL(inode);
3076         if (is_journal_aborted(journal) || IS_RDONLY(inode))
3077                 return -EROFS;
3078
3079         journal_lock_updates(journal);
3080         journal_flush(journal);
3081
3082         /*
3083          * OK, there are no updates running now, and all cached data is
3084          * synced to disk.  We are now in a completely consistent state
3085          * which doesn't have anything in the journal, and we know that
3086          * no filesystem updates are running, so it is safe to modify
3087          * the inode's in-core data-journaling state flag now.
3088          */
3089
3090         if (val)
3091                 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3092         else
3093                 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3094         ext3_set_aops(inode);
3095
3096         journal_unlock_updates(journal);
3097
3098         /* Finally we can mark the inode as dirty. */
3099
3100         handle = ext3_journal_start(inode, 1);
3101         if (IS_ERR(handle))
3102                 return PTR_ERR(handle);
3103
3104         err = ext3_mark_inode_dirty(handle, inode);
3105         handle->h_sync = 1;
3106         ext3_journal_stop(handle);
3107         ext3_std_error(inode->i_sb, err);
3108
3109         return err;
3110 }