]> pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/jbd2/commit.c
Add SDTI device for OMAP3 and unify address definitions for OMAP1 and OMAP2.
[linux-2.6-omap-h63xx.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25
/*
 * I/O completion callback used for the temporary BJ_IO buffer_heads (it is
 * installed as ->b_end_io before the journal submits them).  It records the
 * outcome of the I/O in the buffer's uptodate flag and then unlocks the
 * buffer.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded, zero if it failed
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        /* Mirror the I/O result into the uptodate bit. */
        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        unlock_buffer(bh);
}
38
39 /*
40  * When an ext3-ordered file is truncated, it is possible that many pages are
41  * not sucessfully freed, because they are attached to a committing transaction.
42  * After the transaction commits, these pages are left on the LRU, with no
43  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
44  * by the VM, but their apparent absence upsets the VM accounting, and it makes
45  * the numbers in /proc/meminfo look odd.
46  *
47  * So here, we have a buffer which has just come off the forget list.  Look to
48  * see if we can strip all buffers from the backing page.
49  *
50  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
51  * caller provided us with a ref against the buffer, and we drop that here.
52  */
53 static void release_buffer_page(struct buffer_head *bh)
54 {
55         struct page *page;
56
57         if (buffer_dirty(bh))
58                 goto nope;
59         if (atomic_read(&bh->b_count) != 1)
60                 goto nope;
61         page = bh->b_page;
62         if (!page)
63                 goto nope;
64         if (page->mapping)
65                 goto nope;
66
67         /* OK, it's a truncated page */
68         if (TestSetPageLocked(page))
69                 goto nope;
70
71         page_cache_get(page);
72         __brelse(bh);
73         try_to_free_buffers(page);
74         unlock_page(page);
75         page_cache_release(page);
76         return;
77
78 nope:
79         __brelse(bh);
80 }
81
82 /*
83  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
84  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
85  * return 0.  j_list_lock is dropped in this case.
86  */
87 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
88 {
89         if (!jbd_trylock_bh_state(bh)) {
90                 spin_unlock(&journal->j_list_lock);
91                 schedule();
92                 return 0;
93         }
94         return 1;
95 }
96
97 /*
98  * Done it all: now submit the commit record.  We should have
99  * cleaned up our previous buffers by now, so if we are in abort
100  * mode we can now just skip the rest of the journal write
101  * entirely.
102  *
103  * Returns 1 if the journal needs to be aborted or 0 on success
104  */
105 static int journal_submit_commit_record(journal_t *journal,
106                                         transaction_t *commit_transaction,
107                                         struct buffer_head **cbh,
108                                         __u32 crc32_sum)
109 {
110         struct journal_head *descriptor;
111         struct commit_header *tmp;
112         struct buffer_head *bh;
113         int ret;
114         int barrier_done = 0;
115
116         if (is_journal_aborted(journal))
117                 return 0;
118
119         descriptor = jbd2_journal_get_descriptor_buffer(journal);
120         if (!descriptor)
121                 return 1;
122
123         bh = jh2bh(descriptor);
124
125         tmp = (struct commit_header *)bh->b_data;
126         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
127         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
128         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
129
130         if (JBD2_HAS_COMPAT_FEATURE(journal,
131                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
132                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
133                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
134                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
135         }
136
137         JBUFFER_TRACE(descriptor, "submit commit block");
138         lock_buffer(bh);
139         get_bh(bh);
140         set_buffer_dirty(bh);
141         set_buffer_uptodate(bh);
142         bh->b_end_io = journal_end_buffer_io_sync;
143
144         if (journal->j_flags & JBD2_BARRIER &&
145                 !JBD2_HAS_INCOMPAT_FEATURE(journal,
146                                          JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
147                 set_buffer_ordered(bh);
148                 barrier_done = 1;
149         }
150         ret = submit_bh(WRITE, bh);
151         if (barrier_done)
152                 clear_buffer_ordered(bh);
153
154         /* is it possible for another commit to fail at roughly
155          * the same time as this one?  If so, we don't want to
156          * trust the barrier flag in the super, but instead want
157          * to remember if we sent a barrier request
158          */
159         if (ret == -EOPNOTSUPP && barrier_done) {
160                 char b[BDEVNAME_SIZE];
161
162                 printk(KERN_WARNING
163                         "JBD: barrier-based sync failed on %s - "
164                         "disabling barriers\n",
165                         bdevname(journal->j_dev, b));
166                 spin_lock(&journal->j_state_lock);
167                 journal->j_flags &= ~JBD2_BARRIER;
168                 spin_unlock(&journal->j_state_lock);
169
170                 /* And try again, without the barrier */
171                 set_buffer_uptodate(bh);
172                 set_buffer_dirty(bh);
173                 ret = submit_bh(WRITE, bh);
174         }
175         *cbh = bh;
176         return ret;
177 }
178
179 /*
180  * This function along with journal_submit_commit_record
181  * allows to write the commit record asynchronously.
182  */
183 static int journal_wait_on_commit_record(struct buffer_head *bh)
184 {
185         int ret = 0;
186
187         clear_buffer_dirty(bh);
188         wait_on_buffer(bh);
189
190         if (unlikely(!buffer_uptodate(bh)))
191                 ret = -EIO;
192         put_bh(bh);            /* One for getblk() */
193         jbd2_journal_put_journal_head(bh2jh(bh));
194
195         return ret;
196 }
197
198 /*
199  * Wait for all submitted IO to complete.
200  */
201 static int journal_wait_on_locked_list(journal_t *journal,
202                                        transaction_t *commit_transaction)
203 {
204         int ret = 0;
205         struct journal_head *jh;
206
207         while (commit_transaction->t_locked_list) {
208                 struct buffer_head *bh;
209
210                 jh = commit_transaction->t_locked_list->b_tprev;
211                 bh = jh2bh(jh);
212                 get_bh(bh);
213                 if (buffer_locked(bh)) {
214                         spin_unlock(&journal->j_list_lock);
215                         wait_on_buffer(bh);
216                         if (unlikely(!buffer_uptodate(bh)))
217                                 ret = -EIO;
218                         spin_lock(&journal->j_list_lock);
219                 }
220                 if (!inverted_lock(journal, bh)) {
221                         put_bh(bh);
222                         spin_lock(&journal->j_list_lock);
223                         continue;
224                 }
225                 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
226                         __jbd2_journal_unfile_buffer(jh);
227                         jbd_unlock_bh_state(bh);
228                         jbd2_journal_remove_journal_head(bh);
229                         put_bh(bh);
230                 } else {
231                         jbd_unlock_bh_state(bh);
232                 }
233                 put_bh(bh);
234                 cond_resched_lock(&journal->j_list_lock);
235         }
236         return ret;
237   }
238
239 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
240 {
241         int i;
242
243         for (i = 0; i < bufs; i++) {
244                 wbuf[i]->b_end_io = end_buffer_write_sync;
245                 /* We use-up our safety reference in submit_bh() */
246                 submit_bh(WRITE, wbuf[i]);
247         }
248 }
249
250 /*
251  *  Submit all the data buffers to disk
     *
     *  Walks commit_transaction->t_sync_datalist until it is empty.  For each
     *  buffer on the list one of three things happens:
     *    - it was dirty and we hold its buffer lock: the dirty bit is cleared,
     *      the buffer is queued in journal->j_wbuf and refiled as BJ_Locked;
     *    - it was not dirty when examined but is locked by someone else: it is
     *      refiled as BJ_Locked without being queued;
     *    - otherwise it is unfiled from the transaction and its journal_head
     *      is removed.
     *  Queued buffers are handed to journal_do_submit_data() when the array
     *  fills up, before blocking on a buffer lock, and once more at the end.
     *
     *  Takes and releases journal->j_list_lock itself; it is not held on
     *  return.
252  */
253 static void journal_submit_data_buffers(journal_t *journal,
254                                 transaction_t *commit_transaction)
255 {
256         struct journal_head *jh;
257         struct buffer_head *bh;
258         int locked;
259         int bufs = 0;
260         struct buffer_head **wbuf = journal->j_wbuf;
261
262         /*
263          * Whenever we unlock the journal and sleep, things can get added
264          * onto ->t_sync_datalist, so we have to keep looping back to
265          * write_out_data until we *know* that the list is empty.
266          *
267          * Cleanup any flushed data buffers from the data list.  Even in
268          * abort mode, we want to flush this out as soon as possible.
269          */
270 write_out_data:
271         cond_resched();
272         spin_lock(&journal->j_list_lock);
273
274         while (commit_transaction->t_sync_datalist) {
275                 jh = commit_transaction->t_sync_datalist;
276                 bh = jh2bh(jh);
                    /* locked != 0 means this function holds the buffer lock */
277                 locked = 0;
278
279                 /* Get reference just to make sure buffer does not disappear
280                  * when we are forced to drop various locks */
281                 get_bh(bh);
282                 /* If the buffer is dirty, we need to submit IO and hence
283                  * we need the buffer lock. We try to lock the buffer without
284                  * blocking. If we fail, we need to drop j_list_lock and do
285                  * blocking lock_buffer().
286                  */
287                 if (buffer_dirty(bh)) {
288                         if (test_set_buffer_locked(bh)) {
289                                 BUFFER_TRACE(bh, "needs blocking lock");
290                                 spin_unlock(&journal->j_list_lock);
291                                 /* Write out all data to prevent deadlocks */
292                                 journal_do_submit_data(wbuf, bufs);
293                                 bufs = 0;
294                                 lock_buffer(bh);
295                                 spin_lock(&journal->j_list_lock);
296                         }
297                         locked = 1;
298                 }
299                 /* We have to get bh_state lock. Again out of order, sigh. */
300                 if (!inverted_lock(journal, bh)) {
                            /* inverted_lock() dropped j_list_lock when it
                             * failed, so both locks can now be taken in the
                             * blocking order: bh_state first, then the list
                             * lock. */
301                         jbd_lock_bh_state(bh);
302                         spin_lock(&journal->j_list_lock);
303                 }
304                 /* Someone already cleaned up the buffer? */
                    /* (locks were dropped above, so the buffer may have left
                     * this transaction's sync-data list in the meantime) */
305                 if (!buffer_jbd(bh)
306                         || jh->b_transaction != commit_transaction
307                         || jh->b_jlist != BJ_SyncData) {
308                         jbd_unlock_bh_state(bh);
309                         if (locked)
310                                 unlock_buffer(bh);
311                         BUFFER_TRACE(bh, "already cleaned up");
312                         put_bh(bh);
313                         continue;
314                 }
315                 if (locked && test_clear_buffer_dirty(bh)) {
                            /* Still dirty and we hold the buffer lock: queue
                             * it.  Neither the buffer lock nor the reference
                             * from get_bh() is released here; both stay with
                             * the buffer for the submission. */
316                         BUFFER_TRACE(bh, "needs writeout, adding to array");
317                         wbuf[bufs++] = bh;
318                         __jbd2_journal_file_buffer(jh, commit_transaction,
319                                                 BJ_Locked);
320                         jbd_unlock_bh_state(bh);
321                         if (bufs == journal->j_wbufsize) {
                                    /* Array full: submit the batch and restart
                                     * the scan (write_out_data retakes
                                     * j_list_lock). */
322                                 spin_unlock(&journal->j_list_lock);
323                                 journal_do_submit_data(wbuf, bufs);
324                                 bufs = 0;
325                                 goto write_out_data;
326                         }
327                 } else if (!locked && buffer_locked(bh)) {
                            /* We did not lock it, but somebody else holds the
                             * buffer lock (presumably IO already in flight --
                             * TODO confirm): move it to BJ_Locked and drop
                             * our safety reference. */
328                         __jbd2_journal_file_buffer(jh, commit_transaction,
329                                                 BJ_Locked);
330                         jbd_unlock_bh_state(bh);
331                         put_bh(bh);
332                 } else {
                            /* Nothing left to write for this buffer: take it
                             * off the transaction altogether. */
333                         BUFFER_TRACE(bh, "writeout complete: unfile");
334                         __jbd2_journal_unfile_buffer(jh);
335                         jbd_unlock_bh_state(bh);
336                         if (locked)
337                                 unlock_buffer(bh);
338                         jbd2_journal_remove_journal_head(bh);
339                         /* Once for our safety reference, once for
340                          * jbd2_journal_remove_journal_head() */
341                         put_bh(bh);
342                         put_bh(bh);
343                 }
344
                    /* Be nice to other tasks and lock waiters: drop the list
                     * lock and restart from the list head.  bufs is not reset
                     * here, so already-queued buffers stay in wbuf. */
345                 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
346                         spin_unlock(&journal->j_list_lock);
347                         goto write_out_data;
348                 }
349         }
350         spin_unlock(&journal->j_list_lock);
            /* Submit whatever is still queued from the final pass. */
351         journal_do_submit_data(wbuf, bufs);
352 }
353
354 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
355 {
356         struct page *page = bh->b_page;
357         char *addr;
358         __u32 checksum;
359
360         addr = kmap_atomic(page, KM_USER0);
361         checksum = crc32_be(crc32_sum,
362                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
363         kunmap_atomic(addr, KM_USER0);
364
365         return checksum;
366 }
367
368 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
369                                    unsigned long long block)
370 {
371         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
372         if (tag_bytes > JBD2_TAG_SIZE32)
373                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
374 }
375
376 /*
377  * jbd2_journal_commit_transaction
378  *
379  * The primary function for committing a transaction to the log.  This
380  * function is called by the journal thread to begin a complete commit.
381  */
382 void jbd2_journal_commit_transaction(journal_t *journal)
383 {
384         struct transaction_stats_s stats;
385         transaction_t *commit_transaction;
386         struct journal_head *jh, *new_jh, *descriptor;
387         struct buffer_head **wbuf = journal->j_wbuf;
388         int bufs;
389         int flags;
390         int err;
391         unsigned long long blocknr;
392         char *tagp = NULL;
393         journal_header_t *header;
394         journal_block_tag_t *tag = NULL;
395         int space_left = 0;
396         int first_tag = 0;
397         int tag_flag;
398         int i;
399         int tag_bytes = journal_tag_bytes(journal);
400         struct buffer_head *cbh = NULL; /* For transactional checksums */
401         __u32 crc32_sum = ~0;
402
403         /*
404          * First job: lock down the current transaction and wait for
405          * all outstanding updates to complete.
406          */
407
408 #ifdef COMMIT_STATS
409         spin_lock(&journal->j_list_lock);
410         summarise_journal_usage(journal);
411         spin_unlock(&journal->j_list_lock);
412 #endif
413
414         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
415         if (journal->j_flags & JBD2_FLUSHED) {
416                 jbd_debug(3, "super block updated\n");
417                 jbd2_journal_update_superblock(journal, 1);
418         } else {
419                 jbd_debug(3, "superblock not updated\n");
420         }
421
422         J_ASSERT(journal->j_running_transaction != NULL);
423         J_ASSERT(journal->j_committing_transaction == NULL);
424
425         commit_transaction = journal->j_running_transaction;
426         J_ASSERT(commit_transaction->t_state == T_RUNNING);
427
428         jbd_debug(1, "JBD: starting commit of transaction %d\n",
429                         commit_transaction->t_tid);
430
431         spin_lock(&journal->j_state_lock);
432         commit_transaction->t_state = T_LOCKED;
433
434         stats.u.run.rs_wait = commit_transaction->t_max_wait;
435         stats.u.run.rs_locked = jiffies;
436         stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
437                                                 stats.u.run.rs_locked);
438
439         spin_lock(&commit_transaction->t_handle_lock);
440         while (commit_transaction->t_updates) {
441                 DEFINE_WAIT(wait);
442
443                 prepare_to_wait(&journal->j_wait_updates, &wait,
444                                         TASK_UNINTERRUPTIBLE);
445                 if (commit_transaction->t_updates) {
446                         spin_unlock(&commit_transaction->t_handle_lock);
447                         spin_unlock(&journal->j_state_lock);
448                         schedule();
449                         spin_lock(&journal->j_state_lock);
450                         spin_lock(&commit_transaction->t_handle_lock);
451                 }
452                 finish_wait(&journal->j_wait_updates, &wait);
453         }
454         spin_unlock(&commit_transaction->t_handle_lock);
455
456         J_ASSERT (commit_transaction->t_outstanding_credits <=
457                         journal->j_max_transaction_buffers);
458
459         /*
460          * First thing we are allowed to do is to discard any remaining
461          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
462          * that there are no such buffers: if a large filesystem
463          * operation like a truncate needs to split itself over multiple
464          * transactions, then it may try to do a jbd2_journal_restart() while
465          * there are still BJ_Reserved buffers outstanding.  These must
466          * be released cleanly from the current transaction.
467          *
468          * In this case, the filesystem must still reserve write access
469          * again before modifying the buffer in the new transaction, but
470          * we do not require it to remember exactly which old buffers it
471          * has reserved.  This is consistent with the existing behaviour
472          * that multiple jbd2_journal_get_write_access() calls to the same
473          * buffer are perfectly permissible.
474          */
475         while (commit_transaction->t_reserved_list) {
476                 jh = commit_transaction->t_reserved_list;
477                 JBUFFER_TRACE(jh, "reserved, unused: refile");
478                 /*
479                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
480                  * leave undo-committed data.
481                  */
482                 if (jh->b_committed_data) {
483                         struct buffer_head *bh = jh2bh(jh);
484
485                         jbd_lock_bh_state(bh);
486                         jbd2_free(jh->b_committed_data, bh->b_size);
487                         jh->b_committed_data = NULL;
488                         jbd_unlock_bh_state(bh);
489                 }
490                 jbd2_journal_refile_buffer(journal, jh);
491         }
492
493         /*
494          * Now try to drop any written-back buffers from the journal's
495          * checkpoint lists.  We do this *before* commit because it potentially
496          * frees some memory
497          */
498         spin_lock(&journal->j_list_lock);
499         __jbd2_journal_clean_checkpoint_list(journal);
500         spin_unlock(&journal->j_list_lock);
501
502         jbd_debug (3, "JBD: commit phase 1\n");
503
504         /*
505          * Switch to a new revoke table.
506          */
507         jbd2_journal_switch_revoke_table(journal);
508
509         stats.u.run.rs_flushing = jiffies;
510         stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
511                                                stats.u.run.rs_flushing);
512
513         commit_transaction->t_state = T_FLUSH;
514         journal->j_committing_transaction = commit_transaction;
515         journal->j_running_transaction = NULL;
516         commit_transaction->t_log_start = journal->j_head;
517         wake_up(&journal->j_wait_transaction_locked);
518         spin_unlock(&journal->j_state_lock);
519
520         jbd_debug (3, "JBD: commit phase 2\n");
521
522         /*
523          * Now start flushing things to disk, in the order they appear
524          * on the transaction lists.  Data blocks go first.
525          */
526         err = 0;
527         journal_submit_data_buffers(journal, commit_transaction);
528
529         /*
530          * Wait for all previously submitted IO to complete if commit
531          * record is to be written synchronously.
532          */
533         spin_lock(&journal->j_list_lock);
534         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
535                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
536                 err = journal_wait_on_locked_list(journal,
537                                                 commit_transaction);
538
539         spin_unlock(&journal->j_list_lock);
540
541         if (err)
542                 jbd2_journal_abort(journal, err);
543
544         jbd2_journal_write_revoke_records(journal, commit_transaction);
545
546         jbd_debug(3, "JBD: commit phase 2\n");
547
548         /*
549          * If we found any dirty or locked buffers, then we should have
550          * looped back up to the write_out_data label.  If there weren't
551          * any then journal_clean_data_list should have wiped the list
552          * clean by now, so check that it is in fact empty.
553          */
554         J_ASSERT (commit_transaction->t_sync_datalist == NULL);
555
556         jbd_debug (3, "JBD: commit phase 3\n");
557
558         /*
559          * Way to go: we have now written out all of the data for a
560          * transaction!  Now comes the tricky part: we need to write out
561          * metadata.  Loop over the transaction's entire buffer list:
562          */
563         spin_lock(&journal->j_state_lock);
564         commit_transaction->t_state = T_COMMIT;
565         spin_unlock(&journal->j_state_lock);
566
567         stats.u.run.rs_logging = jiffies;
568         stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
569                                                  stats.u.run.rs_logging);
570         stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
571         stats.u.run.rs_blocks_logged = 0;
572
573         J_ASSERT(commit_transaction->t_nr_buffers <=
574                  commit_transaction->t_outstanding_credits);
575
576         descriptor = NULL;
577         bufs = 0;
578         while (commit_transaction->t_buffers) {
579
580                 /* Find the next buffer to be journaled... */
581
582                 jh = commit_transaction->t_buffers;
583
584                 /* If we're in abort mode, we just un-journal the buffer and
585                    release it for background writing. */
586
587                 if (is_journal_aborted(journal)) {
588                         JBUFFER_TRACE(jh, "journal is aborting: refile");
589                         jbd2_journal_refile_buffer(journal, jh);
590                         /* If that was the last one, we need to clean up
591                          * any descriptor buffers which may have been
592                          * already allocated, even if we are now
593                          * aborting. */
594                         if (!commit_transaction->t_buffers)
595                                 goto start_journal_io;
596                         continue;
597                 }
598
599                 /* Make sure we have a descriptor block in which to
600                    record the metadata buffer. */
601
602                 if (!descriptor) {
603                         struct buffer_head *bh;
604
605                         J_ASSERT (bufs == 0);
606
607                         jbd_debug(4, "JBD: get descriptor\n");
608
609                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
610                         if (!descriptor) {
611                                 jbd2_journal_abort(journal, -EIO);
612                                 continue;
613                         }
614
615                         bh = jh2bh(descriptor);
616                         jbd_debug(4, "JBD: got buffer %llu (%p)\n",
617                                 (unsigned long long)bh->b_blocknr, bh->b_data);
618                         header = (journal_header_t *)&bh->b_data[0];
619                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
620                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
621                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
622
623                         tagp = &bh->b_data[sizeof(journal_header_t)];
624                         space_left = bh->b_size - sizeof(journal_header_t);
625                         first_tag = 1;
626                         set_buffer_jwrite(bh);
627                         set_buffer_dirty(bh);
628                         wbuf[bufs++] = bh;
629
630                         /* Record it so that we can wait for IO
631                            completion later */
632                         BUFFER_TRACE(bh, "ph3: file as descriptor");
633                         jbd2_journal_file_buffer(descriptor, commit_transaction,
634                                         BJ_LogCtl);
635                 }
636
637                 /* Where is the buffer to be written? */
638
639                 err = jbd2_journal_next_log_block(journal, &blocknr);
640                 /* If the block mapping failed, just abandon the buffer
641                    and repeat this loop: we'll fall into the
642                    refile-on-abort condition above. */
643                 if (err) {
644                         jbd2_journal_abort(journal, err);
645                         continue;
646                 }
647
648                 /*
649                  * start_this_handle() uses t_outstanding_credits to determine
650                  * the free space in the log, but this counter is changed
651                  * by jbd2_journal_next_log_block() also.
652                  */
653                 commit_transaction->t_outstanding_credits--;
654
655                 /* Bump b_count to prevent truncate from stumbling over
656                    the shadowed buffer!  @@@ This can go if we ever get
657                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
658                 atomic_inc(&jh2bh(jh)->b_count);
659
660                 /* Make a temporary IO buffer with which to write it out
661                    (this will requeue both the metadata buffer and the
662                    temporary IO buffer). new_bh goes on BJ_IO*/
663
664                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
665                 /*
666                  * akpm: jbd2_journal_write_metadata_buffer() sets
667                  * new_bh->b_transaction to commit_transaction.
668                  * We need to clean this up before we release new_bh
669                  * (which is of type BJ_IO)
670                  */
671                 JBUFFER_TRACE(jh, "ph3: write metadata");
672                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
673                                                       jh, &new_jh, blocknr);
674                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
675                 wbuf[bufs++] = jh2bh(new_jh);
676
677                 /* Record the new block's tag in the current descriptor
678                    buffer */
679
680                 tag_flag = 0;
681                 if (flags & 1)
682                         tag_flag |= JBD2_FLAG_ESCAPE;
683                 if (!first_tag)
684                         tag_flag |= JBD2_FLAG_SAME_UUID;
685
686                 tag = (journal_block_tag_t *) tagp;
687                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
688                 tag->t_flags = cpu_to_be32(tag_flag);
689                 tagp += tag_bytes;
690                 space_left -= tag_bytes;
691
692                 if (first_tag) {
693                         memcpy (tagp, journal->j_uuid, 16);
694                         tagp += 16;
695                         space_left -= 16;
696                         first_tag = 0;
697                 }
698
699                 /* If there's no more to do, or if the descriptor is full,
700                    let the IO rip! */
701
702                 if (bufs == journal->j_wbufsize ||
703                     commit_transaction->t_buffers == NULL ||
704                     space_left < tag_bytes + 16) {
705
706                         jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
707
708                         /* Write an end-of-descriptor marker before
709                            submitting the IOs.  "tag" still points to
710                            the last tag we set up. */
711
712                         tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
713
714 start_journal_io:
715                         for (i = 0; i < bufs; i++) {
716                                 struct buffer_head *bh = wbuf[i];
717                                 /*
718                                  * Compute checksum.
719                                  */
720                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
721                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
722                                         crc32_sum =
723                                             jbd2_checksum_data(crc32_sum, bh);
724                                 }
725
726                                 lock_buffer(bh);
727                                 clear_buffer_dirty(bh);
728                                 set_buffer_uptodate(bh);
729                                 bh->b_end_io = journal_end_buffer_io_sync;
730                                 submit_bh(WRITE, bh);
731                         }
732                         cond_resched();
733                         stats.u.run.rs_blocks_logged += bufs;
734
735                         /* Force a new descriptor to be generated next
736                            time round the loop. */
737                         descriptor = NULL;
738                         bufs = 0;
739                 }
740         }
741
742         /* Done it all: now write the commit record asynchronously. */
743
744         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
745                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
746                 err = journal_submit_commit_record(journal, commit_transaction,
747                                                  &cbh, crc32_sum);
748                 if (err)
749                         __jbd2_journal_abort_hard(journal);
750
751                 spin_lock(&journal->j_list_lock);
752                 err = journal_wait_on_locked_list(journal,
753                                                 commit_transaction);
754                 spin_unlock(&journal->j_list_lock);
755                 if (err)
756                         __jbd2_journal_abort_hard(journal);
757         }
758
759         /* Lo and behold: we have just managed to send a transaction to
760            the log.  Before we can commit it, wait for the IO so far to
761            complete.  Control buffers being written are on the
762            transaction's t_log_list queue, and metadata buffers are on
763            the t_iobuf_list queue.
764
765            Wait for the buffers in reverse order.  That way we are
766            less likely to be woken up until all IOs have completed, and
767            so we incur less scheduling load.
768         */
769
770         jbd_debug(3, "JBD: commit phase 4\n");
771
772         /*
773          * akpm: these are BJ_IO, and j_list_lock is not needed.
774          * See __journal_try_to_free_buffer.
775          */
776 wait_for_iobuf:
777         while (commit_transaction->t_iobuf_list != NULL) {
778                 struct buffer_head *bh;
779
780                 jh = commit_transaction->t_iobuf_list->b_tprev;
781                 bh = jh2bh(jh);
782                 if (buffer_locked(bh)) {
783                         wait_on_buffer(bh);
784                         goto wait_for_iobuf;
785                 }
786                 if (cond_resched())
787                         goto wait_for_iobuf;
788
789                 if (unlikely(!buffer_uptodate(bh)))
790                         err = -EIO;
791
792                 clear_buffer_jwrite(bh);
793
794                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
795                 jbd2_journal_unfile_buffer(journal, jh);
796
797                 /*
798                  * ->t_iobuf_list should contain only dummy buffer_heads
799                  * which were created by jbd2_journal_write_metadata_buffer().
800                  */
801                 BUFFER_TRACE(bh, "dumping temporary bh");
802                 jbd2_journal_put_journal_head(jh);
803                 __brelse(bh);
804                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
805                 free_buffer_head(bh);
806
807                 /* We also have to unlock and free the corresponding
808                    shadowed buffer */
809                 jh = commit_transaction->t_shadow_list->b_tprev;
810                 bh = jh2bh(jh);
811                 clear_bit(BH_JWrite, &bh->b_state);
812                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
813
814                 /* The metadata is now released for reuse, but we need
815                    to remember it against this transaction so that when
816                    we finally commit, we can do any checkpointing
817                    required. */
818                 JBUFFER_TRACE(jh, "file as BJ_Forget");
819                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
820                 /* Wake up any transactions which were waiting for this
821                    IO to complete */
822                 wake_up_bit(&bh->b_state, BH_Unshadow);
823                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
824                 __brelse(bh);
825         }
826
827         J_ASSERT (commit_transaction->t_shadow_list == NULL);
828
829         jbd_debug(3, "JBD: commit phase 5\n");
830
831         /* Here we wait for the revoke record and descriptor record buffers */
832  wait_for_ctlbuf:
833         while (commit_transaction->t_log_list != NULL) {
834                 struct buffer_head *bh;
835
836                 jh = commit_transaction->t_log_list->b_tprev;
837                 bh = jh2bh(jh);
838                 if (buffer_locked(bh)) {
839                         wait_on_buffer(bh);
840                         goto wait_for_ctlbuf;
841                 }
842                 if (cond_resched())
843                         goto wait_for_ctlbuf;
844
845                 if (unlikely(!buffer_uptodate(bh)))
846                         err = -EIO;
847
848                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
849                 clear_buffer_jwrite(bh);
850                 jbd2_journal_unfile_buffer(journal, jh);
851                 jbd2_journal_put_journal_head(jh);
852                 __brelse(bh);           /* One for getblk */
853                 /* AKPM: bforget here */
854         }
855
856         jbd_debug(3, "JBD: commit phase 6\n");
857
858         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
859                 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
860                 err = journal_submit_commit_record(journal, commit_transaction,
861                                                 &cbh, crc32_sum);
862                 if (err)
863                         __jbd2_journal_abort_hard(journal);
864         }
865         if (!err && !is_journal_aborted(journal))
866                 err = journal_wait_on_commit_record(cbh);
867
868         if (err)
869                 jbd2_journal_abort(journal, err);
870
871         /* End of a transaction!  Finally, we can do checkpoint
872            processing: any buffers committed as a result of this
873            transaction can be removed from any checkpoint list it was on
874            before. */
875
876         jbd_debug(3, "JBD: commit phase 7\n");
877
878         J_ASSERT(commit_transaction->t_sync_datalist == NULL);
879         J_ASSERT(commit_transaction->t_buffers == NULL);
880         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
881         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
882         J_ASSERT(commit_transaction->t_shadow_list == NULL);
883         J_ASSERT(commit_transaction->t_log_list == NULL);
884
885 restart_loop:
886         /*
887          * As there are other places (journal_unmap_buffer()) adding buffers
888          * to this list we have to be careful and hold the j_list_lock.
889          */
890         spin_lock(&journal->j_list_lock);
891         while (commit_transaction->t_forget) {
892                 transaction_t *cp_transaction;
893                 struct buffer_head *bh;
894
895                 jh = commit_transaction->t_forget;
896                 spin_unlock(&journal->j_list_lock);
897                 bh = jh2bh(jh);
898                 jbd_lock_bh_state(bh);
899                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
900                         jh->b_transaction == journal->j_running_transaction);
901
902                 /*
903                  * If there is undo-protected committed data against
904                  * this buffer, then we can remove it now.  If it is a
905                  * buffer needing such protection, the old frozen_data
906                  * field now points to a committed version of the
907                  * buffer, so rotate that field to the new committed
908                  * data.
909                  *
910                  * Otherwise, we can just throw away the frozen data now.
911                  */
912                 if (jh->b_committed_data) {
913                         jbd2_free(jh->b_committed_data, bh->b_size);
914                         jh->b_committed_data = NULL;
915                         if (jh->b_frozen_data) {
916                                 jh->b_committed_data = jh->b_frozen_data;
917                                 jh->b_frozen_data = NULL;
918                         }
919                 } else if (jh->b_frozen_data) {
920                         jbd2_free(jh->b_frozen_data, bh->b_size);
921                         jh->b_frozen_data = NULL;
922                 }
923
924                 spin_lock(&journal->j_list_lock);
925                 cp_transaction = jh->b_cp_transaction;
926                 if (cp_transaction) {
927                         JBUFFER_TRACE(jh, "remove from old cp transaction");
928                         cp_transaction->t_chp_stats.cs_dropped++;
929                         __jbd2_journal_remove_checkpoint(jh);
930                 }
931
932                 /* Only re-checkpoint the buffer_head if it is marked
933                  * dirty.  If the buffer was added to the BJ_Forget list
934                  * by jbd2_journal_forget, it may no longer be dirty and
935                  * there's no point in keeping a checkpoint record for
936                  * it. */
937
938                 /* A buffer which has been freed while still being
939                  * journaled by a previous transaction may end up still
940                  * being dirty here, but we want to avoid writing back
941                  * that buffer in the future now that the last use has
942                  * been committed.  That's not only a performance gain,
943                  * it also stops aliasing problems if the buffer is left
944                  * behind for writeback and gets reallocated for another
945                  * use in a different page. */
946                 if (buffer_freed(bh)) {
947                         clear_buffer_freed(bh);
948                         clear_buffer_jbddirty(bh);
949                 }
950
951                 if (buffer_jbddirty(bh)) {
952                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
953                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
954                         JBUFFER_TRACE(jh, "refile for checkpoint writeback");
955                         __jbd2_journal_refile_buffer(jh);
956                         jbd_unlock_bh_state(bh);
957                 } else {
958                         J_ASSERT_BH(bh, !buffer_dirty(bh));
959                         /* A buffer that is on the BJ_Forget list and is not
960                          * jbddirty has been freed by this transaction, and
961                          * hence it could not have been reallocated until this
962                          * transaction has committed. *BUT* it could be
963                          * reallocated once we have written all the data to
964                          * disk and before we process the buffer on the
965                          * BJ_Forget list. */
966                         JBUFFER_TRACE(jh, "refile or unfile freed buffer");
967                         __jbd2_journal_refile_buffer(jh);
968                         if (!jh->b_transaction) {
969                                 jbd_unlock_bh_state(bh);
970                                  /* needs a brelse */
971                                 jbd2_journal_remove_journal_head(bh);
972                                 release_buffer_page(bh);
973                         } else
974                                 jbd_unlock_bh_state(bh);
975                 }
976                 cond_resched_lock(&journal->j_list_lock);
977         }
978         spin_unlock(&journal->j_list_lock);
979         /*
980          * This is a bit sleazy.  We use j_list_lock to protect transition
981          * of a transaction into T_FINISHED state and calling
982          * __jbd2_journal_drop_transaction(). Otherwise we could race with
983          * other checkpointing code processing the transaction...
984          */
985         spin_lock(&journal->j_state_lock);
986         spin_lock(&journal->j_list_lock);
987         /*
988          * Now recheck whether any buffers got attached to the transaction's
989          * forget list while the lock was dropped...
990          */
991         if (commit_transaction->t_forget) {
992                 spin_unlock(&journal->j_list_lock);
993                 spin_unlock(&journal->j_state_lock);
994                 goto restart_loop;
995         }
996
997         /* Done with this transaction! */
998
999         jbd_debug(3, "JBD: commit phase 8\n");
1000
1001         J_ASSERT(commit_transaction->t_state == T_COMMIT);
1002
1003         commit_transaction->t_start = jiffies;
1004         stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
1005                                                 commit_transaction->t_start);
1006
1007         /*
1008          * File the transaction for history
1009          */
1010         stats.ts_type = JBD2_STATS_RUN;
1011         stats.ts_tid = commit_transaction->t_tid;
1012         stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
1013         spin_lock(&journal->j_history_lock);
1014         memcpy(journal->j_history + journal->j_history_cur, &stats,
1015                         sizeof(stats));
1016         if (++journal->j_history_cur == journal->j_history_max)
1017                 journal->j_history_cur = 0;
1018
1019         /*
1020          * Calculate overall stats
1021          */
1022         journal->j_stats.ts_tid++;
1023         journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1024         journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1025         journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1026         journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1027         journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1028         journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1029         journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1030         journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1031         spin_unlock(&journal->j_history_lock);
1032
1033         commit_transaction->t_state = T_FINISHED;
1034         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1035         journal->j_commit_sequence = commit_transaction->t_tid;
1036         journal->j_committing_transaction = NULL;
1037         spin_unlock(&journal->j_state_lock);
1038
1039         if (commit_transaction->t_checkpoint_list == NULL &&
1040             commit_transaction->t_checkpoint_io_list == NULL) {
1041                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1042         } else {
1043                 if (journal->j_checkpoint_transactions == NULL) {
1044                         journal->j_checkpoint_transactions = commit_transaction;
1045                         commit_transaction->t_cpnext = commit_transaction;
1046                         commit_transaction->t_cpprev = commit_transaction;
1047                 } else {
1048                         commit_transaction->t_cpnext =
1049                                 journal->j_checkpoint_transactions;
1050                         commit_transaction->t_cpprev =
1051                                 commit_transaction->t_cpnext->t_cpprev;
1052                         commit_transaction->t_cpnext->t_cpprev =
1053                                 commit_transaction;
1054                         commit_transaction->t_cpprev->t_cpnext =
1055                                 commit_transaction;
1056                 }
1057         }
1058         spin_unlock(&journal->j_list_lock);
1059
1060         jbd_debug(1, "JBD: commit %d complete, head %d\n",
1061                   journal->j_commit_sequence, journal->j_tail_sequence);
1062
1063         wake_up(&journal->j_wait_done_commit);
1064 }