Merge branch 'devel' into next

[linux-2.6-omap-h63xx.git] / fs / ext4 / inode.c
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 2bef4f879e4b006d072b9bc8e79007a7ffb6c56b..8ca2763df091051fea3e02ae7bba35a1e82e21d9 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -39,6 +39,7 @@
  #include "ext4_jbd2.h"
  #include "xattr.h"
  #include "acl.h"
+#include "ext4_extents.h"
  
  static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
@@ -846,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
         struct ext4_inode_info *ei = EXT4_I(inode);
         int count = 0;
         ext4_fsblk_t first_block = 0;
+       loff_t disksize;
  
  
         J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -921,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
          * protect it if you're about to implement concurrent
          * ext4_get_block() -bzzz
         */
-       if (!err && extend_disksize && inode->i_size > ei->i_disksize)
-               ei->i_disksize = inode->i_size;
+       if (!err && extend_disksize) {
+               disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+               if (disksize > i_size_read(inode))
+                       disksize = i_size_read(inode);
+               if (disksize > ei->i_disksize)
+                       ei->i_disksize = disksize;
+       }
         if (err)
                 goto cleanup;
  
@@ -982,7 +989,7 @@ out:
   */
  int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                         unsigned long max_blocks, struct buffer_head *bh,
-                       int create, int extend_disksize)
+                       int create, int extend_disksize, int flag)
  {
         int retval;
  
@@ -1023,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
          * with create == 1 flag.
          */
         down_write((&EXT4_I(inode)->i_data_sem));
+
+       /*
+        * If the caller is on the delayed allocation writeout path,
+        * we have already reserved fs blocks for allocation;
+        * let the underlying get_block() function know so that it
+        * avoids double accounting.
+        */
+       if (flag)
+               EXT4_I(inode)->i_delalloc_reserved_flag = 1;
         /*
          * We need to check for EXT4 here because migrate
          * could have changed the inode type in between
@@ -1044,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                                                         ~EXT4_EXT_MIGRATE;
                 }
         }
+
+       if (flag) {
+               EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+               /*
+                * Update reserved blocks/metadata blocks
+                * after successful block allocation
+                * which were deferred till now
+                */
+               if ((retval > 0) && buffer_delay(bh))
+                       ext4_da_release_space(inode, retval, 0);
+       }
+
         up_write((&EXT4_I(inode)->i_data_sem));
         return retval;
  }
@@ -1069,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
         }
  
         ret = ext4_get_blocks_wrap(handle, inode, iblock,
-                                       max_blocks, bh_result, create, 0);
+                                       max_blocks, bh_result, create, 0, 0);
         if (ret > 0) {
                 bh_result->b_size = (ret << inode->i_blkbits);
                 ret = 0;
@@ -1095,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
         dummy.b_blocknr = -1000;
         buffer_trace_init(&dummy.b_history);
         err = ext4_get_blocks_wrap(handle, inode, block, 1,
-                                       &dummy, create, 1);
+                                       &dummy, create, 1, 0);
         /*
          * ext4_get_blocks_handle() returns number of blocks
          * mapped. 0 in case of a HOLE.
@@ -1409,6 +1437,122 @@ static int ext4_journalled_write_end(struct file *file,
  
         return ret ? ret : copied;
  }
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate @blocks for a non-extent (indirect-block mapped) file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+       int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+       int ind_blks, dind_blks, tind_blks;
+
+       /* number of new indirect blocks needed */
+       ind_blks = (blocks + icap - 1) / icap;
+
+       dind_blks = (ind_blks + icap - 1) / icap;
+
+       tind_blks = 1;
+
+       return ind_blks + dind_blks + tind_blks;
+}
+
+/*
+ * Calculate the number of metadata blocks that need to be reserved
+ * to allocate the given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+       if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+               return ext4_ext_calc_metadata_amount(inode, blocks);
+
+       return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       unsigned long md_needed, mdblocks, total = 0;
+
+       /*
+        * recalculate the amount of metadata blocks to reserve
+        * in order to allocate nrblocks
+        * worst case is one extent per block
+        */
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+       mdblocks = ext4_calc_metadata_amount(inode, total);
+       BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+       md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+       total = md_needed + nrblocks;
+
+       if (ext4_has_free_blocks(sbi, total) < total) {
+               spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+               return -ENOSPC;
+       }
+
+       /* reduce fs free blocks counter */
+       percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+       EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+       EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+       return 0;       /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       int total, mdb, mdb_free, release;
+
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       /* recalculate how many metadata blocks still need to be reserved */
+       total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+       mdb = ext4_calc_metadata_amount(inode, total);
+
+       /* figure out how many metablocks to release */
+       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+       mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+       /* Account for allocated meta_blocks */
+       mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+       release = to_free + mdb_free;
+
+       /* update fs free blocks counter for truncate case */
+       percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+       /* update per-inode reservations */
+       BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+       EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+       BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+       EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+       EXT4_I(inode)->i_allocated_meta_blocks = 0;
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
+static void ext4_da_page_release_reservation(struct page *page,
+                                               unsigned long offset)
+{
+       int to_release = 0;
+       struct buffer_head *head, *bh;
+       unsigned int curr_off = 0;
+
+       head = page_buffers(page);
+       bh = head;
+       do {
+               unsigned int next_off = curr_off + bh->b_size;
+
+               if ((offset <= curr_off) && (buffer_delay(bh))) {
+                       to_release++;
+                       clear_buffer_delay(bh);
+               }
+               curr_off = next_off;
+       } while ((bh = bh->b_this_page) != head);
+       ext4_da_release_space(page->mapping->host, 0, to_release);
+}
  
  /*
   * Delayed allocation stuff
@@ -1545,13 +1689,11 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                         do {
                                 if (cur_logical >= logical + blocks)
                                         break;
-
                                 if (buffer_delay(bh)) {
                                         bh->b_blocknr = pblock;
                                         clear_buffer_delay(bh);
-                               } else if (buffer_mapped(bh)) {
+                               } else if (buffer_mapped(bh))
                                         BUG_ON(bh->b_blocknr != pblock);
-                               }
  
                                 cur_logical++;
                                 pblock++;
@@ -1626,10 +1768,10 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd)
                 if (buffer_delay(lbh))
                         mpage_put_bnr_to_bhs(mpd, next, &new);
  
-                       /* go for the remaining blocks */
-                       next += new.b_size >> mpd->inode->i_blkbits;
-                       remain -= new.b_size;
-               }
+               /* go for the remaining blocks */
+               next += new.b_size >> mpd->inode->i_blkbits;
+               remain -= new.b_size;
+       }
  }
  
  #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
@@ -1829,14 +1971,18 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
          * preallocated blocks are unmapped but should treated
          * the same as allocated blocks.
          */
-       ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0);
-       if (ret == 0) {
-               /* the block isn't allocated yet, let's reserve space */
-               /* XXX: call reservation here */
+       ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+       if ((ret == 0) && !buffer_delay(bh_result)) {
+               /* the block isn't (pre)allocated yet, let's reserve space */
                 /*
                  * XXX: __block_prepare_write() unmaps passed block,
                  * is it OK?
                  */
+               ret = ext4_da_reserve_space(inode, 1);
+               if (ret)
+                       /* not enough space to reserve */
+                       return ret;
+
                 map_bh(bh_result, inode->i_sb, 0);
                 set_buffer_new(bh_result);
                 set_buffer_delay(bh_result);
@@ -1847,25 +1993,25 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
  
         return ret;
  }
-
+#define                EXT4_DELALLOC_RSVED     1
  static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
                                    struct buffer_head *bh_result, int create)
  {
-       int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+       int ret;
         unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
         loff_t disksize = EXT4_I(inode)->i_disksize;
         handle_t *handle = NULL;
  
-       if (create) {
-               handle = ext4_journal_start(inode, needed_blocks);
-               if (IS_ERR(handle)) {
-                       ret = PTR_ERR(handle);
-                       goto out;
-               }
+       handle = ext4_journal_current_handle();
+       if (!handle) {
+               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                  bh_result, 0, 0, 0);
+               BUG_ON(!ret);
+       } else {
+               ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                  bh_result, create, 0, EXT4_DELALLOC_RSVED);
         }
  
-       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-                                  bh_result, create, 0);
         if (ret > 0) {
                 bh_result->b_size = (ret << inode->i_blkbits);
  
@@ -1887,72 +2033,229 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
                         up_write(&EXT4_I(inode)->i_data_sem);
  
                         if (EXT4_I(inode)->i_disksize == disksize) {
-                               if (handle == NULL)
-                                       handle = ext4_journal_start(inode, 1);
-                               if (!IS_ERR(handle))
-                                       ext4_mark_inode_dirty(handle, inode);
+                               ret = ext4_mark_inode_dirty(handle, inode);
+                               return ret;
                         }
                 }
-
                 ret = 0;
         }
+       return ret;
+}
  
-out:
-       if (handle && !IS_ERR(handle))
-               ext4_journal_stop(handle);
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+       /*
+        * unmapped buffer is possible for holes.
+        * delay buffer is possible with delayed allocation
+        */
+       return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+                                  struct buffer_head *bh_result, int create)
+{
+       int ret = 0;
+       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
  
+       /*
+        * we don't want to do block allocation in writepage
+        * so call get_block_wrap with create = 0
+        */
+       ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+                                  bh_result, 0, 0, 0);
+       if (ret > 0) {
+               bh_result->b_size = (ret << inode->i_blkbits);
+               ret = 0;
+       }
         return ret;
  }
-/* FIXME!! only support data=writeback mode */
+
+/*
+ * get called via ext4_da_writepages after taking page lock (have journal handle)
+ * get called via journal_submit_inode_data_buffers (no journal handle)
+ * get called via shrink_page_list via pdflush (no journal handle)
+ * or grab_page_cache when doing write_begin (have journal handle)
+ */
  static int ext4_da_writepage(struct page *page,
                                 struct writeback_control *wbc)
  {
-       struct inode *inode = page->mapping->host;
-       handle_t *handle = NULL;
         int ret = 0;
-       int err;
+       loff_t size;
+       unsigned long len;
+       struct buffer_head *page_bufs;
+       struct inode *inode = page->mapping->host;
  
-       if (ext4_journal_current_handle())
-               goto out_fail;
+       size = i_size_read(inode);
+       if (page->index == size >> PAGE_CACHE_SHIFT)
+               len = size & ~PAGE_CACHE_MASK;
+       else
+               len = PAGE_CACHE_SIZE;
  
-       handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
-       if (IS_ERR(handle)) {
-               ret = PTR_ERR(handle);
-               goto out_fail;
+       if (page_has_buffers(page)) {
+               page_bufs = page_buffers(page);
+               if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay)) {
+                       /*
+                        * We don't want to do  block allocation
+                        * So redirty the page and return
+                        * We may reach here when we do a journal commit
+                        * via journal_submit_inode_data_buffers.
+                        * If we don't have mapping block we just ignore
+                        * them. We can also reach here via shrink_page_list
+                        */
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return 0;
+               }
+       } else {
+               /*
+                * The test for page_has_buffers() is subtle:
+                * We know the page is dirty but it lost buffers. That means
+                * that at some moment in time after write_begin()/write_end()
+                * has been called all buffers have been clean and thus they
+                * must have been written at least once. So they are all
+                * mapped and we can happily proceed with mapping them
+                * and writing the page.
+                *
+                * Try to initialize the buffer_heads and check whether
+                * all are mapped and non delay. We don't want to
+                * do block allocation here.
+                */
+               ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                               ext4_normal_get_block_write);
+               if (!ret) {
+                       page_bufs = page_buffers(page);
+                       /* check whether all are mapped and non delay */
+                       if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+                                               ext4_bh_unmapped_or_delay)) {
+                               redirty_page_for_writepage(wbc, page);
+                               unlock_page(page);
+                               return 0;
+                       }
+               } else {
+                       /*
+                        * We can't do block allocation here
+                        * so just redirty the page and unlock
+                        * and return
+                        */
+                       redirty_page_for_writepage(wbc, page);
+                       unlock_page(page);
+                       return 0;
+               }
         }
  
         if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-               ret = nobh_writepage(page, ext4_get_block, wbc);
+               ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
         else
-               ret = block_write_full_page(page, ext4_get_block, wbc);
+               ret = block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
  
-       if (!ret && inode->i_size > EXT4_I(inode)->i_disksize) {
-               EXT4_I(inode)->i_disksize = inode->i_size;
-               ext4_mark_inode_dirty(handle, inode);
-       }
-
-       err = ext4_journal_stop(handle);
-       if (!ret)
-               ret = err;
-       return ret;
-
-out_fail:
-       redirty_page_for_writepage(wbc, page);
-       unlock_page(page);
         return ret;
  }
  
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * TODO: calculate the max credits needed for extent-based
+ * files; currently the DIO credit estimate is based on the
+ * indirect-block mapping scheme.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES      DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS    DIO_CREDITS
+
  static int ext4_da_writepages(struct address_space *mapping,
                                 struct writeback_control *wbc)
  {
-       return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write);
+       struct inode *inode = mapping->host;
+       handle_t *handle = NULL;
+       int needed_blocks;
+       int ret = 0;
+       long to_write;
+       loff_t range_start = 0;
+
+       /*
+        * No pages to write? This is mainly a kludge to avoid starting
+        * a transaction for special inodes like journal inode on last iput()
+        * because that could violate lock ordering on umount
+        */
+       if (!mapping->nrpages)
+               return 0;
+
+       /*
+        * Estimate the worst-case credits needed to write out
+        * EXT4_MAX_WRITEBACK_PAGES pages
+        */
+       needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+       to_write = wbc->nr_to_write;
+       if (!wbc->range_cyclic) {
+               /*
+                * If range_cyclic is not set force range_cont
+                * and save the old range_start
+                */
+               wbc->range_cont = 1;
+               range_start =  wbc->range_start;
+       }
+
+       while (!ret && to_write) {
+               /* start a new transaction*/
+               handle = ext4_journal_start(inode, needed_blocks);
+               if (IS_ERR(handle)) {
+                       ret = PTR_ERR(handle);
+                       goto out_writepages;
+               }
+               if (ext4_should_order_data(inode)) {
+                       /*
+                        * With ordered mode we need to add
+                        * the inode to the journal handle
+                        * when we do block allocation.
+                        */
+                       ret = ext4_jbd2_file_inode(handle, inode);
+                       if (ret) {
+                               ext4_journal_stop(handle);
+                               goto out_writepages;
+                       }
+
+               }
+               /*
+                * limit the number of dirty pages written at a time
+                * so that they fit into the reserved transaction credits
+                */
+               if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+                       wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+               to_write -= wbc->nr_to_write;
+               ret = mpage_da_writepages(mapping, wbc,
+                                               ext4_da_get_block_write);
+               ext4_journal_stop(handle);
+               if (wbc->nr_to_write) {
+                       /*
+                        * Either no more writeout is needed,
+                        * or we requested a non-blocking writeout
+                        * and found the device congested
+                        */
+                       to_write += wbc->nr_to_write;
+                       break;
+               }
+               wbc->nr_to_write = to_write;
+       }
+
+out_writepages:
+       wbc->nr_to_write = to_write;
+       if (range_start)
+               wbc->range_start = range_start;
+       return ret;
  }
  
  static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                 loff_t pos, unsigned len, unsigned flags,
                                 struct page **pagep, void **fsdata)
  {
-       int ret;
+       int ret, retries = 0;
         struct page *page;
         pgoff_t index;
         unsigned from, to;
@@ -1963,6 +2266,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
         from = pos & (PAGE_CACHE_SIZE - 1);
         to = from + len;
  
+retry:
         /*
          * With delayed allocation, we don't log the i_disksize update
          * if there is delayed block allocation. But we still need
@@ -1988,13 +2292,33 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                 page_cache_release(page);
         }
  
+       if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+               goto retry;
  out:
         return ret;
  }
  
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+/*
+ * Check whether we should update i_disksize when writing to
+ * the end of the file without requiring block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+                                        unsigned long offset)
  {
-       return !buffer_mapped(bh) || buffer_delay(bh);
+       struct buffer_head *bh;
+       struct inode *inode = page->mapping->host;
+       unsigned int idx;
+       int i;
+
+       bh = page_buffers(page);
+       idx = offset >> inode->i_blkbits;
+
+       for (i=0; i < idx; i++)
+               bh = bh->b_this_page;
+
+       if (!buffer_mapped(bh) || (buffer_delay(bh)))
+               return 0;
+       return 1;
  }
  
  static int ext4_da_write_end(struct file *file,
@@ -2006,6 +2330,10 @@ static int ext4_da_write_end(struct file *file,
         int ret = 0, ret2;
         handle_t *handle = ext4_journal_current_handle();
         loff_t new_i_size;
+       unsigned long start, end;
+
+       start = pos & (PAGE_CACHE_SIZE - 1);
+       end = start + copied -1;
  
         /*
          * generic_write_end() will run mark_inode_dirty() if i_size
@@ -2014,18 +2342,23 @@ static int ext4_da_write_end(struct file *file,
          */
  
         new_i_size = pos + copied;
-       if (new_i_size > EXT4_I(inode)->i_disksize)
-               if (!walk_page_buffers(NULL, page_buffers(page),
-                                      0, len, NULL, ext4_bh_unmapped_or_delay)){
-                       /*
-                        * Updating i_disksize when extending file without
-                        * needing block allocation
-                        */
-                       if (ext4_should_order_data(inode))
-                               ret = ext4_jbd2_file_inode(handle, inode);
+       if (new_i_size > EXT4_I(inode)->i_disksize) {
+               if (ext4_da_should_update_i_disksize(page, end)) {
+                       down_write(&EXT4_I(inode)->i_data_sem);
+                       if (new_i_size > EXT4_I(inode)->i_disksize) {
+                               /*
+                                * Updating i_disksize when extending file
+                                * without needing block allocation
+                                */
+                               if (ext4_should_order_data(inode))
+                                       ret = ext4_jbd2_file_inode(handle,
+                                                                  inode);
  
-                       EXT4_I(inode)->i_disksize = new_i_size;
+                               EXT4_I(inode)->i_disksize = new_i_size;
+                       }
+                       up_write(&EXT4_I(inode)->i_data_sem);
                 }
+       }
         ret2 = generic_write_end(file, mapping, pos, len, copied,
                                                         page, fsdata);
         copied = ret2;
@@ -2040,9 +2373,6 @@ static int ext4_da_write_end(struct file *file,
  
  static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
  {
-       struct buffer_head *head, *bh;
-       unsigned int curr_off = 0;
-
         /*
          * Drop reserved blocks
          */
@@ -2050,21 +2380,7 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
         if (!page_has_buffers(page))
                 goto out;
  
-       head = page_buffers(page);
-       bh = head;
-       do {
-               unsigned int next_off = curr_off + bh->b_size;
-
-               /*
-                * is this block fully invalidated?
-                */
-               if (offset <= curr_off && buffer_delay(bh)) {
-                       clear_buffer_delay(bh);
-                       /* XXX: add real stuff here */
-               }
-               curr_off = next_off;
-               bh = bh->b_this_page;
-       } while (bh != head);
+       ext4_da_page_release_reservation(page, offset);
  
  out:
         ext4_invalidatepage(page, offset);
@@ -2206,12 +2522,14 @@ static int __ext4_normal_writepage(struct page *page,
         struct inode *inode = page->mapping->host;
  
         if (test_opt(inode->i_sb, NOBH))
-               return nobh_writepage(page, ext4_get_block, wbc);
+               return nobh_writepage(page,
+                                       ext4_normal_get_block_write, wbc);
         else
-               return block_write_full_page(page, ext4_get_block, wbc);
+               return block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
  }
  
-
  static int ext4_normal_writepage(struct page *page,
                                 struct writeback_control *wbc)
  {
@@ -2220,13 +2538,24 @@ static int ext4_normal_writepage(struct page *page,
         loff_t len;
  
         J_ASSERT(PageLocked(page));
-       J_ASSERT(page_has_buffers(page));
         if (page->index == size >> PAGE_CACHE_SHIFT)
                 len = size & ~PAGE_CACHE_MASK;
         else
                 len = PAGE_CACHE_SIZE;
-       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                ext4_bh_unmapped_or_delay));
+
+       if (page_has_buffers(page)) {
+               /* if page has buffers it should all be mapped
+                * and allocated. If there are not buffers attached
+                * to the page we know the page is dirty but it lost
+                * buffers. That means that at some moment in time
+                * after write_begin() / write_end() has been called
+                * all buffers have been clean and thus they must have been
+                * written at least once. So they are all mapped and we can
+                * happily proceed with mapping them and writing the page.
+                */
+               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay));
+       }
  
         if (!ext4_journal_current_handle())
                 return __ext4_normal_writepage(page, wbc);
@@ -2246,7 +2575,8 @@ static int __ext4_journalled_writepage(struct page *page,
         int ret = 0;
         int err;
  
-       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, ext4_get_block);
+       ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+                                       ext4_normal_get_block_write);
         if (ret != 0)
                 goto out_unlock;
  
@@ -2293,13 +2623,24 @@ static int ext4_journalled_writepage(struct page *page,
         loff_t len;
  
         J_ASSERT(PageLocked(page));
-       J_ASSERT(page_has_buffers(page));
         if (page->index == size >> PAGE_CACHE_SHIFT)
                 len = size & ~PAGE_CACHE_MASK;
         else
                 len = PAGE_CACHE_SIZE;
-       BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
-                                ext4_bh_unmapped_or_delay));
+
+       if (page_has_buffers(page)) {
+               /* if page has buffers it should all be mapped
+                * and allocated. If there are not buffers attached
+                * to the page we know the page is dirty but it lost
+                * buffers. That means that at some moment in time
+                * after write_begin() / write_end() has been called
+                * all buffers have been clean and thus they must have been
+                * written at least once. So they are all mapped and we can
+                * happily proceed with mapping them and writing the page.
+                */
+               BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+                                       ext4_bh_unmapped_or_delay));
+       }
  
         if (ext4_journal_current_handle())
                 goto no_write;
@@ -2317,7 +2658,9 @@ static int ext4_journalled_writepage(struct page *page,
                  * really know unless we go poke around in the buffer_heads.
                  * But block_write_full_page will do the right thing.
                  */
-               return block_write_full_page(page, ext4_get_block, wbc);
+               return block_write_full_page(page,
+                                               ext4_normal_get_block_write,
+                                               wbc);
         }
  no_write:
         redirty_page_for_writepage(wbc, page);
@@ -2520,7 +2863,10 @@ static const struct address_space_operations ext4_da_aops = {
  
  void ext4_set_aops(struct inode *inode)
  {
-       if (ext4_should_order_data(inode))
+       if (ext4_should_order_data(inode) &&
+               test_opt(inode->i_sb, DELALLOC))
+               inode->i_mapping->a_ops = &ext4_da_aops;
+       else if (ext4_should_order_data(inode))
                 inode->i_mapping->a_ops = &ext4_ordered_aops;
         else if (ext4_should_writeback_data(inode) &&
                  test_opt(inode->i_sb, DELALLOC))
@@ -3079,6 +3425,11 @@ void ext4_truncate(struct inode *inode)
         if (ext4_orphan_add(handle, inode))
                 goto out_stop;
  
+       /*
+        * From here we block out all ext4_get_block() callers who want to
+        * modify the block allocation tree.
+        */
+       down_write(&ei->i_data_sem);
         /*
          * The orphan list entry will now protect us from any crash which
          * occurs before the truncate completes, so it is now safe to propagate
@@ -3088,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
          */
         ei->i_disksize = inode->i_size;
  
-       /*
-        * From here we block out all ext4_get_block() callers who want to
-        * modify the block allocation tree.
-        */
-       down_write(&ei->i_data_sem);
-
         if (n == 1) {           /* direct blocks */
                 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
                                i_data + EXT4_NDIR_BLOCKS);
@@ -3886,6 +4231,32 @@ err_out:
         return error;
  }
  
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                struct kstat *stat)
+{
+       struct inode *inode;
+       unsigned long delalloc_blocks;
+
+       inode = dentry->d_inode;
+       generic_fillattr(inode, stat);
+
+       /*
+        * We can't update i_blocks if the block allocation is delayed
+        * otherwise in the case of system crash before the real block
+        * allocation is done, we will have i_blocks inconsistent with
+        * on-disk file blocks.
+        * We always keep i_blocks updated together with real
+        * allocation. But so as not to confuse the user, stat
+        * will return the blocks that include the delayed allocation
+        * blocks for this file.
+        */
+       spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+       delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+       spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+       stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+       return 0;
+}
  
  /*
   * How many blocks doth make a writepage()?