X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=fs%2Fext4%2Finode.c;h=a271290651446f8143918f2ac2765295e8ad3187;hb=a02908f1;hp=8ca2763df091051fea3e02ae7bba35a1e82e21d9;hpb=dc7c65db2845a8d17432d89252c4227a9a7cb15f;p=linux-2.6-omap-h63xx.git diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ca2763df09..a2712906514 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -191,6 +191,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) void ext4_delete_inode (struct inode * inode) { handle_t *handle; + int err; if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); @@ -199,8 +200,9 @@ void ext4_delete_inode (struct inode * inode) if (is_bad_inode(inode)) goto no_delete; - handle = start_transaction(inode); + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); if (IS_ERR(handle)) { + ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly @@ -213,8 +215,34 @@ void ext4_delete_inode (struct inode * inode) if (IS_SYNC(inode)) handle->h_sync = 1; inode->i_size = 0; + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_warning(inode->i_sb, __func__, + "couldn't mark inode dirty (err %d)", err); + goto stop_handle; + } if (inode->i_blocks) ext4_truncate(inode); + + /* + * ext4_ext_truncate() doesn't reserve any slop when it + * restarts journal transactions; therefore there may not be + * enough credits left in the handle to remove the inode from + * the orphan list and set the dtime field. + */ + if (handle->h_buffer_credits < 3) { + err = ext4_journal_extend(handle, 3); + if (err > 0) + err = ext4_journal_restart(handle, 3); + if (err != 0) { + ext4_warning(inode->i_sb, __func__, + "couldn't extend journal (err %d)", err); + stop_handle: + ext4_journal_stop(handle); + goto no_delete; + } + } + /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. @@ -952,6 +980,70 @@ out: return err; } +/* + * Calculate the number of metadata blocks need to reserve + * to allocate @blocks for non extent file based file + */ +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +{ + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); + int ind_blks, dind_blks, tind_blks; + + /* number of new indirect blocks needed */ + ind_blks = (blocks + icap - 1) / icap; + + dind_blks = (ind_blks + icap - 1) / icap; + + tind_blks = 1; + + return ind_blks + dind_blks + tind_blks; +} + +/* + * Calculate the number of metadata blocks need to reserve + * to allocate given number of blocks + */ +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +{ + if (!blocks) + return 0; + + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) + return ext4_ext_calc_metadata_amount(inode, blocks); + + return ext4_indirect_calc_metadata_amount(inode, blocks); +} + +static void ext4_da_update_reserve_space(struct inode *inode, int used) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + int total, mdb, mdb_free; + + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + /* recalculate the number of metablocks still need to be reserved */ + total = EXT4_I(inode)->i_reserved_data_blocks - used; + mdb = ext4_calc_metadata_amount(inode, total); + + /* figure out how many metablocks to release */ + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; + + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + + /* update fs free blocks counter for truncate case */ + percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); + + /* update per-inode reservations */ + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= used; + + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + EXT4_I(inode)->i_allocated_meta_blocks = 0; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* @@ -965,10 +1057,9 @@ out: /* + * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * and returns if the blocks are already mapped. * - * - * ext4_ext4 get_block() wrapper function - * It will do a look up first, and returns if the blocks already mapped. * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. @@ -1069,7 +1160,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, * which were deferred till now */ if ((retval > 0) && buffer_delay(bh)) - ext4_da_release_space(inode, retval, 0); + ext4_da_update_reserve_space(inode, retval); } up_write((&EXT4_I(inode)->i_data_sem)); @@ -1336,12 +1427,8 @@ static int ext4_ordered_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; - unsigned from, to; int ret = 0, ret2; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; - ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1437,36 +1524,6 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -/* - * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file - */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) -{ - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; - - dind_blks = (ind_blks + icap - 1) / icap; - - tind_blks = 1; - - return ind_blks + dind_blks + tind_blks; -} - -/* - * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks - */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) -{ - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); - - return ext4_indirect_calc_metadata_amount(inode, blocks); -} static int ext4_da_reserve_space(struct inode *inode, int nrblocks) { @@ -1490,7 +1547,6 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return -ENOSPC; } - /* reduce fs free blocks counter */ percpu_counter_sub(&sbi->s_freeblocks_counter, total); @@ -1501,35 +1557,49 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) return 0; /* success */ } -void ext4_da_release_space(struct inode *inode, int used, int to_free) +static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int total, mdb, mdb_free, release; + if (!to_free) + return; /* Nothing to release, exit */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; + total = EXT4_I(inode)->i_reserved_data_blocks - to_free; mdb = ext4_calc_metadata_amount(inode, total); /* figure out how many metablocks to release */ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - release = to_free + mdb_free; /* update fs free blocks counter for truncate case */ percpu_counter_add(&sbi->s_freeblocks_counter, release); /* update per-inode reservations */ - BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); + EXT4_I(inode)->i_reserved_data_blocks -= to_free; BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); EXT4_I(inode)->i_reserved_meta_blocks = mdb; - EXT4_I(inode)->i_allocated_meta_blocks = 0; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } @@ -1551,7 +1621,7 @@ static void ext4_da_page_release_reservation(struct page *page, } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - ext4_da_release_space(page->mapping->host, 0, to_release); + ext4_da_release_space(page->mapping->host, to_release); } /* @@ -1692,6 +1762,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1765,7 +1842,7 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) * If blocks are delayed marked, we need to * put actual blocknr and drop delayed bit */ - if (buffer_delay(lbh)) + if (buffer_delay(lbh) || buffer_unwritten(lbh)) mpage_put_bnr_to_bhs(mpd, next, &new); /* go for the remaining blocks */ @@ -1774,7 +1851,8 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) } } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) /* * mpage_add_bh_to_extent - try to add one more block to extent of blocks @@ -2280,8 +2358,11 @@ retry: } page = __grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; + if (!page) { + ext4_journal_stop(handle); + ret = -ENOMEM; + goto out; + } *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, @@ -2806,59 +2887,63 @@ static int ext4_journalled_set_page_dirty(struct page *page) } static const struct address_space_operations ext4_ordered_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_ordered_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_ordered_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_writeback_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_writeback_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_normal_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_writeback_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_journalled_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, - .sync_page = block_sync_page, - .write_begin = ext4_write_begin, - .write_end = ext4_journalled_write_end, - .set_page_dirty = ext4_journalled_set_page_dirty, - .bmap = ext4_bmap, - .invalidatepage = ext4_invalidatepage, - .releasepage = ext4_releasepage, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_journalled_writepage, + .sync_page = block_sync_page, + .write_begin = ext4_write_begin, + .write_end = ext4_journalled_write_end, + .set_page_dirty = ext4_journalled_set_page_dirty, + .bmap = ext4_bmap, + .invalidatepage = ext4_invalidatepage, + .releasepage = ext4_releasepage, + .is_partially_uptodate = block_is_partially_uptodate, }; static const struct address_space_operations ext4_da_aops = { - .readpage = ext4_readpage, - .readpages = ext4_readpages, - .writepage = ext4_da_writepage, - .writepages = ext4_da_writepages, - .sync_page = block_sync_page, - .write_begin = ext4_da_write_begin, - .write_end = ext4_da_write_end, - .bmap = ext4_bmap, - .invalidatepage = ext4_da_invalidatepage, - .releasepage = ext4_releasepage, - .direct_IO = ext4_direct_IO, - .migratepage = buffer_migrate_page, + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_da_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = ext4_da_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, }; void ext4_set_aops(struct inode *inode) @@ -3430,6 +3515,9 @@ void ext4_truncate(struct inode *inode) * modify the block allocation tree. */ down_write(&ei->i_data_sem); + + ext4_discard_reservation(inode); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3499,8 +3587,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -3586,6 +3672,16 @@ static int __ext4_get_inode_loc(struct inode *inode, } if (!buffer_uptodate(bh)) { lock_buffer(bh); + + /* + * If the buffer has the write error flag, we have failed + * to write out another inode in the same block. In this + * case, we don't have to read the block because we may + * read the old inode data successfully. + */ + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_uptodate(bh)) { /* someone brought it uptodate while we waited */ unlock_buffer(bh); @@ -4258,56 +4354,119 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction * - * With ordered or writeback data it's the same, less the N data blocks. + * This could be called via ext4_write_begin() or later + * ext4_da_writepages() in delalyed allocation case. * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * In both case it's possible that we could allocating multiple + * chunks of blocks. We need to consider the worse case, when + * one new block per extent. * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * For Direct IO and fallocate, the journal credits reservation + * is based on one single extent allocation, so they could use + * EXT4_DATA_TRANS_BLOCKS to get the needed credit to log a single + * chunk of allocation needs. */ - int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } - /* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh.