As spotted by kmemtrace, struct ext4_sb_info is 17664 bytes on 64-bit
which makes it a very bad fit for SLAB allocators.  The culprit of the
wasted memory is ->s_blockgroup_lock which can be as big as 16 KB when
NR_CPUS >= 32.
To fix that, allocate ->s_blockgroup_lock, which fits nicely in a order 2
page in the worst case, separately.  This shinks down struct ext4_sb_info
enough to fit a 2 KB slab cache so now we allocate 16 KB + 2 KB instead of
32 KB saving 14 KB of memory.
Acked-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-ext4@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyblocks_counter;
-       struct blockgroup_lock s_blockgroup_lock;
+       struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
 
        /* Journaling */
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-       return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+       return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 
 #endif /* _EXT4_SB */
 
                ext4_blkdev_remove(sbi);
        }
        sb->s_fs_info = NULL;
+       kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
 }
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+
+       sbi->s_blockgroup_lock =
+               kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+       if (!sbi->s_blockgroup_lock) {
+               kfree(sbi);
+               return -ENOMEM;
+       }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = EXT4_DEF_RESUID;
                                 &sbi->s_inode_readahead_blks);
 #endif
 
-       bgl_lock_init(&sbi->s_blockgroup_lock);
+       bgl_lock_init(sbi->s_blockgroup_lock);
 
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logical_sb_block, i);