From: Arjan van de Ven From: Stephen Tweedie The patch below adds online resize capability to ext3 based on Andreas patch for 2.4 and fixed up by Stephen. The patch also removes s_debts: s_debts is currently not used by ext3 (it is created, destroyed and checked but never set). Remove it for now. Resurrecting this will require adding it back in changed form. In existing form it's already unsafe wrt. byte-tearing as it performs unlocked byte increment/decrement on words which may be being accessed simultaneously on other CPUs. It is also the only in-memory dynamic table which needs to be extended by online-resize, so locking it will require care. Signed-off-by: Andrew Morton --- 25-akpm/fs/ext3/Makefile | 2 25-akpm/fs/ext3/balloc.c | 48 + 25-akpm/fs/ext3/ialloc.c | 2 25-akpm/fs/ext3/inode.c | 3 25-akpm/fs/ext3/ioctl.c | 45 + 25-akpm/fs/ext3/resize.c | 996 +++++++++++++++++++++++++++++++++++++ 25-akpm/fs/ext3/super.c | 53 + 25-akpm/include/linux/ext3_fs.h | 38 + 25-akpm/include/linux/ext3_fs_sb.h | 1 25-akpm/include/linux/ext3_jbd.h | 7 10 files changed, 1153 insertions(+), 42 deletions(-) diff -puN fs/ext3/balloc.c~ext3-online-resize-patch fs/ext3/balloc.c --- 25/fs/ext3/balloc.c~ext3-online-resize-patch 2004-10-20 01:56:57.512415424 -0700 +++ 25-akpm/fs/ext3/balloc.c 2004-10-20 02:40:17.202202504 -0700 @@ -54,6 +54,7 @@ struct ext3_group_desc * ext3_get_group_ return NULL; } + smp_rmb(); group_desc = block_group / EXT3_DESC_PER_BLOCK(sb); desc = block_group % EXT3_DESC_PER_BLOCK(sb); @@ -274,8 +275,9 @@ void ext3_discard_reservation(struct ino } /* Free given blocks, update quota and i_blocks field */ -void ext3_free_blocks(handle_t *handle, struct inode *inode, - unsigned long block, unsigned long count) +void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb, + unsigned long block, unsigned long count, + int *pdquot_freed_blocks) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; @@ -283,18 +285,12 @@ void ext3_free_blocks(handle_t *handle, unsigned long bit; unsigned long i; unsigned long overflow; - struct super_block * sb; struct ext3_group_desc * gdp; struct ext3_super_block * es; struct ext3_sb_info *sbi; int err = 0, ret; - int dquot_freed_blocks = 0; - sb = inode->i_sb; - if (!sb) { - printk ("ext3_free_blocks: nonexistent device"); - return; - } + *pdquot_freed_blocks = 0; sbi = EXT3_SB(sb); es = EXT3_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || @@ -421,7 +417,7 @@ do_more: jbd_lock_bh_state(bitmap_bh); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - dquot_freed_blocks++; + (*pdquot_freed_blocks)++; } } jbd_unlock_bh_state(bitmap_bh); @@ -429,7 +425,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + - dquot_freed_blocks); + *pdquot_freed_blocks); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); @@ -451,6 +447,22 @@ do_more: error_return: brelse(bitmap_bh); ext3_std_error(sb, err); + return; +} + +/* Free given blocks, update quota and i_blocks field */ +void ext3_free_blocks(handle_t *handle, struct inode *inode, + unsigned long block, unsigned long count) +{ + struct super_block * sb; + int dquot_freed_blocks; + + sb = inode->i_sb; + if (!sb) { + printk ("ext3_free_blocks: nonexistent device"); + return; + } + ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks); if (dquot_freed_blocks) DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); return; @@ -1141,6 +1153,8 @@ int ext3_new_block(handle_t *handle, str #ifdef EXT3FS_DEBUG static int goal_hits, goal_attempts; #endif + unsigned long ngroups; + *errp = -ENOSPC; sb = inode->i_sb; if (!sb) { @@ -1205,13 +1219,16 @@ retry: goto allocated; } + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + /* * Now search the rest of the groups. We assume that * i and gdp correctly point to the last group visited. */ - for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) { + for (bgi = 0; bgi < ngroups; bgi++) { group_no++; - if (group_no >= EXT3_SB(sb)->s_groups_count) + if (group_no >= ngroups) group_no = 0; gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); if (!gdp) { @@ -1362,6 +1379,7 @@ unsigned long ext3_count_free_blocks(str unsigned long desc_count; struct ext3_group_desc *gdp; int i; + unsigned long ngroups; #ifdef EXT3FS_DEBUG struct ext3_super_block *es; unsigned long bitmap_count, x; @@ -1394,7 +1412,9 @@ unsigned long ext3_count_free_blocks(str return bitmap_count; #else desc_count = 0; - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) { + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + for (i = 0; i < ngroups; i++) { gdp = ext3_get_group_desc(sb, i, NULL); if (!gdp) continue; diff -puN fs/ext3/inode.c~ext3-online-resize-patch fs/ext3/inode.c --- 25/fs/ext3/inode.c~ext3-online-resize-patch 2004-10-20 01:56:57.514415120 -0700 +++ 25-akpm/fs/ext3/inode.c 2004-10-20 02:40:09.233413944 -0700 @@ -2231,8 +2231,10 @@ static unsigned long ext3_get_inode_bloc struct buffer_head *bh; struct ext3_group_desc * gdp; + if ((ino != EXT3_ROOT_INO && ino != EXT3_JOURNAL_INO && + ino != EXT3_RESIZE_INO && ino < EXT3_FIRST_INO(sb)) || ino > le32_to_cpu( EXT3_SB(sb)->s_es->s_inodes_count)) { @@ -2246,6 +2248,7 @@ static unsigned long ext3_get_inode_bloc "group >= groups count"); return 0; } + smp_rmb(); group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); bh = EXT3_SB(sb)->s_group_desc[group_desc]; diff -puN fs/ext3/ioctl.c~ext3-online-resize-patch fs/ext3/ioctl.c --- 25/fs/ext3/ioctl.c~ext3-online-resize-patch 2004-10-20 01:56:57.515414968 -0700 +++ 25-akpm/fs/ext3/ioctl.c 2004-10-20 02:39:57.153250408 -0700 @@ -175,6 +175,51 @@ flags_err: rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; atomic_set(&ei->i_rsv_window.rsv_goal_size, rsv_window_size); return 0; + case EXT3_IOC_GROUP_EXTEND: { + unsigned long n_blocks_count; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (get_user(n_blocks_count, (__u32 *)arg)) + return -EFAULT; + + err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + + return err; + } + case EXT3_IOC_GROUP_ADD: { + struct ext3_new_group_data input; + struct super_block *sb = inode->i_sb; + int err; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + if (IS_RDONLY(inode)) + return -EROFS; + + if (copy_from_user(&input, (struct ext3_new_group_input *)arg, + sizeof(input))) + return -EFAULT; + + err = ext3_group_add(sb, &input); + journal_lock_updates(EXT3_SB(sb)->s_journal); + journal_flush(EXT3_SB(sb)->s_journal); + journal_unlock_updates(EXT3_SB(sb)->s_journal); + + return err; + } + + default: return -ENOTTY; } diff -puN fs/ext3/Makefile~ext3-online-resize-patch fs/ext3/Makefile --- 25/fs/ext3/Makefile~ext3-online-resize-patch 2004-10-20 01:56:57.517414664 -0700 +++ 25-akpm/fs/ext3/Makefile 2004-10-20 01:56:57.530412688 -0700 @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o + ioctl.o namei.o super.o symlink.o hash.o resize.o ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff -puN /dev/null fs/ext3/resize.c --- /dev/null 2003-09-15 06:40:47.000000000 -0700 +++ 25-akpm/fs/ext3/resize.c 2004-10-20 02:40:25.153993648 -0700 @@ -0,0 +1,996 @@ +/* + * linux/fs/ext3/resize.c + * + * Support for resizing an ext3 filesystem while it is mounted. + * + * Copyright (C) 2001, 2002 Andreas Dilger + * + * This could probably be made into a module, because it is not often in use. + */ + +#include + +#define EXT3FS_DEBUG + +#include +#include +#include + +#include +#include + + +#define outside(b, first, last) ((b) < (first) || (b) >= (last)) +#define inside(b, first, last) ((b) >= (first) && (b) < (last)) + +static int verify_group_input(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + unsigned start = le32_to_cpu(es->s_blocks_count); + unsigned end = start + input->blocks_count; + unsigned group = input->group; + unsigned itend = input->inode_table + EXT3_SB(sb)->s_itb_per_group; + unsigned overhead = ext3_bg_has_super(sb, group) ? + (1 + ext3_bg_num_gdb(sb, group) + + le16_to_cpu(es->s_reserved_gdt_blocks)) : 0; + unsigned metaend = start + overhead; + struct buffer_head *bh = NULL; + int free_blocks_count; + int err = -EINVAL; + + input->free_blocks_count = free_blocks_count = + input->blocks_count - 2 - overhead - sbi->s_itb_per_group; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: adding %s group %u: %u blocks " + "(%d free, %u reserved)\n", + ext3_bg_has_super(sb, input->group) ? "normal" : + "no-super", input->group, input->blocks_count, + free_blocks_count, input->reserved_blocks); + + if (group != sbi->s_groups_count) + ext3_warning(sb, __FUNCTION__, + "Cannot add at group %u (only %lu groups)", + input->group, sbi->s_groups_count); + else if ((start - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb)) + ext3_warning(sb, __FUNCTION__, "Last group not full"); + else if (input->reserved_blocks > input->blocks_count / 5) + ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + input->reserved_blocks); + else if (free_blocks_count < 0) + ext3_warning(sb, __FUNCTION__, "Bad blocks count %u", + input->blocks_count); + else if (!(bh = sb_bread(sb, end - 1))) + ext3_warning(sb, __FUNCTION__, "Cannot read last block (%u)", + end - 1); + else if (outside(input->block_bitmap, start, end)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap not in group (block %u)", + input->block_bitmap); + else if (outside(input->inode_bitmap, start, end)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap not in group (block %u)", + input->inode_bitmap); + else if (outside(input->inode_table, start, end) || + outside(itend - 1, start, end)) + ext3_warning(sb, __FUNCTION__, + "Inode table not in group (blocks %u-%u)", + input->inode_table, itend - 1); + else if (input->inode_bitmap == input->block_bitmap) + ext3_warning(sb, __FUNCTION__, + "Block bitmap same as inode bitmap (%u)", + input->block_bitmap); + else if (inside(input->block_bitmap, input->inode_table, itend)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap (%u) in inode table (%u-%u)", + input->block_bitmap, input->inode_table, itend-1); + else if (inside(input->inode_bitmap, input->inode_table, itend)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in inode table (%u-%u)", + input->inode_bitmap, input->inode_table, itend-1); + else if (inside(input->block_bitmap, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Block bitmap (%u) in GDT table (%u-%u)", + input->block_bitmap, start, metaend - 1); + else if (inside(input->inode_bitmap, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Inode bitmap (%u) in GDT table (%u-%u)", + input->inode_bitmap, start, metaend - 1); + else if (inside(input->inode_table, start, metaend) || + inside(itend - 1, start, metaend)) + ext3_warning(sb, __FUNCTION__, + "Inode table (%u-%u) overlaps GDT table (%u-%u)", + input->inode_table, itend - 1, start, metaend - 1); + else + err = 0; + brelse(bh); + + return err; +} + +static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, + unsigned long blk) +{ + struct buffer_head *bh; + int err; + + bh = sb_getblk(sb, blk); + if ((err = ext3_journal_get_write_access(handle, bh))) { + brelse(bh); + bh = ERR_PTR(err); + } else { + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + } + + return bh; +} + +/* + * To avoid calling the atomic setbit hundreds or thousands of times, we only + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ +static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + + if (start_bit >= end_bit) + return; + + ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); + for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) + ext3_set_bit(i, bitmap); + if (i < end_bit) + memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +} + +/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to + * ensure the recovery is correct in case of a failure just after resize. + * If any part of this fails, we simply abort the resize. + */ +static int setup_new_group_blocks(struct super_block *sb, + struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + unsigned long start = input->group * sbi->s_blocks_per_group + + le32_to_cpu(sbi->s_es->s_first_data_block); + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0; + unsigned long gdblocks = ext3_bg_num_gdb(sb, input->group); + struct buffer_head *bh; + handle_t *handle; + unsigned long block; + int bit; + int i; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, reserved_gdb + gdblocks + + 2 + sbi->s_itb_per_group); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + lock_super(sb); + if (input->group != sbi->s_groups_count) { + err = -EBUSY; + goto exit_journal; + } + + if (IS_ERR(bh = bclean(handle, sb, input->block_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + if (ext3_bg_has_super(sb, input->group)) { + ext3_debug("mark backup superblock %#04lx (+0)\n", start); + ext3_set_bit(0, bh->b_data); + } + + /* Copy all of the GDT blocks into the backup in this group */ + for (i = 0, bit = 1, block = start + 1; + i < gdblocks; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("update backup group %#04lx (+%d)\n", block, bit); + + gdb = sb_getblk(sb, block); + if ((err = ext3_journal_get_write_access(handle, gdb))) { + brelse(gdb); + goto exit_bh; + } + lock_buffer(bh); + memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + set_buffer_uptodate(gdb); + unlock_buffer(bh); + ext3_journal_dirty_metadata(handle, gdb); + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + + /* Zero out all of the reserved backup group descriptor table blocks */ + for (i = 0, bit = gdblocks + 1, block = start + bit; + i < reserved_gdb; i++, block++, bit++) { + struct buffer_head *gdb; + + ext3_debug("clear reserved block %#04lx (+%d)\n", block, bit); + + if (IS_ERR(gdb = bclean(handle, sb, block))) { + err = PTR_ERR(bh); + goto exit_bh; + } + ext3_journal_dirty_metadata(handle, gdb); + ext3_set_bit(bit, bh->b_data); + brelse(gdb); + } + ext3_debug("mark block bitmap %#04x (+%ld)\n", input->block_bitmap, + input->block_bitmap - start); + ext3_set_bit(input->block_bitmap - start, bh->b_data); + ext3_debug("mark inode bitmap %#04x (+%ld)\n", input->inode_bitmap, + input->inode_bitmap - start); + ext3_set_bit(input->inode_bitmap - start, bh->b_data); + + /* Zero out all of the inode table blocks */ + for (i = 0, block = input->inode_table, bit = block - start; + i < sbi->s_itb_per_group; i++, bit++, block++) { + struct buffer_head *it; + + ext3_debug("clear inode block %#04x (+%ld)\n", block, bit); + if (IS_ERR(it = bclean(handle, sb, block))) { + err = PTR_ERR(it); + goto exit_bh; + } + ext3_journal_dirty_metadata(handle, it); + brelse(it); + ext3_set_bit(bit, bh->b_data); + } + mark_bitmap_end(input->blocks_count, EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + + /* Mark unused entries in inode bitmap used */ + ext3_debug("clear inode bitmap %#04x (+%ld)\n", + input->inode_bitmap, input->inode_bitmap - start); + if (IS_ERR(bh = bclean(handle, sb, input->inode_bitmap))) { + err = PTR_ERR(bh); + goto exit_journal; + } + + mark_bitmap_end(EXT3_INODES_PER_GROUP(sb), EXT3_BLOCKS_PER_GROUP(sb), + bh->b_data); + ext3_journal_dirty_metadata(handle, bh); +exit_bh: + brelse(bh); + +exit_journal: + unlock_super(sb); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + return err; +} + +/* + * Iterate through the groups which hold BACKUP superblock/GDT copies in an + * ext3 filesystem. The counters should be initialized to 1, 5, and 7 before + * calling this for the first time. In a sparse filesystem it will be the + * sequence of powers of 3, 5, and 7: 1, 3, 5, 7, 9, 25, 27, 49, 81, ... + * For a non-sparse filesystem it will be every group: 1, 2, 3, 4, ... + */ +unsigned ext3_list_backups(struct super_block *sb, unsigned *three, + unsigned *five, unsigned *seven) +{ + unsigned *min = three; + int mult = 3; + unsigned ret; + + if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ret = *min; + *min += 1; + return ret; + } + + if (*five < *min) { + min = five; + mult = 5; + } + if (*seven < *min) { + min = seven; + mult = 7; + } + + ret = *min; + *min *= mult; + + return ret; +} + +/* + * Check that all of the backup GDT blocks are held in the primary GDT block. + * It is assumed that they are stored in group order. Returns the number of + * groups in current filesystem that have BACKUPS, or -ve error code. + */ +static int verify_reserved_gdb(struct super_block *sb, + struct buffer_head *primary) +{ + const unsigned long blk = primary->b_blocknr; + const unsigned long end = EXT3_SB(sb)->s_groups_count; + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned grp; + __u32 *p = (__u32 *)primary->b_data; + int gdbackups = 0; + + while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { + if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ + ext3_warning(sb, __FUNCTION__, + "reserved GDT %ld missing grp %d (%ld)\n", + blk, grp, + grp * EXT3_BLOCKS_PER_GROUP(sb) + blk); + return -EINVAL; + } + if (++gdbackups > EXT3_ADDR_PER_BLOCK(sb)) + return -EFBIG; + } + + return gdbackups; +} + +/* + * Called when we need to bring a reserved group descriptor table block into + * use from the resize inode. The primary copy of the new GDT block currently + * is an indirect block (under the double indirect block in the resize inode). + * The new backup GDT blocks will be stored as leaf blocks in this indirect + * block, in group order. Even though we know all the block numbers we need, + * we check to ensure that the resize inode has actually reserved these blocks. + * + * Don't need to update the block bitmaps because the blocks are still in use. + * + * We get all of the error cases out of the way, so that we are sure to not + * fail once we start modifying the data on disk, because JBD has no rollback. + */ +static int add_new_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input, + struct buffer_head **primary) +{ + struct super_block *sb = inode->i_sb; + struct ext3_super_block *es = EXT3_SB(sb)->s_es; + unsigned long gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + unsigned long gdblock = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + gdb_num; + struct buffer_head **o_group_desc, **n_group_desc; + struct buffer_head *dind; + int gdbackups; + struct ext3_iloc iloc; + __u32 *data; + int err; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG + "EXT3-fs: ext3_add_new_gdb: adding group block %lu\n", + gdb_num); + + /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT3_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { + ext3_warning(sb, __FUNCTION__, + "won't resize using backup superblock at %llu\n", + (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + *primary = sb_bread(sb, gdblock); + if (!*primary) + return -EIO; + + if ((gdbackups = verify_reserved_gdb(sb, *primary)) < 0) { + err = gdbackups; + goto exit_bh; + } + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_bh; + } + + data = (__u32 *)dind->b_data; + if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { + ext3_warning(sb, __FUNCTION__, + "new group %u GDT block %lu not reserved\n", + input->group, gdblock); + err = -EINVAL; + goto exit_dind; + } + + if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) + goto exit_dind; + + if ((err = ext3_journal_get_write_access(handle, *primary))) + goto exit_sbh; + + if ((err = ext3_journal_get_write_access(handle, dind))) + goto exit_primary; + + /* ext3_reserve_inode_write() gets a reference on the iloc */ + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_dindj; + + n_group_desc = (struct buffer_head **)kmalloc((gdb_num + 1) * + sizeof(struct buffer_head *), GFP_KERNEL); + if (!n_group_desc) { + err = -ENOMEM; + ext3_warning (sb, __FUNCTION__, + "not enough memory for %lu groups", gdb_num + 1); + goto exit_inode; + } + + /* + * Finally, we have all of the possible failures behind us... + * + * Remove new GDT block from inode double-indirect block and clear out + * the new GDT block for use (which also "frees" the backup GDT blocks + * from the reserved inode). We don't need to change the bitmaps for + * these blocks, because they are marked as in-use from being in the + * reserved inode, and will become GDT blocks (primary and backup). + */ + data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)] = 0; + ext3_journal_dirty_metadata(handle, dind); + brelse(dind); + inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + memset((*primary)->b_data, 0, sb->s_blocksize); + ext3_journal_dirty_metadata(handle, *primary); + + o_group_desc = EXT3_SB(sb)->s_group_desc; + memcpy(n_group_desc, o_group_desc, + EXT3_SB(sb)->s_gdb_count * sizeof(struct buffer_head *)); + n_group_desc[gdb_num] = *primary; + EXT3_SB(sb)->s_group_desc = n_group_desc; + EXT3_SB(sb)->s_gdb_count++; + kfree(o_group_desc); + + es->s_reserved_gdt_blocks = + cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + + return 0; + +exit_inode: + //ext3_journal_release_buffer(handle, iloc.bh); + brelse(iloc.bh); +exit_dindj: + //ext3_journal_release_buffer(handle, dind); +exit_primary: + //ext3_journal_release_buffer(handle, *primary); +exit_sbh: + //ext3_journal_release_buffer(handle, *primary); +exit_dind: + brelse(dind); +exit_bh: + brelse(*primary); + + ext3_debug("leaving with error %d\n", err); + return err; +} + +/* + * Called when we are adding a new group which has a backup copy of each of + * the GDT blocks (i.e. sparse group) and there are reserved GDT blocks. + * We need to add these reserved backup GDT blocks to the resize inode, so + * that they are kept for future resizing and not allocated to files. + * + * Each reserved backup GDT block will go into a different indirect block. + * The indirect blocks are actually the primary reserved GDT blocks, + * so we know in advance what their block numbers are. We only get the + * double-indirect block to verify it is pointing to the primary reserved + * GDT blocks so we don't overwrite a data block by accident. The reserved + * backup GDT blocks are stored in their reserved primary GDT block. + */ +static int reserve_backup_gdb(handle_t *handle, struct inode *inode, + struct ext3_new_group_data *input) +{ + struct super_block *sb = inode->i_sb; + int reserved_gdb =le16_to_cpu(EXT3_SB(sb)->s_es->s_reserved_gdt_blocks); + struct buffer_head **primary; + struct buffer_head *dind; + struct ext3_iloc iloc; + unsigned long blk; + __u32 *data, *end; + int gdbackups = 0; + int res, i; + int err; + + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + if (!primary) + return -ENOMEM; + + data = EXT3_I(inode)->i_data + EXT3_DIND_BLOCK; + dind = sb_bread(sb, le32_to_cpu(*data)); + if (!dind) { + err = -EIO; + goto exit_free; + } + + blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count; + data = (__u32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count; + end = (__u32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb); + + /* Get each reserved primary GDT block and verify it holds backups */ + for (res = 0; res < reserved_gdb; res++, blk++) { + if (le32_to_cpu(*data) != blk) { + ext3_warning(sb, __FUNCTION__, + "reserved block %lu not at offset %ld\n", + blk, (long)(data - (__u32 *)dind->b_data)); + err = -EINVAL; + goto exit_bh; + } + primary[res] = sb_bread(sb, blk); + if (!primary[res]) { + err = -EIO; + goto exit_bh; + } + if ((gdbackups = verify_reserved_gdb(sb, primary[res])) < 0) { + brelse(primary[res]); + err = gdbackups; + goto exit_bh; + } + if (++data >= end) + data = (__u32 *)dind->b_data; + } + + for (i = 0; i < reserved_gdb; i++) { + if ((err = ext3_journal_get_write_access(handle, primary[i]))) { + /* + int j; + for (j = 0; j < i; j++) + ext3_journal_release_buffer(handle, primary[j]); + */ + goto exit_bh; + } + } + + if ((err = ext3_reserve_inode_write(handle, inode, &iloc))) + goto exit_bh; + + /* + * Finally we can add each of the reserved backup GDT blocks from + * the new group to its reserved primary GDT block. + */ + blk = input->group * EXT3_BLOCKS_PER_GROUP(sb); + for (i = 0; i < reserved_gdb; i++) { + int err2; + data = (__u32 *)primary[i]->b_data; + /* printk("reserving backup %lu[%u] = %lu\n", + primary[i]->b_blocknr, gdbackups, + blk + primary[i]->b_blocknr); */ + data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); + err2 = ext3_journal_dirty_metadata(handle, primary[i]); + if (!err) + err = err2; + } + inode->i_blocks += reserved_gdb * sb->s_blocksize >> 9; + ext3_mark_iloc_dirty(handle, inode, &iloc); + +exit_bh: + while (--res >= 0) + brelse(primary[res]); + brelse(dind); + +exit_free: + kfree(primary); + + return err; +} + +/* + * Update the backup copies of the ext3 metadata. These don't need to be part + * of the main resize transaction, because e2fsck will re-write them if there + * is a problem (basically only OOM will cause a problem). However, we + * _should_ update the backups if possible, in case the primary gets trashed + * for some reason and we need to run e2fsck from a backup superblock. The + * important part is that the new block and inode counts are in the backup + * superblocks, and the location of the new group metadata in the GDT backups. + * + * We do not need lock_super() for this, because these blocks are not + * otherwise touched by the filesystem code when it is mounted. We don't + * need to worry about last changing from sbi->s_groups_count, because the + * worst that can happen is that we do not copy the full number of backups + * at this time. The resize which changed s_groups_count will backup again. + */ +static void update_backups(struct super_block *sb, + int blk_off, char *data, int size) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + const unsigned long last = sbi->s_groups_count; + const int bpg = EXT3_BLOCKS_PER_GROUP(sb); + unsigned three = 1; + unsigned five = 5; + unsigned seven = 7; + unsigned group; + int rest = sb->s_blocksize - size; + handle_t *handle; + int err = 0, err2; + + handle = ext3_journal_start_sb(sb, EXT3_MAX_TRANS_DATA); + if (IS_ERR(handle)) { + group = 1; + err = PTR_ERR(handle); + goto exit_err; + } + + while ((group = ext3_list_backups(sb, &three, &five, &seven)) < last) { + struct buffer_head *bh; + + /* Out of journal space, and can't get more - abort - so sad */ + if (handle->h_buffer_credits == 0 && + ext3_journal_extend(handle, EXT3_MAX_TRANS_DATA) && + (err = ext3_journal_restart(handle, EXT3_MAX_TRANS_DATA))) + break; + + bh = sb_getblk(sb, group * bpg + blk_off); + ext3_debug(sb, __FUNCTION__, "update metadata backup %#04lx\n", + bh->b_blocknr); + if ((err = ext3_journal_get_write_access(handle, bh))) + break; + lock_buffer(bh); + memcpy(bh->b_data, data, size); + if (rest) + memset(bh->b_data + size, 0, rest); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ext3_journal_dirty_metadata(handle, bh); + brelse(bh); + } + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + + /* + * Ugh! Need to have e2fsck write the backup copies. It is too + * late to revert the resize, we shouldn't fail just because of + * the backup copies (they are only needed in case of corruption). + * + * However, if we got here we have a journal problem too, so we + * can't really start a transaction to mark the superblock. + * Chicken out and just set the flag on the hope it will be written + * to disk, and if not - we will simply wait until next fsck. + */ +exit_err: + if (err) { + ext3_warning(sb, __FUNCTION__, + "can't update backup for group %d (err %d), " + "forcing fsck on next reboot\n", group, err); + sbi->s_mount_state &= ~EXT3_VALID_FS; + sbi->s_es->s_state &= ~cpu_to_le16(EXT3_VALID_FS); + mark_buffer_dirty(sbi->s_sbh); + } +} + +/* Add group descriptor data to an existing or new group descriptor block. + * Ensure we handle all possible error conditions _before_ we start modifying + * the filesystem, because we cannot abort the transaction and not have it + * write the data to disk. + * + * If we are on a GDT block boundary, we need to get the reserved GDT block. + * Otherwise, we may need to add backup GDT blocks for a sparse group. + * + * We only need to hold the superblock lock while we are actually adding + * in the new group's counts to the superblock. Prior to that we have + * not really "added" the group at all. We re-check that we are still + * adding in the last group in case things have changed since verifying. + */ +int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + struct ext3_super_block *es = sbi->s_es; + int reserved_gdb = ext3_bg_has_super(sb, input->group) ? + le16_to_cpu(es->s_reserved_gdt_blocks) : 0; + struct buffer_head *primary = NULL; + struct ext3_group_desc *gdp; + struct inode *inode = NULL; + handle_t *handle; + int gdb_off, gdb_num; + int err, err2; + + gdb_num = input->group / EXT3_DESC_PER_BLOCK(sb); + gdb_off = input->group % EXT3_DESC_PER_BLOCK(sb); + + if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, + EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { + ext3_warning(sb, __FUNCTION__, + "Can't resize non-sparse filesystem further\n"); + return -EPERM; + } + + if (reserved_gdb || gdb_off == 0) { + if (!EXT3_HAS_COMPAT_FEATURE(sb, + EXT3_FEATURE_COMPAT_RESIZE_INODE)){ + ext3_warning(sb, __FUNCTION__, + "No reserved GDT blocks, can't resize\n"); + return -EPERM; + } + inode = iget(sb, EXT3_RESIZE_INO); + if (!inode || is_bad_inode(inode)) { + ext3_warning(sb, __FUNCTION__, + "Error opening resize inode\n"); + iput(inode); + return -ENOENT; + } + } + + if ((err = verify_group_input(sb, input))) + goto exit_put; + + if ((err = setup_new_group_blocks(sb, input))) + goto exit_put; + + /* + * We will always be modifying at least the superblock and a GDT + * block. If we are adding a group past the last current GDT block, + * we will also modify the inode and the dindirect block. If we + * are adding a group with superblock/GDT backups we will also + * modify each of the reserved GDT dindirect blocks. + */ + handle = ext3_journal_start_sb(sb, + ext3_bg_has_super(sb, input->group) ? + 3 + reserved_gdb : 4); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto exit_put; + } + + lock_super(sb); + if (input->group != EXT3_SB(sb)->s_groups_count) { + ext3_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!\n"); + goto exit_journal; + } + + if ((err = ext3_journal_get_write_access(handle, sbi->s_sbh))) + goto exit_journal; + + /* + * We will only either add reserved group blocks to a backup group + * or remove reserved blocks for the first group in a new group block. + * Doing both would be mean more complex code, and sane people don't + * use non-sparse filesystems anymore. This is already checked above. + */ + if (gdb_off) { + primary = sbi->s_group_desc[gdb_num]; + if ((err = ext3_journal_get_write_access(handle, primary))) + goto exit_journal; + + if (reserved_gdb && ext3_bg_num_gdb(sb, input->group) && + (err = reserve_backup_gdb(handle, inode, input))) + goto exit_journal; + } else if ((err = add_new_gdb(handle, inode, input, &primary))) + goto exit_journal; + + /* + * OK, now we've set up the new group. Time to make it active. + * + * Current kernels don't lock all allocations via lock_super(), + * so we have to be safe wrt. concurrent accesses the group + * data. So we need to be careful to set all of the relevant + * group descriptor data etc. *before* we enable the group. + * + * The key field here is EXT3_SB(sb)->s_groups_count: as long as + * that retains its old value, nobody is going to access the new + * group. + * + * So first we update all the descriptor metadata for the new + * group; then we update the total disk blocks count; then we + * update the groups count to enable the group; then finally we + * update the free space counts so that the system can start + * using the new disk blocks. + */ + + /* Update group descriptor block for new group */ + gdp = (struct ext3_group_desc *)primary->b_data + gdb_off; + + gdp->bg_block_bitmap = cpu_to_le32(input->block_bitmap); + gdp->bg_inode_bitmap = cpu_to_le32(input->inode_bitmap); + gdp->bg_inode_table = cpu_to_le32(input->inode_table); + gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); + gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); + + /* + * Make the new blocks and inodes valid next. We do this before + * increasing the group count so that once the group is enabled, + * all of its blocks and inodes are already valid. + * + * We always allocate group-by-group, then block-by-block or + * inode-by-inode within a group, so enabling these + * blocks/inodes before the group is live won't actually let us + * allocate the new space yet. + */ + es->s_blocks_count = cpu_to_le32(le32_to_cpu(es->s_blocks_count) + + input->blocks_count); + es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + + EXT3_INODES_PER_GROUP(sb)); + + /* + * We need to protect s_groups_count against other CPUs seeing + * inconsistent state in the superblock. + * + * The precise rules we use are: + * + * * Writers of s_groups_count *must* hold lock_super + * AND + * * Writers must perform a smp_wmb() after updating all dependent + * data and before modifying the groups count + * + * * Readers must hold lock_super() over the access + * OR + * * Readers must perform an smp_rmb() after reading the groups count + * and before reading any dependent data. + * + * NB. These rules can be relaxed when checking the group count + * while freeing data, as we can only allocate from a block + * group after serialising against the group count, and we can + * only then free after serialising in turn against that + * allocation. + */ + smp_wmb(); + + /* Update the global fs size fields */ + EXT3_SB(sb)->s_groups_count++; + + ext3_journal_dirty_metadata(handle, primary); + + /* Update the reserved block counts only once the new group is + * active. */ + es->s_r_blocks_count = cpu_to_le32(le32_to_cpu(es->s_r_blocks_count) + + input->reserved_blocks); + + /* Update the free space counts */ + percpu_counter_mod(&sbi->s_freeblocks_counter, + input->free_blocks_count); + percpu_counter_mod(&sbi->s_freeinodes_counter, + EXT3_INODES_PER_GROUP(sb)); + + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; + +exit_journal: + unlock_super(sb); + if ((err2 = ext3_journal_stop(handle)) && !err) + err = err2; + if (!err) { + update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); + update_backups(sb, primary->b_blocknr, primary->b_data, + primary->b_size); + } +exit_put: + iput(inode); + return err; +} /* ext3_group_add */ + +/* Extend the filesystem to the new number of blocks specified. This entry + * point is only used to extend the current filesystem to the end of the last + * existing group. It can be accessed via ioctl, or by "remount,resize=" + * for emergencies (because it has no dependencies on reserved blocks). + * + * If we _really_ wanted, we could use default values to call ext3_group_add() + * allow the "remount" trick to work for arbitrary resizing, assuming enough + * GDT blocks are reserved to grow to the desired size. + */ +int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, + unsigned long n_blocks_count) +{ + unsigned long o_blocks_count; + unsigned long o_groups_count; + unsigned long last; + int add; + struct buffer_head * bh; + handle_t *handle; + int err, freed_blocks; + + /* We don't need to worry about locking wrt other resizers just + * yet: we're going to revalidate es->s_blocks_count after + * taking lock_super() below. */ + o_blocks_count = le32_to_cpu(es->s_blocks_count); + o_groups_count = EXT3_SB(sb)->s_groups_count; + + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extending last group from %lu to %lu blocks\n", + o_blocks_count, n_blocks_count); + + if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) + return 0; + + if (n_blocks_count < o_blocks_count) { + ext3_warning(sb, __FUNCTION__, + "can't shrink FS - resize aborted"); + return -EBUSY; + } + + /* Handle the remaining blocks in the last group only. */ + last = (o_blocks_count - le32_to_cpu(es->s_first_data_block)) % + EXT3_BLOCKS_PER_GROUP(sb); + + if (last == 0) { + ext3_warning(sb, __FUNCTION__, + "need to use ext2online to resize further\n"); + return -EPERM; + } + + add = EXT3_BLOCKS_PER_GROUP(sb) - last; + + if (o_blocks_count + add > n_blocks_count) + add = n_blocks_count - o_blocks_count; + + if (o_blocks_count + add < n_blocks_count) + ext3_warning(sb, __FUNCTION__, + "will only finish group (%lu blocks, %u new)", + o_blocks_count + add, add); + + /* See if the device is actually as big as what was requested */ + bh = sb_bread(sb, o_blocks_count + add -1); + if (!bh) { + ext3_warning(sb, __FUNCTION__, + "can't read last block, resize aborted"); + return -ENOSPC; + } + brelse(bh); + + /* We will update the superblock, one block bitmap, and + * one group descriptor via ext3_free_blocks(). + */ + handle = ext3_journal_start_sb(sb, 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + ext3_warning(sb, __FUNCTION__, "error %d on journal start",err); + goto exit_put; + } + + lock_super(sb); + if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { + ext3_warning(sb, __FUNCTION__, + "multiple resizers run on filesystem!\n"); + err = -EBUSY; + goto exit_put; + } + + if ((err = ext3_journal_get_write_access(handle, + EXT3_SB(sb)->s_sbh))) { + ext3_warning(sb, __FUNCTION__, + "error %d on journal write access", err); + unlock_super(sb); + ext3_journal_stop(handle); + goto exit_put; + } + es->s_blocks_count = cpu_to_le32(o_blocks_count + add); + ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); + sb->s_dirt = 1; + unlock_super(sb); + ext3_debug("freeing blocks %ld through %ld\n", o_blocks_count, + o_blocks_count + add); + ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + ext3_debug("freed blocks %ld through %ld\n", o_blocks_count, + o_blocks_count + add); + if ((err = ext3_journal_stop(handle))) + goto exit_put; + if (test_opt(sb, DEBUG)) + printk(KERN_DEBUG "EXT3-fs: extended group to %u blocks\n", + le32_to_cpu(es->s_blocks_count)); + update_backups(sb, EXT3_SB(sb)->s_sbh->b_blocknr, (char *)es, + sizeof(struct ext3_super_block)); +exit_put: + return err; +} /* ext3_group_extend */ diff -puN fs/ext3/super.c~ext3-online-resize-patch fs/ext3/super.c --- 25/fs/ext3/super.c~ext3-online-resize-patch 2004-10-20 01:56:57.519414360 -0700 +++ 25-akpm/fs/ext3/super.c 2004-10-20 02:40:13.204810200 -0700 @@ -59,19 +59,19 @@ static int ext3_sync_fs(struct super_blo * that sync() will call the filesystem's write_super callback if * appropriate. */ -handle_t *ext3_journal_start(struct inode *inode, int nblocks) +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) { journal_t *journal; - if (inode->i_sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ - journal = EXT3_JOURNAL(inode); + journal = EXT3_SB(sb)->s_journal; if (is_journal_aborted(journal)) { - ext3_abort(inode->i_sb, __FUNCTION__, + ext3_abort(sb, __FUNCTION__, "Detected aborted journal"); return ERR_PTR(-EROFS); } @@ -400,7 +400,6 @@ void ext3_put_super (struct super_block for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_debts); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) { @@ -582,7 +581,7 @@ enum { Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, - Opt_ignore, Opt_barrier, Opt_err, + Opt_ignore, Opt_barrier, Opt_err, Opt_resize, }; static match_table_t tokens = { @@ -630,7 +629,8 @@ static match_table_t tokens = { {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_err, NULL} + {Opt_err, NULL}, + {Opt_resize, "resize"}, }; static unsigned long get_sb_block(void **data) @@ -654,7 +654,7 @@ static unsigned long get_sb_block(void * } static int parse_options (char * options, struct super_block *sb, - unsigned long * inum, int is_remount) + unsigned long * inum, unsigned long *n_blocks_count, int is_remount) { struct ext3_sb_info *sbi = EXT3_SB(sb); char * p; @@ -911,6 +911,15 @@ clear_qf_name: break; case Opt_ignore: break; + case Opt_resize: + if (!n_blocks_count) { + printk("EXT3-fs: resize option only available " + "for remount\n"); + return 0; + } + match_int(&args[0], &option); + *n_blocks_count = option; + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " @@ -1004,6 +1013,7 @@ static int ext3_setup_super(struct super return res; } +/* Called at mount-time, super-block is locked */ static int ext3_check_descriptors (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -1302,7 +1312,7 @@ static int ext3_fill_super (struct super set_opt(sbi->s_mount_opt, RESERVATION); - if (!parse_options ((char *) data, sb, &journal_inum, 0)) + if (!parse_options ((char *) data, sb, &journal_inum, NULL, 0)) goto failed_mount; sb->s_flags |= MS_ONE_SECOND; @@ -1447,13 +1457,6 @@ static int ext3_fill_super (struct super printk (KERN_ERR "EXT3-fs: not enough memory\n"); goto failed_mount; } - sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(u8), - GFP_KERNEL); - if (!sbi->s_debts) { - printk("EXT3-fs: not enough memory to allocate s_bgi\n"); - goto failed_mount2; - } - memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(u8)); percpu_counter_init(&sbi->s_freeblocks_counter); percpu_counter_init(&sbi->s_freeinodes_counter); @@ -1604,7 +1607,6 @@ static int ext3_fill_super (struct super failed_mount3: journal_destroy(sbi->s_journal); failed_mount2: - kfree(sbi->s_debts); for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); @@ -2049,11 +2051,12 @@ int ext3_remount (struct super_block * s struct ext3_super_block * es; struct ext3_sb_info *sbi = EXT3_SB(sb); unsigned long tmp; + unsigned long n_blocks_count = 0; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, &tmp, 1)) + if (!parse_options(data, sb, &tmp, &n_blocks_count, 1)) return -EINVAL; if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) @@ -2066,7 +2069,8 @@ int ext3_remount (struct super_block * s ext3_init_journal_params(sb, sbi->s_journal); - if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || + n_blocks_count > le32_to_cpu(es->s_blocks_count)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) return -EROFS; @@ -2105,6 +2109,8 @@ int ext3_remount (struct super_block * s */ ext3_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); + if ((ret = ext3_group_extend(sb, es, n_blocks_count))) + return ret; if (!ext3_setup_super (sb, es, 0)) sb->s_flags &= ~MS_RDONLY; } @@ -2121,6 +2127,10 @@ int ext3_statfs (struct super_block * sb if (test_opt (sb, MINIX_DF)) overhead = 0; else { + unsigned long ngroups; + ngroups = EXT3_SB(sb)->s_groups_count; + smp_rmb(); + /* * Compute the overhead (FS structures) */ @@ -2136,7 +2146,7 @@ int ext3_statfs (struct super_block * sb * block group descriptors. If the sparse superblocks * feature is turned on, then not all groups have this. */ - for (i = 0; i < EXT3_SB(sb)->s_groups_count; i++) + for (i = 0; i < ngroups; i++) overhead += ext3_bg_has_super(sb, i) + ext3_bg_num_gdb(sb, i); @@ -2144,8 +2154,7 @@ int ext3_statfs (struct super_block * sb * Every block group has an inode bitmap, a block * bitmap, and an inode table. */ - overhead += (EXT3_SB(sb)->s_groups_count * - (2 + EXT3_SB(sb)->s_itb_per_group)); + overhead += (ngroups * (2 + EXT3_SB(sb)->s_itb_per_group)); } buf->f_type = EXT3_SUPER_MAGIC; diff -puN include/linux/ext3_fs.h~ext3-online-resize-patch include/linux/ext3_fs.h --- 25/include/linux/ext3_fs.h~ext3-online-resize-patch 2004-10-20 01:56:57.520414208 -0700 +++ 25-akpm/include/linux/ext3_fs.h 2004-10-20 02:40:17.204202200 -0700 @@ -196,6 +196,31 @@ struct ext3_group_desc #define EXT3_STATE_JDATA 0x00000001 /* journaled data exists */ #define EXT3_STATE_NEW 0x00000002 /* inode is newly created */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext3_new_group_input { + __u32 group; /* Group number for this data */ + __u32 block_bitmap; /* Absolute block number of block bitmap */ + __u32 inode_bitmap; /* Absolute block number of inode bitmap */ + __u32 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext3_new_group_input in kernel space, with free_blocks_count */ +struct ext3_new_group_data { + __u32 group; + __u32 block_bitmap; + __u32 inode_bitmap; + __u32 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + + /* * ioctl commands */ @@ -203,6 +228,8 @@ struct ext3_group_desc #define EXT3_IOC_SETFLAGS _IOW('f', 2, long) #define EXT3_IOC_GETVERSION _IOR('f', 3, long) #define EXT3_IOC_SETVERSION _IOW('f', 4, long) +#define EXT3_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT3_IOC_GROUP_ADD _IOW('f', 8,struct ext3_new_group_input) #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long) #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long) #ifdef CONFIG_JBD_DEBUG @@ -421,7 +448,7 @@ struct ext3_super_block { */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ - __u16 s_padding1; + __u16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. */ @@ -687,6 +714,8 @@ extern unsigned long ext3_bg_num_gdb(str extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); extern void ext3_free_blocks (handle_t *, struct inode *, unsigned long, unsigned long); +extern void ext3_free_blocks_sb (handle_t *, struct super_block *, + unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, @@ -749,6 +778,13 @@ extern int ext3_orphan_del(handle_t *, s extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); +/* resize.c */ +extern int ext3_group_add(struct super_block *sb, + struct ext3_new_group_data *input); +extern int ext3_group_extend(struct super_block *sb, + struct ext3_super_block *es, + unsigned long n_blocks_count); + /* super.c */ extern void ext3_error (struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); diff -puN include/linux/ext3_jbd.h~ext3-online-resize-patch include/linux/ext3_jbd.h --- 25/include/linux/ext3_jbd.h~ext3-online-resize-patch 2004-10-20 02:40:05.203026656 -0700 +++ 25-akpm/include/linux/ext3_jbd.h 2004-10-20 02:40:05.209025744 -0700 @@ -188,9 +188,14 @@ __ext3_journal_dirty_metadata(const char #define ext3_journal_dirty_metadata(handle, bh) \ __ext3_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) -handle_t *ext3_journal_start(struct inode *inode, int nblocks); +handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks); int __ext3_journal_stop(const char *where, handle_t *handle); +static inline handle_t *ext3_journal_start(struct inode *inode, int nblocks) +{ + return ext3_journal_start_sb(inode->i_sb, nblocks); +} + #define ext3_journal_stop(handle) \ __ext3_journal_stop(__FUNCTION__, (handle)) diff -puN fs/ext3/ialloc.c~ext3-online-resize-patch fs/ext3/ialloc.c --- 25/fs/ext3/ialloc.c~ext3-online-resize-patch 2004-10-20 02:40:13.152818104 -0700 +++ 25-akpm/fs/ext3/ialloc.c 2004-10-20 02:40:13.201810656 -0700 @@ -320,8 +320,6 @@ static int find_group_orlov(struct super desc = ext3_get_group_desc (sb, group, &bh); if (!desc || !desc->bg_free_inodes_count) continue; - if (sbi->s_debts[group] >= max_debt) - continue; if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) continue; if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) diff -puN include/linux/ext3_fs_sb.h~ext3-online-resize-patch include/linux/ext3_fs_sb.h --- 25/include/linux/ext3_fs_sb.h~ext3-online-resize-patch 2004-10-20 02:40:13.199810960 -0700 +++ 25-akpm/include/linux/ext3_fs_sb.h 2004-10-20 02:40:13.204810200 -0700 @@ -54,7 +54,6 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; - u8 *s_debts; struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; _