X-Git-Url: http://pilppa.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=drivers%2Fmd%2Fraid5.c;h=8d59914f2057a6d0a06527dd8b3c872ed3c1b842;hb=07e44708059010aa26c6a4c8ee6ff11743d04d4e;hp=be008f034ada5e222c18043cf0b766426c86f03c;hpb=6bfe5c9d6f4dcaa998f67e691359cf7b1c4b443d;p=linux-2.6-omap-h63xx.git diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index be008f034ad..8d59914f205 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -405,6 +405,8 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ + md_allow_write(conf->mddev); + /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), @@ -1048,7 +1050,7 @@ static void compute_parity5(struct stripe_head *sh, int method) static void compute_parity6(struct stripe_head *sh, int method) { raid6_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; + int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count; struct bio *chosen; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ void *ptrs[disks]; @@ -1129,8 +1131,7 @@ static void compute_parity6(struct stripe_head *sh, int method) /* Compute one missing block */ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) { - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; void *ptr[MAX_XOR_BLOCKS], *p; int pd_idx = sh->pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); @@ -1168,8 +1169,7 @@ static void compute_block_1(struct stripe_head *sh, int dd_idx, int nozero) /* Compute two missing blocks */ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) { - raid6_conf_t *conf = sh->raid_conf; - int i, count, disks = conf->raid_disks; + int i, count, disks = sh->disks; int pd_idx = sh->pd_idx; int qd_idx = raid6_next_disk(pd_idx, disks); int d0_idx = raid6_next_disk(qd_idx, disks); @@ -1885,11 +1885,11 @@ static void handle_stripe5(struct stripe_head *sh) static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) { raid6_conf_t *conf = sh->raid_conf; - int disks = conf->raid_disks; + int disks = sh->disks; struct bio *return_bi= NULL; struct bio *bi; int i; - int syncing; + int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; int non_overwrite = 0; int failed_num[2] = {0, 0}; @@ -1907,6 +1907,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) clear_bit(STRIPE_DELAYED, &sh->state); syncing = test_bit(STRIPE_SYNCING, &sh->state); + expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); + expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); /* Now to look around and see what can be done */ rcu_read_lock(); @@ -2112,13 +2114,15 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * parity, or to satisfy requests * or to load a block that is being partially written. */ - if (to_read || non_overwrite || (to_write && failed) || (syncing && (uptodate < disks))) { + if (to_read || non_overwrite || (to_write && failed) || + (syncing && (uptodate < disks)) || expanding) { for (i=disks; i--;) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || syncing || + expanding || (failed >= 1 && (sh->dev[failed_num[0]].toread || to_write)) || (failed >= 2 && (sh->dev[failed_num[1]].toread || to_write)) ) @@ -2353,6 +2357,79 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } } } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Need to write out all blocks after computing P&Q */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, + conf->raid_disks); + compute_parity6(sh, RECONSTRUCT_WRITE); + for (i = conf->raid_disks ; i-- ; ) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + clear_bit(STRIPE_EXPANDING, &sh->state); + } else if (expanded) { + clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); + wake_up(&conf->wait_for_overlap); + md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + } + + if (expanding && locked == 0) { + /* We have read all the blocks in this stripe and now we need to + * copy some of them into a target stripe for expand. + */ + clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); + for (i = 0; i < sh->disks ; i++) + if (i != pd_idx && i != qd_idx) { + int dd_idx2, pd_idx2, j; + struct stripe_head *sh2; + + sector_t bn = compute_blocknr(sh, i); + sector_t s = raid5_compute_sector( + bn, conf->raid_disks, + conf->raid_disks - conf->max_degraded, + &dd_idx2, &pd_idx2, conf); + sh2 = get_active_stripe(conf, s, + conf->raid_disks, + pd_idx2, 1); + if (sh2 == NULL) + /* so for only the early blocks of + * this stripe have been requests. + * When later blocks get requests, we + * will try again + */ + continue; + if (!test_bit(STRIPE_EXPANDING, &sh2->state) || + test_bit(R5_Expanded, + &sh2->dev[dd_idx2].flags)) { + /* must have already done this block */ + release_stripe(sh2); + continue; + } + memcpy(page_address(sh2->dev[dd_idx2].page), + page_address(sh->dev[i].page), + STRIPE_SIZE); + set_bit(R5_Expanded, &sh2->dev[dd_idx2].flags); + set_bit(R5_UPTODATE, &sh2->dev[dd_idx2].flags); + for (j = 0 ; j < conf->raid_disks ; j++) + if (j != sh2->pd_idx && + j != raid6_next_disk(sh2->pd_idx, + sh2->disks) && + !test_bit(R5_Expanded, + &sh2->dev[j].flags)) + break; + if (j == conf->raid_disks) { + set_bit(STRIPE_EXPAND_READY, + &sh2->state); + set_bit(STRIPE_HANDLE, &sh2->state); + } + release_stripe(sh2); + } + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -2393,7 +2470,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) rcu_read_unlock(); if (rdev) { - if (syncing) + if (syncing || expanding || expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); bi->bi_bdev = rdev->bdev; @@ -2618,7 +2695,7 @@ static struct bio *remove_bio_from_retry(raid5_conf_t *conf) } bi = conf->retry_read_aligned_list; if(bi) { - conf->retry_read_aligned = bi->bi_next; + conf->retry_read_aligned_list = bi->bi_next; bi->bi_next = NULL; bi->bi_phys_segments = 1; /* biased count of active stripes */ bi->bi_hw_segments = 0; /* count of processed stripes */ @@ -2667,6 +2744,27 @@ static int raid5_align_endio(struct bio *bi, unsigned int bytes, int error) return 0; } +static int bio_fits_rdev(struct bio *bi) +{ + request_queue_t *q = bdev_get_queue(bi->bi_bdev); + + if ((bi->bi_size>>9) > q->max_sectors) + return 0; + blk_recount_segments(q, bi); + if (bi->bi_phys_segments > q->max_phys_segments || + bi->bi_hw_segments > q->max_hw_segments) + return 0; + + if (q->merge_bvec_fn) + /* it's too hard to apply the merge_bvec_fn at this stage, + * just just give up + */ + return 0; + + return 1; +} + + static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) { mddev_t *mddev = q->queuedata; @@ -2678,7 +2776,7 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) mdk_rdev_t *rdev; if (!in_chunk_boundary(mddev, raid_bio)) { - printk("chunk_aligned_read : non aligned\n"); + PRINTK("chunk_aligned_read : non aligned\n"); return 0; } /* @@ -2713,6 +2811,13 @@ static int chunk_aligned_read(request_queue_t *q, struct bio * raid_bio) align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); align_bi->bi_sector += rdev->data_offset; + if (!bio_fits_rdev(align_bi)) { + /* too big in some way */ + bio_put(align_bi); + rdev_dec_pending(rdev, mddev); + return 0; + } + spin_lock_irq(&conf->device_lock); wait_event_lock_irq(conf->wait_for_stripe, conf->quiesce == 0, @@ -2885,8 +2990,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped struct stripe_head *sh; int pd_idx; sector_t first_sector, last_sector; - int raid_disks; - int data_disks; + int raid_disks = conf->previous_raid_disks; + int data_disks = raid_disks - conf->max_degraded; + int new_data_disks = conf->raid_disks - conf->max_degraded; int i; int dd_idx; sector_t writepos, safepos, gap; @@ -2895,7 +3001,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->expand_progress != 0) { /* restarting in the middle, skip the initial sectors */ sector_nr = conf->expand_progress; - sector_div(sector_nr, conf->raid_disks-1); + sector_div(sector_nr, new_data_disks); *skipped = 1; return sector_nr; } @@ -2909,14 +3015,14 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped * to after where expand_lo old_maps to */ writepos = conf->expand_progress + - conf->chunk_size/512*(conf->raid_disks-1); - sector_div(writepos, conf->raid_disks-1); + conf->chunk_size/512*(new_data_disks); + sector_div(writepos, new_data_disks); safepos = conf->expand_lo; - sector_div(safepos, conf->previous_raid_disks-1); + sector_div(safepos, data_disks); gap = conf->expand_progress - conf->expand_lo; if (writepos >= safepos || - gap > (conf->raid_disks-1)*3000*2 /*3Meg*/) { + gap > (new_data_disks)*3000*2 /*3Meg*/) { /* Cannot proceed until we've updated the superblock... */ wait_event(conf->wait_for_overlap, atomic_read(&conf->reshape_stripes)==0); @@ -2946,6 +3052,9 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped sector_t s; if (j == sh->pd_idx) continue; + if (conf->level == 6 && + j == raid6_next_disk(sh->pd_idx, sh->disks)) + continue; s = compute_blocknr(sh, j); if (s < (mddev->array_size<<1)) { skipped = 1; @@ -2962,28 +3071,27 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped release_stripe(sh); } spin_lock_irq(&conf->device_lock); - conf->expand_progress = (sector_nr + i)*(conf->raid_disks-1); + conf->expand_progress = (sector_nr + i) * new_data_disks; spin_unlock_irq(&conf->device_lock); /* Ok, those stripe are ready. We can start scheduling * reads on the source stripes. * The source stripes are determined by mapping the first and last * block on the destination stripes. */ - raid_disks = conf->previous_raid_disks; - data_disks = raid_disks - 1; first_sector = - raid5_compute_sector(sector_nr*(conf->raid_disks-1), + raid5_compute_sector(sector_nr*(new_data_disks), raid_disks, data_disks, &dd_idx, &pd_idx, conf); last_sector = raid5_compute_sector((sector_nr+conf->chunk_size/512) - *(conf->raid_disks-1) -1, + *(new_data_disks) -1, raid_disks, data_disks, &dd_idx, &pd_idx, conf); if (last_sector >= (mddev->size<<1)) last_sector = (mddev->size<<1)-1; while (first_sector <= last_sector) { - pd_idx = stripe_to_pdidx(first_sector, conf, conf->previous_raid_disks); + pd_idx = stripe_to_pdidx(first_sector, conf, + conf->previous_raid_disks); sh = get_active_stripe(conf, first_sector, conf->previous_raid_disks, pd_idx, 0); set_bit(STRIPE_EXPAND_SOURCE, &sh->state); @@ -3105,7 +3213,9 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, scnt++) { + logical_sector += STRIPE_SECTORS, + sector += STRIPE_SECTORS, + scnt++) { if (scnt < raid_bio->bi_hw_segments) /* already done this stripe */ @@ -3121,7 +3231,13 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) } set_bit(R5_ReadError, &sh->dev[dd_idx].flags); - add_stripe_bio(sh, raid_bio, dd_idx, 0); + if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { + release_stripe(sh); + raid_bio->bi_hw_segments = scnt; + conf->retry_read_aligned = raid_bio; + return handled; + } + handle_stripe(sh, NULL); release_stripe(sh); handled++; @@ -3250,6 +3366,7 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } + md_allow_write(mddev); while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; @@ -3309,35 +3426,44 @@ static int run(mddev_t *mddev) */ sector_t here_new, here_old; int old_disks; + int max_degraded = (mddev->level == 5 ? 1 : 2); if (mddev->new_level != mddev->level || mddev->new_layout != mddev->layout || mddev->new_chunk != mddev->chunk_size) { - printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + printk(KERN_ERR "raid5: %s: unsupported reshape " + "required - aborting.\n", mdname(mddev)); return -EINVAL; } if (mddev->delta_disks <= 0) { - printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + printk(KERN_ERR "raid5: %s: unsupported reshape " + "(reduce disks) required - aborting.\n", mdname(mddev)); return -EINVAL; } old_disks = mddev->raid_disks - mddev->delta_disks; /* reshape_position must be on a new-stripe boundary, and one - * further up in new geometry must map after here in old geometry. + * further up in new geometry must map after here in old + * geometry. */ here_new = mddev->reshape_position; - if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { - printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + if (sector_div(here_new, (mddev->chunk_size>>9)* + (mddev->raid_disks - max_degraded))) { + printk(KERN_ERR "raid5: reshape_position not " + "on a stripe boundary\n"); return -EINVAL; } /* here_new is the stripe we will write to */ here_old = mddev->reshape_position; - sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); - /* here_old is the first stripe that we might need to read from */ + sector_div(here_old, (mddev->chunk_size>>9)* + (old_disks-max_degraded)); + /* here_old is the first stripe that we might need to read + * from */ if (here_new >= here_old) { /* Reading from the same stripe as writing to - bad */ - printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + printk(KERN_ERR "raid5: reshape_position too early for " + "auto-recovery - aborting.\n"); return -EINVAL; } printk(KERN_INFO "raid5: reshape will continue\n"); @@ -3516,12 +3642,15 @@ static int run(mddev_t *mddev) } /* Ok, everything is just fine now */ - sysfs_create_group(&mddev->kobj, &raid5_attrs_group); + if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) + printk(KERN_WARNING + "raid5: failed to create sysfs attributes for %s\n", + mdname(mddev)); mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = raid5_congested; mddev->array_size = mddev->size * (conf->previous_raid_disks - conf->max_degraded); @@ -3552,6 +3681,7 @@ static int stop(mddev_t *mddev) mddev->thread = NULL; shrink_stripes(conf); kfree(conf->stripe_hashtbl); + mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); kfree(conf->disks); @@ -3775,6 +3905,8 @@ static int raid5_check_reshape(mddev_t *mddev) if (err) return err; + if (mddev->degraded > conf->max_degraded) + return -EINVAL; /* looks like we might be able to manage this */ return 0; } @@ -3788,8 +3920,7 @@ static int raid5_start_reshape(mddev_t *mddev) int added_devices = 0; unsigned long flags; - if (mddev->degraded || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; ITERATE_RDEV(mddev, rdev, rtmp) @@ -3797,7 +3928,7 @@ static int raid5_start_reshape(mddev_t *mddev) !test_bit(Faulty, &rdev->flags)) spares++; - if (spares < mddev->delta_disks-1) + if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) /* Not enough devices even to make a degraded array * of that size */ @@ -3823,7 +3954,12 @@ static int raid5_start_reshape(mddev_t *mddev) added_devices++; rdev->recovery_offset = 0; sprintf(nm, "rd%d", rdev->raid_disk); - sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); + if (sysfs_create_link(&mddev->kobj, + &rdev->kobj, nm)) + printk(KERN_WARNING + "raid5: failed to create " + " link %s for %s\n", + nm, mdname(mddev)); } else break; } @@ -3860,7 +3996,8 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + conf->mddev->array_size = conf->mddev->size * + (conf->raid_disks - conf->max_degraded); set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); conf->mddev->changed = 1; @@ -3933,6 +4070,10 @@ static struct mdk_personality raid6_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, +#endif .quiesce = raid5_quiesce, }; static struct mdk_personality raid5_personality = @@ -3972,6 +4113,10 @@ static struct mdk_personality raid4_personality = .spare_active = raid5_spare_active, .sync_request = sync_request, .resize = raid5_resize, +#ifdef CONFIG_MD_RAID5_RESHAPE + .check_reshape = raid5_check_reshape, + .start_reshape = raid5_start_reshape, +#endif .quiesce = raid5_quiesce, };