pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - drivers/md/raid1.c
Merge master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux-2.6-omap-h63xx.git] / drivers / md / raid1.c
index a8bc93d6ff63fcd578dee710c3513342efc0fe2a..a06ff91f27e2e6bb3af330e170faebdd79e6a9ce 100644 (file)
@@ -47,7 +47,6 @@
  */
 #define        NR_RAID1_BIOS 256
 
-static mdk_personality_t raid1_personality;
 
 static void unplug_slaves(mddev_t *mddev);
 
@@ -61,10 +60,8 @@ static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
        int size = offsetof(r1bio_t, bios[pi->raid_disks]);
 
        /* allocate a r1bio with room for raid_disks entries in the bios array */
-       r1_bio = kmalloc(size, gfp_flags);
-       if (r1_bio)
-               memset(r1_bio, 0, size);
-       else
+       r1_bio = kzalloc(size, gfp_flags);
+       if (!r1_bio)
                unplug_slaves(pi->mddev);
 
        return r1_bio;
@@ -106,15 +103,30 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
        }
        /*
         * Allocate RESYNC_PAGES data pages and attach them to
-        * the first bio;
+        * the first bio.
+        * If this is a user-requested check/repair, allocate
+        * RESYNC_PAGES for each bio.
         */
-       bio = r1_bio->bios[0];
-       for (i = 0; i < RESYNC_PAGES; i++) {
-               page = alloc_page(gfp_flags);
-               if (unlikely(!page))
-                       goto out_free_pages;
-
-               bio->bi_io_vec[i].bv_page = page;
+       if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
+               j = pi->raid_disks;
+       else
+               j = 1;
+       while(j--) {
+               bio = r1_bio->bios[j];
+               for (i = 0; i < RESYNC_PAGES; i++) {
+                       page = alloc_page(gfp_flags);
+                       if (unlikely(!page))
+                               goto out_free_pages;
+
+                       bio->bi_io_vec[i].bv_page = page;
+               }
+       }
+       /* If not user-requested, copy the page pointers to all bios */
+       if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
+               for (i=0; i<RESYNC_PAGES ; i++)
+                       for (j=1; j<pi->raid_disks; j++)
+                               r1_bio->bios[j]->bi_io_vec[i].bv_page =
+                                       r1_bio->bios[0]->bi_io_vec[i].bv_page;
        }
 
        r1_bio->master_bio = NULL;
@@ -122,8 +134,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
        return r1_bio;
 
 out_free_pages:
-       for ( ; i > 0 ; i--)
-               __free_page(bio->bi_io_vec[i-1].bv_page);
+       for (i=0; i < RESYNC_PAGES ; i++)
+               for (j=0 ; j < pi->raid_disks; j++)
+                       safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
+       j = -1;
 out_free_bio:
        while ( ++j < pi->raid_disks )
                bio_put(r1_bio->bios[j]);
@@ -134,14 +148,16 @@ out_free_bio:
 static void r1buf_pool_free(void *__r1_bio, void *data)
 {
        struct pool_info *pi = data;
-       int i;
+       int i,j;
        r1bio_t *r1bio = __r1_bio;
-       struct bio *bio = r1bio->bios[0];
 
-       for (i = 0; i < RESYNC_PAGES; i++) {
-               __free_page(bio->bi_io_vec[i].bv_page);
-               bio->bi_io_vec[i].bv_page = NULL;
-       }
+       for (i = 0; i < RESYNC_PAGES; i++)
+               for (j = pi->raid_disks; j-- ;) {
+                       if (j == 0 ||
+                           r1bio->bios[j]->bi_io_vec[i].bv_page !=
+                           r1bio->bios[0]->bi_io_vec[i].bv_page)
+                               safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
+               }
        for (i=0 ; i < pi->raid_disks; i++)
                bio_put(r1bio->bios[i]);
 
@@ -265,7 +281,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
                 * user-side. So if something waits for IO, then it will
                 * wait for the 'master' bio.
                 */
-               set_bit(R1BIO_Uptodate, &r1_bio->state);
+               if (uptodate)
+                       set_bit(R1BIO_Uptodate, &r1_bio->state);
 
                raid_end_bio_io(r1_bio);
        } else {
@@ -364,7 +381,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
                        /* free extra copy of the data pages */
                        int i = bio->bi_vcnt;
                        while (i--)
-                               __free_page(bio->bi_io_vec[i].bv_page);
+                               safe_put_page(bio->bi_io_vec[i].bv_page);
                }
                /* clear the bitmap if all writes complete successfully */
                bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
@@ -510,7 +527,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                        /* cannot risk returning a device that failed
                         * before we inc'ed nr_pending
                         */
-                       atomic_dec(&rdev->nr_pending);
+                       rdev_dec_pending(rdev, conf->mddev);
                        goto retry;
                }
                conf->next_seq_sect = this_sector + sectors;
@@ -691,13 +708,11 @@ static struct page **alloc_behind_pages(struct bio *bio)
 {
        int i;
        struct bio_vec *bvec;
-       struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *),
+       struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
                                        GFP_NOIO);
        if (unlikely(!pages))
                goto do_sync_io;
 
-       memset(pages, 0, bio->bi_vcnt * sizeof(struct page *));
-
        bio_for_each_segment(bvec, bio, i) {
                pages[i] = alloc_page(GFP_NOIO);
                if (unlikely(!pages[i]))
@@ -713,7 +728,7 @@ static struct page **alloc_behind_pages(struct bio *bio)
 do_sync_io:
        if (pages)
                for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
-                       __free_page(pages[i]);
+                       put_page(pages[i]);
        kfree(pages);
        PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
        return NULL;
@@ -815,7 +830,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
                    !test_bit(Faulty, &rdev->flags)) {
                        atomic_inc(&rdev->nr_pending);
                        if (test_bit(Faulty, &rdev->flags)) {
-                               atomic_dec(&rdev->nr_pending);
+                               rdev_dec_pending(rdev, mddev);
                                r1_bio->bios[i] = NULL;
                        } else
                                r1_bio->bios[i] = bio;
@@ -1077,13 +1092,16 @@ abort:
 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
 {
        r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
+       int i;
 
        if (bio->bi_size)
                return 1;
 
-       if (r1_bio->bios[r1_bio->read_disk] != bio)
-               BUG();
-       update_head_pos(r1_bio->read_disk, r1_bio);
+       for (i=r1_bio->mddev->raid_disks; i--; )
+               if (r1_bio->bios[i] == bio)
+                       break;
+       BUG_ON(i < 0);
+       update_head_pos(i, r1_bio);
        /*
         * we have read a block, now it needs to be re-written,
         * or re-read if the read failed.
@@ -1091,7 +1109,9 @@ static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
         */
        if (test_bit(BIO_UPTODATE, &bio->bi_flags))
                set_bit(R1BIO_Uptodate, &r1_bio->state);
-       reschedule_retry(r1_bio);
+
+       if (atomic_dec_and_test(&r1_bio->remaining))
+               reschedule_retry(r1_bio);
        return 0;
 }
 
@@ -1134,9 +1154,67 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
        bio = r1_bio->bios[r1_bio->read_disk];
 
 
-       /*
-        * schedule writes
-        */
+       if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+               /* We have read all readable devices.  If we haven't
+                * got the block, then there is no hope left.
+                * If we have, then we want to do a comparison
+                * and skip the write if everything is the same.
+                * If any blocks failed to read, then we need to
+                * attempt an over-write
+                */
+               int primary;
+               if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+                       for (i=0; i<mddev->raid_disks; i++)
+                               if (r1_bio->bios[i]->bi_end_io == end_sync_read)
+                                       md_error(mddev, conf->mirrors[i].rdev);
+
+                       md_done_sync(mddev, r1_bio->sectors, 1);
+                       put_buf(r1_bio);
+                       return;
+               }
+               for (primary=0; primary<mddev->raid_disks; primary++)
+                       if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+                           test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+                               r1_bio->bios[primary]->bi_end_io = NULL;
+                               rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+                               break;
+                       }
+               r1_bio->read_disk = primary;
+               for (i=0; i<mddev->raid_disks; i++)
+                       if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
+                           test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+                               int j;
+                               int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+                               struct bio *pbio = r1_bio->bios[primary];
+                               struct bio *sbio = r1_bio->bios[i];
+                               for (j = vcnt; j-- ; )
+                                       if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
+                                                  page_address(sbio->bi_io_vec[j].bv_page),
+                                                  PAGE_SIZE))
+                                               break;
+                               if (j >= 0)
+                                       mddev->resync_mismatches += r1_bio->sectors;
+                               if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+                                       sbio->bi_end_io = NULL;
+                                       rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+                               } else {
+                                       /* fixup the bio for reuse */
+                                       sbio->bi_vcnt = vcnt;
+                                       sbio->bi_size = r1_bio->sectors << 9;
+                                       sbio->bi_idx = 0;
+                                       sbio->bi_phys_segments = 0;
+                                       sbio->bi_hw_segments = 0;
+                                       sbio->bi_hw_front_size = 0;
+                                       sbio->bi_hw_back_size = 0;
+                                       sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+                                       sbio->bi_flags |= 1 << BIO_UPTODATE;
+                                       sbio->bi_next = NULL;
+                                       sbio->bi_sector = r1_bio->sector +
+                                               conf->mirrors[i].rdev->data_offset;
+                                       sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+                               }
+                       }
+       }
        if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
                /* ouch - failed to read all of that.
                 * Try some synchronous reads of other devices to get
@@ -1177,6 +1255,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                        } while (!success && d != r1_bio->read_disk);
 
                        if (success) {
+                               int start = d;
                                /* write it back and re-read */
                                set_bit(R1BIO_Uptodate, &r1_bio->state);
                                while (d != r1_bio->read_disk) {
@@ -1186,18 +1265,28 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                                        if (r1_bio->bios[d]->bi_end_io != end_sync_read)
                                                continue;
                                        rdev = conf->mirrors[d].rdev;
+                                       atomic_add(s, &rdev->corrected_errors);
                                        if (sync_page_io(rdev->bdev,
                                                         sect + rdev->data_offset,
                                                         s<<9,
                                                         bio->bi_io_vec[idx].bv_page,
-                                                        WRITE) == 0 ||
-                                           sync_page_io(rdev->bdev,
+                                                        WRITE) == 0)
+                                               md_error(mddev, rdev);
+                               }
+                               d = start;
+                               while (d != r1_bio->read_disk) {
+                                       if (d == 0)
+                                               d = conf->raid_disks;
+                                       d--;
+                                       if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+                                               continue;
+                                       rdev = conf->mirrors[d].rdev;
+                                       if (sync_page_io(rdev->bdev,
                                                         sect + rdev->data_offset,
                                                         s<<9,
                                                         bio->bi_io_vec[idx].bv_page,
-                                                        READ) == 0) {
+                                                        READ) == 0)
                                                md_error(mddev, rdev);
-                                       }
                                }
                        } else {
                                char b[BDEVNAME_SIZE];
@@ -1216,6 +1305,10 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
                        idx ++;
                }
        }
+
+       /*
+        * schedule writes
+        */
        atomic_set(&r1_bio->remaining, 1);
        for (i = 0; i < disks ; i++) {
                wbio = r1_bio->bios[i];
@@ -1365,22 +1458,35 @@ static void raid1d(mddev_t *mddev)
 
                                if (success) {
                                        /* write it back and re-read */
+                                       int start = d;
                                        while (d != r1_bio->read_disk) {
                                                if (d==0)
                                                        d = conf->raid_disks;
                                                d--;
                                                rdev = conf->mirrors[d].rdev;
+                                               atomic_add(s, &rdev->corrected_errors);
                                                if (rdev &&
                                                    test_bit(In_sync, &rdev->flags)) {
                                                        if (sync_page_io(rdev->bdev,
                                                                         sect + rdev->data_offset,
-                                                                        s<<9, conf->tmppage, WRITE) == 0 ||
-                                                           sync_page_io(rdev->bdev,
+                                                                        s<<9, conf->tmppage, WRITE) == 0)
+                                                               /* Well, this device is dead */
+                                                               md_error(mddev, rdev);
+                                               }
+                                       }
+                                       d = start;
+                                       while (d != r1_bio->read_disk) {
+                                               if (d==0)
+                                                       d = conf->raid_disks;
+                                               d--;
+                                               rdev = conf->mirrors[d].rdev;
+                                               if (rdev &&
+                                                   test_bit(In_sync, &rdev->flags)) {
+                                                       if (sync_page_io(rdev->bdev,
                                                                         sect + rdev->data_offset,
-                                                                        s<<9, conf->tmppage, READ) == 0) {
+                                                                        s<<9, conf->tmppage, READ) == 0)
                                                                /* Well, this device is dead */
                                                                md_error(mddev, rdev);
-                                                       }
                                                }
                                        }
                                } else {
@@ -1618,10 +1724,10 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                for (i=0 ; i < conf->raid_disks; i++) {
                        bio = r1_bio->bios[i];
                        if (bio->bi_end_io) {
-                               page = r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page;
+                               page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
                                if (bio_add_page(bio, page, len, 0) == 0) {
                                        /* stop here */
-                                       r1_bio->bios[0]->bi_io_vec[bio->bi_vcnt].bv_page = page;
+                                       bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
                                        while (i > 0) {
                                                i--;
                                                bio = r1_bio->bios[i];
@@ -1641,12 +1747,28 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
                sync_blocks -= (len>>9);
        } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
  bio_full:
-       bio = r1_bio->bios[r1_bio->read_disk];
        r1_bio->sectors = nr_sectors;
 
-       md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, nr_sectors);
+       /* For a user-requested sync, we read all readable devices and do a
+        * compare
+        */
+       if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+               atomic_set(&r1_bio->remaining, read_targets);
+               for (i=0; i<conf->raid_disks; i++) {
+                       bio = r1_bio->bios[i];
+                       if (bio->bi_end_io == end_sync_read) {
+                               md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
+                               generic_make_request(bio);
+                       }
+               }
+       } else {
+               atomic_set(&r1_bio->remaining, 1);
+               bio = r1_bio->bios[r1_bio->read_disk];
+               md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
+                            nr_sectors);
+               generic_make_request(bio);
 
-       generic_make_request(bio);
+       }
 
        return nr_sectors;
 }
@@ -1669,19 +1791,16 @@ static int run(mddev_t *mddev)
         * bookkeeping area. [whatever we allocate in run(),
         * should be freed in stop()]
         */
-       conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+       conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf)
                goto out_no_mem;
 
-       memset(conf, 0, sizeof(*conf));
-       conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks, 
+       conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
                                 GFP_KERNEL);
        if (!conf->mirrors)
                goto out_no_mem;
 
-       memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
-
        conf->tmppage = alloc_page(GFP_KERNEL);
        if (!conf->tmppage)
                goto out_no_mem;
@@ -1792,7 +1911,7 @@ out_free_conf:
                if (conf->r1bio_pool)
                        mempool_destroy(conf->r1bio_pool);
                kfree(conf->mirrors);
-               __free_page(conf->tmppage);
+               safe_put_page(conf->tmppage);
                kfree(conf->poolinfo);
                kfree(conf);
                mddev->private = NULL;
@@ -1891,13 +2010,12 @@ static int raid1_reshape(mddev_t *mddev, int raid_disks)
                kfree(newpoolinfo);
                return -ENOMEM;
        }
-       newmirrors = kmalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
+       newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
        if (!newmirrors) {
                kfree(newpoolinfo);
                mempool_destroy(newpool);
                return -ENOMEM;
        }
-       memset(newmirrors, 0, sizeof(struct mirror_info)*raid_disks);
 
        raise_barrier(conf);
 
@@ -1943,9 +2061,10 @@ static void raid1_quiesce(mddev_t *mddev, int state)
 }
 
 
-static mdk_personality_t raid1_personality =
+static struct mdk_personality raid1_personality =
 {
        .name           = "raid1",
+       .level          = 1,
        .owner          = THIS_MODULE,
        .make_request   = make_request,
        .run            = run,
@@ -1963,15 +2082,17 @@ static mdk_personality_t raid1_personality =
 
 static int __init raid_init(void)
 {
-       return register_md_personality(RAID1, &raid1_personality);
+       return register_md_personality(&raid1_personality);
 }
 
 static void raid_exit(void)
 {
-       unregister_md_personality(RAID1);
+       unregister_md_personality(&raid1_personality);
 }
 
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md-personality-3"); /* RAID1 */
+MODULE_ALIAS("md-raid1");
+MODULE_ALIAS("md-level-1");