pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - drivers/md/md.c
[PATCH] md: fix deadlock due to md thread processing delayed requests.
[linux-2.6-omap-h63xx.git] / drivers / md / md.c
index b02f8d1d77e72da31f8d55a990117c7205014d5e..7075bebb7f37015ce09ecc37d7aabd557ebfe82b 100644 (file)
@@ -224,8 +224,8 @@ static mddev_t * mddev_find(dev_t unit)
        INIT_LIST_HEAD(&new->all_mddevs);
        init_timer(&new->safemode_timer);
        atomic_set(&new->active, 1);
-       bio_list_init(&new->write_list);
        spin_lock_init(&new->write_lock);
+       init_waitqueue_head(&new->sb_wait);
 
        new->queue = blk_alloc_queue(GFP_KERNEL);
        if (!new->queue) {
@@ -577,6 +577,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
        mdp_disk_t *desc;
        mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
 
+       rdev->raid_disk = -1;
+       rdev->in_sync = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 0;
                mddev->minor_version = sb->minor_version;
@@ -607,16 +609,24 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
 
                mddev->max_disks = MD_SB_DISKS;
-       } else {
-               __u64 ev1;
-               ev1 = md_event(sb);
+       } else if (mddev->pers == NULL) {
+               /* Insist on good event counter while assembling */
+               __u64 ev1 = md_event(sb);
                ++ev1;
                if (ev1 < mddev->events) 
                        return -EINVAL;
-       }
+       } else if (mddev->bitmap) {
+               /* if adding to array with a bitmap, then we can accept an
+                * older device ... but not too old.
+                */
+               __u64 ev1 = md_event(sb);
+               if (ev1 < mddev->bitmap->events_cleared)
+                       return 0;
+       } else /* just a hot-add of a new device, leave raid_disk at -1 */
+               return 0;
+
        if (mddev->level != LEVEL_MULTIPATH) {
-               rdev->raid_disk = -1;
-               rdev->in_sync = rdev->faulty = 0;
+               rdev->faulty = 0;
                desc = sb->disks + rdev->desc_nr;
 
                if (desc->state & (1<<MD_DISK_FAULTY))
@@ -626,7 +636,8 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        rdev->in_sync = 1;
                        rdev->raid_disk = desc->raid_disk;
                }
-       }
+       } else /* MULTIPATH are always insync */
+               rdev->in_sync = 1;
        return 0;
 }
 
@@ -868,6 +879,8 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
 
+       rdev->raid_disk = -1;
+       rdev->in_sync = 0;
        if (mddev->raid_disks == 0) {
                mddev->major_version = 1;
                mddev->patch_version = 0;
@@ -885,13 +898,21 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                memcpy(mddev->uuid, sb->set_uuid, 16);
 
                mddev->max_disks =  (4096-256)/2;
-       } else {
-               __u64 ev1;
-               ev1 = le64_to_cpu(sb->events);
+       } else if (mddev->pers == NULL) {
+               /* Insist on good event counter while assembling */
+               __u64 ev1 = le64_to_cpu(sb->events);
                ++ev1;
                if (ev1 < mddev->events)
                        return -EINVAL;
-       }
+       } else if (mddev->bitmap) {
+               /* If adding to array with a bitmap, then we can accept an
+                * older device, but not too old.
+                */
+               __u64 ev1 = le64_to_cpu(sb->events);
+               if (ev1 < mddev->bitmap->events_cleared)
+                       return 0;
+       } else /* just a hot-add of a new device, leave raid_disk at -1 */
+               return 0;
 
        if (mddev->level != LEVEL_MULTIPATH) {
                int role;
@@ -899,14 +920,10 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
                switch(role) {
                case 0xffff: /* spare */
-                       rdev->in_sync = 0;
                        rdev->faulty = 0;
-                       rdev->raid_disk = -1;
                        break;
                case 0xfffe: /* faulty */
-                       rdev->in_sync = 0;
                        rdev->faulty = 1;
-                       rdev->raid_disk = -1;
                        break;
                default:
                        rdev->in_sync = 1;
@@ -914,7 +931,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
                        rdev->raid_disk = role;
                        break;
                }
-       }
+       } else /* MULTIPATH are always insync */
+               rdev->in_sync = 1;
+
        return 0;
 }
 
@@ -1288,6 +1307,7 @@ repeat:
        if (!mddev->persistent) {
                mddev->sb_dirty = 0;
                spin_unlock(&mddev->write_lock);
+               wake_up(&mddev->sb_wait);
                return;
        }
        spin_unlock(&mddev->write_lock);
@@ -1329,6 +1349,7 @@ repeat:
        }
        mddev->sb_dirty = 0;
        spin_unlock(&mddev->write_lock);
+       wake_up(&mddev->sb_wait);
 
 }
 
@@ -2155,6 +2176,18 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
                                PTR_ERR(rdev));
                        return PTR_ERR(rdev);
                }
+               /* set saved_raid_disk if appropriate */
+               if (!mddev->persistent) {
+                       if (info->state & (1<<MD_DISK_SYNC)  &&
+                           info->raid_disk < mddev->raid_disks)
+                               rdev->raid_disk = info->raid_disk;
+                       else
+                               rdev->raid_disk = -1;
+               } else
+                       super_types[mddev->major_version].
+                               validate_super(mddev, rdev);
+               rdev->saved_raid_disk = rdev->raid_disk;
+
                rdev->in_sync = 0; /* just to be sure */
                rdev->raid_disk = -1;
                err = bind_rdev_to_array(rdev, mddev);
@@ -3337,29 +3370,26 @@ void md_done_sync(mddev_t *mddev, int blocks, int ok)
 
 /* md_write_start(mddev, bi)
  * If we need to update some array metadata (e.g. 'active' flag
- * in superblock) before writing, queue bi for later writing
- * and return 0, else return 1 and it will be written now
+ * in superblock) before writing, schedule a superblock update
+ * and wait for it to complete.
  */
-int md_write_start(mddev_t *mddev, struct bio *bi)
+void md_write_start(mddev_t *mddev, struct bio *bi)
 {
+       DEFINE_WAIT(w);
        if (bio_data_dir(bi) != WRITE)
-               return 1;
+               return;
 
        atomic_inc(&mddev->writes_pending);
-       spin_lock(&mddev->write_lock);
-       if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
-               spin_unlock(&mddev->write_lock);
-               return 1;
-       }
-       bio_list_add(&mddev->write_list, bi);
-
        if (mddev->in_sync) {
-               mddev->in_sync = 0;
-               mddev->sb_dirty = 1;
+               spin_lock(&mddev->write_lock);
+               if (mddev->in_sync) {
+                       mddev->in_sync = 0;
+                       mddev->sb_dirty = 1;
+                       md_wakeup_thread(mddev->thread);
+               }
+               spin_unlock(&mddev->write_lock);
        }
-       spin_unlock(&mddev->write_lock);
-       md_wakeup_thread(mddev->thread);
-       return 0;
+       wait_event(mddev->sb_wait, mddev->sb_dirty==0);
 }
 
 void md_write_end(mddev_t *mddev)
@@ -3654,7 +3684,6 @@ void md_check_recovery(mddev_t *mddev)
                mddev->sb_dirty ||
                test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
                test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
-               mddev->write_list.head ||
                (mddev->safemode == 1) ||
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3663,7 +3692,6 @@ void md_check_recovery(mddev_t *mddev)
 
        if (mddev_trylock(mddev)==0) {
                int spares =0;
-               struct bio *blist;
 
                spin_lock(&mddev->write_lock);
                if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
@@ -3673,21 +3701,11 @@ void md_check_recovery(mddev_t *mddev)
                }
                if (mddev->safemode == 1)
                        mddev->safemode = 0;
-               blist = bio_list_get(&mddev->write_list);
                spin_unlock(&mddev->write_lock);
 
                if (mddev->sb_dirty)
                        md_update_sb(mddev);
 
-               while (blist) {
-                       struct bio *b = blist;
-                       blist = blist->bi_next;
-                       b->bi_next = NULL;
-                       generic_make_request(b);
-                       /* we already counted this, so need to un-count */
-                       md_write_end(mddev);
-               }
-
 
                if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
                    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -3706,6 +3724,14 @@ void md_check_recovery(mddev_t *mddev)
                                mddev->pers->spare_active(mddev);
                        }
                        md_update_sb(mddev);
+
+                       /* if array is no longer degraded, then any saved_raid_disk
+                        * information must be scrapped
+                        */
+                       if (!mddev->degraded)
+                               ITERATE_RDEV(mddev,rdev,rtmp)
+                                       rdev->saved_raid_disk = -1;
+
                        mddev->recovery = 0;
                        /* flag recovery needed just to double check */
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);