pilppa.org Git - linux-2.6-omap-h63xx.git/blobdiff - drivers/md/md.c
Merge branch 'linus' into x86/apic
[linux-2.6-omap-h63xx.git] / drivers / md / md.c
index 1b1d32694f6fc8eddf7a8c4cccc6d836b5e88054..4495104f6c9f33297727891510e667db46ac29ac 100644 (file)
@@ -214,20 +214,33 @@ static inline mddev_t *mddev_get(mddev_t *mddev)
        return mddev;
 }
 
+static void mddev_delayed_delete(struct work_struct *ws)
+{
+       mddev_t *mddev = container_of(ws, mddev_t, del_work);
+       kobject_del(&mddev->kobj);
+       kobject_put(&mddev->kobj);
+}
+
 static void mddev_put(mddev_t *mddev)
 {
        if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
                return;
-       if (!mddev->raid_disks && list_empty(&mddev->disks)) {
+       if (!mddev->raid_disks && list_empty(&mddev->disks) &&
+           !mddev->hold_active) {
                list_del(&mddev->all_mddevs);
-               spin_unlock(&all_mddevs_lock);
-               blk_cleanup_queue(mddev->queue);
-               if (mddev->sysfs_state)
-                       sysfs_put(mddev->sysfs_state);
-               mddev->sysfs_state = NULL;
-               kobject_put(&mddev->kobj);
-       } else
-               spin_unlock(&all_mddevs_lock);
+               if (mddev->gendisk) {
+                       /* we did a probe so need to clean up.
+                        * Call schedule_work inside the spinlock
+                        * so that flush_scheduled_work() after
+                        * mddev_find will succeed in waiting for the
+                        * work to be done.
+                        */
+                       INIT_WORK(&mddev->del_work, mddev_delayed_delete);
+                       schedule_work(&mddev->del_work);
+               } else
+                       kfree(mddev);
+       }
+       spin_unlock(&all_mddevs_lock);
 }
 
 static mddev_t * mddev_find(dev_t unit)
@@ -236,15 +249,50 @@ static mddev_t * mddev_find(dev_t unit)
 
  retry:
        spin_lock(&all_mddevs_lock);
-       list_for_each_entry(mddev, &all_mddevs, all_mddevs)
-               if (mddev->unit == unit) {
-                       mddev_get(mddev);
+
+       if (unit) {
+               list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+                       if (mddev->unit == unit) {
+                               mddev_get(mddev);
+                               spin_unlock(&all_mddevs_lock);
+                               kfree(new);
+                               return mddev;
+                       }
+
+               if (new) {
+                       list_add(&new->all_mddevs, &all_mddevs);
                        spin_unlock(&all_mddevs_lock);
-                       kfree(new);
-                       return mddev;
+                       new->hold_active = UNTIL_IOCTL;
+                       return new;
                }
-
-       if (new) {
+       } else if (new) {
+               /* find an unused unit number */
+               static int next_minor = 512;
+               int start = next_minor;
+               int is_free = 0;
+               int dev = 0;
+               while (!is_free) {
+                       dev = MKDEV(MD_MAJOR, next_minor);
+                       next_minor++;
+                       if (next_minor > MINORMASK)
+                               next_minor = 0;
+                       if (next_minor == start) {
+                               /* Oh dear, all in use. */
+                               spin_unlock(&all_mddevs_lock);
+                               kfree(new);
+                               return NULL;
+                       }
+                               
+                       is_free = 1;
+                       list_for_each_entry(mddev, &all_mddevs, all_mddevs)
+                               if (mddev->unit == dev) {
+                                       is_free = 0;
+                                       break;
+                               }
+               }
+               new->unit = dev;
+               new->md_minor = MINOR(dev);
+               new->hold_active = UNTIL_STOP;
                list_add(&new->all_mddevs, &all_mddevs);
                spin_unlock(&all_mddevs_lock);
                return new;
@@ -275,16 +323,6 @@ static mddev_t * mddev_find(dev_t unit)
        new->resync_max = MaxSector;
        new->level = LEVEL_NONE;
 
-       new->queue = blk_alloc_queue(GFP_KERNEL);
-       if (!new->queue) {
-               kfree(new);
-               return NULL;
-       }
-       /* Can be unlocked because the queue is new: no concurrency */
-       queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, new->queue);
-
-       blk_queue_make_request(new->queue, md_fail_request);
-
        goto retry;
 }
 
@@ -307,25 +345,23 @@ static inline void mddev_unlock(mddev_t * mddev)
 
 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
 {
-       mdk_rdev_t * rdev;
-       struct list_head *tmp;
+       mdk_rdev_t *rdev;
 
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->desc_nr == nr)
                        return rdev;
-       }
+
        return NULL;
 }
 
 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
 {
-       struct list_head *tmp;
        mdk_rdev_t *rdev;
 
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->bdev->bd_dev == dev)
                        return rdev;
-       }
+
        return NULL;
 }
 
@@ -861,7 +897,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        mdp_super_t *sb;
-       struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int next_spare = mddev->raid_disks;
 
@@ -933,7 +968,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
                sb->state |= (1<<MD_SB_BITMAP_PRESENT);
 
        sb->disks[0].state = (1<<MD_DISK_REMOVED);
-       rdev_for_each(rdev2, tmp, mddev) {
+       list_for_each_entry(rdev2, &mddev->disks, same_set) {
                mdp_disk_t *d;
                int desc_nr;
                if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
@@ -1259,7 +1294,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 {
        struct mdp_superblock_1 *sb;
-       struct list_head *tmp;
        mdk_rdev_t *rdev2;
        int max_dev, i;
        /* make rdev->sb match mddev and rdev data. */
@@ -1307,7 +1341,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        }
 
        max_dev = 0;
-       rdev_for_each(rdev2, tmp, mddev)
+       list_for_each_entry(rdev2, &mddev->disks, same_set)
                if (rdev2->desc_nr+1 > max_dev)
                        max_dev = rdev2->desc_nr+1;
 
@@ -1316,7 +1350,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
        for (i=0; i<max_dev;i++)
                sb->dev_roles[i] = cpu_to_le16(0xfffe);
        
-       rdev_for_each(rdev2, tmp, mddev) {
+       list_for_each_entry(rdev2, &mddev->disks, same_set) {
                i = rdev2->desc_nr;
                if (test_bit(Faulty, &rdev2->flags))
                        sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1447,6 +1481,11 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
                if (find_rdev_nr(mddev, rdev->desc_nr))
                        return -EBUSY;
        }
+       if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
+               printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
+                      mdname(mddev), mddev->max_disks);
+               return -EBUSY;
+       }
        bdevname(rdev->bdev,b);
        while ( (s=strchr(b, '/')) != NULL)
                *s = '!';
@@ -1466,6 +1505,9 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
 
        list_add_rcu(&rdev->same_set, &mddev->disks);
        bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+
+       /* May as well allow recovery to be retried once */
+       mddev->recovery_disabled = 0;
        return 0;
 
  fail:
@@ -1571,8 +1613,7 @@ static void kick_rdev_from_array(mdk_rdev_t * rdev)
 
 static void export_array(mddev_t *mddev)
 {
-       struct list_head *tmp;
-       mdk_rdev_t *rdev;
+       mdk_rdev_t *rdev, *tmp;
 
        rdev_for_each(rdev, tmp, mddev) {
                if (!rdev->mddev) {
@@ -1593,7 +1634,7 @@ static void print_desc(mdp_disk_t *desc)
                desc->major,desc->minor,desc->raid_disk,desc->state);
 }
 
-static void print_sb(mdp_super_t *sb)
+static void print_sb_90(mdp_super_t *sb)
 {
        int i;
 
@@ -1624,10 +1665,57 @@ static void print_sb(mdp_super_t *sb)
        }
        printk(KERN_INFO "md:     THIS: ");
        print_desc(&sb->this_disk);
-
 }
 
-static void print_rdev(mdk_rdev_t *rdev)
+static void print_sb_1(struct mdp_superblock_1 *sb)
+{
+       __u8 *uuid;
+
+       uuid = sb->set_uuid;
+       printk(KERN_INFO "md:  SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
+                       ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
+              KERN_INFO "md:    Name: \"%s\" CT:%llu\n",
+               le32_to_cpu(sb->major_version),
+               le32_to_cpu(sb->feature_map),
+               uuid[0], uuid[1], uuid[2], uuid[3],
+               uuid[4], uuid[5], uuid[6], uuid[7],
+               uuid[8], uuid[9], uuid[10], uuid[11],
+               uuid[12], uuid[13], uuid[14], uuid[15],
+               sb->set_name,
+               (unsigned long long)le64_to_cpu(sb->ctime)
+                      & MD_SUPERBLOCK_1_TIME_SEC_MASK);
+
+       uuid = sb->device_uuid;
+       printk(KERN_INFO "md:       L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
+                       " RO:%llu\n"
+              KERN_INFO "md:     Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
+                       ":%02x%02x%02x%02x%02x%02x\n"
+              KERN_INFO "md:       (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
+              KERN_INFO "md:         (MaxDev:%u) \n",
+               le32_to_cpu(sb->level),
+               (unsigned long long)le64_to_cpu(sb->size),
+               le32_to_cpu(sb->raid_disks),
+               le32_to_cpu(sb->layout),
+               le32_to_cpu(sb->chunksize),
+               (unsigned long long)le64_to_cpu(sb->data_offset),
+               (unsigned long long)le64_to_cpu(sb->data_size),
+               (unsigned long long)le64_to_cpu(sb->super_offset),
+               (unsigned long long)le64_to_cpu(sb->recovery_offset),
+               le32_to_cpu(sb->dev_number),
+               uuid[0], uuid[1], uuid[2], uuid[3],
+               uuid[4], uuid[5], uuid[6], uuid[7],
+               uuid[8], uuid[9], uuid[10], uuid[11],
+               uuid[12], uuid[13], uuid[14], uuid[15],
+               sb->devflags,
+               (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
+               (unsigned long long)le64_to_cpu(sb->events),
+               (unsigned long long)le64_to_cpu(sb->resync_offset),
+               le32_to_cpu(sb->sb_csum),
+               le32_to_cpu(sb->max_dev)
+               );
+}
+
+static void print_rdev(mdk_rdev_t *rdev, int major_version)
 {
        char b[BDEVNAME_SIZE];
        printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
@@ -1635,15 +1723,22 @@ static void print_rdev(mdk_rdev_t *rdev)
                test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
                rdev->desc_nr);
        if (rdev->sb_loaded) {
-               printk(KERN_INFO "md: rdev superblock:\n");
-               print_sb((mdp_super_t*)page_address(rdev->sb_page));
+               printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
+               switch (major_version) {
+               case 0:
+                       print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
+                       break;
+               case 1:
+                       print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
+                       break;
+               }
        } else
                printk(KERN_INFO "md: no rdev superblock!\n");
 }
 
 static void md_print_devices(void)
 {
-       struct list_head *tmp, *tmp2;
+       struct list_head *tmp;
        mdk_rdev_t *rdev;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];
@@ -1658,12 +1753,12 @@ static void md_print_devices(void)
                        bitmap_print_sb(mddev->bitmap);
                else
                        printk("%s: ", mdname(mddev));
-               rdev_for_each(rdev, tmp2, mddev)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
                        printk("<%s>", bdevname(rdev->bdev,b));
                printk("\n");
 
-               rdev_for_each(rdev, tmp2, mddev)
-                       print_rdev(rdev);
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       print_rdev(rdev, mddev->major_version);
        }
        printk("md:     **********************************\n");
        printk("\n");
@@ -1679,9 +1774,8 @@ static void sync_sbs(mddev_t * mddev, int nospares)
         * with the rest of the array)
         */
        mdk_rdev_t *rdev;
-       struct list_head *tmp;
 
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->sb_events == mddev->events ||
                    (nospares &&
                     rdev->raid_disk < 0 &&
@@ -1699,7 +1793,6 @@ static void sync_sbs(mddev_t * mddev, int nospares)
 
 static void md_update_sb(mddev_t * mddev, int force_change)
 {
-       struct list_head *tmp;
        mdk_rdev_t *rdev;
        int sync_req;
        int nospares = 0;
@@ -1790,7 +1883,7 @@ repeat:
                mdname(mddev),mddev->in_sync);
 
        bitmap_update_sb(mddev->bitmap);
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                char b[BDEVNAME_SIZE];
                dprintk(KERN_INFO "md: ");
                if (rdev->sb_loaded != 1)
@@ -1999,7 +2092,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                md_wakeup_thread(rdev->mddev->thread);
        } else if (rdev->mddev->pers) {
                mdk_rdev_t *rdev2;
-               struct list_head *tmp;
                /* Activating a spare .. or possibly reactivating
                 * if we every get bitmaps working here.
                 */
@@ -2010,7 +2102,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                if (rdev->mddev->pers->hot_add_disk == NULL)
                        return -EINVAL;
 
-               rdev_for_each(rdev2, tmp, rdev->mddev)
+               list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
                        if (rdev2->raid_disk == slot)
                                return -EEXIST;
 
@@ -2125,14 +2217,14 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
                 */
                mddev_t *mddev;
                int overlap = 0;
-               struct list_head *tmp, *tmp2;
+               struct list_head *tmp;
 
                mddev_unlock(my_mddev);
                for_each_mddev(mddev, tmp) {
                        mdk_rdev_t *rdev2;
 
                        mddev_lock(mddev);
-                       rdev_for_each(rdev2, tmp2, mddev)
+                       list_for_each_entry(rdev2, &mddev->disks, same_set)
                                if (test_bit(AllReserved, &rdev2->flags) ||
                                    (rdev->bdev == rdev2->bdev &&
                                     rdev != rdev2 &&
@@ -2328,8 +2420,7 @@ abort_free:
 static void analyze_sbs(mddev_t * mddev)
 {
        int i;
-       struct list_head *tmp;
-       mdk_rdev_t *rdev, *freshest;
+       mdk_rdev_t *rdev, *freshest, *tmp;
        char b[BDEVNAME_SIZE];
 
        freshest = NULL;
@@ -2355,6 +2446,15 @@ static void analyze_sbs(mddev_t * mddev)
 
        i = 0;
        rdev_for_each(rdev, tmp, mddev) {
+               if (rdev->desc_nr >= mddev->max_disks ||
+                   i > mddev->max_disks) {
+                       printk(KERN_WARNING
+                              "md: %s: %s: only %d devices permitted\n",
+                              mdname(mddev), bdevname(rdev->bdev, b),
+                              mddev->max_disks);
+                       kick_rdev_from_array(rdev);
+                       continue;
+               }
                if (rdev != freshest)
                        if (super_types[mddev->major_version].
                            validate_super(mddev, rdev)) {
@@ -3046,7 +3146,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
        }
        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
        md_wakeup_thread(mddev->thread);
-       sysfs_notify(&mddev->kobj, NULL, "sync_action");
+       sysfs_notify_dirent(mddev->sysfs_action);
        return len;
 }
 
@@ -3404,6 +3504,8 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;
        rv = mddev_lock(mddev);
+       if (mddev->hold_active == UNTIL_IOCTL)
+               mddev->hold_active = 0;
        if (!rv) {
                rv = entry->store(mddev, page, length);
                mddev_unlock(mddev);
@@ -3414,6 +3516,17 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 static void md_free(struct kobject *ko)
 {
        mddev_t *mddev = container_of(ko, mddev_t, kobj);
+
+       if (mddev->sysfs_state)
+               sysfs_put(mddev->sysfs_state);
+
+       if (mddev->gendisk) {
+               del_gendisk(mddev->gendisk);
+               put_disk(mddev->gendisk);
+       }
+       if (mddev->queue)
+               blk_cleanup_queue(mddev->queue);
+
        kfree(mddev);
 }
 
@@ -3429,34 +3542,74 @@ static struct kobj_type md_ktype = {
 
 int mdp_major = 0;
 
-static struct kobject *md_probe(dev_t dev, int *part, void *data)
+static int md_alloc(dev_t dev, char *name)
 {
        static DEFINE_MUTEX(disks_mutex);
        mddev_t *mddev = mddev_find(dev);
        struct gendisk *disk;
-       int partitioned = (MAJOR(dev) != MD_MAJOR);
-       int shift = partitioned ? MdpMinorShift : 0;
-       int unit = MINOR(dev) >> shift;
+       int partitioned;
+       int shift;
+       int unit;
        int error;
 
        if (!mddev)
-               return NULL;
+               return -ENODEV;
+
+       partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
+       shift = partitioned ? MdpMinorShift : 0;
+       unit = MINOR(mddev->unit) >> shift;
+
+       /* wait for any previous instance of this device
+        * to be completely removed (mddev_delayed_delete).
+        */
+       flush_scheduled_work();
 
        mutex_lock(&disks_mutex);
        if (mddev->gendisk) {
                mutex_unlock(&disks_mutex);
                mddev_put(mddev);
-               return NULL;
+               return -EEXIST;
+       }
+
+       if (name) {
+               /* Need to ensure that 'name' is not a duplicate.
+                */
+               mddev_t *mddev2;
+               spin_lock(&all_mddevs_lock);
+
+               list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
+                       if (mddev2->gendisk &&
+                           strcmp(mddev2->gendisk->disk_name, name) == 0) {
+                               spin_unlock(&all_mddevs_lock);
+                               return -EEXIST;
+                       }
+               spin_unlock(&all_mddevs_lock);
        }
+
+       mddev->queue = blk_alloc_queue(GFP_KERNEL);
+       if (!mddev->queue) {
+               mutex_unlock(&disks_mutex);
+               mddev_put(mddev);
+               return -ENOMEM;
+       }
+       /* Can be unlocked because the queue is new: no concurrency */
+       queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
+
+       blk_queue_make_request(mddev->queue, md_fail_request);
+
        disk = alloc_disk(1 << shift);
        if (!disk) {
                mutex_unlock(&disks_mutex);
+               blk_cleanup_queue(mddev->queue);
+               mddev->queue = NULL;
                mddev_put(mddev);
-               return NULL;
+               return -ENOMEM;
        }
-       disk->major = MAJOR(dev);
+       disk->major = MAJOR(mddev->unit);
        disk->first_minor = unit << shift;
-       if (partitioned)
+       if (name)
+               strcpy(disk->disk_name, name);
+       else if (partitioned)
                sprintf(disk->disk_name, "md_d%d", unit);
        else
                sprintf(disk->disk_name, "md%d", unit);
@@ -3464,7 +3617,7 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
        disk->private_data = mddev;
        disk->queue = mddev->queue;
        /* Allow extended partitions.  This makes the
-        * 'mdp' device redundant, but we can really
+        * 'mdp' device redundant, but we can't really
         * remove it now.
         */
        disk->flags |= GENHD_FL_EXT_DEVT;
@@ -3480,9 +3633,35 @@ static struct kobject *md_probe(dev_t dev, int *part, void *data)
                kobject_uevent(&mddev->kobj, KOBJ_ADD);
                mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
        }
+       mddev_put(mddev);
+       return 0;
+}
+
+static struct kobject *md_probe(dev_t dev, int *part, void *data)
+{
+       md_alloc(dev, NULL);
        return NULL;
 }
 
+static int add_named_array(const char *val, struct kernel_param *kp)
+{
+       /* val must be "md_*" where * is not all digits.
+        * We allocate an array with a large free minor number, and
+        * set the name to val.  val must not already be an active name.
+        */
+       int len = strlen(val);
+       char buf[DISK_NAME_LEN];
+
+       while (len && val[len-1] == '\n')
+               len--;
+       if (len >= DISK_NAME_LEN)
+               return -E2BIG;
+       strlcpy(buf, val, len+1);
+       if (strncmp(buf, "md_", 3) != 0)
+               return -EINVAL;
+       return md_alloc(0, buf);
+}
+
 static void md_safemode_timeout(unsigned long data)
 {
        mddev_t *mddev = (mddev_t *) data;
@@ -3501,7 +3680,6 @@ static int do_md_run(mddev_t * mddev)
 {
        int err;
        int chunk_size;
-       struct list_head *tmp;
        mdk_rdev_t *rdev;
        struct gendisk *disk;
        struct mdk_personality *pers;
@@ -3540,7 +3718,7 @@ static int do_md_run(mddev_t * mddev)
                }
 
                /* devices must have minimum size of one chunk */
-               rdev_for_each(rdev, tmp, mddev) {
+               list_for_each_entry(rdev, &mddev->disks, same_set) {
                        if (test_bit(Faulty, &rdev->flags))
                                continue;
                        if (rdev->size < chunk_size / 1024) {
@@ -3565,7 +3743,7 @@ static int do_md_run(mddev_t * mddev)
         * the only valid external interface is through the md
         * device.
         */
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (test_bit(Faulty, &rdev->flags))
                        continue;
                sync_blockdev(rdev->bdev);
@@ -3630,10 +3808,10 @@ static int do_md_run(mddev_t * mddev)
                 */
                char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
                mdk_rdev_t *rdev2;
-               struct list_head *tmp2;
                int warned = 0;
-               rdev_for_each(rdev, tmp, mddev) {
-                       rdev_for_each(rdev2, tmp2, mddev) {
+
+               list_for_each_entry(rdev, &mddev->disks, same_set)
+                       list_for_each_entry(rdev2, &mddev->disks, same_set) {
                                if (rdev < rdev2 &&
                                    rdev->bdev->bd_contains ==
                                    rdev2->bdev->bd_contains) {
@@ -3647,7 +3825,7 @@ static int do_md_run(mddev_t * mddev)
                                        warned = 1;
                                }
                        }
-               }
+
                if (warned)
                        printk(KERN_WARNING
                               "True protection against single-disk"
@@ -3684,6 +3862,7 @@ static int do_md_run(mddev_t * mddev)
                        printk(KERN_WARNING
                               "md: cannot register extra attributes for %s\n",
                               mdname(mddev));
+               mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
        } else if (mddev->ro == 2) /* auto-readonly not meaningful */
                mddev->ro = 0;
 
@@ -3694,7 +3873,7 @@ static int do_md_run(mddev_t * mddev)
        mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
        mddev->in_sync = 1;
 
-       rdev_for_each(rdev, tmp, mddev)
+       list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk >= 0) {
                        char nm[20];
                        sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3725,9 +3904,8 @@ static int do_md_run(mddev_t * mddev)
         * it will remove the drives and not do the right thing
         */
        if (mddev->degraded && !mddev->sync_thread) {
-               struct list_head *rtmp;
                int spares = 0;
-               rdev_for_each(rdev, rtmp, mddev)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(In_sync, &rdev->flags) &&
                            !test_bit(Faulty, &rdev->flags))
@@ -3754,7 +3932,8 @@ static int do_md_run(mddev_t * mddev)
        mddev->changed = 1;
        md_new_event(mddev);
        sysfs_notify_dirent(mddev->sysfs_state);
-       sysfs_notify(&mddev->kobj, NULL, "sync_action");
+       if (mddev->sysfs_action)
+               sysfs_notify_dirent(mddev->sysfs_action);
        sysfs_notify(&mddev->kobj, NULL, "degraded");
        kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
        return 0;
@@ -3854,9 +4033,12 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                        mddev->queue->merge_bvec_fn = NULL;
                        mddev->queue->unplug_fn = NULL;
                        mddev->queue->backing_dev_info.congested_fn = NULL;
-                       if (mddev->pers->sync_request)
+                       if (mddev->pers->sync_request) {
                                sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-
+                               if (mddev->sysfs_action)
+                                       sysfs_put(mddev->sysfs_action);
+                               mddev->sysfs_action = NULL;
+                       }
                        module_put(mddev->pers->owner);
                        mddev->pers = NULL;
                        /* tell userspace to handle 'inactive' */
@@ -3883,7 +4065,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
         */
        if (mode == 0) {
                mdk_rdev_t *rdev;
-               struct list_head *tmp;
 
                printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
 
@@ -3895,7 +4076,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                }
                mddev->bitmap_offset = 0;
 
-               rdev_for_each(rdev, tmp, mddev)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0) {
                                char nm[20];
                                sprintf(nm, "rd%d", rdev->raid_disk);
@@ -3941,6 +4122,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
                mddev->barriers_work = 0;
                mddev->safemode = 0;
                kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
+               if (mddev->hold_active == UNTIL_STOP)
+                       mddev->hold_active = 0;
 
        } else if (mddev->pers)
                printk(KERN_INFO "md: %s switched to read-only mode.\n",
@@ -3956,7 +4139,6 @@ out:
 static void autorun_array(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
-       struct list_head *tmp;
        int err;
 
        if (list_empty(&mddev->disks))
@@ -3964,7 +4146,7 @@ static void autorun_array(mddev_t *mddev)
 
        printk(KERN_INFO "md: running: ");
 
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                char b[BDEVNAME_SIZE];
                printk("<%s>", bdevname(rdev->bdev,b));
        }
@@ -3991,8 +4173,7 @@ static void autorun_array(mddev_t *mddev)
  */
 static void autorun_devices(int part)
 {
-       struct list_head *tmp;
-       mdk_rdev_t *rdev0, *rdev;
+       mdk_rdev_t *rdev0, *rdev, *tmp;
        mddev_t *mddev;
        char b[BDEVNAME_SIZE];
 
@@ -4007,7 +4188,7 @@ static void autorun_devices(int part)
                printk(KERN_INFO "md: considering %s ...\n",
                        bdevname(rdev0->bdev,b));
                INIT_LIST_HEAD(&candidates);
-               rdev_for_each_list(rdev, tmp, pending_raid_disks)
+               rdev_for_each_list(rdev, tmp, &pending_raid_disks)
                        if (super_90_load(rdev, rdev0, 0) >= 0) {
                                printk(KERN_INFO "md:  adding %s ...\n",
                                        bdevname(rdev->bdev,b));
@@ -4053,7 +4234,7 @@ static void autorun_devices(int part)
                } else {
                        printk(KERN_INFO "md: created %s\n", mdname(mddev));
                        mddev->persistent = 1;
-                       rdev_for_each_list(rdev, tmp, candidates) {
+                       rdev_for_each_list(rdev, tmp, &candidates) {
                                list_del_init(&rdev->same_set);
                                if (bind_rdev_to_array(rdev, mddev))
                                        export_rdev(rdev);
@@ -4064,7 +4245,7 @@ static void autorun_devices(int part)
                /* on success, candidates will be empty, on error
                 * it won't...
                 */
-               rdev_for_each_list(rdev, tmp, candidates) {
+               rdev_for_each_list(rdev, tmp, &candidates) {
                        list_del_init(&rdev->same_set);
                        export_rdev(rdev);
                }
@@ -4093,10 +4274,9 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
        mdu_array_info_t info;
        int nr,working,active,failed,spare;
        mdk_rdev_t *rdev;
-       struct list_head *tmp;
 
        nr=working=active=failed=spare=0;
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                nr++;
                if (test_bit(Faulty, &rdev->flags))
                        failed++;
@@ -4448,13 +4628,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
         * noticed in interrupt contexts ...
         */
 
-       if (rdev->desc_nr == mddev->max_disks) {
-               printk(KERN_WARNING "%s: can not hot-add to full array!\n",
-                       mdname(mddev));
-               err = -EBUSY;
-               goto abort_unbind_export;
-       }
-
        rdev->raid_disk = -1;
 
        md_update_sb(mddev, 1);
@@ -4468,9 +4641,6 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
        md_new_event(mddev);
        return 0;
 
-abort_unbind_export:
-       unbind_rdev_from_array(rdev);
-
 abort_export:
        export_rdev(rdev);
        return err;
@@ -4614,9 +4784,8 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 
 static int update_size(mddev_t *mddev, sector_t num_sectors)
 {
-       mdk_rdev_t * rdev;
+       mdk_rdev_t *rdev;
        int rv;
-       struct list_head *tmp;
        int fit = (num_sectors == 0);
 
        if (mddev->pers->resize == NULL)
@@ -4638,7 +4807,7 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
                 * grow, and re-add.
                 */
                return -EBUSY;
-       rdev_for_each(rdev, tmp, mddev) {
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
                sector_t avail;
                avail = rdev->size * 2;
 
@@ -5000,6 +5169,9 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 
 done_unlock:
 abort_unlock:
+       if (mddev->hold_active == UNTIL_IOCTL &&
+           err != -EINVAL)
+               mddev->hold_active = 0;
        mddev_unlock(mddev);
 
        return err;
@@ -5016,14 +5188,25 @@ static int md_open(struct block_device *bdev, fmode_t mode)
         * Succeed if we can lock the mddev, which confirms that
         * it isn't being stopped right now.
         */
-       mddev_t *mddev = bdev->bd_disk->private_data;
+       mddev_t *mddev = mddev_find(bdev->bd_dev);
        int err;
 
+       if (mddev->gendisk != bdev->bd_disk) {
+               /* we are racing with mddev_put which is discarding this
+                * bd_disk.
+                */
+               mddev_put(mddev);
+               /* Wait until bdev->bd_disk is definitely gone */
+               flush_scheduled_work();
+               /* Then retry the open from the top */
+               return -ERESTARTSYS;
+       }
+       BUG_ON(mddev != bdev->bd_disk->private_data);
+
        if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1)))
                goto out;
 
        err = 0;
-       mddev_get(mddev);
        atomic_inc(&mddev->openers);
        mddev_unlock(mddev);
 
@@ -5187,11 +5370,10 @@ static void status_unused(struct seq_file *seq)
 {
        int i = 0;
        mdk_rdev_t *rdev;
-       struct list_head *tmp;
 
        seq_printf(seq, "unused devices: ");
 
-       rdev_for_each_list(rdev, tmp, pending_raid_disks) {
+       list_for_each_entry(rdev, &pending_raid_disks, same_set) {
                char b[BDEVNAME_SIZE];
                i++;
                seq_printf(seq, "%s ",
@@ -5350,7 +5532,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
 {
        mddev_t *mddev = v;
        sector_t size;
-       struct list_head *tmp2;
        mdk_rdev_t *rdev;
        struct mdstat_info *mi = seq->private;
        struct bitmap *bitmap;
@@ -5387,7 +5568,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
                }
 
                size = 0;
-               rdev_for_each(rdev, tmp2, mddev) {
+               list_for_each_entry(rdev, &mddev->disks, same_set) {
                        char b[BDEVNAME_SIZE];
                        seq_printf(seq, " %s[%d]",
                                bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -5694,7 +5875,6 @@ void md_do_sync(mddev_t *mddev)
        struct list_head *tmp;
        sector_t last_check;
        int skipped = 0;
-       struct list_head *rtmp;
        mdk_rdev_t *rdev;
        char *desc;
 
@@ -5799,7 +5979,7 @@ void md_do_sync(mddev_t *mddev)
                /* recovery follows the physical size of devices */
                max_sectors = mddev->size << 1;
                j = MaxSector;
-               rdev_for_each(rdev, rtmp, mddev)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(Faulty, &rdev->flags) &&
                            !test_bit(In_sync, &rdev->flags) &&
@@ -5949,7 +6129,7 @@ void md_do_sync(mddev_t *mddev)
                } else {
                        if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
                                mddev->curr_resync = MaxSector;
-                       rdev_for_each(rdev, rtmp, mddev)
+                       list_for_each_entry(rdev, &mddev->disks, same_set)
                                if (rdev->raid_disk >= 0 &&
                                    !test_bit(Faulty, &rdev->flags) &&
                                    !test_bit(In_sync, &rdev->flags) &&
@@ -5985,10 +6165,9 @@ EXPORT_SYMBOL_GPL(md_do_sync);
 static int remove_and_add_spares(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
-       struct list_head *rtmp;
        int spares = 0;
 
-       rdev_for_each(rdev, rtmp, mddev)
+       list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk >= 0 &&
                    !test_bit(Blocked, &rdev->flags) &&
                    (test_bit(Faulty, &rdev->flags) ||
@@ -6003,8 +6182,8 @@ static int remove_and_add_spares(mddev_t *mddev)
                        }
                }
 
-       if (mddev->degraded && ! mddev->ro) {
-               rdev_for_each(rdev, rtmp, mddev) {
+       if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
+               list_for_each_entry(rdev, &mddev->disks, same_set) {
                        if (rdev->raid_disk >= 0 &&
                            !test_bit(In_sync, &rdev->flags) &&
                            !test_bit(Blocked, &rdev->flags))
@@ -6056,7 +6235,6 @@ static int remove_and_add_spares(mddev_t *mddev)
 void md_check_recovery(mddev_t *mddev)
 {
        mdk_rdev_t *rdev;
-       struct list_head *rtmp;
 
 
        if (mddev->bitmap)
@@ -6120,7 +6298,7 @@ void md_check_recovery(mddev_t *mddev)
                if (mddev->flags)
                        md_update_sb(mddev, 0);
 
-               rdev_for_each(rdev, rtmp, mddev)
+               list_for_each_entry(rdev, &mddev->disks, same_set)
                        if (test_and_clear_bit(StateChanged, &rdev->flags))
                                sysfs_notify_dirent(rdev->sysfs_state);
 
@@ -6149,13 +6327,13 @@ void md_check_recovery(mddev_t *mddev)
                         * information must be scrapped
                         */
                        if (!mddev->degraded)
-                               rdev_for_each(rdev, rtmp, mddev)
+                               list_for_each_entry(rdev, &mddev->disks, same_set)
                                        rdev->saved_raid_disk = -1;
 
                        mddev->recovery = 0;
                        /* flag recovery needed just to double check */
                        set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-                       sysfs_notify(&mddev->kobj, NULL, "sync_action");
+                       sysfs_notify_dirent(mddev->sysfs_action);
                        md_new_event(mddev);
                        goto unlock;
                }
@@ -6216,7 +6394,7 @@ void md_check_recovery(mddev_t *mddev)
                                mddev->recovery = 0;
                        } else
                                md_wakeup_thread(mddev->sync_thread);
-                       sysfs_notify(&mddev->kobj, NULL, "sync_action");
+                       sysfs_notify_dirent(mddev->sysfs_action);
                        md_new_event(mddev);
                }
        unlock:
@@ -6224,7 +6402,8 @@ void md_check_recovery(mddev_t *mddev)
                        clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                        if (test_and_clear_bit(MD_RECOVERY_RECOVER,
                                               &mddev->recovery))
-                               sysfs_notify(&mddev->kobj, NULL, "sync_action");
+                               if (mddev->sysfs_action)
+                                       sysfs_notify_dirent(mddev->sysfs_action);
                }
                mddev_unlock(mddev);
        }
@@ -6386,14 +6565,8 @@ static __exit void md_exit(void)
        unregister_sysctl_table(raid_table_header);
        remove_proc_entry("mdstat", NULL);
        for_each_mddev(mddev, tmp) {
-               struct gendisk *disk = mddev->gendisk;
-               if (!disk)
-                       continue;
                export_array(mddev);
-               del_gendisk(disk);
-               put_disk(disk);
-               mddev->gendisk = NULL;
-               mddev_put(mddev);
+               mddev->hold_active = 0;
        }
 }
 
@@ -6418,6 +6591,7 @@ static int set_ro(const char *val, struct kernel_param *kp)
 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
 
+module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
 
 EXPORT_SYMBOL(register_md_personality);
 EXPORT_SYMBOL(unregister_md_personality);