md: do not compute parity unless it is on a failed drive

[linux-2.6-omap-h63xx.git] / drivers / md / raid5.c
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c

index 968dacaced6de669181bba2d1e1902fbea3ffa8f..c37e256b117603f8f68aed3de3a40db9328e70f6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -94,6 +94,8 @@
  #define __inline__
  #endif
  
+#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
+
  #if !RAID6_USE_EMPTY_ZERO_PAGE
  /* In .bss so it's zeroed */
  const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
@@ -1143,10 +1145,12 @@ static void raid5_end_read_request(struct bio * bi, int error)
                 set_bit(R5_UPTODATE, &sh->dev[i].flags);
                 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
                         rdev = conf->disks[i].rdev;
-                       printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n",
-                              mdname(conf->mddev), STRIPE_SECTORS,
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdevname(rdev->bdev, b));
+                       printk_rl(KERN_INFO "raid5:%s: read error corrected"
+                                 " (%lu sectors at %llu on %s)\n",
+                                 mdname(conf->mddev), STRIPE_SECTORS,
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdevname(rdev->bdev, b));
                         clear_bit(R5_ReadError, &sh->dev[i].flags);
                         clear_bit(R5_ReWrite, &sh->dev[i].flags);
                 }
@@ -1160,16 +1164,22 @@ static void raid5_end_read_request(struct bio * bi, int error)
                 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
                 atomic_inc(&rdev->read_errors);
                 if (conf->mddev->degraded)
-                       printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error not correctable "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
                         /* Oh, no!!! */
-                       printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n",
-                              mdname(conf->mddev),
-                              (unsigned long long)(sh->sector + rdev->data_offset),
-                              bdn);
+                       printk_rl(KERN_WARNING
+                                 "raid5:%s: read error NOT corrected!! "
+                                 "(sector %llu on %s).\n",
+                                 mdname(conf->mddev),
+                                 (unsigned long long)(sh->sector
+                                                      + rdev->data_offset),
+                                 bdn);
                 else if (atomic_read(&rdev->read_errors)
                          > conf->max_nr_stripes)
                         printk(KERN_WARNING
@@ -1258,7 +1268,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
                         /*
                          * if recovery was running, make sure it aborts.
                          */
-                       set_bit(MD_RECOVERY_ERR, &mddev->recovery);
+                       set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                 }
                 set_bit(Faulty, &rdev->flags);
                 printk (KERN_ALERT
@@ -1992,6 +2002,7 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh,
                  * have quiesced.
                  */
                 if ((s->uptodate == disks - 1) &&
+                   (s->failed && disk_idx == s->failed_num) &&
                     !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) {
                         set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
                         set_bit(R5_Wantcompute, &dev->flags);
@@ -2077,7 +2088,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh,
                         /* we would like to get this block, possibly
                          * by computing it, but we might not be able to
                          */
-                       if (s->uptodate == disks-1) {
+                       if ((s->uptodate == disks - 1) &&
+                           (s->failed && (i == r6s->failed_num[0] ||
+                                          i == r6s->failed_num[1]))) {
                                 pr_debug("Computing stripe %llu block %d\n",
                                        (unsigned long long)sh->sector, i);
                                 compute_block_1(sh, i, 0);
@@ -2369,8 +2382,8 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
  
         /* complete a check operation */
         if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) {
-           clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
-           clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+               clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+               clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
                 if (s->failed == 0) {
                         if (sh->ops.zero_sum_result == 0)
                                 /* parity is correct (on disc,
@@ -2400,16 +2413,6 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                         canceled_check = 1; /* STRIPE_INSYNC is not set */
         }
  
-       /* check if we can clear a parity disk reconstruct */
-       if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
-               test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
-
-               clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
-               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
-       }
-
         /* start a new check operation if there are no failures, the stripe is
          * not insync, and a repair is not in flight
          */
@@ -2424,6 +2427,17 @@ static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
                 }
         }
  
+       /* check if we can clear a parity disk reconstruct */
+       if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+           test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+
+               clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending);
+               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+               clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+       }
+
+
         /* Wait for check parity and compute block operations to complete
          * before write-back.  If a failure occurred while the check operation
          * was in flight we need to cycle this stripe through handle_stripe
@@ -2607,6 +2621,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
         }
  }
  
+
  /*
   * handle_stripe - do things to a stripe.
   *
@@ -2632,6 +2647,8 @@ static void handle_stripe5(struct stripe_head *sh)
         struct stripe_head_state s;
         struct r5dev *dev;
         unsigned long pending = 0;
+       mdk_rdev_t *blocked_rdev = NULL;
+       int prexor;
  
         memset(&s, 0, sizeof(s));
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2691,6 +2708,11 @@ static void handle_stripe5(struct stripe_head *sh)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2705,6 +2727,11 @@ static void handle_stripe5(struct stripe_head *sh)
         }
         rcu_read_unlock();
  
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
+
         if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
                 sh->ops.count++;
  
@@ -2751,9 +2778,11 @@ static void handle_stripe5(struct stripe_head *sh)
         /* leave prexor set until postxor is done, allows us to distinguish
          * a rmw from a rcw during biodrain
          */
+       prexor = 0;
         if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
                 test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
  
+               prexor = 1;
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
                 clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
@@ -2787,6 +2816,8 @@ static void handle_stripe5(struct stripe_head *sh)
                                 if (!test_and_set_bit(
                                     STRIPE_OP_IO, &sh->ops.pending))
                                         sh->ops.count++;
+                               if (prexor)
+                                       continue;
                                 if (!test_bit(R5_Insync, &dev->flags) ||
                                     (i == sh->pd_idx && s.failed == 0))
                                         set_bit(STRIPE_INSYNC, &sh->state);
@@ -2894,8 +2925,13 @@ static void handle_stripe5(struct stripe_head *sh)
         if (sh->ops.count)
                 pending = get_stripe_work(sh);
  
+ unlock:
         spin_unlock(&sh->lock);
  
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+
         if (pending)
                 raid5_run_ops(sh, pending);
  
@@ -2912,6 +2948,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         struct stripe_head_state s;
         struct r6_state r6s;
         struct r5dev *dev, *pdev, *qdev;
+       mdk_rdev_t *blocked_rdev = NULL;
  
         r6s.qd_idx = raid6_next_disk(pd_idx, disks);
         pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2975,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                 if (dev->written)
                         s.written++;
                 rdev = rcu_dereference(conf->disks[i].rdev);
+               if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
+                       blocked_rdev = rdev;
+                       atomic_inc(&rdev->nr_pending);
+                       break;
+               }
                 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
                         /* The ReadError flag will just be confusing now */
                         clear_bit(R5_ReadError, &dev->flags);
@@ -2989,6 +3031,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
                         set_bit(R5_Insync, &dev->flags);
         }
         rcu_read_unlock();
+
+       if (unlikely(blocked_rdev)) {
+               set_bit(STRIPE_HANDLE, &sh->state);
+               goto unlock;
+       }
         pr_debug("locked=%d uptodate=%d to_read=%d"
                " to_write=%d failed=%d failed_num=%d,%d\n",
                s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3094,8 +3141,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
             !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
                 handle_stripe_expansion(conf, sh, &r6s);
  
+ unlock:
         spin_unlock(&sh->lock);
  
+       /* wait for this device to become unblocked */
+       if (unlikely(blocked_rdev))
+               md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
+
         return_io(return_bi);
  
         for (i=disks; i-- ;) {
@@ -4223,6 +4275,7 @@ static int run(mddev_t *mddev)
                         goto abort;
         }
         spin_lock_init(&conf->device_lock);
+       mddev->queue->queue_lock = &conf->device_lock;
         init_waitqueue_head(&conf->wait_for_stripe);
         init_waitqueue_head(&conf->wait_for_overlap);
         INIT_LIST_HEAD(&conf->handle_list);
@@ -4529,6 +4582,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                         err = -EBUSY;
                         goto abort;
                 }
+               /* Only remove non-faulty devices if recovery
+                * isn't possible.
+                */
+               if (!test_bit(Faulty, &rdev->flags) &&
+                   mddev->degraded <= conf->max_degraded) {
+                       err = -EBUSY;
+                       goto abort;
+               }
                 p->rdev = NULL;
                 synchronize_rcu();
                 if (atomic_read(&rdev->nr_pending)) {