From: Jonathan Brassow <jbrassow@redhat.com>

This patch gives mirror the ability to handle write failures
during recovery.

When kcopyd finishes resynchronizing a mirror region, it
calls recovery_complete() with the results - which are
currently ignored.  This patch checks the bits in 'write_err'
and calls a new function, fail_mirror(), on each device whose
bit is set.  fail_mirror() increments the as-yet-unused
error_count on the mirror device and, if that device is the
primary and the mirror set is in-sync, switches the mirror
set's primary device pointer to a device with no recorded
errors.

To maintain backwards compatibility, fail_mirror does nothing
if the DM_FEATURES_HANDLE_ERRORS flag is not present.

[AGK Bug? Does default_mirror need protecting with a lock (or atomic)?]

[AGK Patch incomplete? This patch introduces state information that must
be exported to userspace: default_mirror and error_count.]

[AGK Patch incomplete? fail_mirror should trigger an event to notify
userspace - share code with dm-mpath trigger_event perhaps?]

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

---
 drivers/md/dm-raid1.c |   74 ++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 69 insertions(+), 5 deletions(-)

Index: linux-2.6.24-rc1/drivers/md/dm-raid1.c
===================================================================
--- linux-2.6.24-rc1.orig/drivers/md/dm-raid1.c	2007-11-05 11:45:55.000000000 +0000
+++ linux-2.6.24-rc1/drivers/md/dm-raid1.c	2007-11-05 11:49:02.000000000 +0000
@@ -646,6 +646,51 @@ static void bio_set_ms(struct bio *bio, 
 	bio->bi_next = (struct bio *) ms;
 }
 
+/* fail_mirror
+ * @m: mirror device to fail
+ *
+ * Does nothing unless errors_handled(ms).  Otherwise bump m's
+ * error_count; on its first failure, if m is the default mirror
+ * (i.e. the primary device) and the mirror set is in-sync, make
+ * the first mirror with a zero error_count the new default.
+ *
+ * This function must not block.
+ */
+static void fail_mirror(struct mirror *m)
+{
+	struct mirror_set *ms = m->ms;
+	struct mirror *new;
+
+	if (!errors_handled(ms))
+		return;
+
+	if (atomic_inc_return(&m->error_count) > 1)
+		return;	/* not the first failure recorded for m */
+
+	if (m != ms->default_mirror)
+		return;
+
+	/* Change default mirror provided it is fully in-sync. */
+	if (!ms->in_sync) {
+		/*
+		 * Better to issue requests to same failing device
+		 * than to risk returning corrupt data.
+		 */
+		DMERR("Primary mirror (%s) failed while out-of-sync: "
+		      "Reads may fail.", m->dev->name);
+		return;
+	}
+
+	for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
+		if (!atomic_read(&new->error_count)) {
+			ms->default_mirror = new;
+			break;
+		}
+
+	if (unlikely(new == ms->mirror + ms->nr_mirrors))
+		DMWARN("All sides of mirror have failed.");
+}
+
 /*-----------------------------------------------------------------
  * Recovery.
  *
@@ -656,16 +701,34 @@ static void bio_set_ms(struct bio *bio, 
 static void recovery_complete(int read_err, unsigned int write_err,
 			      void *context)
 {
-	struct region *reg = (struct region *) context;
+	struct region *reg = (struct region *)context;
+	struct mirror_set *ms = reg->rh->ms;
+	unsigned long write_err_ulong = (unsigned long)write_err;
+	unsigned m;	/* index into ms->mirror[] */
+	int bit = 0;	/* bit position within write_err */
 
 	if (read_err)
-		/* Read error means the failure of default mirror. */
 		DMERR_LIMIT("Unable to read primary mirror during recovery");
 
-	if (write_err)
-		DMERR_LIMIT("Write error during recovery (error = 0x%x)",
-			    write_err);
+	if (!write_err)
+		goto out;
+
+	DMERR_LIMIT("Write error during recovery: 0x%x", write_err);
+
+	/*
+	 * Bit n refers to the nth non-default mirror in ms->mirror[].
+	 * The default mirror cannot change during recovery.
+	 */
+	for (m = 0; m < ms->nr_mirrors; m++) {
+		if (&ms->mirror[m] == ms->default_mirror)
+			continue;
+
+		if (test_bit(bit, &write_err_ulong))
+			fail_mirror(ms->mirror + m);
+		bit++;	/* only non-default mirrors consume a bit */
+	}
 
+      out:
 	rh_recovery_end(reg, !(read_err || write_err));
 }
 
@@ -1019,6 +1082,7 @@ static int get_mirror(struct mirror_set 
 	}
 
 	ms->mirror[mirror].ms = ms;
+	atomic_set(&(ms->mirror[mirror].error_count), 0);
 	ms->mirror[mirror].offset = offset;
 
 	return 0;
