From: Mikulas Patocka <mpatocka@redhat.com>

Use new variables, 'merge_write_interlock' and 'merge_write_interlock_n',
to determine the chunk number (on the origin device) and number of chunks
that are being merged.  Writes to this area are held on the
'merge_write_list' queue.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>

---
 drivers/md/dm-snap.c |   89 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 79 insertions(+), 10 deletions(-)

Index: linux-2.6.32/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.32.orig/drivers/md/dm-snap.c
+++ linux-2.6.32/drivers/md/dm-snap.c
@@ -109,6 +109,16 @@ struct dm_snapshot {
 
 	/* Wait for events based on state_bits */
 	unsigned long state_bits;
+
+	/* Merging this area --- block any writes */
+	chunk_t merge_write_interlock;
+	int merge_write_interlock_n;
+
+	/*
+	 * A list of requests that were delayed because
+	 * of racing with merge
+	 */
+	struct bio_list merge_write_list;
 };
 
 /*
@@ -747,6 +757,17 @@ static void merge_shutdown(struct dm_sna
 	wake_up_bit(&s->state_bits, RUNNING_MERGE);
 }
 
+static void flush_bios(struct bio *bio);
+static void error_bios(struct bio *bio);
+
+static struct bio *__release_write_interlock(struct dm_snapshot *s)
+{
+	s->merge_write_interlock = 0;
+	s->merge_write_interlock_n = 0;
+
+	return bio_list_get(&s->merge_write_list);
+}
+
 /*
  * Remove one chunk from the index of completed exceptions.
  */
@@ -801,14 +822,33 @@ static int __remove_single_exception_chu
 	return 0;
 }
 
-static int remove_single_exception_chunk(struct dm_snapshot *s,
-					 chunk_t old_chunk)
+static int remove_single_exception_chunk(struct dm_snapshot *s)
 {
+	struct bio *b = NULL;
+	chunk_t old_chunk;
 	int r = 0;
+	int i;
 
 	down_write(&s->lock);
-	r = __remove_single_exception_chunk(s, old_chunk);
+
+	/*
+	 * Process chunks (and associated exceptions) in reverse order
+	 * so that dm_consecutive_chunk_count_dec() accounting works.
+	 */
+	for (i = s->merge_write_interlock_n - 1; i >= 0; i--) {
+		old_chunk = s->merge_write_interlock + i;
+
+		r = __remove_single_exception_chunk(s, old_chunk);
+		if (r)
+			goto out;
+	}
+
+	b = __release_write_interlock(s);
+
+out:
 	up_write(&s->lock);
+	if (b)
+		flush_bios(b);
 
 	return r;
 }
@@ -844,9 +884,6 @@ static void snapshot_merge_next_chunks(s
 
 	/* TODO: use larger I/O size once we verify that kcopyd handles it */
 
-	if (remove_single_exception_chunk(s, old_chunk) < 0)
-		goto shut;
-
 	dest.bdev = s->origin->bdev;
 	dest.sector = chunk_to_sector(s->store, old_chunk);
 	dest.count = min((sector_t)s->store->chunk_size,
@@ -856,6 +893,13 @@ static void snapshot_merge_next_chunks(s
 	src.sector = chunk_to_sector(s->store, new_chunk);
 	src.count = dest.count;
 
+	down_write(&s->lock);
+	s->merge_write_interlock = old_chunk;
+	s->merge_write_interlock_n = 1;
+	up_write(&s->lock);
+
+	/* !!! FIXME: wait until writes to this chunk drain */
+
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
 	return;
 
@@ -866,6 +910,7 @@ shut:
 static void merge_callback(int read_err, unsigned long write_err, void *context)
 {
 	struct dm_snapshot *s = context;
+	struct bio *b = NULL;
 
 	if (read_err || write_err) {
 		if (read_err)
@@ -875,16 +920,25 @@ static void merge_callback(int read_err,
 		goto shut;
 	}
 
-	if (s->store->type->commit_merge(s->store, 1) < 0) {
+	if (s->store->type->commit_merge(s->store,
+					 s->merge_write_interlock_n) < 0) {
 		DMERR("Write error in exception store: shutting down merge");
 		goto shut;
 	}
 
+	if (remove_single_exception_chunk(s) < 0)
+		goto shut;
+
 	snapshot_merge_next_chunks(s);
 
 	return;
 
 shut:
+	down_write(&s->lock);
+	b = __release_write_interlock(s);
+	up_write(&s->lock);
+	error_bios(b);
+
 	merge_shutdown(s);
 }
 
@@ -983,6 +1037,9 @@ static int snapshot_ctr(struct dm_target
 	INIT_LIST_HEAD(&s->list);
 	spin_lock_init(&s->pe_lock);
 	s->state_bits = 0;
+	s->merge_write_interlock = 0;
+	s->merge_write_interlock_n = 0;
+	bio_list_init(&s->merge_write_list);
 
 	/* Allocate hash table for COW data */
 	if (init_hash_tables(s)) {
@@ -1539,6 +1596,8 @@ static int snapshot_map(struct dm_target
  * For each chunk, if there is an existing exception, it is used to
  * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
  * which in turn might generate exceptions in other snapshots.
+ * If merging is currently taking place on the chunk in question, the
+ * I/O is deferred by adding it to s->merge_write_list.
  */
 static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 			      union map_info *map_context)
@@ -1559,7 +1618,7 @@ static int snapshot_merge_map(struct dm_
 
 	chunk = sector_to_chunk(s->store, bio->bi_sector);
 
-	down_read(&s->lock);
+	down_write(&s->lock);
 
 	/* Full snapshots are not usable */
 	if (!s->valid) {
@@ -1570,6 +1629,16 @@ static int snapshot_merge_map(struct dm_
 	/* If the block is already remapped - use that */
 	e = dm_lookup_exception(&s->complete, chunk);
 	if (e) {
+		/* We are copying this area --- so don't write to it */
+		if (bio_rw(bio) == WRITE &&
+		    chunk >= s->merge_write_interlock &&
+		    chunk < (s->merge_write_interlock +
+			     s->merge_write_interlock_n)) {
+			bio->bi_bdev = s->origin->bdev;
+			bio_list_add(&s->merge_write_list, bio);
+			r = DM_MAPIO_SUBMITTED;
+			goto out_unlock;
+		}
 		remap_exception(s, e, bio, chunk);
 		goto out_unlock;
 	}
@@ -1577,12 +1646,12 @@ static int snapshot_merge_map(struct dm_
 	bio->bi_bdev = s->origin->bdev;
 
 	if (bio_rw(bio) == WRITE) {
-		up_read(&s->lock);
+		up_write(&s->lock);
 		return do_origin(s->origin, bio);
 	}
 
 out_unlock:
-	up_read(&s->lock);
+	up_write(&s->lock);
 
 	return r;
 }
