Index: linux-2.6.17/drivers/md/dm-snap.c
===================================================================
--- linux-2.6.17.orig/drivers/md/dm-snap.c	2006-08-07 16:06:03.000000000 +0100
+++ linux-2.6.17/drivers/md/dm-snap.c	2006-08-07 19:50:18.000000000 +0100
@@ -60,20 +60,20 @@ struct pending_exception {
 
 	/*
 	 * The primary pending_exception is the one that holds
-	 * the sibling_count and the list of origin_bios for a
+	 * the ref_count and the list of origin_bios for a
 	 * group of pending_exceptions.  It is always last to get freed.
 	 * These fields get set up when writing to the origin.
 	 */
 	struct pending_exception *primary_pe;
 
 	/*
-	 * Number of pending_exceptions processing this chunk.
+	 * Number of exception copies or snapshot reads processing this chunk.
 	 * When this drops to zero we must complete the origin bios.
 	 * If incrementing or decrementing this, hold pe->snap->lock for
 	 * the sibling concerned and not pe->primary_pe->snap->lock unless
 	 * they are the same.
 	 */
-	atomic_t sibling_count;
+	atomic_t ref_count;
 
 	/* Pointer back to snapshot context */
 	struct dm_snapshot *snap;
@@ -654,12 +654,58 @@ static void __invalidate_snapshot(struct
 	dm_table_event(s->table);
 }
 
+static void get_pending_exception(struct pending_exception *pe)
+{
+	atomic_inc(&pe->ref_count);
+	/* A pe with a separate primary_pe also takes a reference on it. */
+	if (pe->primary_pe && pe->primary_pe != pe)
+		atomic_inc(&pe->primary_pe->ref_count);
+}
+
+/*
+ * Drop a ref on pe.  Returns the origin bios to submit, or NULL if none.
+ */
+static struct bio *put_pending_exception(struct pending_exception *pe)
+{
+	struct dm_snapshot *s = pe->snap;
+	struct pending_exception *primary_pe;
+	struct bio *origin_bios = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&s->pe_lock, flags);
+	primary_pe = pe->primary_pe;
+
+	/*
+	 * A pe that is not itself a primary pe is freed with its last ref.
+	 */
+	if ((!primary_pe || primary_pe != pe) &&
+	    atomic_dec_and_test(&pe->ref_count)) {
+		remove_exception(&pe->e);
+		free_pending_exception(pe);
+	}
+
+	/*
+	 * If this pe is involved in a write to the origin and it holds the
+	 * last reference, release the bios for that write and the primary.
+	 */
+	if (primary_pe && atomic_dec_and_test(&primary_pe->ref_count)) {
+		origin_bios = bio_list_get(&primary_pe->origin_bios);
+		remove_exception(&primary_pe->e);
+		free_pending_exception(primary_pe);
+	}
+
+	spin_unlock_irqrestore(&s->pe_lock, flags);
+
+	return origin_bios;
+}
+
 static void pending_complete(struct pending_exception *pe, int success)
 {
 	struct exception *e;
 	struct pending_exception *primary_pe;
 	struct dm_snapshot *s = pe->snap;
-	struct bio *flush = NULL;
+	struct bio *origin_bios = NULL;
+	struct bio *snapshot_bios = NULL;
 	int error = 0;
 
 	if (!success) {
@@ -691,43 +737,20 @@ static void pending_complete(struct pend
 	 * in-flight exception from the list.
 	 */
 	insert_exception(&s->complete, e);
-	remove_exception(&pe->e);
 
  out:
-	primary_pe = pe->primary_pe;
-
-	/*
-	 * If this pe is involved in a write to the origin and
-	 * it is the last sibling to complete then release
-	 * the bios for the original write to the origin.
-	 */
-	if (primary_pe &&
-	    atomic_dec_and_test(&primary_pe->sibling_count))
-		flush = bio_list_get(&primary_pe->origin_bios);
+	snapshot_bios = bio_list_get(&pe->snapshot_bios);
+	origin_bios = put_pending_exception(pe);
 
 	up_write(&s->lock);
 
 	/* Submit any pending write bios */
 	if (!error)
-		flush_bios(bio_list_get(&pe->snapshot_bios));
+		flush_bios(snapshot_bios);
 	else
-		error_bios(bio_list_get(&pe->snapshot_bios));
+		error_bios(snapshot_bios);
 
-	/*
-	 * Free the pe if it's not linked to an origin write or if
-	 * it's not itself a primary pe.
-	 */
-	if (!primary_pe || primary_pe != pe)
-		free_pending_exception(pe);
-
-	/*
-	 * Free the primary pe if nothing references it.
-	 */
-	if (primary_pe && !atomic_read(&primary_pe->sibling_count))
-		free_pending_exception(primary_pe);
-
-	if (flush)
-		flush_bios(flush);
+	flush_bios(origin_bios);
 }
 
 static void commit_callback(void *context, int success)
@@ -793,6 +816,9 @@ __find_pending_exception(struct dm_snaps
 	struct exception *e;
 	struct pending_exception *pe;
 	chunk_t chunk = sector_to_chunk(s, bio->bi_sector);
+	unsigned long flags;
+
+	spin_lock_irqsave(&s->pe_lock, flags);
 
 	/*
 	 * Is there a pending exception for this already ?
@@ -808,13 +834,16 @@ __find_pending_exception(struct dm_snaps
 	 * Create a new pending exception, we don't want
 	 * to hold the lock while we do this.
 	 */
+	spin_unlock_irqrestore(&s->pe_lock, flags);
 	up_write(&s->lock);
 	pe = alloc_pending_exception();
 	down_write(&s->lock);
+	spin_lock_irqsave(&s->pe_lock, flags);
 
 	if (!s->valid) {
 		free_pending_exception(pe);
-		return NULL;
+		pe = NULL;
+		goto out;
 	}
 
 	e = lookup_exception(&s->pending, chunk);
@@ -828,18 +857,22 @@ __find_pending_exception(struct dm_snaps
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->primary_pe = NULL;
-	atomic_set(&pe->sibling_count, 1);
+	atomic_set(&pe->ref_count, 0);
 	pe->snap = s;
 	pe->started = 0;
 
 	if (s->store.prepare_exception(&s->store, &pe->e)) {
 		free_pending_exception(pe);
-		return NULL;
+		pe = NULL;
+		goto out;
 	}
 
+	get_pending_exception(pe);
+
 	insert_exception(&s->pending, &pe->e);
 
  out:
+	spin_unlock_irqrestore(&s->pe_lock, flags);
 	return pe;
 }
 
@@ -1011,7 +1044,7 @@ static int __origin_write(struct list_he
 		 * is already remapped in this snapshot
 		 * and trigger an exception if not.
 		 *
-		 * sibling_count is initialised to 1 so pending_complete()
+		 * ref_count is initialised to 1 so pending_complete()
 		 * won't destroy the primary_pe while we're inside this loop.
 		 */
 		e = lookup_exception(&snap->complete, chunk);
@@ -1036,14 +1069,18 @@ static int __origin_write(struct list_he
 				first = 1;
 			}
 
+			get_pending_exception(primary_pe);
+
 			bio_list_add(&primary_pe->origin_bios, bio);
 
 			r = 0;
 		}
 
 		if (!pe->primary_pe) {
-			atomic_inc(&primary_pe->sibling_count);
 			pe->primary_pe = primary_pe;
+			if (primary_pe != pe)
+				atomic_add(atomic_read(&pe->ref_count),
+					   &primary_pe->ref_count);
 		}
 
 		if (!pe->started) {
@@ -1056,20 +1093,20 @@ static int __origin_write(struct list_he
 	}
 
 	if (!primary_pe)
-		goto out;
+		return r;
 
 	/*
 	 * If this is the first time we're processing this chunk and
-	 * sibling_count is now 1 it means all the pending exceptions
+	 * ref_count is now 1 it means all the pending exceptions
 	 * got completed while we were in the loop above, so it falls to
 	 * us here to remove the primary_pe and submit any origin_bios.
 	 */
 
-	if (first && atomic_dec_and_test(&primary_pe->sibling_count)) {
+	if (first && atomic_dec_and_test(&primary_pe->ref_count)) {
 		flush_bios(bio_list_get(&primary_pe->origin_bios));
 		free_pending_exception(primary_pe);
 		/* If we got here, pe_queue is necessarily empty. */
-		goto out;
+		return r;
 	}
 
 	/*
@@ -1078,7 +1115,6 @@ static int __origin_write(struct list_he
 	list_for_each_entry_safe(pe, next_pe, &pe_queue, list)
 		start_copy(pe);
 
- out:
 	return r;
 }
 
