---
 drivers/md/persistent-data/Kconfig                  |    1 
 drivers/md/persistent-data/Makefile                 |    2 
 drivers/md/persistent-data/dm-block-manager.c       | 1222 ++++++--------------
 drivers/md/persistent-data/dm-block-manager.h       |   33 
 drivers/md/persistent-data/dm-btree-internal.h      |    6 
 drivers/md/persistent-data/dm-btree-remove.c        |  109 -
 drivers/md/persistent-data/dm-btree-spine.c         |   38 
 drivers/md/persistent-data/dm-btree.c               |   72 -
 drivers/md/persistent-data/dm-btree.h               |   19 
 drivers/md/persistent-data/dm-space-map-checker.c   |  437 +++++++
 drivers/md/persistent-data/dm-space-map-checker.h   |   26 
 drivers/md/persistent-data/dm-space-map-common.c    |  704 +++++++++++
 drivers/md/persistent-data/dm-space-map-common.h    |   52 
 drivers/md/persistent-data/dm-space-map-disk.c      |  590 ++-------
 drivers/md/persistent-data/dm-space-map-metadata.c  |  432 -------
 drivers/md/persistent-data/dm-space-map.h           |   16 
 drivers/md/persistent-data/dm-transaction-manager.c |   96 -
 17 files changed, 1978 insertions(+), 1877 deletions(-)

Index: linux-3.1-rc9/drivers/md/persistent-data/Kconfig
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/Kconfig
+++ linux-3.1-rc9/drivers/md/persistent-data/Kconfig
@@ -2,6 +2,7 @@ config DM_PERSISTENT_DATA
        tristate
        depends on BLK_DEV_DM && EXPERIMENTAL
        select LIBCRC32C
+       select DM_BUFIO
        ---help---
 	 Library providing immutable on-disk data structure support for
 	 device-mapper targets such as the thin provisioning target.
Index: linux-3.1-rc9/drivers/md/persistent-data/Makefile
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/Makefile
+++ linux-3.1-rc9/drivers/md/persistent-data/Makefile
@@ -1,6 +1,8 @@
 obj-$(CONFIG_DM_PERSISTENT_DATA) += dm-persistent-data.o
 dm-persistent-data-objs := \
 	dm-block-manager.o \
+	dm-space-map-checker.o \
+	dm-space-map-common.o \
 	dm-space-map-disk.o \
 	dm-space-map-metadata.o \
 	dm-transaction-manager.o \
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-block-manager.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.c
@@ -5,843 +5,453 @@
  */
 #include "dm-block-manager.h"
 #include "dm-persistent-data-internal.h"
+#include "../dm-bufio.h"
 
-#include <linux/dm-io.h>
+#include <linux/crc32c.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/rwsem.h>
 #include <linux/device-mapper.h>
+#include <linux/stacktrace.h>
 
 #define DM_MSG_PREFIX "block manager"
 
 /*----------------------------------------------------------------*/
 
-#define SECTOR_SIZE (1 << SECTOR_SHIFT)
-#define MAX_CACHE_SIZE 16U
-
-enum dm_block_state {
-	BS_EMPTY,
-	BS_CLEAN,
-	BS_READING,
-	BS_WRITING,
-	BS_READ_LOCKED,
-	BS_READ_LOCKED_DIRTY,	/* Block was dirty before it was read locked. */
-	BS_WRITE_LOCKED,
-	BS_DIRTY,
-	BS_ERROR
-};
-
-struct dm_block {
-	struct list_head list;
-	struct hlist_node hlist;
-
-	dm_block_t where;
-	struct dm_block_validator *validator;
-	void *data;
-	wait_queue_head_t io_q;
-	unsigned read_lock_count;
-	unsigned write_lock_pending;
-	enum dm_block_state state;
-
-	/*
-	 * Extra flags like REQ_FLUSH and REQ_FUA can be set here.  This is
-	 * mainly as to avoid a race condition in flush_and_unlock() where
-	 * the newly-unlocked superblock may have been submitted for a
-	 * write before the write_all_dirty() call is made.
-	 */
-	int io_flags;
-
-	/*
-	 * Sadly we need an up pointer so we can get to the bm on io
-	 * completion.
-	 */
-	struct dm_block_manager *bm;
-};
-
-struct dm_block_manager {
-	struct block_device *bdev;
-	unsigned cache_size;
-	unsigned max_held_per_thread;
-	unsigned block_size;	/* In bytes */
-	dm_block_t nr_blocks;
-
-	/*
-	 * This will trigger every time an io completes.
-	 */
-	wait_queue_head_t io_q;
+/*
+ * This is a read/write semaphore with a couple of differences.
+ *
+ * i) There is a restriction on the number of concurrent read locks that
+ * may be held at once.  This is just an implementation detail.
+ *
+ * ii) Recursive locking attempts are detected and return -EINVAL.  A stack
+ * trace is also emitted for the previous lock acquisition.
+ *
+ * iii) Priority is given to write locks.
+ */
+#define MAX_HOLDERS 4
+#define MAX_STACK 10
 
-	struct dm_io_client *io;
+typedef unsigned long stack_entries[MAX_STACK];
 
-	/*
-	 * Protects all the lists and the hash table.
-	 */
+struct block_lock {
 	spinlock_t lock;
+	__s32 count;
+	struct list_head waiters;
+	struct task_struct *holders[MAX_HOLDERS];
+
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	struct stack_trace traces[MAX_HOLDERS];
+	stack_entries entries[MAX_HOLDERS];
+#endif
+};
 
-	unsigned error_count;
-	unsigned available_count;
-	unsigned reading_count;
-	unsigned writing_count;
-
-	struct list_head empty_list;	/* No block assigned */
-	struct list_head clean_list;	/* Unlocked and clean */
-	struct list_head dirty_list;	/* Unlocked and dirty */
-	struct list_head error_list;
-
-	char buffer_cache_name[32];
-	struct kmem_cache *buffer_cache; /* The buffers that store the raw data */
-
-	/*
-	 * Hash table of cached blocks, holds everything that isn't in the
-	 * BS_EMPTY state.
-	 */
-	unsigned hash_size;
-	unsigned hash_mask;
-
-	struct hlist_head buckets[0];	/* Must be last member of struct. */
+struct waiter {
+	struct list_head list;
+	struct task_struct *task;
+	int wants_write;
 };
 
-dm_block_t dm_block_location(struct dm_block *b)
+static unsigned __find_holder(struct block_lock *lock,
+			      struct task_struct *task)
 {
-	return b->where;
-}
-EXPORT_SYMBOL_GPL(dm_block_location);
+	unsigned i;
 
-void *dm_block_data(struct dm_block *b)
-{
-	return b->data;
+	for (i = 0; i < MAX_HOLDERS; i++)
+		if (lock->holders[i] == task)
+			break;
+
+	BUG_ON(i == MAX_HOLDERS);
+	return i;
 }
-EXPORT_SYMBOL_GPL(dm_block_data);
 
-/*----------------------------------------------------------------
- * Hash table
- *--------------------------------------------------------------*/
-static struct dm_block *__find_block(struct dm_block_manager *bm, dm_block_t b)
+/* call this *after* you increment lock->count */
+static void __add_holder(struct block_lock *lock, struct task_struct *task)
 {
-	unsigned bucket = dm_hash_block(b, bm->hash_mask);
-	struct dm_block *blk;
-	struct hlist_node *n;
+	unsigned h = __find_holder(lock, NULL);
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	struct stack_trace *t;
+#endif
 
-	hlist_for_each_entry(blk, n, bm->buckets + bucket, hlist)
-		if (blk->where == b)
-			return blk;
+	get_task_struct(task);
+	lock->holders[h] = task;
 
-	return NULL;
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	t = lock->traces + h;
+	t->nr_entries = 0;
+	t->max_entries = MAX_STACK;
+	t->entries = lock->entries[h];
+	t->skip = 2;
+	save_stack_trace(t);
+#endif
 }
 
-static void __insert_block(struct dm_block_manager *bm, struct dm_block *b)
+/* call this *before* you decrement lock->count */
+static void __del_holder(struct block_lock *lock, struct task_struct *task)
 {
-	unsigned bucket = dm_hash_block(b->where, bm->hash_mask);
-
-	hlist_add_head(&b->hlist, bm->buckets + bucket);
+	unsigned h = __find_holder(lock, task);
+	lock->holders[h] = NULL;
+	put_task_struct(task);
 }
 
-/*----------------------------------------------------------------
- * Block state:
- * __transition() handles transition of a block between different states.
- * Study this to understand the state machine.
- *
- * Alternatively install graphviz and run:
- *     grep DOT dm-block-manager.c | grep -v '	' |
- *	 sed -e 's/.*DOT: //' -e 's/\*\///' |
- *	 dot -Tps -o states.ps
- *
- * Assumes bm->lock is held.
- *--------------------------------------------------------------*/
-static void __transition(struct dm_block *b, enum dm_block_state new_state)
+static int __check_holder(struct block_lock *lock)
 {
-	/* DOT: digraph BlockStates { */
-	struct dm_block_manager *bm = b->bm;
+	unsigned i;
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+	static struct stack_trace t;
+	static stack_entries entries;
+#endif
 
-	switch (new_state) {
-	case BS_EMPTY:
-		/* DOT: error -> empty */
-		/* DOT: clean -> empty */
-		BUG_ON(!((b->state == BS_ERROR) ||
-			 (b->state == BS_CLEAN)));
-		hlist_del(&b->hlist);
-		list_move(&b->list, &bm->empty_list);
-		b->write_lock_pending = 0;
-		b->read_lock_count = 0;
-		b->io_flags = 0;
-		b->validator = NULL;
-
-		if (b->state == BS_ERROR) {
-			bm->error_count--;
-			bm->available_count++;
+	for (i = 0; i < MAX_HOLDERS; i++) {
+		if (lock->holders[i] == current) {
+			DMERR("recursive lock detected in pool metadata");
+#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
+			DMERR("previously held here:");
+			print_stack_trace(lock->traces + i, 4);
+
+			DMERR("subsequent acquisition attempted here:");
+			t.nr_entries = 0;
+			t.max_entries = MAX_STACK;
+			t.entries = entries;
+			t.skip = 3;
+			save_stack_trace(&t);
+			print_stack_trace(&t, 4);
+#endif
+			return -EINVAL;
 		}
-		break;
+	}
 
-	case BS_CLEAN:
-		/* DOT: reading -> clean */
-		/* DOT: writing -> clean */
-		/* DOT: read_locked -> clean */
-		BUG_ON(!((b->state == BS_READING) ||
-			 (b->state == BS_WRITING) ||
-			 (b->state == BS_READ_LOCKED)));
-		switch (b->state) {
-		case BS_READING:
-			BUG_ON(!bm->reading_count);
-			bm->reading_count--;
-			break;
+	return 0;
+}
 
-		case BS_WRITING:
-			BUG_ON(!bm->writing_count);
-			bm->writing_count--;
-			b->io_flags = 0;
-			break;
+static void __wait(struct waiter *w)
+{
+	for (;;) {
+		set_task_state(current, TASK_UNINTERRUPTIBLE);
 
-		default:
+		if (!w->task)
 			break;
-		}
-		list_add_tail(&b->list, &bm->clean_list);
-		bm->available_count++;
-		break;
-
-	case BS_READING:
-		/* DOT: empty -> reading */
-		BUG_ON(!(b->state == BS_EMPTY));
-		__insert_block(bm, b);
-		list_del(&b->list);
-		bm->available_count--;
-		bm->reading_count++;
-		break;
-
-	case BS_WRITING:
-		/* DOT: dirty -> writing */
-		BUG_ON(!(b->state == BS_DIRTY));
-		list_del(&b->list);
-		bm->writing_count++;
-		break;
-
-	case BS_READ_LOCKED:
-		/* DOT: clean -> read_locked */
-		BUG_ON(!(b->state == BS_CLEAN));
-		list_del(&b->list);
-		bm->available_count--;
-		break;
-
-	case BS_READ_LOCKED_DIRTY:
-		/* DOT: dirty -> read_locked_dirty */
-		BUG_ON(!((b->state == BS_DIRTY)));
-		list_del(&b->list);
-		break;
-
-	case BS_WRITE_LOCKED:
-		/* DOT: dirty -> write_locked */
-		/* DOT: clean -> write_locked */
-		BUG_ON(!((b->state == BS_DIRTY) ||
-			 (b->state == BS_CLEAN)));
-		list_del(&b->list);
-
-		if (b->state == BS_CLEAN)
-			bm->available_count--;
-		break;
-
-	case BS_DIRTY:
-		/* DOT: write_locked -> dirty */
-		/* DOT: read_locked_dirty -> dirty */
-		BUG_ON(!((b->state == BS_WRITE_LOCKED) ||
-			 (b->state == BS_READ_LOCKED_DIRTY)));
-		list_add_tail(&b->list, &bm->dirty_list);
-		break;
-
-	case BS_ERROR:
-		/* DOT: writing -> error */
-		/* DOT: reading -> error */
-		BUG_ON(!((b->state == BS_WRITING) ||
-			 (b->state == BS_READING)));
-		bm->error_count++;
-		list_add_tail(&b->list, &bm->error_list);
-		break;
+
+		schedule();
 	}
 
-	b->state = new_state;
-	/* DOT: } */
+	set_task_state(current, TASK_RUNNING);
 }
 
-/*----------------------------------------------------------------
- * Low-level io.
- *--------------------------------------------------------------*/
-typedef void (completion_fn)(unsigned long error, struct dm_block *b);
-
-static void submit_io(struct dm_block *b, int rw,
-		      completion_fn fn)
+static void __wake_waiter(struct waiter *w)
 {
-	struct dm_block_manager *bm = b->bm;
-	struct dm_io_request req;
-	struct dm_io_region region;
-	unsigned sectors_per_block = bm->block_size >> SECTOR_SHIFT;
-
-	region.bdev = bm->bdev;
-	region.sector = b->where * sectors_per_block;
-	region.count = sectors_per_block;
-
-	req.bi_rw = rw;
-	req.mem.type = DM_IO_KMEM;
-	req.mem.offset = 0;
-	req.mem.ptr.addr = b->data;
-	req.notify.fn = (void (*)(unsigned long, void *)) fn;
-	req.notify.context = b;
-	req.client = bm->io;
+	struct task_struct *task;
 
-	if (dm_io(&req, 1, &region, NULL) < 0)
-		fn(1, b);
+	list_del(&w->list);
+	task = w->task;
+	smp_mb();
+	w->task = NULL;
+	wake_up_process(task);
 }
 
-/*----------------------------------------------------------------
- * High-level io.
- *--------------------------------------------------------------*/
-static void __complete_io(unsigned long error, struct dm_block *b)
+/*
+ * We either wake a few readers or a single writer.
+ */
+static void __wake_many(struct block_lock *lock)
 {
-	struct dm_block_manager *bm = b->bm;
+	struct waiter *w, *tmp;
 
-	if (error) {
-		DMERR("io error = %lu, block = %llu",
-		      error , (unsigned long long)b->where);
-		__transition(b, BS_ERROR);
-	} else
-		__transition(b, BS_CLEAN);
+	BUG_ON(lock->count < 0);
+	list_for_each_entry_safe(w, tmp, &lock->waiters, list) {
+		if (lock->count >= MAX_HOLDERS)
+			return;
 
-	wake_up(&b->io_q);
-	wake_up(&bm->io_q);
-}
+		if (w->wants_write) {
+			if (lock->count > 0)
+				return; /* still read locked */
 
-static void complete_io(unsigned long error, struct dm_block *b)
-{
-	struct dm_block_manager *bm = b->bm;
-	unsigned long flags;
+			lock->count = -1;
+			__add_holder(lock, w->task);
+			__wake_waiter(w);
+			return;
+		}
 
-	spin_lock_irqsave(&bm->lock, flags);
-	__complete_io(error, b);
-	spin_unlock_irqrestore(&bm->lock, flags);
+		lock->count++;
+		__add_holder(lock, w->task);
+		__wake_waiter(w);
+	}
 }
 
-static void read_block(struct dm_block *b)
+static void bl_init(struct block_lock *lock)
 {
-	submit_io(b, READ, complete_io);
+	int i;
+
+	spin_lock_init(&lock->lock);
+	lock->count = 0;
+	INIT_LIST_HEAD(&lock->waiters);
+	for (i = 0; i < MAX_HOLDERS; i++)
+		lock->holders[i] = NULL;
 }
 
-static void write_block(struct dm_block *b)
+static int __available_for_read(struct block_lock *lock)
 {
-	if (b->validator)
-		b->validator->prepare_for_write(b->validator, b,
-						b->bm->block_size);
-
-	submit_io(b, WRITE | b->io_flags, complete_io);
+	return lock->count >= 0 &&
+		lock->count < MAX_HOLDERS &&
+		list_empty(&lock->waiters);
 }
 
-static void write_dirty(struct dm_block_manager *bm, unsigned count)
+static int bl_down_read(struct block_lock *lock)
 {
-	struct dm_block *b, *tmp;
-	struct list_head dirty;
-	unsigned long flags;
+	int r;
+	struct waiter w;
 
-	/*
-	 * Grab the first @count entries from the dirty list
-	 */
-	INIT_LIST_HEAD(&dirty);
-	spin_lock_irqsave(&bm->lock, flags);
-	list_for_each_entry_safe(b, tmp, &bm->dirty_list, list) {
-		if (!count--)
-			break;
-		__transition(b, BS_WRITING);
-		list_add_tail(&b->list, &dirty);
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r) {
+		spin_unlock(&lock->lock);
+		return r;
 	}
-	spin_unlock_irqrestore(&bm->lock, flags);
 
-	list_for_each_entry_safe(b, tmp, &dirty, list) {
-		list_del(&b->list);
-		write_block(b);
-	}
-}
+	if (__available_for_read(lock)) {
+		lock->count++;
+		__add_holder(lock, current);
+		spin_unlock(&lock->lock);
+		return 0;
+	}
+
+	get_task_struct(current);
+
+	w.task = current;
+	w.wants_write = 0;
+	list_add_tail(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
 
-static void write_all_dirty(struct dm_block_manager *bm)
-{
-	write_dirty(bm, bm->cache_size);
+	__wait(&w);
+	put_task_struct(current);
+	return 0;
 }
 
-static void __clear_errors(struct dm_block_manager *bm)
+static int bl_down_read_nonblock(struct block_lock *lock)
 {
-	struct dm_block *b, *tmp;
-	list_for_each_entry_safe(b, tmp, &bm->error_list, list)
-		__transition(b, BS_EMPTY);
-}
-
-/*----------------------------------------------------------------
- * Waiting
- *--------------------------------------------------------------*/
-#ifdef __CHECKER__
-#  define __retains(x)	__attribute__((context(x, 1, 1)))
-#else
-#  define __retains(x)
-#endif
+	int r;
 
-#define __wait_block(wq, lock, flags, sched_fn, condition)	\
-do {								\
-	DEFINE_WAIT(wait);					\
-	add_wait_queue(wq, &wait);				\
-								\
-	for (;;) {						\
-		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); \
-		if (condition)					\
-			break;					\
-								\
-		spin_unlock_irqrestore(lock, flags);		\
-		sched_fn();					\
-		spin_lock_irqsave(lock, flags);			\
-	}							\
-								\
-	finish_wait(wq, &wait);					\
-} while (0)
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r)
+		goto out;
 
-static void __wait_io(struct dm_block *b, unsigned long *flags)
-	__retains(&b->bm->lock)
-{
-	__wait_block(&b->io_q, &b->bm->lock, *flags, io_schedule,
-		     ((b->state != BS_READING) && (b->state != BS_WRITING)));
-}
+	if (__available_for_read(lock)) {
+		lock->count++;
+		__add_holder(lock, current);
+		r = 0;
+	} else
+		r = -EWOULDBLOCK;
 
-static void __wait_unlocked(struct dm_block *b, unsigned long *flags)
-	__retains(&b->bm->lock)
-{
-	__wait_block(&b->io_q, &b->bm->lock, *flags, schedule,
-		     ((b->state == BS_CLEAN) || (b->state == BS_DIRTY)));
+out:
+	spin_unlock(&lock->lock);
+	return r;
 }
 
-static void __wait_read_lockable(struct dm_block *b, unsigned long *flags)
-	__retains(&b->bm->lock)
+static void bl_up_read(struct block_lock *lock)
 {
-	__wait_block(&b->io_q, &b->bm->lock, *flags, schedule,
-		     (!b->write_lock_pending && (b->state == BS_CLEAN ||
-						 b->state == BS_DIRTY ||
-						 b->state == BS_READ_LOCKED)));
+	spin_lock(&lock->lock);
+	BUG_ON(lock->count <= 0);
+	__del_holder(lock, current);
+	--lock->count;
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+	spin_unlock(&lock->lock);
 }
 
-static void __wait_all_writes(struct dm_block_manager *bm, unsigned long *flags)
-	__retains(&bm->lock)
+static int bl_down_write(struct block_lock *lock)
 {
-	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
-		     !bm->writing_count);
-}
+	int r;
+	struct waiter w;
 
-static void __wait_all_io(struct dm_block_manager *bm, unsigned long *flags)
-	__retains(&bm->lock)
-{
-	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
-		     !bm->writing_count && !bm->reading_count);
-}
+	spin_lock(&lock->lock);
+	r = __check_holder(lock);
+	if (r) {
+		spin_unlock(&lock->lock);
+		return r;
+	}
 
-static void __wait_clean(struct dm_block_manager *bm, unsigned long *flags)
-	__retains(&bm->lock)
-{
-	__wait_block(&bm->io_q, &bm->lock, *flags, io_schedule,
-		     (!list_empty(&bm->clean_list) ||
-		      (!bm->writing_count)));
-}
+	if (lock->count == 0 && list_empty(&lock->waiters)) {
+		lock->count = -1;
+		__add_holder(lock, current);
+		spin_unlock(&lock->lock);
+		return 0;
+	}
 
-/*----------------------------------------------------------------
- * Finding a free block to recycle
- *--------------------------------------------------------------*/
-static int __recycle_block(struct dm_block_manager *bm, dm_block_t where,
-			   int need_read, struct dm_block_validator *v,
-			   unsigned long flags,
-			   struct dm_block **result)
-	__retains(&bm->lock)
-{
-	int r = 0;
-	struct dm_block *b;
-	unsigned long available;
+	get_task_struct(current);
+	w.task = current;
+	w.wants_write = 1;
 
 	/*
-	 * Wait for a block to appear on the empty or clean lists.
+	 * Writers are given priority.  We know there's only one mutator
+	 * in the system, so we can ignore the ordering reversal.
 	 */
-retry:
-	while (1) {
-		/*
-		 * The calling thread may hold some locks on blocks, and
-		 * the rest be errored.  In which case we're never going to
-		 * succeed here.
-		 */
-		if (bm->error_count == bm->cache_size - bm->max_held_per_thread)
-			return -ENOMEM;
-
-		/*
-		 * Once we can lock and do io concurrently then we should
-		 * probably flush at bm->cache_size / 2 and write _all_
-		 * dirty blocks.
-		 */
-		available = bm->available_count + bm->writing_count;
-		if (available < bm->cache_size / 4) {
-			spin_unlock_irqrestore(&bm->lock, flags);
-			write_dirty(bm, bm->cache_size / 4);
-			spin_lock_irqsave(&bm->lock, flags);
-		}
-
-		if (!list_empty(&bm->empty_list)) {
-			b = list_first_entry(&bm->empty_list, struct dm_block, list);
-			break;
-
-		} else if (!list_empty(&bm->clean_list)) {
-			b = list_first_entry(&bm->clean_list, struct dm_block, list);
-			__transition(b, BS_EMPTY);
-			break;
-		}
-
-		__wait_clean(bm, &flags);
-	}
-
-	b->where = where;
-	__transition(b, BS_READING);
-
-	if (!need_read) {
-		memset(b->data, 0, bm->block_size);
-		b->validator = v;
-		__transition(b, BS_CLEAN);
-	} else {
-		spin_unlock_irqrestore(&bm->lock, flags);
-		read_block(b);
-		spin_lock_irqsave(&bm->lock, flags);
-		__wait_io(b, &flags);
-
-		/*
-		 * Has b been recycled whilst we were unlocked?
-		 */
-		if (b->where != where)
-			goto retry;
-
-		/*
-		 * Did the io succeed?
-		 */
-		if (b->state == BS_ERROR) {
-			/*
-			 * Since this is a read that has failed we can clear the error
-			 * immediately.	 Failed writes are revealed during a commit.
-			 */
-			__transition(b, BS_EMPTY);
-			r = -EIO;
-		} else {
-			/*
-			 * We set the validator late, since there's a
-			 * window while we're waiting for the read where
-			 * someone could have set a different one.
-			 */
-			b->validator = v;
-			if (b->validator) {
-				r = b->validator->check(b->validator, b, bm->block_size);
-				if (r) {
-					DMERR("%s validator check failed for block %llu",
-					      b->validator->name, (unsigned long long)b->where);
-					__transition(b, BS_EMPTY);
-				}
-			}
-		}
-	}
+	list_add(&w.list, &lock->waiters);
+	spin_unlock(&lock->lock);
 
-	if (!r)
-		*result = b;
+	__wait(&w);
+	put_task_struct(current);
 
-	return r;
+	return 0;
 }
 
-/*----------------------------------------------------------------
- * Low level block management
- *--------------------------------------------------------------*/
-
-static struct kmem_cache *dm_block_cache;  /* struct dm_block */
-
-static struct dm_block *alloc_block(struct dm_block_manager *bm)
+static void bl_up_write(struct block_lock *lock)
 {
-	struct dm_block *b = kmem_cache_alloc(dm_block_cache, GFP_KERNEL);
-
-	if (!b)
-		return NULL;
-
-	INIT_LIST_HEAD(&b->list);
-	INIT_HLIST_NODE(&b->hlist);
+	spin_lock(&lock->lock);
+	__del_holder(lock, current);
+	lock->count = 0;
+	if (!list_empty(&lock->waiters))
+		__wake_many(lock);
+	spin_unlock(&lock->lock);
+}
 
-	b->data = kmem_cache_alloc(bm->buffer_cache, GFP_KERNEL);
-	if (!b->data) {
-		kmem_cache_free(dm_block_cache, b);
-		return NULL;
-	}
+static void report_recursive_bug(dm_block_t b, int r)
+{
+	if (r == -EINVAL)
+		DMERR("recursive acquisition of block %llu requested.",
+		      (unsigned long long) b);
+}
 
-	b->validator = NULL;
-	b->state = BS_EMPTY;
-	init_waitqueue_head(&b->io_q);
-	b->read_lock_count = 0;
-	b->write_lock_pending = 0;
-	b->io_flags = 0;
-	b->bm = bm;
+/*----------------------------------------------------------------*/
 
-	return b;
+/*
+ * The block manager is currently implemented using dm-bufio.  struct
+ * dm_block_manager and struct dm_block map directly onto a couple of
+ * structs in the bufio interface.  I want to retain the freedom to move
+ * away from bufio in the future.  So these structs are just cast within
+ * this .c file, rather than leaking through to the public interface.
+ */
+static struct dm_buffer *to_buffer(struct dm_block *b)
+{
+	return (struct dm_buffer *) b;
 }
 
-static void free_block(struct dm_block *b)
+static struct dm_bufio_client *to_bufio(struct dm_block_manager *bm)
 {
-	kmem_cache_free(b->bm->buffer_cache, b->data);
-	kmem_cache_free(dm_block_cache, b);
+	return (struct dm_bufio_client *) bm;
 }
 
-static int populate_bm(struct dm_block_manager *bm, unsigned count)
+dm_block_t dm_block_location(struct dm_block *b)
 {
-	int i;
-	LIST_HEAD(bs);
+	return dm_bufio_get_block_number(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_location);
 
-	for (i = 0; i < count; i++) {
-		struct dm_block *b = alloc_block(bm);
-		if (!b) {
-			struct dm_block *tmp;
-			list_for_each_entry_safe(b, tmp, &bs, list)
-				free_block(b);
-			return -ENOMEM;
-		}
+void *dm_block_data(struct dm_block *b)
+{
+	return dm_bufio_get_block_data(to_buffer(b));
+}
+EXPORT_SYMBOL_GPL(dm_block_data);
 
-		list_add(&b->list, &bs);
-	}
+struct buffer_aux {
+	struct dm_block_validator *validator;
+	struct block_lock lock;
+	int write_locked;
+};
 
-	list_replace(&bs, &bm->empty_list);
-	bm->available_count = count;
+static void dm_block_manager_alloc_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+	aux->validator = NULL;
+	bl_init(&aux->lock);
+}
 
-	return 0;
+static void dm_block_manager_write_callback(struct dm_buffer *buf)
+{
+	struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
+	if (aux->validator) {
+		aux->validator->prepare_for_write(aux->validator, (struct dm_block *) buf,
+			 dm_bufio_get_block_size(dm_bufio_get_client(buf)));
+	}
 }
 
 /*----------------------------------------------------------------
  * Public interface
  *--------------------------------------------------------------*/
-static unsigned calc_hash_size(unsigned cache_size)
-{
-	unsigned r = 32;	/* Minimum size is 16 */
-
-	while (r < cache_size)
-		r <<= 1;
-
-	return r >> 1;
-}
-
 struct dm_block_manager *dm_block_manager_create(struct block_device *bdev,
 						 unsigned block_size,
 						 unsigned cache_size,
 						 unsigned max_held_per_thread)
 {
-	unsigned i;
-	unsigned hash_size = calc_hash_size(cache_size);
-	size_t len = sizeof(struct dm_block_manager) +
-		     sizeof(struct hlist_head) * hash_size;
-	struct dm_block_manager *bm;
-
-	bm = kmalloc(len, GFP_KERNEL);
-	if (!bm)
-		return NULL;
-
-	bm->bdev = bdev;
-	bm->cache_size = max(MAX_CACHE_SIZE, cache_size);
-	bm->max_held_per_thread = max_held_per_thread;
-	bm->block_size = block_size;
-	bm->nr_blocks = i_size_read(bdev->bd_inode);
-	do_div(bm->nr_blocks, block_size);
-	init_waitqueue_head(&bm->io_q);
-	spin_lock_init(&bm->lock);
-
-	INIT_LIST_HEAD(&bm->empty_list);
-	INIT_LIST_HEAD(&bm->clean_list);
-	INIT_LIST_HEAD(&bm->dirty_list);
-	INIT_LIST_HEAD(&bm->error_list);
-	bm->error_count = 0;
-	bm->available_count = 0;
-	bm->reading_count = 0;
-	bm->writing_count = 0;
-
-	sprintf(bm->buffer_cache_name, "dm_block_buffer-%d-%d",
-		MAJOR(disk_devt(bdev->bd_disk)),
-		MINOR(disk_devt(bdev->bd_disk)));
-
-	bm->buffer_cache = kmem_cache_create(bm->buffer_cache_name,
-					     block_size, SECTOR_SIZE,
-					     0, NULL);
-	if (!bm->buffer_cache)
-		goto bad_free_bm;
-
-	bm->hash_size = hash_size;
-	bm->hash_mask = hash_size - 1;
-	for (i = 0; i < hash_size; i++)
-		INIT_HLIST_HEAD(bm->buckets + i);
-
-	bm->io = dm_io_client_create();
-	if (!bm->io)
-		goto bad_free_buffer_cache;
-
-	if (populate_bm(bm, cache_size) < 0)
-		goto bad_free_io_client;
-
-	return bm;
-
-bad_free_io_client:
-	dm_io_client_destroy(bm->io);
-bad_free_buffer_cache:
-	kmem_cache_destroy(bm->buffer_cache);
-bad_free_bm:
-	kfree(bm);
-
-	return NULL;
+	return (struct dm_block_manager *)
+		dm_bufio_client_create(bdev, block_size, max_held_per_thread,
+				       sizeof(struct buffer_aux),
+				       dm_block_manager_alloc_callback,
+				       dm_block_manager_write_callback);
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_create);
 
 void dm_block_manager_destroy(struct dm_block_manager *bm)
 {
-	int i;
-	struct dm_block *b, *btmp;
-	struct hlist_node *n, *tmp;
-
-	dm_io_client_destroy(bm->io);
-
-	for (i = 0; i < bm->hash_size; i++)
-		hlist_for_each_entry_safe(b, n, tmp, bm->buckets + i, hlist)
-			free_block(b);
-
-	list_for_each_entry_safe(b, btmp, &bm->empty_list, list)
-		free_block(b);
-
-	kmem_cache_destroy(bm->buffer_cache);
-
-	kfree(bm);
+	return dm_bufio_client_destroy(to_bufio(bm));
 }
 EXPORT_SYMBOL_GPL(dm_block_manager_destroy);
 
 unsigned dm_bm_block_size(struct dm_block_manager *bm)
 {
-	return bm->block_size;
+	return dm_bufio_get_block_size(to_bufio(bm));
 }
 EXPORT_SYMBOL_GPL(dm_bm_block_size);
 
 dm_block_t dm_bm_nr_blocks(struct dm_block_manager *bm)
 {
-	return bm->nr_blocks;
+	return dm_bufio_get_device_size(to_bufio(bm));
 }
 
-static int lock_internal(struct dm_block_manager *bm, dm_block_t block,
-			 int how, int need_read, int can_block,
-			 struct dm_block_validator *v,
-			 struct dm_block **result)
-{
-	int r = 0;
-	struct dm_block *b;
-	unsigned long flags;
-
-	spin_lock_irqsave(&bm->lock, flags);
-retry:
-	b = __find_block(bm, block);
-	if (b) {
-		/*
-		 * The block may be in state BS_READING at this point.
-		 * Which means we're racing for this block against another
-		 * locking op.  This is fine, __wait_read_lockable() below
-		 * will do the right thing.  We do need to be careful
-		 * however that the validator isn't set until the lock is
-		 * full granted, otherwise the other thread could get the
-		 * lock, but this one's validator be used. This situation
-		 * only arises if there's a programming error in the code
-		 * driving bm.
-		 */
-
-		switch (how) {
-		case READ:
-			if (b->write_lock_pending || (b->state != BS_CLEAN &&
-						      b->state != BS_DIRTY &&
-						      b->state != BS_READ_LOCKED)) {
-				if (!can_block) {
-					spin_unlock_irqrestore(&bm->lock, flags);
-					return -EWOULDBLOCK;
-				}
-
-				__wait_read_lockable(b, &flags);
-
-				if (b->where != block)
-					goto retry;
-			}
-			break;
-
-		case WRITE:
-			while (b->state != BS_CLEAN && b->state != BS_DIRTY) {
-				if (!can_block) {
-					spin_unlock_irqrestore(&bm->lock, flags);
-					return -EWOULDBLOCK;
-				}
-
-				b->write_lock_pending++;
-				__wait_unlocked(b, &flags);
-				if (b->where != block)
-					/*
-					 * Recycled blocks have their
-					 * write_lock_pending count reset
-					 * to zero, so no need to undo the
-					 * above increment.
-					 */
-					goto retry;
-				b->write_lock_pending--;
-			}
-			break;
-		}
-
-		if (!need_read)
-			b->validator = v;
-		else {
-			if (b->validator && (v != b->validator)) {
-				DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
-				      b->validator->name, v ? v->name : "NULL",
-				      (unsigned long long)b->where);
-				spin_unlock_irqrestore(&bm->lock, flags);
-				return -EINVAL;
-			}
-
-			if (!b->validator && v) {
-				b->validator = v;
-				r = b->validator->check(b->validator, b, bm->block_size);
-				if (r) {
-					DMERR("%s validator check failed for block %llu",
-					      b->validator->name,
-					      (unsigned long long)b->where);
-					spin_unlock_irqrestore(&bm->lock, flags);
-					return r;
-				}
-			}
-		}
-
-	} else if (!can_block) {
-		r = -EWOULDBLOCK;
-		goto out;
-
-	} else
-		r = __recycle_block(bm, block, need_read, v, flags, &b);
-
-	if (!r) {
-		switch (how) {
-		case READ:
-			b->read_lock_count++;
-
-			if (b->state == BS_DIRTY)
-				__transition(b, BS_READ_LOCKED_DIRTY);
-			else if (b->state == BS_CLEAN)
-				__transition(b, BS_READ_LOCKED);
-			break;
-
-		case WRITE:
-			__transition(b, BS_WRITE_LOCKED);
-			break;
+static int dm_bm_validate_buffer(struct dm_block_manager *bm,
+				 struct dm_buffer *buf,
+				 struct buffer_aux *aux,
+				 struct dm_block_validator *v)
+{
+	if (unlikely(!aux->validator)) {
+		int r;
+		if (!v)
+			return 0;
+		r = v->check(v, (struct dm_block *) buf, dm_bufio_get_block_size(to_bufio(bm)));
+		if (unlikely(r))
+			return r;
+		aux->validator = v;
+	} else {
+		if (unlikely(aux->validator != v)) {
+			DMERR("validator mismatch (old=%s vs new=%s) for block %llu",
+				aux->validator->name, v ? v->name : "NULL",
+				(unsigned long long)
+					dm_bufio_get_block_number(buf));
+			return -EINVAL;
 		}
-
-		*result = b;
 	}
 
-out:
-	spin_unlock_irqrestore(&bm->lock, flags);
-
-	return r;
+	return 0;
 }
-
 int dm_bm_read_lock(struct dm_block_manager *bm, dm_block_t b,
 		    struct dm_block_validator *v,
 		    struct dm_block **result)
 {
-	return lock_internal(bm, b, READ, 1, 1, v, result);
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_read(&aux->lock);
+	if (unlikely(r)) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_bm_read_lock);
 
@@ -849,7 +459,32 @@ int dm_bm_write_lock(struct dm_block_man
 		     dm_block_t b, struct dm_block_validator *v,
 		     struct dm_block **result)
 {
-	return lock_internal(bm, b, WRITE, 1, 1, v, result);
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_read(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+
+	aux->write_locked = 1;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_write(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_bm_write_lock);
 
@@ -857,142 +492,139 @@ int dm_bm_read_try_lock(struct dm_block_
 			dm_block_t b, struct dm_block_validator *v,
 			struct dm_block **result)
 {
-	return lock_internal(bm, b, READ, 1, 0, v, result);
+	struct buffer_aux *aux;
+	void *p;
+	int r;
+
+	p = dm_bufio_get(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+	if (unlikely(!p))
+		return -EWOULDBLOCK;
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_read_nonblock(&aux->lock);
+	if (r < 0) {
+		dm_bufio_release(to_buffer(*result));
+		report_recursive_bug(b, r);
+		return r;
+	}
+	aux->write_locked = 0;
+
+	r = dm_bm_validate_buffer(bm, to_buffer(*result), aux, v);
+	if (unlikely(r)) {
+		bl_up_read(&aux->lock);
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
+
+	return 0;
 }
 
 int dm_bm_write_lock_zero(struct dm_block_manager *bm,
 			  dm_block_t b, struct dm_block_validator *v,
 			  struct dm_block **result)
 {
-	int r = lock_internal(bm, b, WRITE, 0, 1, v, result);
+	int r;
+	struct buffer_aux *aux;
+	void *p;
+
+	p = dm_bufio_new(to_bufio(bm), b, (struct dm_buffer **) result);
+	if (unlikely(IS_ERR(p)))
+		return PTR_ERR(p);
+
+	memset(p, 0, dm_bm_block_size(bm));
+
+	aux = dm_bufio_get_aux_data(to_buffer(*result));
+	r = bl_down_write(&aux->lock);
+	if (r) {
+		dm_bufio_release(to_buffer(*result));
+		return r;
+	}
 
-	if (!r)
-		memset((*result)->data, 0, bm->block_size);
+	aux->write_locked = 1;
+	aux->validator = v;
 
-	return r;
+	return 0;
 }
 
 int dm_bm_unlock(struct dm_block *b)
 {
-	int r = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&b->bm->lock, flags);
-	switch (b->state) {
-	case BS_WRITE_LOCKED:
-		__transition(b, BS_DIRTY);
-		wake_up(&b->io_q);
-		break;
-
-	case BS_READ_LOCKED:
-		if (!--b->read_lock_count) {
-			__transition(b, BS_CLEAN);
-			wake_up(&b->io_q);
-		}
-		break;
+	struct buffer_aux *aux;
+	aux = dm_bufio_get_aux_data(to_buffer(b));
 
-	case BS_READ_LOCKED_DIRTY:
-		if (!--b->read_lock_count) {
-			__transition(b, BS_DIRTY);
-			wake_up(&b->io_q);
-		}
-		break;
+	if (aux->write_locked) {
+		dm_bufio_mark_buffer_dirty(to_buffer(b));
+		bl_up_write(&aux->lock);
+	} else
+		bl_up_read(&aux->lock);
 
-	default:
-		DMERR("block = %llu not locked",
-		      (unsigned long long)b->where);
-		r = -EINVAL;
-		break;
-	}
-	spin_unlock_irqrestore(&b->bm->lock, flags);
+	dm_bufio_release(to_buffer(b));
 
-	return r;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_bm_unlock);
 
-static int __wait_flush(struct dm_block_manager *bm)
+int dm_bm_unlock_move(struct dm_block *b, dm_block_t n)
 {
-	int r = 0;
-	unsigned long flags;
+	struct buffer_aux *aux;
 
-	spin_lock_irqsave(&bm->lock, flags);
-	__wait_all_writes(bm, &flags);
+	aux = dm_bufio_get_aux_data(to_buffer(b));
 
-	if (!list_empty(&bm->error_list)) {
-		r = -EIO;
-		__clear_errors(bm);
-	}
-	spin_unlock_irqrestore(&bm->lock, flags);
+	if (aux->write_locked) {
+		dm_bufio_mark_buffer_dirty(to_buffer(b));
+		bl_up_write(&aux->lock);
+	} else
+		bl_up_read(&aux->lock);
 
-	return r;
+	dm_bufio_release_move(to_buffer(b), n);
+	return 0;
 }
 
 int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
 			   struct dm_block *superblock)
 {
 	int r;
-	unsigned long flags;
 
-	write_all_dirty(bm);
-	r = __wait_flush(bm);
-	if (r)
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+	r = dm_bufio_issue_flush(to_bufio(bm));
+	if (unlikely(r))
 		return r;
-
-	spin_lock_irqsave(&bm->lock, flags);
-	superblock->io_flags = REQ_FUA | REQ_FLUSH;
-	spin_unlock_irqrestore(&bm->lock, flags);
 
 	dm_bm_unlock(superblock);
-	write_all_dirty(bm);
 
-	return __wait_flush(bm);
+	r = dm_bufio_write_dirty_buffers(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+	r = dm_bufio_issue_flush(to_bufio(bm));
+	if (unlikely(r))
+		return r;
+
+	return 0;
 }
 
 int dm_bm_rebind_block_device(struct dm_block_manager *bm,
 			      struct block_device *bdev)
 {
-	unsigned long flags;
-	dm_block_t nr_blocks = i_size_read(bdev->bd_inode);
-
-	do_div(nr_blocks, bm->block_size);
-
-	spin_lock_irqsave(&bm->lock, flags);
-	if (nr_blocks < bm->nr_blocks) {
-		spin_unlock_irqrestore(&bm->lock, flags);
-		return -EINVAL;
-	}
-
-	bm->bdev = bdev;
-	bm->nr_blocks = nr_blocks;
-
 	/*
-	 * Wait for any in-flight io that may be using the old bdev
+	 * !!! FIXME: remove this. It is supposedly unused.
 	 */
-	__wait_all_io(bm, &flags);
-	spin_unlock_irqrestore(&bm->lock, flags);
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_bm_rebind_block_device);
 
-/*----------------------------------------------------------------*/
-
-static int __init init_persistent_data(void)
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
 {
-	dm_block_cache = KMEM_CACHE(dm_block, SLAB_HWCACHE_ALIGN);
-	if (!dm_block_cache)
-		return -ENOMEM;
-
-	return 0;
+	return crc32c(~(u32) 0, data, len) ^ init_xor;
 }
+EXPORT_SYMBOL_GPL(dm_bm_checksum);
 
-static void __exit exit_persistent_data(void)
-{
-	kmem_cache_destroy(dm_block_cache);
-}
+/*----------------------------------------------------------------*/
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_DESCRIPTION("Immutable metadata library for dm");
-module_init(init_persistent_data);
-module_exit(exit_persistent_data);
+
+/*----------------------------------------------------------------*/
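
For readers new to this interface, here is a minimal usage sketch of the public API after the conversion.  It is illustrative only, not part of the patch: example_commit() and the 4096 byte block size are invented, and it assumes (as with mainline dm-bufio) that creation failures come back as ERR_PTR values.  Note the cache_size argument is no longer used by this bufio-based implementation.

#include <linux/err.h>
#include <linux/string.h>
#include <linux/device-mapper.h>
#include "dm-block-manager.h"

static int example_commit(struct block_device *bdev)
{
	struct dm_block_manager *bm;
	struct dm_block *sb;
	int r;

	/* 4KiB blocks, at most one lock held per thread. */
	bm = dm_block_manager_create(bdev, 4096, 16, 1);
	if (IS_ERR(bm))
		return PTR_ERR(bm);

	/* Returns with block 0 zeroed and write locked. */
	r = dm_bm_write_lock_zero(bm, 0, NULL, &sb);
	if (r)
		goto out;

	memset(dm_block_data(sb), 0x5a, 64);	/* mutate the block */

	/* Writes dirty blocks, unlocks sb, then flushes once more. */
	r = dm_bm_flush_and_unlock(bm, sb);
out:
	dm_block_manager_destroy(bm);
	return r;
}
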
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-block-manager.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-block-manager.h
@@ -7,9 +7,8 @@
 #ifndef _LINUX_DM_BLOCK_MANAGER_H
 #define _LINUX_DM_BLOCK_MANAGER_H
 
-#include <linux/blkdev.h>
 #include <linux/types.h>
-#include <linux/crc32c.h>
+#include <linux/blkdev.h>
 
 /*----------------------------------------------------------------*/
 
@@ -17,31 +16,21 @@
  * Block number.
  */
 typedef uint64_t dm_block_t;
-
-/*
- * An opaque handle to a block of data.
- */
 struct dm_block;
 
 dm_block_t dm_block_location(struct dm_block *b);
 void *dm_block_data(struct dm_block *b);
 
-/*
- * Use CRC32 checksumming on data blocks.
- */
-static inline uint32_t dm_block_csum_data(const void *data_le, unsigned length)
-{
-	return crc32c(~(u32)0, data_le, length);
-}
-
 /*----------------------------------------------------------------*/
 
-struct dm_block_manager;
-
 /*
+ * @cache_size is a hint for the size of the buffer cache; the current
+ * implementation may ignore it.
+ *
  * @max_held_per_thread should be the maximum number of locks, read or
  * write, that an individual thread holds at any one time.
  */
+struct dm_block_manager;
 struct dm_block_manager *dm_block_manager_create(
 	struct block_device *bdev, unsigned block_size,
 	unsigned cache_size, unsigned max_held_per_thread);
@@ -108,6 +97,14 @@ int dm_bm_write_lock_zero(struct dm_bloc
 int dm_bm_unlock(struct dm_block *b);
 
 /*
+ * An optimisation; we often want to copy a block's contents to a new
+ * block, e.g. as part of the shadowing operation.  It's far better for
+ * bufio to do this move behind the scenes than hold 2 locks and memcpy the
+ * data.
+ */
+int dm_bm_unlock_move(struct dm_block *b, dm_block_t n);
+
+/*
  * It's a common idiom to have a superblock that should be committed last.
  *
  * @superblock should be write-locked on entry. It will be unlocked during
@@ -131,4 +128,8 @@ int dm_bm_flush_and_unlock(struct dm_blo
 int dm_bm_rebind_block_device(struct dm_block_manager *bm,
 			      struct block_device *bdev);
 
+u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
+
+/*----------------------------------------------------------------*/
+
 #endif	/* _LINUX_DM_BLOCK_MANAGER_H */
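
Since dm_bm_checksum() is new here, the following sketch shows how a client validator typically uses it, modelled on the btree node validator later in this patch.  The example_* names and the EXAMPLE_CSUM_XOR seed are hypothetical; each block type picks its own XOR seed so a block of one type fails the checksum of another.

#include "dm-block-manager.h"

#define EXAMPLE_CSUM_XOR 160774	/* hypothetical per-type seed */

struct example_header {
	__le32 csum;	/* checksum covers everything after this field */
	__le32 flags;
};

static void example_prepare_for_write(struct dm_block_validator *v,
				      struct dm_block *b, size_t block_size)
{
	struct example_header *h = dm_block_data(b);

	h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
					     block_size - sizeof(__le32),
					     EXAMPLE_CSUM_XOR));
}

static int example_check(struct dm_block_validator *v,
			 struct dm_block *b, size_t block_size)
{
	struct example_header *h = dm_block_data(b);
	__le32 csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
						      block_size - sizeof(__le32),
						      EXAMPLE_CSUM_XOR));

	return csum_disk == h->csum ? 0 : -EILSEQ;
}

static struct dm_block_validator example_validator = {
	.name = "example",
	.prepare_for_write = example_prepare_for_write,
	.check = example_check,
};

Such a validator is then passed to dm_bm_read_lock() and friends, which invoke check() after a read and prepare_for_write() before writeback.
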
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-internal.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-internal.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-internal.h
@@ -79,7 +79,7 @@ void init_shadow_spine(struct shadow_spi
 int exit_shadow_spine(struct shadow_spine *s);
 
 int shadow_step(struct shadow_spine *s, dm_block_t b,
-		struct dm_btree_value_type *vt, int *inc);
+		struct dm_btree_value_type *vt);
 
 /*
  * The spine must have at least one entry before calling this.
@@ -108,8 +108,12 @@ static inline void *value_base(struct no
 	return &n->keys[le32_to_cpu(n->header.max_entries)];
 }
 
+/*
+ * FIXME: Now that the value size is stored in the node we don't need the third param.
+ */
 static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
 {
+	BUG_ON(value_size != le32_to_cpu(n->header.value_size));
 	return value_base(n) + (value_size * index);
 }
 
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-remove.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-remove.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-remove.c
@@ -56,54 +56,64 @@
 static void node_shift(struct node *n, int shift)
 {
 	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
+	uint32_t value_size = le32_to_cpu(n->header.value_size);
 
 	if (shift < 0) {
 		shift = -shift;
+		BUG_ON(shift > nr_entries);
+		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size));
 		memmove(key_ptr(n, 0),
 			key_ptr(n, shift),
 			(nr_entries - shift) * sizeof(__le64));
-		memmove(value_ptr(n, 0, sizeof(__le64)),
-			value_ptr(n, shift, sizeof(__le64)),
-			(nr_entries - shift) * sizeof(__le64));
+		memmove(value_ptr(n, 0, value_size),
+			value_ptr(n, shift, value_size),
+			(nr_entries - shift) * value_size);
 	} else {
+		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
 		memmove(key_ptr(n, shift),
 			key_ptr(n, 0),
 			nr_entries * sizeof(__le64));
-		memmove(value_ptr(n, shift, sizeof(__le64)),
-			value_ptr(n, 0, sizeof(__le64)),
-			nr_entries * sizeof(__le64));
+		memmove(value_ptr(n, shift, value_size),
+			value_ptr(n, 0, value_size),
+			nr_entries * value_size);
 	}
 }
 
 static void node_copy(struct node *left, struct node *right, int shift)
 {
 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
+	uint32_t value_size = le32_to_cpu(left->header.value_size);
+	BUG_ON(value_size != le32_to_cpu(right->header.value_size));
 
 	if (shift < 0) {
 		shift = -shift;
+		BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
 		memcpy(key_ptr(left, nr_left),
 		       key_ptr(right, 0),
 		       shift * sizeof(__le64));
-		memcpy(value_ptr(left, nr_left, sizeof(__le64)),
-		       value_ptr(right, 0, sizeof(__le64)),
-		       shift * sizeof(__le64));
+		memcpy(value_ptr(left, nr_left, value_size),
+		       value_ptr(right, 0, value_size),
+		       shift * value_size);
 	} else {
+		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
 		memcpy(key_ptr(right, 0),
 		       key_ptr(left, nr_left - shift),
 		       shift * sizeof(__le64));
-		memcpy(value_ptr(right, 0, sizeof(__le64)),
-		       value_ptr(left, nr_left - shift, sizeof(__le64)),
-		       shift * sizeof(__le64));
+		memcpy(value_ptr(right, 0, value_size),
+		       value_ptr(left, nr_left - shift, value_size),
+		       shift * value_size);
 	}
 }
 
 /*
  * Delete a specific entry from a leaf node.
  */
-static void delete_at(struct node *n, unsigned index, size_t value_size)
+static void delete_at(struct node *n, unsigned index)
 {
 	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
 	unsigned nr_to_copy = nr_entries - (index + 1);
+	uint32_t value_size = le32_to_cpu(n->header.value_size);
+	BUG_ON(index >= nr_entries);
 
 	if (nr_to_copy) {
 		memmove(key_ptr(n, index),
@@ -165,6 +175,9 @@ static int init_child(struct dm_btree_in
 	if (inc)
 		inc_children(info->tm, result->n, &le64_type);
 
+	*((__le64 *) value_ptr(parent, index, sizeof(__le64))) =
+		cpu_to_le64(dm_block_location(result->block));
+
 	return 0;
 }
 
@@ -188,9 +201,11 @@ static void shift(struct node *left, str
 
 	left->header.nr_entries =
 		cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count);
+	BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
 
 	right->header.nr_entries =
 		cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
+	BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
 }
 
 static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -207,10 +222,7 @@ static void __rebalance2(struct dm_btree
 		 */
 		node_copy(left, right, -nr_right);
 		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
-
-		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
-			cpu_to_le64(dm_block_location(l->block));
-		delete_at(parent, r->index, sizeof(__le64));
+		delete_at(parent, r->index);
 
 		/*
 		 * We need to decrement the right block, but not it's
@@ -222,12 +234,10 @@ static void __rebalance2(struct dm_btree
 		 * Rebalance.
 		 */
 		unsigned target_left = (nr_left + nr_right) / 2;
-
+		unsigned shift_ = nr_left - target_left;
+		BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
+		BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
 		shift(left, right, nr_left - target_left);
-		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
-			cpu_to_le64(dm_block_location(l->block));
-		*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
-			cpu_to_le64(dm_block_location(r->block));
 		*key_ptr(parent, r->index) = right->keys[0];
 	}
 }
@@ -259,11 +269,7 @@ static int rebalance2(struct shadow_spin
 		return r;
 	}
 
-	r = exit_child(info, &right);
-	if (r)
-		return r;
-
-	return 0;
+	return exit_child(info, &right);
 }
 
 static void __rebalance3(struct dm_btree_info *info, struct node *parent,
@@ -280,6 +286,9 @@ static void __rebalance3(struct dm_btree
 
 	unsigned target;
 
+	BUG_ON(left->header.max_entries != center->header.max_entries);
+	BUG_ON(center->header.max_entries != right->header.max_entries);
+
 	if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) {
 		/*
 		 * Delete center node:
@@ -290,23 +299,20 @@ static void __rebalance3(struct dm_btree
 		 */
 		unsigned shift = min(max_entries - nr_left, nr_center);
 
+		BUG_ON(nr_left + shift > max_entries);
 		node_copy(left, center, -shift);
 		left->header.nr_entries = cpu_to_le32(nr_left + shift);
 
 		if (shift != nr_center) {
 			shift = nr_center - shift;
+			BUG_ON((nr_right + shift) >= max_entries);
 			node_shift(right, shift);
 			node_copy(center, right, shift);
 			right->header.nr_entries = cpu_to_le32(nr_right + shift);
 		}
-
-		*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
-			cpu_to_le64(dm_block_location(l->block));
-		*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
-			cpu_to_le64(dm_block_location(r->block));
 		*key_ptr(parent, r->index) = right->keys[0];
 
-		delete_at(parent, c->index, sizeof(__le64));
+		delete_at(parent, c->index);
 		r->index--;
 
 		dm_tm_dec(info->tm, dm_block_location(c->block));
@@ -319,7 +325,7 @@ static void __rebalance3(struct dm_btree
 	 * Rebalance
 	 */
 	target = (nr_left + nr_center + nr_right) / 3;
-	BUG_ON(target == nr_center);
+	BUG_ON(target > max_entries);
 
 	/*
 	 * Adjust the left node
@@ -330,14 +336,6 @@ static void __rebalance3(struct dm_btree
 	 * Adjust the right node
 	 */
 	shift(center, right, target - nr_right);
-
-	*((__le64 *) value_ptr(parent, l->index, sizeof(__le64))) =
-		cpu_to_le64(dm_block_location(l->block));
-	*((__le64 *) value_ptr(parent, c->index, sizeof(__le64))) =
-		cpu_to_le64(dm_block_location(c->block));
-	*((__le64 *) value_ptr(parent, r->index, sizeof(__le64))) =
-		cpu_to_le64(dm_block_location(r->block));
-
 	*key_ptr(parent, c->index) = center->keys[0];
 	*key_ptr(parent, r->index) = right->keys[0];
 }
@@ -428,9 +426,11 @@ static int rebalance_children(struct sha
 		memcpy(n, dm_block_data(child),
 		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
 		r = dm_tm_unlock(info->tm, child);
-		dm_tm_dec(info->tm, dm_block_location(child));
+		if (r)
+			return r;
 
-		return r;
+		dm_tm_dec(info->tm, dm_block_location(child));
+		return 0;
 	}
 
 	i = lower_bound(n, key);
@@ -444,9 +444,8 @@ static int rebalance_children(struct sha
 	if (child_entries > del_threshold(n))
 		return 0;
 
-	has_left_sibling = i > 0 ? 1 : 0;
-	has_right_sibling =
-		(i >= (le32_to_cpu(n->header.nr_entries) - 1)) ? 0 : 1;
+	has_left_sibling = i > 0;
+	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
 
 	if (!has_left_sibling)
 		r = rebalance2(s, info, i);
@@ -476,17 +475,17 @@ static int do_leaf(struct node *n, uint6
 
 /*
  * Prepares for removal from one level of the hierarchy.  The caller must
- * actually call delete_at() to remove the entry at index.
+ * call delete_at() to remove the entry at index.
  */
 static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
 		      struct dm_btree_value_type *vt, dm_block_t root,
 		      uint64_t key, unsigned *index)
 {
-	int i = *index, inc, r;
+	int i = *index, r;
 	struct node *n;
 
 	for (;;) {
-		r = shadow_step(s, root, vt, &inc);
+		r = shadow_step(s, root, vt);
 		if (r < 0)
 			break;
 
@@ -497,13 +496,11 @@ static int remove_raw(struct shadow_spin
 		 */
 		if (shadow_has_parent(s)) {
 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
-			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)),
+			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)),
 			       &location, sizeof(__le64));
 		}
 
 		n = dm_block_data(shadow_current(s));
-		if (inc)
-			inc_children(info->tm, n, vt);
 
 		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
 			return do_leaf(n, key, index);
@@ -558,12 +555,10 @@ int dm_btree_remove(struct dm_btree_info
 			info->value_type.dec(info->value_type.context,
 					     value_ptr(n, index, info->value_type.size));
 
-		delete_at(n, index, info->value_type.size);
-
-		r = 0;
-		*new_root = shadow_root(&spine);
+		delete_at(n, index);
 	}
 
+	*new_root = shadow_root(&spine);
 	exit_shadow_spine(&spine);
 
 	return r;
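
The sign convention of node_shift() above is easy to misread, so here is a userspace toy of the key movement (a sketch; in the real code the value array moves in lockstep and the caller adjusts nr_entries):

#include <stdint.h>
#include <string.h>

/* Toy model of node_shift(): keys only, no bounds checks. */
static void toy_shift(uint64_t *keys, unsigned nr_entries, int shift)
{
	if (shift < 0) {
		/* Negative: drop the first -shift entries, close the gap. */
		shift = -shift;
		memmove(keys, keys + shift,
			(nr_entries - shift) * sizeof(*keys));
	} else {
		/* Positive: open a gap of 'shift' slots at the front. */
		memmove(keys + shift, keys, nr_entries * sizeof(*keys));
	}
}

With keys = {10, 20, 30, 40} and nr_entries = 4, toy_shift(keys, 4, -2) leaves {30, 40, ...} ready for nr_entries = 2, while toy_shift(keys, 4, +2) (given capacity for six) yields {?, ?, 10, 20, 30, 40} so node_copy() can fill the gap from a sibling.
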
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree-spine.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree-spine.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree-spine.c
@@ -13,6 +13,12 @@
 
 /*----------------------------------------------------------------*/
 
+#define BTREE_CSUM_XOR 121107
+
+static int node_check(struct dm_block_validator *v,
+		      struct dm_block *b,
+		      size_t block_size);
+
 static void node_prepare_for_write(struct dm_block_validator *v,
 				   struct dm_block *b,
 				   size_t block_size)
@@ -21,7 +27,11 @@ static void node_prepare_for_write(struc
 	struct node_header *h = &n->header;
 
 	h->blocknr = cpu_to_le64(dm_block_location(b));
-	h->csum = cpu_to_le32(dm_block_csum_data(&h->flags, block_size - sizeof(__le32)));
+	h->csum = cpu_to_le32(dm_bm_checksum(&h->flags,
+					     block_size - sizeof(__le32),
+					     BTREE_CSUM_XOR));
+
+	BUG_ON(node_check(v, b, block_size));
 }
 
 static int node_check(struct dm_block_validator *v,
@@ -32,6 +42,7 @@ static int node_check(struct dm_block_va
 	struct node_header *h = &n->header;
 	size_t value_size;
 	__le32 csum_disk;
+	uint32_t flags;
 
 	if (dm_block_location(b) != le64_to_cpu(h->blocknr)) {
 		DMERR("node_check failed blocknr %llu wanted %llu",
@@ -39,7 +50,9 @@ static int node_check(struct dm_block_va
 		return -ENOTBLK;
 	}
 
-	csum_disk = cpu_to_le32(dm_block_csum_data(&h->flags, block_size - sizeof(__le32)));
+	csum_disk = cpu_to_le32(dm_bm_checksum(&h->flags,
+					       block_size - sizeof(__le32),
+					       BTREE_CSUM_XOR));
 	if (csum_disk != h->csum) {
 		DMERR("node_check failed csum %u wanted %u",
 		      le32_to_cpu(csum_disk), le32_to_cpu(h->csum));
@@ -59,6 +72,15 @@ static int node_check(struct dm_block_va
 		return -EILSEQ;
 	}
 
+	/*
+	 * The node must be either INTERNAL or LEAF.
+	 */
+	flags = le32_to_cpu(h->flags);
+	if (!(flags & INTERNAL_NODE) && !(flags & LEAF_NODE)) {
+		DMERR("node_check failed, node is neither INTERNAL nor LEAF");
+		return -EILSEQ;
+	}
+
 	return 0;
 }
 
@@ -78,13 +100,13 @@ static int bn_read_lock(struct dm_btree_
 
 static int bn_shadow(struct dm_btree_info *info, dm_block_t orig,
 	      struct dm_btree_value_type *vt,
-	      struct dm_block **result, int *inc)
+	      struct dm_block **result)
 {
-	int r;
+	int r, inc;
 
 	r = dm_tm_shadow_block(info->tm, orig, &btree_node_validator,
-			       result, inc);
-	if (!r && *inc)
+			       result, &inc);
+	if (!r && inc)
 		inc_children(info->tm, dm_block_data(*result), vt);
 
 	return r;
@@ -174,7 +196,7 @@ int exit_shadow_spine(struct shadow_spin
 }
 
 int shadow_step(struct shadow_spine *s, dm_block_t b,
-		struct dm_btree_value_type *vt, int *inc)
+		struct dm_btree_value_type *vt)
 {
 	int r;
 
@@ -186,7 +208,7 @@ int shadow_step(struct shadow_spine *s, 
 		s->count--;
 	}
 
-	r = bn_shadow(s->info, b, vt, s->nodes + s->count, inc);
+	r = bn_shadow(s->info, b, vt, s->nodes + s->count);
 	if (!r) {
 		if (!s->count)
 			s->root = dm_block_location(s->nodes[0]);
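
The shadow_step() signature change ripples through the btree code.  Below is a sketch of the new calling convention, modelled on remove_raw() in dm-btree-remove.c (example_step() and the parent index 'i' are illustrative): the child reference count increments now happen inside bn_shadow(), so the caller's only remaining job is to re-point the parent at the shadowed block.

#include <linux/string.h>
#include "dm-btree-internal.h"

static int example_step(struct shadow_spine *s,
			struct dm_btree_value_type *vt,
			dm_block_t root, unsigned i)
{
	int r = shadow_step(s, root, vt);
	if (r < 0)
		return r;

	/*
	 * Re-point the parent at the shadow; any inc/dec of child
	 * reference counts already happened inside bn_shadow().
	 */
	if (shadow_has_parent(s)) {
		__le64 location =
			cpu_to_le64(dm_block_location(shadow_current(s)));

		memcpy(value_ptr(dm_block_data(shadow_parent(s)), i,
				 sizeof(__le64)),
		       &location, sizeof(__le64));
	}

	return 0;
}
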
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree.c
@@ -119,7 +119,7 @@ static uint32_t calc_max_entries(size_t 
 	return 3 * n;
 }
 
-int dm_btree_create(struct dm_btree_info *info, dm_block_t *root)
+int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
 {
 	int r;
 	struct dm_block *b;
@@ -142,10 +142,9 @@ int dm_btree_create(struct dm_btree_info
 	n->header.value_size = cpu_to_le32(info->value_type.size);
 
 	*root = dm_block_location(b);
-
 	return unlock_block(info, b);
 }
-EXPORT_SYMBOL_GPL(dm_btree_create);
+EXPORT_SYMBOL_GPL(dm_btree_empty);
 
 /*----------------------------------------------------------------*/
 
@@ -201,7 +200,7 @@ static int push_frame(struct del_stack *
 
 	if (ref_count > 1)
 		/*
 		 * This is a shared node, so we can just decrement its
 		 * reference counter and leave the children.
 		 */
 		dm_tm_dec(s->tm, b);
@@ -232,7 +231,7 @@ static void pop_frame(struct del_stack *
 	dm_tm_unlock(s->tm, f->b);
 }
 
-int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root)
+int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 {
 	int r;
 	struct del_stack *s;
@@ -240,7 +239,6 @@ int dm_btree_destroy(struct dm_btree_inf
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
-
 	s->tm = info->tm;
 	s->top = -1;
 
@@ -293,16 +291,7 @@ out:
 	kfree(s);
 	return r;
 }
-EXPORT_SYMBOL_GPL(dm_btree_destroy);
-
-// FIXME Implement or remove this fn before final submission.
-int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key,
-		    dm_block_t *new_root)
-{
-	/* FIXME: implement */
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dm_btree_delete_gt);
+EXPORT_SYMBOL_GPL(dm_btree_del);
 
 /*----------------------------------------------------------------*/
 
@@ -587,17 +576,15 @@ static int btree_insert_raw(struct shado
 			    struct dm_btree_value_type *vt,
 			    uint64_t key, unsigned *index)
 {
-	int r, i = *index, inc, top = 1;
+	int r, i = *index, top = 1;
 	struct node *node;
 
 	for (;;) {
-		r = shadow_step(s, root, vt, &inc);
+		r = shadow_step(s, root, vt);
 		if (r < 0)
 			return r;
 
 		node = dm_block_data(shadow_current(s));
-		if (inc)
-			inc_children(s->info->tm, node, vt);
 
 		/*
 		 * We have to patch up the parent node, ugly, but I don't
@@ -644,13 +631,6 @@ static int btree_insert_raw(struct shado
 	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
 		i++;
 
-	/* we're about to overwrite this value, so undo the increment for it */
-	/* FIXME: shame that inc information is leaking outside the spine.
-	 * Plus inc is just plain wrong in the event of a split */
-	if (le64_to_cpu(node->keys[i]) == key && inc)
-		if (vt->dec)
-			vt->dec(vt->context, value_ptr(node, i, vt->size));
-
 	*index = i;
 	return 0;
 }
@@ -688,7 +668,7 @@ static int insert(struct dm_btree_info *
 			dm_block_t new_tree;
 			__le64 new_le;
 
-			r = dm_btree_create(info, &new_tree);
+			r = dm_btree_empty(info, &new_tree);
 			if (r < 0)
 				goto bad;
 
@@ -770,42 +750,6 @@ EXPORT_SYMBOL_GPL(dm_btree_insert_notify
 
 /*----------------------------------------------------------------*/
 
-int dm_btree_clone(struct dm_btree_info *info, dm_block_t root,
-		   dm_block_t *clone)
-{
-	int r;
-	struct dm_block *b, *orig_b;
-	struct node *b_node, *orig_node;
-
-	/* Copy the root node */
-	r = new_block(info, &b);
-	if (r < 0)
-		return r;
-
-	r = dm_tm_read_lock(info->tm, root, &btree_node_validator, &orig_b);
-	if (r < 0) {
-		dm_block_t location = dm_block_location(b);
-
-		unlock_block(info, b);
-		dm_tm_dec(info->tm, location);
-	}
-
-	*clone = dm_block_location(b);
-	b_node = dm_block_data(b);
-	orig_node = dm_block_data(orig_b);
-
-	memcpy(b_node, orig_node,
-	       dm_bm_block_size(dm_tm_get_bm(info->tm)));
-	dm_tm_unlock(info->tm, orig_b);
-	inc_children(info->tm, b_node, &info->value_type);
-	dm_tm_unlock(info->tm, b);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dm_btree_clone);
-
-/*----------------------------------------------------------------*/
-
 static int find_highest_key(struct ro_spine *s, dm_block_t block,
 			    uint64_t *result_key, dm_block_t *next_block)
 {
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-btree.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-btree.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-btree.h
@@ -91,21 +91,13 @@ struct dm_btree_info {
 /*
  * Set up an empty tree.  O(1).
  */
-int dm_btree_create(struct dm_btree_info *info, dm_block_t *root);
+int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root);
 
 /*
- * Destroy a tree.  O(n) - this is the slow one!  It can also block, so
+ * Delete a tree.  O(n) - this is the slow one!  It can also block, so
  * please don't call it on an IO path.
  */
-int dm_btree_destroy(struct dm_btree_info *info, dm_block_t root);
-
-/*
- * Delete part of a tree.  This is really specific to truncation of
- * thin devices.  It only removes keys from the bottom level-btree that
- * are greater than key[info->levels - 1].
- */
-int dm_btree_delete_gt(struct dm_btree_info *info, dm_block_t root, uint64_t *key,
-		    dm_block_t *new_root);
+int dm_btree_del(struct dm_btree_info *info, dm_block_t root);
 
 /*
  * All the lookup functions return -ENODATA if the key cannot be found.
@@ -143,11 +135,6 @@ int dm_btree_remove(struct dm_btree_info
 		    uint64_t *keys, dm_block_t *new_root);
 
 /*
- * Clone a tree. O(1)
- */
-int dm_btree_clone(struct dm_btree_info *info, dm_block_t root, dm_block_t *clone);
-
-/*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have
  * no highest key.
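
A minimal sketch of the renamed interface (the dm_btree_info setup is
elided; only dm_btree_empty() and dm_btree_del() are from this patch):

	dm_block_t root;

	r = dm_btree_empty(&info, &root);	/* was dm_btree_create() */
	if (r)
		return r;

	/* ... insert/lookup/remove ... */

	r = dm_btree_del(&info, root);		/* was dm_btree_destroy() */
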
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.c
===================================================================
--- /dev/null
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.c
@@ -0,0 +1,437 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-space-map-checker.h"
+
+#include <linux/device-mapper.h>
+
+#ifdef CONFIG_DM_DEBUG_SPACE_MAPS
+
+#define DM_MSG_PREFIX "space map checker"
+
+/*----------------------------------------------------------------*/
+
+struct count_array {
+	dm_block_t nr;
+	dm_block_t nr_free;
+
+	uint32_t *counts;
+};
+
+static int ca_get_count(struct count_array *ca, dm_block_t b, uint32_t *count)
+{
+	if (b >= ca->nr)
+		return -EINVAL;
+
+	*count = ca->counts[b];
+	return 0;
+}
+
+static int ca_count_more_than_one(struct count_array *ca, dm_block_t b, int *r)
+{
+	if (b >= ca->nr)
+		return -EINVAL;
+
+	*r = ca->counts[b] > 1;
+	return 0;
+}
+
+static int ca_set_count(struct count_array *ca, dm_block_t b, uint32_t count)
+{
+	uint32_t old_count;
+
+	if (b >= ca->nr)
+		return -EINVAL;
+
+	old_count = ca->counts[b];
+
+	if (!count && old_count)
+		ca->nr_free++;
+
+	else if (count && !old_count)
+		ca->nr_free--;
+
+	ca->counts[b] = count;
+	return 0;
+}
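+
+/*
+ * A minimal sketch of the nr_free bookkeeping above, with hypothetical
+ * starting values:
+ *
+ *	ca->counts[b] == 0, ca->nr_free == 10
+ *	ca_set_count(ca, b, 2);		0 -> 2: nr_free becomes 9
+ *	ca_set_count(ca, b, 1);		2 -> 1: nr_free unchanged
+ *	ca_set_count(ca, b, 0);		1 -> 0: nr_free back to 10
+ */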
+
+static int ca_inc_block(struct count_array *ca, dm_block_t b)
+{
+	if (b >= ca->nr)
+		return -EINVAL;
+
+	ca_set_count(ca, b, ca->counts[b] + 1);
+	return 0;
+}
+
+static int ca_dec_block(struct count_array *ca, dm_block_t b)
+{
+	if (b >= ca->nr)
+		return -EINVAL;
+
+	BUG_ON(ca->counts[b] == 0);
+	ca_set_count(ca, b, ca->counts[b] - 1);
+	return 0;
+}
+
+static int ca_create(struct count_array *ca, struct dm_space_map *sm)
+{
+	int r;
+	dm_block_t nr_blocks;
+
+	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
+	if (r)
+		return r;
+
+	ca->nr = nr_blocks;
+	ca->nr_free = nr_blocks;
+	ca->counts = kzalloc(sizeof(*ca->counts) * nr_blocks, GFP_KERNEL);
+	if (!ca->counts)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int ca_load(struct count_array *ca, struct dm_space_map *sm)
+{
+	int r;
+	uint32_t count;
+	dm_block_t nr_blocks, i;
+
+	r = dm_sm_get_nr_blocks(sm, &nr_blocks);
+	if (r)
+		return r;
+
+	BUG_ON(ca->nr != nr_blocks);
+
+	DMWARN("Loading debug space map from disk.  This may take some time");
+	for (i = 0; i < nr_blocks; i++) {
+		r = dm_sm_get_count(sm, i, &count);
+		if (r) {
+			DMERR("load failed");
+			return r;
+		}
+
+		ca_set_count(ca, i, count);
+	}
+	DMWARN("Load complete");
+
+	return 0;
+}
+
+static int ca_extend(struct count_array *ca, dm_block_t extra_blocks)
+{
+	dm_block_t nr_blocks = ca->nr + extra_blocks;
+	uint32_t *counts = kzalloc(sizeof(*counts) * nr_blocks, GFP_KERNEL);
+	if (!counts)
+		return -ENOMEM;
+
+	memcpy(counts, ca->counts, sizeof(*counts) * ca->nr);
+	kfree(ca->counts);
+	ca->nr = nr_blocks;
+	ca->nr_free += extra_blocks;
+	ca->counts = counts;
+	return 0;
+}
+
+static int ca_commit(struct count_array *old, struct count_array *new)
+{
+	if (old->nr != new->nr) {
+		int r;
+
+		BUG_ON(old->nr > new->nr);
+		r = ca_extend(old, new->nr - old->nr);
+		if (r)
+			return r;
+	}
+
+	BUG_ON(old->nr != new->nr);
+	old->nr_free = new->nr_free;
+	memcpy(old->counts, new->counts, sizeof(*old->counts) * old->nr);
+	return 0;
+}
+
+static void ca_destroy(struct count_array *ca)
+{
+	kfree(ca->counts);
+}
+
+/*----------------------------------------------------------------*/
+
+struct sm_checker {
+	struct dm_space_map sm;
+
+	struct count_array old_counts;
+	struct count_array counts;
+
+	struct dm_space_map *real_sm;
+};
+
+static void sm_checker_destroy(struct dm_space_map *sm)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+
+	dm_sm_destroy(smc->real_sm);
+	ca_destroy(&smc->old_counts);
+	ca_destroy(&smc->counts);
+	kfree(smc);
+}
+
+static int sm_checker_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_get_nr_blocks(smc->real_sm, count);
+	if (!r)
+		BUG_ON(smc->old_counts.nr != *count);
+	return r;
+}
+
+static int sm_checker_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_get_nr_free(smc->real_sm, count);
+	if (!r) {
+		/*
+		 * Slow, but we know it's correct.
+		 */
+		dm_block_t b, n = 0;
+		for (b = 0; b < smc->old_counts.nr; b++)
+			if (smc->old_counts.counts[b] == 0 &&
+			    smc->counts.counts[b] == 0)
+				n++;
+
+		if (n != *count)
+			DMERR("free block counts differ, checker %u, sm-disk:%u",
+			      (unsigned) n, (unsigned) *count);
+	}
+	return r;
+}
+
+static int sm_checker_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_new_block(smc->real_sm, b);
+
+	if (!r) {
+		BUG_ON(*b >= smc->old_counts.nr);
+		BUG_ON(smc->old_counts.counts[*b] != 0);
+		BUG_ON(*b >= smc->counts.nr);
+		BUG_ON(smc->counts.counts[*b] != 0);
+		ca_set_count(&smc->counts, *b, 1);
+	}
+
+	return r;
+}
+
+static int sm_checker_inc_block(struct dm_space_map *sm, dm_block_t b)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_inc_block(smc->real_sm, b);
+	int r2 = ca_inc_block(&smc->counts, b);
+	BUG_ON(r != r2);
+	return r;
+}
+
+static int sm_checker_dec_block(struct dm_space_map *sm, dm_block_t b)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_dec_block(smc->real_sm, b);
+	int r2 = ca_dec_block(&smc->counts, b);
+	BUG_ON(r != r2);
+	return r;
+}
+
+static int sm_checker_get_count(struct dm_space_map *sm, dm_block_t b, uint32_t *result)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	uint32_t result2 = 0;
+	int r = dm_sm_get_count(smc->real_sm, b, result);
+	int r2 = ca_get_count(&smc->counts, b, &result2);
+
+	BUG_ON(r != r2);
+	if (!r)
+		BUG_ON(*result != result2);
+	return r;
+}
+
+static int sm_checker_count_more_than_one(struct dm_space_map *sm, dm_block_t b, int *result)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int result2 = 0;
+	int r = dm_sm_count_is_more_than_one(smc->real_sm, b, result);
+	int r2 = ca_count_more_than_one(&smc->counts, b, &result2);
+
+	BUG_ON(r != r2);
+	if (!r)
+		BUG_ON(!(*result) && result2);
+	return r;
+}
+
+static int sm_checker_set_count(struct dm_space_map *sm, dm_block_t b, uint32_t count)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_set_count(smc->real_sm, b, count);
+	int r2;
+
+	BUG_ON(b >= smc->counts.nr);
+	r2 = ca_set_count(&smc->counts, b, count);
+	BUG_ON(r != r2);
+
+	return r;
+}
+
+static int sm_checker_commit(struct dm_space_map *sm)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r;
+
+	r = dm_sm_commit(smc->real_sm);
+	if (r)
+		return r;
+
+	r = ca_commit(&smc->old_counts, &smc->counts);
+	if (r)
+		return r;
+
+	return 0;
+}
+
+static int sm_checker_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	int r = dm_sm_extend(smc->real_sm, extra_blocks);
+	if (r)
+		return r;
+
+	return ca_extend(&smc->counts, extra_blocks);
+}
+
+static int sm_checker_root_size(struct dm_space_map *sm, size_t *result)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	return dm_sm_root_size(smc->real_sm, result);
+}
+
+static int sm_checker_copy_root(struct dm_space_map *sm, void *copy_to_here_le, size_t len)
+{
+	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);
+	return dm_sm_copy_root(smc->real_sm, copy_to_here_le, len);
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_space_map ops_ = {
+	.destroy = sm_checker_destroy,
+	.get_nr_blocks = sm_checker_get_nr_blocks,
+	.get_nr_free = sm_checker_get_nr_free,
+	.inc_block = sm_checker_inc_block,
+	.dec_block = sm_checker_dec_block,
+	.new_block = sm_checker_new_block,
+	.get_count = sm_checker_get_count,
+	.count_is_more_than_one = sm_checker_count_more_than_one,
+	.set_count = sm_checker_set_count,
+	.commit = sm_checker_commit,
+	.extend = sm_checker_extend,
+	.root_size = sm_checker_root_size,
+	.copy_root = sm_checker_copy_root
+};
+
+struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
+{
+	int r;
+	struct sm_checker *smc;
+
+	if (!sm)
+		return NULL;
+
+	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
+	if (!smc)
+		return NULL;
+
+	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
+	r = ca_create(&smc->old_counts, sm);
+	if (r) {
+		kfree(smc);
+		return NULL;
+	}
+
+	r = ca_create(&smc->counts, sm);
+	if (r) {
+		ca_destroy(&smc->old_counts);
+		kfree(smc);
+		return NULL;
+	}
+
+	smc->real_sm = sm;
+
+	r = ca_load(&smc->counts, sm);
+	if (r) {
+		ca_destroy(&smc->counts);
+		ca_destroy(&smc->old_counts);
+		kfree(smc);
+		return NULL;
+	}
+
+	r = ca_commit(&smc->old_counts, &smc->counts);
+	if (r) {
+		ca_destroy(&smc->counts);
+		ca_destroy(&smc->old_counts);
+		kfree(smc);
+		return NULL;
+	}
+
+	return &smc->sm;
+}
+EXPORT_SYMBOL_GPL(dm_sm_checker_create);
+
+struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
+{
+	int r;
+	struct sm_checker *smc;
+
+	if (!sm)
+		return NULL;
+
+	smc = kmalloc(sizeof(*smc), GFP_KERNEL);
+	if (!smc)
+		return NULL;
+
+	memcpy(&smc->sm, &ops_, sizeof(smc->sm));
+	r = ca_create(&smc->old_counts, sm);
+	if (r) {
+		kfree(smc);
+		return NULL;
+	}
+
+	r = ca_create(&smc->counts, sm);
+	if (r) {
+		ca_destroy(&smc->old_counts);
+		kfree(smc);
+		return NULL;
+	}
+
+	smc->real_sm = sm;
+	return &smc->sm;
+}
+EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
+
+/*----------------------------------------------------------------*/
+
+#else
+
+struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm)
+{
+	return sm;
+}
+EXPORT_SYMBOL_GPL(dm_sm_checker_create);
+
+struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm)
+{
+	return sm;
+}
+EXPORT_SYMBOL_GPL(dm_sm_checker_create_fresh);
+
+/*----------------------------------------------------------------*/
+
+#endif
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.h
===================================================================
--- /dev/null
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-checker.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef SNAPSHOTS_SPACE_MAP_CHECKER_H
+#define SNAPSHOTS_SPACE_MAP_CHECKER_H
+
+#include "dm-space-map.h"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * This space map wraps a real on-disk space map, and verifies all of its
+ * operations.  It uses a lot of memory, so only use if you have a specific
+ * problem that you're debugging.
+ *
+ * Ownership of @sm passes.
+ */
+struct dm_space_map *dm_sm_checker_create(struct dm_space_map *sm);
+struct dm_space_map *dm_sm_checker_create_fresh(struct dm_space_map *sm);
+
+/*----------------------------------------------------------------*/
+
+#endif
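
A minimal usage sketch (error handling reduced to the essentials; note
that on failure the wrapped sm is not destroyed, so the caller still
owns it):

	struct dm_space_map *checked = dm_sm_checker_create(sm);

	if (!checked)
		return -ENOMEM;

	/*
	 * With CONFIG_DM_DEBUG_SPACE_MAPS unset this wrapping is a
	 * no-op and checked == sm; otherwise every dm_sm_* operation
	 * is mirrored against an in-core count_array and any mismatch
	 * triggers a BUG_ON().
	 */
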
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.c
===================================================================
--- /dev/null
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.c
@@ -0,0 +1,704 @@
+/*
+ * Copyright (C) 2011 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-space-map-common.h"
+#include "dm-transaction-manager.h"
+
+#include <linux/bitops.h>
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "space map common"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Index validator.
+ */
+#define INDEX_CSUM_XOR 160478
+
+static void index_prepare_for_write(struct dm_block_validator *v,
+				    struct dm_block *b,
+				    size_t block_size)
+{
+	struct disk_metadata_index *mi_le = dm_block_data(b);
+
+	mi_le->blocknr = cpu_to_le64(dm_block_location(b));
+	mi_le->csum = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
+						 block_size - sizeof(__le32),
+						 INDEX_CSUM_XOR));
+}
+
+static int index_check(struct dm_block_validator *v,
+		       struct dm_block *b,
+		       size_t block_size)
+{
+	struct disk_metadata_index *mi_le = dm_block_data(b);
+	__le32 csum_disk;
+
+	if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
+		DMERR("index_check failed blocknr %llu wanted %llu",
+		      le64_to_cpu(mi_le->blocknr), dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	csum_disk = cpu_to_le32(dm_bm_checksum(&mi_le->padding,
+					       block_size - sizeof(__le32),
+					       INDEX_CSUM_XOR));
+	if (csum_disk != mi_le->csum) {
+		DMERR("index_check failed csum %u wanted %u",
+		      le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
+		return -EILSEQ;
+	}
+
+	return 0;
+}
+
+static struct dm_block_validator index_validator = {
+	.name = "index",
+	.prepare_for_write = index_prepare_for_write,
+	.check = index_check
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Bitmap validator
+ */
+#define BITMAP_CSUM_XOR 240779
+
+static void bitmap_prepare_for_write(struct dm_block_validator *v,
+				     struct dm_block *b,
+				     size_t block_size)
+{
+	struct disk_bitmap_header *disk_header = dm_block_data(b);
+
+	disk_header->blocknr = cpu_to_le64(dm_block_location(b));
+	disk_header->csum = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
+						       block_size - sizeof(__le32),
+						       BITMAP_CSUM_XOR));
+}
+
+static int bitmap_check(struct dm_block_validator *v,
+			struct dm_block *b,
+			size_t block_size)
+{
+	struct disk_bitmap_header *disk_header = dm_block_data(b);
+	__le32 csum_disk;
+
+	if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
+		DMERR("bitmap check failed blocknr %llu wanted %llu",
+		      le64_to_cpu(disk_header->blocknr), dm_block_location(b));
+		return -ENOTBLK;
+	}
+
+	csum_disk = cpu_to_le32(dm_bm_checksum(&disk_header->not_used,
+					       block_size - sizeof(__le32),
+					       BITMAP_CSUM_XOR));
+	if (csum_disk != disk_header->csum) {
+		DMERR("bitmap check failed csum %u wanted %u",
+		      le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
+		return -EILSEQ;
+	}
+
+	return 0;
+}
+
+static struct dm_block_validator dm_sm_bitmap_validator = {
+	.name = "sm_bitmap",
+	.prepare_for_write = bitmap_prepare_for_write,
+	.check = bitmap_check
+};
+
+/*----------------------------------------------------------------*/
+
+#define ENTRIES_PER_WORD 32
+#define ENTRIES_SHIFT	5
+
+static void *dm_bitmap_data(struct dm_block *b)
+{
+	return dm_block_data(b) + sizeof(struct disk_bitmap_header);
+}
+
+#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
+
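+/*
+ * Returns non-zero iff every 2-bit entry in the 64-bit word holding
+ * entry b is non-zero, ie. the whole word is already allocated.
+ */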
+static unsigned bitmap_word_used(void *addr, unsigned b)
+{
+	__le64 *words_le = addr;
+	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+
+	uint64_t bits = le64_to_cpu(*w_le);
+	uint64_t mask = (bits + WORD_MASK_HIGH + 1) & WORD_MASK_HIGH;
+
+	return !(~bits & mask);
+}
+
+static unsigned sm_lookup_bitmap(void *addr, unsigned b)
+{
+	__le64 *words_le = addr;
+	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+
+	b = (b & (ENTRIES_PER_WORD - 1)) << 1;
+
+	return (!!test_bit_le(b, (void *) w_le) << 1) |
+		(!!test_bit_le(b + 1, (void *) w_le));
+}
+
+static void sm_set_bitmap(void *addr, unsigned b, unsigned val)
+{
+	__le64 *words_le = addr;
+	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
+
+	b = (b & (ENTRIES_PER_WORD - 1)) << 1;
+
+	if (val & 2)
+		__set_bit_le(b, (void *) w_le);
+	else
+		__clear_bit_le(b, (void *) w_le);
+
+	if (val & 1)
+		__set_bit_le(b + 1, (void *) w_le);
+	else
+		__clear_bit_le(b + 1, (void *) w_le);
+}
+
+static int sm_find_free(void *addr, unsigned begin, unsigned end,
+			unsigned *result)
+{
+	while (begin < end) {
+		if (!(begin & (ENTRIES_PER_WORD - 1)) &&
+		    bitmap_word_used(addr, begin)) {
+			begin += ENTRIES_PER_WORD;
+			continue;
+		}
+
+		if (!sm_lookup_bitmap(addr, begin)) {
+			*result = begin;
+			return 0;
+		}
+
+		begin++;
+	}
+
+	return -ENOSPC;
+}
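+
+/*
+ * A minimal sketch of the 2-bit packing (entry 7 chosen arbitrarily;
+ * addr is a bitmap block's payload as returned by dm_bitmap_data()):
+ *
+ *	unsigned count = sm_lookup_bitmap(addr, 7);	reads bits 14-15 of word 0
+ *	sm_set_bitmap(addr, 7, 3);			mark entry 7 as overflowed
+ *
+ * Counts 0-2 are stored directly; the value 3 means the real count has
+ * overflowed into the ref count btree (see sm_ll_lookup()).
+ */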
+
+/*----------------------------------------------------------------*/
+
+static int sm_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+	ll->tm = tm;
+
+	ll->bitmap_info.tm = tm;
+	ll->bitmap_info.levels = 1;
+
+	/*
+	 * Because the new bitmap blocks are created via a shadow
+	 * operation, the old entry has already had its reference count
+	 * decremented and we don't need the btree to do any bookkeeping.
+	 */
+	ll->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
+	ll->bitmap_info.value_type.inc = NULL;
+	ll->bitmap_info.value_type.dec = NULL;
+	ll->bitmap_info.value_type.equal = NULL;
+
+	ll->ref_count_info.tm = tm;
+	ll->ref_count_info.levels = 1;
+	ll->ref_count_info.value_type.size = sizeof(uint32_t);
+	ll->ref_count_info.value_type.inc = NULL;
+	ll->ref_count_info.value_type.dec = NULL;
+	ll->ref_count_info.value_type.equal = NULL;
+
+	ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
+
+	if (ll->block_size > (1 << 30)) {
+		DMERR("block size too big to hold bitmaps");
+		return -EINVAL;
+	}
+
+	ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
+		ENTRIES_PER_BYTE;
+	ll->nr_blocks = 0;
+	ll->bitmap_root = 0;
+	ll->ref_count_root = 0;
+
+	return 0;
+}
+
+int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks)
+{
+	int r;
+	dm_block_t i, nr_blocks;
+	unsigned old_blocks, blocks;
+
+	nr_blocks = ll->nr_blocks + extra_blocks;
+	old_blocks = dm_sector_div_up(ll->nr_blocks, ll->entries_per_block);
+	blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
+
+	if (blocks > ll->max_entries(ll)) {
+		DMERR("space map too large");
+		return -EINVAL;
+	}
+
+	for (i = old_blocks; i < blocks; i++) {
+		struct dm_block *b;
+		struct disk_index_entry idx;
+
+		r = dm_tm_new_block(ll->tm, &dm_sm_bitmap_validator, &b);
+		if (r < 0)
+			return r;
+		idx.blocknr = cpu_to_le64(dm_block_location(b));
+
+		r = dm_tm_unlock(ll->tm, b);
+		if (r < 0)
+			return r;
+
+		idx.nr_free = cpu_to_le32(ll->entries_per_block);
+		idx.none_free_before = 0;
+
+		r = ll->save_ie(ll, i, &idx);
+		if (r < 0)
+			return r;
+	}
+
+	ll->nr_blocks = nr_blocks;
+	return 0;
+}
+
+int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+	int r;
+	dm_block_t index = b;
+	struct disk_index_entry ie_disk;
+	struct dm_block *blk;
+
+	b = do_div(index, ll->entries_per_block);
+	r = ll->load_ie(ll, index, &ie_disk);
+	if (r < 0)
+		return r;
+
+	r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
+			    &dm_sm_bitmap_validator, &blk);
+	if (r < 0)
+		return r;
+
+	*result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
+
+	return dm_tm_unlock(ll->tm, blk);
+}
+
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
+{
+	__le32 le_rc;
+	int r = sm_ll_lookup_bitmap(ll, b, result);
+
+	if (r)
+		return r;
+
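+	/*
+	 * A bitmap value of 3 means the count overflowed; the real
+	 * count lives in the ref count btree.
+	 */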
+	if (*result != 3)
+		return r;
+
+	r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
+	if (r < 0)
+		return r;
+
+	*result = le32_to_cpu(le_rc);
+
+	return r;
+}
+
+int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
+			  dm_block_t end, dm_block_t *result)
+{
+	int r;
+	struct disk_index_entry ie_disk;
+	dm_block_t i, index_begin = begin;
+	dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
+
+	/*
+	 * FIXME: Use shifts
+	 */
+	begin = do_div(index_begin, ll->entries_per_block);
+	end = do_div(end, ll->entries_per_block);
+
+	for (i = index_begin; i < index_end; i++, begin = 0) {
+		struct dm_block *blk;
+		unsigned position;
+		uint32_t bit_end;
+
+		r = ll->load_ie(ll, i, &ie_disk);
+		if (r < 0)
+			return r;
+
+		if (le32_to_cpu(ie_disk.nr_free) == 0)
+			continue;
+
+		r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk.blocknr),
+				    &dm_sm_bitmap_validator, &blk);
+		if (r < 0)
+			return r;
+
+		bit_end = (i == index_end - 1) ?  end : ll->entries_per_block;
+
+		r = sm_find_free(dm_bitmap_data(blk),
+				 max_t(unsigned, begin, le32_to_cpu(ie_disk.none_free_before)),
+				 bit_end, &position);
+		if (r == -ENOSPC) {
+			/*
+			 * This might happen because we started searching
+			 * part way through the bitmap.
+			 */
+			dm_tm_unlock(ll->tm, blk);
+			continue;
+
+		} else if (r < 0) {
+			dm_tm_unlock(ll->tm, blk);
+			return r;
+		}
+
+		r = dm_tm_unlock(ll->tm, blk);
+		if (r < 0)
+			return r;
+
+		*result = i * ll->entries_per_block + (dm_block_t) position;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
+		 uint32_t ref_count, enum allocation_event *ev)
+{
+	int r;
+	uint32_t bit, old;
+	struct dm_block *nb;
+	dm_block_t index = b;
+	struct disk_index_entry ie_disk;
+	void *bm_le;
+	int inc;
+
+	bit = do_div(index, ll->entries_per_block);
+	r = ll->load_ie(ll, index, &ie_disk);
+	if (r < 0)
+		return r;
+
+	r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk.blocknr),
+			       &dm_sm_bitmap_validator, &nb, &inc);
+	if (r < 0) {
+		DMERR("dm_tm_shadow_block() failed");
+		return r;
+	}
+	ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
+
+	bm_le = dm_bitmap_data(nb);
+	old = sm_lookup_bitmap(bm_le, bit);
+
+	if (ref_count <= 2) {
+		sm_set_bitmap(bm_le, bit, ref_count);
+
+		r = dm_tm_unlock(ll->tm, nb);
+		if (r < 0)
+			return r;
+
+		if (old > 2) {
+#if 0
+			/* FIXME: bug in dm_btree_remove causes corruption */
+			r = dm_btree_remove(&ll->ref_count_info,
+					    ll->ref_count_root,
+					    &b, &ll->ref_count_root);
+			if (r)
+				return r;
+#endif
+		}
+
+	} else {
+		__le32 le_rc = cpu_to_le32(ref_count);
+
+		sm_set_bitmap(bm_le, bit, 3);
+		r = dm_tm_unlock(ll->tm, nb);
+		if (r < 0)
+			return r;
+
+		__dm_bless_for_disk(&le_rc);
+		r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
+				    &b, &le_rc, &ll->ref_count_root);
+		if (r < 0) {
+			DMERR("ref count insert failed");
+			return r;
+		}
+	}
+
+	if (ref_count && !old) {
+		*ev = SM_ALLOC;
+		ll->nr_allocated++;
+		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
+		if (le32_to_cpu(ie_disk.none_free_before) == bit)
+			ie_disk.none_free_before = cpu_to_le32(bit + 1);
+
+	} else if (old && !ref_count) {
+		*ev = SM_FREE;
+		ll->nr_allocated--;
+		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
+		ie_disk.none_free_before = cpu_to_le32(min(le32_to_cpu(ie_disk.none_free_before), bit));
+	}
+
+	return ll->save_ie(ll, index, &ie_disk);
+}
+
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	int r;
+	uint32_t rc;
+
+	r = sm_ll_lookup(ll, b, &rc);
+	if (r)
+		return r;
+
+	return sm_ll_insert(ll, b, rc + 1, ev);
+}
+
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
+{
+	int r;
+	uint32_t rc;
+
+	r = sm_ll_lookup(ll, b, &rc);
+	if (r)
+		return r;
+
+	if (!rc)
+		return -EINVAL;
+
+	return sm_ll_insert(ll, b, rc - 1, ev);
+}
+
+int sm_ll_commit(struct ll_disk *ll)
+{
+	return ll->commit(ll);
+}
+
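+/*
+ * A sketch of how callers consume the allocation_event out parameter
+ * (as dm-space-map-disk.c does; ll, b and the counter are assumed to
+ * be set up by the caller):
+ *
+ *	enum allocation_event ev;
+ *
+ *	r = sm_ll_inc(&ll, b, &ev);
+ *	if (!r && ev == SM_ALLOC)
+ *		nr_allocated_this_transaction++;	count went 0 -> 1
+ */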
+/*----------------------------------------------------------------*/
+
+static int metadata_ll_load_ie(struct ll_disk *ll, dm_block_t index,
+			       struct disk_index_entry *ie)
+{
+	memcpy(ie, ll->mi_le.index + index, sizeof(*ie));
+	return 0;
+}
+
+static int metadata_ll_save_ie(struct ll_disk *ll, dm_block_t index,
+			       struct disk_index_entry *ie)
+{
+	memcpy(ll->mi_le.index + index, ie, sizeof(*ie));
+	return 0;
+}
+
+static int metadata_ll_init_index(struct ll_disk *ll)
+{
+	int r;
+	struct dm_block *b;
+
+	r = dm_tm_new_block(ll->tm, &index_validator, &b);
+	if (r < 0)
+		return r;
+
+	memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
+	ll->bitmap_root = dm_block_location(b);
+
+	return dm_tm_unlock(ll->tm, b);
+}
+
+static int metadata_ll_open(struct ll_disk *ll)
+{
+	int r;
+	struct dm_block *block;
+
+	r = dm_tm_read_lock(ll->tm, ll->bitmap_root,
+			    &index_validator, &block);
+	if (r)
+		return r;
+
+	memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
+	return dm_tm_unlock(ll->tm, block);
+}
+
+static dm_block_t metadata_ll_max_entries(struct ll_disk *ll)
+{
+	return MAX_METADATA_BITMAPS;
+}
+
+static int metadata_ll_commit(struct ll_disk *ll)
+{
+	int r, inc;
+	struct dm_block *b;
+
+	r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
+	if (r)
+		return r;
+
+	memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
+	ll->bitmap_root = dm_block_location(b);
+
+	return dm_tm_unlock(ll->tm, b);
+}
+
+int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+	int r;
+
+	r = sm_ll_init(ll, tm);
+	if (r < 0)
+		return r;
+
+	ll->load_ie = metadata_ll_load_ie;
+	ll->save_ie = metadata_ll_save_ie;
+	ll->init_index = metadata_ll_init_index;
+	ll->open_index = metadata_ll_open;
+	ll->max_entries = metadata_ll_max_entries;
+	ll->commit = metadata_ll_commit;
+
+	ll->nr_blocks = 0;
+	ll->nr_allocated = 0;
+
+	r = ll->init_index(ll);
+	if (r < 0)
+		return r;
+
+	r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
+			void *root_le, size_t len)
+{
+	int r;
+	struct disk_sm_root *smr = root_le;
+
+	if (len < sizeof(struct disk_sm_root)) {
+		DMERR("sm_metadata root too small");
+		return -ENOMEM;
+	}
+
+	r = sm_ll_init(ll, tm);
+	if (r < 0)
+		return r;
+
+	ll->load_ie = metadata_ll_load_ie;
+	ll->save_ie = metadata_ll_save_ie;
+	ll->init_index = metadata_ll_init_index;
+	ll->open_index = metadata_ll_open;
+	ll->max_entries = metadata_ll_max_entries;
+	ll->commit = metadata_ll_commit;
+
+	ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
+	ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
+	ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
+	ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+
+	return ll->open_index(ll);
+}
+
+/*----------------------------------------------------------------*/
+
+static int disk_ll_load_ie(struct ll_disk *ll, dm_block_t index,
+			   struct disk_index_entry *ie)
+{
+	return dm_btree_lookup(&ll->bitmap_info, ll->bitmap_root, &index, ie);
+}
+
+static int disk_ll_save_ie(struct ll_disk *ll, dm_block_t index,
+			   struct disk_index_entry *ie)
+{
+	__dm_bless_for_disk(ie);
+	return dm_btree_insert(&ll->bitmap_info, ll->bitmap_root,
+			       &index, ie, &ll->bitmap_root);
+}
+
+static int disk_ll_init_index(struct ll_disk *ll)
+{
+	return dm_btree_empty(&ll->bitmap_info, &ll->bitmap_root);
+}
+
+static int disk_ll_open(struct ll_disk *ll)
+{
+	/* nothing to do */
+	return 0;
+}
+
+static dm_block_t disk_ll_max_entries(struct ll_disk *ll)
+{
+	return -1ULL;
+}
+
+static int disk_ll_commit(struct ll_disk *ll)
+{
+	return 0;
+}
+
+int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm)
+{
+	int r;
+
+	r = sm_ll_init(ll, tm);
+	if (r < 0)
+		return r;
+
+	ll->load_ie = disk_ll_load_ie;
+	ll->save_ie = disk_ll_save_ie;
+	ll->init_index = disk_ll_init_index;
+	ll->open_index = disk_ll_open;
+	ll->max_entries = disk_ll_max_entries;
+	ll->commit = disk_ll_commit;
+
+	ll->nr_blocks = 0;
+	ll->nr_allocated = 0;
+
+	r = ll->init_index(ll);
+	if (r < 0)
+		return r;
+
+	r = dm_btree_empty(&ll->ref_count_info, &ll->ref_count_root);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
+		    void *root_le, size_t len)
+{
+	int r;
+	struct disk_sm_root *smr = root_le;
+
+	if (len < sizeof(struct disk_sm_root)) {
+		DMERR("sm_metadata root too small");
+		return -ENOMEM;
+	}
+
+	r = sm_ll_init(ll, tm);
+	if (r < 0)
+		return r;
+
+	ll->load_ie = disk_ll_load_ie;
+	ll->save_ie = disk_ll_save_ie;
+	ll->init_index = disk_ll_init_index;
+	ll->open_index = disk_ll_open;
+	ll->max_entries = disk_ll_max_entries;
+	ll->commit = disk_ll_commit;
+
+	ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
+	ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
+	ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
+	ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
+
+	return ll->open_index(ll);
+}
+
+/*----------------------------------------------------------------*/
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-common.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-common.h
@@ -9,8 +9,9 @@
 
 #include "dm-btree.h"
 
+/*----------------------------------------------------------------*/
+
 /*
- *--------------------------------------------------------------------
  * Low level disk format
  *
  * Bitmap btree
@@ -26,7 +27,6 @@
  *
  * Any entry that has a ref count higher than 2 gets entered in the ref
  * count tree.  The leaf values for this tree is the 32-bit ref count.
- *---------------------------------------------------------------------
  */
 
 struct disk_index_entry {
@@ -45,6 +45,15 @@ struct disk_metadata_index {
 	struct disk_index_entry index[MAX_METADATA_BITMAPS];
 } __packed;
 
+struct ll_disk;
+
+typedef int (*load_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *result);
+typedef int (*save_ie_fn)(struct ll_disk *ll, dm_block_t index, struct disk_index_entry *ie);
+typedef int (*init_index_fn)(struct ll_disk *ll);
+typedef int (*open_index_fn)(struct ll_disk *ll);
+typedef dm_block_t (*max_index_entries_fn)(struct ll_disk *ll);
+typedef int (*commit_fn)(struct ll_disk *ll);
+
 struct ll_disk {
 	struct dm_transaction_manager *tm;
 	struct dm_btree_info bitmap_info;
@@ -63,6 +72,12 @@ struct ll_disk {
 	dm_block_t ref_count_root;
 
 	struct disk_metadata_index mi_le;
+	load_ie_fn load_ie;
+	save_ie_fn save_ie;
+	init_index_fn init_index;
+	open_index_fn open_index;
+	max_index_entries_fn max_entries;
+	commit_fn commit;
 };
 
 struct disk_sm_root {
@@ -80,15 +95,32 @@ struct disk_bitmap_header {
 	__le64 blocknr;
 } __packed;
 
-/*
- * These bitops work on a block's worth of bits.
- */
-unsigned sm_lookup_bitmap(void *addr, unsigned b);
-void sm_set_bitmap(void *addr, unsigned b, unsigned val);
-int sm_find_free(void *addr, unsigned begin, unsigned end, unsigned *result);
+enum allocation_event {
+	SM_NONE,
+	SM_ALLOC,
+	SM_FREE,
+};
+
+/*----------------------------------------------------------------*/
 
-void *dm_bitmap_data(struct dm_block *b);
+int sm_ll_extend(struct ll_disk *ll, dm_block_t extra_blocks);
+int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result);
+int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result);
+int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
+			  dm_block_t end, dm_block_t *result);
+int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev);
+int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
+int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev);
+int sm_ll_commit(struct ll_disk *ll);
+
+int sm_ll_new_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm);
+int sm_ll_open_metadata(struct ll_disk *ll, struct dm_transaction_manager *tm,
+			void *root_le, size_t len);
+
+int sm_ll_new_disk(struct ll_disk *ll, struct dm_transaction_manager *tm);
+int sm_ll_open_disk(struct ll_disk *ll, struct dm_transaction_manager *tm,
+		    void *root_le, size_t len);
 
-extern struct dm_block_validator dm_sm_bitmap_validator;
+/*----------------------------------------------------------------*/
 
 #endif	/* DM_SPACE_MAP_COMMON_H */
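
As a sketch, the two flavours wire different index-entry callbacks into
the same core (transaction manager setup elided; use one constructor or
the other, never both on the same ll_disk):

	struct ll_disk ll;
	dm_block_t b;

	r = sm_ll_new_metadata(&ll, tm);  /* index entries in the flat mi_le block */
	/* or: r = sm_ll_new_disk(&ll, tm);  index entries in a btree */

	/* the generic operations then route through ll.load_ie, ll.save_ie etc. */
	r = sm_ll_extend(&ll, 1024);
	r = sm_ll_find_free_block(&ll, 0, ll.nr_blocks, &b);
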
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-disk.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-disk.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-disk.c
@@ -4,6 +4,7 @@
  * This file is released under the GPL.
  */
 
+#include "dm-space-map-checker.h"
 #include "dm-space-map-common.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map.h"
@@ -11,450 +12,13 @@
 
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/bitops.h>
 #include <linux/module.h>
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "space map disk"
 
-/*
- * Bitmap validator
- */
-static void bitmap_prepare_for_write(struct dm_block_validator *v,
-				     struct dm_block *b,
-				     size_t block_size)
-{
-	struct disk_bitmap_header *disk_header = dm_block_data(b);
-
-	disk_header->blocknr = cpu_to_le64(dm_block_location(b));
-	disk_header->csum = cpu_to_le32(dm_block_csum_data(&disk_header->not_used, block_size - sizeof(__le32)));
-}
-
-static int bitmap_check(struct dm_block_validator *v,
-			struct dm_block *b,
-			size_t block_size)
-{
-	struct disk_bitmap_header *disk_header = dm_block_data(b);
-	__le32 csum_disk;
-
-	if (dm_block_location(b) != le64_to_cpu(disk_header->blocknr)) {
-		DMERR("bitmap check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(disk_header->blocknr), dm_block_location(b));
-		return -ENOTBLK;
-	}
-
-	csum_disk = cpu_to_le32(dm_block_csum_data(&disk_header->not_used, block_size - sizeof(__le32)));
-	if (csum_disk != disk_header->csum) {
-		DMERR("bitmap check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(disk_header->csum));
-		return -EILSEQ;
-	}
-
-	return 0;
-}
-
-struct dm_block_validator dm_sm_bitmap_validator = {
-	.name = "sm_bitmap",
-	.prepare_for_write = bitmap_prepare_for_write,
-	.check = bitmap_check
-};
-
 /*----------------------------------------------------------------*/
 
-#define ENTRIES_PER_WORD 32
-#define ENTRIES_SHIFT	5
-
-void *dm_bitmap_data(struct dm_block *b)
-{
-	return dm_block_data(b) + sizeof(struct disk_bitmap_header);
-}
-
-#define WORD_MASK_LOW 0x5555555555555555ULL
-#define WORD_MASK_HIGH 0xAAAAAAAAAAAAAAAAULL
-#define WORD_MASK_ALL 0xFFFFFFFFFFFFFFFFULL
-
-static unsigned bitmap_word_used(void *addr, unsigned b)
-{
-	__le64 *words_le = addr;
-	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
-
-	uint64_t bits = le64_to_cpu(*w_le);
-
-	return ((bits & WORD_MASK_LOW) == WORD_MASK_LOW ||
-		(bits & WORD_MASK_HIGH) == WORD_MASK_HIGH ||
-		(bits & WORD_MASK_ALL) == WORD_MASK_ALL);
-}
-
-unsigned sm_lookup_bitmap(void *addr, unsigned b)
-{
-	__le64 *words_le = addr;
-	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
-
-	b = (b & (ENTRIES_PER_WORD - 1)) << 1;
-
-	return (!!test_bit_le(b, (void *) w_le) << 1) |
-		(!!test_bit_le(b + 1, (void *) w_le));
-}
-
-void sm_set_bitmap(void *addr, unsigned b, unsigned val)
-{
-	__le64 *words_le = addr;
-	__le64 *w_le = words_le + (b >> ENTRIES_SHIFT);
-
-	b = (b & (ENTRIES_PER_WORD - 1)) << 1;
-
-	if (val & 2)
-		__set_bit_le(b, (void *) w_le);
-	else
-		__clear_bit_le(b, (void *) w_le);
-
-	if (val & 1)
-		__set_bit_le(b + 1, (void *) w_le);
-	else
-		__clear_bit_le(b + 1, (void *) w_le);
-}
-
-int sm_find_free(void *addr, unsigned begin, unsigned end,
-		 unsigned *result)
-{
-	while (begin < end) {
-		if (!(begin & (ENTRIES_PER_WORD - 1)) &&
-		    bitmap_word_used(addr, begin)) {
-			begin += ENTRIES_PER_WORD;
-			continue;
-		}
-
-		if (!sm_lookup_bitmap(addr, begin)) {
-			*result = begin;
-			return 0;
-		}
-
-		begin++;
-	}
-
-	return -ENOSPC;
-}
-
-static int disk_ll_init(struct ll_disk *io, struct dm_transaction_manager *tm)
-{
-	io->tm = tm;
-	io->bitmap_info.tm = tm;
-	io->bitmap_info.levels = 1;
-
-	/*
-	 * Because the new bitmap blocks are created via a shadow
-	 * operation, the old entry has already had its reference count
-	 * decremented and we don't need the btree to do any bookkeeping.
-	 */
-	io->bitmap_info.value_type.size = sizeof(struct disk_index_entry);
-	io->bitmap_info.value_type.inc = NULL;
-	io->bitmap_info.value_type.dec = NULL;
-	io->bitmap_info.value_type.equal = NULL;
-
-	io->ref_count_info.tm = tm;
-	io->ref_count_info.levels = 1;
-	io->ref_count_info.value_type.size = sizeof(uint32_t);
-	io->ref_count_info.value_type.inc = NULL;
-	io->ref_count_info.value_type.dec = NULL;
-	io->ref_count_info.value_type.equal = NULL;
-
-	io->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
-
-	if (io->block_size > (1 << 30)) {
-		DMERR("block size too big to hold bitmaps");
-		return -EINVAL;
-	}
-
-	io->entries_per_block = (io->block_size - sizeof(struct disk_bitmap_header)) *
-				ENTRIES_PER_BYTE;
-	io->nr_blocks = 0;
-	io->bitmap_root = 0;
-	io->ref_count_root = 0;
-
-	return 0;
-}
-
-static int disk_ll_new(struct ll_disk *io, struct dm_transaction_manager *tm)
-{
-	int r;
-
-	r = disk_ll_init(io, tm);
-	if (r < 0)
-		return r;
-
-	io->nr_blocks = 0;
-	io->nr_allocated = 0;
-	r = dm_btree_create(&io->bitmap_info, &io->bitmap_root);
-	if (r < 0)
-		return r;
-
-	r = dm_btree_create(&io->ref_count_info, &io->ref_count_root);
-	if (r < 0) {
-		dm_btree_destroy(&io->bitmap_info, io->bitmap_root);
-		return r;
-	}
-
-	return 0;
-}
-
-static int disk_ll_extend(struct ll_disk *io, dm_block_t extra_blocks)
-{
-	int r;
-	dm_block_t i, nr_blocks;
-	unsigned old_blocks, blocks;
-
-	nr_blocks = io->nr_blocks + extra_blocks;
-	old_blocks = dm_sector_div_up(io->nr_blocks, io->entries_per_block);
-	blocks = dm_sector_div_up(nr_blocks, io->entries_per_block);
-
-	for (i = old_blocks; i < blocks; i++) {
-		struct dm_block *b;
-		struct disk_index_entry idx;
-
-		r = dm_tm_new_block(io->tm, &dm_sm_bitmap_validator, &b);
-		if (r < 0)
-			return r;
-		idx.blocknr = cpu_to_le64(dm_block_location(b));
-
-		r = dm_tm_unlock(io->tm, b);
-		if (r < 0)
-			return r;
-
-		idx.nr_free = cpu_to_le32(io->entries_per_block);
-		idx.none_free_before = 0;
-		__dm_bless_for_disk(&idx);
-
-		r = dm_btree_insert(&io->bitmap_info, io->bitmap_root,
-				    &i, &idx, &io->bitmap_root);
-		if (r < 0)
-			return r;
-	}
-
-	io->nr_blocks = nr_blocks;
-	return 0;
-}
-
-static int disk_ll_open(struct ll_disk *ll, struct dm_transaction_manager *tm,
-			void *root_le, size_t len)
-{
-	int r;
-	struct disk_sm_root *smr = root_le;
-
-	if (len < sizeof(struct disk_sm_root)) {
-		DMERR("sm_disk root too small");
-		return -ENOMEM;
-	}
-
-	r = disk_ll_init(ll, tm);
-	if (r < 0)
-		return r;
-
-	ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
-	ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
-	ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
-	ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
-
-	return 0;
-}
-
-static int disk_ll_lookup_bitmap(struct ll_disk *io, dm_block_t b, uint32_t *result)
-{
-	int r;
-	dm_block_t index = b;
-	struct disk_index_entry ie_disk;
-	struct dm_block *blk;
-
-	do_div(index, io->entries_per_block);
-	r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &index, &ie_disk);
-	if (r < 0)
-		return r;
-
-	r = dm_tm_read_lock(io->tm, le64_to_cpu(ie_disk.blocknr), &dm_sm_bitmap_validator, &blk);
-	if (r < 0)
-		return r;
-
-	*result = sm_lookup_bitmap(dm_bitmap_data(blk), do_div(b, io->entries_per_block));
-
-	return dm_tm_unlock(io->tm, blk);
-}
-
-static int disk_ll_lookup(struct ll_disk *io, dm_block_t b, uint32_t *result)
-{
-	__le32 rc_le;
-	int r = disk_ll_lookup_bitmap(io, b, result);
-
-	if (r)
-		return r;
-
-	if (*result != 3)
-		return r;
-
-	r = dm_btree_lookup(&io->ref_count_info, io->ref_count_root, &b, &rc_le);
-	if (r < 0)
-		return r;
-
-	*result = le32_to_cpu(rc_le);
-
-	return r;
-}
-
-static int disk_ll_find_free_block(struct ll_disk *io, dm_block_t begin,
-				   dm_block_t end, dm_block_t *result)
-{
-	int r;
-	struct disk_index_entry ie_disk;
-	dm_block_t i, index_begin = begin;
-	dm_block_t index_end = dm_sector_div_up(end, io->entries_per_block);
-
-	begin = do_div(index_begin, io->entries_per_block);
-
-	for (i = index_begin; i < index_end; i++, begin = 0) {
-		struct dm_block *blk;
-		unsigned position;
-		uint32_t bit_end;
-
-		r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &i, &ie_disk);
-		if (r < 0)
-			return r;
-
-		if (le32_to_cpu(ie_disk.nr_free) <= 0)
-			continue;
-
-		r = dm_tm_read_lock(io->tm, le64_to_cpu(ie_disk.blocknr),
-				    &dm_sm_bitmap_validator, &blk);
-		if (r < 0)
-			return r;
-
-		bit_end = (i == index_end - 1) ?
-			do_div(end, io->entries_per_block) : io->entries_per_block;
-
-		r = sm_find_free(dm_bitmap_data(blk),
-				 max((unsigned)begin, (unsigned)le32_to_cpu(ie_disk.none_free_before)),
-				 bit_end, &position);
-		if (r < 0) {
-			dm_tm_unlock(io->tm, blk);
-			continue;
-		}
-
-		r = dm_tm_unlock(io->tm, blk);
-		if (r < 0)
-			return r;
-
-		*result = i * io->entries_per_block + (dm_block_t) position;
-
-		return 0;
-	}
-
-	return -ENOSPC;
-}
-
-static int disk_ll_insert(struct ll_disk *io, dm_block_t b, uint32_t ref_count)
-{
-	int r;
-	uint32_t bit, old;
-	struct dm_block *nb;
-	dm_block_t index = b;
-	struct disk_index_entry ie_disk;
-	void *bm_le;
-	int inc;
-
-	do_div(index, io->entries_per_block);
-	r = dm_btree_lookup(&io->bitmap_info, io->bitmap_root, &index, &ie_disk);
-	if (r < 0)
-		return r;
-
-	r = dm_tm_shadow_block(io->tm, le64_to_cpu(ie_disk.blocknr),
-			       &dm_sm_bitmap_validator, &nb, &inc);
-	if (r < 0) {
-		DMERR("dm_tm_shadow_block() failed");
-		return r;
-	}
-	ie_disk.blocknr = cpu_to_le64(dm_block_location(nb));
-
-	bm_le = dm_bitmap_data(nb);
-	bit = do_div(b, io->entries_per_block);
-	old = sm_lookup_bitmap(bm_le, bit);
-
-	if (ref_count <= 2) {
-		sm_set_bitmap(bm_le, bit, ref_count);
-
-		if (old > 2) {
-			r = dm_btree_remove(&io->ref_count_info, io->ref_count_root,
-					    &b, &io->ref_count_root);
-			if (r) {
-				dm_tm_unlock(io->tm, nb);
-				return r;
-			}
-		}
-	} else {
-		__le32 rc_le = cpu_to_le32(ref_count);
-
-		__dm_bless_for_disk(&rc_le);
-
-		sm_set_bitmap(bm_le, bit, 3);
-		r = dm_btree_insert(&io->ref_count_info, io->ref_count_root,
-				    &b, &rc_le, &io->ref_count_root);
-		if (r < 0) {
-			dm_tm_unlock(io->tm, nb);
-			DMERR("ref count insert failed");
-			return r;
-		}
-	}
-
-	r = dm_tm_unlock(io->tm, nb);
-	if (r < 0)
-		return r;
-
-	if (ref_count && !old) {
-		io->nr_allocated++;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) - 1);
-		if (le32_to_cpu(ie_disk.none_free_before) == b)
-			ie_disk.none_free_before = cpu_to_le32(b + 1);
-
-	} else if (old && !ref_count) {
-		io->nr_allocated--;
-		ie_disk.nr_free = cpu_to_le32(le32_to_cpu(ie_disk.nr_free) + 1);
-		ie_disk.none_free_before = cpu_to_le32(min((dm_block_t) le32_to_cpu(ie_disk.none_free_before), b));
-	}
-
-	__dm_bless_for_disk(&ie_disk);
-
-	r = dm_btree_insert(&io->bitmap_info, io->bitmap_root, &index, &ie_disk, &io->bitmap_root);
-	if (r < 0)
-		return r;
-
-	return 0;
-}
-
-static int disk_ll_inc(struct ll_disk *ll, dm_block_t b)
-{
-	int r;
-	uint32_t rc;
-
-	r = disk_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
-
-	return disk_ll_insert(ll, b, rc + 1);
-}
-
-static int disk_ll_dec(struct ll_disk *ll, dm_block_t b)
-{
-	int r;
-	uint32_t rc;
-
-	r = disk_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
-
-	if (!rc)
-		return -EINVAL;
-
-	return disk_ll_insert(ll, b, rc - 1);
-}
-
-/*--------------------------------------------------------------*/
-
 /*
  * Space map interface.
  */
@@ -462,6 +26,10 @@ struct sm_disk {
 	struct dm_space_map sm;
 
 	struct ll_disk ll;
+	struct ll_disk old_ll;
+
+	dm_block_t begin;
+	dm_block_t nr_allocated_this_transaction;
 };
 
 static void sm_disk_destroy(struct dm_space_map *sm)
@@ -475,14 +43,13 @@ static int sm_disk_extend(struct dm_spac
 {
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	return disk_ll_extend(&smd->ll, extra_blocks);
+	return sm_ll_extend(&smd->ll, extra_blocks);
 }
 
 static int sm_disk_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count)
 {
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
-
-	*count = smd->ll.nr_blocks;
+	*count = smd->old_ll.nr_blocks;
 
 	return 0;
 }
@@ -490,8 +57,7 @@ static int sm_disk_get_nr_blocks(struct 
 static int sm_disk_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
 {
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
-
-	*count = smd->ll.nr_blocks - smd->ll.nr_allocated;
+	*count = (smd->old_ll.nr_blocks - smd->old_ll.nr_allocated) - smd->nr_allocated_this_transaction;
 
 	return 0;
 }
@@ -500,8 +66,7 @@ static int sm_disk_get_count(struct dm_s
 			     uint32_t *result)
 {
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
-
-	return disk_ll_lookup(&smd->ll, b, result);
+	return sm_ll_lookup(&smd->ll, b, result);
 }
 
 static int sm_disk_count_is_more_than_one(struct dm_space_map *sm, dm_block_t b,
@@ -520,42 +85,127 @@ static int sm_disk_count_is_more_than_on
 static int sm_disk_set_count(struct dm_space_map *sm, dm_block_t b,
 			     uint32_t count)
 {
+	int r;
+	uint32_t old_count;
+	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	return disk_ll_insert(&smd->ll, b, count);
+	r = sm_ll_insert(&smd->ll, b, count, &ev);
+	if (!r) {
+		switch (ev) {
+		case SM_NONE:
+			break;
+
+		case SM_ALLOC:
+			/*
+			 * This _must_ be free in the prior transaction
+			 * otherwise we've lost atomicity.
+			 */
+			smd->nr_allocated_this_transaction++;
+			break;
+
+		case SM_FREE:
+			/*
+			 * It's only free if it's also free in the last
+			 * transaction.
+			 */
+			r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+			if (r)
+				return r;
+
+			if (!old_count)
+				smd->nr_allocated_this_transaction--;
+			break;
+		}
+	}
+
+	return r;
 }
 
 static int sm_disk_inc_block(struct dm_space_map *sm, dm_block_t b)
 {
+	int r;
+	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	return disk_ll_inc(&smd->ll, b);
+	r = sm_ll_inc(&smd->ll, b, &ev);
+	if (!r && (ev == SM_ALLOC))
+		/*
+		 * This _must_ be free in the prior transaction
+		 * otherwise we've lost atomicity.
+		 */
+		smd->nr_allocated_this_transaction++;
+
+	return r;
 }
 
 static int sm_disk_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
+	int r;
+	uint32_t old_count;
+	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	return disk_ll_dec(&smd->ll, b);
+	r = sm_ll_dec(&smd->ll, b, &ev);
+	if (!r && (ev == SM_FREE)) {
+		/*
+		 * It's only free if it's also free in the last
+		 * transaction.
+		 */
+		r = sm_ll_lookup(&smd->old_ll, b, &old_count);
+		if (r)
+			return r;
+
+		if (!old_count)
+			smd->nr_allocated_this_transaction--;
+	}
+
+	return r;
 }
 
 static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b)
 {
 	int r;
+	enum allocation_event ev;
 	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
 
-	/*
-	 * FIXME: We should start the search where we left off.
-	 */
-	r = disk_ll_find_free_block(&smd->ll, 0, smd->ll.nr_blocks, b);
+	/* FIXME: we should loop round a couple of times */
+	r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b);
 	if (r)
 		return r;
 
-	return disk_ll_inc(&smd->ll, *b);
+	smd->begin = *b + 1;
+	r = sm_ll_inc(&smd->ll, *b, &ev);
+	if (!r) {
+		BUG_ON(ev != SM_ALLOC);
+		smd->nr_allocated_this_transaction++;
+	}
+
+	return r;
 }
 
 static int sm_disk_commit(struct dm_space_map *sm)
 {
+	int r;
+	struct sm_disk *smd = container_of(sm, struct sm_disk, sm);
+
+	r = sm_ll_commit(&smd->ll);
+	if (r)
+		return r;
+
+	memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll));
+	smd->begin = 0;
+	smd->nr_allocated_this_transaction = 0;
+
 	return 0;
 }
 
@@ -602,8 +252,9 @@ static struct dm_space_map ops = {
 	.copy_root = sm_disk_copy_root
 };
 
-struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
-				       dm_block_t nr_blocks)
+static struct dm_space_map *dm_sm_disk_create_real(struct dm_transaction_manager *tm,
+						   dm_block_t nr_blocks)
 {
 	int r;
 	struct sm_disk *smd;
@@ -612,13 +263,15 @@ struct dm_space_map *dm_sm_disk_create(s
 	if (!smd)
 		return ERR_PTR(-ENOMEM);
 
+	smd->begin = 0;
+	smd->nr_allocated_this_transaction = 0;
 	memcpy(&smd->sm, &ops, sizeof(smd->sm));
 
-	r = disk_ll_new(&smd->ll, tm);
+	r = sm_ll_new_disk(&smd->ll, tm);
 	if (r)
 		goto bad;
 
-	r = disk_ll_extend(&smd->ll, nr_blocks);
+	r = sm_ll_extend(&smd->ll, nr_blocks);
 	if (r)
 		goto bad;
 
@@ -632,10 +285,18 @@ bad:
 	kfree(smd);
 	return ERR_PTR(r);
 }
+
+struct dm_space_map *dm_sm_disk_create(struct dm_transaction_manager *tm,
+				       dm_block_t nr_blocks)
+{
+	struct dm_space_map *sm = dm_sm_disk_create_real(tm, nr_blocks);
+	return dm_sm_checker_create_fresh(sm);
+}
 EXPORT_SYMBOL_GPL(dm_sm_disk_create);
 
-struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
-				     void *root_le, size_t len)
+static struct dm_space_map *dm_sm_disk_open_real(struct dm_transaction_manager *tm,
+						 void *root_le, size_t len)
 {
 	int r;
 	struct sm_disk *smd;
@@ -644,9 +305,11 @@ struct dm_space_map *dm_sm_disk_open(str
 	if (!smd)
 		return ERR_PTR(-ENOMEM);
 
+	smd->begin = 0;
+	smd->nr_allocated_this_transaction = 0;
 	memcpy(&smd->sm, &ops, sizeof(smd->sm));
 
-	r = disk_ll_open(&smd->ll, tm, root_le, len);
+	r = sm_ll_open_disk(&smd->ll, tm, root_le, len);
 	if (r)
 		goto bad;
 
@@ -660,4 +323,13 @@ bad:
 	kfree(smd);
 	return ERR_PTR(r);
 }
+
+struct dm_space_map *dm_sm_disk_open(struct dm_transaction_manager *tm,
+				     void *root_le, size_t len)
+{
+	return dm_sm_checker_create(
+		dm_sm_disk_open_real(tm, root_le, len));
+}
 EXPORT_SYMBOL_GPL(dm_sm_disk_open);
+
+/*----------------------------------------------------------------*/
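
A sketch of the two-copy transaction model introduced above (the smd
internals in the comments are for illustration; callers only see the
generic dm_sm_* interface):

	r = dm_sm_new_block(sm, &b);	/* allocates from old_ll, resuming
					 * the search at smd->begin */

	/* ... further allocations within the transaction ... */

	r = dm_sm_commit(sm);		/* old_ll <- ll; begin and
					 * nr_allocated_this_transaction
					 * are reset to 0 */
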
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-metadata.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map-metadata.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -10,7 +10,6 @@
 
 #include <linux/list.h>
 #include <linux/slab.h>
-#include <linux/bitops.h>
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "space map metadata"
@@ -18,384 +17,6 @@
 /*----------------------------------------------------------------*/
 
 /*
- * Index validator.
- */
-static void index_prepare_for_write(struct dm_block_validator *v,
-				    struct dm_block *b,
-				    size_t block_size)
-{
-	struct disk_metadata_index *mi_le = dm_block_data(b);
-
-	mi_le->blocknr = cpu_to_le64(dm_block_location(b));
-	mi_le->csum = cpu_to_le32(dm_block_csum_data(&mi_le->padding, block_size - sizeof(__le32)));
-}
-
-static int index_check(struct dm_block_validator *v,
-		       struct dm_block *b,
-		       size_t block_size)
-{
-	struct disk_metadata_index *mi_le = dm_block_data(b);
-	__le32 csum_disk;
-
-	if (dm_block_location(b) != le64_to_cpu(mi_le->blocknr)) {
-		DMERR("index_check failed blocknr %llu wanted %llu",
-		      le64_to_cpu(mi_le->blocknr), dm_block_location(b));
-		return -ENOTBLK;
-	}
-
-	csum_disk = cpu_to_le32(dm_block_csum_data(&mi_le->padding,
-						 block_size - sizeof(__le32)));
-	if (csum_disk != mi_le->csum) {
-		DMERR("index_check failed csum %u wanted %u",
-		      le32_to_cpu(csum_disk), le32_to_cpu(mi_le->csum));
-		return -EILSEQ;
-	}
-
-	return 0;
-}
-
-static struct dm_block_validator index_validator = {
-	.name = "index",
-	.prepare_for_write = index_prepare_for_write,
-	.check = index_check
-};
-
-/*----------------------------------------------------------------*/
-
-/*
- * Low-level disk ops.
- */
-static int metadata_ll_init(struct ll_disk *ll, struct dm_transaction_manager *tm)
-{
-	ll->tm = tm;
-
-	ll->ref_count_info.tm = tm;
-	ll->ref_count_info.levels = 1;
-	ll->ref_count_info.value_type.size = sizeof(uint32_t);
-	ll->ref_count_info.value_type.inc = NULL;
-	ll->ref_count_info.value_type.dec = NULL;
-	ll->ref_count_info.value_type.equal = NULL;
-
-	ll->block_size = dm_bm_block_size(dm_tm_get_bm(tm));
-
-	if (ll->block_size > (1 << 30)) {
-		DMERR("block size too big to hold bitmaps");
-		return -EINVAL;
-	}
-
-	ll->entries_per_block = (ll->block_size - sizeof(struct disk_bitmap_header)) *
-				ENTRIES_PER_BYTE;
-	ll->nr_blocks = 0;
-	ll->bitmap_root = 0;
-	ll->ref_count_root = 0;
-
-	return 0;
-}
-
-static int metadata_ll_new(struct ll_disk *ll, struct dm_transaction_manager *tm,
-			   dm_block_t nr_blocks)
-{
-	int r;
-	dm_block_t i;
-	unsigned blocks;
-	struct dm_block *index_block;
-
-	r = metadata_ll_init(ll, tm);
-	if (r < 0)
-		return r;
-
-	ll->nr_blocks = nr_blocks;
-	ll->nr_allocated = 0;
-
-	blocks = dm_sector_div_up(nr_blocks, ll->entries_per_block);
-	if (blocks > MAX_METADATA_BITMAPS) {
-		DMERR("metadata device too large");
-		return -EINVAL;
-	}
-
-	for (i = 0; i < blocks; i++) {
-		struct dm_block *b;
-		struct disk_index_entry *idx_le = ll->mi_le.index + i;
-
-		r = dm_tm_new_block(tm, &dm_sm_bitmap_validator, &b);
-		if (r < 0)
-			return r;
-		idx_le->blocknr = cpu_to_le64(dm_block_location(b));
-
-		r = dm_tm_unlock(tm, b);
-		if (r < 0)
-			return r;
-
-		idx_le->nr_free = cpu_to_le32(ll->entries_per_block);
-		idx_le->none_free_before = 0;
-	}
-
-	/*
-	 * Write the index.
-	 */
-	r = dm_tm_new_block(tm, &index_validator, &index_block);
-	if (r)
-		return r;
-
-	ll->bitmap_root = dm_block_location(index_block);
-	memcpy(dm_block_data(index_block), &ll->mi_le, sizeof(ll->mi_le));
-	r = dm_tm_unlock(tm, index_block);
-	if (r)
-		return r;
-
-	r = dm_btree_create(&ll->ref_count_info, &ll->ref_count_root);
-	if (r < 0)
-		return r;
-
-	return 0;
-}
-
-static int metadata_ll_open(struct ll_disk *ll, struct dm_transaction_manager *tm,
-			    void *root_le, size_t len)
-{
-	int r;
-	struct disk_sm_root *smr = root_le;
-	struct dm_block *block;
-
-	if (len < sizeof(struct disk_sm_root)) {
-		DMERR("sm_metadata root too small");
-		return -ENOMEM;
-	}
-
-	r = metadata_ll_init(ll, tm);
-	if (r < 0)
-		return r;
-
-	ll->nr_blocks = le64_to_cpu(smr->nr_blocks);
-	ll->nr_allocated = le64_to_cpu(smr->nr_allocated);
-	ll->bitmap_root = le64_to_cpu(smr->bitmap_root);
-
-	r = dm_tm_read_lock(tm, le64_to_cpu(smr->bitmap_root),
-			    &index_validator, &block);
-	if (r)
-		return r;
-
-	memcpy(&ll->mi_le, dm_block_data(block), sizeof(ll->mi_le));
-	r = dm_tm_unlock(tm, block);
-	if (r)
-		return r;
-
-	ll->ref_count_root = le64_to_cpu(smr->ref_count_root);
-	return 0;
-}
-
-static int metadata_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result)
-{
-	int r;
-	dm_block_t index = b;
-	struct disk_index_entry *ie_disk;
-	struct dm_block *blk;
-
-	b = do_div(index, ll->entries_per_block);
-	ie_disk = ll->mi_le.index + index;
-
-	r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk->blocknr),
-			    &dm_sm_bitmap_validator, &blk);
-	if (r < 0)
-		return r;
-
-	*result = sm_lookup_bitmap(dm_bitmap_data(blk), b);
-
-	return dm_tm_unlock(ll->tm, blk);
-}
-
-static int metadata_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result)
-{
-	__le32 le_rc;
-	int r = metadata_ll_lookup_bitmap(ll, b, result);
-
-	if (r)
-		return r;
-
-	if (*result != 3)
-		return r;
-
-	r = dm_btree_lookup(&ll->ref_count_info, ll->ref_count_root, &b, &le_rc);
-	if (r < 0)
-		return r;
-
-	*result = le32_to_cpu(le_rc);
-
-	return r;
-}
-
-static int metadata_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
-				       dm_block_t end, dm_block_t *result)
-{
-	int r;
-	struct disk_index_entry *ie_disk;
-	dm_block_t i, index_begin = begin;
-	dm_block_t index_end = dm_sector_div_up(end, ll->entries_per_block);
-
-	/*
-	 * FIXME: Use shifts
-	 */
-	begin = do_div(index_begin, ll->entries_per_block);
-	end = do_div(end, ll->entries_per_block);
-
-	for (i = index_begin; i < index_end; i++, begin = 0) {
-		struct dm_block *blk;
-		unsigned position;
-		uint32_t bit_end;
-
-		ie_disk = ll->mi_le.index + i;
-
-		if (le32_to_cpu(ie_disk->nr_free) <= 0)
-			continue;
-
-		r = dm_tm_read_lock(ll->tm, le64_to_cpu(ie_disk->blocknr),
-				    &dm_sm_bitmap_validator, &blk);
-		if (r < 0)
-			return r;
-
-		bit_end = (i == index_end - 1) ?  end : ll->entries_per_block;
-
-		r = sm_find_free(dm_bitmap_data(blk), begin, bit_end, &position);
-		if (r < 0) {
-			dm_tm_unlock(ll->tm, blk);
-			/*
-			 * Avoiding retry (FIXME: explain why)
-			 */
-			return r;
-		}
-
-		r = dm_tm_unlock(ll->tm, blk);
-		if (r < 0)
-			return r;
-
-		*result = i * ll->entries_per_block + (dm_block_t) position;
-
-		return 0;
-	}
-
-	return -ENOSPC;
-}
-
-static int metadata_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count)
-{
-	int r;
-	uint32_t bit, old;
-	struct dm_block *nb;
-	dm_block_t index = b;
-	struct disk_index_entry *ie_disk;
-	void *bm_le;
-	int inc;
-
-	bit = do_div(index, ll->entries_per_block);
-	ie_disk = ll->mi_le.index + index;
-
-	r = dm_tm_shadow_block(ll->tm, le64_to_cpu(ie_disk->blocknr),
-			       &dm_sm_bitmap_validator, &nb, &inc);
-	if (r < 0) {
-		DMERR("dm_tm_shadow_block() failed");
-		return r;
-	}
-	ie_disk->blocknr = cpu_to_le64(dm_block_location(nb));
-
-	bm_le = dm_bitmap_data(nb);
-	old = sm_lookup_bitmap(bm_le, bit);
-
-	if (ref_count <= 2) {
-		sm_set_bitmap(bm_le, bit, ref_count);
-
-		r = dm_tm_unlock(ll->tm, nb);
-		if (r < 0)
-			return r;
-
-		if (old > 2) {
-			r = dm_btree_remove(&ll->ref_count_info,
-					    ll->ref_count_root,
-					    &b, &ll->ref_count_root);
-			if (r) {
-				sm_set_bitmap(bm_le, bit, old);
-				return r;
-			}
-		}
-	} else {
-		__le32 le_rc = cpu_to_le32(ref_count);
-
-		__dm_bless_for_disk(&le_rc);
-
-		sm_set_bitmap(bm_le, bit, 3);
-		r = dm_tm_unlock(ll->tm, nb);
-		if (r < 0) {
-			__dm_unbless_for_disk(&le_rc);
-			return r;
-		}
-
-		r = dm_btree_insert(&ll->ref_count_info, ll->ref_count_root,
-				    &b, &le_rc, &ll->ref_count_root);
-		if (r < 0) {
-			/* FIXME: release shadow? or assume the whole transaction will be ditched */
-			DMERR("ref count insert failed");
-			return r;
-		}
-	}
-
-	if (ref_count && !old) {
-		ll->nr_allocated++;
-		ie_disk->nr_free = cpu_to_le32(le32_to_cpu(ie_disk->nr_free) - 1);
-		if (le32_to_cpu(ie_disk->none_free_before) == b)
-			ie_disk->none_free_before = cpu_to_le32(b + 1);
-	} else if (old && !ref_count) {
-		ll->nr_allocated--;
-		ie_disk->nr_free = cpu_to_le32(le32_to_cpu(ie_disk->nr_free) + 1);
-		ie_disk->none_free_before = cpu_to_le32(min((dm_block_t) le32_to_cpu(ie_disk->none_free_before), b));
-	}
-
-	return 0;
-}
-
-static int metadata_ll_inc(struct ll_disk *ll, dm_block_t b)
-{
-	int r;
-	uint32_t rc;
-
-	r = metadata_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
-
-	return metadata_ll_insert(ll, b, rc + 1);
-}
-
-static int metadata_ll_dec(struct ll_disk *ll, dm_block_t b)
-{
-	int r;
-	uint32_t rc;
-
-	r = metadata_ll_lookup(ll, b, &rc);
-	if (r)
-		return r;
-
-	if (!rc)
-		return -EINVAL;
-
-	return metadata_ll_insert(ll, b, rc - 1);
-}
-
-static int metadata_ll_commit(struct ll_disk *ll)
-{
-	int r, inc;
-	struct dm_block *b;
-
-	r = dm_tm_shadow_block(ll->tm, ll->bitmap_root, &index_validator, &b, &inc);
-	if (r)
-		return r;
-
-	memcpy(dm_block_data(b), &ll->mi_le, sizeof(ll->mi_le));
-	ll->bitmap_root = dm_block_location(b);
-
-	return dm_tm_unlock(ll->tm, b);
-}
-
-/*----------------------------------------------------------------*/
-
-/*
  * Space map interface.
  *
  * The low level disk format is written using the standard btree and
@@ -454,14 +75,15 @@ static int add_bop(struct sm_metadata *s
 static int commit_bop(struct sm_metadata *smm, struct block_op *op)
 {
 	int r = 0;
+	enum allocation_event ev;
 
 	switch (op->type) {
 	case BOP_INC:
-		r = metadata_ll_inc(&smm->ll, op->block);
+		r = sm_ll_inc(&smm->ll, op->block, &ev);
 		break;
 
 	case BOP_DEC:
-		r = metadata_ll_dec(&smm->ll, op->block);
+		r = sm_ll_dec(&smm->ll, op->block, &ev);
 		break;
 	}
 
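The hunk above only swaps the low-level helpers (the static metadata_ll_* routines deleted earlier become the shared sm_ll_* calls, which additionally report an allocation_event); the recursion machinery around them is unchanged. When the metadata space map is asked to adjust a reference count while it is already servicing one of its own operations, the request is queued with add_bop() and replayed later through commit_bop(). A minimal user-space sketch of that deferred-op pattern follows; the queue depth and field names are assumptions for the illustration, not the kernel's exact definitions.

/* Sketch of the deferred ref-count queue used by sm_metadata.
 * MAX_UNCOMMITTED and the struct layout are invented here. */
#include <errno.h>
#include <stdint.h>

typedef uint64_t dm_block_t;

enum block_op_type {
	BOP_INC,
	BOP_DEC
};

struct block_op {
	enum block_op_type type;
	dm_block_t block;
};

#define MAX_UNCOMMITTED 128

struct bop_queue {
	unsigned nr_uncommitted;
	struct block_op uncommitted[MAX_UNCOMMITTED];
};

/* Queue an inc/dec instead of recursing into the space map. */
static int add_bop(struct bop_queue *q, enum block_op_type type, dm_block_t b)
{
	if (q->nr_uncommitted == MAX_UNCOMMITTED)
		return -ENOMEM;	/* queue full: the operation must fail */

	q->uncommitted[q->nr_uncommitted].type = type;
	q->uncommitted[q->nr_uncommitted].block = b;
	q->nr_uncommitted++;

	return 0;
}

/* Replay the queue once the space map is no longer busy. */
static int flush_bops(struct bop_queue *q,
		      int (*apply)(enum block_op_type, dm_block_t))
{
	unsigned i;
	int r;

	for (i = 0; i < q->nr_uncommitted; i++) {
		r = apply(q->uncommitted[i].type, q->uncommitted[i].block);
		if (r)
			return r;
	}
	q->nr_uncommitted = 0;

	return 0;
}

The kernel variant keeps this queue inside struct sm_metadata and likewise fails the operation outright when the queue overflows, since silently dropping a ref-count update would corrupt the space map.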
@@ -575,7 +197,7 @@ static int sm_metadata_get_count(struct 
 		}
 	}
 
-	r = metadata_ll_lookup(&smm->ll, b, result);
+	r = sm_ll_lookup(&smm->ll, b, result);
 	if (r)
 		return r;
 
@@ -617,7 +239,7 @@ static int sm_metadata_count_is_more_tha
 		return 0;
 	}
 
-	r = metadata_ll_lookup_bitmap(&smm->ll, b, &rc);
+	r = sm_ll_lookup_bitmap(&smm->ll, b, &rc);
 	if (r)
 		return r;
 
@@ -636,6 +258,7 @@ static int sm_metadata_set_count(struct 
 				 uint32_t count)
 {
 	int r, r2;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (smm->recursion_count) {
@@ -644,7 +267,7 @@ static int sm_metadata_set_count(struct 
 	}
 
 	in(smm);
-	r = metadata_ll_insert(&smm->ll, b, count);
+	r = sm_ll_insert(&smm->ll, b, count, &ev);
 	r2 = out(smm);
 
 	return combine_errors(r, r2);
@@ -653,13 +276,14 @@ static int sm_metadata_set_count(struct 
 static int sm_metadata_inc_block(struct dm_space_map *sm, dm_block_t b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (recursing(smm))
 		r = add_bop(smm, BOP_INC, b);
 	else {
 		in(smm);
-		r = metadata_ll_inc(&smm->ll, b);
+		r = sm_ll_inc(&smm->ll, b, &ev);
 		r2 = out(smm);
 	}
 
@@ -669,25 +293,27 @@ static int sm_metadata_inc_block(struct 
 static int sm_metadata_dec_block(struct dm_space_map *sm, dm_block_t b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	if (recursing(smm))
 		r = add_bop(smm, BOP_DEC, b);
 	else {
 		in(smm);
-		r = metadata_ll_dec(&smm->ll, b);
+		r = sm_ll_dec(&smm->ll, b, &ev);
 		r2 = out(smm);
 	}
 
 	return combine_errors(r, r2);
 }
 
-static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
+static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b)
 {
 	int r, r2 = 0;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	r = metadata_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
+	r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b);
 	if (r)
 		return r;
 
@@ -697,7 +323,7 @@ static int sm_metadata_new_block(struct 
 		r = add_bop(smm, BOP_INC, *b);
 	else {
 		in(smm);
-		r = metadata_ll_inc(&smm->ll, *b);
+		r = sm_ll_inc(&smm->ll, *b, &ev);
 		r2 = out(smm);
 	}
 
@@ -707,14 +333,20 @@ static int sm_metadata_new_block(struct 
 	return combine_errors(r, r2);
 }
 
+static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
+{
+	int r = sm_metadata_new_block_(sm, b);
+	if (r)
+		DMERR("out of metadata space");
+	return r;
+}
+
 static int sm_metadata_commit(struct dm_space_map *sm)
 {
 	int r;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
-
-	r = metadata_ll_commit(&smm->ll);
+	r = sm_ll_commit(&smm->ll);
 	if (r)
 		return r;
 
@@ -910,6 +542,7 @@ int dm_sm_metadata_create(struct dm_spac
 {
 	int r;
 	dm_block_t i;
+	enum allocation_event ev;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
 	smm->begin = superblock + 1;
@@ -918,9 +551,15 @@ int dm_sm_metadata_create(struct dm_spac
 	smm->nr_uncommitted = 0;
 
 	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
-	r = metadata_ll_new(&smm->ll, tm, nr_blocks);
+
+	r = sm_ll_new_metadata(&smm->ll, tm);
 	if (r)
 		return r;
+
+	r = sm_ll_extend(&smm->ll, nr_blocks);
+	if (r)
+		return r;
+
 	memcpy(&smm->sm, &ops, sizeof(smm->sm));
 
 	/*
@@ -928,7 +567,7 @@ int dm_sm_metadata_create(struct dm_spac
 	 * allocated blocks that they were built from.
 	 */
 	for (i = superblock; !r && i < smm->begin; i++)
-		r = metadata_ll_inc(&smm->ll, i);
+		r = sm_ll_inc(&smm->ll, i, &ev);
 
 	if (r)
 		return r;
@@ -943,7 +582,7 @@ int dm_sm_metadata_open(struct dm_space_
 	int r;
 	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
 
-	r = metadata_ll_open(&smm->ll, tm, root_le, len);
+	r = sm_ll_open_metadata(&smm->ll, tm, root_le, len);
 	if (r)
 		return r;
 
@@ -952,5 +591,6 @@ int dm_sm_metadata_open(struct dm_space_
 	smm->allocated_this_transaction = 0;
 	smm->nr_uncommitted = 0;
 
-	return sm_metadata_commit(sm);
+	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
+	return 0;
 }
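Note that the index validator deleted at the top of this file is relocated into the new dm-space-map-common.c (visible in the diffstat) rather than dropped, since the disk and metadata space maps now share it. The contract of any dm_block_validator is the one the deleted code shows: prepare_for_write() stamps the block's own location plus a checksum over the rest of the payload, and check() verifies both on read, returning -ENOTBLK for a misplaced block and -EILSEQ for a bad checksum. A standalone sketch of that contract, with endianness conversions elided and the header layout and checksum function invented for the illustration:

/* Validator contract sketch; demo_header and csum_data() are stand-ins
 * (the kernel stores little-endian fields and uses crc32c). */
#include <stddef.h>
#include <stdint.h>

struct demo_header {
	uint64_t blocknr;	/* must match where the block really lives */
	uint32_t csum;		/* covers everything after this field */
};

static uint32_t csum_data(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;	/* stand-in for crc32c */

	return sum;
}

/* Called just before the block reaches the disk. */
static void demo_prepare_for_write(void *block, uint64_t location,
				   size_t block_size)
{
	struct demo_header *h = block;

	h->blocknr = location;	/* catches blocks written to the wrong place */
	h->csum = csum_data(h + 1, block_size - sizeof(*h));
}

/* Called after every read; mirrors index_check() above. */
static int demo_check(const void *block, uint64_t location, size_t block_size)
{
	const struct demo_header *h = block;

	if (h->blocknr != location)
		return -1;	/* kernel: -ENOTBLK */

	if (h->csum != csum_data(h + 1, block_size - sizeof(*h)))
		return -1;	/* kernel: -EILSEQ */

	return 0;
}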
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-space-map.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-space-map.h
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-space-map.h
@@ -16,9 +16,25 @@
 struct dm_space_map {
 	void (*destroy)(struct dm_space_map *sm);
 
+	/*
+	 * You must commit before allocating the newly added space.
+	 */
 	int (*extend)(struct dm_space_map *sm, dm_block_t extra_blocks);
 
+	/*
+	 * Extensions do not appear in this count until after commit has
+	 * been called.
+	 */
 	int (*get_nr_blocks)(struct dm_space_map *sm, dm_block_t *count);
+
+	/*
+	 * Space maps must never allocate a block from the previous
+	 * transaction, in case we need to roll back.  This complicates the
+	 * semantics of get_nr_free(); it should return the number of blocks
+	 * that are available for allocation _now_.  For instance, you may
+	 * have blocks with a zero reference count that will not be
+	 * available for allocation until after the next commit.
+	 */
 	int (*get_nr_free)(struct dm_space_map *sm, dm_block_t *count);
 
 	int (*get_count)(struct dm_space_map *sm, dm_block_t b, uint32_t *result);
Index: linux-3.1-rc9/drivers/md/persistent-data/dm-transaction-manager.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/persistent-data/dm-transaction-manager.c
+++ linux-3.1-rc9/drivers/md/persistent-data/dm-transaction-manager.c
@@ -5,6 +5,7 @@
  */
 #include "dm-transaction-manager.h"
 #include "dm-space-map.h"
+#include "dm-space-map-checker.h"
 #include "dm-space-map-disk.h"
 #include "dm-space-map-metadata.h"
 #include "dm-persistent-data-internal.h"
@@ -49,13 +50,11 @@ static int is_shadow(struct dm_transacti
 	struct hlist_node *n;
 
 	spin_lock(&tm->lock);
-
 	hlist_for_each_entry(si, n, tm->buckets + bucket, hlist)
 		if (si->where == b) {
 			r = 1;
 			break;
 		}
-
 	spin_unlock(&tm->lock);
 
 	return r;
@@ -74,7 +73,6 @@ static void insert_shadow(struct dm_tran
 	if (si) {
 		si->where = b;
 		bucket = dm_hash_block(b, HASH_MASK);
-
 		spin_lock(&tm->lock);
 		hlist_add_head(&si->hlist, tm->buckets + bucket);
 		spin_unlock(&tm->lock);
@@ -96,6 +94,7 @@ static void wipe_shadow_table(struct dm_
 
 		INIT_HLIST_HEAD(bucket);
 	}
+
 	spin_unlock(&tm->lock);
 }
 
@@ -200,77 +199,52 @@ int dm_tm_new_block(struct dm_transactio
 
 static int __shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 			  struct dm_block_validator *v,
-			  struct dm_block **result, int *inc_children)
+			  struct dm_block **result)
 {
 	int r;
 	dm_block_t new;
-	uint32_t count;
 	struct dm_block *orig_block;
 
 	r = dm_sm_new_block(tm->sm, &new);
 	if (r < 0)
 		return r;
 
-	r = dm_bm_write_lock_zero(tm->bm, new, v, result);
+	r = dm_sm_dec_block(tm->sm, orig);
 	if (r < 0)
-		goto bad_dec_block;
+		return r;
 
 	r = dm_bm_read_lock(tm->bm, orig, v, &orig_block);
 	if (r < 0)
-		goto bad_dec_block;
-
-	memcpy(dm_block_data(*result), dm_block_data(orig_block),
-	       dm_bm_block_size(tm->bm));
-
-	r = dm_bm_unlock(orig_block);
-	if (r < 0)
-		goto bad_dec_block;
-
-	r = dm_sm_get_count(tm->sm, orig, &count);
-	if (r < 0)
-		goto bad;
-
-	r = dm_sm_dec_block(tm->sm, orig);
-	if (r < 0)
-		goto bad;
-
-	*inc_children = count > 1;
-
-	return 0;
+		return r;
 
-bad:
-	dm_bm_unlock(*result);
-bad_dec_block:
-	dm_sm_dec_block(tm->sm, new);
+	r = dm_bm_unlock_move(orig_block, new);
+	if (r < 0) {
+		dm_bm_unlock(orig_block);
+		return r;
+	}
 
-	return r;
+	return dm_bm_write_lock(tm->bm, new, v, result);
 }
 
 int dm_tm_shadow_block(struct dm_transaction_manager *tm, dm_block_t orig,
 		       struct dm_block_validator *v, struct dm_block **result,
 		       int *inc_children)
 {
-	int r, more_than_one;
+	int r;
 
 	if (tm->is_clone)
 		return -EWOULDBLOCK;
 
-	if (is_shadow(tm, orig)) {
-		r = dm_sm_count_is_more_than_one(tm->sm, orig, &more_than_one);
-		if (r < 0)
-			return r;
-
-		if (!more_than_one) {
-			*inc_children = 0;
-			return dm_bm_write_lock(tm->bm, orig, v, result);
-		}
-		/* fall through */
-	}
-
-	r = __shadow_block(tm, orig, v, result, inc_children);
+	r = dm_sm_count_is_more_than_one(tm->sm, orig, inc_children);
 	if (r < 0)
 		return r;
 
+	if (is_shadow(tm, orig) && !*inc_children)
+		return dm_bm_write_lock(tm->bm, orig, v, result);
+
+	r = __shadow_block(tm, orig, v, result);
+	if (r < 0)
+		return r;
 	insert_shadow(tm, dm_block_location(*result));
 
 	return r;
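The rewritten __shadow_block() also changes how data reaches the copy: rather than write-locking the new block and copying under two locks, it decrements the original, read-locks it, and lets the new dm_bm_unlock_move() migrate the cached contents to the new location before the write lock is taken. Callers are unaffected; the usual pattern around dm_tm_shadow_block() looks like this sketch, where inc_child_refs() is a hypothetical stand-in for the caller's own child walk (the btree code does this for a shared node's values):

/* Typical dm_tm_shadow_block() caller; inc_child_refs() is invented
 * for the illustration. */
static int shadow_step(struct dm_transaction_manager *tm, dm_block_t b,
		       struct dm_block_validator *v, struct dm_block **result)
{
	int r, inc_children;

	r = dm_tm_shadow_block(tm, b, v, result, &inc_children);
	if (r < 0)
		return r;

	/*
	 * With this patch inc_children is computed up front, so it is
	 * set whenever the original block was shared (reference count
	 * > 1), even on the is_shadow() fast path; the caller must then
	 * take extra references on everything the copy points at.
	 */
	if (inc_children)
		inc_child_refs(tm, dm_block_data(*result));

	return 0;
}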
@@ -312,6 +286,7 @@ void dm_tm_dec(struct dm_transaction_man
 
 	dm_sm_dec_block(tm->sm, b);
 }
+EXPORT_SYMBOL_GPL(dm_tm_dec);
 
 int dm_tm_ref(struct dm_transaction_manager *tm, dm_block_t b,
 	      uint32_t *result)
@@ -339,14 +314,15 @@ static int dm_tm_create_internal(struct 
 				 int create)
 {
 	int r;
+	struct dm_space_map *inner;
 
-	*sm = dm_sm_metadata_init();
-	if (IS_ERR(*sm))
-		return PTR_ERR(*sm);
+	inner = dm_sm_metadata_init();
+	if (IS_ERR(inner))
+		return PTR_ERR(inner);
 
-	*tm = dm_tm_create(bm, *sm);
+	*tm = dm_tm_create(bm, inner);
 	if (IS_ERR(*tm)) {
-		dm_sm_destroy(*sm);
+		dm_sm_destroy(inner);
 		return PTR_ERR(*tm);
 	}
 
@@ -358,13 +334,17 @@ static int dm_tm_create_internal(struct 
 			goto bad1;
 		}
 
-		r = dm_sm_metadata_create(*sm, *tm, dm_bm_nr_blocks(bm),
+		r = dm_sm_metadata_create(inner, *tm, dm_bm_nr_blocks(bm),
 					  sb_location);
 		if (r) {
 			DMERR("couldn't create metadata space map");
 			goto bad2;
 		}
 
+		*sm = dm_sm_checker_create(inner);
+		if (!*sm)
+			goto bad2;
+
 	} else {
 		r = dm_bm_write_lock(dm_tm_get_bm(*tm), sb_location,
 				     sb_validator, sblock);
@@ -373,13 +353,17 @@ static int dm_tm_create_internal(struct 
 			goto bad1;
 		}
 
-		r = dm_sm_metadata_open(*sm, *tm,
+		r = dm_sm_metadata_open(inner, *tm,
 					dm_block_data(*sblock) + root_offset,
 					root_max_len);
-		if (IS_ERR(*sm)) {
+		if (r) {
 			DMERR("couldn't open metadata space map");
 			goto bad2;
 		}
+
+		*sm = dm_sm_checker_create(inner);
+		if (!*sm)
+			goto bad2;
 	}
 
 	return 0;
@@ -388,7 +372,7 @@ bad2:
 	dm_tm_unlock(*tm, *sblock);
 bad1:
 	dm_tm_destroy(*tm);
-	dm_sm_destroy(*sm);
+	dm_sm_destroy(inner);
 	return r;
 }
 
@@ -412,3 +396,5 @@ int dm_tm_open_with_sm(struct dm_block_m
 				     root_max_len, tm, sm, sblock, 0);
 }
 EXPORT_SYMBOL_GPL(dm_tm_open_with_sm);
+
+/*----------------------------------------------------------------*/
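Finally, the dm_sm_checker_create() calls above change what dm_tm_create_internal() hands back: callers now receive a checking wrapper, while the metadata space map itself is kept in the local inner so the error paths destroy the right object. Per the diffstat, the checker is new code in dm-space-map-checker.c; structurally it is a decorator over struct dm_space_map, roughly like the following sketch (the field names and audit details are assumptions, and dm_sm_inc_block() is assumed to be the usual inline wrapper):

/* Shape of the space-map checker; names here are illustrative. */
struct sm_checker {
	struct dm_space_map sm;		/* ops table handed to callers */
	struct dm_space_map *real_sm;	/* the wrapped metadata space map */
	/* plus whatever in-core state the audit needs, e.g. shadow counts */
};

static int sc_inc_block(struct dm_space_map *sm, dm_block_t b)
{
	struct sm_checker *smc = container_of(sm, struct sm_checker, sm);

	/* cross-check smc's own bookkeeping here, then forward */
	return dm_sm_inc_block(smc->real_sm, b);
}

The container_of() recovery of the outer object mirrors how sm_metadata reaches its own state in the hunks earlier in this patch.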
