New snapshot implementation. This implementation uses shared storage and
supports an unlimited number of snapshots.

The work is split into two modules:
dm-multisnapshot.ko - the general module
dm-store-mikulas.ko - the snapshot store

This modularity allows other snapshot stores to be loaded.

Usage:
Create two logical volumes, one for the origin and one for the snapshot store.
(these examples assume /dev/mapper/vg1-lv1 for the origin and
/dev/mapper/vg1-lv2 for the snapshot store)

Clear the first sector of the snapshot volume:
dd if=/dev/zero of=/dev/mapper/vg1-lv2 bs=4096 count=1

Table line arguments (an illustrative line follows this list):
- origin device
- shared store device
- chunk size
- number of generic arguments
- generic arguments
	sync-snapshots --- synchronize snapshots according to the list
	preserve-on-error --- halt the origin on error in the snapshot store
- shared store type
- number of arguments for shared store type
- shared store arguments
	cache-threshold size --- start a background write of dirty metadata
		when the metadata cache grows above this size
	cache-limit size --- an upper limit on the metadata cache size
If sync-snapshots was specified:
	- number of snapshot ids
	- snapshot ids
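
A hypothetical full table line using these optional arguments, for illustration
only (it is not taken from the driver; it assumes every keyword and every size
value counts as a separate argument, and that the units of the size values are
whatever the store expects):
/dev/mapper/vg1-lv1 /dev/mapper/vg1-lv2 16 1 preserve-on-error mikulas 2 cache-limit 2097152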

Load the shared snapshot driver:
echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnapshot /dev/mapper/vg1-lv1 /dev/mapper/vg1-lv2 16 0 mikulas 0|dmsetup create ms
(16 is the chunk size in 512-byte sectors; you can use a different value)
This creates the multisnapshot origin device /dev/mapper/ms. If the store was
zeroed, a new structure is created; otherwise the existing structure is loaded.

Once this is done, you should no longer access /dev/mapper/vg1-lv1 and
/dev/mapper/vg1-lv2 directly; use only /dev/mapper/ms.

Create a new snapshot:
dmsetup message /dev/mapper/ms 0 create
dmsetup status /dev/mapper/ms
	(this shows the ID of the newly created snapshot; see the example below)
dmsetup suspend /dev/mapper/ms
dmsetup resume /dev/mapper/ms
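
To pick the new snapshot ID up in a script, something like this could be used
(a sketch only --- it assumes dmsetup prints the usual "start length target"
prefix before the fields described under "See status" below, which would make
the new snapshot ID the sixth field; adjust if the layout differs):
NEW_ID=`dmsetup status /dev/mapper/ms | awk '{print $6}'`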

Attach the snapshot:
echo 0 `blockdev --getsize /dev/mapper/vg1-lv1` multisnap-snap /dev/mapper/vg1-lv1 0|dmsetup create ms0
(the trailing '0' is the snapshot ID ... you can use a different ID)
This attaches snapshot '0' as /dev/mapper/ms0.

Delete the snapshot:
dmsetup message /dev/mapper/ms 0 delete 0
(the parameter after "delete" is the snapshot id)

See status:
dmsetup status prints the following information about the multisnapshot device
(an example line follows this list):
- the number of arguments before the snapshot ID list (5)
- 0 if the store is active, or a negative error number on error (-ENOSPC, -EIO, etc.)
- the new snapshot number that will be created, "-" if there is none
- total number of chunks on the device
- total number of allocated chunks
- the number of chunks allocated for metadata
- the number of snapshots
- existing snapshot IDs
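
An illustrative status line for a device with two snapshots (IDs 0 and 1); the
values are invented and the leading "0 41943040 multisnapshot" assumes the
usual "start length target" prefix printed by dmsetup:
0 41943040 multisnapshot 5 0 - 655360 10240 128 2 0 1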

Unload it:
dmsetup remove ms
dmsetup remove ms0
... etc. (note: once you unload the origin, the snapshots become inaccessible
--- the devices still exist, but they return -EIO on every access)


Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/Kconfig                       |   19 
 drivers/md/Makefile                      |    8 
 drivers/md/dm-bufio.c                    |  969 ++++++++++++++++
 drivers/md/dm-bufio.h                    |   29 
 drivers/md/dm-multisnap-alloc.c          |  576 +++++++++
 drivers/md/dm-multisnap-blocks.c         |  198 +++
 drivers/md/dm-multisnap-btree.c          |  798 +++++++++++++
 drivers/md/dm-multisnap-commit.c         |  210 +++
 drivers/md/dm-multisnap-delete.c         |  134 ++
 drivers/md/dm-multisnap-freelist.c       |  236 +++
 drivers/md/dm-multisnap-io.c             |  191 +++
 drivers/md/dm-multisnap-mikulas-struct.h |  198 +++
 drivers/md/dm-multisnap-mikulas.c        |  667 +++++++++++
 drivers/md/dm-multisnap-mikulas.h        |  201 +++
 drivers/md/dm-multisnap-private.h        |  145 ++
 drivers/md/dm-multisnap-snaps.c          |  429 +++++++
 drivers/md/dm-multisnap.c                | 1871 +++++++++++++++++++++++++++++++
 drivers/md/dm-multisnap.h                |  153 ++
 18 files changed, 7032 insertions(+)

Index: linux-2.6.32/drivers/md/Kconfig
===================================================================
--- linux-2.6.32.orig/drivers/md/Kconfig
+++ linux-2.6.32/drivers/md/Kconfig
@@ -249,6 +249,25 @@ config DM_SNAPSHOT
        ---help---
          Allow volume managers to take writable snapshots of a device.
 
+config DM_MULTISNAPSHOT
+	tristate "Multisnapshot target"
+	depends on BLK_DEV_DM
+	---help---
+	  A new implementation of snapshots that allows storage to be
+	  shared between several snapshots.
+
+	  A submenu allows selecting a specific shared snapshot store
+	  driver.
+
+config DM_MULTISNAPSHOT_MIKULAS
+	tristate "Mikulas' snapshot store"
+	depends on DM_MULTISNAPSHOT
+	---help---
+	  Mikulas Patocka's snapshot store.
+
+	  A B+-tree-based log-structured store allowing an unlimited
+	  number of snapshots.
+
 config DM_MIRROR
        tristate "Mirror target"
        depends on BLK_DEV_DM
Index: linux-2.6.32/drivers/md/Makefile
===================================================================
--- linux-2.6.32.orig/drivers/md/Makefile
+++ linux-2.6.32/drivers/md/Makefile
@@ -7,6 +7,12 @@ dm-mod-y	+= dm.o dm-table.o dm-target.o 
 dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
+dm-multisnapshot-y += dm-multisnap.o
+dm-store-mikulas-y += dm-multisnap-mikulas.o dm-multisnap-alloc.o \
+		   dm-multisnap-blocks.o dm-multisnap-btree.o \
+		   dm-multisnap-commit.o dm-multisnap-delete.o \
+		   dm-multisnap-freelist.o dm-multisnap-io.o \
+		   dm-multisnap-snaps.o dm-bufio.o
 dm-mirror-y	+= dm-raid1.o
 dm-log-userspace-y \
 		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
@@ -41,6 +47,8 @@ obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipa
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
+obj-$(CONFIG_DM_MULTISNAPSHOT)	+= dm-multisnapshot.o
+obj-$(CONFIG_DM_MULTISNAPSHOT_MIKULAS) += dm-store-mikulas.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: linux-2.6.32/drivers/md/dm-bufio.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-bufio.c
@@ -0,0 +1,969 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/dm-io.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "dm-bufio.h"
+
+/*
+ * dm_bufio_client_create --- create a buffered IO cache on a given device
+ * dm_bufio_client_destroy --- release a buffered IO cache
+ *
+ * dm_bufio_read --- read a given block from disk. Returns pointer to data.
+ *	Returns a pointer to dm_buffer that can be used to release the buffer
+ *	or to make it dirty.
+ * dm_bufio_new --- like dm_bufio_read, but don't read anything from the disk.
+ *	It is expected that the caller initializes the buffer and marks it
+ *	dirty.
+ * dm_bufio_release --- release a reference obtained with dm_bufio_read or
+ *	dm_bufio_new. The data pointer and dm_buffer pointer is no longer valid
+ *	after this call.
+ *
+ * WARNING: to avoid deadlocks, a thread can hold at most one buffer. Multiple
+ *	threads can each hold one buffer simultaneously.
+ *
+ * dm_bufio_mark_buffer_dirty --- mark a buffer dirty. It should be called after
+ *	the buffer is modified.
+ * dm_bufio_write_dirty_buffers --- write all dirty buffers. Guarantees that all
+ *	dirty buffers created prior to this call are on disk when this call
+ *	exits.
+ * dm_bufio_issue_flush --- send an empty write barrier to the device to flush
+ *	hardware disk cache.
+ *
+ * In case of memory pressure, the buffer may be written after
+ *	dm_bufio_mark_buffer_dirty, but before dm_bufio_write_dirty_buffers.
+ *	So, dm_bufio_write_dirty_buffers guarantees that the buffer is on-disk,
+ *	but the actual writing may occur earlier.
+ *
+ * dm_bufio_release_move --- like dm_bufio_release, and also move the buffer to
+ *	the new block. dm_bufio_write_dirty_buffers is needed to commit the new
+ *	block.
+ * dm_bufio_drop_buffers --- clear all buffers.
+ */
+
+/*
+ * Memory management policy:
+ *	When we're above threshold, start asynchronous writing of dirty buffers
+ *	and memory reclaiming --- but still allow new allocations.
+ *	When we're above limit, don't allocate any more space and synchronously
+ *	wait until existing buffers are freed.
+ *
+ * These default parameters can be overridden with parameters to
+ * dm_bufio_client_create.
+ */
+#define DM_BUFIO_THRESHOLD_MEMORY	(8 * 1048576)
+#define DM_BUFIO_LIMIT_MEMORY		(9 * 1048576)
+
+/*
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
+ */
+#define DM_BUFIO_INLINE_VECS		16
+
+/*
+ * Buffer hash
+ */
+#define DM_BUFIO_HASH_SIZE	(PAGE_SIZE / sizeof(struct hlist_head) / 2)
+#define DM_BUFIO_HASH(block)	((block) & (DM_BUFIO_HASH_SIZE - 1))
+
+/*
+ * Don't try to kmalloc blocks larger than this.
+ * For an explanation, see dm_bufio_alloc_buffer_data below.
+ */
+#define DM_BUFIO_BLOCK_SIZE_KMALLOC_LIMIT	PAGE_SIZE
+
+/*
+ * Buffer state bits.
+ */
+#define B_READING	0
+#define B_WRITING	1
+#define B_DIRTY		2
+
+struct dm_bufio_client {
+	/*
+	 * Linking of buffers:
+	 *	all buffers are linked to cache_hash with their hash_list field.
+	 *	clean buffers that are not being written (B_WRITING not set)
+	 *		are linked to lru with their lru_list field.
+	 *	dirty and clean buffers that are being written are linked
+	 *		to dirty_lru with their	lru_list field. When the write
+	 *		finishes, the buffer cannot be immediately relinked
+	 *		(because we are in an interrupt context and relinking
+	 *		requires process context), so some clean-not-writing
+	 *		buffers	can be held on dirty_lru too. They are later
+	 *		added to
+	 *		lru in the process context.
+	 */
+	struct list_head lru;
+	struct list_head dirty_lru;
+	struct mutex lock;
+	struct block_device *bdev;
+	unsigned block_size;
+	unsigned char sectors_per_block_bits;
+	unsigned char pages_per_block_bits;
+
+	unsigned long n_buffers;
+	unsigned threshold_buffers;
+	unsigned limit_buffers;
+
+	struct dm_io_client *dm_io;
+
+	struct dm_buffer *reserved_buffer;
+	struct hlist_head cache_hash[DM_BUFIO_HASH_SIZE];
+	wait_queue_head_t free_buffer_wait;
+
+	int async_write_error;
+};
+
+/*
+ * The method with which the data is allocated:
+ * kmalloc(), __get_free_pages() or vmalloc().
+ * See the comment at dm_bufio_alloc_buffer_data.
+ */
+#define DATA_MODE_KMALLOC		1
+#define DATA_MODE_GET_FREE_PAGES	2
+#define DATA_MODE_VMALLOC		3
+
+struct dm_buffer {
+	struct hlist_node hash_list;
+	struct list_head lru_list;
+	sector_t block;
+	void *data;
+	char data_mode;		/* DATA_MODE_* */
+	unsigned hold_count;
+	int read_error;
+	int write_error;
+	unsigned long state;
+	struct dm_bufio_client *c;
+	struct bio bio;
+	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
+};
+
+/*
+ * Allocating buffer data.
+ *
+ * Small buffers are allocated with kmalloc, to use space optimally.
+ *
+ * Large buffers:
+ * We use get_free_pages or vmalloc, both have their advantages and
+ * disadvantages.
+ * __get_free_pages can randomly fail, if the memory is fragmented.
+ * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
+ *	as low as 128M) --- so using it for caching is not appropriate.
+ * If the allocation may fail, we use __get_free_pages, memory fragmentation
+ *	won't have fatal effect here, it just causes flushes of some other
+ *	buffers and more I/O will be performed.
+ * If the allocation shouldn't fail, we use __vmalloc. This is only for
+ *	the initial reserve allocation, so there's no risk of wasting
+ *	all vmalloc space.
+ */
+
+static void *dm_bufio_alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, char *data_mode)
+{
+	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_KMALLOC_LIMIT) {
+		*data_mode = DATA_MODE_KMALLOC;
+		return kmalloc(c->block_size, gfp_mask);
+	} else if (gfp_mask & __GFP_NORETRY) {
+		*data_mode = DATA_MODE_GET_FREE_PAGES;
+		return (void *)__get_free_pages(gfp_mask, c->pages_per_block_bits);
+	} else {
+		*data_mode = DATA_MODE_VMALLOC;
+		return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+	}
+}
+
+/*
+ * Free buffer's data.
+ */
+
+static void dm_bufio_free_buffer_data(struct dm_bufio_client *c, void *data, char data_mode)
+{
+	switch (data_mode) {
+
+	case DATA_MODE_KMALLOC:
+		kfree(data);
+		break;
+	case DATA_MODE_GET_FREE_PAGES:
+		free_pages((unsigned long)data, c->pages_per_block_bits);
+		break;
+	case DATA_MODE_VMALLOC:
+		vfree(data);
+		break;
+	default:
+		printk(KERN_CRIT "dm_bufio_free_buffer_data: bad data mode: %d", data_mode);
+		BUG();
+
+	}
+}
+
+
+/*
+ * Allocate buffer and its data.
+ */
+
+static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
+{
+	struct dm_buffer *b;
+	b = kmalloc(sizeof(struct dm_buffer), gfp_mask);
+	if (!b)
+		return NULL;
+	b->c = c;
+	b->data = dm_bufio_alloc_buffer_data(c, gfp_mask, &b->data_mode);
+	if (!b->data) {
+		kfree(b);
+		return NULL;
+	}
+	return b;
+}
+
+/*
+ * Free buffer and its data.
+ */
+
+static void free_buffer(struct dm_buffer *b)
+{
+	dm_bufio_free_buffer_data(b->c, b->data, b->data_mode);
+	kfree(b);
+}
+
+
+/*
+ * Link buffer to the hash list and clean or dirty queue.
+ */
+
+static void link_buffer(struct dm_buffer *b, sector_t block, int dirty)
+{
+	struct dm_bufio_client *c = b->c;
+	c->n_buffers++;
+	b->block = block;
+	list_add(&b->lru_list, dirty ? &c->dirty_lru : &c->lru);
+	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+}
+
+/*
+ * Unlink buffer from the hash list and dirty or clean queue.
+ */
+
+static void unlink_buffer(struct dm_buffer *b)
+{
+	BUG_ON(!b->c->n_buffers);
+	b->c->n_buffers--;
+	hlist_del(&b->hash_list);
+	list_del(&b->lru_list);
+}
+
+/*
+ * Place the buffer to the head of dirty or clean LRU queue.
+ */
+
+static void relink_lru(struct dm_buffer *b, int dirty)
+{
+	struct dm_bufio_client *c = b->c;
+	list_del(&b->lru_list);
+	list_add(&b->lru_list, dirty ? &c->dirty_lru : &c->lru);
+}
+
+/*
+ * This function is called when wait_on_bit is actually waiting.
+ * It unplugs the underlying block device, so that coalesced I/Os in
+ * the request queue are dispatched to the device.
+ */
+
+static int do_io_schedule(void *word)
+{
+	struct dm_buffer *b = container_of(word, struct dm_buffer, state);
+	struct dm_bufio_client *c = b->c;
+
+	blk_run_address_space(c->bdev->bd_inode->i_mapping);
+
+	io_schedule();
+
+	return 0;
+}
+
+static void write_dirty_buffer(struct dm_buffer *b);
+
+/*
+ * Wait until any activity on the buffer finishes.
+ * Possibly write the buffer if it is dirty.
+ * When this function finishes, there is no I/O running on the buffer
+ * and the buffer is not dirty.
+ */
+
+static void make_buffer_clean(struct dm_buffer *b)
+{
+	BUG_ON(b->hold_count);
+	if (likely(!b->state))	/* fast case */
+		return;
+	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+	write_dirty_buffer(b);
+	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Find some buffer that is not held by anybody, clean it, unlink it and
+ * return it.
+ * If "wait" is zero, don't try as hard and don't block.
+ */
+
+static struct dm_buffer *get_unclaimed_buffer(struct dm_bufio_client *c, int wait)
+{
+	struct dm_buffer *b;
+	list_for_each_entry_reverse(b, &c->lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_WRITING, &b->state));
+		BUG_ON(test_bit(B_DIRTY, &b->state));
+		if (!b->hold_count) {
+			if (!wait && unlikely(test_bit(B_READING, &b->state)))
+				continue;
+			make_buffer_clean(b);
+			unlink_buffer(b);
+			return b;
+		}
+	}
+	list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		if (!b->hold_count) {
+			if (!wait && (unlikely(test_bit(B_DIRTY, &b->state)) ||
+				      unlikely(test_bit(B_WRITING, &b->state)))) {
+				if (!test_bit(B_WRITING, &b->state))
+					write_dirty_buffer(b);
+				continue;
+			}
+			make_buffer_clean(b);
+			unlink_buffer(b);
+			return b;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Wait until some other thread frees some buffer or releases the hold count
+ * on some buffer.
+ *
+ * This function is entered with c->lock held, drops it and regains it before
+ * exiting.
+ */
+
+static void wait_for_free_buffer(struct dm_bufio_client *c)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&c->free_buffer_wait, &wait);
+	set_task_state(current, TASK_UNINTERRUPTIBLE);
+	mutex_unlock(&c->lock);
+
+	io_schedule();
+
+	set_task_state(current, TASK_RUNNING);
+	remove_wait_queue(&c->free_buffer_wait, &wait);
+
+	mutex_lock(&c->lock);
+}
+
+/*
+ * Allocate a new buffer. If the allocation is not possible, wait until some
+ * other thread frees a buffer.
+ *
+ * May drop the lock and regain it.
+ */
+
+static struct dm_buffer *alloc_buffer_wait(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+
+retry:
+	/*
+	 * dm-bufio is resistant to allocation failures (it just keeps
+	 * one buffer reserved in case all the allocations fail).
+	 * So set flags to not try too hard:
+	 *	GFP_NOIO: don't recurse into the I/O layer
+	 *	__GFP_NOMEMALLOC: don't use emergency reserves
+	 *	__GFP_NORETRY: don't retry and rather return failure
+	 *	__GFP_NOWARN: don't print a warning in case of failure
+	 */
+	b = alloc_buffer(c, GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN);
+	if (b)
+		return b;
+
+	if (c->reserved_buffer) {
+		b = c->reserved_buffer;
+		c->reserved_buffer = NULL;
+		return b;
+	}
+
+	b = get_unclaimed_buffer(c, 1);
+	if (b)
+		return b;
+
+	wait_for_free_buffer(c);
+	goto retry;
+}
+
+/*
+ * Free a buffer and wake other threads waiting for free buffers.
+ */
+
+static void free_buffer_wake(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+
+	if (unlikely(!c->reserved_buffer))
+		c->reserved_buffer = b;
+	else
+		free_buffer(b);
+
+	wake_up(&c->free_buffer_wait);
+
+	cond_resched();
+}
+
+/*
+ * Check if we're over the watermark.
+ * If we are over threshold_buffers, start freeing buffers.
+ * If we're over limit_buffers, block until we get under the limit.
+ */
+
+static void check_watermark(struct dm_bufio_client *c)
+{
+	while (c->n_buffers > c->threshold_buffers) {
+		struct dm_buffer *b;
+		b = get_unclaimed_buffer(c, c->n_buffers > c->limit_buffers);
+		if (!b)
+			return;
+		free_buffer_wake(b);
+	}
+}
+
+static void dm_bufio_dmio_complete(unsigned long error, void *context);
+
+/*
+ * Submit I/O on the buffer.
+ *
+ * Bio interface is faster but it has some problems:
+ *	- the vector list is limited (increasing this limit increases
+ *		memory-consumption per buffer, so it is not viable)
+ *	- the memory must be direct-mapped, not vmallocated
+ *	- the I/O driver can spuriously reject requests if it thinks that
+ *		the requests are too big for the device or if they cross a
+ *		controller-defined memory boundary
+ *
+ * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
+ * it is not vmalloc()ated, try using the bio interface.
+ *
+ * If the buffer is big, if it is vmalloc()ated or if the underlying device
+ * rejects the bio because it is too large, use dmio layer to do the I/O.
+ * dmio layer splits the I/O into multiple requests, solving the above
+ * shortcomings.
+ */
+
+static void dm_bufio_submit_io(struct dm_buffer *b, int rw, sector_t block, bio_end_io_t *end_io)
+{
+	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && b->data_mode != DATA_MODE_VMALLOC) {
+		char *ptr;
+		int len;
+		bio_init(&b->bio);
+		b->bio.bi_io_vec = b->bio_vec;
+		b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
+		b->bio.bi_sector = b->block << b->c->sectors_per_block_bits;
+		b->bio.bi_bdev = b->c->bdev;
+		b->bio.bi_end_io = end_io;
+
+		/*
+		 * we assume that if len >= PAGE_SIZE, ptr is page-aligned,
+		 * if len < PAGE_SIZE, the buffer doesn't cross a page boundary.
+		 */
+		ptr = b->data;
+		len = b->c->block_size;
+		do {
+			if (!bio_add_page(&b->bio, virt_to_page(ptr), len < PAGE_SIZE ? len : PAGE_SIZE, virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
+				BUG_ON(b->c->block_size <= PAGE_SIZE);
+				goto use_dmio;
+			}
+			len -= PAGE_SIZE;
+			ptr += PAGE_SIZE;
+		} while (len > 0);
+		submit_bio(rw, &b->bio);
+	} else
+use_dmio : {
+		int r;
+		struct dm_io_request io_req = {
+			.bi_rw = rw,
+			.notify.fn = dm_bufio_dmio_complete,
+			.notify.context = b,
+			.client = b->c->dm_io,
+		};
+		struct dm_io_region region = {
+			.bdev = b->c->bdev,
+			.sector = b->block << b->c->sectors_per_block_bits,
+			.count = b->c->block_size >> SECTOR_SHIFT,
+		};
+		if (b->data_mode != DATA_MODE_VMALLOC) {
+			io_req.mem.type = DM_IO_KMEM;
+			io_req.mem.ptr.addr = b->data;
+		} else {
+			io_req.mem.type = DM_IO_VMA;
+			io_req.mem.ptr.vma = b->data;
+		}
+		b->bio.bi_end_io = end_io;
+		r = dm_io(&io_req, 1, &region, NULL);
+		if (unlikely(r))
+			end_io(&b->bio, r);
+	}
+}
+
+/*
+ * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
+ * that the request was handled directly with bio interface.
+ */
+
+static void dm_bufio_dmio_complete(unsigned long error, void *context)
+{
+	struct dm_buffer *b = context;
+	int err = 0;
+	if (unlikely(error != 0))
+		err = -EIO;
+	b->bio.bi_end_io(&b->bio, err);
+}
+
+/* Find a buffer in the hash. */
+
+static struct dm_buffer *dm_bufio_find(struct dm_bufio_client *c, sector_t block)
+{
+	struct dm_buffer *b;
+	struct hlist_node *hn;
+	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)], hash_list) {
+		cond_resched();
+		if (b->block == block)
+			return b;
+	}
+
+	return NULL;
+}
+
+static void read_endio(struct bio *bio, int error);
+
+/*
+ * A common routine for dm_bufio_new and dm_bufio_read.
+ * Operation of these functions is very similar, except that dm_bufio_new
+ * doesn't read the buffer from the disk (assuming that the caller overwrites
+ * all the data and uses dm_bufio_mark_buffer_dirty to write new data back).
+ */
+
+static void *dm_bufio_new_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp, int read)
+{
+	struct dm_buffer *b, *new_b = NULL;
+
+	cond_resched();
+	mutex_lock(&c->lock);
+retry_search:
+	b = dm_bufio_find(c, block);
+	if (b) {
+		if (new_b)
+			free_buffer_wake(new_b);
+		b->hold_count++;
+		relink_lru(b, test_bit(B_DIRTY, &b->state) || test_bit(B_WRITING, &b->state));
+unlock_wait_ret:
+		mutex_unlock(&c->lock);
+wait_ret:
+		wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+		if (b->read_error) {
+			int error = b->read_error;
+			dm_bufio_release(b);
+			return ERR_PTR(error);
+		}
+		*bp = b;
+		return b->data;
+	}
+	if (!new_b) {
+		new_b = alloc_buffer_wait(c);
+		goto retry_search;
+	}
+
+	check_watermark(c);
+
+	b = new_b;
+	b->hold_count = 1;
+	b->read_error = 0;
+	b->write_error = 0;
+	link_buffer(b, block, 0);
+
+	if (!read) {
+		b->state = 0;
+		goto unlock_wait_ret;
+	}
+
+	b->state = 1 << B_READING;
+
+	mutex_unlock(&c->lock);
+
+	dm_bufio_submit_io(b, READ, b->block, read_endio);
+
+	goto wait_ret;
+}
+
+/* Read the buffer and hold reference on it */
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+	return dm_bufio_new_read(c, block, bp, 1);
+}
+EXPORT_SYMBOL(dm_bufio_read);
+
+/* Get the buffer with possibly invalid data and hold reference on it */
+
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp)
+{
+	return dm_bufio_new_read(c, block, bp, 0);
+}
+EXPORT_SYMBOL(dm_bufio_new);
+
+/*
+ * The endio routine for reading: set the error, clear the bit and wake up
+ * anyone waiting on the buffer.
+ */
+
+static void read_endio(struct bio *bio, int error)
+{
+	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+	b->read_error = error;
+	BUG_ON(!test_bit(B_READING, &b->state));
+	smp_mb__before_clear_bit();
+	clear_bit(B_READING, &b->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&b->state, B_READING);
+}
+
+/*
+ * Release the reference held on the buffer.
+ */
+
+void dm_bufio_release(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+	mutex_lock(&c->lock);
+	BUG_ON(!b->hold_count);
+	BUG_ON(test_bit(B_READING, &b->state));
+	b->hold_count--;
+	if (!b->hold_count) {
+		wake_up(&c->free_buffer_wait);
+		/*
+		 * If there were errors on the buffer, and the buffer is not
+		 * to be written, free the buffer. There is no point in caching
+		 * invalid buffer.
+		 * an invalid buffer.
+		if ((b->read_error || b->write_error) &&
+		    !test_bit(B_WRITING, &b->state) &&
+		    !test_bit(B_DIRTY, &b->state)) {
+			unlink_buffer(b);
+			free_buffer_wake(b);
+		}
+	}
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_release);
+
+/*
+ * Mark that the data in the buffer were modified and the buffer needs to
+ * be written back.
+ */
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
+{
+	struct dm_bufio_client *c = b->c;
+
+	mutex_lock(&c->lock);
+
+	if (!test_and_set_bit(B_DIRTY, &b->state))
+		relink_lru(b, 1);
+
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_mark_buffer_dirty);
+
+static void write_endio(struct bio *bio, int error);
+
+/*
+ * Initiate a write on a dirty buffer, but don't wait for it.
+ * If the buffer is not dirty, exit.
+ * If there is some previous write going on, wait for it to finish (we can't
+ * have two writes on the same buffer simultaneously).
+ * Finally, submit our write and don't wait on it. We set B_WRITING indicating
+ * that there is a write in progress.
+ */
+
+static void write_dirty_buffer(struct dm_buffer *b)
+{
+	if (!test_bit(B_DIRTY, &b->state))
+		return;
+	clear_bit(B_DIRTY, &b->state);
+	wait_on_bit_lock(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+	dm_bufio_submit_io(b, WRITE, b->block, write_endio);
+}
+
+/*
+ * The endio routine for write.
+ * Set the error, clear B_WRITING bit and wake anyone who was waiting on it.
+ */
+
+static void write_endio(struct bio *bio, int error)
+{
+	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
+	b->write_error = error;
+	if (unlikely(error)) {
+		struct dm_bufio_client *c = b->c;
+		cmpxchg(&c->async_write_error, 0, error);
+	}
+	BUG_ON(!test_bit(B_WRITING, &b->state));
+	smp_mb__before_clear_bit();
+	clear_bit(B_WRITING, &b->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&b->state, B_WRITING);
+}
+
+/*
+ * Write all the dirty buffers asynchronously.
+ */
+
+static void write_dirty_buffers_async(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	list_for_each_entry_reverse(b, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		write_dirty_buffer(b);
+	}
+}
+
+/*
+ * Write all the dirty buffers synchronously.
+ * For performance, it is essential that the buffers are written asynchronously
+ * and simultaneously (so that the block layer can merge the writes) and then
+ * waited upon.
+ *
+ * Finally, we flush hardware disk cache.
+ */
+
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
+{
+	int a, f;
+
+	struct dm_buffer *b, *tmp;
+	mutex_lock(&c->lock);
+	write_dirty_buffers_async(c);
+	mutex_unlock(&c->lock);
+	mutex_lock(&c->lock);
+	list_for_each_entry_safe_reverse(b, tmp, &c->dirty_lru, lru_list) {
+		cond_resched();
+		BUG_ON(test_bit(B_READING, &b->state));
+		if (test_bit(B_WRITING, &b->state)) {
+			b->hold_count++;
+			mutex_unlock(&c->lock);
+			wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+			mutex_lock(&c->lock);
+			b->hold_count--;
+		}
+		if (!test_bit(B_DIRTY, &b->state) && !test_bit(B_WRITING, &b->state))
+			relink_lru(b, 0);
+	}
+	wake_up(&c->free_buffer_wait);
+	mutex_unlock(&c->lock);
+
+	a = xchg(&c->async_write_error, 0);
+	f = dm_bufio_issue_flush(c);
+	if (unlikely(a))
+		return a;
+	return f;
+}
+EXPORT_SYMBOL(dm_bufio_write_dirty_buffers);
+
+/*
+ * Use dm-io to send an empty barrier to flush the device.
+ */
+
+int dm_bufio_issue_flush(struct dm_bufio_client *c)
+{
+	struct dm_io_request io_req = {
+		.bi_rw = WRITE_BARRIER,
+		.mem.type = DM_IO_KMEM,
+		.mem.ptr.bvec = NULL,
+		.client = c->dm_io,
+	};
+	struct dm_io_region io_reg = {
+		.bdev = c->bdev,
+		.sector = 0,
+		.count = 0,
+	};
+	return dm_io(&io_req, 1, &io_reg, NULL);
+}
+EXPORT_SYMBOL(dm_bufio_issue_flush);
+
+/*
+ * Release the buffer and copy it to the new location.
+ *
+ * We first delete any other buffer that may be at that new location.
+ *
+ * Then, we write the buffer to the original location if it was dirty.
+ *
+ * Then, if we are the only one who is holding the buffer, relink the buffer
+ * in the hash queue for the new location.
+ *
+ * If someone else was holding the buffer, we write it to the new
+ * location but do not relink it, because that other user needs to have the buffer
+ * at the same place.
+ */
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
+{
+	struct dm_bufio_client *c = b->c;
+	struct dm_buffer *underlying;
+
+	mutex_lock(&c->lock);
+
+retry:
+	underlying = dm_bufio_find(c, new_block);
+	if (unlikely(underlying != NULL)) {
+		if (underlying->hold_count) {
+			wait_for_free_buffer(c);
+			goto retry;
+		}
+		make_buffer_clean(underlying);
+		unlink_buffer(underlying);
+		free_buffer_wake(underlying);
+	}
+
+	BUG_ON(!b->hold_count);
+	BUG_ON(test_bit(B_READING, &b->state));
+	write_dirty_buffer(b);
+	if (b->hold_count == 1) {
+		wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+		set_bit(B_DIRTY, &b->state);
+		unlink_buffer(b);
+		link_buffer(b, new_block, 1);
+	} else {
+		wait_on_bit_lock(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+		dm_bufio_submit_io(b, WRITE, new_block, write_endio);
+		wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+	}
+	mutex_unlock(&c->lock);
+	dm_bufio_release(b);
+}
+EXPORT_SYMBOL(dm_bufio_release_move);
+
+/*
+ * Free all the buffers (and possibly write them if they were dirty)
+ * It is required that the calling thread doesn't have any reference on
+ * any buffer.
+ */
+
+void dm_bufio_drop_buffers(struct dm_bufio_client *c)
+{
+	struct dm_buffer *b;
+	mutex_lock(&c->lock);
+	write_dirty_buffers_async(c);
+	while ((b = get_unclaimed_buffer(c, 1)))
+		free_buffer_wake(b);
+	BUG_ON(!list_empty(&c->lru));
+	BUG_ON(!list_empty(&c->dirty_lru));
+	mutex_unlock(&c->lock);
+}
+EXPORT_SYMBOL(dm_bufio_drop_buffers);
+
+/* Create the buffering interface */
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, unsigned flags, __u64 cache_threshold, __u64 cache_limit)
+{
+	int r;
+	struct dm_bufio_client *c;
+	unsigned i;
+
+	BUG_ON(block_size < 1 << SECTOR_SHIFT || (block_size & (block_size - 1)));
+
+	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	if (!c) {
+		r = -ENOMEM;
+		goto bad_client;
+	}
+
+	c->bdev = bdev;
+	c->block_size = block_size;
+	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
+	c->pages_per_block_bits = ffs(block_size) - 1 >= PAGE_SHIFT ? ffs(block_size) - 1 - PAGE_SHIFT : 0;
+	INIT_LIST_HEAD(&c->lru);
+	INIT_LIST_HEAD(&c->dirty_lru);
+	for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&c->cache_hash[i]);
+	mutex_init(&c->lock);
+	c->n_buffers = 0;
+
+	if (!cache_limit)
+		cache_limit = DM_BUFIO_LIMIT_MEMORY;
+	c->limit_buffers = cache_limit >> (c->sectors_per_block_bits + SECTOR_SHIFT);
+	if (!c->limit_buffers)
+		c->limit_buffers = 1;
+
+	if (!cache_threshold)
+		cache_threshold = DM_BUFIO_THRESHOLD_MEMORY;
+	if (cache_threshold > cache_limit)
+		cache_threshold = cache_limit;
+	c->threshold_buffers = cache_threshold >> (c->sectors_per_block_bits + SECTOR_SHIFT);
+	if (!c->threshold_buffers)
+		c->threshold_buffers = 1;
+
+	/*printk("%d %d\n", c->limit_buffers, c->threshold_buffers);*/
+
+	init_waitqueue_head(&c->free_buffer_wait);
+	c->async_write_error = 0;
+
+	/* The number of pages is not really a hard limit, just the mempool size */
+	c->dm_io = dm_io_client_create((block_size + PAGE_SIZE - 1) / PAGE_SIZE);
+	if (IS_ERR(c->dm_io)) {
+		r = PTR_ERR(c->dm_io);
+		goto bad_dm_io;
+	}
+
+	c->reserved_buffer = alloc_buffer(c, GFP_KERNEL);
+	if (!c->reserved_buffer) {
+		r = -ENOMEM;
+		goto bad_buffer;
+	}
+
+	return c;
+
+bad_buffer:
+	dm_io_client_destroy(c->dm_io);
+bad_dm_io:
+	kfree(c);
+bad_client:
+	return ERR_PTR(r);
+}
+EXPORT_SYMBOL(dm_bufio_client_create);
+
+/*
+ * Free the buffering interface.
+ * It is required that there are no references on any buffers.
+ */
+
+void dm_bufio_client_destroy(struct dm_bufio_client *c)
+{
+	unsigned i;
+	dm_bufio_drop_buffers(c);
+	for (i = 0; i < DM_BUFIO_HASH_SIZE; i++)
+		BUG_ON(!hlist_empty(&c->cache_hash[i]));
+	BUG_ON(!c->reserved_buffer);
+	free_buffer(c->reserved_buffer);
+	BUG_ON(c->n_buffers != 0);
+	dm_io_client_destroy(c->dm_io);
+	kfree(c);
+}
+EXPORT_SYMBOL(dm_bufio_client_destroy);
Index: linux-2.6.32/drivers/md/dm-bufio.h
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-bufio.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_BUFIO_H
+#define DM_BUFIO_H
+
+struct dm_bufio_client;
+struct dm_buffer;
+
+void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, struct dm_buffer **bp);
+void dm_bufio_release(struct dm_buffer *b);
+
+void dm_bufio_mark_buffer_dirty(struct dm_buffer *b);
+int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c);
+int dm_bufio_issue_flush(struct dm_bufio_client *c);
+
+void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block);
+
+struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, unsigned flags, __u64 cache_threshold, __u64 cache_limit);
+void dm_bufio_client_destroy(struct dm_bufio_client *c);
+void dm_bufio_drop_buffers(struct dm_bufio_client *c);
+
+#endif
Index: linux-2.6.32/drivers/md/dm-multisnap-alloc.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-alloc.c
@@ -0,0 +1,576 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+#define rshift_roundup(val, bits)	(((val) + ((chunk_t)1 << (bits)) - 1) >> (bits))
+
+#define BITS_PER_BYTE_SHIFT	3
+#define BYTES_PER_POINTER_SHIFT	3
+
+/*
+ * Initialize the bitmaps, writing them starting at the position writing_block.
+ */
+
+void dm_multisnap_create_bitmaps(struct dm_exception_store *s, chunk_t writing_block)
+{
+	int r;
+	struct dm_buffer *bp;
+	chunk_t direct_bitmap_blocks, total_bitmap_blocks, total_preallocated_blocks;
+	chunk_t lower_depth_block;
+	unsigned i, d;
+	chunk_t ii;
+
+	r = dm_multisnap_bitmap_depth(s->chunk_shift, s->dev_size);
+	if (r < 0) {
+		DMERR("dm_multisnap_create_bitmaps: device is too large");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+	s->bitmap_depth = r;
+
+	direct_bitmap_blocks = rshift_roundup(s->dev_size, s->chunk_shift + BITS_PER_BYTE_SHIFT);
+
+	if (direct_bitmap_blocks > CB_BITMAP_IDX_MAX) {
+		DMERR("dm_multisnap_create_bitmaps: device is too large");
+		dm_multisnap_set_error(s->dm, -ERANGE);
+		return;
+	}
+
+	total_bitmap_blocks = 0;
+	for (i = 0; i <= s->bitmap_depth; i++) {
+		unsigned shift = (s->chunk_shift - BYTES_PER_POINTER_SHIFT) * i;
+		total_bitmap_blocks += rshift_roundup(direct_bitmap_blocks, shift);
+	}
+	total_preallocated_blocks = writing_block + total_bitmap_blocks;
+	for (ii = 0; ii < total_preallocated_blocks; ii++) {
+		if (dm_multisnap_is_commit_block(s, ii))
+			total_preallocated_blocks++;
+	}
+
+	if (total_preallocated_blocks >= s->dev_size) {
+		DMERR("dm_multisnap_create_bitmaps: device is too small");
+		dm_multisnap_set_error(s->dm, -ENOSPC);
+		return;
+	}
+
+/* Write direct bitmap blocks */
+
+	lower_depth_block = writing_block;
+	for (ii = 0; ii < direct_bitmap_blocks; ii++, writing_block++) {
+		void *bmp;
+		while (dm_multisnap_is_commit_block(s, writing_block))
+			writing_block++;
+		bmp = dm_bufio_new(s->bufio, writing_block, &bp);
+		if (IS_ERR(bmp)) {
+			DMERR("dm_multisnap_create_bitmaps: can't create direct bitmap block at %llx", (unsigned long long)writing_block);
+			dm_multisnap_set_error(s->dm, PTR_ERR(bmp));
+			return;
+		}
+		cond_resched();
+		memset(bmp, 0, s->chunk_size);
+		cond_resched();
+		for (i = 0; i < s->chunk_size << BITS_PER_BYTE_SHIFT; i++) {
+			chunk_t block_to_test = (ii << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | i;
+			if (block_to_test >= s->dev_size) {
+				generic___set_le_bit(i, bmp);
+			} else if (block_to_test < total_preallocated_blocks || dm_multisnap_is_commit_block(s, block_to_test)) {
+				generic___set_le_bit(i, bmp);
+				dm_multisnap_status_lock(s->dm);
+				s->total_allocated++;
+				dm_multisnap_status_unlock(s->dm);
+			}
+			cond_resched();
+		}
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+	}
+
+/* Write indirect bitmap blocks */
+
+	for (d = 1; d <= s->bitmap_depth; d++) {
+		chunk_t this_depth_block = writing_block;
+		for (ii = 0; ii < rshift_roundup(direct_bitmap_blocks, d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)); ii++, writing_block++) {
+			__u64 *bmp;
+			while (dm_multisnap_is_commit_block(s, writing_block))
+				writing_block++;
+			bmp = dm_bufio_new(s->bufio, writing_block, &bp);
+			if (IS_ERR(bmp)) {
+				DMERR("dm_multisnap_create_bitmaps: can't create indirect bitmap block at %llx", (unsigned long long)writing_block);
+				dm_multisnap_set_error(s->dm, PTR_ERR(bmp));
+				return;
+			}
+			for (i = 0; i < s->chunk_size >> BYTES_PER_POINTER_SHIFT; i++) {
+				if (((ii << d * (s->chunk_shift - BYTES_PER_POINTER_SHIFT)) | (i << (d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) >= direct_bitmap_blocks) {
+					bmp[i] = cpu_to_le64(0);
+					continue;
+				}
+				while (dm_multisnap_is_commit_block(s, lower_depth_block))
+					lower_depth_block++;
+				bmp[i] = cpu_to_le64(lower_depth_block);
+				lower_depth_block++;
+			}
+			dm_bufio_mark_buffer_dirty(bp);
+			dm_bufio_release(bp);
+		}
+		lower_depth_block = this_depth_block;
+	}
+
+	s->bitmap_root = writing_block - 1;
+}
+
+static void dm_multisnap_add_bitmap(struct dm_exception_store *s);
+
+void dm_multisnap_extend_bitmaps(struct dm_exception_store *s, chunk_t new_size)
+{
+	while (s->dev_size < new_size) {
+		struct dm_buffer *bp;
+		void *bmp;
+		bitmap_t bitmap_no = s->dev_size >> (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+		unsigned i = s->dev_size & ((1 << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) - 1);
+		chunk_t c = s->dev_size;
+		if (!i) {
+			dm_multisnap_add_bitmap(s);
+			if (unlikely(dm_multisnap_has_error(s->dm)))
+				return;
+		}
+		bmp = dm_multisnap_map_bitmap(s, bitmap_no, &bp, NULL, NULL);
+		if (unlikely(!bmp))
+			return;
+		for (; i < s->chunk_size << BITS_PER_BYTE_SHIFT; i++, c++) {
+			if (unlikely(dm_multisnap_is_commit_block(s, c)))
+				generic___set_le_bit(i, bmp);
+			else
+				generic___clear_le_bit(i, bmp);
+		}
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+
+		s->dev_size = ((chunk_t)bitmap_no + 1) << (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+		if (s->dev_size > new_size)
+			s->dev_size = new_size;
+	}
+}
+
+static void dm_multisnap_add_bitmap(struct dm_exception_store *s)
+{
+	struct path_element path[MAX_BITMAP_DEPTH];
+	struct dm_buffer *bp;
+	int d;
+	__u64 *bmpp;
+	unsigned i;
+	chunk_t c, bitmap_blk, new_blk;
+	bitmap_t bitmap_no = s->dev_size >> (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+	void *bmp = dm_multisnap_alloc_make_block(s, &bitmap_blk, &bp);
+	if (!bmp)
+		return;
+	c = (chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+	for (i = 0; i < s->chunk_size << BITS_PER_BYTE_SHIFT; i++, c++) {
+		if (unlikely(dm_multisnap_is_commit_block(s, c)))
+			generic___set_le_bit(i, bmp);
+		else
+			generic___clear_le_bit(i, bmp);
+	}
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	/* just get the path to the last block */
+	bmp = dm_multisnap_map_bitmap(s, bitmap_no - 1, &bp, NULL, path);
+	if (unlikely(!bmp))
+		return;
+	dm_bufio_release(bp);
+
+	for (d = s->bitmap_depth - 1; d >= 0; d--) {
+		if (path[d].idx + 1 < path[d].n_entries) {
+			__u64 *bmpp = dm_multisnap_read_block(s, path[d].block, &bp);
+			if (!bmpp)
+				return;
+			bmpp[path[d].idx + 1] = cpu_to_le64(bitmap_blk);
+			dm_bufio_mark_buffer_dirty(bp);
+			dm_bufio_release(bp);
+			return;
+		} else {
+			bmpp = dm_multisnap_alloc_make_block(s, &new_blk, &bp);
+			if (!bmpp)
+				return;
+			memset(bmpp, 0, s->chunk_size);
+			bmpp[0] = cpu_to_le64(bitmap_blk);
+			dm_bufio_mark_buffer_dirty(bp);
+			dm_bufio_release(bp);
+			bitmap_blk = new_blk;
+		}
+	}
+
+	/* make new root */
+	bmpp = dm_multisnap_alloc_make_block(s, &new_blk, &bp);
+	if (!bmpp)
+		return;
+	memset(bmpp, 0, s->chunk_size);
+	bmpp[0] = cpu_to_le64(s->bitmap_root);
+	bmpp[1] = cpu_to_le64(bitmap_blk);
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	s->bitmap_root = new_blk;
+	s->bitmap_depth++;
+}
+
+/*
+ * Read a leaf bitmap node with index "bitmap".
+ * Return a pointer to the data and store the held buffer in *bp.
+ * Return the block in block and path in path.
+ */
+
+void *dm_multisnap_map_bitmap(struct dm_exception_store *s, bitmap_t bitmap, struct dm_buffer **bp, chunk_t *block, struct path_element *path)
+{
+	__u64 *bmp;
+	unsigned idx;
+	unsigned d = s->bitmap_depth;
+	chunk_t blk = s->bitmap_root;
+	chunk_t parent = 0;
+
+	while (1) {
+		bmp = dm_multisnap_read_block(s, blk, bp);
+		if (unlikely(!bmp)) {
+			DMERR("dm_multisnap_map_bitmap: can't read bitmap at %llx (%llx), pointed to by %llx (%llx), depth %d/%d, index %llx",
+				(unsigned long long)blk,
+				(unsigned long long)dm_multisnap_remap_block(s, blk),
+				(unsigned long long)parent,
+				(unsigned long long)dm_multisnap_remap_block(s, parent),
+				s->bitmap_depth - d,
+				s->bitmap_depth,
+				(unsigned long long)bitmap);
+			return NULL;
+		}
+		if (!d) {
+			if (block)
+				*block = blk;
+			return bmp;
+		}
+
+		idx = (bitmap >> ((d - 1) * (s->chunk_shift - BYTES_PER_POINTER_SHIFT))) & ((s->chunk_size - 1) >> BYTES_PER_POINTER_SHIFT);
+
+		if (unlikely(path != NULL)) {
+			path[s->bitmap_depth - d].block = blk;
+			path[s->bitmap_depth - d].idx = idx;
+			path[s->bitmap_depth - d].n_entries = s->chunk_size >> BYTES_PER_POINTER_SHIFT;
+		}
+
+		parent = blk;
+		blk = le64_to_cpu(bmp[idx]);
+
+		dm_bufio_release(*bp);
+
+		d--;
+	}
+}
+
+/*
+ * Find a free bit from "start" to "end" (in bits).
+ * If wide_search is nonzero, search for the whole free byte first.
+ */
+
+static int find_bit(const void *bmp, unsigned start, unsigned end, int wide_search)
+{
+	const void *p;
+	unsigned bit;
+	if (unlikely(start >= end))
+		return -ENOSPC;
+	cond_resched();
+	if (likely(!generic_test_le_bit(start, bmp)))
+		return start;
+	if (likely(wide_search)) {
+		cond_resched();
+		p = memchr(bmp + (start >> 3), 0, (end >> 3) - (start >> 3));
+		cond_resched();
+		if (p) {
+			bit = (((const __u8 *)p - (const __u8 *)bmp) << 3) | (start & 7);
+			while (bit > start && !generic_test_le_bit(bit - 1, bmp))
+				bit--;
+			goto ret_bit;
+		}
+	}
+	bit = generic_find_next_zero_le_bit(bmp, end, start);
+	cond_resched();
+
+ret_bit:
+	if (unlikely(bit >= end))
+		return -ENOSPC;
+	return bit;
+}
+
+static unsigned bitmap_limit(struct dm_exception_store *s, bitmap_t bmp)
+{
+	if (bmp == (bitmap_t)(s->dev_size >> (s->chunk_shift + BITS_PER_BYTE_SHIFT)))
+		return (unsigned)s->dev_size & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1);
+	return s->chunk_size << BITS_PER_BYTE_SHIFT;
+}
+
+int dm_multisnap_alloc_blocks(struct dm_exception_store *s, chunk_t *results, unsigned n_blocks, int flags)
+{
+	void *bmp;
+	struct dm_buffer *bp;
+	chunk_t block;
+	int wrap_around = 0;
+	int start_bit;
+	int wide_search;
+	int i;
+	bitmap_t bitmap_no;
+	int c;
+	int bit;
+	chunk_t to_free = 0;
+
+	bitmap_no = s->alloc_rover >> (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+next_bitmap:
+	bmp = dm_multisnap_map_bitmap(s, bitmap_no, &bp, &block, NULL);
+	if (unlikely(!bmp))
+		return -1;
+
+	wide_search = 1;
+find_again:
+	start_bit = s->alloc_rover & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1);
+
+	for (i = 0; i < n_blocks; i++) {
+find_another_bit:
+		bit = find_bit(bmp, start_bit, bitmap_limit(s, bitmap_no), wide_search);
+		if (unlikely(bit < 0)) {
+bit_find_failed:
+			if (wide_search) {
+				wide_search = 0;
+				goto find_again;
+			}
+			dm_bufio_release(bp);
+			s->alloc_rover = (chunk_t) ++bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+			if (unlikely(s->alloc_rover >= s->dev_size)) {
+				s->alloc_rover = 0;
+				bitmap_no = 0;
+				wrap_around++;
+				if (wrap_around >= 2) {
+					DMERR("snapshot overflow");
+					dm_multisnap_set_error(s->dm, -ENOSPC);
+					return -1;
+				}
+			}
+			goto next_bitmap;
+		}
+		results[i] = ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit;
+		start_bit = bit + 1;
+		dm_bufio_release(bp);
+
+		c = dm_multisnap_check_allocated_block(s, results[i]);
+		if (dm_multisnap_has_error(s->dm))
+			return -1;
+
+		bmp = dm_multisnap_read_block(s, block, &bp);
+		if (unlikely(!bmp))
+			return -1;
+
+		if (c)
+			goto find_another_bit;
+	}
+
+	if (flags & ALLOC_DRY)
+		goto bp_release_return;
+
+	if (!dm_multisnap_block_is_uncommitted(s, block)) {
+		chunk_t new_block;
+find_another_bit_for_bitmap:
+		bit = find_bit(bmp, start_bit, bitmap_limit(s, bitmap_no), wide_search);
+		if (unlikely(bit < 0))
+			goto bit_find_failed;
+
+		new_block = ((chunk_t)bitmap_no << (s->chunk_shift + BITS_PER_BYTE_SHIFT)) | bit;
+		start_bit = bit + 1;
+
+		dm_bufio_release(bp);
+		c = dm_multisnap_check_allocated_block(s, new_block);
+		if (dm_multisnap_has_error(s->dm))
+			return -1;
+
+		bmp = dm_multisnap_read_block(s, block, &bp);
+		if (unlikely(!bmp))
+			return -1;
+
+		if (c)
+			goto find_another_bit_for_bitmap;
+
+		/*
+		 * Warning: record the address of a block to free in a special
+		 * variable.
+		 *
+		 * If we freed it here, that could recurse back to
+		 * dm_multisnap_alloc_blocks and corrupt allocations. Free it
+		 * later when we are done with the allocation and all the
+		 * allocated blocks are marked in the bitmap.
+		 */
+		bmp = dm_multisnap_duplicate_block(s, block, new_block, bitmap_no, &bp, &to_free);
+		if (unlikely(!bmp))
+			return -1;
+
+		generic___set_le_bit(bit, bmp);
+		dm_multisnap_status_lock(s->dm);
+		s->total_allocated++;
+		dm_multisnap_status_unlock(s->dm);
+	}
+
+	for (i = 0; i < n_blocks; i++)
+		generic___set_le_bit(results[i] & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp);
+	dm_multisnap_status_lock(s->dm);
+	s->total_allocated += n_blocks;
+	dm_multisnap_status_unlock(s->dm);
+
+	dm_bufio_mark_buffer_dirty(bp);
+
+bp_release_return:
+	dm_bufio_release(bp);
+
+	s->alloc_rover = (s->alloc_rover & ~(chunk_t)((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1)) + start_bit;
+	if (unlikely(s->alloc_rover >= s->dev_size))
+		s->alloc_rover = 0;
+
+	if (unlikely(to_free != 0))
+		dm_multisnap_free_block(s, to_free, 0);
+
+	return 0;
+}
+
+void *dm_multisnap_alloc_duplicate_block(struct dm_exception_store *s, chunk_t block, struct dm_buffer **bp, void *ptr)
+{
+	int r;
+	chunk_t new_chunk;
+	void *data;
+
+	if (dm_multisnap_block_is_uncommitted(s, block))
+		return ptr;
+
+	dm_bufio_release(*bp);
+
+	r = dm_multisnap_alloc_blocks(s, &new_chunk, 1, 0);
+	if (r)
+		return NULL;
+
+	data = dm_multisnap_read_block(s, block, bp);
+	if (!data)
+		return NULL;
+
+	return dm_multisnap_duplicate_block(s, block, new_chunk, CB_BITMAP_IDX_NONE, bp, NULL);
+}
+
+void *dm_multisnap_alloc_make_block(struct dm_exception_store *s, chunk_t *result, struct dm_buffer **bp)
+{
+	int r = dm_multisnap_alloc_blocks(s, result, 1, 0);
+	if (unlikely(r < 0))
+		return NULL;
+
+	return dm_multisnap_make_block(s, *result, bp);
+}
+
+void dm_multisnap_free_blocks_immediate(struct dm_exception_store *s, chunk_t block, unsigned n_blocks)
+{
+	void *bmp;
+	struct dm_buffer *bp;
+
+	if (!n_blocks)
+		return;
+
+	if (unlikely(block + n_blocks > s->dev_size)) {
+		DMERR("dm_multisnap_free_blocks_immediate: freeing invalid blocks %llx, %x", (unsigned long long)block, n_blocks);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	if (block + n_blocks == s->alloc_rover)
+		s->alloc_rover = block;
+
+	do {
+		bitmap_t bitmap_no = block >> (s->chunk_shift + BITS_PER_BYTE_SHIFT);
+
+		bmp = dm_multisnap_map_bitmap(s, bitmap_no, &bp, NULL, NULL);
+		if (!bmp)
+			return;
+
+		do {
+			generic___clear_le_bit(block & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1), bmp);
+			dm_multisnap_status_lock(s->dm);
+			s->total_allocated--;
+			dm_multisnap_status_unlock(s->dm);
+			n_blocks--;
+			block++;
+			cond_resched();
+		} while (n_blocks && (block & ((s->chunk_size << BITS_PER_BYTE_SHIFT) - 1)));
+
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+	} while (unlikely(n_blocks != 0));
+}
+
+
+void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_exception_store *s, struct tmp_remap *tmp_remap)
+{
+	chunk_t block;
+	struct dm_buffer *bp;
+	__u64 *new_block;
+	struct path_element path[MAX_BITMAP_DEPTH];
+	int results_ptr;
+
+	chunk_t new_blockn;
+	int i;
+
+	/*
+	 * Preallocate twice the required amount of blocks, so that resolving
+	 * the next tmp_remap (created here, in dm_multisnap_alloc_blocks)
+	 * doesn't have to allocate anything.
+	 */
+	if (s->n_preallocated_blocks < s->bitmap_depth) {
+		if (unlikely(dm_multisnap_alloc_blocks(s, s->preallocated_blocks + s->n_preallocated_blocks, s->bitmap_depth * 2 - s->n_preallocated_blocks, 0) < 0))
+			return;
+		s->n_preallocated_blocks = s->bitmap_depth * 2;
+	}
+	results_ptr = 0;
+
+	new_block = dm_multisnap_map_bitmap(s, tmp_remap->bitmap_idx, &bp, &block, path);
+	if (unlikely(!new_block))
+		return;
+
+	dm_bufio_release(bp);
+
+	new_blockn = tmp_remap->new;
+	for (i = s->bitmap_depth - 1; i >= 0; i--) {
+		chunk_t block_to_free;
+		int remapped = 0;
+		__u64 *bmp = dm_multisnap_read_block(s, path[i].block, &bp);
+		if (unlikely(!bmp))
+			return;
+
+		if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) {
+			remapped = 1;
+			dm_bufio_release_move(bp, s->preallocated_blocks[results_ptr]);
+			bmp = dm_multisnap_read_block(s, s->preallocated_blocks[results_ptr], &bp);
+			if (!bmp)
+				return;
+			/* !!! TODO: add to a list of newly allocated blocks */
+		}
+
+		block_to_free = le64_to_cpu(bmp[path[i].idx]);
+		bmp[path[i].idx] = cpu_to_le64(new_blockn);
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+
+		dm_multisnap_free_block(s, block_to_free, 0);
+
+		if (!remapped)
+			goto skip_it;
+		new_blockn = s->preallocated_blocks[results_ptr];
+		results_ptr++;
+	}
+
+	dm_multisnap_free_block(s, s->bitmap_root, 0);
+	s->bitmap_root = new_blockn;
+
+skip_it:
+	memmove(s->preallocated_blocks, s->preallocated_blocks + results_ptr, (s->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t));
+}
Index: linux-2.6.32/drivers/md/dm-multisnap-blocks.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-blocks.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+/*
+ * Check that the block is valid.
+ */
+static int check_invalid(struct dm_exception_store *s, chunk_t block)
+{
+	if (unlikely(block >= s->dev_size) ||
+	    unlikely(block == SB_BLOCK) ||
+	    unlikely(dm_multisnap_is_commit_block(s, block))) {
+		DMERR("check_invalid: access to invalid part of the device: %llx, size %llx", (unsigned long long)block, (unsigned long long)s->dev_size);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return 1;
+	}
+	return 0;
+}
+
+static struct tmp_remap *find_tmp_remap(struct dm_exception_store *s, chunk_t block)
+{
+	struct tmp_remap *t;
+	struct hlist_node *hn;
+	unsigned hash = TMP_REMAP_HASH(block);
+	hlist_for_each_entry(t, hn, &s->tmp_remap[hash], hash_list) {
+		if (t->old == block)
+			return t;
+		cond_resched();
+	}
+	return NULL;
+}
+
+chunk_t dm_multisnap_remap_block(struct dm_exception_store *s, chunk_t block)
+{
+	struct tmp_remap *t;
+	t = find_tmp_remap(s, block);
+	if (t)
+		return t->new;
+	return block;
+}
+
+void *dm_multisnap_read_block(struct dm_exception_store *s, chunk_t block, struct dm_buffer **bp)
+{
+	void *buf;
+	cond_resched();
+
+	if (check_invalid(s, block))
+		return NULL;
+
+	block = dm_multisnap_remap_block(s, block);
+
+	if (check_invalid(s, block))
+		return NULL;
+
+	buf = dm_bufio_read(s->bufio, block, bp);
+	if (unlikely(IS_ERR(buf))) {
+		DMERR("dm_multisnap_read_block: error reading chunk %llx", (unsigned long long)block);
+		dm_multisnap_set_error(s->dm, PTR_ERR(buf));
+		return NULL;
+	}
+	return buf;
+}
+
+int dm_multisnap_block_is_uncommitted(struct dm_exception_store *s, chunk_t chunk)
+{
+	struct tmp_remap *t;
+	check_invalid(s, chunk);
+	t = find_tmp_remap(s, chunk);
+	return t && t->uncommitted;
+}
+
+void *dm_multisnap_duplicate_block(struct dm_exception_store *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp, chunk_t *to_free_ptr)
+{
+	chunk_t to_free_val;
+	void *buf;
+	struct tmp_remap *t;
+
+	if (unlikely(check_invalid(s, old_chunk)) ||
+	    unlikely(check_invalid(s, new_chunk)))
+		return NULL;
+
+	if (!to_free_ptr)
+		to_free_ptr = &to_free_val;
+	*to_free_ptr = 0;
+
+	t = find_tmp_remap(s, old_chunk);
+	if (t) {
+		if (unlikely(t->bitmap_idx != bitmap_idx)) {
+			DMERR("dm_multisnap_duplicate_block: bitmap_idx doesn't match, %X != %X", t->bitmap_idx, bitmap_idx);
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return NULL;
+		}
+		*to_free_ptr = t->new;
+		t->new = new_chunk;
+	} else {
+		if (unlikely(list_empty(&s->free_tmp_remaps))) {
+			DMERR("dm_multisnap_duplicate_block: all remap blocks used");
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return NULL;
+		}
+		t = list_first_entry(&s->free_tmp_remaps, struct tmp_remap, list);
+		t->new = new_chunk;
+		t->old = old_chunk;
+		t->bitmap_idx = bitmap_idx;
+		hlist_add_head(&t->hash_list, &s->tmp_remap[TMP_REMAP_HASH(old_chunk)]);
+		s->n_used_tmp_remaps++;
+	}
+	list_del(&t->list);
+	if (bitmap_idx == CB_BITMAP_IDX_NONE)
+		list_add_tail(&t->list, &s->used_bt_tmp_remaps);
+	else
+		list_add_tail(&t->list, &s->used_bitmap_tmp_remaps);
+	t->uncommitted = 1;
+	dm_bufio_release_move(*bp, new_chunk);
+
+	if (to_free_ptr == &to_free_val && to_free_val)
+		dm_multisnap_free_block(s, to_free_val, 0);
+
+	buf = dm_bufio_read(s->bufio, new_chunk, bp);
+	if (IS_ERR(buf)) {
+		DMERR("dm_multisnap_duplicate_block: error reading chunk %llx", (unsigned long long)new_chunk);
+		dm_multisnap_set_error(s->dm, PTR_ERR(buf));
+		return NULL;
+	}
+	return buf;
+}
+
+void dm_multisnap_free_tmp_remap(struct dm_exception_store *s, struct tmp_remap *t)
+{
+	list_del(&t->list);
+	hlist_del(&t->hash_list);
+	s->n_used_tmp_remaps--;
+	list_add(&t->list, &s->free_tmp_remaps);
+}
+
+void *dm_multisnap_make_block(struct dm_exception_store *s, chunk_t new_chunk, struct dm_buffer **bp)
+{
+	void *buf;
+
+	if (unlikely(check_invalid(s, new_chunk)))
+		return NULL;
+
+	/* !!! TODO: add it to the list of recently allocated blocks */
+
+	buf = dm_bufio_new(s->bufio, new_chunk, bp);
+	if (unlikely(IS_ERR(buf))) {
+		DMERR("dm_multisnap_make_block: error creating new block at chunk %llx", (unsigned long long)new_chunk);
+		dm_multisnap_set_error(s->dm, PTR_ERR(buf));
+		return NULL;
+	}
+	return buf;
+}
+
+void dm_multisnap_free_block_and_duplicates(struct dm_exception_store *s, chunk_t chunk)
+{
+	struct tmp_remap *t;
+
+	if (unlikely(check_invalid(s, chunk)))
+		return;
+
+	t = find_tmp_remap(s, chunk);
+	if (t) {
+		dm_multisnap_free_block(s, t->new, 0);
+		dm_multisnap_free_tmp_remap(s, t);
+	}
+	dm_multisnap_free_block(s, chunk, 0);
+}
+
+int dm_multisnap_is_commit_block(struct dm_exception_store *s, chunk_t block)
+{
+	if (unlikely(block < FIRST_CB_BLOCK))
+		return 0;
+	if (likely(!(s->cb_stride & (s->cb_stride - 1))))
+		return (block & (s->cb_stride - 1)) == (FIRST_CB_BLOCK & (s->cb_stride - 1));
+	else
+		return sector_div(block, s->cb_stride) == FIRST_CB_BLOCK % s->cb_stride;
+}
+
+void dm_multisnap_init_stop_cycles(stop_cycles_t *cy)
+{
+	(*cy)[1] = 0;
+}
+
+int dm_multisnap_stop_cycles(struct dm_exception_store *s, stop_cycles_t *cy, chunk_t key)
+{
+	if (unlikely((*cy)[0] == key) && unlikely((*cy)[1] != 0)) {
+		DMERR("dm_multisnap_stop_cycles: cycle detected at chunk %llx", (unsigned long long)key);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return -1;
+	}
+	/* remember the key at power-of-two call counts (Brent-style cycle detection) */
+	if (!((*cy)[1] & ((*cy)[1] - 1)))
+		(*cy)[0] = key;
+	(*cy)[1]++;
+	return 0;
+}
Index: linux-2.6.32/drivers/md/dm-multisnap-btree.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-btree.c
@@ -0,0 +1,798 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+/*
+ * Read one btree node and do basic consistency checks.
+ * Any btree access should be done with this function.
+ */
+
+static struct dm_multisnap_bt_node *dm_multisnap_read_btnode(struct dm_exception_store *s, int depth, chunk_t block, unsigned want_entries, struct dm_buffer **bp)
+{
+	struct dm_multisnap_bt_node *node;
+
+	BUG_ON((unsigned)depth >= s->bt_depth);
+
+	node = dm_multisnap_read_block(s, block, bp);
+	if (unlikely(!node))
+		return NULL;
+
+	if (unlikely(node->signature != BT_SIGNATURE)) {
+		dm_bufio_release(*bp);
+		DMERR("dm_multisnap_read_btnode: bad signature on btree node %llx", (unsigned long long)block);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return NULL;
+	}
+
+	if (unlikely((unsigned)(le32_to_cpu(node->n_entries) - 1) >= s->btree_entries) ||
+	    (want_entries && unlikely(le32_to_cpu(node->n_entries) != want_entries))) {
+		dm_bufio_release(*bp);
+		DMERR("dm_multisnap_read_btnode: bad number of entries in btree node %llx: %x, wanted %x", (unsigned long long)block, le32_to_cpu(node->n_entries), want_entries);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return NULL;
+	}
+
+	return node;
+}
+
+/*
+ * I know they hate inline, but I still maintain the point that it is
+ * appropriate here.
+ */
+
+static inline void write_orig_chunk(struct dm_multisnap_bt_entry *be, chunk_t n)
+{
+	write_48(be, orig_chunk, n);
+	if (sizeof(chunk_t) == 4 && unlikely(n > CHUNK_T_MAX))
+		be->orig_chunk2 = cpu_to_le16(0xffff);
+}
+
+/*
+ * Add an entry (key, new_chunk) at the given index in the btree node,
+ * moving the following entries out of the way to make room.
+ */
+
+static void add_at_idx(struct dm_multisnap_bt_node *node, unsigned index, struct bt_key *key, chunk_t new_chunk)
+{
+	memmove(&node->entries[index + 1], &node->entries[index], (le32_to_cpu(node->n_entries) - index) * sizeof(struct dm_multisnap_bt_entry));
+	write_orig_chunk(&node->entries[index], key->chunk);
+	write_48(&node->entries[index], new_chunk, new_chunk);
+	node->entries[index].snap_from = cpu_to_mikulas_snapid(key->snap_from);
+	node->entries[index].snap_to = cpu_to_mikulas_snapid(key->snap_to);
+	node->entries[index].flags = cpu_to_le32(0);
+	node->n_entries = cpu_to_le32(le32_to_cpu(node->n_entries) + 1);
+}
+
+/*
+ * Create an initial btree.
+ * (*writing_block) is updated to point after the btree.
+ */
+
+void dm_multisnap_create_btree(struct dm_exception_store *s, chunk_t *writing_block)
+{
+	struct dm_buffer *bp;
+	struct dm_multisnap_bt_node *node;
+	struct bt_key new_key;
+
+	while (dm_multisnap_is_commit_block(s, *writing_block))
+		(*writing_block)++;
+
+	if (*writing_block >= s->dev_size) {
+		DMERR("dm_multisnap_create_btree: device is too small");
+		dm_multisnap_set_error(s->dm, -ENOSPC);
+		return;
+	}
+
+	node = dm_bufio_new(s->bufio, *writing_block, &bp);
+	if (IS_ERR(node)) {
+		DMERR("dm_multisnap_create_btree: 't create direct bitmap block at %llx", (unsigned long long)*writing_block);
+		dm_multisnap_set_error(s->dm, PTR_ERR(node));
+		return;
+	}
+	memset(node, 0, s->chunk_size);
+	node->signature = BT_SIGNATURE;
+	node->n_entries = cpu_to_le32(0);
+
+	/*
+	 * A btree node must have at least one entry --- so create this empty
+	 * one
+	 */
+	new_key.snap_from = new_key.snap_to = SNAPID_T_LAST;
+	new_key.chunk = CHUNK_T_LAST;
+	add_at_idx(node, 0, &new_key, 0);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	s->bt_root = *writing_block;
+	s->bt_depth = 1;
+	(*writing_block)++;
+}
+
+/*
+ * Compare btree entry and a search key. Returns:
+ *	-1: the entry is lower than the key
+ *	1: the entry is higher than the key
+ *	0: the entry matches the key (both entry and key have ranges, a match
+ *		is returned when the ranges overlap)
+ */
+
+static int compare_key(struct dm_multisnap_bt_entry *e, struct bt_key *key)
+{
+	chunk_t orig_chunk = read_48(e, orig_chunk);
+	if (orig_chunk < key->chunk)
+		return -1;
+	if (orig_chunk > key->chunk)
+		return 1;
+
+	if (mikulas_snapid_to_cpu(e->snap_to) < key->snap_from)
+		return -1;
+	if (mikulas_snapid_to_cpu(e->snap_from) > key->snap_to)
+		return 1;
+
+	return 0;
+}
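+
+/*
+ * For example, an entry for chunk 5 with snapshot range 2-4 compared against
+ * a key for chunk 5, snapshots 3-7 yields 0 (the ranges overlap); compared
+ * against a key for chunk 5, snapshots 5-7 it yields -1 (the entry lies
+ * entirely below the key).
+ */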
+
+/*
+ * Perform binary search on the btree node.
+ * Returns: 1 - found, 0 - not found
+ * 	*result - if found, then the first entry in the requested range
+ *		- if not found, then the first entry after the requested range
+ */
+
+static int binary_search(struct dm_multisnap_bt_node *node, struct bt_key *key, unsigned *result)
+{
+	int c;
+	int first = 0;
+	int last = le32_to_cpu(node->n_entries) - 1;
+
+	while (1) {
+		int middle = (first + last) >> 1;
+		struct dm_multisnap_bt_entry *e = &node->entries[middle];
+
+		c = compare_key(e, key);
+
+		if (first == last)
+			break;
+
+		if (c < 0)
+			first = middle + 1;
+		else
+			last = middle;
+
+		cond_resched();
+	}
+
+	*result = first;
+	return !c;
+}
+
+/*
+ * Find a given key in the btree.
+ *
+ * Returns: 1 - found, 0 - not found, -1 - error
+ *	If there is no error (0 or 1 is returned), the node and a held buffer
+ *	for this node are returned (the buffer must be released with
+ *	dm_bufio_release). A path with s->bt_depth entries is also returned.
+ */
+
+static int walk_btree(struct dm_exception_store *s, struct bt_key *key, struct dm_multisnap_bt_node **nodep, struct dm_buffer **bp, struct path_element path[MAX_BT_DEPTH])
+{
+#define		node (*nodep)
+	int r;
+	chunk_t block = s->bt_root;
+	unsigned d = 0;
+
+	/*
+	 * These four are purely to check tree consistency.
+	 * They could be commented out. But it's safer to leave them there.
+	 */
+	chunk_t want_last_chunk = CHUNK_T_LAST;
+	mikulas_snapid_t want_last_snapid = SNAPID_T_LAST;
+	chunk_t last_chunk;
+	mikulas_snapid_t last_snapid;
+
+	while (1) {
+		path[d].block = block;
+		node = dm_multisnap_read_btnode(s, d, block, 0, bp);
+		if (!node)
+			return -1;
+		path[d].n_entries = le32_to_cpu(node->n_entries);
+
+		/* Check consistency (can be commented out) */
+		last_chunk = read_48(&node->entries[path[d].n_entries - 1], orig_chunk);
+		last_snapid = mikulas_snapid_to_cpu(node->entries[path[d].n_entries - 1].snap_to);
+		if (unlikely(last_chunk != want_last_chunk) ||
+		    unlikely(last_snapid != want_last_snapid)) {
+#if 1
+			/* Convert old format into new format */
+			if (last_chunk == CHUNK_T_LAST && last_snapid == SNAPID_T_LAST && d != s->bt_depth - 1) {
+				write_orig_chunk(&node->entries[path[d].n_entries - 1], want_last_chunk);
+				node->entries[path[d].n_entries - 1].snap_from = node->entries[path[d].n_entries - 1].snap_to = cpu_to_mikulas_snapid(want_last_snapid);
+				dm_bufio_mark_buffer_dirty(*bp);
+			} else
+#endif
+			{
+				DMERR("walk_btree: invalid last entry in node %llx: last_chunk %llx, want_last_chunk %llx, last_snapid: %llx, want_last_snapid: %llx, searching for %llx, %llx-%llx",
+					(unsigned long long)last_chunk,
+					(unsigned long long)want_last_chunk,
+					(unsigned long long)last_snapid,
+					(unsigned long long)want_last_snapid,
+					(unsigned long long)block,
+					(unsigned long long)key->chunk,
+					(unsigned long long)key->snap_from,
+					(unsigned long long)key->snap_to);
+				dm_multisnap_set_error(s->dm, -EFSERROR);
+				return -1;
+			}
+		}
+
+		r = binary_search(node, key, &path[d].idx);
+
+		want_last_chunk = read_48(&node->entries[path[d].idx], orig_chunk);
+		want_last_snapid = mikulas_snapid_to_cpu(node->entries[path[d].idx].snap_to);
+
+		block = read_48(&node->entries[path[d].idx], new_chunk);
+		if (++d == s->bt_depth)
+			break;
+		dm_bufio_release(*bp);
+	}
+	if (unlikely(compare_key(&node->entries[path[s->bt_depth - 1].idx], key) < 0))
+		path[s->bt_depth - 1].idx++;
+	return r;
+#undef node
+}
+
+/*
+ * Find a given key in the btree.
+ *
+ * Returns: 1 - found, 0 - not found, -1 - error
+ *	In case the node is found, key contains updated key and result contains
+ *	the resulting chunk.
+ */
+
+int dm_multisnap_find_in_btree(struct dm_exception_store *s, struct bt_key *key, chunk_t *result)
+{
+	struct dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return r;
+
+	if (r) {
+		struct dm_multisnap_bt_entry *entry = &node->entries[path[s->bt_depth - 1].idx];
+		*result = read_48(entry, new_chunk);
+		key->chunk = read_48(entry, orig_chunk);
+		key->snap_from = mikulas_snapid_to_cpu(entry->snap_from);
+		key->snap_to = mikulas_snapid_to_cpu(entry->snap_to);
+	}
+	dm_bufio_release(bp);
+
+	return r;
+}
+
+/*
+ * Scan the btree sequentially.
+ * Start with the given key. Invoke "call" on each leaf entry. When call returns
+ * nonzero, terminate the scan and return the value returned from call.
+ * When the whole tree is scanned, return 0.
+ * On error, return -1.
+ */
+
+int dm_multisnap_list_btree(struct dm_exception_store *s, struct bt_key *key, int (*call)(struct dm_exception_store *, struct dm_multisnap_bt_node *, struct dm_multisnap_bt_entry *, void *), void *cookie)
+{
+	struct dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+	int depth;
+	int i;
+	int r;
+
+	r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return r;
+
+list_next_node:
+	for (i = path[s->bt_depth - 1].idx; i < le32_to_cpu(node->n_entries); i++) {
+		cond_resched();
+		r = call(s, node, &node->entries[i], cookie);
+		if (unlikely(r)) {
+			dm_bufio_release(bp);
+			return r;
+		}
+	}
+	dm_bufio_release(bp);
+
+	for (depth = s->bt_depth - 2; depth >= 0; depth--) {
+		int idx;
+		node = dm_multisnap_read_btnode(s, depth, path[depth].block, path[depth].n_entries, &bp);
+		if (!node)
+			return -1;
+		idx = path[depth].idx + 1;
+		if (idx < path[depth].n_entries) {
+			r = compare_key(&node->entries[idx], key);
+			if (unlikely(r <= 0)) {
+				DMERR("dm_multisnap_list_btree: non-monotonic btree: node %llx, index %x", (unsigned long long)path[depth].block, idx);
+				dm_bufio_release(bp);
+				dm_multisnap_set_error(s->dm, -EFSERROR);
+				return 0;
+			}
+			path[depth].idx = idx;
+			do {
+				depth++;
+				path[depth].block = read_48(&node->entries[path[depth - 1].idx], new_chunk);
+				path[depth].idx = 0;
+				dm_bufio_release(bp);
+				node = dm_multisnap_read_btnode(s, depth, path[depth].block, 0, &bp);
+				if (!node)
+					return -1;
+				path[depth].n_entries = le32_to_cpu(node->n_entries);
+			} while (depth < s->bt_depth - 1);
+			goto list_next_node;
+		}
+		dm_bufio_release(bp);
+	}
+
+	return 0;
+}
+
+/*
+ * Add a key and chunk to the btree.
+ * The key must not overlap with any existing btree entry.
+ */
+
+void dm_multisnap_add_to_btree(struct dm_exception_store *s, struct bt_key *key, chunk_t new_chunk)
+{
+	struct dm_multisnap_bt_node *node;
+	struct dm_buffer *bp;
+	struct path_element path[MAX_BT_DEPTH];
+	int depth;
+
+	unsigned split_entries, split_index, split_offset, split_size;
+	struct bt_key new_key;
+	struct dm_multisnap_bt_entry *last_one;
+	chunk_t new_root;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+
+	if (unlikely(r)) {
+		if (r > 0) {
+			dm_bufio_release(bp);
+			DMERR("dm_multisnap_add_to_btree: adding key that already exists: %llx, %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+		}
+		return;
+	}
+
+	depth = s->bt_depth - 1;
+
+go_up:
+	node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node);
+	if (unlikely(!node))
+		return;
+
+	if (likely(le32_to_cpu(node->n_entries) < s->btree_entries)) {
+		add_at_idx(node, path[depth].idx, key, new_chunk);
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+		return;
+	}
+	cond_resched();
+	memcpy(s->tmp_chunk, node, s->chunk_size);
+	cond_resched();
+	add_at_idx(s->tmp_chunk, path[depth].idx, key, new_chunk);
+
+	split_entries = le32_to_cpu(((struct dm_multisnap_bt_node *)s->tmp_chunk)->n_entries);
+	split_index = split_entries / 2;
+	split_offset = sizeof(struct dm_multisnap_bt_node) + split_index * sizeof(struct dm_multisnap_bt_entry);
+	split_size = sizeof(struct dm_multisnap_bt_node) + split_entries * sizeof(struct dm_multisnap_bt_entry);
+	cond_resched();
+	memcpy(node, s->tmp_chunk, sizeof(struct dm_multisnap_bt_node));
+	cond_resched();
+	memcpy((char *)node + sizeof(struct dm_multisnap_bt_node), (char *)s->tmp_chunk + split_offset, split_size - split_offset);
+	cond_resched();
+	memset((char *)node + sizeof(struct dm_multisnap_bt_node) + split_size - split_offset, 0, s->chunk_size - (sizeof(struct dm_multisnap_bt_node) + split_size - split_offset));
+	cond_resched();
+	node->n_entries = cpu_to_le32(split_entries - split_index);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	node = dm_multisnap_alloc_make_block(s, &new_chunk, &bp);
+	if (unlikely(!node))
+		return;
+
+	cond_resched();
+	memcpy(node, s->tmp_chunk, split_offset);
+	cond_resched();
+	memset((char *)node + split_offset, 0, s->chunk_size - split_offset);
+	cond_resched();
+	node->n_entries = cpu_to_le32(split_index);
+
+	last_one = &node->entries[split_index - 1];
+	new_key.chunk = read_48(last_one, orig_chunk);
+	new_key.snap_from = mikulas_snapid_to_cpu(last_one->snap_to);
+	new_key.snap_to = mikulas_snapid_to_cpu(last_one->snap_to);
+
+	key = &new_key;
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	if (depth--) {
+		node = dm_multisnap_read_btnode(s, depth, path[depth].block, path[depth].n_entries, &bp);
+		if (unlikely(!node))
+			return;
+		goto go_up;
+	}
+
+	if (s->bt_depth >= MAX_BT_DEPTH) {
+		DMERR("dm_multisnap_add_to_btree: max b+-tree depth reached");
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	node = dm_multisnap_alloc_make_block(s, &new_root, &bp);
+	if (unlikely(!node))
+		return;
+
+	cond_resched();
+	memset(node, 0, s->chunk_size);
+	cond_resched();
+	node->signature = BT_SIGNATURE;
+	node->n_entries = cpu_to_le32(0);
+	add_at_idx(node, 0, &new_key, new_chunk);
+	new_key.snap_from = new_key.snap_to = SNAPID_T_LAST;
+	new_key.chunk = CHUNK_T_LAST;
+	add_at_idx(node, 1, &new_key, path[0].block);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	s->bt_root = new_root;
+	s->bt_depth++;
+}
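+
+/*
+ * Split illustration: when a full node receives one more entry, all
+ * btree_entries + 1 entries are first assembled in s->tmp_chunk. The upper
+ * half is written back into the original node (which keeps its chunk, so the
+ * existing parent entry still points at it), the lower half goes into a
+ * newly allocated node, and the last key of that new node is inserted into
+ * the parent one level up --- possibly splitting the parent in turn. If the
+ * root itself splits, a new root is allocated and bt_depth grows by one.
+ */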
+
+/*
+ * Change the last entry from old_chunk/old_snapid to new_chunk/new_snapid.
+ * Start at a given depth and go upward to the root.
+ */
+
+static void dm_multisnap_fixup_backlimits(struct dm_exception_store *s, struct path_element path[MAX_BT_DEPTH], int depth, chunk_t old_chunk, mikulas_snapid_t old_snapid, chunk_t new_chunk, mikulas_snapid_t new_snapid)
+{
+	int idx;
+	struct dm_multisnap_bt_node *node;
+	struct dm_buffer *bp;
+
+	if (old_chunk == new_chunk && old_snapid == new_snapid)
+		return;
+
+	for (depth--; depth >= 0; depth--) {
+		node = dm_multisnap_read_btnode(s, depth, path[depth].block, path[depth].n_entries, &bp);
+		if (unlikely(!node))
+			return;
+
+		node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node);
+		if (unlikely(!node))
+			return;
+
+		idx = path[depth].idx;
+
+		if (unlikely(read_48(&node->entries[idx], orig_chunk) != old_chunk) ||
+		    unlikely(mikulas_snapid_to_cpu(node->entries[idx].snap_from) != old_snapid) ||
+		    unlikely(mikulas_snapid_to_cpu(node->entries[idx].snap_to) != old_snapid)) {
+			dm_bufio_release(bp);
+			DMERR("dm_multisnap_fixup_backlimits: btree limit does not match, block %llx, idx %x, orig_chunk %llx, snap_from %llx, snap_to %llx, want %llx, %llx",
+				(unsigned long long)path[depth].block,
+				idx,
+				(unsigned long long)read_48(&node->entries[idx], orig_chunk),
+				(unsigned long long)mikulas_snapid_to_cpu(node->entries[idx].snap_from),
+				(unsigned long long)mikulas_snapid_to_cpu(node->entries[idx].snap_to),
+				(unsigned long long)old_chunk,
+				(unsigned long long)old_snapid);
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return;
+		}
+		write_48(&node->entries[idx], orig_chunk, new_chunk);
+		node->entries[idx].snap_from = node->entries[idx].snap_to = cpu_to_mikulas_snapid(new_snapid);
+
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+
+		if (path[depth].idx != path[depth].n_entries - 1)
+			return;
+	}
+	DMERR("dm_multisnap_fixup_backlimits: the last entry modified, %llx/%llx -> %llx/%llx",
+		(unsigned long long)old_chunk,
+		(unsigned long long)old_snapid,
+		(unsigned long long)new_chunk,
+		(unsigned long long)new_snapid);
+	dm_multisnap_set_error(s->dm, -EFSERROR);
+}
+
+/*
+ * Restrict the range of an existing btree entry.
+ * The key must have the same beginning or end as some existing entry (but not
+ * both). The range covered by the key is then excluded from that entry.
+ */
+
+void dm_multisnap_restrict_btree_entry(struct dm_exception_store *s, struct bt_key *key)
+{
+	struct dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+	int idx;
+	struct dm_multisnap_bt_entry *entry;
+	mikulas_snapid_t from, to, new_to;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return;
+
+	if (!r) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_restrict_btree_entry: unknown key: %llx, %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	node = dm_multisnap_alloc_duplicate_block(s, path[s->bt_depth - 1].block, &bp, node);
+	if (unlikely(!node))
+		return;
+
+	idx = path[s->bt_depth - 1].idx;
+	entry = &node->entries[idx];
+	from = mikulas_snapid_to_cpu(entry->snap_from);
+	to = new_to = mikulas_snapid_to_cpu(entry->snap_to);
+	if (key->snap_from == from && key->snap_to < to)
+		entry->snap_from = cpu_to_mikulas_snapid(key->snap_to + 1);
+	else if (key->snap_from > from && key->snap_to == to) {
+		/* keep new_to in CPU byte order for the backlimit fixup below */
+		new_to = key->snap_from - 1;
+		entry->snap_to = cpu_to_mikulas_snapid(new_to);
+	} else {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_restrict_btree_entry: invali range to restruct: %llx, %llx-%llx %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)from, (unsigned long long)to, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	if (unlikely(idx == path[s->bt_depth - 1].n_entries - 1))
+		dm_multisnap_fixup_backlimits(s, path, s->bt_depth - 1, key->chunk, to, key->chunk, new_to);
+}
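+
+/*
+ * For example, restricting an entry with snapshot range 2-7 by a key for
+ * snapshots 2-4 leaves the entry covering 5-7; a key for 6-7 leaves it
+ * covering 2-5. A key that would split the entry in the middle (e.g. 3-5)
+ * is reported as an error.
+ */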
+
+/*
+ * Expand the range of an existing btree entry.
+ * The key represents the whole new range (covering both the old and the new part).
+ */
+
+void dm_multisnap_extend_btree_entry(struct dm_exception_store *s, struct bt_key *key)
+{
+	struct dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+	int idx;
+	struct dm_multisnap_bt_entry *entry;
+	mikulas_snapid_t from, to, new_to;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return;
+
+	if (!r) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_extend_btree_entry: unknown key: %llx, %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	node = dm_multisnap_alloc_duplicate_block(s, path[s->bt_depth - 1].block, &bp, node);
+	if (unlikely(!node))
+		return;
+
+	idx = path[s->bt_depth - 1].idx;
+	entry = &node->entries[idx];
+	from = mikulas_snapid_to_cpu(entry->snap_from);
+	to = new_to = mikulas_snapid_to_cpu(entry->snap_to);
+	if (key->snap_from < from)
+		entry->snap_from = cpu_to_mikulas_snapid(key->snap_from);
+	if (key->snap_to > to) {
+		/* keep new_to in CPU byte order for the backlimit fixup below */
+		new_to = key->snap_to;
+		entry->snap_to = cpu_to_mikulas_snapid(new_to);
+	}
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	if (unlikely(idx == path[s->bt_depth - 1].n_entries - 1))
+		dm_multisnap_fixup_backlimits(s, path, s->bt_depth - 1, key->chunk, to, key->chunk, new_to);
+}
+
+/*
+ * Delete an entry from the btree.
+ */
+
+void dm_multisnap_delete_from_btree(struct dm_exception_store *s, struct bt_key *key)
+{
+	struct dm_multisnap_bt_node *node;
+	struct path_element path[MAX_BT_DEPTH];
+	struct dm_buffer *bp;
+	int idx;
+	struct dm_multisnap_bt_entry *entry;
+	mikulas_snapid_t from, to;
+	int depth, n_entries;
+
+	struct dm_multisnap_bt_entry *last_one;
+	chunk_t last_one_chunk;
+	mikulas_snapid_t last_one_snap_to;
+
+	int r = walk_btree(s, key, &node, &bp, path);
+	if (unlikely(r < 0))
+		return;
+
+	if (unlikely(!r)) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_delete_from_btree: unknown key: %llx, %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	depth = s->bt_depth - 1;
+
+	idx = path[depth].idx;
+	entry = &node->entries[idx];
+	from = mikulas_snapid_to_cpu(entry->snap_from);
+	to = mikulas_snapid_to_cpu(entry->snap_to);
+	if (unlikely(from != key->snap_from) || unlikely(to != key->snap_to)) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_restrict_btree: invali range to restruct: %llx, %llx-%llx %llx-%llx", (unsigned long long)key->chunk, (unsigned long long)from, (unsigned long long)to, (unsigned long long)key->snap_from, (unsigned long long)key->snap_to);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	while (unlikely((n_entries = le32_to_cpu(node->n_entries)) == 1)) {
+		dm_bufio_release(bp);
+		if (unlikely(!depth)) {
+			DMERR("dm_multisnap_restrict_btree: b-tree is empty");
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return;
+		}
+		dm_multisnap_free_block_and_duplicates(s, path[depth].block);
+		depth--;
+		node = dm_multisnap_read_btnode(s, depth, path[depth].block, path[depth].n_entries, &bp);
+		if (!node)
+			return;
+	}
+
+	node = dm_multisnap_alloc_duplicate_block(s, path[depth].block, &bp, node);
+	if (unlikely(!node))
+		return;
+
+	idx = path[depth].idx;
+
+	/*{
+		int x;
+		printk("before:\n");
+		for (x = 0; x < n_entries; x++)
+			printk("%llx, %x-%x -> %llx\n", read_48(&node->entries[x], orig_chunk), mikulas_snapid_to_cpu(node->entries[x].snap_from), mikulas_snapid_to_cpu(node->entries[x].snap_to), read_48(&node->entries[x], new_chunk));
+	}*/
+
+	cond_resched();
+	memmove(node->entries + idx, node->entries + idx + 1, (n_entries - idx - 1) * sizeof(struct dm_multisnap_bt_entry));
+	cond_resched();
+	n_entries--;
+	memset(node->entries + n_entries, 0, sizeof(struct dm_multisnap_bt_entry));
+
+	node->n_entries = cpu_to_le32(n_entries);
+
+	/*{
+		int x;
+		printk("after:\n");
+		for (x = 0; x < n_entries; x++)
+			printk("%llx, %x-%x -> %llx\n", read_48(&node->entries[x], orig_chunk), mikulas_snapid_to_cpu(node->entries[x].snap_from), mikulas_snapid_to_cpu(node->entries[x].snap_to), read_48(&node->entries[x], new_chunk));
+	}*/
+
+	last_one = &node->entries[n_entries - 1];
+	last_one_chunk = read_48(last_one, orig_chunk);
+	last_one_snap_to = mikulas_snapid_to_cpu(last_one->snap_to);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	if (unlikely(idx == n_entries))
+		dm_multisnap_fixup_backlimits(s, path, depth, key->chunk, key->snap_to, last_one_chunk, last_one_snap_to);
+}
+
+/*
+ * Process btree tmp remaps.
+ * Find the whole path for tmp_remap and write the path as new entries, from
+ * the root.
+ */
+
+void dm_multisnap_bt_finalize_tmp_remap(struct dm_exception_store *s, struct tmp_remap *tmp_remap)
+{
+	struct dm_buffer *bp;
+	struct dm_multisnap_bt_node *node;
+	struct bt_key key;
+	struct path_element path[MAX_BT_DEPTH];
+	int results_ptr;
+
+	chunk_t new_blockn;
+	int r;
+	int i;
+
+	if (s->n_preallocated_blocks < s->bt_depth) {
+		if (dm_multisnap_alloc_blocks(s, s->preallocated_blocks + s->n_preallocated_blocks, s->bt_depth - s->n_preallocated_blocks, 0) < 0)
+			return;
+		s->n_preallocated_blocks = s->bt_depth;
+	}
+	results_ptr = 0;
+
+	/*
+	 * Read the key from this node --- we'll walk the btree according
+	 * to this key to find a path from the root.
+	 */
+	node = dm_multisnap_read_btnode(s, s->bt_depth - 1, tmp_remap->new, 0, &bp);
+	if (!node)
+		return;
+	key.chunk = read_48(&node->entries[0], orig_chunk);
+	key.snap_from = key.snap_to = mikulas_snapid_to_cpu(node->entries[0].snap_from);
+	dm_bufio_release(bp);
+
+	r = walk_btree(s, &key, &node, &bp, path);
+	if (r < 0)
+		return;
+
+	dm_bufio_release(bp);
+
+	for (i = s->bt_depth - 1; i >= 0; i--)
+		if (path[i].block == tmp_remap->old)
+			goto found;
+
+	DMERR("block %llx/%llx was not found in btree when searching for %llx/%llx", (unsigned long long)tmp_remap->old, (unsigned long long)tmp_remap->new, (unsigned long long)key.chunk, (unsigned long long)key.snap_from);
+	for (i = 0; i < s->bt_depth; i++)
+		DMERR("path[%d]: %llx/%x", i, (unsigned long long)path[i].block, path[i].idx);
+	dm_multisnap_set_error(s->dm, -EFSERROR);
+	return;
+
+found:
+
+	dm_multisnap_free_block(s, tmp_remap->old, 0);
+
+	new_blockn = tmp_remap->new;
+	for (i--; i >= 0; i--) {
+		int remapped = 0;
+		node = dm_multisnap_read_btnode(s, i, path[i].block, path[i].n_entries, &bp);
+		if (!node)
+			return;
+		if (!dm_multisnap_block_is_uncommitted(s, path[i].block)) {
+			remapped = 1;
+			dm_bufio_release_move(bp, s->preallocated_blocks[results_ptr]);
+			dm_multisnap_free_block_and_duplicates(s, path[i].block);
+			node = dm_multisnap_read_btnode(s, i, s->preallocated_blocks[results_ptr], path[i].n_entries, &bp);
+			if (!node)
+				return;
+			/* !!! TODO: add to a list of newly allocated blocks */
+		}
+		write_48(&node->entries[path[i].idx], new_chunk, new_blockn);
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+
+		if (!remapped)
+			goto skip_it;
+		new_blockn = s->preallocated_blocks[results_ptr];
+		results_ptr++;
+	}
+
+	s->bt_root = new_blockn;
+
+skip_it:
+	memmove(s->preallocated_blocks, s->preallocated_blocks + results_ptr, (s->n_preallocated_blocks -= results_ptr) * sizeof(chunk_t));
+}
+
Index: linux-2.6.32/drivers/md/dm-multisnap-commit.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-commit.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+static void dm_multisnap_finalize_tmp_remaps(struct dm_exception_store *s)
+{
+	struct tmp_remap *t;
+	int i;
+
+	while (s->n_used_tmp_remaps) {
+		if (dm_multisnap_has_error(s->dm))
+			return;
+		if (s->n_used_tmp_remaps < N_REMAPS - 1) {
+			/*
+			 * prefer btree remaps ...
+			 * if there are none, do bitmap remaps
+			 */
+			if (!list_empty(&s->used_bt_tmp_remaps)) {
+				t = container_of(s->used_bt_tmp_remaps.next, struct tmp_remap, list);
+				dm_multisnap_bt_finalize_tmp_remap(s, t);
+				dm_multisnap_free_tmp_remap(s, t);
+				continue;
+			}
+		}
+
+		/* else: 0 or 1 free remaps: finalize bitmaps */
+		if (!list_empty(&s->used_bitmap_tmp_remaps)) {
+			t = container_of(s->used_bitmap_tmp_remaps.next, struct tmp_remap, list);
+			dm_multisnap_bitmap_finalize_tmp_remap(s, t);
+			dm_multisnap_free_tmp_remap(s, t);
+			continue;
+		} else {
+			DMERR("dm_multisnap_finalize_tmp_remaps: no bitmap tmp remaps, n_used_tmp_remaps %u", s->n_used_tmp_remaps);
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return;
+		}
+	}
+
+	if (dm_multisnap_has_error(s->dm))
+		return;
+
+	for (i = s->n_preallocated_blocks - 1; i >= 0; i--)
+		dm_multisnap_free_blocks_immediate(s, s->preallocated_blocks[i], 1);
+	s->n_preallocated_blocks = 0;
+}
+
+void dm_multisnap_transaction_mark(struct dm_exception_store *s)
+{
+	/*
+	 * Accounting:
+	 * max number of modified/allocated blocks during btree add:
+	 *	s->bt_depth * 2 + 1
+	 * one additional entry for newly allocated data chunk
+	 * one additional entry for bitmap finalization
+	 */
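+	/*
+	 * For example, with N_REMAPS == 27 and bt_depth == 3, tmp remaps are
+	 * finalized as soon as fewer than 2 * 3 + 3 = 9 remap slots remain
+	 * free.
+	 */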
+	if (unlikely(N_REMAPS - s->n_used_tmp_remaps < s->bt_depth * 2 + 3))
+		dm_multisnap_finalize_tmp_remaps(s);
+}
+
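+/*
+ * Write a commit block describing the current state: flush the freelist,
+ * write out all dirty buffers and then write a new commit block near the
+ * allocation rover. The superblock pointer is rewritten only when the new
+ * commit block is neither the previously valid one nor the slot one
+ * cb_stride after it (find_commit_block scans forward from the superblock
+ * pointer), so most commits need not touch the superblock.
+ */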
+void dm_multisnap_commit(struct dm_exception_store *s)
+{
+	struct tmp_remap *t;
+	chunk_t cb_addr;
+	chunk_t cb_div, cb_offset;
+	struct multisnap_commit_block *cb;
+	struct multisnap_superblock *sb;
+	unsigned idx;
+	struct dm_buffer *bp;
+	int r;
+
+	dm_multisnap_transaction_mark(s);
+
+	dm_multisnap_flush_freelist_before_commit(s);
+
+	if (dm_multisnap_has_error(s->dm)) {
+		struct multisnap_superblock *sb;
+
+		if (!dm_multisnap_drop_on_error(s->dm))
+			return;
+
+		sb = dm_bufio_read(s->bufio, SB_BLOCK, &bp);
+		if (IS_ERR(sb))
+			return;
+
+		if (!le32_to_cpu(sb->error)) {
+			sb->error = cpu_to_le32(dm_multisnap_has_error(s->dm));
+			dm_bufio_mark_buffer_dirty(bp);
+		}
+
+		dm_bufio_release(bp);
+		return;
+	}
+
+	list_for_each_entry(t, &s->used_bitmap_tmp_remaps, list)
+		t->uncommitted = 0;
+
+	list_for_each_entry(t, &s->used_bt_tmp_remaps, list)
+		t->uncommitted = 0;
+
+	r = dm_bufio_write_dirty_buffers(s->bufio);
+	if (unlikely(r < 0)) {
+		DMERR("dm_multisnap_commit: error writing data");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+
+	cb_addr = s->alloc_rover;
+
+	if (cb_addr < FIRST_CB_BLOCK)
+		cb_addr = FIRST_CB_BLOCK;
+	cb_div = cb_addr - FIRST_CB_BLOCK;
+	cb_offset = sector_div(cb_div, s->cb_stride);
+	cb_addr += s->cb_stride - cb_offset;
+	if (cb_offset < s->cb_stride / 2 || cb_addr >= s->dev_size)
+		cb_addr -= s->cb_stride;
+
+	cb = dm_bufio_new(s->bufio, cb_addr, &bp);
+	if (IS_ERR(cb)) {
+		DMERR("dm_multisnap_commit: can't allocate new commit block at %llx", (unsigned long long)cb_addr);
+		dm_multisnap_set_error(s->dm, PTR_ERR(cb));
+		return;
+	}
+
+	s->commit_sequence++;
+
+	cb->signature = CB_SIGNATURE;
+	cb->snapshot_num = cpu_to_le32(s->snapshot_num);
+	cb->sequence = cpu_to_le64(s->commit_sequence);
+	write_48(cb, dev_size, s->dev_size);
+	write_48(cb, total_allocated, s->total_allocated);
+	write_48(cb, data_allocated, s->data_allocated);
+	write_48(cb, bitmap_root, s->bitmap_root);
+	write_48(cb, alloc_rover, s->alloc_rover);
+	write_48(cb, freelist, s->freelist_ptr);
+	write_48(cb, delete_rover, s->delete_rover_chunk);
+	write_48(cb, bt_root, s->bt_root);
+	cb->bt_depth = s->bt_depth;
+	cb->flags = s->flags;
+	memset(cb->pad, 0, sizeof cb->pad);
+	idx = 0;
+	list_for_each_entry(t, &s->used_bitmap_tmp_remaps, list) {
+		BUG_ON(idx >= N_REMAPS);
+		write_48(&cb->tmp_remap[idx], old, t->old);
+		write_48(&cb->tmp_remap[idx], new, t->new);
+		cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx);
+		idx++;
+	}
+	list_for_each_entry(t, &s->used_bt_tmp_remaps, list) {
+		BUG_ON(idx >= N_REMAPS);
+		write_48(&cb->tmp_remap[idx], old, t->old);
+		write_48(&cb->tmp_remap[idx], new, t->new);
+		cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(t->bitmap_idx);
+		idx++;
+	}
+	for (; idx < N_REMAPS; idx++) {
+		write_48(&cb->tmp_remap[idx], old, 0);
+		write_48(&cb->tmp_remap[idx], new, 0);
+		cb->tmp_remap[idx].bitmap_idx = cpu_to_le32(0);
+	}
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	r = dm_bufio_write_dirty_buffers(s->bufio);
+	if (unlikely(r < 0)) {
+		DMERR("dm_multisnap_commit: can't write commit block at %llx", (unsigned long long)cb_addr);
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+
+	if (likely(cb_addr == s->valid_commit_block) ||
+	    likely(cb_addr == s->valid_commit_block + s->cb_stride))
+		goto return_success;
+
+	sb = dm_bufio_read(s->bufio, SB_BLOCK, &bp);
+	if (IS_ERR(sb)) {
+		DMERR("dm_multisnap_commit: can't read super block");
+		dm_multisnap_set_error(s->dm, PTR_ERR(sb));
+		return;
+	}
+
+	if (unlikely(sb->signature != SB_SIGNATURE)) {
+		dm_bufio_release(bp);
+		DMERR("dm_multisnap_commit: invalid super block signature when committing");
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	sb->commit_block = cpu_to_le64(cb_addr);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	r = dm_bufio_write_dirty_buffers(s->bufio);
+	if (unlikely(r < 0)) {
+		DMERR("dm_multisnap_commit: can't write super block");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+
+return_success:
+	s->valid_commit_block = cb_addr;
+
+	dm_multisnap_load_freelist(s);
+
+	return;
+}
Index: linux-2.6.32/drivers/md/dm-multisnap-delete.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-delete.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+/*
+ * Commit after this number of deleted entries.
+ * Too large a value causes spurious space overflows on a nearly full device.
+ * Too small a value degrades delete performance.
+ */
+#define COMMIT_AFTER		128
+
+struct list_cookie {
+	struct bt_key key;
+	chunk_t new_chunk;
+};
+
+#define RET_END			1
+#define RET_DO_FREE		2
+#define RET_RESCHEDULE		3
+
+static int list_callback(struct dm_exception_store *s, struct dm_multisnap_bt_node *node, struct dm_multisnap_bt_entry *bt, void *cookie)
+{
+	struct list_cookie *lc = cookie;
+	mikulas_snapid_t found_from, found_to;
+
+	lc->key.chunk = read_48(bt, orig_chunk);
+	lc->key.snap_from = mikulas_snapid_to_cpu(bt->snap_from);
+	lc->key.snap_to = mikulas_snapid_to_cpu(bt->snap_to);
+
+	if (unlikely(lc->key.chunk > CHUNK_T_MAX))
+		return RET_END;
+
+	s->delete_rover_chunk = lc->key.chunk;
+	s->delete_rover_snapid = lc->key.snap_to + 1;
+	if (unlikely(!s->delete_rover_snapid))
+		s->delete_rover_chunk++;
+
+	if (!dm_multisnap_find_next_snapid_range(s, lc->key.snap_from, &found_from, &found_to) || found_from > lc->key.snap_to) {
+		/*
+		 * This range maps only unused snapshots, so it should be deleted.
+		 * We can't do that here, so report it back to the caller.
+		 */
+		lc->new_chunk = read_48(bt, new_chunk);
+		return RET_DO_FREE;
+	}
+
+	/*
+	 * If we are at a last entry in the btree node, drop the lock and
+	 * allow other requests to be processed.
+	 *
+	 * This avoids a starvation when there are no nodes to delete.
+	 */
+	if (bt == &node->entries[le32_to_cpu(node->n_entries) - 1])
+		return RET_RESCHEDULE;
+
+	return 0;
+}
+
+static void delete_step(struct dm_exception_store *s)
+{
+	struct bt_key key;
+	int r;
+	struct list_cookie lc;
+
+	key.chunk = s->delete_rover_chunk;
+	key.snap_from = s->delete_rover_snapid;
+	key.snap_to = s->delete_rover_snapid;
+
+	r = dm_multisnap_list_btree(s, &key, list_callback, &lc);
+
+	if (unlikely(r < 0))
+		return;
+
+	switch (r) {
+
+	case RET_END:
+		s->flags &= ~MULTISNAP_FLAG_DELETING;
+
+		/* If we finished the job and there is no pending I/O, commit */
+		if (dm_multisnap_can_commit(s->dm))
+			dm_multisnap_call_commit(s->dm);
+
+		return;
+	case RET_DO_FREE:
+		if (unlikely(dm_multisnap_has_error(s->dm)))
+			return;
+
+		/*printk("deleting: %lx (%llx-%llx) -> %lx\n", lc.key.chunk, lc.key.snap_from, lc.key.snap_to, lc.new_chunk);*/
+
+		dm_multisnap_delete_from_btree(s, &lc.key);
+
+		dm_multisnap_transaction_mark(s);
+
+		dm_multisnap_free_block(s, lc.new_chunk, FREELIST_DATA_FLAG);
+
+		/* fall through */
+	case RET_RESCHEDULE:
+		if (dm_multisnap_can_commit(s->dm)) {
+			if (++s->delete_commit_count >= COMMIT_AFTER) {
+				s->delete_commit_count = 0;
+				dm_multisnap_call_commit(s->dm);
+			}
+		}
+		return;
+	default:
+		printk(KERN_CRIT "delete_step: invalid return value %d\n", r);
+		BUG();
+
+	}
+}
+
+void dm_multisnap_background_delete(struct dm_exception_store *s, struct dm_multisnap_background_work *bw)
+{
+	if (unlikely(dm_multisnap_has_error(s->dm)))
+		return;
+
+	if (s->flags & MULTISNAP_FLAG_DELETING) {
+		delete_step(s);
+	} else if (s->flags & MULTISNAP_FLAG_PENDING_DELETE) {
+		s->flags &= ~MULTISNAP_FLAG_PENDING_DELETE;
+		s->flags |= MULTISNAP_FLAG_DELETING;
+		s->delete_rover_chunk = 0;
+		s->delete_rover_snapid = 0;
+	} else
+		return;
+
+	dm_multisnap_queue_work(s->dm, &s->delete_work);
+}
Index: linux-2.6.32/drivers/md/dm-multisnap-freelist.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-freelist.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+void dm_multisnap_init_freelist(struct dm_multisnap_freelist *fl, unsigned chunk_size)
+{
+	cond_resched();
+	memset(fl, 0, chunk_size);
+	cond_resched();
+	fl->signature = FL_SIGNATURE;
+	write_48(fl, backlink, 0);
+	fl->n_entries = cpu_to_le32(0);
+}
+
+static int add_to_freelist(struct dm_exception_store *s, chunk_t block, unsigned flags)
+{
+	int i;
+	struct dm_multisnap_freelist *fl = s->freelist;
+	for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) {
+		chunk_t x = read_48(&fl->entries[i], block);
+		unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK;
+		unsigned f = le16_to_cpu(fl->entries[i].run_length) & FREELIST_DATA_FLAG;
+		if (block >= x && block < x + r) {
+			DMERR("add_to_freelist: freeing already free block %llx (%llx - %x)", (unsigned long long)block, (unsigned long long)x, r);
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			return -1;
+		}
+		if (likely(r < FREELIST_RL_MASK) && likely(f == flags)) {
+			if (block == x - 1) {
+				write_48(&fl->entries[i], block, x - 1);
+				goto inc_length;
+			}
+			if (block == x + r) {
+inc_length:
+				fl->entries[i].run_length = cpu_to_le16((r + 1) | f);
+				return 1;
+			}
+		}
+		cond_resched();
+	}
+	i = le32_to_cpu(fl->n_entries);
+	if (i < dm_multisnap_freelist_entries(s->chunk_size)) {
+		fl->n_entries = cpu_to_le32(i + 1);
+		write_48(&fl->entries[i], block, block);
+		fl->entries[i].run_length = cpu_to_le16(1 | flags);
+		return 1;
+	}
+	return 0;
+}
+
+static struct dm_multisnap_freelist *read_freelist(struct dm_exception_store *s, chunk_t block, struct dm_buffer **bp)
+{
+	struct dm_multisnap_freelist *fl;
+	fl = dm_bufio_read(s->bufio, block, bp);
+	if (IS_ERR(fl)) {
+		DMERR("read_freelist: can't read freelist block %llx", (unsigned long long)block);
+		dm_multisnap_set_error(s->dm, PTR_ERR(fl));
+		return NULL;
+	}
+	if (fl->signature != FL_SIGNATURE) {
+		dm_bufio_release(*bp);
+		DMERR("read_freelist: bad signature freelist block %llx", (unsigned long long)block);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return NULL;
+	}
+	if (le32_to_cpu(fl->n_entries) > dm_multisnap_freelist_entries(s->chunk_size)) {
+		dm_bufio_release(*bp);
+		DMERR("read_freelist: bad number of entries in freelist block %llx", (unsigned long long)block);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return NULL;
+	}
+	return fl;
+}
+
+static void alloc_write_freelist(struct dm_exception_store *s)
+{
+	chunk_t new_block;
+	struct dm_multisnap_freelist *fl;
+	struct dm_buffer *bp;
+
+	if (dm_multisnap_alloc_blocks(s, &new_block, 1, ALLOC_DRY))
+		return;
+
+	fl = dm_bufio_new(s->bufio, new_block, &bp);
+	if (IS_ERR(fl)) {
+		DMERR("alloc_write_freelist: can't make new freelist block %llx", (unsigned long long)new_block);
+		dm_multisnap_set_error(s->dm, PTR_ERR(fl));
+		return;
+	}
+
+	memcpy(fl, s->freelist, s->chunk_size);
+
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+
+	dm_multisnap_init_freelist(s->freelist, s->chunk_size);
+	write_48(s->freelist, backlink, new_block);
+}
+
+void dm_multisnap_free_block(struct dm_exception_store *s, chunk_t block, unsigned flags)
+{
+	if (likely(add_to_freelist(s, block, flags)))
+		return;
+
+	alloc_write_freelist(s);
+	if (dm_multisnap_has_error(s->dm))
+		return;
+
+	if (add_to_freelist(s, block, flags))
+		return;
+
+	BUG();
+}
+
+static int check_against_freelist(struct dm_multisnap_freelist *fl, chunk_t block)
+{
+	int i;
+	for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) {
+		chunk_t x = read_48(&fl->entries[i], block);
+		unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK;
+		if (block >= x && unlikely(block - x < r))
+			return 1;
+		cond_resched();
+	}
+	return 0;
+}
+
+static int check_against_freelist_chain(struct dm_exception_store *s, chunk_t fl_block, chunk_t block)
+{
+	stop_cycles_t cy;
+	dm_multisnap_init_stop_cycles(&cy);
+
+	while (unlikely(fl_block != 0)) {
+		int c;
+		struct dm_buffer *bp;
+		struct dm_multisnap_freelist *fl;
+
+		if (dm_multisnap_stop_cycles(s, &cy, fl_block))
+			return -1;
+
+		if (unlikely(block == fl_block))
+			return 1;
+
+		fl = read_freelist(s, fl_block, &bp);
+		if (unlikely(!fl))
+			return -1;
+		c = check_against_freelist(fl, block);
+		fl_block = read_48(fl, backlink);
+		dm_bufio_release(bp);
+		if (unlikely(c))
+			return c;
+	}
+	return 0;
+}
+
+int dm_multisnap_check_allocated_block(struct dm_exception_store *s, chunk_t block)
+{
+	int c;
+
+	c = check_against_freelist(s->freelist, block);
+	if (unlikely(c))
+		return c;
+
+	c = check_against_freelist_chain(s, read_48(s->freelist, backlink), block);
+	if (unlikely(c))
+		return c;
+
+	c = check_against_freelist_chain(s, s->freelist_ptr, block);
+	if (unlikely(c))
+		return c;
+
+	return 0;
+}
+
+void dm_multisnap_flush_freelist_before_commit(struct dm_exception_store *s)
+{
+	alloc_write_freelist(s);
+
+	if (dm_multisnap_has_error(s->dm))
+		return;
+
+	s->freelist_ptr = read_48(s->freelist, backlink);
+}
+
+static void free_blocks_in_freelist(struct dm_exception_store *s, struct dm_multisnap_freelist *fl)
+{
+	int i;
+	for (i = le32_to_cpu(fl->n_entries) - 1; i >= 0; i--) {
+		chunk_t x = read_48(&fl->entries[i], block);
+		unsigned r = le16_to_cpu(fl->entries[i].run_length) & FREELIST_RL_MASK;
+		unsigned f = le16_to_cpu(fl->entries[i].run_length) & FREELIST_DATA_FLAG;
+		dm_multisnap_free_blocks_immediate(s, x, r);
+		if (likely(f & FREELIST_DATA_FLAG)) {
+			dm_multisnap_status_lock(s->dm);
+			s->data_allocated -= r;
+			dm_multisnap_status_unlock(s->dm);
+		}
+		cond_resched();
+	}
+}
+
+void dm_multisnap_load_freelist(struct dm_exception_store *s)
+{
+	chunk_t fl_block = s->freelist_ptr;
+
+	stop_cycles_t cy;
+	dm_multisnap_init_stop_cycles(&cy);
+
+	while (fl_block) {
+		struct dm_buffer *bp;
+		struct dm_multisnap_freelist *fl;
+
+		if (dm_multisnap_stop_cycles(s, &cy, fl_block))
+			break;
+
+		if (dm_multisnap_has_error(s->dm))
+			break;
+
+		fl = read_freelist(s, fl_block, &bp);
+		if (!fl)
+			break;
+		memcpy(s->freelist, fl, s->chunk_size);
+		dm_bufio_release(bp);
+
+		free_blocks_in_freelist(s, s->freelist);
+		fl_block = read_48(s->freelist, backlink);
+	}
+
+	dm_multisnap_init_freelist(s->freelist, s->chunk_size);
+}
Index: linux-2.6.32/drivers/md/dm-multisnap-io.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-io.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+/*
+ * Check whether there is a remapping for a given snapid/chunk.
+ * Returns 0 if no remapping exists, 1 if a remapping exists but is read-only
+ * (shared with other snapshots), 2 if it exists and is read-write (not shared
+ * by anyone), and a negative number on error.
+ */
+
+int dm_multisnap_find_snapshot_chunk(struct dm_exception_store *s, snapid_t snapid, chunk_t chunk, int write, chunk_t *result)
+{
+	int r;
+	struct bt_key key;
+	mikulas_snapid_t from, to;
+	mikulas_snapid_t find_from, find_to;
+
+	key.chunk = chunk;
+	key.snap_from = snapid;
+	key.snap_to = snapid;
+	r = dm_multisnap_find_in_btree(s, &key, result);
+	if (!r) {
+		s->query_new_key.chunk = chunk;
+		s->query_new_key.snap_from = snapid;
+		s->query_new_key.snap_to = snapid;
+		s->query_active = 1;
+	}
+	if (r <= 0 || !write)
+		return r;
+
+	from = to = snapid;
+	if ((snapid & MIKULAS_SUBSNAPID_MASK) == MIKULAS_SUBSNAPID_MASK) {
+		from = snapid & ~MIKULAS_SUBSNAPID_MASK;
+		if (!dm_multisnap_find_next_snapid_range(s, snapid, &find_from, &find_to))
+			BUG();
+		if (from < find_from)
+			from = find_from;
+	}
+
+	/*
+	 * We are writing to a snapshot --- check if anything outside <from-to>
+	 * range exists, if it does, it needs to be copied.
+	 */
+
+	if (key.snap_from < from) {
+		if (likely(dm_multisnap_find_next_snapid_range(s, key.snap_from, &find_from, &find_to))) {
+			if (find_from < from) {
+				s->query_new_key.chunk = chunk;
+				s->query_new_key.snap_from = from;
+				s->query_new_key.snap_to = key.snap_to;
+				s->query_block_from = key.snap_from;
+				s->query_block_to = key.snap_to;
+				s->query_active = 2;
+				return 1;
+			}
+			if (unlikely(find_from > from))
+				BUG(); /* SNAPID not in our tree */
+		} else
+			BUG(); /* we're asking for a SNAPID not in our tree */
+	}
+	if (key.snap_to > to) {
+		if (likely(dm_multisnap_find_next_snapid_range(s, to + 1, &find_from, &find_to))) {
+			if (find_from <= key.snap_to) {
+				s->query_new_key.chunk = chunk;
+				s->query_new_key.snap_from = key.snap_from;
+				s->query_new_key.snap_to = to;
+				s->query_block_from = key.snap_from;
+				s->query_block_to = key.snap_to;
+				s->query_active = 2;
+				return 1;
+			}
+		}
+	}
+	return 2;
+}
+
+void dm_multisnap_reset_query(struct dm_exception_store *s)
+{
+	s->query_active = 0;
+
+	s->query_snapid = 0;
+}
+
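+/*
+ * Find the next snapid range of this chunk that is not yet remapped in the
+ * btree. Returns 1 and fills s->query_new_key when such a range exists,
+ * 0 when the chunk is remapped for all existing snapshots, -1 on error.
+ * The search continues from s->query_snapid; dm_multisnap_reset_query()
+ * rewinds it.
+ */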
+int dm_multisnap_query_next_remap(struct dm_exception_store *s, chunk_t chunk)
+{
+	int r;
+	chunk_t sink;
+	mikulas_snapid_t from, to;
+
+	s->query_active = 0;
+
+	while (dm_multisnap_find_next_snapid_range(s, s->query_snapid, &from, &to)) {
+		struct bt_key key;
+next_btree_search:
+		if (dm_multisnap_has_error(s->dm))
+			return -1;
+		key.chunk = chunk;
+		key.snap_from = from;
+		key.snap_to = to;
+		r = dm_multisnap_find_in_btree(s, &key, &sink);
+		if (unlikely(r < 0))
+			return -1;
+
+		if (!r) {
+			s->query_new_key.chunk = chunk;
+			s->query_new_key.snap_from = from;
+			s->query_new_key.snap_to = to;
+			s->query_active = 1;
+			return 1;
+		}
+
+		if (key.snap_from > from) {
+			s->query_new_key.chunk = chunk;
+			s->query_new_key.snap_from = from;
+			s->query_new_key.snap_to = key.snap_from - 1;
+			s->query_active = 1;
+			return 1;
+		}
+
+		if (key.snap_to < to) {
+			from = key.snap_to + 1;
+			goto next_btree_search;
+		}
+
+		s->query_snapid = to + 1;
+	}
+
+	return 0;
+}
+
+void dm_multisnap_add_next_remap(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk)
+{
+	int r;
+
+	BUG_ON(s->query_active != 1);
+	s->query_active = 0;
+
+	cd->range.from = s->query_new_key.snap_from;
+	cd->range.to = s->query_new_key.snap_to;
+
+	r = dm_multisnap_alloc_blocks(s, new_chunk, 1, 0);
+	if (unlikely(r < 0))
+		return;
+
+	dm_multisnap_status_lock(s->dm);
+	s->data_allocated++;
+	dm_multisnap_status_unlock(s->dm);
+
+	dm_multisnap_add_to_btree(s, &s->query_new_key, *new_chunk);
+	dm_multisnap_transaction_mark(s);
+}
+
+void dm_multisnap_make_chunk_writeable(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk)
+{
+	int r;
+
+	BUG_ON(s->query_active != 2);
+	s->query_active = 0;
+
+	cd->range.from = s->query_block_from;
+	cd->range.to = s->query_block_to;
+
+	r = dm_multisnap_alloc_blocks(s, new_chunk, 1, 0);
+	if (unlikely(r < 0))
+		return;
+
+	dm_multisnap_status_lock(s->dm);
+	s->data_allocated++;
+	dm_multisnap_status_unlock(s->dm);
+
+	dm_multisnap_restrict_btree_entry(s, &s->query_new_key);
+	dm_multisnap_transaction_mark(s);
+
+	if (unlikely(dm_multisnap_has_error(s->dm)))
+		return;
+
+	dm_multisnap_add_to_btree(s, &s->query_new_key, *new_chunk);
+	dm_multisnap_transaction_mark(s);
+}
+
+int dm_multisnap_check_conflict(struct dm_exception_store *s, union chunk_descriptor *cd, snapid_t snapid)
+{
+	return snapid >= cd->range.from && snapid <= cd->range.to;
+}
+
Index: linux-2.6.32/drivers/md/dm-multisnap-mikulas-struct.h
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-mikulas-struct.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_MIKULAS_STRUCT_H
+#define DM_MULTISNAP_MIKULAS_STRUCT_H
+
+/* on-disk structures */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#include "dm-multisnap.h"
+
+/*
+ * Encoding of snapshot numbers:
+ *
+ * If CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP is not selected (normally it
+ * is), then mikulas_snapid_t is a 32-bit sequential number that continually
+ * grows.
+ *
+ * If CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP is selected (the default),
+ * then mikulas_snapid_t is a 64-bit number. The high 32 bits are a sequential
+ * snapshot number that is incremented with each new snapshot. The low 32 bits
+ * are a subsnapshot number. Single snapshots (snapshots of the origin) have
+ * the low 32 bits set to all ones. Snapshots-of-snapshots have the same high
+ * 32 bits as their master snapshot and a low 32-bit part that starts at zero
+ * and is incremented with each new snapshot-of-snapshot.
+ *
+ * More levels (snapshots-of-snapshots-of-snapshots) are not allowed.
+ */
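+
+/*
+ * For example, a snapshot of the origin with sequence number 2 has snapid
+ * 0x00000002FFFFFFFF (subsnapshot part all ones), and the first snapshot
+ * taken of that snapshot has snapid 0x0000000200000000.
+ */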
+
+#ifndef CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP
+typedef __u32 mikulas_snapid_t;
+#define MIKULAS_SNAPID_STEP_BITS	0
+#define mikulas_snapid_to_cpu		le32_to_cpu
+#define cpu_to_mikulas_snapid		cpu_to_le32
+#else
+typedef __u64 mikulas_snapid_t;
+#define MIKULAS_SNAPID_STEP_BITS	32
+#define mikulas_snapid_to_cpu		le64_to_cpu
+#define cpu_to_mikulas_snapid		cpu_to_le64
+#endif
+
+#define MIKULAS_SUBSNAPID_MASK	(((mikulas_snapid_t)1 << MIKULAS_SNAPID_STEP_BITS) - 1)
+#define SNAPID_T_LAST		((mikulas_snapid_t)0xffffffffffffffffULL)
+#define SNAPID_T_MAX		((mikulas_snapid_t)0xfffffffffffffffeULL)
+
+#define CHUNK_BITS		48
+#define CHUNK_T_LAST		((chunk_t)(1LL << CHUNK_BITS) - 1)
+#define CHUNK_T_SNAP_PRESENT	((chunk_t)(1LL << CHUNK_BITS) - 1)
+#define CHUNK_T_MAX		((chunk_t)(1LL << CHUNK_BITS) - 2)
+
+#define CB_STRIDE_DEFAULT	1024
+
+#define SB_BLOCK	0
+
+#define SB_SIGNATURE	cpu_to_be32(0xF6015342)
+
+struct multisnap_superblock {
+	__u32 signature;
+	__u32 chunk_size;
+	__u32 cb_stride;
+	__s32 error;
+	__u64 commit_block;
+};
+
+
+#define FIRST_CB_BLOCK	1
+
+#define CB_SIGNATURE	cpu_to_be32(0xF6014342)
+
+struct commit_block_tmp_remap {
+	__u32 old1;
+	__u16 old2;
+	__u16 new2;
+	__u32 new1;
+	__u32 bitmap_idx;
+};
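+
+/*
+ * The xxx1/xxx2 field pairs above and in the structures below each store one
+ * 48-bit on-disk value split into a 32-bit and a 16-bit part; they are meant
+ * to be accessed through the read_48()/write_48() helpers rather than
+ * directly.
+ */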
+
+#define CB_BITMAP_IDX_MAX	0xfffffffd
+#define CB_BITMAP_IDX_NONE	0xfffffffe
+#define CB_BITMAP_IDX_FREE	0xffffffff
+
+#define N_REMAPS	27
+
+struct multisnap_commit_block {
+	__u32 signature;
+	__u32 snapshot_num;
+	__u64 sequence;
+
+	__u32 dev_size1;
+	__u16 dev_size2;
+	__u16 total_allocated2;
+	__u32 total_allocated1;
+	__u32 data_allocated1;
+
+	__u16 data_allocated2;
+	__u16 bitmap_root2;
+	__u32 bitmap_root1;
+	__u32 alloc_rover1;
+	__u16 alloc_rover2;
+	__u16 freelist2;
+
+	__u32 freelist1;
+	__u32 delete_rover1;
+	__u16 delete_rover2;
+	__u16 bt_root2;
+	__u32 bt_root1;
+
+	__u8 bt_depth;
+	__u8 flags;
+	__u8 pad[14];
+
+	struct commit_block_tmp_remap tmp_remap[N_REMAPS];
+};
+
+#define MULTISNAP_FLAG_DELETING		0x01
+#define MULTISNAP_FLAG_PENDING_DELETE	0x02
+
+#define MAX_BITMAP_DEPTH		6
+
+static inline int dm_multisnap_bitmap_depth(unsigned chunk_shift, __u64 device_size)
+{
+	unsigned depth = 0;
+	__u64 entries = 8 << chunk_shift;
+	while (entries < device_size) {
+		depth++;
+		entries <<= chunk_shift - 3;
+		if (!entries)
+			return -ERANGE;
+	}
+
+	if (depth > MAX_BITMAP_DEPTH)
+		return -ERANGE;
+
+	return depth;
+}
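+
+/*
+ * For example, with 4KiB chunks (chunk_shift == 12) a depth of 0 covers
+ * 8 << 12 = 32768 chunks; every additional level multiplies the coverage by
+ * 2^(12 - 3) = 512, so depth 1 covers 16M chunks (64GiB worth of 4KiB
+ * chunks) and depth 2 covers 8G chunks (32TiB).
+ */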
+
+
+/* B+-tree entry. Sorted by orig_chunk and snap_from/to */
+
+#define MAX_BT_DEPTH		12
+
+struct dm_multisnap_bt_entry {
+	__u32 orig_chunk1;
+	__u16 orig_chunk2;
+	__u16 new_chunk2;
+	__u32 new_chunk1;
+	__u32 flags;
+	mikulas_snapid_t snap_from;
+	mikulas_snapid_t snap_to;
+};
+
+#define BT_SIGNATURE	cpu_to_be32(0xF6014254)
+
+struct dm_multisnap_bt_node {
+	__u32 signature;
+	__u32 n_entries;
+	struct dm_multisnap_bt_entry entries[0];
+};
+
+static inline unsigned dm_multisnap_btree_entries(unsigned chunk_size)
+{
+	return (chunk_size - sizeof(struct dm_multisnap_bt_node)) / sizeof(struct dm_multisnap_bt_entry);
+}
+
+
+/* Freelist */
+
+struct dm_multisnap_freelist_entry {
+	__u32 block1;
+	__u16 block2;
+	__u16 run_length;
+};
+
+#define FREELIST_RL_MASK	0x7fff
+#define FREELIST_DATA_FLAG	0x8000
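+
+/*
+ * Each freelist entry describes a run of consecutive free chunks: a 48-bit
+ * starting block (block1/block2) plus a run length in the low 15 bits of
+ * run_length. FREELIST_DATA_FLAG is set when the run consists of data
+ * chunks, so that data_allocated can be decremented when the run is finally
+ * released.
+ */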
+
+#define FL_SIGNATURE	cpu_to_be32(0xF601464C)
+
+struct dm_multisnap_freelist {
+	__u32 signature;
+	__u32 backlink1;
+	__u16 backlink2;
+	__u32 n_entries;
+	struct dm_multisnap_freelist_entry entries[0];
+};
+
+static inline unsigned dm_multisnap_freelist_entries(unsigned chunk_size)
+{
+	return (chunk_size - sizeof(struct dm_multisnap_freelist)) / sizeof(struct dm_multisnap_freelist_entry);
+}
+
+#endif
Index: linux-2.6.32/drivers/md/dm-multisnap-mikulas.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-mikulas.c
@@ -0,0 +1,667 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+static void init_commit_block(struct dm_exception_store *s)
+{
+	int i;
+
+	dm_multisnap_init_freelist(s->freelist, s->chunk_size);
+
+	s->snapshot_num = 0;
+	s->total_allocated = 0;
+	s->data_allocated = 0;
+	s->bitmap_root = 0;
+	s->alloc_rover = 0;
+	s->freelist_ptr = 0;
+	s->delete_rover_chunk = 0;
+	s->delete_rover_snapid = 0;
+	s->bt_root = 0;
+	s->bt_depth = 0;
+	s->flags = 0;
+
+	for (i = 0; i < TMP_REMAP_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->tmp_remap[i]);
+	s->n_used_tmp_remaps = 0;
+	INIT_LIST_HEAD(&s->used_bitmap_tmp_remaps);
+	INIT_LIST_HEAD(&s->used_bt_tmp_remaps);
+	INIT_LIST_HEAD(&s->free_tmp_remaps);
+
+	for (i = 0; i < N_REMAPS; i++) {
+		struct tmp_remap *t = &s->tmp_remap_store[i];
+		list_add(&t->list, &s->free_tmp_remaps);
+	}
+
+	s->dev_size = 0;
+	s->bitmap_depth = 0;
+	s->btree_entries = dm_multisnap_btree_entries(s->chunk_size);
+}
+
+static void load_commit_block(struct dm_exception_store *s)
+{
+	struct dm_buffer *bp;
+	struct multisnap_commit_block *cb;
+	__u64 dev_size;
+	int bitmap_depth;
+	unsigned i;
+
+	cb = dm_bufio_read(s->bufio, s->valid_commit_block, &bp);
+	if (IS_ERR(cb)) {
+		DMERR("load_commit_block: can't re-read commit block %llx", (unsigned long long)s->valid_commit_block);
+		dm_multisnap_set_error(s->dm, PTR_ERR(cb));
+		return;
+	}
+	if (cb->signature != CB_SIGNATURE) {
+		dm_bufio_release(bp);
+		DMERR("load_commit_block: bad signature when re-reading commit block %llx", (unsigned long long)s->valid_commit_block);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	init_commit_block(s);
+
+	dev_size = read_48(cb, dev_size);
+	s->snapshot_num = le32_to_cpu(cb->snapshot_num);
+	s->total_allocated = read_48(cb, total_allocated);
+	s->data_allocated = read_48(cb, data_allocated);
+	s->bitmap_root = read_48(cb, bitmap_root);
+	s->alloc_rover = read_48(cb, alloc_rover);
+	s->freelist_ptr = read_48(cb, freelist);
+	s->delete_rover_chunk = read_48(cb, delete_rover);
+	s->delete_rover_snapid = 0;
+	s->bt_root = read_48(cb, bt_root);
+	s->bt_depth = cb->bt_depth;
+	s->flags = cb->flags;
+
+	if (s->bt_depth > MAX_BT_DEPTH || !s->bt_depth) {
+		dm_bufio_release(bp);
+		DMERR("load_commit_block: invalid b+-tree depth in commit block %llx", (unsigned long long)s->valid_commit_block);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	INIT_LIST_HEAD(&s->free_tmp_remaps);
+	for (i = 0; i < N_REMAPS; i++) {
+		struct tmp_remap *t = &s->tmp_remap_store[i];
+		if (read_48(&cb->tmp_remap[i], old)) {
+			t->old = read_48(&cb->tmp_remap[i], old);
+			t->new = read_48(&cb->tmp_remap[i], new);
+			t->uncommitted = 0;
+			t->bitmap_idx = le32_to_cpu(cb->tmp_remap[i].bitmap_idx);
+			hlist_add_head(&t->hash_list, &s->tmp_remap[TMP_REMAP_HASH(t->old)]);
+			if (t->bitmap_idx == CB_BITMAP_IDX_NONE)
+				list_add(&t->list, &s->used_bt_tmp_remaps);
+			else
+				list_add(&t->list, &s->used_bitmap_tmp_remaps);
+			s->n_used_tmp_remaps++;
+		} else {
+			list_add(&t->list, &s->free_tmp_remaps);
+		}
+	}
+
+	dm_bufio_release(bp);
+
+	if ((chunk_t)(dev_size + s->cb_stride) < (chunk_t)dev_size) {
+		DMERR("load_commit_block: device is too large. Compile kernel with 64-bit sector numbers");
+		dm_multisnap_set_error(s->dm, -ERANGE);
+		return;
+	}
+	bitmap_depth = dm_multisnap_bitmap_depth(s->chunk_shift, dev_size);
+	if (bitmap_depth < 0) {
+		DMERR("load_commit_block: device is too large");
+		dm_multisnap_set_error(s->dm, bitmap_depth);
+		return;
+	}
+	s->dev_size = dev_size;
+	s->bitmap_depth = bitmap_depth;
+
+	dm_multisnap_load_freelist(s);
+}
+
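+/*
+ * Scan the chain of commit blocks, starting at the block recorded in the
+ * superblock and stepping by cb_stride, and remember the block with the
+ * highest sequence number --- that one describes the most recently
+ * committed state.
+ */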
+static void find_commit_block(struct dm_exception_store *s)
+{
+	struct dm_buffer *bp;
+	struct multisnap_commit_block *cb;
+	chunk_t cb_addr = s->sb_commit_block;
+	__u64 sequence;
+	__u64 dev_size;
+	s->valid_commit_block = 0;
+	s->commit_sequence = 0;
+
+try_next:
+	cb = dm_bufio_read(s->bufio, cb_addr, &bp);
+	if (IS_ERR(cb)) {
+		DMERR("find_commit_block: can't read commit block %llx", (unsigned long long)cb_addr);
+		dm_multisnap_set_error(s->dm, PTR_ERR(cb));
+		return;
+	}
+	if (cb->signature != CB_SIGNATURE) {
+		dm_bufio_release(bp);
+		DMERR("find_commit_block: bad signature on commit block %llx", (unsigned long long)cb_addr);
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+
+	sequence = le64_to_cpu(cb->sequence);
+	dev_size = read_48(cb, dev_size);
+
+	dm_bufio_release(bp);
+
+	if (sequence > s->commit_sequence) {
+		s->commit_sequence = sequence;
+		s->valid_commit_block = cb_addr;
+		if ((__u64)cb_addr + s->cb_stride < dev_size) {
+			cb_addr += s->cb_stride;
+			goto try_next;
+		}
+	}
+	if (!s->valid_commit_block) {
+		DMERR("find_commit_block: no valid commit block");
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		return;
+	}
+}
+
+static int get_size(struct dm_exception_store *s, chunk_t *size)
+{
+	__u64 dev_size;
+	dev_size = i_size_read(dm_multisnap_snapshot_bdev(s->dm)->bd_inode) >> s->chunk_shift;
+	*size = dev_size;
+	if ((chunk_t)(dev_size + s->cb_stride) < dev_size)
+		return -EFBIG;
+
+	return 0;
+}
+
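+/*
+ * Format a new snapshot store: lay out an empty b+-tree and the space
+ * bitmaps right after the superblock chunk, write a commit block at every
+ * cb_stride-th chunk starting at FIRST_CB_BLOCK (only the first one gets a
+ * nonzero sequence number) and write the superblock last.
+ */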
+static void initialize_device(struct dm_exception_store *s)
+{
+	int r;
+	struct dm_buffer *bp;
+	struct multisnap_superblock *sb;
+	struct multisnap_commit_block *cb;
+	chunk_t cb_block;
+	chunk_t block_to_write;
+
+	s->cb_stride = CB_STRIDE_DEFAULT;
+
+	r = get_size(s, &s->dev_size);
+	if (r) {
+		DMERR("initialize_device: device is too large. Compile kernel with 64-bit sector numbers");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+
+	s->total_allocated = 0;
+	s->data_allocated = 0;
+
+	block_to_write = SB_BLOCK + 1;
+
+/* Write btree */
+	dm_multisnap_create_btree(s, &block_to_write);
+	if (dm_multisnap_has_error(s->dm))
+		return;
+
+/* Write bitmaps */
+	dm_multisnap_create_bitmaps(s, block_to_write);
+	if (dm_multisnap_has_error(s->dm))
+		return;
+
+/* Write commit blocks */
+	if (FIRST_CB_BLOCK >= s->dev_size) {
+		DMERR("initialize_device: device is too small");
+		dm_multisnap_set_error(s->dm, -ENOSPC);
+		return;
+	}
+	for (cb_block = FIRST_CB_BLOCK; cb_block < s->dev_size; cb_block += s->cb_stride) {
+		cb = dm_bufio_new(s->bufio, cb_block, &bp);
+		if (IS_ERR(cb)) {
+			DMERR("initialize_device: can't allocate commit block at %llx", (unsigned long long)cb_block);
+			dm_multisnap_set_error(s->dm, PTR_ERR(cb));
+			return;
+		}
+		memset(cb, 0, s->chunk_size);
+		cb->signature = CB_SIGNATURE;
+		cb->sequence = cpu_to_le64(cb_block == FIRST_CB_BLOCK);
+		if (cb_block == FIRST_CB_BLOCK) {
+			cb->snapshot_num = cpu_to_le32(0);
+			write_48(cb, dev_size, s->dev_size);
+			write_48(cb, total_allocated, s->total_allocated);
+			write_48(cb, data_allocated, s->data_allocated);
+			write_48(cb, bitmap_root, s->bitmap_root);
+			write_48(cb, freelist, 0);
+			write_48(cb, delete_rover, 0);
+			write_48(cb, bt_root, s->bt_root);
+			cb->bt_depth = s->bt_depth;
+			cb->flags = 0;
+		}
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+	}
+	r = dm_bufio_write_dirty_buffers(s->bufio);
+	if (r) {
+		DMERR("initialize_device: write error when initializing device");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+
+/* Write super block */
+	sb = dm_bufio_new(s->bufio, SB_BLOCK, &bp);
+	if (IS_ERR(sb)) {
+		DMERR("initialize_device: can't allocate super block");
+		dm_multisnap_set_error(s->dm, PTR_ERR(sb));
+		return;
+	}
+	memset(sb, 0, s->chunk_size);
+	sb->signature = SB_SIGNATURE;
+	sb->chunk_size = cpu_to_le32(s->chunk_size);
+	sb->cb_stride = cpu_to_le32(s->cb_stride);
+	sb->error = cpu_to_le32(0);
+	sb->commit_block = cpu_to_le64(FIRST_CB_BLOCK);
+	dm_bufio_mark_buffer_dirty(bp);
+	dm_bufio_release(bp);
+	r = dm_bufio_write_dirty_buffers(s->bufio);
+	if (r) {
+		DMERR("initialize_device: can't write super block");
+		dm_multisnap_set_error(s->dm, r);
+		return;
+	}
+}
+
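+/*
+ * Grow the store after the underlying device has been extended: write
+ * commit blocks in the newly added area, extend the space bitmaps to the
+ * new size and commit.
+ */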
+static void extend_exception_store(struct dm_exception_store *s, chunk_t new_size)
+{
+	struct dm_buffer *bp;
+	chunk_t cb_block;
+	struct multisnap_commit_block *cb;
+
+	/*printk("extending store: %Lx -> %Lx\n", (unsigned long long)s->dev_size, (unsigned long long)new_size);*/
+
+/* Write commit blocks */
+	for (cb_block = FIRST_CB_BLOCK; cb_block < new_size; cb_block += s->cb_stride) {
+		cond_resched();
+		if (cb_block < s->dev_size)
+			continue;
+		cb = dm_bufio_new(s->bufio, cb_block, &bp);
+		if (IS_ERR(cb)) {
+			DMERR("extend_exception_store: can't allocate commit block at %llx", (unsigned long long)cb_block);
+			dm_multisnap_set_error(s->dm, PTR_ERR(cb));
+			return;
+		}
+		memset(cb, 0, s->chunk_size);
+		cb->signature = CB_SIGNATURE;
+		cb->sequence = cpu_to_le64(0);
+		dm_bufio_mark_buffer_dirty(bp);
+		dm_bufio_release(bp);
+	}
+
+	dm_multisnap_extend_bitmaps(s, new_size);
+
+	s->valid_commit_block = (chunk_t)-1;
+
+	dm_multisnap_commit(s);
+}
+
+static int read_super(struct dm_exception_store *s, char **error)
+{
+	struct dm_buffer *bp;
+	struct multisnap_superblock *sb;
+	int initialized;
+	__s32 e;
+
+	init_commit_block(s);
+
+	initialized = 0;
+re_read:
+	sb = dm_bufio_read(s->bufio, SB_BLOCK, &bp);
+	if (IS_ERR(sb)) {
+		*error = "Could not read superblock";
+		return PTR_ERR(sb);
+	}
+
+	if (sb->signature != SB_SIGNATURE) {
+		int i;
+		if (initialized) {
+			dm_bufio_release(bp);
+			*error = "Invalid signature after initialization";
+			return -EIO;
+		}
+		for (i = 0; i < 1 << SECTOR_SHIFT; i++) {
+			if (((char *)sb)[i]) {
+				dm_bufio_release(bp);
+				*error = "Uninitialized device";
+				return -ENXIO;
+			}
+		}
+		dm_bufio_release(bp);
+		initialize_device(s);
+		if (dm_multisnap_has_error(s->dm)) {
+			*error = "Can't initialize device";
+			return dm_multisnap_has_error(s->dm);
+		}
+		initialized = 1;
+		goto re_read;
+	}
+	if (le32_to_cpu(sb->chunk_size) != s->chunk_size) {
+		dm_bufio_release(bp);
+		*error = "Bad chunk size";
+		return -EINVAL;
+	}
+	s->cb_stride = le32_to_cpu(sb->cb_stride);
+	if (s->cb_stride <= 1) {
+		dm_bufio_release(bp);
+		*error = "Bad commit block stride in superblock";
+		return -EFSERROR;
+	}
+	s->sb_commit_block = le64_to_cpu(sb->commit_block);
+	e = le32_to_cpu(sb->error);
+	dm_bufio_release(bp);
+
+	find_commit_block(s);
+
+	if (dm_multisnap_has_error(s->dm)) {
+		if (dm_multisnap_drop_on_error(s->dm))
+			return 0;
+		*error = "Unable to find commit block";
+		return dm_multisnap_has_error(s->dm);
+	}
+
+	load_commit_block(s);
+
+	if (dm_multisnap_has_error(s->dm)) {
+		if (dm_multisnap_drop_on_error(s->dm))
+			return 0;
+		*error = "Unable to load commit block";
+		return dm_multisnap_has_error(s->dm);
+	}
+
+	if (e < 0) {
+		/* Don't read the B+-tree if there was an error */
+		DMERR("read_super: activating invalidated snapshot store, error %d", e);
+		dm_multisnap_set_error(s->dm, e);
+		return 0;
+	}
+
+	dm_multisnap_read_snapshots(s);
+	if (dm_multisnap_has_error(s->dm)) {
+		if (dm_multisnap_drop_on_error(s->dm))
+			return 0;
+		*error = "Could not read snapshot list";
+		return dm_multisnap_has_error(s->dm);
+	}
+
+	return 0;
+}
+
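+/*
+ * The store_lock_acquired method, called by the generic layer each time it
+ * takes the master lock. If the snapshot device has grown since the last
+ * check, the exception store is extended; a shrunk device is treated as an
+ * error.
+ */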
+static void dm_multisnap_mikulas_lock_acquired(struct dm_exception_store *s, int flags)
+{
+	int r;
+	chunk_t new_size;
+
+	if (!dm_multisnap_can_commit(s->dm))
+		return;
+
+	r = get_size(s, &new_size);
+	if (unlikely(r))
+		return;
+
+	if (unlikely(new_size != s->dev_size)) {
+		if (unlikely(new_size < s->dev_size)) {
+			DMERR("dm_multisnap_mikulas_lock_acquired: device shrank");
+			dm_multisnap_set_error(s->dm, -EINVAL);
+			return;
+		}
+		extend_exception_store(s, new_size);
+	}
+}
+
+/*#define PRINT_BTREE*/
+
+#ifdef PRINT_BTREE
+static int print_btree_callback(struct dm_exception_store *s, struct dm_multisnap_bt_node *node, struct dm_multisnap_bt_entry *bt, void *cookie)
+{
+	printk(KERN_DEBUG "entry: %llx, %x-%x -> %llx\n", (unsigned long long)read_48(bt, orig_chunk), mikulas_snapid_to_cpu(bt->snap_from), mikulas_snapid_to_cpu(bt->snap_to), (unsigned long long)read_48(bt, new_chunk));
+	return 0;
+}
+
+static void print_btree(struct dm_exception_store *s)
+{
+	struct bt_key key = { 0, 0, 0 };
+	int r = dm_multisnap_list_btree(s, &key, print_btree_callback, NULL);
+	printk(KERN_DEBUG "list ended: %d\n", r);
+}
+#endif
+
+/*#define PRINT_BITMAPS*/
+
+#ifdef PRINT_BITMAPS
+static void print_bitmaps(struct dm_exception_store *s)
+{
+	chunk_t c;
+	printk(KERN_DEBUG "allocated:");
+	for (c = 0; c < s->dev_size; c += s->chunk_size * 8) {
+		struct dm_buffer *bp;
+		unsigned i;
+		void *bmp = dm_multisnap_map_bitmap(s, c >> (s->chunk_shift + 3), &bp, NULL, NULL);
+		if (!bmp)
+			continue;
+		for (i = 0; i < s->chunk_size * 8; i++)
+			if (generic_test_le_bit(i, bmp)) {
+				chunk_t block = c + i;
+				if (!dm_multisnap_is_commit_block(s, block))
+					printk(" %llx", (unsigned long long)block);
+				cond_resched();
+			}
+		dm_bufio_release(bp);
+	}
+	printk("\n");
+}
+#endif
+
+static int dm_multisnap_mikulas_init(struct dm_multisnap *dm, struct dm_exception_store **sp, unsigned argc, char **argv, char **error)
+{
+	int r;
+	struct dm_exception_store *s;
+
+	s = kzalloc(sizeof(struct dm_exception_store), GFP_KERNEL);
+	if (!s) {
+		*error = "Could not allocate private area";
+		r = -ENOMEM;
+		goto bad_private;
+	}
+	*sp = s;
+
+	s->dm = dm;
+	s->chunk_size = dm_multisnap_chunk_size(dm);
+	s->chunk_shift = ffs(s->chunk_size) - 1;
+
+	s->active_snapshots = RB_ROOT;
+	s->n_preallocated_blocks = 0;
+	s->query_active = 0;
+
+	s->delete_work.work = dm_multisnap_background_delete;
+	s->delete_work.queued = 0;
+	s->delete_commit_count = 0;
+
+	s->cache_threshold = 0;
+	s->cache_limit = 0;
+
+	while (argc) {
+		char *string;
+		r = dm_multisnap_get_string(&argv, &argc, &string, error);
+		if (r)
+			goto bad_arguments;
+		if (!strcasecmp(string, "cache-threshold")) {
+			r = dm_multisnap_get_uint64(&argv, &argc, &s->cache_threshold, error);
+			if (r)
+				goto bad_arguments;
+		} else if (!strcasecmp(string, "cache-limit")) {
+			r = dm_multisnap_get_uint64(&argv, &argc, &s->cache_limit, error);
+			if (r)
+				goto bad_arguments;
+		} else {
+			*error = "Unknown parameter";
+			r = -EINVAL;
+			goto bad_arguments;
+		}
+	}
+
+	s->tmp_chunk = vmalloc(s->chunk_size + sizeof(struct dm_multisnap_bt_entry));
+	if (!s->tmp_chunk) {
+		*error = "Can't allocate temporary chunk";
+		r = -ENOMEM;
+		goto bad_tmp_chunk;
+	}
+
+	s->freelist = vmalloc(s->chunk_size);
+	if (!s->freelist) {
+		*error = "Can't allocate freelist";
+		r = -ENOMEM;
+		goto bad_freelist;
+	}
+
+	s->bufio = dm_bufio_client_create(dm_multisnap_snapshot_bdev(s->dm), s->chunk_size, 0, s->cache_threshold, s->cache_limit);
+	if (IS_ERR(s->bufio)) {
+		*error = "Can't create bufio client";
+		r = PTR_ERR(s->bufio);
+		goto bad_bufio;
+	}
+
+	r = read_super(s, error);
+	if (r)
+		goto bad_super;
+
+	if (s->flags & (MULTISNAP_FLAG_DELETING | MULTISNAP_FLAG_PENDING_DELETE))
+		dm_multisnap_queue_work(s->dm, &s->delete_work);
+
+#ifdef PRINT_BTREE
+	print_btree(s);
+#endif
+#ifdef PRINT_BITMAPS
+	print_bitmaps(s);
+#endif
+
+	return 0;
+
+bad_super:
+	dm_bufio_client_destroy(s->bufio);
+bad_bufio:
+	vfree(s->freelist);
+bad_freelist:
+	vfree(s->tmp_chunk);
+bad_tmp_chunk:
+bad_arguments:
+	kfree(s);
+bad_private:
+	return r;
+}
+
+static void dm_multisnap_mikulas_exit(struct dm_exception_store *s)
+{
+	int i;
+
+	dm_multisnap_cancel_work(s->dm, &s->delete_work);
+
+	i = 0;
+	while (!list_empty(&s->used_bitmap_tmp_remaps)) {
+		struct tmp_remap *t = list_first_entry(&s->used_bitmap_tmp_remaps, struct tmp_remap, list);
+		list_del(&t->list);
+		hlist_del(&t->hash_list);
+		i++;
+	}
+
+	while (!list_empty(&s->used_bt_tmp_remaps)) {
+		struct tmp_remap *t = list_first_entry(&s->used_bt_tmp_remaps, struct tmp_remap, list);
+		list_del(&t->list);
+		hlist_del(&t->hash_list);
+		i++;
+	}
+
+	BUG_ON(i != s->n_used_tmp_remaps);
+	while (!list_empty(&s->free_tmp_remaps)) {
+		struct tmp_remap *t = list_first_entry(&s->free_tmp_remaps, struct tmp_remap, list);
+		list_del(&t->list);
+		i++;
+	}
+	BUG_ON(i != N_REMAPS);
+
+	for (i = 0; i < TMP_REMAP_HASH_SIZE; i++)
+		BUG_ON(!hlist_empty(&s->tmp_remap[i]));
+
+	dm_bufio_client_destroy(s->bufio);
+	vfree(s->freelist);
+	vfree(s->tmp_chunk);
+	kfree(s);
+}
+
+static void dm_multisnap_status_table(struct dm_exception_store *s, char *result, unsigned maxlen)
+{
+	int npar = 0;
+	if (s->cache_threshold)
+		npar += 2;
+	if (s->cache_limit)
+		npar += 2;
+
+	snprintf(result, maxlen, " %d", npar);
+	dm_multisnap_adjust_string(&result, &maxlen);
+
+	if (s->cache_threshold) {
+		snprintf(result, maxlen, " cache-threshold %llu", (unsigned long long)s->cache_threshold);
+		dm_multisnap_adjust_string(&result, &maxlen);
+	}
+	if (s->cache_limit) {
+		snprintf(result, maxlen, " cache-limit %llu", (unsigned long long)s->cache_limit);
+		dm_multisnap_adjust_string(&result, &maxlen);
+	}
+}
+
+struct dm_multisnap_exception_store dm_multisnap_mikulas_store = {
+	.name			= "mikulas",
+	.module			= THIS_MODULE,
+	.init_exception_store	= dm_multisnap_mikulas_init,
+	.exit_exception_store	= dm_multisnap_mikulas_exit,
+	.store_lock_acquired	= dm_multisnap_mikulas_lock_acquired,
+#ifdef CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP
+	.print_snapid		= dm_multisnap_print_snapid,
+	.read_snapid		= dm_multisnap_read_snapid,
+#endif
+	.status_table		= dm_multisnap_status_table,
+	.get_space		= dm_multisnap_get_space,
+	.allocate_snapid	= dm_multisnap_allocate_snapid,
+	.create_snapshot	= dm_multisnap_create_snapshot,
+	.delete_snapshot	= dm_multisnap_delete_snapshot,
+	.get_next_snapid	= dm_multisnap_get_next_snapid,
+	.compare_snapids_for_create = dm_multisnap_compare_snapids_for_create,
+	.find_snapshot_chunk	= dm_multisnap_find_snapshot_chunk,
+	.reset_query		= dm_multisnap_reset_query,
+	.query_next_remap	= dm_multisnap_query_next_remap,
+	.add_next_remap		= dm_multisnap_add_next_remap,
+	.make_chunk_writeable	= dm_multisnap_make_chunk_writeable,
+	.check_conflict		= dm_multisnap_check_conflict,
+	.commit			= dm_multisnap_commit,
+};
+
+static int __init dm_multisnapshot_mikulas_module_init(void)
+{
+	BUG_ON(sizeof(struct multisnap_commit_block) != 512);
+	return dm_multisnap_register_exception_store(&dm_multisnap_mikulas_store);
+}
+
+static void __exit dm_multisnapshot_mikulas_module_exit(void)
+{
+	dm_multisnap_unregister_exception_store(&dm_multisnap_mikulas_store);
+}
+
+module_init(dm_multisnapshot_mikulas_module_init);
+module_exit(dm_multisnapshot_mikulas_module_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multisnapshot Mikulas' exception store");
+MODULE_AUTHOR("Mikulas Patocka");
+MODULE_LICENSE("GPL");
+
Index: linux-2.6.32/drivers/md/dm-multisnap-mikulas.h
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-mikulas.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_MIKULAS_H
+#define DM_MULTISNAP_MIKULAS_H
+
+/*
+ * This can optionally be undefined to get 32-bit snapshot numbers.
+ * Changing it breaks on-disk format compatibility.
+ */
+#define CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP
+
+#include "dm-multisnap.h"
+#include "dm-multisnap-mikulas-struct.h"
+
+#include "dm-bufio.h"
+
+#include <linux/vmalloc.h>
+
+typedef __u32 bitmap_t;
+
+#define read_48(struc, entry)		(le32_to_cpu((struc)->entry##1) | ((chunk_t)le16_to_cpu((struc)->entry##2) << 31 << 1))
+#define write_48(struc, entry, val)	do { (struc)->entry##1 = cpu_to_le32(val); (struc)->entry##2 = cpu_to_le16((chunk_t)(val) >> 31 >> 1); } while (0)
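+
+/*
+ * Chunk numbers are stored on disk as 48-bit values split into a 32-bit
+ * little-endian low part ("<field>1") and a 16-bit little-endian high part
+ * ("<field>2"). The shift is written as "<< 31 << 1" rather than "<< 32"
+ * so that the expression stays defined even when chunk_t is a 32-bit type.
+ *
+ * Illustrative use (field names as in the commit block):
+ *	write_48(cb, bt_root, s->bt_root);
+ *	s->bt_root = read_48(cb, bt_root);
+ */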
+
+#define TMP_REMAP_HASH_SIZE	256
+#define TMP_REMAP_HASH(c)	((c) & (TMP_REMAP_HASH_SIZE - 1))
+
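+/*
+ * A tmp_remap describes a metadata chunk (a bitmap block or a b+-tree
+ * node) that has been copied from chunk "old" to chunk "new" but not yet
+ * folded back into the structures that reference it. The entries are
+ * hashed by "old" so block accesses can be redirected, they are recorded
+ * in the commit block, and they are eventually resolved by the
+ * *_finalize_tmp_remap helpers.
+ */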
+struct tmp_remap {
+	/* List entry for tmp_remap */
+	struct hlist_node hash_list;
+	/* List entry for used_tmp_remaps/free_tmp_remaps */
+	struct list_head list;
+	chunk_t old;
+	chunk_t new;
+	bitmap_t bitmap_idx;
+	int uncommitted;
+};
+
+struct bt_key {
+	chunk_t chunk;
+	mikulas_snapid_t snap_from;
+	mikulas_snapid_t snap_to;
+};
+
+struct path_element {
+	chunk_t block;
+	unsigned idx;
+	unsigned n_entries;
+};
+
+struct dm_exception_store {
+	struct dm_multisnap *dm;
+	struct dm_bufio_client *bufio;
+
+	chunk_t dev_size;
+	unsigned chunk_size;
+	unsigned char chunk_shift;
+	unsigned char bitmap_depth;
+	unsigned btree_entries;
+	__u8 bt_depth;
+	__u8 flags;
+	__u32 snapshot_num;
+	unsigned cb_stride;
+
+	chunk_t bitmap_root;
+	chunk_t alloc_rover;
+	chunk_t bt_root;
+	chunk_t sb_commit_block;
+	chunk_t valid_commit_block;
+	chunk_t delete_rover_chunk;
+	mikulas_snapid_t delete_rover_snapid;
+
+	chunk_t total_allocated;
+	chunk_t data_allocated;
+
+	__u64 commit_sequence;
+
+	void *tmp_chunk;
+
+	struct rb_root active_snapshots;
+
+	/* Used during query/add remap */
+	chunk_t query_snapid;
+	struct bt_key query_new_key;
+	unsigned char query_active;
+	chunk_t query_block_from;
+	chunk_t query_block_to;
+
+	/* List heads for struct tmp_remap->list */
+	unsigned n_used_tmp_remaps;
+	struct list_head used_bitmap_tmp_remaps;
+	struct list_head used_bt_tmp_remaps;
+	struct list_head free_tmp_remaps;
+	/* List head for struct tmp_remap->hash_list */
+	struct hlist_head tmp_remap[TMP_REMAP_HASH_SIZE];
+	struct tmp_remap tmp_remap_store[N_REMAPS];
+
+	unsigned n_preallocated_blocks;
+	chunk_t preallocated_blocks[MAX_BITMAP_DEPTH * 2];
+
+	struct dm_multisnap_freelist *freelist;
+	chunk_t freelist_ptr;
+
+	struct dm_multisnap_background_work delete_work;
+	unsigned delete_commit_count;
+
+	__u64 cache_threshold;
+	__u64 cache_limit;
+};
+
+/* dm-multisnap-alloc.c */
+
+void dm_multisnap_create_bitmaps(struct dm_exception_store *s, chunk_t start);
+void dm_multisnap_extend_bitmaps(struct dm_exception_store *s, chunk_t new_size);
+void *dm_multisnap_map_bitmap(struct dm_exception_store *s, bitmap_t bitmap, struct dm_buffer **bp, chunk_t *block, struct path_element *path);
+int dm_multisnap_alloc_blocks(struct dm_exception_store *s, chunk_t *results, unsigned n_blocks, int flags);
+#define ALLOC_DRY	1
+void *dm_multisnap_alloc_duplicate_block(struct dm_exception_store *s, chunk_t block, struct dm_buffer **bp, void *ptr);
+void *dm_multisnap_alloc_make_block(struct dm_exception_store *s, chunk_t *result, struct dm_buffer **bp);
+void dm_multisnap_free_blocks_immediate(struct dm_exception_store *s, chunk_t block, unsigned n_blocks);
+void dm_multisnap_bitmap_finalize_tmp_remap(struct dm_exception_store *s, struct tmp_remap *tmp_remap);
+
+/* dm-multisnap-blocks.c */
+
+chunk_t dm_multisnap_remap_block(struct dm_exception_store *s, chunk_t block);
+void *dm_multisnap_read_block(struct dm_exception_store *s, chunk_t block, struct dm_buffer **bp);
+int dm_multisnap_block_is_uncommitted(struct dm_exception_store *s, chunk_t block);
+void *dm_multisnap_duplicate_block(struct dm_exception_store *s, chunk_t old_chunk, chunk_t new_chunk, bitmap_t bitmap_idx, struct dm_buffer **bp, chunk_t *to_free);
+void dm_multisnap_free_tmp_remap(struct dm_exception_store *s, struct tmp_remap *t);
+void *dm_multisnap_make_block(struct dm_exception_store *s, chunk_t new_chunk, struct dm_buffer **bp);
+void dm_multisnap_free_block_and_duplicates(struct dm_exception_store *s, chunk_t chunk);
+
+int dm_multisnap_is_commit_block(struct dm_exception_store *s, chunk_t block);
+
+typedef chunk_t stop_cycles_t[2];
+
+void dm_multisnap_init_stop_cycles(stop_cycles_t *cy);
+int dm_multisnap_stop_cycles(struct dm_exception_store *s, stop_cycles_t *cy, chunk_t key);
+
+/* dm-multisnap-btree.c */
+
+void dm_multisnap_create_btree(struct dm_exception_store *s, chunk_t *start);
+int dm_multisnap_find_in_btree(struct dm_exception_store *s, struct bt_key *key, chunk_t *result);
+void dm_multisnap_add_to_btree(struct dm_exception_store *s, struct bt_key *key, chunk_t new_chunk);
+void dm_multisnap_restrict_btree_entry(struct dm_exception_store *s, struct bt_key *key);
+void dm_multisnap_extend_btree_entry(struct dm_exception_store *s, struct bt_key *key);
+void dm_multisnap_delete_from_btree(struct dm_exception_store *s, struct bt_key *key);
+void dm_multisnap_bt_finalize_tmp_remap(struct dm_exception_store *s, struct tmp_remap *tmp_remap);
+int dm_multisnap_list_btree(struct dm_exception_store *s, struct bt_key *key, int (*call)(struct dm_exception_store *, struct dm_multisnap_bt_node *, struct dm_multisnap_bt_entry *, void *), void *cookie);
+
+/* dm-multisnap-commit.c */
+
+void dm_multisnap_transaction_mark(struct dm_exception_store *s);
+void dm_multisnap_commit(struct dm_exception_store *s);
+
+/* dm-multisnap-delete.c */
+
+void dm_multisnap_background_delete(struct dm_exception_store *s, struct dm_multisnap_background_work *bw);
+
+/* dm-multisnap-freelist.c */
+
+void dm_multisnap_init_freelist(struct dm_multisnap_freelist *fl, unsigned chunk_size);
+void dm_multisnap_free_block(struct dm_exception_store *s, chunk_t block, unsigned flags);
+int dm_multisnap_check_allocated_block(struct dm_exception_store *s, chunk_t block);
+void dm_multisnap_flush_freelist_before_commit(struct dm_exception_store *s);
+void dm_multisnap_load_freelist(struct dm_exception_store *s);
+
+/* dm-multisnap-io.c */
+
+int dm_multisnap_find_snapshot_chunk(struct dm_exception_store *s, snapid_t snapid, chunk_t chunk, int write, chunk_t *result);
+void dm_multisnap_reset_query(struct dm_exception_store *s);
+int dm_multisnap_query_next_remap(struct dm_exception_store *s, chunk_t chunk);
+void dm_multisnap_add_next_remap(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+void dm_multisnap_make_chunk_writeable(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+int dm_multisnap_check_conflict(struct dm_exception_store *s, union chunk_descriptor *cd, snapid_t snapid);
+
+/* dm-multisnap-snaps.c */
+
+snapid_t dm_multisnap_get_next_snapid(struct dm_exception_store *s, snapid_t snapid);
+int dm_multisnap_compare_snapids_for_create(const void *p1, const void *p2);
+int dm_multisnap_find_next_snapid_range(struct dm_exception_store *s, snapid_t snapid, snapid_t *from, snapid_t *to);
+
+void dm_multisnap_destroy_snapshot_tree(struct dm_exception_store *s);
+void dm_multisnap_read_snapshots(struct dm_exception_store *s);
+int dm_multisnap_allocate_snapid(struct dm_exception_store *s, snapid_t *snapid);
+int dm_multisnap_create_snapshot(struct dm_exception_store *s, snapid_t snapid);
+int dm_multisnap_delete_snapshot(struct dm_exception_store *s, snapid_t snapid);
+
+void dm_multisnap_get_space(struct dm_exception_store *s, unsigned long long *chunks_total, unsigned long long *chunks_allocated, unsigned long long *chunks_metadata_allocated);
+
+#ifdef CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP
+void dm_multisnap_print_snapid(struct dm_exception_store *s, char *string, unsigned maxlen, snapid_t snapid);
+int dm_multisnap_read_snapid(struct dm_exception_store *s, char *string, snapid_t *snapid, char **error);
+#endif
+
+#endif
Index: linux-2.6.32/drivers/md/dm-multisnap-private.h
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-private.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_PRIVATE_H
+#define DM_MULTISNAP_PRIVATE_H
+
+#include "dm-multisnap.h"
+
+/*
+ * Private structures for dm-multisnap.c.
+ * This file should not be included by exception store drivers.
+ */
+
+#include <linux/dm-kcopyd.h>
+
+#define PENDING_HASH_SIZE	256
+#define PENDING_HASH(c)		((c) & (PENDING_HASH_SIZE - 1))
+#define PENDING_MEMPOOL_SIZE	256
+
+#define MULTISNAP_KCOPYD_PAGES	(((1UL << 20) >> PAGE_SHIFT) ? : 1)
+
+#define MAX_CHUNKS_TO_REMAP	DM_KCOPYD_MAX_REGIONS
+
+#define DM_TRACKED_CHUNK_HASH_SIZE	16
+#define DM_TRACKED_CHUNK_HASH(x)	((unsigned long)(x) & (DM_TRACKED_CHUNK_HASH_SIZE - 1))
+#define DM_TRACKED_CHUNK_POOL_SIZE	256
+
+struct dm_multisnap {
+	struct dm_exception_store *p;
+	struct dm_multisnap_exception_store *store;
+
+	struct dm_dev *origin;
+	struct dm_dev *snapshot;
+
+	int error;
+
+	unsigned chunk_size;
+	unsigned char chunk_shift;
+
+	unsigned char flags;
+#define DM_MULTISNAP_SYNC_SNAPSHOTS		1
+#define DM_MULTISNAP_PRESERVE_ON_ERROR		2
+
+	sector_t origin_sectors;
+
+	struct mutex master_lock;
+	struct mutex status_lock;
+	struct workqueue_struct *wq;
+	struct work_struct work;
+	struct bio_list bios; /* protected with dm_multisnap_bio_list_lock */
+	struct list_head background_works;
+
+	/* All snapshot IOs */
+	mempool_t *tracked_chunk_pool;
+
+	/* these two are protected with dm_multisnap_bio_list_lock */
+	long n_tracked_ios;
+	struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
+
+	mempool_t *pending_pool;
+
+	struct dm_kcopyd_client *kcopyd;
+
+	/*
+	 * The following two variables do a trick to avoid the need for
+	 * atomic operations.
+	 *
+	 * kcopyd_jobs_submitted_count is incremented each time a job is
+	 * submitted to kcopyd. master_lock protects it.
+	 *
+	 * kcopyd_jobs_finished_count is incremented each time a kcopyd
+	 * callback is called. The callback is single-threaded, so it needs
+	 * no protection.
+	 *
+	 * kcopyd_jobs_submitted_count and kcopyd_jobs_finished_count may be
+	 * updated at the same time as each other, but neither variable is
+	 * itself updated by more than one thread concurrently.
+	 *
+	 * When these two are equal, there are no jobs in flight. When they
+	 * are equal and master_lock is held, we know that there are no jobs
+	 * in flight and no new can be submitted --- i.e. we can commit.
+	 * in flight and no new ones can be submitted --- i.e. we can commit.
+	unsigned long kcopyd_jobs_submitted_count;
+	unsigned long kcopyd_jobs_finished_count;
+
+	/* This may only be accessed from kcopyd callback, it has no locking */
+	struct list_head pes_waiting_for_commit;
+
+	/* Increased each time a commit happens */
+	unsigned commit_sequence;
+
+	/* List head for struct dm_multisnap_pending_exception->hash_list */
+	struct hlist_head pending_hash[PENDING_HASH_SIZE];
+
+	char pending_mempool_allocation_failed;
+
+	/* The new snapshot id to be created */
+	char new_snapid_valid;
+	snapid_t new_snapid;
+
+	/* List head for struct dm_multisnap_snap->list_snaps */
+	struct list_head all_snaps;
+
+	/* List entry for all_multisnapshots */
+	struct list_head list_all;
+};
+
+struct dm_multisnap_snap {
+	struct dm_multisnap *s;
+	snapid_t snapid;
+	/* List entry for struct dm_multisnap->list_all */
+	struct list_head list_snaps;
+	char origin_name[16];
+	char snapid_string[1];
+};
+
+struct dm_multisnap_tracked_chunk {
+	struct hlist_node node;
+	chunk_t chunk;
+	unsigned long bio_rw;
+	struct dm_multisnap *s;
+};
+
+struct dm_multisnap_pending_exception {
+	/* List entry for struct dm_multisnap->pending_hash */
+	struct hlist_node hash_list;
+
+	struct dm_multisnap *s;
+	struct bio_list bios;
+
+	chunk_t chunk;
+
+	int n_descs;
+	union chunk_descriptor desc[MAX_CHUNKS_TO_REMAP];
+
+	/* List entry for struct dm_multisnap->pes_waiting_for_commit */
+	struct list_head list;
+};
+
+#endif
Index: linux-2.6.32/drivers/md/dm-multisnap-snaps.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap-snaps.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-mikulas.h"
+
+struct snapshot_range {
+	struct rb_node node;
+	mikulas_snapid_t from;
+	mikulas_snapid_t to;
+};
+
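+/*
+ * Active snapshot ids are kept in an rb-tree of non-overlapping
+ * [from, to] ranges. With add == 0 this returns the leftmost range
+ * overlapping [from, to] (or NULL if there is none); with add == 1 it
+ * inserts a new range and fails if it would overlap an existing one.
+ */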
+static struct snapshot_range *rb_find_insert_snapshot(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to, int add)
+{
+	struct snapshot_range *new;
+	struct snapshot_range *found = NULL;
+	struct rb_node **p = &s->active_snapshots.rb_node;
+	struct rb_node *parent = NULL;
+	while (*p) {
+		parent = *p;
+#define rn	rb_entry(parent, struct snapshot_range, node)
+		if (to < rn->from) {
+go_left:
+			p = &rn->node.rb_left;
+		} else if (from > rn->to) {
+			p = &rn->node.rb_right;
+		} else {
+			if (!add) {
+				found = rn;
+		/* If this is a range query, we need to find the leftmost node */
+				if (from < rn->from)
+					goto go_left;
+				break;
+			} else {
+				dm_multisnap_set_error(s->dm, -EFSERROR);
+				DMERR("rb_insert_snapshot: inserting overlapping entry: (%llx,%llx) overlaps (%llx,%llx)", (unsigned long long)from, (unsigned long long)to, (unsigned long long)rn->from, (unsigned long long)rn->to);
+				return NULL;
+			}
+		}
+#undef rn
+	}
+	if (!add)
+		return found;
+
+	dm_multisnap_status_assert_locked(s->dm);
+
+	new = kmalloc(sizeof(struct snapshot_range), GFP_KERNEL);
+	if (!new) {
+		DMERR("rb_insert_snapshot: can't allocate memory for snapshot descriptor");
+		dm_multisnap_set_error(s->dm, -ENOMEM);
+		return NULL;
+	}
+
+	new->from = from;
+	new->to = to;
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &s->active_snapshots);
+
+	return new;
+}
+
+static struct snapshot_range *rb_find_snapshot(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to)
+{
+	return rb_find_insert_snapshot(s, from, to, 0);
+}
+
+static int rb_insert_snapshot_unlocked(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to)
+{
+	struct snapshot_range *rn;
+	rn = rb_find_insert_snapshot(s, from, to, 1);
+	if (!rn)
+		return -1;
+	return 0;
+}
+
+static int rb_insert_snapshot(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to)
+{
+	int r;
+	dm_multisnap_status_lock(s->dm);
+	r = rb_insert_snapshot_unlocked(s, from, to);
+	dm_multisnap_status_unlock(s->dm);
+	return r;
+}
+
+static int rb_extend_range(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to)
+{
+	struct snapshot_range *rn;
+	rn = rb_find_insert_snapshot(s, from, from, 0);
+	if (!rn) {
+		DMERR("rb_extend_range: snapshot %llx not found", (unsigned long long)from);
+		return -1;
+	}
+	if (rn->to != from) {
+		DMERR("rb_extend_range: bad attempt to extend range: %llx >= %llx", (unsigned long long)rn->to, (unsigned long long)from);
+		return -1;
+	}
+	dm_multisnap_status_lock(s->dm);
+	rn->to = to;
+	dm_multisnap_status_unlock(s->dm);
+	return 0;
+}
+
+static int rb_delete_range(struct dm_exception_store *s, mikulas_snapid_t from, mikulas_snapid_t to)
+{
+	struct snapshot_range *sr = rb_find_snapshot(s, from, from);
+
+	if (!sr || sr->to < to) {
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		DMERR("rb_delete_range: deleting non-existing snapid %llx-%llx", (unsigned long long)from, (unsigned long long)to);
+		return -1;
+	}
+
+	dm_multisnap_status_lock(s->dm);
+	if (sr->from < from) {
+		mikulas_snapid_t orig_to = sr->to;
+		sr->to = from - 1;
+		if (orig_to > to) {
+			if (rb_insert_snapshot_unlocked(s, to + 1, orig_to)) {
+				sr->to = orig_to;
+				dm_multisnap_status_unlock(s->dm);
+				return -1;
+			}
+		}
+	} else {
+		if (sr->to > to) {
+			sr->from = to + 1;
+		} else {
+			rb_erase(&sr->node, &s->active_snapshots);
+			kfree(sr);
+		}
+	}
+	dm_multisnap_status_unlock(s->dm);
+	return 0;
+}
+
+snapid_t dm_multisnap_get_next_snapid(struct dm_exception_store *s, snapid_t snapid)
+{
+	struct snapshot_range *rn;
+
+	rn = rb_find_snapshot(s, snapid, SNAPID_T_MAX);
+	if (!rn)
+		return SNAPID_T_ORIGIN;
+	if (rn->from > snapid)
+		snapid = rn->from;
+	if (rn->to >= (snapid | MIKULAS_SUBSNAPID_MASK))
+		return snapid | MIKULAS_SUBSNAPID_MASK;
+	return snapid;
+}
+
+int dm_multisnap_find_next_snapid_range(struct dm_exception_store *s, snapid_t snapid, snapid_t *from, snapid_t *to)
+{
+	struct snapshot_range *rn;
+	rn = rb_find_snapshot(s, snapid, SNAPID_T_MAX);
+	if (!rn)
+		return 0;
+	*from = rn->from;
+	*to = rn->to;
+	return 1;
+}
+
+void dm_multisnap_destroy_snapshot_tree(struct dm_exception_store *s)
+{
+	struct rb_node *root;
+	while ((root = s->active_snapshots.rb_node)) {
+#define rn	rb_entry(root, struct snapshot_range, node)
+		rb_erase(root, &s->active_snapshots);
+		kfree(rn);
+#undef rn
+	}
+}
+
+void dm_multisnap_read_snapshots(struct dm_exception_store *s)
+{
+	struct bt_key snap_key;
+	chunk_t ignore;
+	int r;
+
+	dm_multisnap_destroy_snapshot_tree(s);
+
+	snap_key.snap_from = 0;
+find_next:
+	snap_key.snap_to = SNAPID_T_MAX;
+	snap_key.chunk = CHUNK_T_SNAP_PRESENT;
+
+	r = dm_multisnap_find_in_btree(s, &snap_key, &ignore);
+
+	if (unlikely(r < 0))
+		return;
+
+	if (r) {
+		/*printk("inserting snapid %llx-%llx\n", (unsigned long long)snap_key.snap_from, (unsigned long long)snap_key.snap_to);*/
+		if (unlikely(snap_key.snap_to > SNAPID_T_MAX)) {
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			DMERR("dm_multisnap_read_snapshots: invalid snapshot id");
+			return;
+		}
+		r = rb_insert_snapshot(s, snap_key.snap_from, snap_key.snap_to);
+		if (unlikely(r < 0))
+			return;
+		snap_key.snap_from = snap_key.snap_to + 1;
+		goto find_next;
+	}
+}
+
+int dm_multisnap_allocate_snapid(struct dm_exception_store *s, snapid_t *snapid)
+{
+	*snapid = ((mikulas_snapid_t)s->snapshot_num << MIKULAS_SNAPID_STEP_BITS) | MIKULAS_SUBSNAPID_MASK;
+	/*printk("allocating: %x, %llx\n", s->snapshot_num, (unsigned long long)*snapid);*/
+	if (s->snapshot_num == 0xffffffff || *snapid > SNAPID_T_MAX) {
+		DMERR("dm_multisnap_allocate_snapid: 2^32 snapshot limit reached");
+		return -ENOSPC;
+	}
+	return 0;
+}
+
+int dm_multisnap_create_snapshot(struct dm_exception_store *s, snapid_t snapid)
+{
+	int r;
+	struct bt_key snap_key;
+
+	if ((snapid & MIKULAS_SUBSNAPID_MASK) != MIKULAS_SUBSNAPID_MASK) {
+		DMERR("dm_multisnap_create_snapshot: snapshots of snapshots not yet supported");
+		return -EOPNOTSUPP;
+	}
+
+	if ((snapid >> MIKULAS_SNAPID_STEP_BITS) < s->snapshot_num || snapid > SNAPID_T_MAX) {
+		DMERR("dm_multisnap_create_snapshot: invalid snapshot id %llx (allowed range %llx - %llx)", (unsigned long long)snapid, (unsigned long long)s->snapshot_num, (unsigned long long)SNAPID_T_MAX);
+		return -EINVAL;
+	}
+	if (dm_multisnap_snapshot_exists(s->dm, snapid)) {
+		DMERR("dm_multisnap_create_snapshot: snapshot with id %llx already exists", (unsigned long long)snapid);
+		return -EINVAL;
+	}
+
+	if (snapid > MIKULAS_SUBSNAPID_MASK && dm_multisnap_snapshot_exists(s->dm, snapid - MIKULAS_SUBSNAPID_MASK - 1)) {
+		/* Extend existing key range */
+
+		r = rb_extend_range(s, snapid - MIKULAS_SUBSNAPID_MASK - 1, snapid);
+
+		if (r < 0)
+			return dm_multisnap_has_error(s->dm);
+
+		snap_key.chunk = CHUNK_T_SNAP_PRESENT;
+		snap_key.snap_from = snapid - MIKULAS_SUBSNAPID_MASK - 1;
+		snap_key.snap_to = snapid;
+		dm_multisnap_extend_btree_entry(s, &snap_key);
+	} else {
+		/* Add new entry */
+
+		r = rb_insert_snapshot(s, snapid - MIKULAS_SUBSNAPID_MASK, snapid);
+		if (r < 0)
+			return dm_multisnap_has_error(s->dm);
+
+		snap_key.chunk = CHUNK_T_SNAP_PRESENT;
+		snap_key.snap_from = snapid - MIKULAS_SUBSNAPID_MASK;
+		snap_key.snap_to = snapid;
+		dm_multisnap_add_to_btree(s, &snap_key, 0);
+	}
+	if (dm_multisnap_has_error(s->dm))
+		return dm_multisnap_has_error(s->dm);
+
+	/*printk("multisnapshot: created snapshot with id %llu\n", (unsigned long long)snapid);*/
+
+	s->snapshot_num = (snapid >> MIKULAS_SNAPID_STEP_BITS) + 1;
+
+	dm_multisnap_transaction_mark(s);
+	dm_multisnap_commit(s);
+
+	return 0;
+}
+
+int dm_multisnap_delete_snapshot(struct dm_exception_store *s, snapid_t snapid)
+{
+	int r;
+	struct bt_key snap_key;
+	mikulas_snapid_t from, to;
+	chunk_t ignore;
+	struct snapshot_range *sr = rb_find_snapshot(s, snapid, snapid);
+
+	if (!sr) {
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		DMERR("dm_multisnap_delete_snapshot: snapshot id %llx not found in rb-tree", (unsigned long long)snapid);
+		return dm_multisnap_has_error(s->dm);
+	}
+
+	from = to = snapid;
+	if ((snapid & MIKULAS_SUBSNAPID_MASK) == MIKULAS_SUBSNAPID_MASK) {
+		from = snapid & ~MIKULAS_SUBSNAPID_MASK;
+		if (from < sr->from)
+			from = sr->from;
+	}
+
+	r = rb_delete_range(s, from, to);
+	if (r < 0)
+		return dm_multisnap_has_error(s->dm);
+
+	snap_key.chunk = CHUNK_T_SNAP_PRESENT;
+	snap_key.snap_from = from;
+	snap_key.snap_to = from;
+
+	r = dm_multisnap_find_in_btree(s, &snap_key, &ignore);
+	if (r <= 0) {
+		if (!r) {
+			dm_multisnap_set_error(s->dm, -EFSERROR);
+			DMERR("dm_multisnap_delete_snapshot: snapshot id %llx not found in b-tree", (unsigned long long)from);
+		}
+		return dm_multisnap_has_error(s->dm);
+	}
+	if (snap_key.snap_to < to) {
+		dm_multisnap_set_error(s->dm, -EFSERROR);
+		DMERR("dm_multisnap_delete_snapshot: snapshot id %llx-%llx not found in b-tree", (unsigned long long)from, (unsigned long long)to);
+		return dm_multisnap_has_error(s->dm);
+	}
+
+	if (snap_key.snap_from < from) {
+		snap_key.snap_from = from;
+		dm_multisnap_restrict_btree_entry(s, &snap_key);
+
+		dm_multisnap_transaction_mark(s);
+
+		if (dm_multisnap_has_error(s->dm))
+			return dm_multisnap_has_error(s->dm);
+
+		if (snap_key.snap_to > to) {
+			snap_key.snap_from = to + 1;
+			dm_multisnap_add_to_btree(s, &snap_key, 0);
+		}
+	} else {
+		if (snap_key.snap_to > to) {
+			snap_key.snap_to = to;
+			dm_multisnap_restrict_btree_entry(s, &snap_key);
+		} else {
+			dm_multisnap_delete_from_btree(s, &snap_key);
+		}
+	}
+
+	dm_multisnap_transaction_mark(s);
+
+	s->flags |= MULTISNAP_FLAG_PENDING_DELETE;
+	dm_multisnap_queue_work(s->dm, &s->delete_work);
+
+	dm_multisnap_commit(s);
+
+	return 0;
+}
+
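+/*
+ * Comparison callback for ordering snapshot ids at creation time: sort
+ * primarily by the master snapshot number; within one master number the
+ * master snapshot itself (subsnap id MIKULAS_SUBSNAPID_MASK) sorts before
+ * its subsnapshots, which are then ordered by subsnap id.
+ */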
+int dm_multisnap_compare_snapids_for_create(const void *p1, const void *p2)
+{
+	mikulas_snapid_t s1 = *(const snapid_t *)p1;
+	mikulas_snapid_t s2 = *(const snapid_t *)p2;
+	mikulas_snapid_t ms1 = s1 >> MIKULAS_SNAPID_STEP_BITS;
+	mikulas_snapid_t ms2 = s2 >> MIKULAS_SNAPID_STEP_BITS;
+	if (ms1 < ms2)
+		return -1;
+	if (ms1 > ms2)
+		return 1;
+	s1 &= MIKULAS_SUBSNAPID_MASK;
+	s2 &= MIKULAS_SUBSNAPID_MASK;
+	if (s1 == MIKULAS_SUBSNAPID_MASK && s2 != MIKULAS_SUBSNAPID_MASK)
+		return -1;
+	if (s1 != MIKULAS_SUBSNAPID_MASK && s2 == MIKULAS_SUBSNAPID_MASK)
+		return 1;
+	if (s1 < s2)
+		return -1;
+	if (s1 > s2)
+		return 1;
+	return 0;
+}
+
+void dm_multisnap_get_space(struct dm_exception_store *s, unsigned long long *chunks_total, unsigned long long *chunks_allocated, unsigned long long *chunks_metadata_allocated)
+{
+	dm_multisnap_status_assert_locked(s->dm);
+	*chunks_total = s->dev_size;
+	*chunks_allocated = s->total_allocated;
+	*chunks_metadata_allocated = s->total_allocated - s->data_allocated;
+}
+
+#ifdef CONFIG_DM_MULTISNAPSHOT_MIKULAS_SNAP_OF_SNAP
+
+void dm_multisnap_print_snapid(struct dm_exception_store *s, char *string, unsigned maxlen, snapid_t snapid)
+{
+	unsigned master = snapid >> MIKULAS_SNAPID_STEP_BITS;
+	unsigned subsnap = snapid & MIKULAS_SUBSNAPID_MASK;
+	if (subsnap == MIKULAS_SUBSNAPID_MASK)
+		snprintf(string, maxlen, "%u", master);
+	else
+		snprintf(string, maxlen, "%u.%u", master, subsnap);
+}
+
+int dm_multisnap_read_snapid(struct dm_exception_store *s, char *string, snapid_t *snapid, char **error)
+{
+	unsigned long master;
+	unsigned long subsnap;
+	if (!string[0]) {
+err:
+		*error = "Invalid snapshot id";
+		return -EINVAL;
+	}
+
+	master = simple_strtoul(string, &string, 10);
+
+	if (!string[0])
+		subsnap = MIKULAS_SUBSNAPID_MASK;
+	else {
+		if (string[0] != '.' || !string[1])
+			goto err;
+		string++;
+		subsnap = simple_strtoul(string, &string, 10);
+		if (string[0])
+			goto err;
+		if (subsnap >= MIKULAS_SUBSNAPID_MASK) {
+bad_number:
+			*error = "Number out of range";
+			return -EINVAL;
+		}
+	}
+
+	if (master >= SNAPID_T_MAX >> MIKULAS_SNAPID_STEP_BITS)
+		goto bad_number;
+
+	*snapid = (mikulas_snapid_t)master << MIKULAS_SNAPID_STEP_BITS | subsnap;
+	return 0;
+}
+
+#endif
Index: linux-2.6.32/drivers/md/dm-multisnap.c
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap.c
@@ -0,0 +1,1871 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-multisnap-private.h"
+
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/sort.h>
+
+static void dm_multisnap_process_bios(struct dm_multisnap *s);
+
+static void dm_multisnap_lock(struct dm_multisnap *s)
+{
+	mutex_lock(&s->master_lock);
+	if (s->p && s->store->store_lock_acquired)
+		s->store->store_lock_acquired(s->p, 0);
+}
+
+static void dm_multisnap_unlock(struct dm_multisnap *s)
+{
+	mutex_unlock(&s->master_lock);
+}
+
+static int dm_multisnap_lock_contended(struct dm_multisnap *s)
+{
+	return !list_empty(&s->master_lock.wait_list);
+}
+
+static void dm_multisnap_assert_locked(struct dm_multisnap *s)
+{
+	BUG_ON(!mutex_is_locked(&s->master_lock));
+}
+
+void dm_multisnap_status_lock(struct dm_multisnap *s)
+{
+	mutex_lock(&s->status_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_status_lock);
+
+void dm_multisnap_status_unlock(struct dm_multisnap *s)
+{
+	mutex_unlock(&s->status_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_status_unlock);
+
+void dm_multisnap_status_assert_locked(struct dm_multisnap *s)
+{
+	BUG_ON(!mutex_is_locked(&s->status_lock));
+}
+EXPORT_SYMBOL(dm_multisnap_status_assert_locked);
+
+struct block_device *dm_multisnap_snapshot_bdev(struct dm_multisnap *s)
+{
+	return s->snapshot->bdev;
+}
+EXPORT_SYMBOL(dm_multisnap_snapshot_bdev);
+
+unsigned dm_multisnap_chunk_size(struct dm_multisnap *s)
+{
+	return s->chunk_size;
+}
+EXPORT_SYMBOL(dm_multisnap_chunk_size);
+
+void dm_multisnap_set_error(struct dm_multisnap *s, int error)
+{
+	if (!s->error)
+		s->error = error;
+	dump_stack();
+}
+EXPORT_SYMBOL(dm_multisnap_set_error);
+
+int dm_multisnap_has_error(struct dm_multisnap *s)
+{
+	return s->error;
+}
+EXPORT_SYMBOL(dm_multisnap_has_error);
+
+int dm_multisnap_drop_on_error(struct dm_multisnap *s)
+{
+	return !(s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR);
+}
+EXPORT_SYMBOL(dm_multisnap_drop_on_error);
+
+static DEFINE_MUTEX(all_multisnapshots_lock);
+static LIST_HEAD(all_multisnapshots);
+
+static chunk_t sector_to_chunk(struct dm_multisnap *s, sector_t sector)
+{
+	return sector >> (s->chunk_shift - SECTOR_SHIFT);
+}
+
+static sector_t chunk_to_sector(struct dm_multisnap *s, chunk_t chunk)
+{
+	return chunk << (s->chunk_shift - SECTOR_SHIFT);
+}
+
+int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid)
+{
+	return snapid == s->store->get_next_snapid(s->p, snapid);
+}
+EXPORT_SYMBOL(dm_multisnap_snapshot_exists);
+
+/*
+ * Any reading/writing of snapids in table/status/message must go
+ * through these functions, so that the snapid format for userspace
+ * can be overridden.
+ */
+
+static void print_snapid(struct dm_multisnap *s, char *string, unsigned maxlen, snapid_t snapid)
+{
+	if (s->store->print_snapid)
+		s->store->print_snapid(s->p, string, maxlen, snapid);
+	else
+		snprintf(string, maxlen, "%llu", (unsigned long long)snapid);
+}
+
+static int read_snapid(struct dm_multisnap *s, char *string, snapid_t *snapid, char **error)
+{
+	if (s->store->read_snapid)
+		return s->store->read_snapid(s->p, string, snapid, error);
+	else {
+		int r;
+
+		char *argv_array[1] = { string };
+		char **argv = argv_array;
+		unsigned argc = 1;
+		__u64 uint64;
+
+		r = dm_multisnap_get_uint64(&argv, &argc, &uint64, error);
+		if (r)
+			return r;
+
+		*snapid = uint64;
+		return 0;
+	}
+}
+
+/* --- bio list --- */
+
+static DEFINE_SPINLOCK(dm_multisnap_bio_list_lock);
+
+static void wakeup_kmultisnapd(struct dm_multisnap *s)
+{
+	queue_work(s->wq, &s->work);
+}
+
+static void dm_multisnap_enqueue_bio_unlocked(struct dm_multisnap *s, struct bio *bio)
+{
+	bio_list_add(&s->bios, bio);
+}
+
+static void dm_multisnap_enqueue_bio(struct dm_multisnap *s, struct bio *bio)
+{
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	dm_multisnap_enqueue_bio_unlocked(s, bio);
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+}
+
+static void dm_multisnap_enqueue_bio_list(struct dm_multisnap *s, struct bio_list *bl)
+{
+	struct bio *bio;
+	while ((bio = bio_list_pop(bl)))
+		dm_multisnap_enqueue_bio(s, bio);
+}
+
+/* Reduce the size of the bio */
+
+static void bio_trim(struct bio *bio, unsigned size)
+{
+	unsigned i;
+	bio->bi_size = size;
+	for (i = 0; i < bio->bi_vcnt; i++) {
+		if (size <= bio->bi_io_vec[i].bv_len) {
+			bio->bi_io_vec[i].bv_len = size;
+			bio->bi_vcnt = i + 1;
+			bio->bi_flags &= ~(1 << BIO_SEG_VALID);
+			return;
+		}
+		size -= bio->bi_io_vec[i].bv_len;
+	}
+	BUG();
+}
+
+/* --- encode 64-bit snapids in bio */
+
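+/*
+ * While a bio is held on the internal queue, the 64-bit snapid it targets
+ * is stashed in bi_seg_front_size (upper 32 bits) and bi_seg_back_size
+ * (lower 32 bits); SNAPID_T_ORIGIN marks a write queued for the origin.
+ */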
+static snapid_t bio_get_snapid(struct bio *bio)
+{
+	return ((__u64)bio->bi_seg_front_size << 32) | bio->bi_seg_back_size;
+}
+
+static void bio_put_snapid(struct bio *bio, snapid_t snapid)
+{
+	bio->bi_seg_front_size = (__u64)snapid >> 32;
+	bio->bi_seg_back_size = snapid;
+}
+
+/* --- tracked chunks --- */
+
+static struct kmem_cache *tracked_chunk_cache;
+
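+/*
+ * Tracked chunks are snapshot reads that were redirected to the origin.
+ * They are hashed by chunk number so that remap_callback can wait for
+ * them to finish before freeing a pending exception for the same chunk
+ * (see the read-vs-realloc comment in remap_callback).
+ */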
+static int chunk_is_tracked(struct dm_multisnap *s, chunk_t chunk)
+{
+	struct dm_multisnap_tracked_chunk *c;
+	struct hlist_node *hn;
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+
+	hlist_for_each_entry(c, hn,
+	    &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
+		if (likely(c->chunk == chunk)) {
+			spin_unlock_irq(&dm_multisnap_bio_list_lock);
+			return 1;
+		}
+	}
+
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	return 0;
+}
+
+/* --- pending exception cache --- */
+
+static struct kmem_cache *pending_exception_cache;
+
+#define GFP_PENDING_EXCEPTION	GFP_NOIO
+
+static void pending_exception_ctor(void *pe_)
+{
+	struct dm_multisnap_pending_exception *pe = pe_;
+	bio_list_init(&pe->bios);
+}
+
+static struct dm_multisnap_pending_exception *dm_multisnap_alloc_pending_exception(struct dm_multisnap *s, chunk_t chunk)
+{
+	struct dm_multisnap_pending_exception *pe;
+	/*
+	 * Warning: we must not wait here, because we are holding master_lock
+	 * and taking master_lock is needed to complete the exception.
+	 *
+	 * If an allocation failure happens, we must go up, drop the lock,
+	 * try dummy mempool allocation and go here again.
+	 */
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION & ~__GFP_WAIT);
+	if (unlikely(!pe))
+		return NULL;
+
+	pe->s = s;
+	pe->chunk = chunk;
+	hlist_add_head(&pe->hash_list, &s->pending_hash[PENDING_HASH(chunk)]);
+	return pe;
+}
+
+static void dm_multisnap_free_pending_exception(struct dm_multisnap_pending_exception *pe)
+{
+	hlist_del(&pe->hash_list);
+	mempool_free(pe, pe->s->pending_pool);
+}
+
+static void dm_multisnap_wait_for_pending_exception(struct dm_multisnap *s)
+{
+	/*
+	 * Wait until there is something in the mempool. Free it immediately.
+	 */
+	struct dm_multisnap_pending_exception *pe;
+
+	pe = mempool_alloc(s->pending_pool, GFP_PENDING_EXCEPTION | __GFP_WAIT);
+	mempool_free(pe, s->pending_pool);
+}
+
+/*
+ * Check if the chunk+snapid conflicts with any pending exception.
+ *
+ * If it does, queue the bio on the pending exception.
+ */
+
+static int check_pending_io(struct dm_multisnap *s, struct bio *bio, chunk_t chunk, snapid_t snapid)
+{
+	struct dm_multisnap_pending_exception *pe;
+	struct hlist_node *hn;
+	hlist_for_each_entry(pe, hn, &s->pending_hash[PENDING_HASH(chunk)], hash_list) {
+		if (pe->chunk == chunk) {
+			int i;
+			if (snapid == SNAPID_T_ORIGIN)
+				goto conflict;
+			for (i = 0; i < pe->n_descs; i++) {
+				if (s->store->check_conflict(s->p, &pe->desc[i], snapid))
+					goto conflict;
+			}
+		}
+		cond_resched();
+	}
+	return 0;
+
+conflict:
+	bio_list_add(&pe->bios, bio);
+	return 1;
+}
+
+/* --- commit --- */
+
+/*
+ * Test if commit can be performed. If these two variables are not equal,
+ * there are some pending kcopyd jobs and we must not commit.
+ */
+
+int dm_multisnap_can_commit(struct dm_multisnap *s)
+{
+	return s->kcopyd_jobs_submitted_count == s->kcopyd_jobs_finished_count;
+}
+EXPORT_SYMBOL(dm_multisnap_can_commit);
+
+/*
+ * Call exception store commit method.
+ * This can be called only if dm_multisnap_can_commit returned true;
+ * master_lock must be locked.
+ */
+
+void dm_multisnap_call_commit(struct dm_multisnap *s)
+{
+	s->store->commit(s->p);
+	s->commit_sequence++;
+}
+EXPORT_SYMBOL(dm_multisnap_call_commit);
+
+/*
+ * Force commit at this point. It is guaranteed that commit happened when
+ * this function exits.
+ * master_lock must be unlocked.
+ *
+ * If the commit cannot be performed immediately (because there are pending
+ * chunks being copied), the function drops the lock and polls. It won't
+ * livelock --- either it will become possible to do the commit or someone
+ * else will have done the commit already (commit_sequence changed).
+ *
+ * The polling is justified because this function is only called when deleting
+ * a snapshot or when suspending the origin with postsuspend. These functions
+ * are not performance-critical, thus 1ms delay won't cause a performance
+ * problem.
+ */
+
+static int dm_multisnap_force_commit(struct dm_multisnap *s)
+{
+	int err;
+	unsigned commit_sequence;
+
+	dm_multisnap_lock(s);
+
+	commit_sequence = s->commit_sequence;
+
+	while (!dm_multisnap_can_commit(s)) {
+		dm_multisnap_unlock(s);
+		msleep(1);
+		dm_multisnap_lock(s);
+		if (s->commit_sequence != commit_sequence)
+			goto unlock_ret;
+	}
+
+	dm_multisnap_call_commit(s);
+
+unlock_ret:
+	err = dm_multisnap_has_error(s);
+	dm_multisnap_unlock(s);
+
+	return err;
+}
+
+/* --- kcopyd callback --- */
+
+static void remap_callback(int read_err, unsigned long write_err, void *pe_)
+{
+	struct dm_multisnap_pending_exception *pe = pe_;
+	struct dm_multisnap *s = pe->s;
+
+	if (unlikely((read_err | write_err) != 0)) {
+		DMERR("remap_callback: kcopyd I/O error: %d, %lx", read_err, write_err);
+		dm_multisnap_set_error(s, -EIO);
+	}
+
+	list_add_tail(&pe->list, &s->pes_waiting_for_commit);
+
+	s->kcopyd_jobs_finished_count++;
+
+	/* If there are more jobs pending, don't commit */
+	if (!dm_multisnap_can_commit(s))
+		return;
+
+	dm_multisnap_lock(s);
+
+	/* Recheck after the lock was taken */
+	if (unlikely(!dm_multisnap_can_commit(s))) {
+		/* Not yet ... kmultisnapd has just added something */
+		dm_multisnap_unlock(s);
+		return;
+	}
+
+	/* We need to commit stuff */
+
+	dm_multisnap_call_commit(s);
+
+	do {
+		pe = container_of(s->pes_waiting_for_commit.next, struct dm_multisnap_pending_exception, list);
+
+		/*
+		 * When we are about to free the pending exception, we must
+		 * wait for all reads to the appropriate chunk to
+		 * finish.
+		 *
+		 * This prevents the following race condition:
+		 * - someone reads the chunk in the snapshot with no exception
+		 * - that read is remapped directly to the origin, the read
+		 *	is delayed for some reason
+		 * - someone other writes to the origin, this triggers realloc
+		 * - the realloc finishes
+		 * - the write is dispatched to the origin
+		 * - the read submitted first is dispatched and reads modified
+		 *	data
+		 *
+		 * This race is very improbable (non-shared snapshots had this
+		 * race too and it has never been reported, except in
+		 * artificially simulated cases). So we use active waiting with
+		 * msleep(1).
+		 */
+
+		while (chunk_is_tracked(s, pe->chunk))
+			msleep(1);
+
+		list_del(&pe->list);
+		dm_multisnap_enqueue_bio_list(s, &pe->bios);
+		dm_multisnap_free_pending_exception(pe);
+	} while (!list_empty(&s->pes_waiting_for_commit));
+
+	/*
+	 * Process the bios that we have just added to the queue.
+	 * It's faster to process them now than to hand them over to
+	 * kmultisnapd.
+	 */
+	dm_multisnap_process_bios(s);
+
+	dm_multisnap_unlock(s);
+
+	blk_unplug(bdev_get_queue(s->origin->bdev));
+	blk_unplug(bdev_get_queue(s->snapshot->bdev));
+}
+
+static void dispatch_kcopyd(struct dm_multisnap *s, struct dm_multisnap_pending_exception *pe, int from_snapshot, chunk_t chunk, struct bio *bio, struct dm_io_region *dests, unsigned n_dests)
+{
+	unsigned i;
+	struct dm_io_region src;
+
+	pe->n_descs = n_dests;
+
+	bio_list_add(&pe->bios, bio);
+
+	src.bdev = likely(!from_snapshot) ? s->origin->bdev : s->snapshot->bdev;
+	src.sector = chunk_to_sector(s, chunk);
+	src.count = s->chunk_size >> SECTOR_SHIFT;
+
+	if (likely(!from_snapshot) && unlikely(src.sector + src.count > s->origin_sectors)) {
+		if (src.sector >= s->origin_sectors)
+			src.count = 0;
+		else
+			src.count = s->origin_sectors - src.sector;
+
+		for (i = 0; i < pe->n_descs; i++)
+			dests[i].count = src.count;
+	}
+
+	s->kcopyd_jobs_submitted_count++;
+
+	dm_kcopyd_copy(s->kcopyd, &src, n_dests, dests, 0, remap_callback, pe);
+}
+
+/* --- bio processing --- */
+
+/*
+ * Process bio on the origin.
+ * Reads and barriers never go here, they are dispatched directly.
+ */
+
+static void do_origin_write(struct dm_multisnap *s, struct bio *bio)
+{
+	int r;
+	unsigned i;
+	chunk_t chunk, new_chunk;
+	struct dm_multisnap_pending_exception *pe;
+	struct dm_io_region dests[MAX_CHUNKS_TO_REMAP];
+
+	/* reads are processed directly in multisnap_origin_map */
+	BUG_ON(bio_rw(bio) != WRITE);
+
+	if (bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > s->origin_sectors) {
+		DMERR("do_origin_write: access out of device, flags %lx, sector %llx, size %x, origin sectors %llx", bio->bi_flags, (unsigned long long)bio->bi_sector, bio->bi_size, (unsigned long long)s->origin_sectors);
+		bio_endio(bio, -EIO);
+		return;
+	}
+
+	if (unlikely(dm_multisnap_has_error(s)))
+		goto err_endio;
+
+	s->store->reset_query(s->p);
+
+	chunk = sector_to_chunk(s, bio->bi_sector);
+
+	r = s->store->query_next_remap(s->p, chunk);
+	if (unlikely(r < 0))
+		goto err_endio;
+
+	if (likely(!r)) {
+		/* There is nothing to remap */
+
+		if (unlikely(check_pending_io(s, bio, chunk, SNAPID_T_ORIGIN)))
+			return;
+dispatch_write:
+		bio->bi_bdev = s->origin->bdev;
+		generic_make_request(bio);
+		return;
+	}
+
+	pe = dm_multisnap_alloc_pending_exception(s, chunk);
+	if (unlikely(!pe)) {
+		s->pending_mempool_allocation_failed = 1;
+		dm_multisnap_enqueue_bio(s, bio);
+		return;
+	}
+
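+	/*
+	 * The query_next_remap() call above already reported one remap for
+	 * this chunk, so jump into the middle of the loop to allocate its
+	 * destination first; then keep querying for further remaps of the
+	 * same chunk (up to MAX_CHUNKS_TO_REMAP destinations) so that all
+	 * the copies are issued as a single kcopyd job.
+	 */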
+	i = 0;
+	goto midcycle;
+	for (; i < MAX_CHUNKS_TO_REMAP; i++) {
+		r = s->store->query_next_remap(s->p, chunk);
+		if (unlikely(r < 0))
+			goto free_err_endio;
+		if (likely(!r))
+			break;
+
+midcycle:
+		s->store->add_next_remap(s->p, &pe->desc[i], &new_chunk);
+		if (unlikely(dm_multisnap_has_error(s)))
+			goto free_err_endio;
+
+		dests[i].bdev = s->snapshot->bdev;
+		dests[i].sector = chunk_to_sector(s, new_chunk);
+		dests[i].count = s->chunk_size >> SECTOR_SHIFT;
+	}
+
+	dispatch_kcopyd(s, pe, 0, chunk, bio, dests, i);
+	return;
+
+free_err_endio:
+	dm_multisnap_free_pending_exception(pe);
+err_endio:
+	r = -EIO;
+	if (!(s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR))
+		goto dispatch_write;
+
+	bio_endio(bio, r);
+	return;
+}
+
+/*
+ * Process bio on the snapshot.
+ * Barriers never go here, they are dispatched directly.
+ */
+
+static void do_snapshot_io(struct dm_multisnap *s, struct bio *bio, snapid_t id)
+{
+	chunk_t chunk, result, copy_from;
+	int r;
+	struct dm_multisnap_pending_exception *pe;
+	struct dm_io_region dest;
+
+	if (unlikely(bio_rw(bio) == WRITE) && unlikely(!s->store->make_chunk_writeable))
+		goto err_endio;
+
+	if (unlikely(dm_multisnap_has_error(s)))
+		goto err_endio;
+
+	chunk = sector_to_chunk(s, bio->bi_sector);
+	r = s->store->find_snapshot_chunk(s->p, id, chunk, bio_rw(bio) == WRITE, &result);
+	if (unlikely(r < 0))
+		goto err_endio;
+
+	if (!r) {
+
+		/* Not found in the snapshot */
+
+		if (likely(bio_rw(bio) != WRITE)) {
+			union map_info *map_context;
+			struct dm_multisnap_tracked_chunk *c;
+
+			if (unlikely(bio->bi_sector + (bio->bi_size >> SECTOR_SHIFT) > s->origin_sectors)) {
+				zero_fill_bio(bio);
+				if (bio->bi_sector >= s->origin_sectors) {
+					bio_endio(bio, 0);
+					return;
+				}
+				bio_trim(bio, (s->origin_sectors - bio->bi_sector) << SECTOR_SHIFT);
+			}
+
+			/*
+			 * Redirect reads to the origin.
+			 * Record the bio in the hash of tracked bios.
+			 * This prevents read-vs-realloc race.
+			 *
+			 * An important requirement is that when any bio is
+			 * added to tracked_chunk_hash, the bio must be finished
+			 * and removed from the hash without taking master_lock.
+			 *
+			 * So we add it immediately before submitting the bio
+			 * with generic_make_request.
+			 */
+
+			bio->bi_bdev = s->origin->bdev;
+
+			map_context = dm_get_mapinfo(bio);
+			BUG_ON(!map_context);
+			c = map_context->ptr;
+
+			spin_lock_irq(&dm_multisnap_bio_list_lock);
+			BUG_ON(!hlist_unhashed(&c->node));
+			hlist_add_head(&c->node, &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(c->chunk)]);
+			spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		} else {
+			pe = dm_multisnap_alloc_pending_exception(s, chunk);
+			if (unlikely(!pe))
+				goto failed_pe_allocation;
+
+			s->store->add_next_remap(s->p, &pe->desc[0], &result);
+			if (unlikely(dm_multisnap_has_error(s)))
+				goto free_err_endio;
+
+			dest.bdev = s->snapshot->bdev;
+			dest.sector = chunk_to_sector(s, result);
+			dest.count = s->chunk_size >> SECTOR_SHIFT;
+
+			dispatch_kcopyd(s, pe, 0, chunk, bio, &dest, 1);
+			return;
+		}
+	} else {
+
+		/* Found in the snapshot */
+
+		if (unlikely(check_pending_io(s, bio, chunk, id)))
+			return;
+
+		if (unlikely(bio_rw(bio) == WRITE) && r == 1) {
+			copy_from = result;
+
+			pe = dm_multisnap_alloc_pending_exception(s, chunk);
+			if (unlikely(!pe))
+				goto failed_pe_allocation;
+
+			s->store->make_chunk_writeable(s->p, &pe->desc[0], &result);
+			if (unlikely(dm_multisnap_has_error(s)))
+				goto free_err_endio;
+
+			dest.bdev = s->snapshot->bdev;
+			dest.sector = chunk_to_sector(s, result);
+			dest.count = s->chunk_size >> SECTOR_SHIFT;
+
+			dispatch_kcopyd(s, pe, 1, copy_from, bio, &dest, 1);
+			return;
+		}
+
+		bio->bi_bdev = s->snapshot->bdev;
+		bio->bi_sector &= (s->chunk_size >> SECTOR_SHIFT) - 1;
+		bio->bi_sector |= chunk_to_sector(s, result);
+	}
+	generic_make_request(bio);
+	return;
+
+free_err_endio:
+	dm_multisnap_free_pending_exception(pe);
+err_endio:
+	r = -EIO;
+	bio_endio(bio, r);
+	return;
+
+failed_pe_allocation:
+	s->pending_mempool_allocation_failed = 1;
+	dm_multisnap_enqueue_bio(s, bio);
+	return;
+}
+
+/*
+ * The main routine that processes queued bios and background work.
+ * It must be called with master_lock held.
+ * It is usually called from the worker thread, but can also be called
+ * from other places (for example, the kcopyd callback), provided that
+ * the caller holds master_lock.
+ */
+
+static void dm_multisnap_process_bios(struct dm_multisnap *s)
+{
+	struct bio *bio;
+	snapid_t snapid;
+
+again:
+	cond_resched();
+
+	if (!list_empty(&s->background_works)) {
+		struct dm_multisnap_background_work *bw = list_entry(s->background_works.next, struct dm_multisnap_background_work, list);
+		list_del(&bw->list);
+		bw->queued = 0;
+		bw->work(s->p, bw);
+
+		cond_resched();
+	}
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	bio = bio_list_pop(&s->bios);
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	if (unlikely(!bio))
+		return;
+
+	snapid = bio_get_snapid(bio);
+	if (snapid == SNAPID_T_ORIGIN)
+		do_origin_write(s, bio);
+	else
+		do_snapshot_io(s, bio, snapid);
+
+	if (!bio_list_empty(&s->bios) || !list_empty(&s->background_works)) {
+		if (likely(!bio_list_empty(&s->bios)) &&
+		    likely(!s->pending_mempool_allocation_failed) &&
+		    likely(!dm_multisnap_lock_contended(s)))
+			goto again;
+		wakeup_kmultisnapd(s);
+	}
+}
+
+/*
+ * Background-job routines exported for exception store drivers.
+ *
+ * Jobs queued with these routines are executed in the background, with the
+ * master lock held.
+ */
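+
+/*
+ * Illustration only (a sketch, not part of the interface contract): an
+ * exception store driver would typically embed
+ * struct dm_multisnap_background_work in its private state and queue it
+ * while holding the master lock. All names below are hypothetical.
+ *
+ *	struct dm_exception_store {		(defined by the store driver)
+ *		struct dm_multisnap *dm;
+ *		struct dm_multisnap_background_work delete_work;
+ *		...
+ *	};
+ *
+ *	static void my_delete_step(struct dm_exception_store *es,
+ *				   struct dm_multisnap_background_work *bw)
+ *	{
+ *		do one bounded unit of delete work, then, if not finished:
+ *		dm_multisnap_queue_work(es->dm, bw);
+ *	}
+ *
+ * The work callback runs with the master lock held, so it may requeue or
+ * cancel itself directly.
+ */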
+
+void dm_multisnap_queue_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw)
+{
+	dm_multisnap_assert_locked(s);
+
+	if (bw->queued) {
+		BUG_ON(bw->queued != 1);
+		return;
+	}
+
+	bw->queued = 1;
+	list_add(&bw->list, &s->background_works);
+	wakeup_kmultisnapd(s);
+}
+EXPORT_SYMBOL(dm_multisnap_queue_work);
+
+void dm_multisnap_cancel_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw)
+{
+	dm_multisnap_assert_locked(s);
+
+	if (!bw->queued)
+		return;
+
+	bw->queued = 0;
+	list_del(&bw->list);
+}
+EXPORT_SYMBOL(dm_multisnap_cancel_work);
+
+/*
+ * The main work thread.
+ */
+
+static void dm_multisnap_work(struct work_struct *work)
+{
+	struct dm_multisnap *s = container_of(work, struct dm_multisnap, work);
+
+	dm_multisnap_lock(s);
+	dm_multisnap_process_bios(s);
+	dm_multisnap_unlock(s);
+
+	/*
+	 * If there was a mempool allocation failure, we must wait, outside
+	 * the lock, until some memory is freed.
+	 * If this branch is taken, the work is already queued again, so it
+	 * re-executes after finding some memory.
+	 */
+	if (unlikely(s->pending_mempool_allocation_failed)) {
+		s->pending_mempool_allocation_failed = 0;
+		dm_multisnap_wait_for_pending_exception(s);
+	}
+
+	blk_unplug(bdev_get_queue(s->origin->bdev));
+	blk_unplug(bdev_get_queue(s->snapshot->bdev));
+}
+
+static struct dm_multisnap *find_multisnapshot(struct block_device *origin)
+{
+	struct dm_multisnap *s;
+	list_for_each_entry(s, &all_multisnapshots, list_all)
+		if (s->origin->bdev == origin)
+			return s;
+	return NULL;
+}
+
+/* --- exception stores --- */
+
+static DEFINE_MUTEX(exception_stores_lock);
+static LIST_HEAD(all_exception_stores);
+
+static struct dm_multisnap_exception_store *dm_multisnap_find_exception_store(const char *name)
+{
+	struct dm_multisnap_exception_store *store;
+
+	list_for_each_entry(store, &all_exception_stores, list)
+		if (!strcmp(store->name, name))
+			return store;
+
+	return NULL;
+}
+
+static int dm_multisnap_exception_store_active(struct dm_multisnap_exception_store *find)
+{
+	struct dm_multisnap_exception_store *store;
+
+	list_for_each_entry(store, &all_exception_stores, list)
+		if (store == find)
+			return 1;
+
+	return 0;
+}
+
+int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+
+	BUG_ON(dm_multisnap_exception_store_active(store));
+
+	if (dm_multisnap_find_exception_store(store->name)) {
+		mutex_unlock(&exception_stores_lock);
+		return -EEXIST;
+	}
+	list_add(&store->list, &all_exception_stores);
+
+	mutex_unlock(&exception_stores_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_register_exception_store);
+
+void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+
+	BUG_ON(!dm_multisnap_exception_store_active(store));
+	list_del(&store->list);
+
+	mutex_unlock(&exception_stores_lock);
+}
+EXPORT_SYMBOL(dm_multisnap_unregister_exception_store);
+
+static struct dm_multisnap_exception_store *dm_multisnap_get_exception_store(const char *name)
+{
+	struct dm_multisnap_exception_store *store;
+
+	mutex_lock(&exception_stores_lock);
+
+	store = dm_multisnap_find_exception_store(name);
+	if (store) {
+		if (!try_module_get(store->module))
+			store = NULL;
+	}
+
+	mutex_unlock(&exception_stores_lock);
+
+	return store;
+}
+
+static void dm_multisnap_put_exception_store(struct dm_multisnap_exception_store *store)
+{
+	mutex_lock(&exception_stores_lock);
+
+	BUG_ON(!dm_multisnap_exception_store_active(store));
+	module_put(store->module);
+
+	mutex_unlock(&exception_stores_lock);
+}
+
+/* --- argument parser --- */
+
+int dm_multisnap_get_string(char ***argv, unsigned *argc, char **string, char **error)
+{
+	if (!*argc) {
+		*error = "Not enough arguments";
+		return -EINVAL;
+	}
+	*string = *(*argv)++;
+	(*argc)--;
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_string);
+
+int dm_multisnap_get_uint64(char ***argv, unsigned *argc, __u64 *uint64, char **error)
+{
+	char *string;
+	int r = dm_multisnap_get_string(argv, argc, &string, error);
+	if (r)
+		return r;
+	if (!*string) {
+invalid_number:
+		*error = "Invalid number";
+		return -EINVAL;
+	}
+	*uint64 = simple_strtoull(string, &string, 10);
+	if (*string)
+		goto invalid_number;
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_uint64);
+
+int dm_multisnap_get_uint(char ***argv, unsigned *argc, unsigned *uint, char **error)
+{
+	__u64 uint64;
+	int r = dm_multisnap_get_uint64(argv, argc, &uint64, error);
+	if (r)
+		return r;
+	*uint = uint64;
+	if (uint64 != *uint) {
+		*error = "Number out of range";
+		return -ERANGE;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_uint);
+
+int dm_multisnap_get_argcount(char ***argv, unsigned *argc, unsigned *uint, char **error)
+{
+	int r = dm_multisnap_get_uint(argv, argc, uint, error);
+	if (r)
+		return r;
+	if (*uint > *argc) {
+		*error = "Not enough arguments";
+		return -EINVAL;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(dm_multisnap_get_argcount);
+
+void dm_multisnap_adjust_string(char **result, unsigned *maxlen)
+{
+	unsigned len = strlen(*result);
+	*result += len;
+	*maxlen -= len;
+}
+EXPORT_SYMBOL(dm_multisnap_adjust_string);
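+
+/*
+ * Illustration only: a store driver's init_exception_store() receives its
+ * own argc/argv slice and would typically consume it with these helpers.
+ * The "example-limit" option below is hypothetical.
+ *
+ *	while (argc) {
+ *		char *arg;
+ *		__u64 value;
+ *
+ *		r = dm_multisnap_get_string(&argv, &argc, &arg, error);
+ *		if (r)
+ *			return r;
+ *		if (!strcasecmp(arg, "example-limit")) {
+ *			r = dm_multisnap_get_uint64(&argv, &argc, &value, error);
+ *			if (r)
+ *				return r;
+ *		} else {
+ *			*error = "Unknown store argument";
+ *			return -EINVAL;
+ *		}
+ *	}
+ */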
+
+/* --- target methods --- */
+
+static int compare_snapids(const void *p1, const void *p2)
+{
+	snapid_t s1 = *(const snapid_t *)p1;
+	snapid_t s2 = *(const snapid_t *)p2;
+	if (s1 < s2)
+		return -1;
+	if (s1 > s2)
+		return 1;
+	return 0;
+}
+
+/* --- constructor & destructor --- */
+
+static int multisnap_origin_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	int i;
+	char *origin_path;
+	char *snapshot_path;
+	unsigned chunk_size;
+	unsigned generic_args;
+	char *store_name;
+	unsigned store_args;
+	unsigned num_snapshots;
+
+	struct dm_multisnap *s, *ss;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	r = dm_multisnap_get_string(&argv, &argc, &origin_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_string(&argv, &argc, &snapshot_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_uint(&argv, &argc, &chunk_size, &ti->error);
+	if (r)
+		goto bad_arguments;
+
+	s = kmalloc(sizeof(struct dm_multisnap), GFP_KERNEL);
+	if (!s) {
+		ti->error = "Can't allocate multisnapshot structure";
+		r = -ENOMEM;
+		goto bad_s;
+	}
+
+	ti->private = s;
+
+	s->p = NULL;
+	s->error = 0;
+	s->flags = 0;
+	mutex_init(&s->master_lock);
+	mutex_init(&s->status_lock);
+	INIT_WORK(&s->work, dm_multisnap_work);
+	bio_list_init(&s->bios);
+	INIT_LIST_HEAD(&s->background_works);
+	s->kcopyd_jobs_submitted_count = 0;
+	s->kcopyd_jobs_finished_count = 0;
+	INIT_LIST_HEAD(&s->pes_waiting_for_commit);
+	s->commit_sequence = 0;
+	for (i = 0; i < PENDING_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->pending_hash[i]);
+	s->pending_mempool_allocation_failed = 0;
+	s->new_snapid_valid = 0;
+	INIT_LIST_HEAD(&s->all_snaps);
+
+	r = dm_multisnap_get_argcount(&argv, &argc, &generic_args, &ti->error);
+	if (r)
+		goto bad_generic_arguments;
+	while (generic_args--) {
+		char *arg;
+		r = dm_multisnap_get_string(&argv, &argc, &arg, &ti->error);
+		if (r)
+			goto bad_generic_arguments;
+
+		/* Synchronize the snapshot list with the list given in the target table */
+		if (!strcasecmp(arg, "sync-snapshots"))
+			s->flags |= DM_MULTISNAP_SYNC_SNAPSHOTS;
+		/* Don't drop the snapshot store on error; stop the origin instead */
+		else if (!strcasecmp(arg, "preserve-on-error"))
+			s->flags |= DM_MULTISNAP_PRESERVE_ON_ERROR;
+		else {
+			r = -EINVAL;
+			ti->error = "Invalid argument";
+			goto bad_generic_arguments;
+		}
+	}
+
+	r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &s->origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+	s->origin_sectors = i_size_read(s->origin->bdev->bd_inode) >> SECTOR_SHIFT;
+
+	r = dm_get_device(ti, snapshot_path, 0, 0, FMODE_READ | FMODE_WRITE, &s->snapshot);
+	if (r) {
+		ti->error = "Could not get snapshot device";
+		goto bad_snapshot;
+	}
+
+	/*
+	 * Prevent multiple load over the same devices.
+	 *
+	 * Currently, multisnapshot target is loaded just once, there is no
+	 * place where it would be reloaded (even lvchange --refresh doesn't
+	 * do it), so there is no need to handle loading the target multiple
+	 * times for the same devices and "handover" of the exception store.
+	 *
+	 * As a safeguard to protect against possible data corruption from
+	 * userspace misbehavior, we check that there is no other target loaded
+	 * that has the origin or the snapshot store on the same devices.
+	 */
+
+	list_for_each_entry(ss, &all_multisnapshots, list_all)
+		if (ss->origin->bdev == s->origin->bdev ||
+		    ss->snapshot->bdev == s->snapshot->bdev) {
+			ti->error = "Another multisnapshot with the same devices";
+			r = -EINVAL;
+			goto bad_conflicting_snapshot;
+		}
+
+	/* Validate the chunk size */
+
+	if (chunk_size > INT_MAX / 512) {
+		ti->error = "Chunk size is too high";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	if (!is_power_of_2(chunk_size)) {
+		ti->error = "Chunk size is not a power of two";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	chunk_size *= 512;
+	if (chunk_size < bdev_logical_block_size(s->origin->bdev) ||
+	    chunk_size < bdev_logical_block_size(s->snapshot->bdev)) {
+		ti->error = "Chunk size is smaller than device block size";
+		r = -EINVAL;
+		goto bad_chunk_size;
+	}
+	s->chunk_size = chunk_size;
+	s->chunk_shift = ffs(chunk_size) - 1;
+
+	s->pending_pool = mempool_create_slab_pool(PENDING_MEMPOOL_SIZE, pending_exception_cache);
+	if (!s->pending_pool) {
+		ti->error = "Could not allocate mempool for pending exceptions";
+		r = -ENOMEM;
+		goto bad_pending_pool;
+	}
+
+	s->tracked_chunk_pool = mempool_create_slab_pool(DM_TRACKED_CHUNK_POOL_SIZE, tracked_chunk_cache);
+	if (!s->tracked_chunk_pool) {
+		ti->error = "Could not allocate tracked_chunk mempool for tracking reads";
+		r = -ENOMEM;
+		goto bad_tracked_chunk_pool;
+	}
+	s->n_tracked_ios = 0;
+	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
+
+	r = dm_kcopyd_client_create(MULTISNAP_KCOPYD_PAGES, &s->kcopyd);
+	if (r) {
+		ti->error = "Could not create kcopyd client";
+		goto bad_kcopyd;
+	}
+
+	r = dm_multisnap_get_string(&argv, &argc, &store_name, &ti->error);
+	if (r)
+		goto bad_store;
+
+	r = dm_multisnap_get_argcount(&argv, &argc, &store_args, &ti->error);
+	if (r)
+		goto bad_store;
+
+	s->store = dm_multisnap_get_exception_store(store_name);
+	if (!s->store) {
+		request_module("dm-store-%s", store_name);
+		s->store = dm_multisnap_get_exception_store(store_name);
+		if (!s->store) {
+			ti->error = "Can't get exception store type";
+			r = -ENOENT;
+			goto bad_store;
+		}
+	}
+
+	s->wq = create_singlethread_workqueue("kmultisnapd");
+	if (!s->wq) {
+		ti->error = "Could not create kernel thread";
+		r = -ENOMEM;
+		goto bad_thread;
+	}
+
+	dm_multisnap_lock(s);
+	r = s->store->init_exception_store(s, &s->p, store_args, argv, &ti->error);
+	if (r) {
+		s->p = NULL;
+		goto exception_store_error;
+	}
+
+	ti->split_io = s->chunk_size >> SECTOR_SHIFT;
+	ti->num_flush_requests = 1;
+
+	argv += store_args;
+	argc -= store_args;
+
+	/*
+	 * Synchronize snapshot IDs according to the table line:
+	 *	allocate IDs that are specified on the table line
+	 *	free IDs that are not specified on the table line
+	 */
+	if (s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS) {
+		snapid_t sn, n, *snapids;
+		r = dm_multisnap_get_argcount(&argv, &argc, &num_snapshots, &ti->error);
+		if (r)
+			goto error_syncing_snapshots;
+		snapids = vmalloc(sizeof(snapid_t) * (num_snapshots + 1));
+		if (!snapids) {
+			ti->error = "Could not allocate snapids array";
+			r = -ENOMEM;
+			goto error_syncing_snapshots;
+		}
+		for (n = 0; n < num_snapshots; n++) {
+			char *string;
+			r = dm_multisnap_get_string(&argv, &argc, &string, &ti->error);
+			if (r) {
+				vfree(snapids);
+				goto error_syncing_snapshots;
+			}
+			r = read_snapid(s, string, &snapids[n], &ti->error);
+			if (r) {
+				vfree(snapids);
+				goto error_syncing_snapshots;
+			}
+		}
+		snapids[num_snapshots] = SNAPID_T_ORIGIN;
+
+		/* Delete the snapshots that shouldn't be there */
+		sort(snapids, num_snapshots, sizeof(snapid_t), compare_snapids, NULL);
+		sn = s->store->get_next_snapid(s->p, 0);
+		for (n = 0; n <= num_snapshots; n++) {
+			while (sn < snapids[n]) {
+				if (!dm_multisnap_has_error(s)) {
+					r = s->store->delete_snapshot(s->p, sn);
+					if (r && s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR) {
+						ti->error = "Can't delete snapshot";
+						vfree(snapids);
+						goto error_syncing_snapshots;
+					}
+				}
+				sn = s->store->get_next_snapid(s->p, sn + 1);
+				if (sn == SNAPID_T_ORIGIN)
+					goto delete_done;
+			}
+			if (sn == snapids[n]) {
+				sn = s->store->get_next_snapid(s->p, sn + 1);
+				if (sn == SNAPID_T_ORIGIN)
+					goto delete_done;
+			}
+		}
+delete_done:
+
+		/* Create the snapshots that should be there */
+		if (s->store->compare_snapids_for_create)
+			sort(snapids, num_snapshots, sizeof(snapid_t), s->store->compare_snapids_for_create, NULL);
+		for (n = 0; n <= num_snapshots; n++) {
+			if (!dm_multisnap_snapshot_exists(s, snapids[n])) {
+				if (!dm_multisnap_has_error(s)) {
+					r = s->store->create_snapshot(s->p, snapids[n]);
+					if (r && s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR) {
+						ti->error = "Can't create snapshot";
+						vfree(snapids);
+						goto error_syncing_snapshots;
+					}
+				}
+			}
+		}
+		vfree(snapids);
+	}
+
+	dm_multisnap_unlock(s);
+
+	list_add(&s->list_all, &all_multisnapshots);
+
+	mutex_unlock(&all_multisnapshots_lock);
+	return 0;
+
+error_syncing_snapshots:
+	s->store->exit_exception_store(s->p);
+	s->p = NULL;
+exception_store_error:
+	dm_multisnap_unlock(s);
+	destroy_workqueue(s->wq);
+bad_thread:
+	dm_multisnap_put_exception_store(s->store);
+bad_store:
+	dm_kcopyd_client_destroy(s->kcopyd);
+bad_kcopyd:
+	mempool_destroy(s->tracked_chunk_pool);
+bad_tracked_chunk_pool:
+	mempool_destroy(s->pending_pool);
+bad_pending_pool:
+bad_conflicting_snapshot:
+bad_chunk_size:
+	dm_put_device(ti, s->snapshot);
+bad_snapshot:
+	dm_put_device(ti, s->origin);
+bad_origin:
+bad_generic_arguments:
+	kfree(s);
+bad_s:
+bad_arguments:
+	mutex_unlock(&all_multisnapshots_lock);
+	return r;
+}
+
+static void multisnap_origin_dtr(struct dm_target *ti)
+{
+	struct dm_multisnap *s = ti->private;
+	struct dm_multisnap_snap *sn;
+	unsigned i;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	/* Make sure that any more IOs won't be submitted by snapshot targets */
+	list_for_each_entry(sn, &s->all_snaps, list_snaps) {
+		spin_lock_irq(&dm_multisnap_bio_list_lock);
+		sn->s = NULL;
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+	}
+	list_del(&s->all_snaps);
+
+	/*
+	 * This code is called in the destructor; it is not performance
+	 * sensitive, so we poll with msleep(1).
+	 *
+	 * A possible 1ms delay on device destruction won't cause any trouble,
+	 * and this polling is simpler and less bug-prone than using wait
+	 * queues.
+	 */
+poll_for_ios:
+	/* Wait for IOs on the snapshot */
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	if (s->n_tracked_ios) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		msleep(1);
+		goto poll_for_ios;
+	}
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	/* Bug-check that there are really no IOs */
+	BUG_ON(!bio_list_empty(&s->bios));
+	for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
+		BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
+
+	/* Wait for pending reallocations */
+	dm_multisnap_lock(s);
+	for (i = 0; i < PENDING_HASH_SIZE; i++)
+		if (!hlist_empty(&s->pending_hash[i])) {
+			dm_multisnap_unlock(s);
+			msleep(1);
+			goto poll_for_ios;
+		}
+	dm_multisnap_unlock(s);
+
+	flush_workqueue(s->wq);
+
+	dm_multisnap_lock(s);
+	dm_multisnap_call_commit(s);
+	s->store->exit_exception_store(s->p);
+	s->p = NULL;
+	list_del(&s->list_all);
+	dm_multisnap_unlock(s);
+
+	destroy_workqueue(s->wq);
+	kfree(s->p);
+	dm_kcopyd_client_destroy(s->kcopyd);
+	mempool_destroy(s->tracked_chunk_pool);
+	mempool_destroy(s->pending_pool);
+	dm_put_device(ti, s->snapshot);
+	dm_put_device(ti, s->origin);
+	dm_multisnap_put_exception_store(s->store);
+
+	kfree(s);
+
+	mutex_unlock(&all_multisnapshots_lock);
+}
+
+static int multisnap_origin_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+	struct dm_multisnap *s = ti->private;
+
+	/*
+	 * Do the most common cases quickly: reads and empty barriers are
+	 * remapped to the origin device directly.
+	 */
+	if (likely(bio_rw(bio) != WRITE) || unlikely(bio_empty_barrier(bio))) {
+		bio->bi_bdev = s->origin->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
+
+	bio_put_snapid(bio, SNAPID_T_ORIGIN);
+
+	dm_multisnap_enqueue_bio(s, bio);
+	wakeup_kmultisnapd(s);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+static int multisnap_origin_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct dm_multisnap *s = ti->private;
+	int r;
+
+	mutex_lock(&all_multisnapshots_lock);
+	dm_multisnap_lock(s);
+
+	if (argc == 1 && !strcasecmp(argv[0], "create")) {
+		/*
+		 * Prepare snapshot creation.
+		 *
+		 * We allocate a snapid, and return it in the status.
+		 *
+		 * The snapshot is really created in the postsuspend method (to
+		 * make sure that a possibly mounted filesystem is quiescent and
+		 * the snapshot will be consistent).
+		 */
+		r = dm_multisnap_has_error(s);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_status_lock(s);
+		s->new_snapid_valid = 0;
+		dm_multisnap_status_unlock(s);
+
+		r = s->store->allocate_snapid(s->p, &s->new_snapid);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_status_lock(s);
+		s->new_snapid_valid = 1;
+		dm_multisnap_status_unlock(s);
+
+		r = dm_multisnap_has_error(s);
+		goto unlock_ret;
+	}
+	if (argc == 2 && !strcasecmp(argv[0], "delete")) {
+		/*
+		 * Delete a snapshot.
+		 */
+		char *error;
+		snapid_t snapid;
+		struct dm_multisnap_snap *sn;
+		struct bio *bio, *next;
+
+		r = read_snapid(s, argv[1], &snapid, &error);
+		if (r) {
+			DMWARN("invalid snapshot id: %s", error);
+			goto unlock_ret;
+		}
+
+		if (!s->store->delete_snapshot) {
+			DMERR("snapshot store doesn't support delete");
+			r = -EOPNOTSUPP;
+			goto unlock_ret;
+		}
+
+		r = dm_multisnap_has_error(s);
+		if (r)
+			goto unlock_ret;
+
+		/* Detach a possibly attached snapshot target */
+		list_for_each_entry(sn, &s->all_snaps, list_snaps) {
+			if (sn->snapid == snapid) {
+				spin_lock_irq(&dm_multisnap_bio_list_lock);
+				sn->s = NULL;
+				spin_unlock_irq(&dm_multisnap_bio_list_lock);
+			}
+		}
+
+		/* Terminate bios queued for this snapshot so far */
+		spin_lock_irq(&dm_multisnap_bio_list_lock);
+		bio = bio_list_get(&s->bios);
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		for (; bio; bio = next) {
+			next = bio->bi_next;
+			bio->bi_next = NULL;
+			if (bio_get_snapid(bio) == snapid)
+				bio_endio(bio, -EIO);
+			else
+				dm_multisnap_enqueue_bio(s, bio);
+		}
+
+		if (!dm_multisnap_snapshot_exists(s, snapid)) {
+			DMWARN("snapshot with this id doesn't exist.");
+			r = -EINVAL;
+			goto unlock_ret;
+		}
+
+		r = s->store->delete_snapshot(s->p, snapid);
+		if (r)
+			goto unlock_ret;
+
+		dm_multisnap_unlock(s);
+
+		r = dm_multisnap_force_commit(s);
+
+		goto unlock2_ret;
+	}
+
+	DMWARN("unrecognised message received.");
+	r = -EINVAL;
+
+unlock_ret:
+	dm_multisnap_unlock(s);
+unlock2_ret:
+	mutex_unlock(&all_multisnapshots_lock);
+
+	return r;
+}
+
+/* Print used snapshot IDs into a supplied string */
+
+static void print_snapshot_ids(struct dm_multisnap *s, char *result, unsigned maxlen)
+{
+	snapid_t nsnap = 0;
+	snapid_t sn = 0;
+	while ((sn = s->store->get_next_snapid(s->p, sn)) != SNAPID_T_ORIGIN)
+		sn++, nsnap++;
+	snprintf(result, maxlen, " %llu", (unsigned long long)nsnap);
+	dm_multisnap_adjust_string(&result, &maxlen);
+	sn = 0;
+	while ((sn = s->store->get_next_snapid(s->p, sn)) != SNAPID_T_ORIGIN) {
+		snprintf(result, maxlen, " ");
+		dm_multisnap_adjust_string(&result, &maxlen);
+		print_snapid(s, result, maxlen, sn);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		sn++;
+	}
+}
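+
+/*
+ * Illustration only: an exception store's status_table hook receives the
+ * same (result, maxlen) pair and can use the identical pattern to append
+ * its own table arguments, e.g. (a hypothetical one-argument store):
+ *
+ *	snprintf(result, maxlen, " 1 %u", my_cache_limit);
+ *	dm_multisnap_adjust_string(&result, &maxlen);
+ */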
+
+static int multisnap_origin_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen)
+{
+	struct dm_multisnap *s = ti->private;
+
+	/*
+	 * Use a special status lock, so that this code can execute even
+	 * when the underlying device is suspended and there is no possibility
+	 * of obtaining the master lock.
+	 */
+	dm_multisnap_status_lock(s);
+
+	switch (type) {
+		case STATUSTYPE_INFO: {
+			unsigned long long total, alloc, meta;
+			snprintf(result, maxlen, "5 %d ", dm_multisnap_has_error(s));
+			dm_multisnap_adjust_string(&result, &maxlen);
+			if (s->new_snapid_valid)
+				print_snapid(s, result, maxlen, s->new_snapid);
+			else
+				snprintf(result, maxlen, "-");
+			dm_multisnap_adjust_string(&result, &maxlen);
+			if (s->store->get_space)
+				s->store->get_space(s->p, &total, &alloc, &meta);
+			else
+				total = alloc = meta = 0;
+			total <<= s->chunk_shift - SECTOR_SHIFT;
+			alloc <<= s->chunk_shift - SECTOR_SHIFT;
+			meta <<= s->chunk_shift - SECTOR_SHIFT;
+			snprintf(result, maxlen, " %llu %llu %llu", total, alloc, meta);
+			dm_multisnap_adjust_string(&result, &maxlen);
+			print_snapshot_ids(s, result, maxlen);
+			dm_multisnap_adjust_string(&result, &maxlen);
+			break;
+		}
+		case STATUSTYPE_TABLE: {
+			unsigned ngen = 0;
+			if (s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS)
+				ngen++;
+			if (s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR)
+				ngen++;
+			snprintf(result, maxlen, "%s %s %u %u%s%s %s",
+				s->origin->name,
+				s->snapshot->name,
+				s->chunk_size / 512,
+				ngen,
+				s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS ?
+					" sync-snapshots" : "",
+				s->flags & DM_MULTISNAP_PRESERVE_ON_ERROR ?
+					" preserve-on-error" : "",
+				s->store->name);
+			dm_multisnap_adjust_string(&result, &maxlen);
+			if (s->store->status_table)
+				s->store->status_table(s->p, result, maxlen);
+			else
+				snprintf(result, maxlen, " 0");
+			dm_multisnap_adjust_string(&result, &maxlen);
+			if (s->flags & DM_MULTISNAP_SYNC_SNAPSHOTS) {
+				print_snapshot_ids(s, result, maxlen);
+				dm_multisnap_adjust_string(&result, &maxlen);
+			}
+			break;
+		}
+	}
+
+	dm_multisnap_status_unlock(s);
+
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+/*
+ * In postsuspend, we optionally create a snapshot that we prepared with
+ * a message.
+ */
+
+static void multisnap_origin_postsuspend(struct dm_target *ti)
+{
+	struct dm_multisnap *s = ti->private;
+
+	dm_multisnap_lock(s);
+
+	if (s->new_snapid_valid && !dm_multisnap_has_error(s)) {
+		/*
+		 * No way to return the error code, but it is recorded
+		 * in s->error anyway.
+		 */
+		s->store->create_snapshot(s->p, s->new_snapid);
+		s->new_snapid_valid = 0;
+	}
+
+	dm_multisnap_unlock(s);
+
+	dm_multisnap_force_commit(s);
+}
+
+static int multisnap_snap_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r;
+	char *origin_path;
+	char *snapid_str;
+	snapid_t snapid;
+	int doesnt_exist;
+
+	struct dm_dev *origin;
+
+	struct dm_multisnap *s;
+	struct dm_multisnap_snap *sn;
+
+	r = dm_multisnap_get_string(&argv, &argc, &origin_path, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_multisnap_get_string(&argv, &argc, &snapid_str, &ti->error);
+	if (r)
+		goto bad_arguments;
+	r = dm_get_device(ti, origin_path, 0, 0, FMODE_READ | FMODE_WRITE, &origin);
+	if (r) {
+		ti->error = "Could not get origin device";
+		goto bad_origin;
+	}
+	mutex_lock(&all_multisnapshots_lock);
+	s = find_multisnapshot(origin->bdev);
+	if (!s) {
+		r = -ENXIO;
+		ti->error = "Origin target not loaded";
+		goto origin_not_loaded;
+	}
+
+	dm_multisnap_lock(s);
+
+	r = read_snapid(s, snapid_str, &snapid, &ti->error);
+	if (r) {
+		dm_multisnap_unlock(s);
+		goto snapid_doesnt_exist;
+	}
+
+	doesnt_exist = 0;
+	if (!dm_multisnap_snapshot_exists(s, snapid)) {
+		if (dm_multisnap_has_error(s) && dm_multisnap_drop_on_error(s)) {
+			/*
+			 * If there was an error, we don't know which snapshot
+			 * IDs are available. So we must accept it. But we
+			 * abort all accesses to this snapshot with an error.
+			 */
+			 doesnt_exist = 1;
+		} else {
+			dm_multisnap_unlock(s);
+			r = -ENOENT;
+			ti->error = "Snapshot with this id doesn't exist";
+			goto snapid_doesnt_exist;
+		}
+	}
+	dm_multisnap_unlock(s);
+
+	sn = kmalloc(sizeof(*sn) + strlen(snapid_str), GFP_KERNEL);
+	if (!sn) {
+		ti->error = "Could not allocate multisnapshot_snap structure";
+		r = -ENOMEM;
+		goto cant_allocate;
+	}
+	sn->s = doesnt_exist ? NULL : s;
+	sn->snapid = snapid;
+	list_add(&sn->list_snaps, &s->all_snaps);
+	strlcpy(sn->origin_name, origin->name, sizeof sn->origin_name);
+	strcpy(sn->snapid_string, snapid_str);
+
+	mutex_unlock(&all_multisnapshots_lock);
+
+	dm_put_device(ti, origin);
+
+	ti->private = sn;
+	ti->split_io = s->chunk_size >> SECTOR_SHIFT;
+	ti->num_flush_requests = 1;
+
+	return 0;
+
+cant_allocate:
+snapid_doesnt_exist:
+origin_not_loaded:
+	dm_put_device(ti, origin);
+	mutex_unlock(&all_multisnapshots_lock);
+bad_origin:
+bad_arguments:
+	return r;
+}
+
+static void multisnap_snap_dtr(struct dm_target *ti)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+
+	mutex_lock(&all_multisnapshots_lock);
+
+	list_del(&sn->list_snaps);
+	kfree(sn);
+
+	mutex_unlock(&all_multisnapshots_lock);
+}
+
+/*
+ * Each snapshot I/O is counted in n_tracked_ios in the origin and
+ * has struct dm_multisnap_tracked_chunk allocated.
+ * dm_multisnap_tracked_chunk->node can be optionally linked into origin's hash
+ * of tracked I/Os.
+ */
+
+static int multisnap_snap_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+	struct dm_multisnap *s;
+	struct dm_multisnap_tracked_chunk *c;
+
+	bio_put_snapid(bio, sn->snapid);
+
+	spin_lock_irq(&dm_multisnap_bio_list_lock);
+	s = sn->s;
+	if (unlikely(!s)) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		return -EIO;
+	}
+	/*
+	 * Make sure that the origin is not unloaded under us while
+	 * we drop the lock.
+	 */
+	s->n_tracked_ios++;
+
+	c = mempool_alloc(s->tracked_chunk_pool, GFP_ATOMIC);
+	if (unlikely(!c)) {
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		c = mempool_alloc(s->tracked_chunk_pool, GFP_NOIO);
+		spin_lock_irq(&dm_multisnap_bio_list_lock);
+	}
+	c->s = s;
+	c->chunk = sector_to_chunk(s, bio->bi_sector);
+	c->bio_rw = bio_rw(bio);
+	INIT_HLIST_NODE(&c->node);
+	map_context->ptr = c;
+
+	if (unlikely(bio_empty_barrier(bio))) {
+		bio->bi_bdev = s->snapshot->bdev;
+		spin_unlock_irq(&dm_multisnap_bio_list_lock);
+		return DM_MAPIO_REMAPPED;
+	}
+
+	dm_multisnap_enqueue_bio_unlocked(s, bio);
+	spin_unlock_irq(&dm_multisnap_bio_list_lock);
+
+	wakeup_kmultisnapd(s);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+static int multisnap_snap_end_io(struct dm_target *ti, struct bio *bio, int error, union map_info *map_context)
+{
+	struct dm_multisnap_tracked_chunk *c = map_context->ptr;
+	struct dm_multisnap *s = c->s;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dm_multisnap_bio_list_lock, flags);
+
+	s->n_tracked_ios--;
+	if (!hlist_unhashed(&c->node))
+		hlist_del(&c->node);
+	mempool_free(c, s->tracked_chunk_pool);
+
+	spin_unlock_irqrestore(&dm_multisnap_bio_list_lock, flags);
+
+	return 0;
+}
+
+static int multisnap_snap_status(struct dm_target *ti, status_type_t type, char *result, unsigned maxlen)
+{
+	struct dm_multisnap_snap *sn = ti->private;
+
+	switch (type) {
+
+	case STATUSTYPE_INFO:
+		/* there is no status */
+		result[0] = 0;
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s %s", sn->origin_name, sn->snapid_string);
+		dm_multisnap_adjust_string(&result, &maxlen);
+		break;
+	}
+
+	/* If there's no space left in the buffer, ask for larger size */
+	return maxlen <= 1;
+}
+
+static struct target_type multisnap_origin_target = {
+	.name		= "multisnapshot",
+	.version 	= {1, 0, 0},
+	.module		= THIS_MODULE,
+	.ctr		= multisnap_origin_ctr,
+	.dtr		= multisnap_origin_dtr,
+	.map		= multisnap_origin_map,
+	.message 	= multisnap_origin_message,
+	.status		= multisnap_origin_status,
+	.postsuspend	= multisnap_origin_postsuspend,
+};
+
+static struct target_type multisnap_snap_target = {
+	.name		= "multisnap-snap",
+	.version 	= {1, 0, 0},
+	.module		= THIS_MODULE,
+	.ctr		= multisnap_snap_ctr,
+	.dtr		= multisnap_snap_dtr,
+	.map		= multisnap_snap_map,
+	.end_io		= multisnap_snap_end_io,
+	.status		= multisnap_snap_status,
+};
+
+static int __init dm_multisnapshot_init(void)
+{
+	int r;
+
+	pending_exception_cache = kmem_cache_create(
+			"dm_multisnap_pending_exception",
+			sizeof(struct dm_multisnap_pending_exception),
+			__alignof__(struct dm_multisnap_pending_exception),
+			0,
+			pending_exception_ctor);
+	if (!pending_exception_cache) {
+		DMERR("Couldn't create exception cache.");
+		r = -ENOMEM;
+		goto bad_exception_cache;
+	}
+	tracked_chunk_cache = KMEM_CACHE(dm_multisnap_tracked_chunk, 0);
+	if (!tracked_chunk_cache) {
+		DMERR("Couldn't create cache to track chunks in use.");
+		r = -ENOMEM;
+		goto bad_tracked_chunk_cache;
+	}
+
+	r = dm_register_target(&multisnap_origin_target);
+	if (r < 0) {
+		DMERR("multisnap_origin_target target register failed %d", r);
+		goto bad_multisnap_origin_target;
+	}
+
+	r = dm_register_target(&multisnap_snap_target);
+	if (r < 0) {
+		DMERR("multisnap_snap_target target register failed %d", r);
+		goto bad_multisnap_snap_target;
+	}
+
+	return 0;
+
+bad_multisnap_snap_target:
+	dm_unregister_target(&multisnap_origin_target);
+bad_multisnap_origin_target:
+	kmem_cache_destroy(tracked_chunk_cache);
+bad_tracked_chunk_cache:
+	kmem_cache_destroy(pending_exception_cache);
+bad_exception_cache:
+	return r;
+}
+
+static void __exit dm_multisnapshot_exit(void)
+{
+	dm_unregister_target(&multisnap_origin_target);
+	dm_unregister_target(&multisnap_snap_target);
+	kmem_cache_destroy(tracked_chunk_cache);
+	kmem_cache_destroy(pending_exception_cache);
+}
+
+/* Module hooks */
+module_init(dm_multisnapshot_init);
+module_exit(dm_multisnapshot_exit);
+
+MODULE_DESCRIPTION(DM_NAME " multisnapshot target");
+MODULE_AUTHOR("Mikulas Patocka");
+MODULE_LICENSE("GPL");
Index: linux-2.6.32/drivers/md/dm-multisnap.h
===================================================================
--- /dev/null
+++ linux-2.6.32/drivers/md/dm-multisnap.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2009 Red Hat Czech, s.r.o.
+ *
+ * Mikulas Patocka <mpatocka@redhat.com>
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_MULTISNAP_H
+#define DM_MULTISNAP_H
+
+/*
+ * This file defines the interface between generic driver (dm-multisnap.c)
+ * and exception store drivers.
+ */
+
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+
+#define EFSERROR	EPERM
+
+#define DM_MSG_PREFIX "multisnapshot"
+
+#define SNAPID_T_ORIGIN		0xffffffffffffffffULL
+
+typedef sector_t chunk_t;
+typedef __u64 snapid_t;
+
+struct dm_multisnap;		/* private to dm-multisnap.c */
+struct dm_exception_store;	/* private to the exception store driver */
+
+struct dm_multisnap_background_work {
+	struct list_head list;
+	void (*work)(struct dm_exception_store *, struct dm_multisnap_background_work *);
+	int queued;
+};
+
+union chunk_descriptor {
+	__u64 bitmask;
+	struct {
+		snapid_t from;
+		snapid_t to;
+	} range;
+};
+
+struct dm_multisnap_exception_store {
+	struct list_head list;
+	struct module *module;
+	const char *name;
+
+	/* < 0 - error */
+	int (*init_exception_store)(struct dm_multisnap *dm, struct dm_exception_store **s, unsigned argc, char **argv, char **error);
+
+	void (*exit_exception_store)(struct dm_exception_store *s);
+
+	void (*store_lock_acquired)(struct dm_exception_store *s, int flags);
+
+	/* These two can override the format of snapids in the table. They can be NULL. */
+	void (*print_snapid)(struct dm_exception_store *s, char *string, unsigned maxlen, snapid_t snapid);
+	int (*read_snapid)(struct dm_exception_store *s, char *string, snapid_t *snapid, char **error);
+
+	/* return the exception-store specific table arguments */
+	void (*status_table)(struct dm_exception_store *s, char *result, unsigned maxlen);
+
+	/* report space usage, in chunks */
+	void (*get_space)(struct dm_exception_store *s, unsigned long long *chunks_total, unsigned long long *chunks_allocated, unsigned long long *chunks_metadata_allocated);
+
+	/* < 0 - error */
+	int (*allocate_snapid)(struct dm_exception_store *s, snapid_t *snapid);
+
+	/* < 0 - error */
+	int (*create_snapshot)(struct dm_exception_store *s, snapid_t snapid);
+
+	/* < 0 - error (may be NULL if not supported) */
+	int (*delete_snapshot)(struct dm_exception_store *s, snapid_t snapid);
+
+	/*
+	 * Get the first snapid at or after snapid in its argument.
+	 * If there are no more snapids, return SNAPID_T_ORIGIN.
+	 */
+	snapid_t (*get_next_snapid)(struct dm_exception_store *s, snapid_t snapid);
+
+	/*
+	 * qsort()-compatible function to order snapshots for creation.
+	 * May be NULL if the standard ordering should be used.
+	 */
+	int (*compare_snapids_for_create)(const void *p1, const void *p2);
+
+	/* 0 - not found, 1 - found (read-only), 2 - found (writeable), < 0 - error */
+	int (*find_snapshot_chunk)(struct dm_exception_store *s, snapid_t id, chunk_t chunk, int write, chunk_t *result);
+
+	/*
+	 * Chunk interface between the exception store and the generic code.
+	 * Allowed sequences:
+	 *
+	 * - first call reset_query,
+	 *   then repeatedly query the next exception to make with
+	 *   query_next_remap and add it to the btree with add_next_remap.
+	 *   This can be repeated until query_next_remap indicates that there
+	 *   is nothing more to remap or until all 8 kcopyd slots are filled.
+	 *
+	 * - call find_snapshot_chunk; if it returns 0, you can call
+	 *   add_next_remap to add the chunk to the btree.
+	 *
+	 * - call find_snapshot_chunk; if it returns 1 (shared chunk), call
+	 *   make_chunk_writeable to relocate that chunk.
+	 */
+
+	void (*reset_query)(struct dm_exception_store *s);
+	int (*query_next_remap)(struct dm_exception_store *s, chunk_t chunk);
+	void (*add_next_remap)(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+
+	/* may be NULL if writeable snapshots are not supported */
+	void (*make_chunk_writeable)(struct dm_exception_store *s, union chunk_descriptor *cd, chunk_t *new_chunk);
+	int (*check_conflict)(struct dm_exception_store *s, union chunk_descriptor *cd, snapid_t snapid);
+
+	void (*commit)(struct dm_exception_store *s);
+};
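+
+/*
+ * A simplified sketch (for orientation only) of how dm-multisnap.c drives
+ * the remap hooks for an origin write; see do_origin_write() for the real
+ * code:
+ *
+ *	s->store->reset_query(s->p);
+ *	r = s->store->query_next_remap(s->p, chunk);
+ *	if (!r)
+ *		the write goes straight to the origin;
+ *	else, for up to MAX_CHUNKS_TO_REMAP chunks:
+ *		s->store->add_next_remap(s->p, &pe->desc[i], &new_chunk);
+ *		dests[i].sector = chunk_to_sector(s, new_chunk);
+ *		r = s->store->query_next_remap(s->p, chunk);	(stop when !r)
+ *	then dispatch_kcopyd() copies the data and the remaps are committed.
+ */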
+
+/* dm-multisnap.c */
+
+/* Access generic information about the snapshot */
+struct block_device *dm_multisnap_snapshot_bdev(struct dm_multisnap *s);
+unsigned dm_multisnap_chunk_size(struct dm_multisnap *s);
+void dm_multisnap_set_error(struct dm_multisnap *s, int error);
+int dm_multisnap_has_error(struct dm_multisnap *s);
+int dm_multisnap_drop_on_error(struct dm_multisnap *s);
+int dm_multisnap_snapshot_exists(struct dm_multisnap *s, snapid_t snapid);
+
+void dm_multisnap_status_lock(struct dm_multisnap *s);
+void dm_multisnap_status_unlock(struct dm_multisnap *s);
+void dm_multisnap_status_assert_locked(struct dm_multisnap *s);
+
+/* Commit. dm_multisnap_call_commit can be called only if dm_multisnap_can_commit returns true */
+int dm_multisnap_can_commit(struct dm_multisnap *s);
+void dm_multisnap_call_commit(struct dm_multisnap *s);
+
+/* Delayed work for delete/merge */
+void dm_multisnap_queue_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw);
+void dm_multisnap_cancel_work(struct dm_multisnap *s, struct dm_multisnap_background_work *bw);
+
+/* Parsing command line */
+int dm_multisnap_get_string(char ***argv, unsigned *argc, char **string, char **error);
+int dm_multisnap_get_uint64(char ***argv, unsigned *argc, __u64 *uint64, char **error);
+int dm_multisnap_get_uint(char ***argv, unsigned *argc, unsigned *uint, char **error);
+int dm_multisnap_get_argcount(char ***argv, unsigned *argc, unsigned *uint, char **error);
+void dm_multisnap_adjust_string(char **result, unsigned *maxlen);
+
+/* Register/unregister the exception store driver */
+int dm_multisnap_register_exception_store(struct dm_multisnap_exception_store *store);
+void dm_multisnap_unregister_exception_store(struct dm_multisnap_exception_store *store);
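+
+/*
+ * Illustration only: an exception store module is registered roughly as
+ * follows. All names are hypothetical and only a subset of the hooks is
+ * shown; see the struct above for the full interface.
+ *
+ *	static struct dm_multisnap_exception_store example_store = {
+ *		.name			= "example",
+ *		.module			= THIS_MODULE,
+ *		.init_exception_store	= example_init,
+ *		.exit_exception_store	= example_exit,
+ *		.allocate_snapid	= example_allocate_snapid,
+ *		.create_snapshot	= example_create_snapshot,
+ *		.get_next_snapid	= example_get_next_snapid,
+ *		.find_snapshot_chunk	= example_find_snapshot_chunk,
+ *		.reset_query		= example_reset_query,
+ *		.query_next_remap	= example_query_next_remap,
+ *		.add_next_remap		= example_add_next_remap,
+ *		.check_conflict		= example_check_conflict,
+ *		.commit			= example_commit,
+ *	};
+ *
+ *	module_init: return dm_multisnap_register_exception_store(&example_store);
+ *	module_exit: dm_multisnap_unregister_exception_store(&example_store);
+ *
+ * The generic code loads "dm-store-<name>" automatically via request_module()
+ * when a table references a store type that is not yet registered.
+ */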
+
+#endif
