--- linux-2.4.26-rc1/drivers/md/dm-daemon.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-daemon.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#include "dm.h"
+#include "dm-daemon.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/sched.h&gt;
+
+static int daemon(void *arg)
+{
+	struct dm_daemon *dd = (struct dm_daemon *) arg;
+	DECLARE_WAITQUEUE(wq, current);
+
+	daemonize();
+	reparent_to_init();
+
+	/* block all signals */
+	spin_lock_irq(&amp;current-&gt;sigmask_lock);
+	sigfillset(&amp;current-&gt;blocked);
+	flush_signals(current);
+	spin_unlock_irq(&amp;current-&gt;sigmask_lock);
+
+	strcpy(current-&gt;comm, dd-&gt;name);
+	atomic_set(&amp;dd-&gt;please_die, 0);
+
+	add_wait_queue(&amp;dd-&gt;job_queue, &amp;wq);
+
+	down(&amp;dd-&gt;run_lock);
+	up(&amp;dd-&gt;start_lock);
+
+	/*
+	 * dd-&gt;fn() could do anything, very likely it will
+	 * suspend.  So we can't set the state to
+	 * TASK_INTERRUPTIBLE before calling it.  In order to
+	 * prevent a race with a waking thread we do this little
+	 * dance with the dd-&gt;woken variable.
+	 */
+	while (1) {
+		do {
+			set_current_state(TASK_RUNNING);
+
+			if (atomic_read(&amp;dd-&gt;please_die))
+				goto out;
+
+			atomic_set(&amp;dd-&gt;woken, 0);
+			dd-&gt;fn();
+			yield();
+
+			set_current_state(TASK_INTERRUPTIBLE);
+		} while (atomic_read(&amp;dd-&gt;woken));
+
+		schedule();
+	}
+
+ out:
+	remove_wait_queue(&amp;dd-&gt;job_queue, &amp;wq);
+	up(&amp;dd-&gt;run_lock);
+	return 0;
+}
+
+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
+{
+	pid_t pid = 0;
+
+	/*
+	 * Initialise the dm_daemon.
+	 */
+	dd-&gt;fn = fn;
+	strncpy(dd-&gt;name, name, sizeof(dd-&gt;name) - 1);
+	sema_init(&amp;dd-&gt;start_lock, 1);
+	sema_init(&amp;dd-&gt;run_lock, 1);
+	init_waitqueue_head(&amp;dd-&gt;job_queue);
+
+	/*
+	 * Start the new thread.
+	 */
+	down(&amp;dd-&gt;start_lock);
+	pid = kernel_thread(daemon, dd, 0);
+	if (pid &lt;= 0) {
+		DMERR("Failed to start %s thread", name);
+		return -EAGAIN;
+	}
+
+	/*
+	 * Wait for the daemon to up this semaphore; it does so
+	 * once it is up and running.
+	 */
+	down(&amp;dd-&gt;start_lock);
+	up(&amp;dd-&gt;start_lock);
+
+	return 0;
+}
+
+void dm_daemon_stop(struct dm_daemon *dd)
+{
+	atomic_set(&amp;dd-&gt;please_die, 1);
+	dm_daemon_wake(dd);
+	down(&amp;dd-&gt;run_lock);
+	up(&amp;dd-&gt;run_lock);
+}
+
+void dm_daemon_wake(struct dm_daemon *dd)
+{
+	atomic_set(&amp;dd-&gt;woken, 1);
+	wake_up_interruptible(&amp;dd-&gt;job_queue);
+}
+
+EXPORT_SYMBOL(dm_daemon_start);
+EXPORT_SYMBOL(dm_daemon_stop);
+EXPORT_SYMBOL(dm_daemon_wake);
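
/*
 * A minimal usage sketch (not part of the patch): a client embeds a
 * struct dm_daemon, starts it with a work function, wakes it whenever
 * new work is queued, and stops it on teardown.  The example_* names
 * below are hypothetical.
 */
static struct dm_daemon example_daemon;

static void example_do_work(void)
{
	/* drain whatever job list the caller maintains */
}

static int example_init(void)
{
	/* spawns the kernel thread and waits until it is running */
	return dm_daemon_start(&amp;example_daemon, "kexampled", example_do_work);
}

static void example_queue_job(void)
{
	/* ... add a job to the caller's list, then ... */
	dm_daemon_wake(&amp;example_daemon);	/* sets woken, wakes job_queue */
}

static void example_exit(void)
{
	/* sets please_die, wakes the thread and waits for it to exit */
	dm_daemon_stop(&amp;example_daemon);
}
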
--- linux-2.4.26-rc1/drivers/md/dm-daemon.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-daemon.h	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_DAEMON_H
+#define DM_DAEMON_H
+
+#include &lt;asm/atomic.h&gt;
+#include &lt;asm/semaphore.h&gt;
+
+struct dm_daemon {
+	void (*fn)(void);
+	char name[16];
+	atomic_t please_die;
+	struct semaphore start_lock;
+	struct semaphore run_lock;
+
+	atomic_t woken;
+	wait_queue_head_t job_queue;
+};
+
+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
+void dm_daemon_stop(struct dm_daemon *dd);
+void dm_daemon_wake(struct dm_daemon *dd);
+int dm_daemon_running(struct dm_daemon *dd);
+
+#endif
--- linux-2.4.26-rc1/drivers/md/dm-exception-store.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-exception-store.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,673 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-snapshot.h"
+#include "dm-io.h"
+#include "kcopyd.h"
+
+#include &lt;linux/mm.h&gt;
+#include &lt;linux/pagemap.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+#include &lt;linux/slab.h&gt;
+
+/*-----------------------------------------------------------------
+ * Persistent snapshots: by "persistent" we mean that the
+ * snapshot will survive a reboot.
+ *---------------------------------------------------------------*/
+
+/*
+ * We need to store a record of which parts of the origin have
+ * been copied to the snapshot device.  The snapshot code
+ * requires that we copy exception chunks to chunk-aligned areas
+ * of the COW store.  It makes sense, therefore, to store the
+ * metadata in chunk-sized blocks.
+ *
+ * There is no backward or forward compatibility implemented;
+ * snapshots with a different disk version from the kernel's will
+ * not be usable.  It is expected that "lvcreate" will blank out
+ * the start of a fresh COW device before calling the snapshot
+ * constructor.
+ *
+ * The first chunk of the COW device just contains the header.
+ * After this there is a chunk filled with exception metadata,
+ * followed by as many exception chunks as can fit in the
+ * metadata areas.
+ *
+ * All on disk structures are in little-endian format.  The end
+ * of the exceptions info is indicated by an exception with a
+ * new_chunk of 0, which is invalid since it would point to the
+ * header chunk.
+ */
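
/*
 * Layout arithmetic sketch (not part of the patch): chunk 0 holds the
 * header and each metadata chunk is followed by the data chunks it
 * describes, giving a stride of exceptions_per_area + 1 chunks.  Area
 * 'i' therefore starts at chunk 1 + i * stride, which is the
 * calculation area_io() below makes; persistent_prepare() uses the same
 * stride to skip metadata chunks.  For example, a 64KiB chunk holds
 * 64KiB / 16B = 4096 disk_exceptions, so area 1 begins at chunk
 * 1 + 4097 = 4098.
 */
static inline uint32_t example_area_start(uint32_t exceptions_per_area,
					  uint32_t area)
{
	return 1 + (exceptions_per_area + 1) * area;
}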
+
+/*
+ * Magic for persistent snapshots: "SnAp" - Feeble, isn't it?
+ */
+#define SNAP_MAGIC 0x70416e53
+
+/*
+ * The on-disk version of the metadata.
+ */
+#define SNAPSHOT_DISK_VERSION 1
+
+struct disk_header {
+	uint32_t magic;
+
+	/*
+	 * Is this snapshot valid?  There is no way of recovering
+	 * an invalid snapshot.
+	 */
+	uint32_t valid;
+
+	/*
+	 * Simple, incrementing version.  No backward
+	 * compatibility.
+	 */
+	uint32_t version;
+
+	/* In sectors */
+	uint32_t chunk_size;
+};
+
+struct disk_exception {
+	uint64_t old_chunk;
+	uint64_t new_chunk;
+};
+
+struct commit_callback {
+	void (*callback)(void *, int success);
+	void *context;
+};
+
+/*
+ * The top level structure for a persistent exception store.
+ */
+struct pstore {
+	struct dm_snapshot *snap;	/* up pointer to my snapshot */
+	int version;
+	int valid;
+	uint32_t chunk_size;
+	uint32_t exceptions_per_area;
+
+	/*
+	 * Now that we have an asynchronous kcopyd there is no
+	 * need for large chunk sizes, so it won't hurt to have a
+	 * whole chunk's worth of metadata in memory at once.
+	 */
+	void *area;
+
+	/*
+	 * Used to keep track of which metadata area the data in
+	 * 'chunk' refers to.
+	 */
+	uint32_t current_area;
+
+	/*
+	 * The next free chunk for an exception.
+	 */
+	uint32_t next_free;
+
+	/*
+	 * The index of next free exception in the current
+	 * metadata area.
+	 */
+	uint32_t current_committed;
+
+	atomic_t pending_count;
+	uint32_t callback_count;
+	struct commit_callback *callbacks;
+};
+
+static inline unsigned int sectors_to_pages(unsigned int sectors)
+{
+	return sectors / (PAGE_SIZE / SECTOR_SIZE);
+}
+
+static int alloc_area(struct pstore *ps)
+{
+	int r = -ENOMEM;
+	size_t i, len, nr_pages;
+	struct page *page, *last = NULL;
+
+	len = ps-&gt;chunk_size &lt;&lt; SECTOR_SHIFT;
+
+	/*
+	 * Allocate the chunk_size block of memory that will hold
+	 * a single metadata area.
+	 */
+	ps-&gt;area = vmalloc(len);
+	if (!ps-&gt;area)
+		return r;
+
+	nr_pages = sectors_to_pages(ps-&gt;chunk_size);
+
+	/*
+	 * We lock the pages for ps-&gt;area into memory since
+	 * they'll be doing a lot of io.  We also chain them
+	 * together ready for dm-io.
+	 */
+	for (i = 0; i &lt; nr_pages; i++) {
+		page = vmalloc_to_page(ps-&gt;area + (i * PAGE_SIZE));
+		LockPage(page);
+		if (last)
+			last-&gt;list.next = &amp;page-&gt;list;
+		last = page;
+	}
+
+	return 0;
+}
+
+static void free_area(struct pstore *ps)
+{
+	size_t i, nr_pages;
+	struct page *page;
+
+	nr_pages = sectors_to_pages(ps-&gt;chunk_size);
+	for (i = 0; i &lt; nr_pages; i++) {
+		page = vmalloc_to_page(ps-&gt;area + (i * PAGE_SIZE));
+		page-&gt;list.next = NULL;
+		UnlockPage(page);
+	}
+
+	vfree(ps-&gt;area);
+}
+
+/*
+ * Read or write a chunk aligned and sized block of data from a device.
+ */
+static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
+{
+	struct io_region where;
+	unsigned int bits;
+
+	where.dev = ps-&gt;snap-&gt;cow-&gt;dev;
+	where.sector = ps-&gt;chunk_size * chunk;
+	where.count = ps-&gt;chunk_size;
+
+	return dm_io_sync(1, &amp;where, rw, vmalloc_to_page(ps-&gt;area), 0, &amp;bits);
+}
+
+/*
+ * Read or write a metadata area, remembering to skip the first
+ * chunk, which holds the header.
+ */
+static int area_io(struct pstore *ps, uint32_t area, int rw)
+{
+	int r;
+	uint32_t chunk;
+
+	/* convert a metadata area index to a chunk index */
+	chunk = 1 + ((ps-&gt;exceptions_per_area + 1) * area);
+
+	r = chunk_io(ps, chunk, rw);
+	if (r)
+		return r;
+
+	ps-&gt;current_area = area;
+	return 0;
+}
+
+static int zero_area(struct pstore *ps, uint32_t area)
+{
+	memset(ps-&gt;area, 0, ps-&gt;chunk_size &lt;&lt; SECTOR_SHIFT);
+	return area_io(ps, area, WRITE);
+}
+
+static int read_header(struct pstore *ps, int *new_snapshot)
+{
+	int r;
+	struct disk_header *dh;
+
+	r = chunk_io(ps, 0, READ);
+	if (r)
+		return r;
+
+	dh = (struct disk_header *) ps-&gt;area;
+
+	if (le32_to_cpu(dh-&gt;magic) == 0) {
+		*new_snapshot = 1;
+
+	} else if (le32_to_cpu(dh-&gt;magic) == SNAP_MAGIC) {
+		*new_snapshot = 0;
+		ps-&gt;valid = le32_to_cpu(dh-&gt;valid);
+		ps-&gt;version = le32_to_cpu(dh-&gt;version);
+		ps-&gt;chunk_size = le32_to_cpu(dh-&gt;chunk_size);
+
+	} else {
+		DMWARN("Invalid/corrupt snapshot");
+		r = -ENXIO;
+	}
+
+	return r;
+}
+
+static int write_header(struct pstore *ps)
+{
+	struct disk_header *dh;
+
+	memset(ps-&gt;area, 0, ps-&gt;chunk_size &lt;&lt; SECTOR_SHIFT);
+
+	dh = (struct disk_header *) ps-&gt;area;
+	dh-&gt;magic = cpu_to_le32(SNAP_MAGIC);
+	dh-&gt;valid = cpu_to_le32(ps-&gt;valid);
+	dh-&gt;version = cpu_to_le32(ps-&gt;version);
+	dh-&gt;chunk_size = cpu_to_le32(ps-&gt;chunk_size);
+
+	return chunk_io(ps, 0, WRITE);
+}
+
+/*
+ * Access functions for the disk exceptions, these do the endian conversions.
+ */
+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
+{
+	if (index &gt;= ps-&gt;exceptions_per_area)
+		return NULL;
+
+	return ((struct disk_exception *) ps-&gt;area) + index;
+}
+
+static int read_exception(struct pstore *ps,
+			  uint32_t index, struct disk_exception *result)
+{
+	struct disk_exception *e;
+
+	e = get_exception(ps, index);
+	if (!e)
+		return -EINVAL;
+
+	/* copy it */
+	result-&gt;old_chunk = le64_to_cpu(e-&gt;old_chunk);
+	result-&gt;new_chunk = le64_to_cpu(e-&gt;new_chunk);
+
+	return 0;
+}
+
+static int write_exception(struct pstore *ps,
+			   uint32_t index, struct disk_exception *de)
+{
+	struct disk_exception *e;
+
+	e = get_exception(ps, index);
+	if (!e)
+		return -EINVAL;
+
+	/* copy it */
+	e-&gt;old_chunk = cpu_to_le64(de-&gt;old_chunk);
+	e-&gt;new_chunk = cpu_to_le64(de-&gt;new_chunk);
+
+	return 0;
+}
+
+/*
+ * Registers the exceptions that are present in the current area.
+ * 'full' is filled in to indicate if the area has been
+ * filled.
+ */
+static int insert_exceptions(struct pstore *ps, int *full)
+{
+	int r;
+	unsigned int i;
+	struct disk_exception de;
+
+	/* presume the area is full */
+	*full = 1;
+
+	for (i = 0; i &lt; ps-&gt;exceptions_per_area; i++) {
+		r = read_exception(ps, i, &amp;de);
+
+		if (r)
+			return r;
+
+		/*
+		 * If the new_chunk is pointing at the start of
+		 * the COW device, where the first metadata area
+		 * is, we know that we've hit the end of the
+		 * exceptions.  Therefore the area is not full.
+		 */
+		if (de.new_chunk == 0LL) {
+			ps-&gt;current_committed = i;
+			*full = 0;
+			break;
+		}
+
+		/*
+		 * Keep track of the start of the free chunks.
+		 */
+		if (ps-&gt;next_free &lt;= de.new_chunk)
+			ps-&gt;next_free = de.new_chunk + 1;
+
+		/*
+		 * Otherwise we add the exception to the snapshot.
+		 */
+		r = dm_add_exception(ps-&gt;snap, de.old_chunk, de.new_chunk);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
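
/*
 * Scan sketch (not part of the patch) matching what insert_exceptions()
 * does with one metadata area: walk the on-disk slots until the
 * terminating entry whose new_chunk is 0, which can never be a valid
 * data chunk because chunk 0 holds the header.  'area' points at one
 * chunk's worth of little-endian struct disk_exceptions.
 */
static uint32_t example_count_committed(void *area,
					uint32_t exceptions_per_area)
{
	struct disk_exception *e = area;
	uint32_t i;

	for (i = 0; i &lt; exceptions_per_area; i++)
		if (le64_to_cpu(e[i].new_chunk) == 0)
			break;		/* end marker: area only partly full */

	return i;			/* number of committed exceptions */
}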
+
+static int read_exceptions(struct pstore *ps)
+{
+	uint32_t area;
+	int r, full = 1;
+
+	/*
+	 * Keep reading chunks and inserting exceptions until
+	 * we find a partially full area.
+	 */
+	for (area = 0; full; area++) {
+		r = area_io(ps, area, READ);
+		if (r)
+			return r;
+
+		r = insert_exceptions(ps, &amp;full);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static inline struct pstore *get_info(struct exception_store *store)
+{
+	return (struct pstore *) store-&gt;context;
+}
+
+static void persistent_fraction_full(struct exception_store *store,
+				     sector_t *numerator, sector_t *denominator)
+{
+	*numerator = get_info(store)-&gt;next_free * store-&gt;snap-&gt;chunk_size;
+	*denominator = get_dev_size(store-&gt;snap-&gt;cow-&gt;dev);
+}
+
+static void persistent_destroy(struct exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	dm_io_put(sectors_to_pages(ps-&gt;chunk_size));
+	vfree(ps-&gt;callbacks);
+	free_area(ps);
+	kfree(ps);
+}
+
+static int persistent_read_metadata(struct exception_store *store)
+{
+	int r, new_snapshot;
+	struct pstore *ps = get_info(store);
+
+	/*
+	 * Read the snapshot header.
+	 */
+	r = read_header(ps, &amp;new_snapshot);
+	if (r)
+		return r;
+
+	/*
+	 * Do we need to set up a new snapshot?
+	 */
+	if (new_snapshot) {
+		r = write_header(ps);
+		if (r) {
+			DMWARN("write_header failed");
+			return r;
+		}
+
+		r = zero_area(ps, 0);
+		if (r) {
+			DMWARN("zero_area(0) failed");
+			return r;
+		}
+
+	} else {
+		/*
+		 * Sanity checks.
+		 */
+		if (!ps-&gt;valid) {
+			DMWARN("snapshot is marked invalid");
+			return -EINVAL;
+		}
+
+		if (ps-&gt;version != SNAPSHOT_DISK_VERSION) {
+			DMWARN("unable to handle snapshot disk version %d",
+			       ps-&gt;version);
+			return -EINVAL;
+		}
+
+		/*
+		 * Read the metadata.
+		 */
+		r = read_exceptions(ps);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+static int persistent_prepare(struct exception_store *store,
+			      struct exception *e)
+{
+	struct pstore *ps = get_info(store);
+	uint32_t stride;
+	sector_t size = get_dev_size(store-&gt;snap-&gt;cow-&gt;dev);
+
+	/* Is there enough room ? */
+	if (size &lt; ((ps-&gt;next_free + 1) * store-&gt;snap-&gt;chunk_size))
+		return -ENOSPC;
+
+	e-&gt;new_chunk = ps-&gt;next_free;
+
+	/*
+	 * Move on to the next free pending exception, taking
+	 * into account the location of the metadata chunks.
+	 */
+	stride = (ps-&gt;exceptions_per_area + 1);
+	if ((++ps-&gt;next_free % stride) == 1)
+		ps-&gt;next_free++;
+
+	atomic_inc(&amp;ps-&gt;pending_count);
+	return 0;
+}
+
+static void persistent_commit(struct exception_store *store,
+			      struct exception *e,
+			      void (*callback) (void *, int success),
+			      void *callback_context)
+{
+	int r;
+	unsigned int i;
+	struct pstore *ps = get_info(store);
+	struct disk_exception de;
+	struct commit_callback *cb;
+
+	de.old_chunk = e-&gt;old_chunk;
+	de.new_chunk = e-&gt;new_chunk;
+	write_exception(ps, ps-&gt;current_committed++, &amp;de);
+
+	/*
+	 * Add the callback to the back of the array.  This code
+	 * is the only place where the callback array is
+	 * manipulated, and we know that it will never be called
+	 * multiple times concurrently.
+	 */
+	cb = ps-&gt;callbacks + ps-&gt;callback_count++;
+	cb-&gt;callback = callback;
+	cb-&gt;context = callback_context;
+
+	/*
+	 * If there are no more exceptions in flight, or we have
+	 * filled this metadata area, we commit the exceptions to
+	 * disk.
+	 */
+	if (atomic_dec_and_test(&amp;ps-&gt;pending_count) ||
+	    (ps-&gt;current_committed == ps-&gt;exceptions_per_area)) {
+		r = area_io(ps, ps-&gt;current_area, WRITE);
+		if (r)
+			ps-&gt;valid = 0;
+
+		for (i = 0; i &lt; ps-&gt;callback_count; i++) {
+			cb = ps-&gt;callbacks + i;
+			cb-&gt;callback(cb-&gt;context, r == 0 ? 1 : 0);
+		}
+
+		ps-&gt;callback_count = 0;
+	}
+
+	/*
+	 * Have we completely filled the current area ?
+	 */
+	if (ps-&gt;current_committed == ps-&gt;exceptions_per_area) {
+		ps-&gt;current_committed = 0;
+		r = zero_area(ps, ps-&gt;current_area + 1);
+		if (r)
+			ps-&gt;valid = 0;
+	}
+}
+
+static void persistent_drop(struct exception_store *store)
+{
+	struct pstore *ps = get_info(store);
+
+	ps-&gt;valid = 0;
+	if (write_header(ps))
+		DMWARN("write header failed");
+}
+
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
+{
+	int r;
+	struct pstore *ps;
+
+	r = dm_io_get(sectors_to_pages(chunk_size));
+	if (r)
+		return r;
+
+	/* allocate the pstore */
+	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	ps-&gt;snap = store-&gt;snap;
+	ps-&gt;valid = 1;
+	ps-&gt;version = SNAPSHOT_DISK_VERSION;
+	ps-&gt;chunk_size = chunk_size;
+	ps-&gt;exceptions_per_area = (chunk_size &lt;&lt; SECTOR_SHIFT) /
+	    sizeof(struct disk_exception);
+	ps-&gt;next_free = 2;	/* skipping the header and first area */
+	ps-&gt;current_committed = 0;
+
+	r = alloc_area(ps);
+	if (r)
+		goto bad;
+
+	/*
+	 * Allocate space for all the callbacks.
+	 */
+	ps-&gt;callback_count = 0;
+	atomic_set(&amp;ps-&gt;pending_count, 0);
+	ps-&gt;callbacks = vcalloc(ps-&gt;exceptions_per_area,
+				sizeof(*ps-&gt;callbacks));
+
+	if (!ps-&gt;callbacks) {
+		r = -ENOMEM;
+		goto bad;
+	}
+
+	store-&gt;destroy = persistent_destroy;
+	store-&gt;read_metadata = persistent_read_metadata;
+	store-&gt;prepare_exception = persistent_prepare;
+	store-&gt;commit_exception = persistent_commit;
+	store-&gt;drop_snapshot = persistent_drop;
+	store-&gt;fraction_full = persistent_fraction_full;
+	store-&gt;context = ps;
+
+	return 0;
+
+      bad:
+	dm_io_put(sectors_to_pages(chunk_size));
+	if (ps) {
+		if (ps-&gt;callbacks)
+			vfree(ps-&gt;callbacks);
+
+		kfree(ps);
+	}
+	return r;
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of the store for non-persistent snapshots.
+ *---------------------------------------------------------------*/
+struct transient_c {
+	sector_t next_free;
+};
+
+void transient_destroy(struct exception_store *store)
+{
+	kfree(store-&gt;context);
+}
+
+int transient_read_metadata(struct exception_store *store)
+{
+	return 0;
+}
+
+int transient_prepare(struct exception_store *store, struct exception *e)
+{
+	struct transient_c *tc = (struct transient_c *) store-&gt;context;
+	sector_t size = get_dev_size(store-&gt;snap-&gt;cow-&gt;dev);
+
+	if (size &lt; (tc-&gt;next_free + store-&gt;snap-&gt;chunk_size))
+		return -1;
+
+	e-&gt;new_chunk = sector_to_chunk(store-&gt;snap, tc-&gt;next_free);
+	tc-&gt;next_free += store-&gt;snap-&gt;chunk_size;
+
+	return 0;
+}
+
+void transient_commit(struct exception_store *store,
+		      struct exception *e,
+		      void (*callback) (void *, int success),
+		      void *callback_context)
+{
+	/* Just succeed */
+	callback(callback_context, 1);
+}
+
+static void transient_fraction_full(struct exception_store *store,
+				    sector_t *numerator, sector_t *denominator)
+{
+	*numerator = ((struct transient_c *) store-&gt;context)-&gt;next_free;
+	*denominator = get_dev_size(store-&gt;snap-&gt;cow-&gt;dev);
+}
+
+int dm_create_transient(struct exception_store *store,
+			struct dm_snapshot *s, int blocksize)
+{
+	struct transient_c *tc;
+
+	memset(store, 0, sizeof(*store));
+	store-&gt;destroy = transient_destroy;
+	store-&gt;read_metadata = transient_read_metadata;
+	store-&gt;prepare_exception = transient_prepare;
+	store-&gt;commit_exception = transient_commit;
+	store-&gt;fraction_full = transient_fraction_full;
+	store-&gt;snap = s;
+
+	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
+	if (!tc)
+		return -ENOMEM;
+
+	tc-&gt;next_free = 0;
+	store-&gt;context = tc;
+
+	return 0;
+}
--- linux-2.4.26-rc1/drivers/md/dm-io.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-io.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,361 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-io.h"
+
+#include &lt;linux/mempool.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/sched.h&gt;
+#include &lt;linux/bitops.h&gt;
+
+/* FIXME: can we shrink this ? */
+struct io_context {
+	int rw;
+	unsigned int error;
+	atomic_t count;
+	struct task_struct *sleeper;
+	io_notify_fn callback;
+	void *context;
+};
+
+/*
+ * We maintain a pool of buffer heads for dispatching the io.
+ */
+static unsigned int _num_bhs;
+static mempool_t *_buffer_pool;
+
+/*
+ * io contexts are only dynamically allocated for asynchronous
+ * io.  Since async io is likely to be the majority of io, we'll
+ * have the same number of io contexts as buffer heads!  (FIXME:
+ * must reduce this).
+ */
+mempool_t *_io_pool;
+
+static void *alloc_bh(int gfp_mask, void *pool_data)
+{
+	struct buffer_head *bh;
+
+	bh = kmem_cache_alloc(bh_cachep, gfp_mask);
+	if (bh) {
+		bh-&gt;b_reqnext = NULL;
+		init_waitqueue_head(&amp;bh-&gt;b_wait);
+		INIT_LIST_HEAD(&amp;bh-&gt;b_inode_buffers);
+	}
+
+	return bh;
+}
+
+static void *alloc_io(int gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct io_context), gfp_mask);
+}
+
+static void free_io(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static unsigned int pages_to_buffers(unsigned int pages)
+{
+	return 4 * pages;	/* too many ? */
+}
+
+static int resize_pool(unsigned int new_bhs)
+{
+	int r = 0;
+
+	if (_buffer_pool) {
+		if (new_bhs == 0) {
+			/* free off the pools */
+			mempool_destroy(_buffer_pool);
+			mempool_destroy(_io_pool);
+			_buffer_pool = _io_pool = NULL;
+		} else {
+			/* resize the pools */
+			r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
+			if (!r)
+				r = mempool_resize(_io_pool,
+						   new_bhs, GFP_KERNEL);
+		}
+	} else {
+		/* create new pools */
+		_buffer_pool = mempool_create(new_bhs, alloc_bh,
+					      mempool_free_slab, bh_cachep);
+		if (!_buffer_pool)
+			r = -ENOMEM;
+
+		_io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
+		if (!_io_pool) {
+			mempool_destroy(_buffer_pool);
+			_buffer_pool = NULL;
+			r = -ENOMEM;
+		}
+	}
+
+	if (!r)
+		_num_bhs = new_bhs;
+
+	return r;
+}
+
+int dm_io_get(unsigned int num_pages)
+{
+	return resize_pool(_num_bhs + pages_to_buffers(num_pages));
+}
+
+void dm_io_put(unsigned int num_pages)
+{
+	resize_pool(_num_bhs - pages_to_buffers(num_pages));
+}
+
+/*-----------------------------------------------------------------
+ * We need to keep track of which region a buffer is doing io
+ * for.  In order to save a memory allocation we store this in an
+ * unused field of the buffer head, and provide these access
+ * functions.
+ *
+ * FIXME: add compile time check that an unsigned int can fit
+ * into a pointer.
+ *
+ *---------------------------------------------------------------*/
+static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
+{
+	bh-&gt;b_journal_head = (void *) region;
+}
+
+static inline int bh_get_region(struct buffer_head *bh)
+{
+	return (unsigned int) bh-&gt;b_journal_head;
+}
+
+/*-----------------------------------------------------------------
+ * We need an io object to keep track of the number of bhs that
+ * have been dispatched for a particular io.
+ *---------------------------------------------------------------*/
+static void dec_count(struct io_context *io, unsigned int region, int error)
+{
+	if (error)
+		set_bit(region, &amp;io-&gt;error);
+
+	if (atomic_dec_and_test(&amp;io-&gt;count)) {
+		if (io-&gt;sleeper)
+			wake_up_process(io-&gt;sleeper);
+
+		else {
+			int r = io-&gt;error;
+			io_notify_fn fn = io-&gt;callback;
+			void *context = io-&gt;context;
+
+			mempool_free(io, _io_pool);
+			fn(r, context);
+		}
+	}
+}
+
+static void endio(struct buffer_head *bh, int uptodate)
+{
+	struct io_context *io = (struct io_context *) bh-&gt;b_private;
+
+	if (!uptodate &amp;&amp; io-&gt;rw != WRITE) {
+		/*
+		 * We need to zero this region, otherwise people
+		 * like kcopyd may write the arbitrary contents
+		 * of the page.
+		 */
+		memset(bh-&gt;b_data, 0, bh-&gt;b_size);
+	}
+
+	dec_count((struct io_context *) bh-&gt;b_private,
+		  bh_get_region(bh), !uptodate);
+	mempool_free(bh, _buffer_pool);
+}
+
+/*
+ * Primitives for alignment calculations.
+ */
+int fls(unsigned n)
+{
+	return generic_fls32(n);
+}
+
+static inline int log2_floor(unsigned n)
+{
+	return ffs(n) - 1;
+}
+
+static inline int log2_align(unsigned n)
+{
+	return fls(n) - 1;
+}
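
/*
 * Worked example (not part of the patch): log2_align() rounds a length
 * down to a power of two (fls(12) - 1 = 3, so 1 &lt;&lt; 3 = 8 blocks), and
 * log2_floor() gives the largest power of two dividing the start block
 * (ffs(24) - 1 = 3, so an io starting at block 24 can cover at most 8
 * blocks and remain naturally aligned).  do_page() below takes the
 * minimum of the two, as this illustrative helper does:
 */
static inline unsigned int example_max_io_blocks(unsigned int len,
						 unsigned int b)
{
	unsigned int size = 1 &lt;&lt; log2_align(len);	/* power-of-two length */

	if (b &amp;&amp; (1 &lt;&lt; log2_floor(b)) &lt; size)
		size = 1 &lt;&lt; log2_floor(b);	/* stay aligned to block b */

	return size;
}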
+
+/*
+ * Returns the next block for io.
+ */
+static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
+		   unsigned int block_size,
+		   struct page *p, unsigned int offset,
+		   unsigned int region, struct io_context *io)
+{
+	struct buffer_head *bh;
+	sector_t b = *block;
+	sector_t blocks_per_page = PAGE_SIZE / block_size;
+	unsigned int this_size; /* holds the size of the current io */
+	sector_t len;
+
+	if (!blocks_per_page) {
+		DMERR("dm-io: PAGE_SIZE (%lu) &lt; block_size (%u) unsupported",
+		      PAGE_SIZE, block_size);
+		return 0;
+	}
+
+	while ((offset &lt; PAGE_SIZE) &amp;&amp; (b != end_block)) {
+		bh = mempool_alloc(_buffer_pool, GFP_NOIO);
+		init_buffer(bh, endio, io);
+		bh_set_region(bh, region);
+
+		/*
+		 * Block size must be a power of 2 and aligned
+		 * correctly.
+		 */
+
+		len = min(end_block - b, blocks_per_page);
+		len = min(len, blocks_per_page - offset / block_size);
+
+		if (!len) {
+			DMERR("dm-io: Invalid offset/block_size (%u/%u).",
+			      offset, block_size);
+			return 0;
+		}
+
+		this_size = 1 &lt;&lt; log2_align(len);
+		if (b)
+			this_size = min(this_size,
+					(unsigned) 1 &lt;&lt; log2_floor(b));
+
+		/*
+		 * Add in the job offset.
+		 */
+		bh-&gt;b_blocknr = (b / this_size);
+		bh-&gt;b_size = block_size * this_size;
+		set_bh_page(bh, p, offset);
+		bh-&gt;b_this_page = bh;
+
+		bh-&gt;b_dev = dev;
+		atomic_set(&amp;bh-&gt;b_count, 1);
+
+		bh-&gt;b_state = ((1 &lt;&lt; BH_Uptodate) | (1 &lt;&lt; BH_Mapped) |
+			       (1 &lt;&lt; BH_Lock));
+
+		if (io-&gt;rw == WRITE)
+			clear_bit(BH_Dirty, &amp;bh-&gt;b_state);
+
+		atomic_inc(&amp;io-&gt;count);
+		submit_bh(io-&gt;rw, bh);
+
+		b += this_size;
+		offset += block_size * this_size;
+	}
+
+	*block = b;
+	return (b == end_block);
+}
+
+static void do_region(unsigned int region, struct io_region *where,
+		      struct page *page, unsigned int offset,
+		      struct io_context *io)
+{
+	unsigned int block_size = get_hardsect_size(where-&gt;dev);
+	unsigned int sblock_size = block_size &gt;&gt; 9;
+	sector_t block = where-&gt;sector / sblock_size;
+	sector_t end_block = (where-&gt;sector + where-&gt;count) / sblock_size;
+
+	while (1) {
+		if (do_page(where-&gt;dev, &amp;block, end_block, block_size,
+			    page, offset, region, io))
+			break;
+
+		offset = 0;	/* only offset the first page */
+
+		page = list_entry(page-&gt;list.next, struct page, list);
+	}
+}
+
+static void dispatch_io(unsigned int num_regions, struct io_region *where,
+			struct page *pages, unsigned int offset,
+			struct io_context *io)
+{
+	int i;
+
+	for (i = 0; i &lt; num_regions; i++)
+		if (where[i].count)
+			do_region(i, where + i, pages, offset, io);
+
+	/*
+	 * Drop the extra reference that we were holding to avoid
+	 * the io being completed too early.
+	 */
+	dec_count(io, 0, 0);
+}
+
+/*
+ * Synchronous io
+ */
+int dm_io_sync(unsigned int num_regions, struct io_region *where,
+	       int rw, struct page *pages, unsigned int offset,
+	       unsigned int *error_bits)
+{
+	struct io_context io;
+
+	BUG_ON(num_regions &gt; 1 &amp;&amp; rw != WRITE);
+
+	io.rw = rw;
+	io.error = 0;
+	atomic_set(&amp;io.count, 1); /* see dispatch_io() */
+	io.sleeper = current;
+
+	dispatch_io(num_regions, where, pages, offset, &amp;io);
+	run_task_queue(&amp;tq_disk);
+
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+
+		if (!atomic_read(&amp;io.count))
+			break;
+
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+
+	*error_bits = io.error;
+	return io.error ? -EIO : 0;
+}
+
+/*
+ * Asynchronous io
+ */
+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+		struct page *pages, unsigned int offset,
+		io_notify_fn fn, void *context)
+{
+	struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
+
+	io-&gt;rw = rw;
+	io-&gt;error = 0;
+	atomic_set(&amp;io-&gt;count, 1); /* see dispatch_io() */
+	io-&gt;sleeper = NULL;
+	io-&gt;callback = fn;
+	io-&gt;context = context;
+
+	dispatch_io(num_regions, where, pages, offset, io);
+	return 0;
+}
+
+EXPORT_SYMBOL(dm_io_get);
+EXPORT_SYMBOL(dm_io_put);
+EXPORT_SYMBOL(dm_io_sync);
+EXPORT_SYMBOL(dm_io_async);
--- linux-2.4.26-rc1/drivers/md/dm-io.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-io.h	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,86 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef _DM_IO_H
+#define _DM_IO_H
+
+#include "dm.h"
+
+#include &lt;linux/list.h&gt;
+
+/* Move these to bitops.h eventually */
+/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
+/* (c) 2002, D.Phillips and Sistina Software */
+/* Licensed under Version 2 of the GPL */
+
+static unsigned generic_fls8(unsigned n)
+{
+	return n &amp; 0xf0 ?
+	    n &amp; 0xc0 ? (n &gt;&gt; 7) + 7 : (n &gt;&gt; 5) + 5:
+	    n &amp; 0x0c ? (n &gt;&gt; 3) + 3 : n - ((n + 1) &gt;&gt; 2);
+}
+
+static inline unsigned generic_fls16(unsigned n)
+{
+	return	n &amp; 0xff00? generic_fls8(n &gt;&gt; 8) + 8 : generic_fls8(n);
+}
+
+static inline unsigned generic_fls32(unsigned n)
+{
+	return	n &amp; 0xffff0000 ? generic_fls16(n &gt;&gt; 16) + 16 : generic_fls16(n);
+}
+
+/* FIXME make this configurable */
+#define DM_MAX_IO_REGIONS 8
+
+struct io_region {
+	kdev_t dev;
+	sector_t sector;
+	sector_t count;
+};
+
+
+/*
+ * 'error' is a bitset, with each bit indicating whether an error
+ * occurred doing io to the corresponding region.
+ */
+typedef void (*io_notify_fn)(unsigned int error, void *context);
+
+
+/*
+ * Before anyone uses the IO interface they should call
+ * dm_io_get(), specifying roughly how many pages they are
+ * expecting to perform io on concurrently.
+ *
+ * This function may block.
+ */
+int dm_io_get(unsigned int num_pages);
+void dm_io_put(unsigned int num_pages);
+
+
+/*
+ * Synchronous IO.
+ *
+ * Please ensure that the rw flag in the next two functions is
+ * either READ or WRITE, i.e. we don't take READA.  Any
+ * regions with a zero count field will be ignored.
+ */
+int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
+	       struct page *pages, unsigned int offset,
+	       unsigned int *error_bits);
+
+
+/*
+ * Asynchronous IO.
+ *
+ * The 'where' array may be safely allocated on the stack since
+ * the function takes a copy.
+ */
+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
+		struct page *pages, unsigned int offset,
+		io_notify_fn fn, void *context);
+
+#endif
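
/*
 * Usage sketch (not part of the patch), following the comments above:
 * reserve pages first, perform the synchronous io, then release the
 * reservation.  The helper name and the nr_pages bookkeeping are
 * illustrative; 'pages' is a dm-io style chained page list.
 */
static int example_read_region(kdev_t dev, sector_t start, sector_t count,
			       struct page *pages, unsigned int nr_pages)
{
	struct io_region where;
	unsigned int error_bits;
	int r;

	r = dm_io_get(nr_pages);	/* may block while the pools grow */
	if (r)
		return r;

	where.dev = dev;
	where.sector = start;
	where.count = count;

	r = dm_io_sync(1, &amp;where, READ, pages, 0, &amp;error_bits);

	dm_io_put(nr_pages);
	return r;
}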
--- linux-2.4.26-rc1/drivers/md/dm-ioctl.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-ioctl.c	Wed Mar 31 14:02:59 2004
@@ -0,0 +1,1366 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+#include &lt;linux/miscdevice.h&gt;
+#include &lt;linux/dm-ioctl.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/wait.h&gt;
+#include &lt;linux/blk.h&gt;
+#include &lt;linux/slab.h&gt;
+
+#include &lt;asm/uaccess.h&gt;
+
+#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
+
+/*-----------------------------------------------------------------
+ * The ioctl interface needs to be able to look up devices by
+ * name or uuid.
+ *---------------------------------------------------------------*/
+struct hash_cell {
+	struct list_head name_list;
+	struct list_head uuid_list;
+
+	char *name;
+	char *uuid;
+	struct mapped_device *md;
+	struct dm_table *new_map;
+
+	/* I hate devfs */
+	devfs_handle_t devfs_entry;
+};
+
+struct vers_iter {
+	size_t param_size;
+	struct dm_target_versions *vers, *old_vers;
+	char *end;
+	uint32_t flags;
+};
+
+#define NUM_BUCKETS 64
+#define MASK_BUCKETS (NUM_BUCKETS - 1)
+static struct list_head _name_buckets[NUM_BUCKETS];
+static struct list_head _uuid_buckets[NUM_BUCKETS];
+
+static devfs_handle_t _dev_dir;
+void dm_hash_remove_all(void);
+
+/*
+ * Guards access to both hash tables.
+ */
+static DECLARE_RWSEM(_hash_lock);
+
+static void init_buckets(struct list_head *buckets)
+{
+	unsigned int i;
+
+	for (i = 0; i &lt; NUM_BUCKETS; i++)
+		INIT_LIST_HEAD(buckets + i);
+}
+
+int dm_hash_init(void)
+{
+	init_buckets(_name_buckets);
+	init_buckets(_uuid_buckets);
+	_dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
+	return 0;
+}
+
+void dm_hash_exit(void)
+{
+	dm_hash_remove_all();
+	devfs_unregister(_dev_dir);
+}
+
+/*-----------------------------------------------------------------
+ * Hash function:
+ * We're not really concerned with the str hash function being
+ * fast since it's only used by the ioctl interface.
+ *---------------------------------------------------------------*/
+static unsigned int hash_str(const char *str)
+{
+	const unsigned int hash_mult = 2654435387U;
+	unsigned int h = 0;
+
+	while (*str)
+		h = (h + (unsigned int) *str++) * hash_mult;
+
+	return h &amp; MASK_BUCKETS;
+}
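
/*
 * Lookup sketch (not part of the patch): both tables are indexed the
 * same way, so a name lookup is just a walk of the bucket this picks,
 * exactly as __get_name_cell() and __get_uuid_cell() do below.
 */
static inline struct list_head *example_name_bucket(const char *name)
{
	return _name_buckets + hash_str(name);
}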
+
+/*-----------------------------------------------------------------
+ * Code for looking up a device by name
+ *---------------------------------------------------------------*/
+static struct hash_cell *__get_name_cell(const char *str)
+{
+	struct list_head *tmp;
+	struct hash_cell *hc;
+	unsigned int h = hash_str(str);
+
+	list_for_each (tmp, _name_buckets + h) {
+		hc = list_entry(tmp, struct hash_cell, name_list);
+		if (!strcmp(hc-&gt;name, str))
+			return hc;
+	}
+
+	return NULL;
+}
+
+static struct hash_cell *__get_uuid_cell(const char *str)
+{
+	struct list_head *tmp;
+	struct hash_cell *hc;
+	unsigned int h = hash_str(str);
+
+	list_for_each (tmp, _uuid_buckets + h) {
+		hc = list_entry(tmp, struct hash_cell, uuid_list);
+		if (!strcmp(hc-&gt;uuid, str))
+			return hc;
+	}
+
+	return NULL;
+}
+
+/*-----------------------------------------------------------------
+ * Inserting, removing and renaming a device.
+ *---------------------------------------------------------------*/
+static inline char *kstrdup(const char *str)
+{
+	char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
+	if (r)
+		strcpy(r, str);
+	return r;
+}
+
+static struct hash_cell *alloc_cell(const char *name, const char *uuid,
+				    struct mapped_device *md)
+{
+	struct hash_cell *hc;
+
+	hc = kmalloc(sizeof(*hc), GFP_KERNEL);
+	if (!hc)
+		return NULL;
+
+	hc-&gt;name = kstrdup(name);
+	if (!hc-&gt;name) {
+		kfree(hc);
+		return NULL;
+	}
+
+	if (!uuid)
+		hc-&gt;uuid = NULL;
+
+	else {
+		hc-&gt;uuid = kstrdup(uuid);
+		if (!hc-&gt;uuid) {
+			kfree(hc-&gt;name);
+			kfree(hc);
+			return NULL;
+		}
+	}
+
+	INIT_LIST_HEAD(&amp;hc-&gt;name_list);
+	INIT_LIST_HEAD(&amp;hc-&gt;uuid_list);
+	hc-&gt;md = md;
+	hc-&gt;new_map = NULL;
+	return hc;
+}
+
+static void free_cell(struct hash_cell *hc)
+{
+	if (hc) {
+		kfree(hc-&gt;name);
+		kfree(hc-&gt;uuid);
+		kfree(hc);
+	}
+}
+
+/*
+ * devfs stuff.
+ */
+static int register_with_devfs(struct hash_cell *hc)
+{
+	kdev_t dev = dm_kdev(hc-&gt;md);
+
+	hc-&gt;devfs_entry =
+	    devfs_register(_dev_dir, hc-&gt;name, DEVFS_FL_CURRENT_OWNER,
+			   major(dev), minor(dev),
+			   S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
+			   &amp;dm_blk_dops, NULL);
+
+	return 0;
+}
+
+static int unregister_with_devfs(struct hash_cell *hc)
+{
+	devfs_unregister(hc-&gt;devfs_entry);
+	return 0;
+}
+
+/*
+ * The kdev_t and uuid of a device can never change once it is
+ * initially inserted.
+ */
+int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
+{
+	struct hash_cell *cell;
+
+	/*
+	 * Allocate the new cells.
+	 */
+	cell = alloc_cell(name, uuid, md);
+	if (!cell)
+		return -ENOMEM;
+
+	/*
+	 * Insert the cell into both hash tables.
+	 */
+	down_write(&amp;_hash_lock);
+	if (__get_name_cell(name))
+		goto bad;
+
+	list_add(&amp;cell-&gt;name_list, _name_buckets + hash_str(name));
+
+	if (uuid) {
+		if (__get_uuid_cell(uuid)) {
+			list_del(&amp;cell-&gt;name_list);
+			goto bad;
+		}
+		list_add(&amp;cell-&gt;uuid_list, _uuid_buckets + hash_str(uuid));
+	}
+	register_with_devfs(cell);
+	dm_get(md);
+	up_write(&amp;_hash_lock);
+
+	return 0;
+
+      bad:
+	up_write(&amp;_hash_lock);
+	free_cell(cell);
+	return -EBUSY;
+}
+
+void __hash_remove(struct hash_cell *hc)
+{
+	/* remove from the dev hash */
+	list_del(&amp;hc-&gt;uuid_list);
+	list_del(&amp;hc-&gt;name_list);
+	unregister_with_devfs(hc);
+	dm_put(hc-&gt;md);
+	if (hc-&gt;new_map)
+		dm_table_put(hc-&gt;new_map);
+	free_cell(hc);
+}
+
+void dm_hash_remove_all(void)
+{
+	int i;
+	struct hash_cell *hc;
+	struct list_head *tmp, *n;
+
+	down_write(&amp;_hash_lock);
+	for (i = 0; i &lt; NUM_BUCKETS; i++) {
+		list_for_each_safe (tmp, n, _name_buckets + i) {
+			hc = list_entry(tmp, struct hash_cell, name_list);
+			__hash_remove(hc);
+		}
+	}
+	up_write(&amp;_hash_lock);
+}
+
+int dm_hash_rename(const char *old, const char *new)
+{
+	char *new_name, *old_name;
+	struct hash_cell *hc;
+
+	/*
+	 * duplicate new.
+	 */
+	new_name = kstrdup(new);
+	if (!new_name)
+		return -ENOMEM;
+
+	down_write(&amp;_hash_lock);
+
+	/*
+	 * Is new free ?
+	 */
+	hc = __get_name_cell(new);
+	if (hc) {
+		DMWARN("asked to rename to an already existing name %s -&gt; %s",
+		       old, new);
+		up_write(&amp;_hash_lock);
+		kfree(new_name);
+		return -EBUSY;
+	}
+
+	/*
+	 * Is there such a device as 'old' ?
+	 */
+	hc = __get_name_cell(old);
+	if (!hc) {
+		DMWARN("asked to rename a non existent device %s -&gt; %s",
+		       old, new);
+		up_write(&amp;_hash_lock);
+		kfree(new_name);
+		return -ENXIO;
+	}
+
+	/*
+	 * rename and move the name cell.
+	 */
+	list_del(&amp;hc-&gt;name_list);
+	old_name = hc-&gt;name;
+	hc-&gt;name = new_name;
+	list_add(&amp;hc-&gt;name_list, _name_buckets + hash_str(new_name));
+
+	/* rename the device node in devfs */
+	unregister_with_devfs(hc);
+	register_with_devfs(hc);
+
+	up_write(&amp;_hash_lock);
+	kfree(old_name);
+	return 0;
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of the ioctl commands
+ *---------------------------------------------------------------*/
+/*
+ * All the ioctl commands get dispatched to functions with this
+ * prototype.
+ */
+typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
+
+static int remove_all(struct dm_ioctl *param, size_t param_size)
+{
+	dm_hash_remove_all();
+	param-&gt;data_size = 0;
+	return 0;
+}
+
+/*
+ * Round up the ptr to an 8-byte boundary.
+ */
+#define ALIGN_MASK 7
+static inline void *align_ptr(void *ptr)
+{
+	return (void *) (((size_t) (ptr + ALIGN_MASK)) &amp; ~ALIGN_MASK);
+}
+
+/*
+ * Retrieves the data payload buffer from an already allocated
+ * struct dm_ioctl.
+ */
+static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
+			       size_t *len)
+{
+	param-&gt;data_start = align_ptr(param + 1) - (void *) param;
+
+	if (param-&gt;data_start &lt; param_size)
+		*len = param_size - param-&gt;data_start;
+	else
+		*len = 0;
+
+	return ((void *) param) + param-&gt;data_start;
+}
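
/*
 * Buffer layout sketch (not part of the patch): the caller hands in one
 * flat allocation of param-&gt;data_size bytes; struct dm_ioctl sits at
 * the front and results are written into the 8-byte aligned payload
 * that follows it, e.g.:
 *
 *	0           data_start                               param_size
 *	+-----------+------------------------------------------------+
 *	| dm_ioctl  | payload (param_size - data_start bytes)         |
 *	+-----------+------------------------------------------------+
 *
 * Commands such as retrieve_deps() and retrieve_status() below fill the
 * payload and then bump param-&gt;data_size to cover what they used.
 */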
+
+static int list_devices(struct dm_ioctl *param, size_t param_size)
+{
+	unsigned int i;
+	struct hash_cell *hc;
+	size_t len, needed = 0;
+	struct dm_name_list *nl, *old_nl = NULL;
+
+	down_write(&amp;_hash_lock);
+
+	/*
+	 * Loop through all the devices, working out how much
+	 * space we need.
+	 */
+	for (i = 0; i &lt; NUM_BUCKETS; i++) {
+		list_for_each_entry (hc, _name_buckets + i, name_list) {
+			needed += sizeof(struct dm_name_list);
+			needed += strlen(hc-&gt;name) + 1;
+			needed += ALIGN_MASK;
+		}
+	}
+
+	/*
+	 * Grab our output buffer.
+	 */
+	nl = get_result_buffer(param, param_size, &amp;len);
+	if (len &lt; needed) {
+		param-&gt;flags |= DM_BUFFER_FULL_FLAG;
+		goto out;
+	}
+	param-&gt;data_size = param-&gt;data_start + needed;
+
+	nl-&gt;dev = 0;	/* Flags no data */
+
+	/*
+	 * Now loop through filling out the names.
+	 */
+	for (i = 0; i &lt; NUM_BUCKETS; i++) {
+		list_for_each_entry (hc, _name_buckets + i, name_list) {
+			if (old_nl)
+				old_nl-&gt;next = (uint32_t) ((void *) nl -
+							   (void *) old_nl);
+
+			nl-&gt;dev = dm_kdev(hc-&gt;md);
+			nl-&gt;next = 0;
+			strcpy(nl-&gt;name, hc-&gt;name);
+
+			old_nl = nl;
+			nl = align_ptr(((void *) ++nl) + strlen(hc-&gt;name) + 1);
+		}
+	}
+
+ out:
+	up_write(&amp;_hash_lock);
+	return 0;
+}
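
/*
 * Reader sketch (not part of the patch) for the result built above: each
 * dm_name_list record is followed by its name, 'next' is the byte offset
 * from one record to the following one (0 on the last), and a dev of 0
 * in the first record means the list is empty.
 */
static void example_walk_names(struct dm_ioctl *param)
{
	struct dm_name_list *nl;

	nl = (struct dm_name_list *) ((char *) param + param-&gt;data_start);
	if (!nl-&gt;dev)
		return;			/* no devices */

	while (1) {
		/* nl-&gt;dev and nl-&gt;name identify one mapped device */
		if (!nl-&gt;next)
			break;
		nl = (struct dm_name_list *) ((char *) nl + nl-&gt;next);
	}
}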
+
+static void list_version_get_needed(struct target_type *tt, void *param)
+{
+	int *needed = param;
+
+	*needed += strlen(tt-&gt;name);
+	*needed += sizeof(tt-&gt;version);
+	*needed += ALIGN_MASK;
+}
+
+static void list_version_get_info(struct target_type *tt, void *param)
+{
+	struct vers_iter *info = param;
+
+	/* Check space - it might have changed since the first iteration */
+	if ((char *)info-&gt;vers + sizeof(tt-&gt;version) + strlen(tt-&gt;name) + 1 &gt;
+	    info-&gt;end) {
+		info-&gt;flags = DM_BUFFER_FULL_FLAG;
+		return;
+	}
+
+	if (info-&gt;old_vers)
+		info-&gt;old_vers-&gt;next = (uint32_t) ((void *)info-&gt;vers -
+		    (void *)info-&gt;old_vers);
+
+	info-&gt;vers-&gt;version[0] = tt-&gt;version[0];
+	info-&gt;vers-&gt;version[1] = tt-&gt;version[1];
+	info-&gt;vers-&gt;version[2] = tt-&gt;version[2];
+	info-&gt;vers-&gt;next = 0;
+	strcpy(info-&gt;vers-&gt;name, tt-&gt;name);
+
+	info-&gt;old_vers = info-&gt;vers;
+	info-&gt;vers = align_ptr(((void *) ++info-&gt;vers) + strlen(tt-&gt;name) + 1);
+}
+
+static int list_versions(struct dm_ioctl *param, size_t param_size)
+{
+	size_t len, needed = 0;
+	struct dm_target_versions *vers;
+	struct vers_iter iter_info;
+
+	/*
+	 * Loop through all the targets, working out how much
+	 * space we need.
+	 */
+	dm_target_iterate(list_version_get_needed, &amp;needed);
+
+	/*
+	 * Grab our output buffer.
+	 */
+	vers = get_result_buffer(param, param_size, &amp;len);
+	if (len &lt; needed) {
+		param-&gt;flags |= DM_BUFFER_FULL_FLAG;
+		goto out;
+	}
+	param-&gt;data_size = param-&gt;data_start + needed;
+
+	iter_info.param_size = param_size;
+	iter_info.old_vers = NULL;
+	iter_info.vers = vers;
+	iter_info.flags = 0;
+	iter_info.end = (char *)vers+len;
+
+	/*
+	 * Now loop through filling out the names &amp; versions.
+	 */
+	dm_target_iterate(list_version_get_info, &amp;iter_info);
+	param-&gt;flags |= iter_info.flags;
+
+      out:
+	return 0;
+}
+
+static int check_name(const char *name)
+{
+	if (strchr(name, '/')) {
+		DMWARN("invalid device name");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Fills in a dm_ioctl structure, ready for sending back to
+ * userland.
+ */
+static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
+{
+	kdev_t dev = dm_kdev(md);
+	struct dm_table *table;
+	struct block_device *bdev;
+
+	param-&gt;flags &amp;= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
+			  DM_ACTIVE_PRESENT_FLAG);
+
+	if (dm_suspended(md))
+		param-&gt;flags |= DM_SUSPEND_FLAG;
+
+	param-&gt;dev = kdev_t_to_nr(dev);
+
+	if (is_read_only(dev))
+		param-&gt;flags |= DM_READONLY_FLAG;
+
+	param-&gt;event_nr = dm_get_event_nr(md);
+
+	table = dm_get_table(md);
+	if (table) {
+		param-&gt;flags |= DM_ACTIVE_PRESENT_FLAG;
+		param-&gt;target_count = dm_table_get_num_targets(table);
+		dm_table_put(table);
+	} else
+		param-&gt;target_count = 0;
+
+	bdev = bdget(param-&gt;dev);
+	if (!bdev)
+		return -ENXIO;
+	param-&gt;open_count = bdev-&gt;bd_openers;
+	bdput(bdev);
+
+	return 0;
+}
+
+static int dev_create(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	kdev_t dev = 0;
+	struct mapped_device *md;
+
+	r = check_name(param-&gt;name);
+	if (r)
+		return r;
+
+	if (param-&gt;flags &amp; DM_PERSISTENT_DEV_FLAG)
+		dev = to_kdev_t(param-&gt;dev);
+
+	r = dm_create(dev, &amp;md);
+	if (r)
+		return r;
+
+	r = dm_hash_insert(param-&gt;name, *param-&gt;uuid ? param-&gt;uuid : NULL, md);
+	if (r) {
+		dm_put(md);
+		return r;
+	}
+
+	param-&gt;flags &amp;= ~DM_INACTIVE_PRESENT_FLAG;
+
+	r = __dev_status(md, param);
+	dm_put(md);
+
+	return r;
+}
+
+/*
+ * Always use UUID for lookups if it's present, otherwise use name.
+ */
+static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
+{
+	return *param-&gt;uuid ?
+	    __get_uuid_cell(param-&gt;uuid) : __get_name_cell(param-&gt;name);
+}
+
+static inline struct mapped_device *find_device(struct dm_ioctl *param)
+{
+	struct hash_cell *hc;
+	struct mapped_device *md = NULL;
+
+	down_read(&amp;_hash_lock);
+	hc = __find_device_hash_cell(param);
+	if (hc) {
+		md = hc-&gt;md;
+
+		/*
+		 * Sneakily write in both the name and the uuid
+		 * while we have the cell.
+		 */
+		strncpy(param-&gt;name, hc-&gt;name, sizeof(param-&gt;name));
+		if (hc-&gt;uuid)
+			strncpy(param-&gt;uuid, hc-&gt;uuid, sizeof(param-&gt;uuid) - 1);
+		else
+			param-&gt;uuid[0] = '\0';
+
+		if (hc-&gt;new_map)
+			param-&gt;flags |= DM_INACTIVE_PRESENT_FLAG;
+		else
+			param-&gt;flags &amp;= ~DM_INACTIVE_PRESENT_FLAG;
+
+		dm_get(md);
+	}
+	up_read(&amp;_hash_lock);
+
+	return md;
+}
+
+static int dev_remove(struct dm_ioctl *param, size_t param_size)
+{
+	struct hash_cell *hc;
+
+	down_write(&amp;_hash_lock);
+	hc = __find_device_hash_cell(param);
+
+	if (!hc) {
+		DMWARN("device doesn't appear to be in the dev hash table.");
+		up_write(&amp;_hash_lock);
+		return -ENXIO;
+	}
+
+	__hash_remove(hc);
+	up_write(&amp;_hash_lock);
+	param-&gt;data_size = 0;
+	return 0;
+}
+
+/*
+ * Check a string doesn't overrun the chunk of
+ * memory we copied from userland.
+ */
+static int invalid_str(char *str, void *end)
+{
+	while ((void *) str &lt; end)
+		if (!*str++)
+			return 0;
+
+	return -EINVAL;
+}
+
+static int dev_rename(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	char *new_name = (char *) param + param-&gt;data_start;
+
+	if (new_name &lt; (char *) (param + 1) ||
+	    invalid_str(new_name, (void *) param + param_size)) {
+		DMWARN("Invalid new logical volume name supplied.");
+		return -EINVAL;
+	}
+
+	r = check_name(new_name);
+	if (r)
+		return r;
+
+	param-&gt;data_size = 0;
+	return dm_hash_rename(param-&gt;name, new_name);
+}
+
+static int do_suspend(struct dm_ioctl *param)
+{
+	int r = 0;
+	struct mapped_device *md;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	if (!dm_suspended(md))
+		r = dm_suspend(md);
+
+	if (!r)
+		r = __dev_status(md, param);
+
+	dm_put(md);
+	return r;
+}
+
+static int do_resume(struct dm_ioctl *param)
+{
+	int r = 0;
+	struct hash_cell *hc;
+	struct mapped_device *md;
+	struct dm_table *new_map;
+
+	down_write(&amp;_hash_lock);
+
+	hc = __find_device_hash_cell(param);
+	if (!hc) {
+		DMWARN("device doesn't appear to be in the dev hash table.");
+		up_write(&amp;_hash_lock);
+		return -ENXIO;
+	}
+
+	md = hc-&gt;md;
+	dm_get(md);
+
+	new_map = hc-&gt;new_map;
+	hc-&gt;new_map = NULL;
+	param-&gt;flags &amp;= ~DM_INACTIVE_PRESENT_FLAG;
+
+	up_write(&amp;_hash_lock);
+
+	/* Do we need to load a new map ? */
+	if (new_map) {
+		/* Suspend if it isn't already suspended */
+		if (!dm_suspended(md))
+			dm_suspend(md);
+
+		r = dm_swap_table(md, new_map);
+		if (r) {
+			dm_put(md);
+			dm_table_put(new_map);
+			return r;
+		}
+
+		if (dm_table_get_mode(new_map) &amp; FMODE_WRITE)
+			set_device_ro(dm_kdev(md), 0);
+		else
+			set_device_ro(dm_kdev(md), 1);
+
+		dm_table_put(new_map);
+	}
+
+	if (dm_suspended(md))
+		r = dm_resume(md);
+
+	if (!r)
+		r = __dev_status(md, param);
+
+	dm_put(md);
+	return r;
+}
+
+/*
+ * Set or unset the suspension state of a device.
+ * If the device already is in the requested state we just return its status.
+ */
+static int dev_suspend(struct dm_ioctl *param, size_t param_size)
+{
+	if (param-&gt;flags &amp; DM_SUSPEND_FLAG)
+		return do_suspend(param);
+
+	return do_resume(param);
+}
+
+/*
+ * Copies device info back to user space, used by
+ * the create and info ioctls.
+ */
+static int dev_status(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct mapped_device *md;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	r = __dev_status(md, param);
+	dm_put(md);
+	return r;
+}
+
+/*
+ * Build up the status struct for each target
+ */
+static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
+			    size_t param_size)
+{
+	unsigned int i, num_targets;
+	struct dm_target_spec *spec;
+	char *outbuf, *outptr;
+	status_type_t type;
+	size_t remaining, len, used = 0;
+
+	outptr = outbuf = get_result_buffer(param, param_size, &amp;len);
+
+	if (param-&gt;flags &amp; DM_STATUS_TABLE_FLAG)
+		type = STATUSTYPE_TABLE;
+	else
+		type = STATUSTYPE_INFO;
+
+	/* Get all the target info */
+	num_targets = dm_table_get_num_targets(table);
+	for (i = 0; i &lt; num_targets; i++) {
+		struct dm_target *ti = dm_table_get_target(table, i);
+
+		remaining = len - (outptr - outbuf);
+		if (remaining &lt;= sizeof(struct dm_target_spec)) {
+			param-&gt;flags |= DM_BUFFER_FULL_FLAG;
+			break;
+		}
+
+		spec = (struct dm_target_spec *) outptr;
+
+		spec-&gt;status = 0;
+		spec-&gt;sector_start = ti-&gt;begin;
+		spec-&gt;length = ti-&gt;len;
+		strncpy(spec-&gt;target_type, ti-&gt;type-&gt;name,
+			sizeof(spec-&gt;target_type));
+
+		outptr += sizeof(struct dm_target_spec);
+		remaining = len - (outptr - outbuf);
+
+		/* Get the status/table string from the target driver */
+		if (ti-&gt;type-&gt;status) {
+			if (ti-&gt;type-&gt;status(ti, type, outptr, remaining)) {
+				param-&gt;flags |= DM_BUFFER_FULL_FLAG;
+				break;
+			}
+		} else
+			outptr[0] = '\0';
+
+		outptr += strlen(outptr) + 1;
+		used = param-&gt;data_start + (outptr - outbuf);
+
+		align_ptr(outptr);
+		spec-&gt;next = outptr - outbuf;
+	}
+
+	if (used)
+		param-&gt;data_size = used;
+
+	param-&gt;target_count = num_targets;
+}
+
+/*
+ * Wait for a device to report an event
+ */
+static int dev_wait(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct mapped_device *md;
+	struct dm_table *table;
+	DECLARE_WAITQUEUE(wq, current);
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	/*
+	 * Wait for a notification event
+	 */
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (!dm_add_wait_queue(md, &amp;wq, param-&gt;event_nr)) {
+		schedule();
+		dm_remove_wait_queue(md, &amp;wq);
+	}
+	set_current_state(TASK_RUNNING);
+
+	/*
+	 * The userland program is going to want to know what
+	 * changed to trigger the event, so we may as well tell
+	 * it and save an ioctl.
+	 */
+	r = __dev_status(md, param);
+	if (r)
+		goto out;
+
+	table = dm_get_table(md);
+	if (table) {
+		retrieve_status(table, param, param_size);
+		dm_table_put(table);
+	}
+
+ out:
+	dm_put(md);
+	return r;
+}
+
+static inline int get_mode(struct dm_ioctl *param)
+{
+	int mode = FMODE_READ | FMODE_WRITE;
+
+	if (param-&gt;flags &amp; DM_READONLY_FLAG)
+		mode = FMODE_READ;
+
+	return mode;
+}
+
+static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
+		       struct dm_target_spec **spec, char **target_params)
+{
+	*spec = (struct dm_target_spec *) ((unsigned char *) last + next);
+	*target_params = (char *) (*spec + 1);
+
+	if (*spec &lt; (last + 1))
+		return -EINVAL;
+
+	return invalid_str(*target_params, end);
+}
+
+static int populate_table(struct dm_table *table, struct dm_ioctl *param,
+			  size_t param_size)
+{
+	int r;
+	unsigned int i = 0;
+	struct dm_target_spec *spec = (struct dm_target_spec *) param;
+	uint32_t next = param-&gt;data_start;
+	void *end = (void *) param + param_size;
+	char *target_params;
+
+	if (!param-&gt;target_count) {
+		DMWARN("populate_table: no targets specified");
+		return -EINVAL;
+	}
+
+	for (i = 0; i &lt; param-&gt;target_count; i++) {
+
+		r = next_target(spec, next, end, &amp;spec, &amp;target_params);
+		if (r) {
+			DMWARN("unable to find target");
+			return r;
+		}
+
+		r = dm_table_add_target(table, spec-&gt;target_type,
+					(sector_t) spec-&gt;sector_start,
+					(sector_t) spec-&gt;length,
+					target_params);
+		if (r) {
+			DMWARN("error adding target to table");
+			return r;
+		}
+
+		next = spec-&gt;next;
+	}
+
+	return dm_table_complete(table);
+}
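
/*
 * Encoding sketch (not part of the patch) of the payload the loop above
 * expects: each dm_target_spec is followed by its NUL-terminated
 * parameter string, spec-&gt;next is the byte offset from this spec to the
 * next one, and param-&gt;data_start covers the hop from the dm_ioctl
 * header to the first spec.  The helper below is illustrative; offsets
 * are relative to the start of the payload.
 */
static size_t example_emit_target(char *payload, size_t offset,
				  uint64_t start, uint64_t len,
				  const char *type, const char *params)
{
	struct dm_target_spec *spec;
	size_t used;

	spec = (struct dm_target_spec *) (payload + offset);
	memset(spec, 0, sizeof(*spec));
	spec-&gt;sector_start = start;
	spec-&gt;length = len;
	strncpy(spec-&gt;target_type, type, sizeof(spec-&gt;target_type) - 1);
	strcpy((char *) (spec + 1), params);

	used = sizeof(*spec) + strlen(params) + 1;
	used = (used + 7) &amp; ~(size_t) 7;	/* keep the next spec aligned */
	spec-&gt;next = used;

	return offset + used;
}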
+
+static int table_load(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct hash_cell *hc;
+	struct dm_table *t;
+
+	r = dm_table_create(&amp;t, get_mode(param), param-&gt;target_count);
+	if (r)
+		return r;
+
+	r = populate_table(t, param, param_size);
+	if (r) {
+		dm_table_put(t);
+		return r;
+	}
+
+	down_write(&amp;_hash_lock);
+	hc = __find_device_hash_cell(param);
+	if (!hc) {
+		DMWARN("device doesn't appear to be in the dev hash table.");
+		up_write(&amp;_hash_lock);
+		return -ENXIO;
+	}
+
+	if (hc-&gt;new_map)
+		dm_table_put(hc-&gt;new_map);
+	hc-&gt;new_map = t;
+	param-&gt;flags |= DM_INACTIVE_PRESENT_FLAG;
+
+	r = __dev_status(hc-&gt;md, param);
+	up_write(&amp;_hash_lock);
+	return r;
+}
+
+static int table_clear(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct hash_cell *hc;
+
+	down_write(&amp;_hash_lock);
+
+	hc = __find_device_hash_cell(param);
+	if (!hc) {
+		DMWARN("device doesn't appear to be in the dev hash table.");
+		up_write(&amp;_hash_lock);
+		return -ENXIO;
+	}
+
+	if (hc-&gt;new_map) {
+		dm_table_put(hc-&gt;new_map);
+		hc-&gt;new_map = NULL;
+	}
+
+	param-&gt;flags &amp;= ~DM_INACTIVE_PRESENT_FLAG;
+
+	r = __dev_status(hc-&gt;md, param);
+	up_write(&amp;_hash_lock);
+	return r;
+}
+
+/*
+ * Retrieves a list of devices used by a particular dm device.
+ */
+static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
+			  size_t param_size)
+{
+	unsigned int count = 0;
+	struct list_head *tmp;
+	size_t len, needed;
+	struct dm_target_deps *deps;
+
+	deps = get_result_buffer(param, param_size, &amp;len);
+
+	/*
+	 * Count the devices.
+	 */
+	list_for_each(tmp, dm_table_get_devices(table))
+		count++;
+
+	/*
+	 * Check we have enough space.
+	 */
+	needed = sizeof(*deps) + (sizeof(*deps-&gt;dev) * count);
+	if (len &lt; needed) {
+		param-&gt;flags |= DM_BUFFER_FULL_FLAG;
+		return;
+	}
+
+	/*
+	 * Fill in the devices.
+	 */
+	deps-&gt;count = count;
+	count = 0;
+	list_for_each(tmp, dm_table_get_devices(table)) {
+		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+		deps-&gt;dev[count++] = dd-&gt;bdev-&gt;bd_dev;
+	}
+
+	param-&gt;data_size = param-&gt;data_start + needed;
+}
+
+static int table_deps(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct mapped_device *md;
+	struct dm_table *table;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	r = __dev_status(md, param);
+	if (r)
+		goto out;
+
+	table = dm_get_table(md);
+	if (table) {
+		retrieve_deps(table, param, param_size);
+		dm_table_put(table);
+	}
+
+ out:
+	dm_put(md);
+	return r;
+}
+
+/*
+ * Return the status of a device as a text string for each
+ * target.
+ */
+static int table_status(struct dm_ioctl *param, size_t param_size)
+{
+	int r;
+	struct mapped_device *md;
+	struct dm_table *table;
+
+	md = find_device(param);
+	if (!md)
+		return -ENXIO;
+
+	r = __dev_status(md, param);
+	if (r)
+		goto out;
+
+	table = dm_get_table(md);
+	if (table) {
+		retrieve_status(table, param, param_size);
+		dm_table_put(table);
+	}
+
+ out:
+	dm_put(md);
+	return r;
+}
+
+/*-----------------------------------------------------------------
+ * Implementation of open/close/ioctl on the special char
+ * device.
+ *---------------------------------------------------------------*/
+static ioctl_fn lookup_ioctl(unsigned int cmd)
+{
+	static struct {
+		int cmd;
+		ioctl_fn fn;
+	} _ioctls[] = {
+		{DM_VERSION_CMD, NULL},	/* version is dealt with elsewhere */
+		{DM_REMOVE_ALL_CMD, remove_all},
+		{DM_LIST_DEVICES_CMD, list_devices},
+
+		{DM_DEV_CREATE_CMD, dev_create},
+		{DM_DEV_REMOVE_CMD, dev_remove},
+		{DM_DEV_RENAME_CMD, dev_rename},
+		{DM_DEV_SUSPEND_CMD, dev_suspend},
+		{DM_DEV_STATUS_CMD, dev_status},
+		{DM_DEV_WAIT_CMD, dev_wait},
+
+		{DM_TABLE_LOAD_CMD, table_load},
+		{DM_TABLE_CLEAR_CMD, table_clear},
+		{DM_TABLE_DEPS_CMD, table_deps},
+		{DM_TABLE_STATUS_CMD, table_status},
+
+		{DM_LIST_VERSIONS_CMD, list_versions}
+	};
+
+	return (cmd &gt;= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
+}
+
+/*
+ * As well as checking the version compatibility, this always
+ * copies the kernel interface version out.
+ */
+static int check_version(unsigned int cmd, struct dm_ioctl *user)
+{
+	uint32_t version[3];
+	int r = 0;
+
+	if (copy_from_user(version, user-&gt;version, sizeof(version)))
+		return -EFAULT;
+
+	if ((DM_VERSION_MAJOR != version[0]) ||
+	    (DM_VERSION_MINOR &lt; version[1])) {
+		DMWARN("ioctl interface mismatch: "
+		       "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
+		       DM_VERSION_MAJOR, DM_VERSION_MINOR,
+		       DM_VERSION_PATCHLEVEL,
+		       version[0], version[1], version[2], cmd);
+		r = -EINVAL;
+	}
+
+	/*
+	 * Fill in the kernel version.
+	 */
+	version[0] = DM_VERSION_MAJOR;
+	version[1] = DM_VERSION_MINOR;
+	version[2] = DM_VERSION_PATCHLEVEL;
+	if (copy_to_user(user-&gt;version, version, sizeof(version)))
+		return -EFAULT;
+
+	return r;
+}
+
+static void free_params(struct dm_ioctl *param)
+{
+	vfree(param);
+}
+
+static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
+{
+	struct dm_ioctl tmp, *dmi;
+
+	if (copy_from_user(&amp;tmp, user, sizeof(tmp)))
+		return -EFAULT;
+
+	if (tmp.data_size &lt; sizeof(tmp))
+		return -EINVAL;
+
+	dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
+	if (!dmi)
+		return -ENOMEM;
+
+	if (copy_from_user(dmi, user, tmp.data_size)) {
+		vfree(dmi);
+		return -EFAULT;
+	}
+
+	*param = dmi;
+	return 0;
+}
+
+static int validate_params(uint cmd, struct dm_ioctl *param)
+{
+	/* Always clear this flag */
+	param-&gt;flags &amp;= ~DM_BUFFER_FULL_FLAG;
+
+	/* Ignores parameters */
+	if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD ||
+	    cmd == DM_LIST_VERSIONS_CMD)
+		return 0;
+
+	/* Unless creating, either name or uuid but not both */
+	if (cmd != DM_DEV_CREATE_CMD) {
+		if ((!*param-&gt;uuid &amp;&amp; !*param-&gt;name) ||
+		    (*param-&gt;uuid &amp;&amp; *param-&gt;name)) {
+			DMWARN("exactly one of name or uuid must be supplied, cmd(%u)",
+			       cmd);
+			return -EINVAL;
+		}
+	}
+
+	/* Ensure strings are terminated */
+	param-&gt;name[DM_NAME_LEN - 1] = '\0';
+	param-&gt;uuid[DM_UUID_LEN - 1] = '\0';
+
+	return 0;
+}
+
+static int ctl_ioctl(struct inode *inode, struct file *file,
+		     uint command, ulong u)
+{
+	int r = 0;
+	unsigned int cmd;
+	struct dm_ioctl *param;
+	struct dm_ioctl *user = (struct dm_ioctl *) u;
+	ioctl_fn fn = NULL;
+	size_t param_size;
+
+	/* only root can play with this */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (_IOC_TYPE(command) != DM_IOCTL)
+		return -ENOTTY;
+
+	cmd = _IOC_NR(command);
+
+	/*
+	 * Check the interface version passed in.  This also
+	 * writes out the kernel's interface version.
+	 */
+	r = check_version(cmd, user);
+	if (r)
+		return r;
+
+	/*
+	 * Nothing more to do for the version command.
+	 */
+	if (cmd == DM_VERSION_CMD)
+		return 0;
+
+	fn = lookup_ioctl(cmd);
+	if (!fn) {
+		DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
+		return -ENOTTY;
+	}
+
+	/*
+	 * FIXME: I don't like this, we're trying to avoid low
+	 * memory issues when a device is suspended.
+	 */
+	current-&gt;flags |= PF_MEMALLOC;
+
+	/*
+	 * Copy the parameters into kernel space.
+	 */
+	r = copy_params(user, &amp;param);
+	if (r) {
+		current-&gt;flags &amp;= ~PF_MEMALLOC;
+		return r;
+	}
+
+	r = validate_params(cmd, param);
+	if (r)
+		goto out;
+
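+	/*
+	 * The handlers write their results after the fixed-size header and
+	 * update param-&gt;data_size to cover them, so reset it to the header
+	 * size here and remember the full buffer size in param_size.
+	 */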
+	param_size = param-&gt;data_size;
+	param-&gt;data_size = sizeof(*param);
+	r = fn(param, param_size);
+
+	/*
+	 * Copy the results back to userland.
+	 */
+	if (!r &amp;&amp; copy_to_user(user, param, param-&gt;data_size))
+		r = -EFAULT;
+
+ out:
+	free_params(param);
+	current-&gt;flags &amp;= ~PF_MEMALLOC;
+	return r;
+}
+
+static struct file_operations _ctl_fops = {
+	.ioctl	 = ctl_ioctl,
+	.owner	 = THIS_MODULE,
+};
+
+static devfs_handle_t _ctl_handle;
+
+static struct miscdevice _dm_misc = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name  = DM_NAME,
+	.fops  = &amp;_ctl_fops
+};
+
+/*
+ * Create misc character device and link to DM_DIR/control.
+ */
+int __init dm_interface_init(void)
+{
+	int r;
+	char rname[64];
+
+	r = dm_hash_init();
+	if (r)
+		return r;
+
+	r = misc_register(&amp;_dm_misc);
+	if (r) {
+		DMERR("misc_register failed for control device");
+		dm_hash_exit();
+		return r;
+	}
+
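+	/*
+	 * devfs_generate_path() builds the path at the end of the buffer
+	 * and returns the offset at which it starts; the three spare bytes
+	 * in front of rname + 3 leave room to prepend "../" so the
+	 * DM_DIR/control symlink target is relative.
+	 */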
+	r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
+				sizeof rname - 3);
+	if (r == -ENOSYS)
+		goto done;	/* devfs not present */
+
+	if (r &lt; 0) {
+		DMERR("devfs_generate_path failed for control device");
+		goto failed;
+	}
+
+	strncpy(rname + r, "../", 3);
+	r = devfs_mk_symlink(NULL, DM_DIR "/control",
+			     DEVFS_FL_DEFAULT, rname + r, &amp;_ctl_handle, NULL);
+	if (r) {
+		DMERR("devfs_mk_symlink failed for control device");
+		goto failed;
+	}
+	devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
+
+      done:
+	DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
+	       DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
+	       DM_DRIVER_EMAIL);
+	return 0;
+
+      failed:
+	misc_deregister(&amp;_dm_misc);
+	dm_hash_exit();
+	return r;
+}
+
+void dm_interface_exit(void)
+{
+	if (misc_deregister(&amp;_dm_misc) &lt; 0)
+		DMERR("misc_deregister failed for control device");
+
+	dm_hash_exit();
+}
--- linux-2.4.26-rc1/drivers/md/dm-linear.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-linear.c	Wed Mar 31 13:49:08 2004
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/blkdev.h&gt;
+#include &lt;linux/slab.h&gt;
+
+/*
+ * Linear: maps a linear range of a device.
+ */
+struct linear_c {
+	struct dm_dev *dev;
+	sector_t start;
+};
+
+/*
+ * Construct a linear mapping: &lt;dev_path&gt; &lt;offset&gt;
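+ * e.g. the table line "0 1024 linear /dev/sda1 0" maps sectors 0-1023
+ * onto the start of /dev/sda1 (device path illustrative).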
+ */
+static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct linear_c *lc;
+
+	if (argc != 2) {
+		ti-&gt;error = "dm-linear: Invalid argument count";
+		return -EINVAL;
+	}
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (lc == NULL) {
+		ti-&gt;error = "dm-linear: Cannot allocate linear context";
+		return -ENOMEM;
+	}
+
+	if (sscanf(argv[1], SECTOR_FORMAT, &amp;lc-&gt;start) != 1) {
+		ti-&gt;error = "dm-linear: Invalid device sector";
+		goto bad;
+	}
+
+	if (dm_get_device(ti, argv[0], lc-&gt;start, ti-&gt;len,
+			  dm_table_get_mode(ti-&gt;table), &amp;lc-&gt;dev)) {
+		ti-&gt;error = "dm-linear: Device lookup failed";
+		goto bad;
+	}
+
+	ti-&gt;private = lc;
+	return 0;
+
+      bad:
+	kfree(lc);
+	return -EINVAL;
+}
+
+static void linear_dtr(struct dm_target *ti)
+{
+	struct linear_c *lc = (struct linear_c *) ti-&gt;private;
+
+	dm_put_device(ti, lc-&gt;dev);
+	kfree(lc);
+}
+
+static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+		      union map_info *map_context)
+{
+	struct linear_c *lc = (struct linear_c *) ti-&gt;private;
+
+	bh-&gt;b_rdev = lc-&gt;dev-&gt;dev;
+	bh-&gt;b_rsector = lc-&gt;start + (bh-&gt;b_rsector - ti-&gt;begin);
+
+	return 1;
+}
+
+static int linear_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct linear_c *lc = (struct linear_c *) ti-&gt;private;
+	kdev_t kdev;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		kdev = to_kdev_t(lc-&gt;dev-&gt;bdev-&gt;bd_dev);
+		snprintf(result, maxlen, "%s " SECTOR_FORMAT,
+			 dm_kdevname(kdev), lc-&gt;start);
+		break;
+	}
+	return 0;
+}
+
+static struct target_type linear_target = {
+	.name   = "linear",
+	.version = {1, 0, 1},
+	.module = THIS_MODULE,
+	.ctr    = linear_ctr,
+	.dtr    = linear_dtr,
+	.map    = linear_map,
+	.status = linear_status,
+};
+
+int __init dm_linear_init(void)
+{
+	int r = dm_register_target(&amp;linear_target);
+
+	if (r &lt; 0)
+		DMERR("linear: register failed %d", r);
+
+	return r;
+}
+
+void dm_linear_exit(void)
+{
+	int r = dm_unregister_target(&amp;linear_target);
+
+	if (r &lt; 0)
+		DMERR("linear: unregister failed %d", r);
+}
--- linux-2.4.26-rc1/drivers/md/dm-log.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-log.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,310 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#include &lt;linux/init.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+
+#include "dm-log.h"
+#include "dm-io.h"
+
+static LIST_HEAD(_log_types);
+static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
+
+int dm_register_dirty_log_type(struct dirty_log_type *type)
+{
+	spin_lock(&amp;_lock);
+	type-&gt;use_count = 0;
+	if (type-&gt;module)
+		__MOD_INC_USE_COUNT(type-&gt;module);
+
+	list_add(&amp;type-&gt;list, &amp;_log_types);
+	spin_unlock(&amp;_lock);
+
+	return 0;
+}
+
+int dm_unregister_dirty_log_type(struct dirty_log_type *type)
+{
+	spin_lock(&amp;_lock);
+
+	if (type-&gt;use_count)
+		DMWARN("Attempt to unregister a log type that is still in use");
+	else {
+		list_del(&amp;type-&gt;list);
+		if (type-&gt;module)
+			__MOD_DEC_USE_COUNT(type-&gt;module);
+	}
+
+	spin_unlock(&amp;_lock);
+
+	return 0;
+}
+
+static struct dirty_log_type *get_type(const char *type_name)
+{
+	struct dirty_log_type *type;
+	struct list_head *tmp;
+
+	spin_lock(&amp;_lock);
+	list_for_each (tmp, &amp;_log_types) {
+		type = list_entry(tmp, struct dirty_log_type, list);
+		if (!strcmp(type_name, type-&gt;name)) {
+			type-&gt;use_count++;
+			spin_unlock(&amp;_lock);
+			return type;
+		}
+	}
+
+	spin_unlock(&amp;_lock);
+	return NULL;
+}
+
+static void put_type(struct dirty_log_type *type)
+{
+	spin_lock(&amp;_lock);
+	type-&gt;use_count--;
+	spin_unlock(&amp;_lock);
+}
+
+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+				      unsigned int argc, char **argv)
+{
+	struct dirty_log_type *type;
+	struct dirty_log *log;
+
+	log = kmalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return NULL;
+
+	type = get_type(type_name);
+	if (!type) {
+		kfree(log);
+		return NULL;
+	}
+
+	log-&gt;type = type;
+	if (type-&gt;ctr(log, dev_size, argc, argv)) {
+		kfree(log);
+		put_type(type);
+		return NULL;
+	}
+
+	return log;
+}
+
+void dm_destroy_dirty_log(struct dirty_log *log)
+{
+	log-&gt;type-&gt;dtr(log);
+	put_type(log-&gt;type);
+	kfree(log);
+}
+
+
+/*-----------------------------------------------------------------
+ * In-core log, i.e. trivial, non-persistent
+ *
+ * For now we'll keep this simple and just have 2 bitsets, one
+ * for clean/dirty, the other for sync/nosync.  The sync bitset
+ * will be freed when everything is in sync.
+ *
+ * FIXME: problems with a 64bit sector_t
+ *---------------------------------------------------------------*/
+struct core_log {
+	sector_t region_size;
+	unsigned int region_count;
+	unsigned long *clean_bits;
+	unsigned long *sync_bits;
+	unsigned long *recovering_bits;	/* FIXME: this seems excessive */
+
+	int sync_search;
+};
+
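+/* log2 of the number of bits in a byte, for converting bit counts to bytes. */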
+#define BYTE_SHIFT 3
+
+static int core_ctr(struct dirty_log *log, sector_t dev_size,
+		    unsigned int argc, char **argv)
+{
+	struct core_log *clog;
+	sector_t region_size;
+	unsigned int region_count;
+	size_t bitset_size;
+
+	if (argc != 1) {
+		DMWARN("wrong number of arguments to core_log");
+		return -EINVAL;
+	}
+
+	if (sscanf(argv[0], SECTOR_FORMAT, &amp;region_size) != 1) {
+		DMWARN("invalid region size string");
+		return -EINVAL;
+	}
+
+	region_count = dm_div_up(dev_size, region_size);
+
+	clog = kmalloc(sizeof(*clog), GFP_KERNEL);
+	if (!clog) {
+		DMWARN("couldn't allocate core log");
+		return -ENOMEM;
+	}
+
+	clog-&gt;region_size = region_size;
+	clog-&gt;region_count = region_count;
+
+	/*
+	 * Work out how many bytes we need to hold the bitset,
+	 * rounded up to a whole number of words.
+	 */
+	bitset_size = dm_round_up(region_count,
+				  sizeof(*clog-&gt;clean_bits) &lt;&lt; BYTE_SHIFT);
+	bitset_size &gt;&gt;= BYTE_SHIFT;
+
+	clog-&gt;clean_bits = vmalloc(bitset_size);
+	if (!clog-&gt;clean_bits) {
+		DMWARN("couldn't allocate clean bitset");
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog-&gt;clean_bits, -1, bitset_size);
+
+	clog-&gt;sync_bits = vmalloc(bitset_size);
+	if (!clog-&gt;sync_bits) {
+		DMWARN("couldn't allocate sync bitset");
+		vfree(clog-&gt;clean_bits);
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog-&gt;sync_bits, 0, bitset_size);
+
+	clog-&gt;recovering_bits = vmalloc(bitset_size);
+	if (!clog-&gt;recovering_bits) {
+		DMWARN("couldn't allocate recovering bitset");
+		vfree(clog-&gt;sync_bits);
+		vfree(clog-&gt;clean_bits);
+		kfree(clog);
+		return -ENOMEM;
+	}
+	memset(clog-&gt;recovering_bits, 0, bitset_size);
+	clog-&gt;sync_search = 0;
+	log-&gt;context = clog;
+	return 0;
+}
+
+static void core_dtr(struct dirty_log *log)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+	vfree(clog-&gt;clean_bits);
+	vfree(clog-&gt;sync_bits);
+	vfree(clog-&gt;recovering_bits);
+	kfree(clog);
+}
+
+static sector_t core_get_region_size(struct dirty_log *log)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+	return clog-&gt;region_size;
+}
+
+static int core_is_clean(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+	return test_bit(region, clog-&gt;clean_bits);
+}
+
+static int core_in_sync(struct dirty_log *log, region_t region, int block)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+
+	return test_bit(region, clog-&gt;sync_bits) ? 1 : 0;
+}
+
+static int core_flush(struct dirty_log *log)
+{
+	/* no op */
+	return 0;
+}
+
+static void core_mark_region(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+	clear_bit(region, clog-&gt;clean_bits);
+}
+
+static void core_clear_region(struct dirty_log *log, region_t region)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+	set_bit(region, clog-&gt;clean_bits);
+}
+
+static int core_get_resync_work(struct dirty_log *log, region_t *region)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+
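+	/*
+	 * sync_search remembers where the previous scan stopped, so each
+	 * call carries on from there rather than rescanning from bit 0.
+	 */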
+	if (clog-&gt;sync_search &gt;= clog-&gt;region_count)
+		return 0;
+
+	do {
+		*region = find_next_zero_bit(clog-&gt;sync_bits,
+					     clog-&gt;region_count,
+					     clog-&gt;sync_search);
+		clog-&gt;sync_search = *region + 1;
+
+		if (*region == clog-&gt;region_count)
+			return 0;
+
+	} while (test_bit(*region, clog-&gt;recovering_bits));
+
+	set_bit(*region, clog-&gt;recovering_bits);
+	return 1;
+}
+
+static void core_complete_resync_work(struct dirty_log *log, region_t region,
+				      int success)
+{
+	struct core_log *clog = (struct core_log *) log-&gt;context;
+
+	clear_bit(region, clog-&gt;recovering_bits);
+	if (success)
+		set_bit(region, clog-&gt;sync_bits);
+}
+
+static struct dirty_log_type _core_type = {
+	.name = "core",
+
+	.ctr = core_ctr,
+	.dtr = core_dtr,
+	.get_region_size = core_get_region_size,
+	.is_clean = core_is_clean,
+	.in_sync = core_in_sync,
+	.flush = core_flush,
+	.mark_region = core_mark_region,
+	.clear_region = core_clear_region,
+	.get_resync_work = core_get_resync_work,
+	.complete_resync_work = core_complete_resync_work
+};
+
+int __init dm_dirty_log_init(void)
+{
+	int r;
+
+	r = dm_register_dirty_log_type(&amp;_core_type);
+	if (r)
+		DMWARN("couldn't register core log");
+
+	return r;
+}
+
+void dm_dirty_log_exit(void)
+{
+	dm_unregister_dirty_log_type(&amp;_core_type);
+}
+
+EXPORT_SYMBOL(dm_register_dirty_log_type);
+EXPORT_SYMBOL(dm_unregister_dirty_log_type);
+EXPORT_SYMBOL(dm_dirty_log_init);
+EXPORT_SYMBOL(dm_dirty_log_exit);
+EXPORT_SYMBOL(dm_create_dirty_log);
+EXPORT_SYMBOL(dm_destroy_dirty_log);
--- linux-2.4.26-rc1/drivers/md/dm-log.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-log.h	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,112 @@
+/*
+ * Copyright (C) 2003 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_DIRTY_LOG
+#define DM_DIRTY_LOG
+
+#include "dm.h"
+
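+/* A region_t identifies a region by its index within the mirrored device. */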
+typedef sector_t region_t;
+
+struct dirty_log_type;
+
+struct dirty_log {
+	struct dirty_log_type *type;
+	void *context;
+};
+
+struct dirty_log_type {
+	struct list_head list;
+	const char *name;
+	struct module *module;
+	unsigned int use_count;
+
+	int (*ctr)(struct dirty_log *log, sector_t dev_size,
+		   unsigned int argc, char **argv);
+	void (*dtr)(struct dirty_log *log);
+
+	/*
+	 * Retrieves the smallest size of region that the log can
+	 * deal with.
+	 */
+	sector_t (*get_region_size)(struct dirty_log *log);
+
+	/*
+	 * A predicate to say whether a region is clean or not.
+	 * May block.
+	 */
+	int (*is_clean)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: 0, 1, -EWOULDBLOCK, &lt; 0
+	 *
+	 * A predicate function to check whether the given region
+	 * is in sync.
+	 *
+	 * If -EWOULDBLOCK is returned the state of the region is
+	 * unknown, typically this will result in a read being
+	 * passed to a daemon to deal with, since a daemon is
+	 * allowed to block.
+	 */
+	int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
+
+	/*
+	 * Flush the current log state (eg, to disk).  This
+	 * function may block.
+	 */
+	int (*flush)(struct dirty_log *log);
+
+	/*
+	 * Mark an area as clean or dirty.  These functions may
+	 * block, though for performance reasons blocking should
+	 * be extremely rare (eg, allocating another chunk of
+	 * memory for some reason).
+	 */
+	void (*mark_region)(struct dirty_log *log, region_t region);
+	void (*clear_region)(struct dirty_log *log, region_t region);
+
+	/*
+	 * Returns: &lt;0 (error), 0 (no region), 1 (region)
+	 *
+	 * The mirror daemon will need to perform recovery on
+	 * regions of the mirror that are in the NOSYNC state.
+	 * This function asks the log to tell the caller about
+	 * the next region that this machine should recover.
+	 *
+	 * Do not confuse this function with 'in_sync()': one
+	 * tells you whether an area is synchronised, the other
+	 * assigns recovery work.
+	 */
+	int (*get_resync_work)(struct dirty_log *log, region_t *region);
+
+	/*
+	 * This notifies the log that the resync of an area has
+	 * been completed.  The log should then mark this region
+	 * as CLEAN.
+	 */
+	void (*complete_resync_work)(struct dirty_log *log,
+				     region_t region, int success);
+};
+
+int dm_register_dirty_log_type(struct dirty_log_type *type);
+int dm_unregister_dirty_log_type(struct dirty_log_type *type);
+
+
+/*
+ * Make sure you use these two functions, rather than calling
+ * type-&gt;ctr()/dtr() directly.
+ */
+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
+				      unsigned int argc, char **argv);
+void dm_destroy_dirty_log(struct dirty_log *log);
+
+/*
+ * init/exit functions.
+ */
+int dm_dirty_log_init(void);
+void dm_dirty_log_exit(void);
+
+#endif
--- linux-2.4.26-rc1/drivers/md/dm-raid1.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-raid1.c	Wed Mar 31 13:49:14 2004
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (C) 2003 Sistina Software Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "dm-daemon.h"
+#include "dm-io.h"
+#include "dm-log.h"
+#include "kcopyd.h"
+
+#include &lt;linux/ctype.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/mempool.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/pagemap.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/time.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+
+static struct dm_daemon _kmirrord;
+
+/*-----------------------------------------------------------------
+ * buffer lists:
+ *
+ * We play with singly linked lists of buffers, but we want to be
+ * careful to add new buffers to the back of the list, to avoid
+ * buffers being starved of attention.
+ *---------------------------------------------------------------*/
+struct buffer_list {
+	struct buffer_head *head;
+	struct buffer_head *tail;
+};
+
+static inline void buffer_list_init(struct buffer_list *bl)
+{
+	bl-&gt;head = bl-&gt;tail = NULL;
+}
+
+static inline void buffer_list_add(struct buffer_list *bl,
+				   struct buffer_head *bh)
+{
+	bh-&gt;b_reqnext = NULL;
+
+	if (bl-&gt;tail) {
+		bl-&gt;tail-&gt;b_reqnext = bh;
+		bl-&gt;tail = bh;
+	} else
+		bl-&gt;head = bl-&gt;tail = bh;
+}
+
+static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
+{
+	struct buffer_head *bh = bl-&gt;head;
+
+	if (bh) {
+		bl-&gt;head = bl-&gt;head-&gt;b_reqnext;
+		if (!bl-&gt;head)
+			bl-&gt;tail = NULL;
+
+		bh-&gt;b_reqnext = NULL;
+	}
+
+	return bh;
+}
+
+/*-----------------------------------------------------------------
+ * Region hash
+ *
+ * The mirror splits itself up into discrete regions.  Each
+ * region can be in one of three states: clean, dirty,
+ * nosync.  There is no need to put clean regions in the hash.
+ *
+ * In addition to being present in the hash table a region _may_
+ * be present on one of three lists.
+ *
+ *   clean_regions: Regions on this list have no io pending to
+ *   them, they are in sync, we are no longer interested in them,
+ *   they are dull.  rh_update_states() will remove them from the
+ *   hash table.
+ *
+ *   quiesced_regions: These regions have been spun down, ready
+ *   for recovery.  rh_recovery_start() will remove regions from
+ *   this list and hand them to kmirrord, which will schedule the
+ *   recovery io with kcopyd.
+ *
+ *   recovered_regions: Regions that kcopyd has successfully
+ *   recovered.  rh_update_states() will now schedule any delayed
+ *   io, up the recovery_count, and remove the region from the
+ *   hash.
+ *
+ * There are 2 locks:
+ *   A rw spin lock 'hash_lock' protects just the hash table,
+ *   this is never held in write mode from interrupt context,
+ *   which I believe means that we only have to disable irqs when
+ *   doing a write lock.
+ *
+ *   An ordinary spin lock 'region_lock' that protects the three
+ *   lists in the region_hash, with the 'state', 'list' and
+ *   'bhs_delayed' fields of the regions.  This is used from irq
+ *   context, so all other uses will have to suspend local irqs.
+ *---------------------------------------------------------------*/
+struct mirror_set;
+struct region_hash {
+	struct mirror_set *ms;
+	sector_t region_size;
+
+	/* holds persistent region state */
+	struct dirty_log *log;
+
+	/* hash table */
+	rwlock_t hash_lock;
+	mempool_t *region_pool;
+	unsigned int mask;
+	unsigned int nr_buckets;
+	struct list_head *buckets;
+
+	spinlock_t region_lock;
+	struct semaphore recovery_count;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+};
+
+enum {
+	RH_CLEAN,
+	RH_DIRTY,
+	RH_NOSYNC,
+	RH_RECOVERING
+};
+
+struct region {
+	struct region_hash *rh;	/* FIXME: can we get rid of this ? */
+	region_t key;
+	int state;
+
+	struct list_head hash_list;
+	struct list_head list;
+
+	atomic_t pending;
+	struct buffer_head *delayed_bhs;
+};
+
+/*
+ * Conversion fns
+ */
+static inline region_t bh_to_region(struct region_hash *rh,
+				    struct buffer_head *bh)
+{
+	return bh-&gt;b_rsector / rh-&gt;region_size;
+}
+
+static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
+{
+	return region * rh-&gt;region_size;
+}
+
+/* FIXME move this */
+static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
+
+static void *region_alloc(int gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct region), gfp_mask);
+}
+
+static void region_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
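+/*
+ * MIN_REGIONS sizes the mempool of struct region.  MAX_RECOVERY bounds
+ * how many regions may be undergoing recovery at once; it is the count
+ * given to rh-&gt;recovery_count by rh_start_recovery().
+ */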
+#define MIN_REGIONS 64
+#define MAX_RECOVERY 1
+static int rh_init(struct region_hash *rh, struct mirror_set *ms,
+		   struct dirty_log *log, sector_t region_size,
+		   region_t nr_regions)
+{
+	unsigned int nr_buckets, max_buckets;
+	size_t i;
+
+	/*
+	 * Calculate a suitable number of buckets for our hash
+	 * table.
+	 */
+	max_buckets = nr_regions &gt;&gt; 6;
+	for (nr_buckets = 128u; nr_buckets &lt; max_buckets; nr_buckets &lt;&lt;= 1)
+		;
+	nr_buckets &gt;&gt;= 1;
+
+	rh-&gt;ms = ms;
+	rh-&gt;log = log;
+	rh-&gt;region_size = region_size;
+	rwlock_init(&amp;rh-&gt;hash_lock);
+	rh-&gt;mask = nr_buckets - 1;
+	rh-&gt;nr_buckets = nr_buckets;
+
+	rh-&gt;buckets = vmalloc(nr_buckets * sizeof(*rh-&gt;buckets));
+	if (!rh-&gt;buckets) {
+		DMERR("unable to allocate region hash memory");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i &lt; nr_buckets; i++)
+		INIT_LIST_HEAD(rh-&gt;buckets + i);
+
+	spin_lock_init(&amp;rh-&gt;region_lock);
+	sema_init(&amp;rh-&gt;recovery_count, 0);
+	INIT_LIST_HEAD(&amp;rh-&gt;clean_regions);
+	INIT_LIST_HEAD(&amp;rh-&gt;quiesced_regions);
+	INIT_LIST_HEAD(&amp;rh-&gt;recovered_regions);
+
+	rh-&gt;region_pool = mempool_create(MIN_REGIONS, region_alloc,
+					 region_free, NULL);
+	if (!rh-&gt;region_pool) {
+		vfree(rh-&gt;buckets);
+		rh-&gt;buckets = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void rh_exit(struct region_hash *rh)
+{
+	unsigned int h;
+	struct region *reg;
+	struct list_head *tmp, *tmp2;
+
+	BUG_ON(!list_empty(&amp;rh-&gt;quiesced_regions));
+	for (h = 0; h &lt; rh-&gt;nr_buckets; h++) {
+		list_for_each_safe (tmp, tmp2, rh-&gt;buckets + h) {
+			reg = list_entry(tmp, struct region, hash_list);
+			BUG_ON(atomic_read(&amp;reg-&gt;pending));
+			mempool_free(reg, rh-&gt;region_pool);
+		}
+	}
+
+	if (rh-&gt;log)
+		dm_destroy_dirty_log(rh-&gt;log);
+	if (rh-&gt;region_pool)
+		mempool_destroy(rh-&gt;region_pool);
+	vfree(rh-&gt;buckets);
+}
+
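+/*
+ * Multiplicative hash: RH_HASH_MULT is a large constant chosen to
+ * scatter region keys evenly across the buckets.
+ */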
+#define RH_HASH_MULT 2654435387U
+
+static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
+{
+	return (unsigned int) ((region * RH_HASH_MULT) &gt;&gt; 12) &amp; rh-&gt;mask;
+}
+
+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	list_for_each_entry (reg, rh-&gt;buckets + rh_hash(rh, region), hash_list)
+		if (reg-&gt;key == region)
+			return reg;
+
+	return NULL;
+}
+
+static void __rh_insert(struct region_hash *rh, struct region *reg)
+{
+	unsigned int h = rh_hash(rh, reg-&gt;key);
+	list_add(&amp;reg-&gt;hash_list, rh-&gt;buckets + h);
+}
+
+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
+{
+	struct region *reg, *nreg;
+
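+	/*
+	 * The caller holds the hash lock for reading; drop it so we can
+	 * block in mempool_alloc(), then retake it in write mode and
+	 * check whether another thread beat us to the insertion.
+	 */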
+	read_unlock(&amp;rh-&gt;hash_lock);
+	nreg = mempool_alloc(rh-&gt;region_pool, GFP_NOIO);
+	nreg-&gt;state = rh-&gt;log-&gt;type-&gt;in_sync(rh-&gt;log, region, 1) ?
+		RH_CLEAN : RH_NOSYNC;
+	nreg-&gt;rh = rh;
+	nreg-&gt;key = region;
+
+	INIT_LIST_HEAD(&amp;nreg-&gt;list);
+
+	atomic_set(&amp;nreg-&gt;pending, 0);
+	nreg-&gt;delayed_bhs = NULL;
+	write_lock_irq(&amp;rh-&gt;hash_lock);
+
+	reg = __rh_lookup(rh, region);
+	if (reg)
+		/* we lost the race */
+		mempool_free(nreg, rh-&gt;region_pool);
+
+	else {
+		__rh_insert(rh, nreg);
+		if (nreg-&gt;state == RH_CLEAN) {
+			spin_lock_irq(&amp;rh-&gt;region_lock);
+			list_add(&amp;nreg-&gt;list, &amp;rh-&gt;clean_regions);
+			spin_unlock_irq(&amp;rh-&gt;region_lock);
+		}
+		reg = nreg;
+	}
+	write_unlock_irq(&amp;rh-&gt;hash_lock);
+	read_lock(&amp;rh-&gt;hash_lock);
+
+	return reg;
+}
+
+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	reg = __rh_lookup(rh, region);
+	if (!reg)
+		reg = __rh_alloc(rh, region);
+
+	return reg;
+}
+
+static int rh_state(struct region_hash *rh, region_t region, int may_block)
+{
+	int r;
+	struct region *reg;
+
+	read_lock(&amp;rh-&gt;hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&amp;rh-&gt;hash_lock);
+
+	if (reg)
+		return reg-&gt;state;
+
+	/*
+	 * The region wasn't in the hash, so we fall back to the
+	 * dirty log.
+	 */
+	r = rh-&gt;log-&gt;type-&gt;in_sync(rh-&gt;log, region, may_block);
+
+	/*
+	 * Any error from the dirty log (e.g. -EWOULDBLOCK) gets
+	 * taken as RH_NOSYNC.
+	 */
+	return r == 1 ? RH_CLEAN : RH_NOSYNC;
+}
+
+static inline int rh_in_sync(struct region_hash *rh,
+			     region_t region, int may_block)
+{
+	int state = rh_state(rh, region, may_block);
+	return state == RH_CLEAN || state == RH_DIRTY;
+}
+
+static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
+{
+	struct buffer_head *nbh;
+
+	while (bh) {
+		nbh = bh-&gt;b_reqnext;
+		queue_bh(ms, bh, WRITE);
+		bh = nbh;
+	}
+}
+
+static void rh_update_states(struct region_hash *rh)
+{
+	struct list_head *tmp, *tmp2;
+	struct region *reg;
+
+	LIST_HEAD(clean);
+	LIST_HEAD(recovered);
+
+	/*
+	 * Quickly grab the lists.
+	 */
+	write_lock_irq(&amp;rh-&gt;hash_lock);
+	spin_lock(&amp;rh-&gt;region_lock);
+	if (!list_empty(&amp;rh-&gt;clean_regions)) {
+		list_splice(&amp;rh-&gt;clean_regions, &amp;clean);
+		INIT_LIST_HEAD(&amp;rh-&gt;clean_regions);
+
+		list_for_each_entry (reg, &amp;clean, list) {
+			rh-&gt;log-&gt;type-&gt;clear_region(rh-&gt;log, reg-&gt;key);
+			list_del(&amp;reg-&gt;hash_list);
+		}
+	}
+
+	if (!list_empty(&amp;rh-&gt;recovered_regions)) {
+		list_splice(&amp;rh-&gt;recovered_regions, &amp;recovered);
+		INIT_LIST_HEAD(&amp;rh-&gt;recovered_regions);
+
+		list_for_each_entry (reg, &amp;recovered, list)
+			list_del(&amp;reg-&gt;hash_list);
+	}
+	spin_unlock(&amp;rh-&gt;region_lock);
+	write_unlock_irq(&amp;rh-&gt;hash_lock);
+
+	/*
+	 * All the regions on the recovered and clean lists have
+	 * now been pulled out of the system, so no need to do
+	 * any more locking.
+	 */
+	list_for_each_safe (tmp, tmp2, &amp;recovered) {
+		reg = list_entry(tmp, struct region, list);
+
+		rh-&gt;log-&gt;type-&gt;complete_resync_work(rh-&gt;log, reg-&gt;key, 1);
+		dispatch_buffers(rh-&gt;ms, reg-&gt;delayed_bhs);
+		up(&amp;rh-&gt;recovery_count);
+		mempool_free(reg, rh-&gt;region_pool);
+	}
+
+	list_for_each_safe (tmp, tmp2, &amp;clean) {
+		reg = list_entry(tmp, struct region, list);
+		mempool_free(reg, rh-&gt;region_pool);
+	}
+}
+
+static void rh_inc(struct region_hash *rh, region_t region)
+{
+	struct region *reg;
+
+	read_lock(&amp;rh-&gt;hash_lock);
+	reg = __rh_find(rh, region);
+	if (reg-&gt;state == RH_CLEAN) {
+		rh-&gt;log-&gt;type-&gt;mark_region(rh-&gt;log, reg-&gt;key);
+
+		spin_lock_irq(&amp;rh-&gt;region_lock);
+		reg-&gt;state = RH_DIRTY;
+		list_del_init(&amp;reg-&gt;list);	/* take off the clean list */
+		spin_unlock_irq(&amp;rh-&gt;region_lock);
+	}
+
+	atomic_inc(&amp;reg-&gt;pending);
+	read_unlock(&amp;rh-&gt;hash_lock);
+}
+
+static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
+{
+	struct buffer_head *bh;
+
+	for (bh = buffers-&gt;head; bh; bh = bh-&gt;b_reqnext)
+		rh_inc(rh, bh_to_region(rh, bh));
+}
+
+static void rh_dec(struct region_hash *rh, region_t region)
+{
+	unsigned long flags;
+	struct region *reg;
+	int wake = 0;
+
+	read_lock(&amp;rh-&gt;hash_lock);
+	reg = __rh_lookup(rh, region);
+	read_unlock(&amp;rh-&gt;hash_lock);
+
+	if (atomic_dec_and_test(&amp;reg-&gt;pending)) {
+		spin_lock_irqsave(&amp;rh-&gt;region_lock, flags);
+		if (reg-&gt;state == RH_RECOVERING) {
+			list_add_tail(&amp;reg-&gt;list, &amp;rh-&gt;quiesced_regions);
+		} else {
+			reg-&gt;state = RH_CLEAN;
+			list_add(&amp;reg-&gt;list, &amp;rh-&gt;clean_regions);
+		}
+		spin_unlock_irqrestore(&amp;rh-&gt;region_lock, flags);
+		wake = 1;
+	}
+
+	if (wake)
+		dm_daemon_wake(&amp;_kmirrord);
+}
+
+/*
+ * Starts quiescing a region in preparation for recovery.
+ */
+static int __rh_recovery_prepare(struct region_hash *rh)
+{
+	int r;
+	struct region *reg;
+	region_t region;
+
+	/*
+	 * Ask the dirty log what's next.
+	 */
+	r = rh-&gt;log-&gt;type-&gt;get_resync_work(rh-&gt;log, &amp;region);
+	if (r &lt;= 0)
+		return r;
+
+	/*
+	 * Get this region, and start it quiescing by setting the
+	 * recovering flag.
+	 */
+	read_lock(&amp;rh-&gt;hash_lock);
+	reg = __rh_find(rh, region);
+	read_unlock(&amp;rh-&gt;hash_lock);
+
+	spin_lock_irq(&amp;rh-&gt;region_lock);
+	reg-&gt;state = RH_RECOVERING;
+
+	/* Already quiesced ? */
+	if (atomic_read(&amp;reg-&gt;pending))
+		list_del_init(&amp;reg-&gt;list);
+
+	else {
+		list_del_init(&amp;reg-&gt;list);
+		list_add(&amp;reg-&gt;list, &amp;rh-&gt;quiesced_regions);
+	}
+	spin_unlock_irq(&amp;rh-&gt;region_lock);
+
+	return 1;
+}
+
+static void rh_recovery_prepare(struct region_hash *rh)
+{
+	while (!down_trylock(&amp;rh-&gt;recovery_count))
+		if (__rh_recovery_prepare(rh) &lt;= 0) {
+			up(&amp;rh-&gt;recovery_count);
+			break;
+		}
+}
+
+/*
+ * Returns any quiesced regions.
+ */
+static struct region *rh_recovery_start(struct region_hash *rh)
+{
+	struct region *reg = NULL;
+
+	spin_lock_irq(&amp;rh-&gt;region_lock);
+	if (!list_empty(&amp;rh-&gt;quiesced_regions)) {
+		reg = list_entry(rh-&gt;quiesced_regions.next,
+				 struct region, list);
+		list_del_init(&amp;reg-&gt;list);	/* remove from the quiesced list */
+	}
+	spin_unlock_irq(&amp;rh-&gt;region_lock);
+
+	return reg;
+}
+
+/* FIXME: success ignored for now */
+static void rh_recovery_end(struct region *reg, int success)
+{
+	struct region_hash *rh = reg-&gt;rh;
+
+	spin_lock_irq(&amp;rh-&gt;region_lock);
+	list_add(&amp;reg-&gt;list, &amp;reg-&gt;rh-&gt;recovered_regions);
+	spin_unlock_irq(&amp;rh-&gt;region_lock);
+
+	dm_daemon_wake(&amp;_kmirrord);
+}
+
+static void rh_flush(struct region_hash *rh)
+{
+	rh-&gt;log-&gt;type-&gt;flush(rh-&gt;log);
+}
+
+static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
+{
+	struct region *reg;
+
+	read_lock(&amp;rh-&gt;hash_lock);
+	reg = __rh_find(rh, bh_to_region(rh, bh));
+	bh-&gt;b_reqnext = reg-&gt;delayed_bhs;
+	reg-&gt;delayed_bhs = bh;
+	read_unlock(&amp;rh-&gt;hash_lock);
+}
+
+static void rh_stop_recovery(struct region_hash *rh)
+{
+	int i;
+
+	/* wait for any recovering regions */
+	for (i = 0; i &lt; MAX_RECOVERY; i++)
+		down(&amp;rh-&gt;recovery_count);
+}
+
+static void rh_start_recovery(struct region_hash *rh)
+{
+	int i;
+
+	for (i = 0; i &lt; MAX_RECOVERY; i++)
+		up(&amp;rh-&gt;recovery_count);
+
+	dm_daemon_wake(&amp;_kmirrord);
+}
+
+/*-----------------------------------------------------------------
+ * Mirror set structures.
+ *---------------------------------------------------------------*/
+struct mirror {
+	atomic_t error_count;
+	struct dm_dev *dev;
+	sector_t offset;
+};
+
+struct mirror_set {
+	struct dm_target *ti;
+	struct list_head list;
+	struct region_hash rh;
+	struct kcopyd_client *kcopyd_client;
+
+	spinlock_t lock;	/* protects the next two lists */
+	struct buffer_list reads;
+	struct buffer_list writes;
+
+	/* recovery */
+	region_t nr_regions;
+	region_t sync_count;
+
+	unsigned int nr_mirrors;
+	struct mirror mirror[0];
+};
+
+/*
+ * Every mirror should look like this one.
+ */
+#define DEFAULT_MIRROR 0
+
+/*
+ * This is yucky.  We squirrel the mirror_set struct away inside
+ * b_reqnext for write buffers.  This is safe since the bh
+ * doesn't get submitted to the lower levels of the block layer.
+ */
+static struct mirror_set *bh_get_ms(struct buffer_head *bh)
+{
+	return (struct mirror_set *) bh-&gt;b_reqnext;
+}
+
+static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
+{
+	bh-&gt;b_reqnext = (struct buffer_head *) ms;
+}
+
+/*-----------------------------------------------------------------
+ * Recovery.
+ *
+ * When a mirror is first activated we may find that some regions
+ * are in the no-sync state.  We have to recover these by
+ * recopying from the default mirror to all the others.
+ *---------------------------------------------------------------*/
+static void recovery_complete(int read_err, unsigned int write_err,
+			      void *context)
+{
+	struct region *reg = (struct region *) context;
+	struct mirror_set *ms = reg-&gt;rh-&gt;ms;
+
+	/* FIXME: better error handling */
+	rh_recovery_end(reg, read_err || write_err);
+	if (++ms-&gt;sync_count == ms-&gt;nr_regions)
+		/* the sync is complete */
+		dm_table_event(ms-&gt;ti-&gt;table);
+}
+
+static int recover(struct mirror_set *ms, struct region *reg)
+{
+	int r;
+	unsigned int i;
+	struct io_region from, to[ms-&gt;nr_mirrors - 1], *dest;
+	struct mirror *m;
+	unsigned int flags = 0;
+
+	/* fill in the source */
+	m = ms-&gt;mirror + DEFAULT_MIRROR;
+	from.dev = m-&gt;dev-&gt;dev;
+	from.sector = m-&gt;offset + region_to_sector(reg-&gt;rh, reg-&gt;key);
+	if (reg-&gt;key == (ms-&gt;nr_regions - 1)) {
+		/*
+		 * The final region may be smaller than
+		 * region_size.
+		 */
+		from.count = ms-&gt;ti-&gt;len &amp; (reg-&gt;rh-&gt;region_size - 1);
+		if (!from.count)
+			from.count = reg-&gt;rh-&gt;region_size;
+	} else
+		from.count = reg-&gt;rh-&gt;region_size;
+
+	/* fill in the destinations */
+	for (i = 1; i &lt; ms-&gt;nr_mirrors; i++) {
+		m = ms-&gt;mirror + i;
+		dest = to + (i - 1);
+
+		dest-&gt;dev = m-&gt;dev-&gt;dev;
+		dest-&gt;sector = m-&gt;offset + region_to_sector(reg-&gt;rh, reg-&gt;key);
+		dest-&gt;count = from.count;
+	}
+
+	/* hand to kcopyd */
+	set_bit(KCOPYD_IGNORE_ERROR, &amp;flags);
+	r = kcopyd_copy(ms-&gt;kcopyd_client, &amp;from, ms-&gt;nr_mirrors - 1, to, flags,
+			recovery_complete, reg);
+
+	return r;
+}
+
+static void do_recovery(struct mirror_set *ms)
+{
+	int r;
+	struct region *reg;
+
+	/*
+	 * Start quiescing some regions.
+	 */
+	rh_recovery_prepare(&amp;ms-&gt;rh);
+
+	/*
+	 * Copy any already quiesced regions.
+	 */
+	while ((reg = rh_recovery_start(&amp;ms-&gt;rh))) {
+		r = recover(ms, reg);
+		if (r)
+			rh_recovery_end(reg, 0);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Reads
+ *---------------------------------------------------------------*/
+static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+{
+	/* FIXME: add read balancing */
+	return ms-&gt;mirror + DEFAULT_MIRROR;
+}
+
+/*
+ * remap a buffer to a particular mirror.
+ */
+static void map_buffer(struct mirror_set *ms,
+		       struct mirror *m, struct buffer_head *bh)
+{
+	bh-&gt;b_rdev = m-&gt;dev-&gt;dev;
+	bh-&gt;b_rsector = m-&gt;offset + (bh-&gt;b_rsector - ms-&gt;ti-&gt;begin);
+}
+
+static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
+{
+	region_t region;
+	struct buffer_head *bh;
+	struct mirror *m;
+
+	while ((bh = buffer_list_pop(reads))) {
+		region = bh_to_region(&amp;ms-&gt;rh, bh);
+
+		/*
+		 * We can only read balance if the region is in sync.
+		 */
+		if (rh_in_sync(&amp;ms-&gt;rh, region, 0))
+			m = choose_mirror(ms, bh-&gt;b_rsector);
+		else
+			m = ms-&gt;mirror + DEFAULT_MIRROR;
+
+		map_buffer(ms, m, bh);
+		generic_make_request(READ, bh);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * Writes.
+ *
+ * We do different things with the write io depending on the
+ * state of the region that it's in:
+ *
+ * SYNC: 	increment pending, use dm-io to write to *all* mirrors
+ * RECOVERING:	delay the io until recovery completes
+ * NOSYNC:	increment pending, just write to the default mirror
+ *---------------------------------------------------------------*/
+static void write_callback(unsigned int error, void *context)
+{
+	unsigned int i;
+	int uptodate = 1;
+	struct buffer_head *bh = (struct buffer_head *) context;
+	struct mirror_set *ms;
+
+	ms = bh_get_ms(bh);
+	bh_set_ms(bh, NULL);
+
+	/*
+	 * NOTE: We don't decrement the pending count here,
+	 * instead it is done by the target's end_io function.
+	 * This way we handle both writes to SYNC and NOSYNC
+	 * regions with the same code.
+	 */
+
+	if (error) {
+		/*
+		 * only error the io if all mirrors failed.
+		 * FIXME: bogus
+		 */
+		uptodate = 0;
+		for (i = 0; i &lt; ms-&gt;nr_mirrors; i++)
+			if (!test_bit(i, &amp;error)) {
+				uptodate = 1;
+				break;
+			}
+	}
+	bh-&gt;b_end_io(bh, uptodate);
+}
+
+static void do_write(struct mirror_set *ms, struct buffer_head *bh)
+{
+	unsigned int i;
+	struct io_region io[ms-&gt;nr_mirrors];
+	struct mirror *m;
+
+	for (i = 0; i &lt; ms-&gt;nr_mirrors; i++) {
+		m = ms-&gt;mirror + i;
+
+		io[i].dev = m-&gt;dev-&gt;dev;
+		io[i].sector = m-&gt;offset + (bh-&gt;b_rsector - ms-&gt;ti-&gt;begin);
+		io[i].count = bh-&gt;b_size &gt;&gt; 9;
+	}
+
+	bh_set_ms(bh, ms);
+	dm_io_async(ms-&gt;nr_mirrors, io, WRITE, bh-&gt;b_page,
+		    (unsigned int) bh-&gt;b_data &amp; ~PAGE_MASK, write_callback, bh);
+}
+
+static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
+{
+	int state;
+	struct buffer_head *bh;
+	struct buffer_list sync, nosync, recover, *this_list = NULL;
+
+	if (!writes-&gt;head)
+		return;
+
+	/*
+	 * Classify each write.
+	 */
+	buffer_list_init(&amp;sync);
+	buffer_list_init(&amp;nosync);
+	buffer_list_init(&amp;recover);
+
+	while ((bh = buffer_list_pop(writes))) {
+		state = rh_state(&amp;ms-&gt;rh, bh_to_region(&amp;ms-&gt;rh, bh), 1);
+		switch (state) {
+		case RH_CLEAN:
+		case RH_DIRTY:
+			this_list = &amp;sync;
+			break;
+
+		case RH_NOSYNC:
+			this_list = &amp;nosync;
+			break;
+
+		case RH_RECOVERING:
+			this_list = &amp;recover;
+			break;
+		}
+
+		buffer_list_add(this_list, bh);
+	}
+
+	/*
+	 * Increment the pending counts for any regions that will
+	 * be written to (writes to recover regions are going to
+	 * be delayed).
+	 */
+	rh_inc_pending(&amp;ms-&gt;rh, &amp;sync);
+	rh_inc_pending(&amp;ms-&gt;rh, &amp;nosync);
+	rh_flush(&amp;ms-&gt;rh);
+
+	/*
+	 * Dispatch io.
+	 */
+	while ((bh = buffer_list_pop(&amp;sync)))
+		do_write(ms, bh);
+
+	while ((bh = buffer_list_pop(&amp;recover)))
+		rh_delay(&amp;ms-&gt;rh, bh);
+
+	while ((bh = buffer_list_pop(&amp;nosync))) {
+		map_buffer(ms, ms-&gt;mirror + DEFAULT_MIRROR, bh);
+		generic_make_request(WRITE, bh);
+	}
+}
+
+/*-----------------------------------------------------------------
+ * kmirrord
+ *---------------------------------------------------------------*/
+static LIST_HEAD(_mirror_sets);
+static DECLARE_RWSEM(_mirror_sets_lock);
+
+static void do_mirror(struct mirror_set *ms)
+{
+	struct buffer_list reads, writes;
+
+	spin_lock(&amp;ms-&gt;lock);
+	memcpy(&amp;reads, &amp;ms-&gt;reads, sizeof(reads));
+	buffer_list_init(&amp;ms-&gt;reads);
+	memcpy(&amp;writes, &amp;ms-&gt;writes, sizeof(writes));
+	buffer_list_init(&amp;ms-&gt;writes);
+	spin_unlock(&amp;ms-&gt;lock);
+
+	rh_update_states(&amp;ms-&gt;rh);
+	do_recovery(ms);
+	do_reads(ms, &amp;reads);
+	do_writes(ms, &amp;writes);
+	run_task_queue(&amp;tq_disk);
+}
+
+static void do_work(void)
+{
+	struct mirror_set *ms;
+
+	down_read(&amp;_mirror_sets_lock);
+	list_for_each_entry (ms, &amp;_mirror_sets, list)
+		do_mirror(ms);
+	up_read(&amp;_mirror_sets_lock);
+}
+
+/*-----------------------------------------------------------------
+ * Target functions
+ *---------------------------------------------------------------*/
+static struct mirror_set *alloc_context(unsigned int nr_mirrors,
+					sector_t region_size,
+					struct dm_target *ti,
+					struct dirty_log *dl)
+{
+	size_t len;
+	struct mirror_set *ms = NULL;
+
+	if (array_too_big(sizeof(*ms), sizeof(ms-&gt;mirror[0]), nr_mirrors))
+		return NULL;
+
+	len = sizeof(*ms) + (sizeof(ms-&gt;mirror[0]) * nr_mirrors);
+
+	ms = kmalloc(len, GFP_KERNEL);
+	if (!ms) {
+		ti-&gt;error = "dm-mirror: Cannot allocate mirror context";
+		return NULL;
+	}
+
+	memset(ms, 0, len);
+	spin_lock_init(&amp;ms-&gt;lock);
+
+	ms-&gt;ti = ti;
+	ms-&gt;nr_mirrors = nr_mirrors;
+	ms-&gt;nr_regions = dm_div_up(ti-&gt;len, region_size);
+
+	if (rh_init(&amp;ms-&gt;rh, ms, dl, region_size, ms-&gt;nr_regions)) {
+		ti-&gt;error = "dm-mirror: Error creating dirty region hash";
+		kfree(ms);
+		return NULL;
+	}
+
+	return ms;
+}
+
+static void free_context(struct mirror_set *ms, struct dm_target *ti,
+			 unsigned int m)
+{
+	while (m--)
+		dm_put_device(ti, ms-&gt;mirror[m].dev);
+
+	rh_exit(&amp;ms-&gt;rh);
+	kfree(ms);
+}
+
+static inline int _check_region_size(struct dm_target *ti, sector_t size)
+{
+	return !(size % (PAGE_SIZE &gt;&gt; 9) || (size &amp; (size - 1)) ||
+		 size &gt; ti-&gt;len);
+}
+
+static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
+		      unsigned int mirror, char **argv)
+{
+	sector_t offset;
+
+	if (sscanf(argv[1], SECTOR_FORMAT, &amp;offset) != 1) {
+		ti-&gt;error = "dm-mirror: Invalid offset";
+		return -EINVAL;
+	}
+
+	if (dm_get_device(ti, argv[0], offset, ti-&gt;len,
+			  dm_table_get_mode(ti-&gt;table),
+			  &amp;ms-&gt;mirror[mirror].dev)) {
+		ti-&gt;error = "dm-mirror: Device lookup failure";
+		return -ENXIO;
+	}
+
+	ms-&gt;mirror[mirror].offset = offset;
+
+	return 0;
+}
+
+static int add_mirror_set(struct mirror_set *ms)
+{
+	down_write(&amp;_mirror_sets_lock);
+	list_add_tail(&amp;ms-&gt;list, &amp;_mirror_sets);
+	up_write(&amp;_mirror_sets_lock);
+	dm_daemon_wake(&amp;_kmirrord);
+
+	return 0;
+}
+
+static void del_mirror_set(struct mirror_set *ms)
+{
+	down_write(&amp;_mirror_sets_lock);
+	list_del(&amp;ms-&gt;list);
+	up_write(&amp;_mirror_sets_lock);
+}
+
+/*
+ * Create dirty log: log_type #log_params &lt;log_params&gt;
+ */
+static struct dirty_log *create_dirty_log(struct dm_target *ti,
+					  unsigned int argc, char **argv,
+					  unsigned int *args_used)
+{
+	unsigned int param_count;
+	struct dirty_log *dl;
+
+	if (argc &lt; 2) {
+		ti-&gt;error = "dm-mirror: Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	if (sscanf(argv[1], "%u", &amp;param_count) != 1 || param_count != 1) {
+		ti-&gt;error = "dm-mirror: Invalid mirror log argument count";
+		return NULL;
+	}
+
+	*args_used = 2 + param_count;
+
+	if (argc &lt; *args_used) {
+		ti-&gt;error = "dm-mirror: Insufficient mirror log arguments";
+		return NULL;
+	}
+
+	dl = dm_create_dirty_log(argv[0], ti-&gt;len, param_count, argv + 2);
+	if (!dl) {
+		ti-&gt;error = "dm-mirror: Error creating mirror dirty log";
+		return NULL;
+	}
+
+	if (!_check_region_size(ti, dl-&gt;type-&gt;get_region_size(dl))) {
+		ti-&gt;error = "dm-mirror: Invalid region size";
+		dm_destroy_dirty_log(dl);
+		return NULL;
+	}
+
+	return dl;
+}
+
+/*
+ * Construct a mirror mapping:
+ *
+ * log_type #log_params &lt;log_params&gt;
+ * #mirrors [mirror_path offset]{2,}
+ *
+ * For now, #log_params = 1, log_type = "core"
+ *
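+ * e.g. "core 1 1024 2 /dev/sda1 0 /dev/sdb1 0" (device paths illustrative)
+ *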
+ */
+#define DM_IO_PAGES 64
+static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	unsigned int nr_mirrors, m, args_used;
+	struct mirror_set *ms;
+	struct dirty_log *dl;
+
+	dl = create_dirty_log(ti, argc, argv, &amp;args_used);
+	if (!dl)
+		return -EINVAL;
+
+	argv += args_used;
+	argc -= args_used;
+
+	if (!argc || sscanf(argv[0], "%u", &amp;nr_mirrors) != 1 ||
+	    nr_mirrors &lt; 2) {
+		ti-&gt;error = "dm-mirror: Invalid number of mirrors";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	argv++, argc--;
+
+	if (argc != nr_mirrors * 2) {
+		ti-&gt;error = "dm-mirror: Wrong number of mirror arguments";
+		dm_destroy_dirty_log(dl);
+		return -EINVAL;
+	}
+
+	ms = alloc_context(nr_mirrors, dl-&gt;type-&gt;get_region_size(dl), ti, dl);
+	if (!ms) {
+		dm_destroy_dirty_log(dl);
+		return -ENOMEM;
+	}
+
+	/* Get the mirror parameter sets */
+	for (m = 0; m &lt; nr_mirrors; m++) {
+		r = get_mirror(ms, ti, m, argv);
+		if (r) {
+			free_context(ms, ti, m);
+			return r;
+		}
+		argv += 2;
+		argc -= 2;
+	}
+
+	ti-&gt;private = ms;
+
+	r = kcopyd_client_create(DM_IO_PAGES, &amp;ms-&gt;kcopyd_client);
+	if (r) {
+		free_context(ms, ti, ms-&gt;nr_mirrors);
+		return r;
+	}
+
+	add_mirror_set(ms);
+	return 0;
+}
+
+static void mirror_dtr(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti-&gt;private;
+
+	del_mirror_set(ms);
+	kcopyd_client_destroy(ms-&gt;kcopyd_client);
+	free_context(ms, ti, ms-&gt;nr_mirrors);
+}
+
+static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
+{
+	int wake = 0;
+	struct buffer_list *bl;
+
+	bl = (rw == WRITE) ? &amp;ms-&gt;writes : &amp;ms-&gt;reads;
+	spin_lock(&amp;ms-&gt;lock);
+	wake = !(bl-&gt;head);
+	buffer_list_add(bl, bh);
+	spin_unlock(&amp;ms-&gt;lock);
+
+	if (wake)
+		dm_daemon_wake(&amp;_kmirrord);
+}
+
+/*
+ * Mirror mapping function
+ */
+static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
+		      int rw, union map_info *map_context)
+{
+	int r;
+	struct mirror *m;
+	struct mirror_set *ms = ti-&gt;private;
+
+	/* FIXME: nasty hack, 32 bit sector_t only */
+	map_context-&gt;ll = bh-&gt;b_rsector / ms-&gt;rh.region_size;
+
+	if (rw == WRITE) {
+		queue_bh(ms, bh, rw);
+		return 0;
+	}
+
+	r = ms-&gt;rh.log-&gt;type-&gt;in_sync(ms-&gt;rh.log, bh_to_region(&amp;ms-&gt;rh, bh), 0);
+	if (r &lt; 0 &amp;&amp; r != -EWOULDBLOCK)
+		return r;
+
+	if (r == -EWOULDBLOCK)	/* FIXME: ugly */
+		r = 0;
+
+	/*
+	 * We don't want to fast track a recovery just for a read
+	 * ahead.  So we just let it silently fail.
+	 * FIXME: get rid of this.
+	 */
+	if (!r &amp;&amp; rw == READA)
+		return -EIO;
+
+	if (!r) {
+		/* Pass this io over to the daemon */
+		queue_bh(ms, bh, rw);
+		return 0;
+	}
+
+	m = choose_mirror(ms, bh-&gt;b_rsector);
+	if (!m)
+		return -EIO;
+
+	map_buffer(ms, m, bh);
+	return 1;
+}
+
+static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
+			 int rw, int error, union map_info *map_context)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti-&gt;private;
+	region_t region = map_context-&gt;ll;
+
+	/*
+	 * We need to dec pending if this was a write.
+	 */
+	if (rw == WRITE)
+		rh_dec(&amp;ms-&gt;rh, region);
+
+	return 0;
+}
+
+static void mirror_suspend(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti-&gt;private;
+	rh_stop_recovery(&amp;ms-&gt;rh);
+}
+
+static void mirror_resume(struct dm_target *ti)
+{
+	struct mirror_set *ms = (struct mirror_set *) ti-&gt;private;
+	rh_start_recovery(&amp;ms-&gt;rh);
+}
+
+static int mirror_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	unsigned int m, sz = 0;
+	struct mirror_set *ms = (struct mirror_set *) ti-&gt;private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		sz += snprintf(result + sz, maxlen - sz, "%d ", ms-&gt;nr_mirrors);
+
+		for (m = 0; m &lt; ms-&gt;nr_mirrors; m++)
+			sz += snprintf(result + sz, maxlen - sz, "%s ",
+				       dm_kdevname(ms-&gt;mirror[m].dev-&gt;dev));
+
+		sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
+			       ms-&gt;sync_count, ms-&gt;nr_regions);
+		break;
+
+	case STATUSTYPE_TABLE:
+		sz += snprintf(result + sz, maxlen - sz,
+			       "%s 1 " SECTOR_FORMAT " %d ",
+			       ms-&gt;rh.log-&gt;type-&gt;name, ms-&gt;rh.region_size,
+			       ms-&gt;nr_mirrors);
+
+		for (m = 0; m &lt; ms-&gt;nr_mirrors; m++)
+			sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
+				       dm_kdevname(ms-&gt;mirror[m].dev-&gt;dev),
+				       ms-&gt;mirror[m].offset);
+	}
+
+	return 0;
+}
+
+static struct target_type mirror_target = {
+	.name	 = "mirror",
+	.version = {1, 0, 1},
+	.module	 = THIS_MODULE,
+	.ctr	 = mirror_ctr,
+	.dtr	 = mirror_dtr,
+	.map	 = mirror_map,
+	.end_io	 = mirror_end_io,
+	.suspend = mirror_suspend,
+	.resume	 = mirror_resume,
+	.status	 = mirror_status,
+};
+
+static int __init dm_mirror_init(void)
+{
+	int r;
+
+	r = dm_dirty_log_init();
+	if (r)
+		return r;
+
+	r = dm_daemon_start(&amp;_kmirrord, "kmirrord", do_work);
+	if (r) {
+		DMERR("couldn't start kmirrord");
+		dm_dirty_log_exit();
+		return r;
+	}
+
+	r = dm_register_target(&amp;mirror_target);
+	if (r &lt; 0) {
+		DMERR("%s: Failed to register mirror target",
+		      mirror_target.name);
+		dm_dirty_log_exit();
+		dm_daemon_stop(&amp;_kmirrord);
+	}
+
+	return r;
+}
+
+static void __exit dm_mirror_exit(void)
+{
+	int r;
+
+	r = dm_unregister_target(&amp;mirror_target);
+	if (r &lt; 0)
+		DMERR("%s: unregister failed %d", mirror_target.name, r);
+
+	dm_daemon_stop(&amp;_kmirrord);
+	dm_dirty_log_exit();
+}
+
+/* Module hooks */
+module_init(dm_mirror_init);
+module_exit(dm_mirror_exit);
+
+MODULE_DESCRIPTION(DM_NAME " mirror target");
+MODULE_AUTHOR("Heinz Mauelshagen &lt;mge@sistina.com&gt;");
+MODULE_LICENSE("GPL");
--- linux-2.4.26-rc1/drivers/md/dm-snapshot.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-snapshot.c	Wed Mar 31 13:52:03 2004
@@ -0,0 +1,1237 @@
+/*
+ * dm-snapshot.c
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include &lt;linux/config.h&gt;
+#include &lt;linux/ctype.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/list.h&gt;
+#include &lt;linux/fs.h&gt;
+#include &lt;linux/blkdev.h&gt;
+#include &lt;linux/mempool.h&gt;
+#include &lt;linux/device-mapper.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+
+#include "dm-snapshot.h"
+#include "kcopyd.h"
+
+/*
+ * FIXME: Remove this before release.
+ */
+#if 0
+#define DMDEBUG(x...) DMWARN(x)
+#else
+#define DMDEBUG(x...)
+#endif
+
+/*
+ * The percentage increment we will wake up users at
+ */
+#define WAKE_UP_PERCENT 5
+
+/*
+ * kcopyd priority of snapshot operations
+ */
+#define SNAPSHOT_COPY_PRIORITY 2
+
+/*
+ * Each snapshot reserves this many pages for io
+ * FIXME: calculate this
+ */
+#define SNAPSHOT_PAGES 256
+
+struct pending_exception {
+	struct exception e;
+
+	/*
+	 * Origin buffers waiting for this to complete are held
+	 * in a list (using b_reqnext).
+	 */
+	struct buffer_head *origin_bhs;
+	struct buffer_head *snapshot_bhs;
+
+	/*
+	 * Other pending_exceptions that are processing this
+	 * chunk.  When this list is empty, we know we can
+	 * complete the origins.
+	 */
+	struct list_head siblings;
+
+	/* Pointer back to snapshot context */
+	struct dm_snapshot *snap;
+
+	/*
+	 * 1 indicates the exception has already been sent to
+	 * kcopyd.
+	 */
+	int started;
+};
+
+/*
+ * Hash table mapping origin volumes to lists of snapshots and
+ * a lock to protect it
+ */
+static kmem_cache_t *exception_cache;
+static kmem_cache_t *pending_cache;
+static mempool_t *pending_pool;
+
+/*
+ * One of these per registered origin, held in the snapshot_origins hash
+ */
+struct origin {
+	/* The origin device */
+	kdev_t dev;
+
+	struct list_head hash_list;
+
+	/* List of snapshots for this origin */
+	struct list_head snapshots;
+};
+
+/*
+ * Size of the hash table for origin volumes. If we make this
+ * the size of the minors list then it should be nearly perfect
+ */
+#define ORIGIN_HASH_SIZE 256
+#define ORIGIN_MASK      0xFF
+static struct list_head *_origins;
+static struct rw_semaphore _origins_lock;
+
+static int init_origin_hash(void)
+{
+	int i;
+
+	_origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
+			   GFP_KERNEL);
+	if (!_origins) {
+		DMERR("Device mapper: Snapshot: unable to allocate memory");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i &lt; ORIGIN_HASH_SIZE; i++)
+		INIT_LIST_HEAD(_origins + i);
+	init_rwsem(&amp;_origins_lock);
+
+	return 0;
+}
+
+static void exit_origin_hash(void)
+{
+	kfree(_origins);
+}
+
+static inline unsigned int origin_hash(kdev_t dev)
+{
+	return MINOR(dev) &amp; ORIGIN_MASK;
+}
+
+static struct origin *__lookup_origin(kdev_t origin)
+{
+	struct list_head *slist;
+	struct list_head *ol;
+	struct origin *o;
+
+	ol = &amp;_origins[origin_hash(origin)];
+	list_for_each(slist, ol) {
+		o = list_entry(slist, struct origin, hash_list);
+
+		if (o-&gt;dev == origin)
+			return o;
+	}
+
+	return NULL;
+}
+
+static void __insert_origin(struct origin *o)
+{
+	struct list_head *sl = &amp;_origins[origin_hash(o-&gt;dev)];
+	list_add_tail(&amp;o-&gt;hash_list, sl);
+}
+
+/*
+ * Make a note of the snapshot and its origin so we can look it
+ * up when the origin has a write on it.
+ */
+static int register_snapshot(struct dm_snapshot *snap)
+{
+	struct origin *o;
+	kdev_t dev = snap-&gt;origin-&gt;dev;
+
+	down_write(&amp;_origins_lock);
+	o = __lookup_origin(dev);
+
+	if (!o) {
+		/* New origin */
+		o = kmalloc(sizeof(*o), GFP_KERNEL);
+		if (!o) {
+			up_write(&amp;_origins_lock);
+			return -ENOMEM;
+		}
+
+		/* Initialise the struct */
+		INIT_LIST_HEAD(&amp;o-&gt;snapshots);
+		o-&gt;dev = dev;
+
+		__insert_origin(o);
+	}
+
+	list_add_tail(&amp;snap-&gt;list, &amp;o-&gt;snapshots);
+
+	up_write(&amp;_origins_lock);
+	return 0;
+}
+
+static void unregister_snapshot(struct dm_snapshot *s)
+{
+	struct origin *o;
+
+	down_write(&amp;_origins_lock);
+	o = __lookup_origin(s-&gt;origin-&gt;dev);
+
+	list_del(&amp;s-&gt;list);
+	if (list_empty(&amp;o-&gt;snapshots)) {
+		list_del(&amp;o-&gt;hash_list);
+		kfree(o);
+	}
+
+	up_write(&amp;_origins_lock);
+}
+
+/*
+ * Implementation of the exception hash tables.
+ */
+static int init_exception_table(struct exception_table *et, uint32_t size)
+{
+	unsigned int i;
+
+	et-&gt;hash_mask = size - 1;
+	et-&gt;table = vcalloc(size, sizeof(struct list_head));
+	if (!et-&gt;table)
+		return -ENOMEM;
+
+	for (i = 0; i &lt; size; i++)
+		INIT_LIST_HEAD(et-&gt;table + i);
+
+	return 0;
+}
+
+static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
+{
+	struct list_head *slot, *entry, *temp;
+	struct exception *ex;
+	int i, size;
+
+	size = et-&gt;hash_mask + 1;
+	for (i = 0; i &lt; size; i++) {
+		slot = et-&gt;table + i;
+
+		list_for_each_safe(entry, temp, slot) {
+			ex = list_entry(entry, struct exception, hash_list);
+			kmem_cache_free(mem, ex);
+		}
+	}
+
+	vfree(et-&gt;table);
+}
+
+/*
+ * FIXME: check how this hash fn is performing.
+ */
+static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
+{
+	return chunk &amp; et-&gt;hash_mask;
+}
+
+static void insert_exception(struct exception_table *eh, struct exception *e)
+{
+	struct list_head *l = &amp;eh-&gt;table[exception_hash(eh, e-&gt;old_chunk)];
+	list_add(&amp;e-&gt;hash_list, l);
+}
+
+static inline void remove_exception(struct exception *e)
+{
+	list_del(&amp;e-&gt;hash_list);
+}
+
+/*
+ * Return the exception data for a sector, or NULL if not
+ * remapped.
+ */
+static struct exception *lookup_exception(struct exception_table *et,
+					  chunk_t chunk)
+{
+	struct list_head *slot, *el;
+	struct exception *e;
+
+	slot = &amp;et-&gt;table[exception_hash(et, chunk)];
+	list_for_each(el, slot) {
+		e = list_entry(el, struct exception, hash_list);
+		if (e-&gt;old_chunk == chunk)
+			return e;
+	}
+
+	return NULL;
+}
+
+static inline struct exception *alloc_exception(void)
+{
+	struct exception *e;
+
+	e = kmem_cache_alloc(exception_cache, GFP_NOIO);
+	if (!e)
+		e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
+
+	return e;
+}
+
+static inline void free_exception(struct exception *e)
+{
+	kmem_cache_free(exception_cache, e);
+}
+
+static inline struct pending_exception *alloc_pending_exception(void)
+{
+	return mempool_alloc(pending_pool, GFP_NOIO);
+}
+
+static inline void free_pending_exception(struct pending_exception *pe)
+{
+	mempool_free(pe, pending_pool);
+}
+
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
+{
+	struct exception *e;
+
+	e = alloc_exception();
+	if (!e)
+		return -ENOMEM;
+
+	e-&gt;old_chunk = old;
+	e-&gt;new_chunk = new;
+	insert_exception(&amp;s-&gt;complete, e);
+	return 0;
+}
+
+/*
+ * Hard coded magic: budget roughly 2% of physical memory for the hash bucket heads.
+ */
+static int calc_max_buckets(void)
+{
+	unsigned long mem;
+
+	mem = num_physpages &lt;&lt; PAGE_SHIFT;
+	mem /= 50;
+	mem /= sizeof(struct list_head);
+
+	return mem;
+}
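+
+/*
+ * Worked example for calc_max_buckets() above: on a box with 1GiB of
+ * RAM and 4K pages, num_physpages &lt;&lt; PAGE_SHIFT is ~2^30; dividing by
+ * 50 budgets about 21MB for bucket heads, and dividing by
+ * sizeof(struct list_head) (8 bytes on a 32-bit build) caps the table
+ * at roughly 2.7 million buckets.
+ */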
+
+/*
+ * Rounds a number down to a power of 2.
+ */
+static inline uint32_t round_down(uint32_t n)
+{
+	while (n &amp; (n - 1))
+		n &amp;= (n - 1);
+	return n;
+}
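+
+/*
+ * e.g. round_down(1000) strips the low set bits one at a time
+ * (1000 -&gt; 992 -&gt; 960 -&gt; 896 -&gt; 768 -&gt; 512) and returns 512, while a
+ * power of two such as 64 is returned unchanged.
+ */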
+
+/*
+ * Allocate room for a suitable hash table.
+ */
+static int init_hash_tables(struct dm_snapshot *s)
+{
+	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+
+	/*
+	 * Calculate based on the size of the original volume or
+	 * the COW volume...
+	 */
+	cow_dev_size = get_dev_size(s-&gt;cow-&gt;dev);
+	origin_dev_size = get_dev_size(s-&gt;origin-&gt;dev);
+	max_buckets = calc_max_buckets();
+
+	hash_size = min(origin_dev_size, cow_dev_size) / s-&gt;chunk_size;
+	hash_size = min(hash_size, max_buckets);
+
+	/* Round it down to a power of 2 */
+	hash_size = round_down(hash_size);
+	if (init_exception_table(&amp;s-&gt;complete, hash_size))
+		return -ENOMEM;
+
+	/*
+	 * Allocate hash table for in-flight exceptions.
+	 * Make this smaller than the real hash table.
+	 */
+	hash_size &gt;&gt;= 3;
+	if (!hash_size)
+		hash_size = 64;
+
+	if (init_exception_table(&amp;s-&gt;pending, hash_size)) {
+		exit_exception_table(&amp;s-&gt;complete, exception_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Round a number up to the nearest 'size' boundary.  size must
+ * be a power of 2.
+ */
+static inline ulong round_up(ulong n, ulong size)
+{
+	size--;
+	return (n + size) &amp; ~size;
+}
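+
+/*
+ * e.g. with 4K pages and 512-byte sectors PAGE_SIZE / SECTOR_SIZE is 8,
+ * so a requested chunk size of 9 sectors is silently rounded up to
+ * round_up(9, 8) == 16 by the snapshot constructor below.
+ */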
+
+/*
+ * Construct a snapshot mapping: &lt;origin_dev&gt; &lt;COW-dev&gt; &lt;p/n&gt; &lt;chunk-size&gt;
+ */
+static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct dm_snapshot *s;
+	unsigned long chunk_size;
+	int r = -EINVAL;
+	char persistent;
+	char *origin_path;
+	char *cow_path;
+	char *value;
+	int blocksize;
+
+	if (argc &lt; 4) {
+		ti-&gt;error = "dm-snapshot: requires exactly 4 arguments";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	origin_path = argv[0];
+	cow_path = argv[1];
+	persistent = toupper(*argv[2]);
+
+	if (persistent != 'P' &amp;&amp; persistent != 'N') {
+		ti-&gt;error = "Persistent flag is not P or N";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	chunk_size = simple_strtoul(argv[3], &amp;value, 10);
+	if (chunk_size == 0 || value == NULL) {
+		ti-&gt;error = "Invalid chunk size";
+		r = -EINVAL;
+		goto bad1;
+	}
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (s == NULL) {
+		ti-&gt;error = "Cannot allocate snapshot context private "
+		    "structure";
+		r = -ENOMEM;
+		goto bad1;
+	}
+
+	r = dm_get_device(ti, origin_path, 0, ti-&gt;len, FMODE_READ, &amp;s-&gt;origin);
+	if (r) {
+		ti-&gt;error = "Cannot get origin device";
+		goto bad2;
+	}
+
+	/* FIXME: get cow length */
+	r = dm_get_device(ti, cow_path, 0, 0,
+			  FMODE_READ | FMODE_WRITE, &amp;s-&gt;cow);
+	if (r) {
+		dm_put_device(ti, s-&gt;origin);
+		ti-&gt;error = "Cannot get COW device";
+		goto bad2;
+	}
+
+	/*
+	 * Chunk size must be multiple of page size.  Silently
+	 * round up if it's not.
+	 */
+	chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
+
+	/* Validate the chunk size against the device block size */
+	blocksize = get_hardsect_size(s-&gt;cow-&gt;dev);
+	if (chunk_size % (blocksize / SECTOR_SIZE)) {
+		ti-&gt;error = "Chunk size is not a multiple of device blocksize";
+		r = -EINVAL;
+		goto bad3;
+	}
+
+	/* Check the sizes are small enough to fit in one kiovec */
+	if (chunk_size &gt; KIO_MAX_SECTORS) {
+		ti-&gt;error = "Chunk size is too big";
+		r = -EINVAL;
+		goto bad3;
+	}
+
+	/* Check chunk_size is a power of 2 */
+	if (chunk_size &amp; (chunk_size - 1)) {
+		ti-&gt;error = "Chunk size is not a power of 2";
+		r = -EINVAL;
+		goto bad3;
+	}
+
+	s-&gt;chunk_size = chunk_size;
+	s-&gt;chunk_mask = chunk_size - 1;
+	s-&gt;type = persistent;
+	for (s-&gt;chunk_shift = 0; chunk_size;
+	     s-&gt;chunk_shift++, chunk_size &gt;&gt;= 1)
+		;
+	s-&gt;chunk_shift--;
+
+	s-&gt;valid = 1;
+	s-&gt;have_metadata = 0;
+	s-&gt;last_percent = 0;
+	init_rwsem(&amp;s-&gt;lock);
+	s-&gt;table = ti-&gt;table;
+
+	/* Allocate hash table for COW data */
+	if (init_hash_tables(s)) {
+		ti-&gt;error = "Unable to allocate hash table space";
+		r = -ENOMEM;
+		goto bad3;
+	}
+
+	/*
+	 * Check the persistent flag - done here because we need the iobuf
+	 * to check the LV header
+	 */
+	s-&gt;store.snap = s;
+
+	if (persistent == 'P')
+		r = dm_create_persistent(&amp;s-&gt;store, s-&gt;chunk_size);
+	else
+		r = dm_create_transient(&amp;s-&gt;store, s, blocksize);
+
+	if (r) {
+		ti-&gt;error = "Couldn't create exception store";
+		r = -EINVAL;
+		goto bad4;
+	}
+
+	r = kcopyd_client_create(SNAPSHOT_PAGES, &amp;s-&gt;kcopyd_client);
+	if (r) {
+		ti-&gt;error = "Could not create kcopyd client";
+		goto bad5;
+	}
+
+	/* Flush IO to the origin device */
+	fsync_dev(s-&gt;origin-&gt;dev);
+
+	/* Add snapshot to the list of snapshots for this origin */
+	if (register_snapshot(s)) {
+		r = -EINVAL;
+		ti-&gt;error = "Cannot register snapshot origin";
+		goto bad6;
+	}
+
+	ti-&gt;private = s;
+	return 0;
+
+ bad6:
+	kcopyd_client_destroy(s-&gt;kcopyd_client);
+
+ bad5:
+	s-&gt;store.destroy(&amp;s-&gt;store);
+
+ bad4:
+	exit_exception_table(&amp;s-&gt;pending, pending_cache);
+	exit_exception_table(&amp;s-&gt;complete, exception_cache);
+
+ bad3:
+	dm_put_device(ti, s-&gt;cow);
+	dm_put_device(ti, s-&gt;origin);
+
+ bad2:
+	kfree(s);
+
+ bad1:
+	return r;
+}
+
+static void snapshot_dtr(struct dm_target *ti)
+{
+	struct dm_snapshot *s = (struct dm_snapshot *) ti-&gt;private;
+
+	dm_table_event(ti-&gt;table);
+
+	unregister_snapshot(s);
+
+	exit_exception_table(&amp;s-&gt;pending, pending_cache);
+	exit_exception_table(&amp;s-&gt;complete, exception_cache);
+
+	/* Deallocate memory used */
+	s-&gt;store.destroy(&amp;s-&gt;store);
+
+	dm_put_device(ti, s-&gt;origin);
+	dm_put_device(ti, s-&gt;cow);
+	kcopyd_client_destroy(s-&gt;kcopyd_client);
+	kfree(s);
+}
+
+/*
+ * We hold lists of buffer_heads, using the b_reqnext field.
+ */
+static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
+{
+	bh-&gt;b_reqnext = *queue;
+	*queue = bh;
+}
+
+/*
+ * FIXME: inefficient.
+ */
+static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
+{
+	while (*queue)
+		queue = &amp;((*queue)-&gt;b_reqnext);
+
+	*queue = bhs;
+}
+
+/*
+ * Flush a list of buffers.
+ */
+static void flush_buffers(struct buffer_head *bh)
+{
+	struct buffer_head *n;
+
+	DMDEBUG("begin flush");
+	while (bh) {
+		n = bh-&gt;b_reqnext;
+		bh-&gt;b_reqnext = NULL;
+		DMDEBUG("flushing %p", bh);
+		generic_make_request(WRITE, bh);
+		bh = n;
+	}
+
+	run_task_queue(&amp;tq_disk);
+}
+
+/*
+ * Error a list of buffers.
+ */
+static void error_buffers(struct buffer_head *bh)
+{
+	struct buffer_head *n;
+
+	while (bh) {
+		n = bh-&gt;b_reqnext;
+		bh-&gt;b_reqnext = NULL;
+		buffer_IO_error(bh);
+		bh = n;
+	}
+}
+
+static struct buffer_head *__flush_bhs(struct pending_exception *pe)
+{
+	struct pending_exception *sibling;
+
+	if (list_empty(&amp;pe-&gt;siblings))
+		return pe-&gt;origin_bhs;
+
+	sibling = list_entry(pe-&gt;siblings.next,
+			     struct pending_exception, siblings);
+
+	list_del(&amp;pe-&gt;siblings);
+
+	/* FIXME: I think there's a race on SMP machines here, add spin lock */
+	queue_buffers(&amp;sibling-&gt;origin_bhs, pe-&gt;origin_bhs);
+
+	return NULL;
+}
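+
+/*
+ * Example of the sibling hand-off above: if one origin write triggers
+ * copies in three snapshots, __origin_write() links the three
+ * pending_exceptions through their 'siblings' lists.  As each copy
+ * completes, __flush_bhs() moves its queued origin bhs onto a
+ * remaining sibling; only the last pe (whose sibling list is empty)
+ * returns the bhs, so the origin write is only released once every
+ * copy has finished.
+ */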
+
+static void pending_complete(struct pending_exception *pe, int success)
+{
+	struct exception *e;
+	struct dm_snapshot *s = pe-&gt;snap;
+	struct buffer_head *flush = NULL;
+
+	if (success) {
+		e = alloc_exception();
+		if (!e) {
+			DMWARN("Unable to allocate exception.");
+			down_write(&amp;s-&gt;lock);
+			s-&gt;store.drop_snapshot(&amp;s-&gt;store);
+			s-&gt;valid = 0;
+			flush = __flush_bhs(pe);
+			up_write(&amp;s-&gt;lock);
+
+			error_buffers(pe-&gt;snapshot_bhs);
+			goto out;
+		}
+
+		/*
+		 * Add a proper exception, and remove the
+		 * in-flight exception from the list.
+		 */
+		down_write(&amp;s-&gt;lock);
+
+		memcpy(e, &amp;pe-&gt;e, sizeof(*e));
+		insert_exception(&amp;s-&gt;complete, e);
+		remove_exception(&amp;pe-&gt;e);
+		flush = __flush_bhs(pe);
+
+		/* Submit any pending write BHs */
+		up_write(&amp;s-&gt;lock);
+
+		flush_buffers(pe-&gt;snapshot_bhs);
+		DMDEBUG("Exception completed successfully.");
+
+		/* Notify any interested parties */
+		if (s-&gt;store.fraction_full) {
+			sector_t numerator, denominator;
+			int pc;
+
+			s-&gt;store.fraction_full(&amp;s-&gt;store, &amp;numerator,
+					       &amp;denominator);
+			pc = numerator * 100 / denominator;
+
+			if (pc &gt;= s-&gt;last_percent + WAKE_UP_PERCENT) {
+				dm_table_event(s-&gt;table);
+				s-&gt;last_percent = pc - pc % WAKE_UP_PERCENT;
+			}
+		}
+
+	} else {
+		/* Read/write error - snapshot is unusable */
+		down_write(&amp;s-&gt;lock);
+		if (s-&gt;valid)
+			DMERR("Error reading/writing snapshot");
+		s-&gt;store.drop_snapshot(&amp;s-&gt;store);
+		s-&gt;valid = 0;
+		remove_exception(&amp;pe-&gt;e);
+		flush = __flush_bhs(pe);
+		up_write(&amp;s-&gt;lock);
+
+		error_buffers(pe-&gt;snapshot_bhs);
+
+		dm_table_event(s-&gt;table);
+		DMDEBUG("Exception failed.");
+	}
+
+ out:
+	if (flush)
+		flush_buffers(flush);
+
+	free_pending_exception(pe);
+}
+
+static void commit_callback(void *context, int success)
+{
+	struct pending_exception *pe = (struct pending_exception *) context;
+	pending_complete(pe, success);
+}
+
+/*
+ * Called when the copy I/O has finished.  kcopyd actually runs
+ * this code so don't block.
+ */
+static void copy_callback(int read_err, unsigned int write_err, void *context)
+{
+	struct pending_exception *pe = (struct pending_exception *) context;
+	struct dm_snapshot *s = pe-&gt;snap;
+
+	if (read_err || write_err)
+		pending_complete(pe, 0);
+
+	else
+		/* Update the metadata if we are persistent */
+		s-&gt;store.commit_exception(&amp;s-&gt;store, &amp;pe-&gt;e, commit_callback,
+					  pe);
+}
+
+/*
+ * Dispatches the copy operation to kcopyd.
+ */
+static inline void start_copy(struct pending_exception *pe)
+{
+	struct dm_snapshot *s = pe-&gt;snap;
+	struct io_region src, dest;
+	kdev_t dev = s-&gt;origin-&gt;dev;
+	int *sizes = blk_size[major(dev)];
+	sector_t dev_size = (sector_t) -1;
+
+	if (pe-&gt;started)
+		return;
+
+	/* this is protected by snap-&gt;lock */
+	pe-&gt;started = 1;
+
+	if (sizes &amp;&amp; sizes[minor(dev)])
+		dev_size = sizes[minor(dev)] &lt;&lt; 1;
+
+	src.dev = dev;
+	src.sector = chunk_to_sector(s, pe-&gt;e.old_chunk);
+	src.count = min(s-&gt;chunk_size, dev_size - src.sector);
+
+	dest.dev = s-&gt;cow-&gt;dev;
+	dest.sector = chunk_to_sector(s, pe-&gt;e.new_chunk);
+	dest.count = src.count;
+
+	/* Hand over to kcopyd */
+	kcopyd_copy(s-&gt;kcopyd_client,
+		    &amp;src, 1, &amp;dest, 0, copy_callback, pe);
+}
+
+/*
+ * Looks to see if this snapshot already has a pending exception
+ * for this chunk, otherwise it allocates a new one and inserts
+ * it into the pending table.
+ */
+static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
+							struct buffer_head *bh)
+{
+	struct exception *e;
+	struct pending_exception *pe;
+	chunk_t chunk = sector_to_chunk(s, bh-&gt;b_rsector);
+
+	/*
+	 * Is there a pending exception for this already ?
+	 */
+	e = lookup_exception(&amp;s-&gt;pending, chunk);
+	if (e) {
+		/* cast the exception to a pending exception */
+		pe = list_entry(e, struct pending_exception, e);
+
+	} else {
+		/* Create a new pending exception */
+		pe = alloc_pending_exception();
+		pe-&gt;e.old_chunk = chunk;
+		pe-&gt;origin_bhs = pe-&gt;snapshot_bhs = NULL;
+		INIT_LIST_HEAD(&amp;pe-&gt;siblings);
+		pe-&gt;snap = s;
+		pe-&gt;started = 0;
+
+		if (s-&gt;store.prepare_exception(&amp;s-&gt;store, &amp;pe-&gt;e)) {
+			free_pending_exception(pe);
+			s-&gt;valid = 0;
+			return NULL;
+		}
+
+		insert_exception(&amp;s-&gt;pending, &amp;pe-&gt;e);
+	}
+
+	return pe;
+}
+
+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
+				   struct buffer_head *bh)
+{
+	bh-&gt;b_rdev = s-&gt;cow-&gt;dev;
+	bh-&gt;b_rsector = chunk_to_sector(s, e-&gt;new_chunk) +
+	    (bh-&gt;b_rsector &amp; s-&gt;chunk_mask);
+}
+
+static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+			union map_info *map_context)
+{
+	struct exception *e;
+	struct dm_snapshot *s = (struct dm_snapshot *) ti-&gt;private;
+	int r = 1;
+	chunk_t chunk;
+	struct pending_exception *pe;
+
+	chunk = sector_to_chunk(s, bh-&gt;b_rsector);
+
+	/* Full snapshots are not usable */
+	if (!s-&gt;valid)
+		return -1;
+
+	/*
+	 * Write to snapshot - higher level takes care of RW/RO
+	 * flags so we should only get this if we are
+	 * writeable.
+	 */
+	if (rw == WRITE) {
+
+		down_write(&amp;s-&gt;lock);
+
+		/* If the block is already remapped - use that, else remap it */
+		e = lookup_exception(&amp;s-&gt;complete, chunk);
+		if (e)
+			remap_exception(s, e, bh);
+
+		else {
+			pe = find_pending_exception(s, bh);
+
+			if (!pe) {
+				s-&gt;store.drop_snapshot(&amp;s-&gt;store);
+				s-&gt;valid = 0;
+				r = -EIO;
+			} else {
+				remap_exception(s, &amp;pe-&gt;e, bh);
+				queue_buffer(&amp;pe-&gt;snapshot_bhs, bh);
+				start_copy(pe);
+				r = 0;
+			}
+		}
+
+		up_write(&amp;s-&gt;lock);
+
+	} else {
+		/*
+		 * FIXME: this read path scares me because we
+		 * always use the origin when we have a pending
+		 * exception.  However I can't think of a
+		 * situation where this is wrong - ejt.
+		 */
+
+		/* Do reads */
+		down_read(&amp;s-&gt;lock);
+
+		/* See if it has been remapped */
+		e = lookup_exception(&amp;s-&gt;complete, chunk);
+		if (e)
+			remap_exception(s, e, bh);
+		else
+			bh-&gt;b_rdev = s-&gt;origin-&gt;dev;
+
+		up_read(&amp;s-&gt;lock);
+	}
+
+	return r;
+}
+
+void snapshot_resume(struct dm_target *ti)
+{
+	struct dm_snapshot *s = (struct dm_snapshot *) ti-&gt;private;
+
+	if (s-&gt;have_metadata)
+		return;
+
+	if (s-&gt;store.read_metadata(&amp;s-&gt;store)) {
+		down_write(&amp;s-&gt;lock);
+		s-&gt;valid = 0;
+		up_write(&amp;s-&gt;lock);
+	}
+
+	s-&gt;have_metadata = 1;
+}
+
+static int snapshot_status(struct dm_target *ti, status_type_t type,
+			   char *result, unsigned int maxlen)
+{
+	struct dm_snapshot *snap = (struct dm_snapshot *) ti-&gt;private;
+	char cow[16];
+	char org[16];
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		if (!snap-&gt;valid)
+			snprintf(result, maxlen, "Invalid");
+		else {
+			if (snap-&gt;store.fraction_full) {
+				sector_t numerator, denominator;
+				snap-&gt;store.fraction_full(&amp;snap-&gt;store,
+							  &amp;numerator,
+							  &amp;denominator);
+				snprintf(result, maxlen,
+					 SECTOR_FORMAT "/" SECTOR_FORMAT,
+					 numerator, denominator);
+			}
+			else
+				snprintf(result, maxlen, "Unknown");
+		}
+		break;
+
+	case STATUSTYPE_TABLE:
+		/*
+		 * kdevname returns a static pointer so we need
+		 * to make private copies if the output is to
+		 * make sense.
+		 */
+		strncpy(cow, dm_kdevname(snap-&gt;cow-&gt;dev), sizeof(cow));
+		strncpy(org, dm_kdevname(snap-&gt;origin-&gt;dev), sizeof(org));
+		snprintf(result, maxlen, "%s %s %c %ld", org, cow,
+			 snap-&gt;type, snap-&gt;chunk_size);
+		break;
+	}
+
+	return 0;
+}
+
+/*-----------------------------------------------------------------
+ * Origin methods
+ *---------------------------------------------------------------*/
+static void list_merge(struct list_head *l1, struct list_head *l2)
+{
+	struct list_head *l1_n, *l2_p;
+
+	l1_n = l1-&gt;next;
+	l2_p = l2-&gt;prev;
+
+	l1-&gt;next = l2;
+	l2-&gt;prev = l1;
+
+	l2_p-&gt;next = l1_n;
+	l1_n-&gt;prev = l2_p;
+}
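+
+/*
+ * list_merge() splices the ring containing l2 into the ring containing
+ * l1, directly after l1: merging the ring (l1, a) with the ring
+ * (l2, b) yields l1 -&gt; l2 -&gt; b -&gt; a -&gt; l1.  It is used below to chain
+ * the pending exceptions of every snapshot covering one origin write.
+ */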
+
+static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
+{
+	int r = 1, first = 1;
+	struct list_head *sl;
+	struct dm_snapshot *snap;
+	struct exception *e;
+	struct pending_exception *pe, *last = NULL;
+	chunk_t chunk;
+
+	/* Do all the snapshots on this origin */
+	list_for_each(sl, snapshots) {
+		snap = list_entry(sl, struct dm_snapshot, list);
+
+		/* Only deal with valid snapshots */
+		if (!snap-&gt;valid)
+			continue;
+
+		down_write(&amp;snap-&gt;lock);
+
+		/*
+		 * Remember, different snapshots can have
+		 * different chunk sizes.
+		 */
+		chunk = sector_to_chunk(snap, bh-&gt;b_rsector);
+
+		/*
+		 * Check exception table to see if block
+		 * is already remapped in this snapshot
+		 * and trigger an exception if not.
+		 */
+		e = lookup_exception(&amp;snap-&gt;complete, chunk);
+		if (!e) {
+			pe = find_pending_exception(snap, bh);
+			if (!pe) {
+				snap-&gt;store.drop_snapshot(&amp;snap-&gt;store);
+				snap-&gt;valid = 0;
+
+			} else {
+				if (last)
+					list_merge(&amp;pe-&gt;siblings,
+						   &amp;last-&gt;siblings);
+
+				last = pe;
+				r = 0;
+			}
+		}
+
+		up_write(&amp;snap-&gt;lock);
+	}
+
+	/*
+	 * Now that we have a complete pe list we can start the copying.
+	 */
+	if (last) {
+		pe = last;
+		do {
+			down_write(&amp;pe-&gt;snap-&gt;lock);
+			if (first)
+				queue_buffer(&amp;pe-&gt;origin_bhs, bh);
+			start_copy(pe);
+			up_write(&amp;pe-&gt;snap-&gt;lock);
+			first = 0;
+			pe = list_entry(pe-&gt;siblings.next,
+					struct pending_exception, siblings);
+
+		} while (pe != last);
+	}
+
+	return r;
+}
+
+/*
+ * Called on a write from the origin driver.
+ */
+int do_origin(struct dm_dev *origin, struct buffer_head *bh)
+{
+	struct origin *o;
+	int r;
+
+	down_read(&amp;_origins_lock);
+	o = __lookup_origin(origin-&gt;dev);
+	if (!o)
+		BUG();
+
+	r = __origin_write(&amp;o-&gt;snapshots, bh);
+	up_read(&amp;_origins_lock);
+
+	return r;
+}
+
+/*
+ * Origin: maps a linear range of a device, with hooks for snapshotting.
+ */
+
+/*
+ * Construct an origin mapping: &lt;dev_path&gt;
+ * The context for an origin is merely a 'struct dm_dev *'
+ * pointing to the real device.
+ */
+static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	int r;
+	struct dm_dev *dev;
+
+	if (argc != 1) {
+		ti-&gt;error = "dm-origin: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	r = dm_get_device(ti, argv[0], 0, ti-&gt;len,
+			  dm_table_get_mode(ti-&gt;table), &amp;dev);
+	if (r) {
+		ti-&gt;error = "Cannot get target device";
+		return r;
+	}
+
+	ti-&gt;private = dev;
+	return 0;
+}
+
+static void origin_dtr(struct dm_target *ti)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti-&gt;private;
+	dm_put_device(ti, dev);
+}
+
+static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+		      union map_info *map_context)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti-&gt;private;
+	bh-&gt;b_rdev = dev-&gt;dev;
+
+	/* Only tell snapshots if this is a write */
+	return (rw == WRITE) ? do_origin(dev, bh) : 1;
+}
+
+static int origin_status(struct dm_target *ti, status_type_t type, char *result,
+			 unsigned int maxlen)
+{
+	struct dm_dev *dev = (struct dm_dev *) ti-&gt;private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		snprintf(result, maxlen, "%s", dm_kdevname(dev-&gt;dev));
+		break;
+	}
+
+	return 0;
+}
+
+static struct target_type origin_target = {
+	name:	"snapshot-origin",
+	version: {1, 0, 1},
+	module:	THIS_MODULE,
+	ctr:	origin_ctr,
+	dtr:	origin_dtr,
+	map:	origin_map,
+	status:	origin_status,
+};
+
+static struct target_type snapshot_target = {
+	name:	"snapshot",
+	version: {1, 0, 1},
+	module:	THIS_MODULE,
+	ctr:	snapshot_ctr,
+	dtr:	snapshot_dtr,
+	map:	snapshot_map,
+	resume: snapshot_resume,
+	status:	snapshot_status,
+};
+
+int __init dm_snapshot_init(void)
+{
+	int r;
+
+	r = dm_register_target(&amp;snapshot_target);
+	if (r) {
+		DMERR("snapshot target register failed %d", r);
+		return r;
+	}
+
+	r = dm_register_target(&amp;origin_target);
+	if (r &lt; 0) {
+		DMERR("Device mapper: Origin: register failed %d\n", r);
+		goto bad1;
+	}
+
+	r = init_origin_hash();
+	if (r) {
+		DMERR("init_origin_hash failed.");
+		goto bad2;
+	}
+
+	exception_cache = kmem_cache_create("dm-snapshot-ex",
+					    sizeof(struct exception),
+					    __alignof__(struct exception),
+					    0, NULL, NULL);
+	if (!exception_cache) {
+		DMERR("Couldn't create exception cache.");
+		r = -ENOMEM;
+		goto bad3;
+	}
+
+	pending_cache =
+	    kmem_cache_create("dm-snapshot-in",
+			      sizeof(struct pending_exception),
+			      __alignof__(struct pending_exception),
+			      0, NULL, NULL);
+	if (!pending_cache) {
+		DMERR("Couldn't create pending cache.");
+		r = -ENOMEM;
+		goto bad4;
+	}
+
+	pending_pool = mempool_create(128, mempool_alloc_slab,
+				      mempool_free_slab, pending_cache);
+	if (!pending_pool) {
+		DMERR("Couldn't create pending pool.");
+		r = -ENOMEM;
+		goto bad5;
+	}
+
+	return 0;
+
+      bad5:
+	kmem_cache_destroy(pending_cache);
+      bad4:
+	kmem_cache_destroy(exception_cache);
+      bad3:
+	exit_origin_hash();
+      bad2:
+	dm_unregister_target(&amp;origin_target);
+      bad1:
+	dm_unregister_target(&amp;snapshot_target);
+	return r;
+}
+
+void dm_snapshot_exit(void)
+{
+	int r;
+
+	r = dm_unregister_target(&amp;snapshot_target);
+	if (r)
+		DMERR("snapshot unregister failed %d", r);
+
+	r = dm_unregister_target(&amp;origin_target);
+	if (r)
+		DMERR("origin unregister failed %d", r);
+
+	exit_origin_hash();
+	mempool_destroy(pending_pool);
+	kmem_cache_destroy(pending_cache);
+	kmem_cache_destroy(exception_cache);
+}
--- linux-2.4.26-rc1/drivers/md/dm-snapshot.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-snapshot.h	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,158 @@
+/*
+ * dm-snapshot.h
+ *
+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_SNAPSHOT_H
+#define DM_SNAPSHOT_H
+
+#include "dm.h"
+#include &lt;linux/blkdev.h&gt;
+
+struct exception_table {
+	uint32_t hash_mask;
+	struct list_head *table;
+};
+
+/*
+ * The snapshot code deals with largish chunks of the disk at a
+ * time. Typically 64k - 256k.
+ */
+/* FIXME: can we get away with limiting these to a uint32_t ? */
+typedef sector_t chunk_t;
+
+/*
+ * An exception is used where an old chunk of data has been
+ * replaced by a new one.
+ */
+struct exception {
+	struct list_head hash_list;
+
+	chunk_t old_chunk;
+	chunk_t new_chunk;
+};
+
+/*
+ * Abstraction to handle the meta/layout of exception stores (the
+ * COW device).
+ */
+struct exception_store {
+
+	/*
+	 * Destroys this object when you've finished with it.
+	 */
+	void (*destroy) (struct exception_store *store);
+
+	/*
+	 * The target shouldn't read the COW device until this is
+	 * called.
+	 */
+	int (*read_metadata) (struct exception_store *store);
+
+	/*
+	 * Find somewhere to store the next exception.
+	 */
+	int (*prepare_exception) (struct exception_store *store,
+				  struct exception *e);
+
+	/*
+	 * Update the metadata with this exception.
+	 */
+	void (*commit_exception) (struct exception_store *store,
+				  struct exception *e,
+				  void (*callback) (void *, int success),
+				  void *callback_context);
+
+	/*
+	 * The snapshot is invalid, note this in the metadata.
+	 */
+	void (*drop_snapshot) (struct exception_store *store);
+
+	/*
+	 * Return how full the snapshot is.
+	 */
+	void (*fraction_full) (struct exception_store *store,
+			       sector_t *numerator,
+			       sector_t *denominator);
+
+	struct dm_snapshot *snap;
+	void *context;
+};
+
+struct dm_snapshot {
+	struct rw_semaphore lock;
+	struct dm_table *table;
+
+	struct dm_dev *origin;
+	struct dm_dev *cow;
+
+	/* List of snapshots per Origin */
+	struct list_head list;
+
+	/* Size of data blocks saved - must be a power of 2 */
+	chunk_t chunk_size;
+	chunk_t chunk_mask;
+	chunk_t chunk_shift;
+
+	/* You can't use a snapshot if this is 0 (e.g. if full) */
+	int valid;
+	int have_metadata;
+
+	/* Used for display of table */
+	char type;
+
+	/* The last percentage we notified */
+	int last_percent;
+
+	struct exception_table pending;
+	struct exception_table complete;
+
+	/* The on disk metadata handler */
+	struct exception_store store;
+
+	struct kcopyd_client *kcopyd_client;
+};
+
+/*
+ * Used by the exception stores to load exceptions when
+ * initialising.
+ */
+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
+
+/*
+ * Constructor and destructor for the default persistent
+ * store.
+ */
+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
+
+int dm_create_transient(struct exception_store *store,
+			struct dm_snapshot *s, int blocksize);
+
+/*
+ * Return the number of sectors in the device.
+ */
+static inline sector_t get_dev_size(kdev_t dev)
+{
+	int *sizes;
+
+	sizes = blk_size[MAJOR(dev)];
+	if (sizes)
+		return sizes[MINOR(dev)] &lt;&lt; 1;
+
+	return 0;
+}
+
+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
+{
+	return (sector &amp; ~s-&gt;chunk_mask) &gt;&gt; s-&gt;chunk_shift;
+}
+
+static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
+{
+	return chunk &lt;&lt; s-&gt;chunk_shift;
+}
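+
+/*
+ * e.g. with an 8K chunk (chunk_size = 16 sectors), chunk_mask is 15
+ * and chunk_shift is 4: sector_to_chunk() maps sector 37 to chunk 2
+ * ((37 &amp; ~15) &gt;&gt; 4) and chunk_to_sector() maps chunk 2 back to its
+ * first sector, 32.
+ */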
+
+#endif
--- linux-2.4.26-rc1/drivers/md/dm-stripe.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-stripe.c	Wed Mar 31 13:49:22 2004
@@ -0,0 +1,259 @@
+/*
+ * Copyright (C) 2001-2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/blkdev.h&gt;
+#include &lt;linux/slab.h&gt;
+
+struct stripe {
+	struct dm_dev *dev;
+	sector_t physical_start;
+};
+
+struct stripe_c {
+	uint32_t stripes;
+
+	/* The size of this target / num. stripes */
+	uint32_t stripe_width;
+
+	/* stripe chunk size */
+	uint32_t chunk_shift;
+	sector_t chunk_mask;
+
+	struct stripe stripe[0];
+};
+
+static inline struct stripe_c *alloc_context(unsigned int stripes)
+{
+	size_t len;
+
+	if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
+			  stripes))
+		return NULL;
+
+	len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
+
+	return kmalloc(len, GFP_KERNEL);
+}
+
+/*
+ * Parse a single &lt;dev&gt; &lt;sector&gt; pair
+ */
+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
+		      unsigned int stripe, char **argv)
+{
+	sector_t start;
+
+	if (sscanf(argv[1], SECTOR_FORMAT, &amp;start) != 1)
+		return -EINVAL;
+
+	if (dm_get_device(ti, argv[0], start, sc-&gt;stripe_width,
+			  dm_table_get_mode(ti-&gt;table),
+			  &amp;sc-&gt;stripe[stripe].dev))
+		return -ENXIO;
+
+	sc-&gt;stripe[stripe].physical_start = start;
+	return 0;
+}
+
+/*
+ * FIXME: Nasty function, only present because we can't link
+ * against __moddi3 and __divdi3.
+ *
+ * returns a == b * n
+ */
+static int multiple(sector_t a, sector_t b, sector_t *n)
+{
+	sector_t acc, prev, i;
+
+	*n = 0;
+	while (a &gt;= b) {
+		for (acc = b, prev = 0, i = 1;
+		     acc &lt;= a;
+		     prev = acc, acc &lt;&lt;= 1, i &lt;&lt;= 1)
+			;
+
+		a -= prev;
+		*n += i &gt;&gt; 1;
+	}
+
+	return a == 0;
+}
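+
+/*
+ * e.g. multiple(96, 8, &amp;n) sets n to 12 and returns 1, whereas
+ * multiple(100, 8, &amp;n) also sets n to 12 but returns 0 because 4
+ * sectors are left over.
+ */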
+
+/*
+ * Construct a striped mapping.
+ * &lt;number of stripes&gt; &lt;chunk size (2^^n)&gt; [&lt;dev_path&gt; &lt;offset&gt;]+
+ */
+static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct stripe_c *sc;
+	sector_t width;
+	uint32_t stripes;
+	uint32_t chunk_size;
+	char *end;
+	int r;
+	unsigned int i;
+
+	if (argc &lt; 2) {
+		ti-&gt;error = "dm-stripe: Not enough arguments";
+		return -EINVAL;
+	}
+
+	stripes = simple_strtoul(argv[0], &amp;end, 10);
+	if (*end) {
+		ti-&gt;error = "dm-stripe: Invalid stripe count";
+		return -EINVAL;
+	}
+
+	chunk_size = simple_strtoul(argv[1], &amp;end, 10);
+	if (*end) {
+		ti-&gt;error = "dm-stripe: Invalid chunk_size";
+		return -EINVAL;
+	}
+
+	/*
+	 * chunk_size is a power of two
+	 */
+	if (!chunk_size || (chunk_size &amp; (chunk_size - 1))) {
+		ti-&gt;error = "dm-stripe: Invalid chunk size";
+		return -EINVAL;
+	}
+
+	if (!multiple(ti-&gt;len, stripes, &amp;width)) {
+		ti-&gt;error = "dm-stripe: Target length not divisible by "
+		    "number of stripes";
+		return -EINVAL;
+	}
+
+	/*
+	 * Do we have enough arguments for that many stripes ?
+	 */
+	if (argc != (2 + 2 * stripes)) {
+		ti-&gt;error = "dm-stripe: Not enough destinations specified";
+		return -EINVAL;
+	}
+
+	sc = alloc_context(stripes);
+	if (!sc) {
+		ti-&gt;error = "dm-stripe: Memory allocation for striped context "
+		    "failed";
+		return -ENOMEM;
+	}
+
+	sc-&gt;stripes = stripes;
+	sc-&gt;stripe_width = width;
+
+	sc-&gt;chunk_mask = ((sector_t) chunk_size) - 1;
+	for (sc-&gt;chunk_shift = 0; chunk_size; sc-&gt;chunk_shift++)
+		chunk_size &gt;&gt;= 1;
+	sc-&gt;chunk_shift--;
+
+	/*
+	 * Get the stripe destinations.
+	 */
+	for (i = 0; i &lt; stripes; i++) {
+		argv += 2;
+
+		r = get_stripe(ti, sc, i, argv);
+		if (r &lt; 0) {
+			ti-&gt;error = "dm-stripe: Couldn't parse stripe "
+			    "destination";
+			while (i--)
+				dm_put_device(ti, sc-&gt;stripe[i].dev);
+			kfree(sc);
+			return r;
+		}
+	}
+
+	ti-&gt;private = sc;
+	return 0;
+}
+
+static void stripe_dtr(struct dm_target *ti)
+{
+	unsigned int i;
+	struct stripe_c *sc = (struct stripe_c *) ti-&gt;private;
+
+	for (i = 0; i &lt; sc-&gt;stripes; i++)
+		dm_put_device(ti, sc-&gt;stripe[i].dev);
+
+	kfree(sc);
+}
+
+static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+		      union map_info *context)
+{
+	struct stripe_c *sc = (struct stripe_c *) ti-&gt;private;
+
+	sector_t offset = bh-&gt;b_rsector - ti-&gt;begin;
+	uint32_t chunk = (uint32_t) (offset &gt;&gt; sc-&gt;chunk_shift);
+	uint32_t stripe = chunk % sc-&gt;stripes;	/* 32bit modulus */
+	chunk = chunk / sc-&gt;stripes;
+
+	bh-&gt;b_rdev = sc-&gt;stripe[stripe].dev-&gt;dev;
+	bh-&gt;b_rsector = sc-&gt;stripe[stripe].physical_start +
+	    (chunk &lt;&lt; sc-&gt;chunk_shift) + (offset &amp; sc-&gt;chunk_mask);
+	return 1;
+}
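+
+/*
+ * Mapping example for stripe_map() above: with 4 stripes and an
+ * 8-sector chunk (chunk_shift = 3, chunk_mask = 7), a bh at relative
+ * sector 35 sits in chunk 4, which belongs to stripe 0 (4 % 4) as its
+ * local chunk 1, so it is sent to that stripe's device at
+ * physical_start + 8 + 3.
+ */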
+
+static int stripe_status(struct dm_target *ti, status_type_t type,
+			 char *result, unsigned int maxlen)
+{
+	struct stripe_c *sc = (struct stripe_c *) ti-&gt;private;
+	int offset;
+	unsigned int i;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
+				  sc-&gt;stripes, sc-&gt;chunk_mask + 1);
+		for (i = 0; i &lt; sc-&gt;stripes; i++) {
+			offset +=
+			    snprintf(result + offset, maxlen - offset,
+				     " %s " SECTOR_FORMAT,
+		       dm_kdevname(to_kdev_t(sc-&gt;stripe[i].dev-&gt;bdev-&gt;bd_dev)),
+				     sc-&gt;stripe[i].physical_start);
+		}
+		break;
+	}
+	return 0;
+}
+
+static struct target_type stripe_target = {
+	.name   = "striped",
+	.version = {1, 0, 1},
+	.module = THIS_MODULE,
+	.ctr    = stripe_ctr,
+	.dtr    = stripe_dtr,
+	.map    = stripe_map,
+	.status = stripe_status,
+};
+
+int __init dm_stripe_init(void)
+{
+	int r;
+
+	r = dm_register_target(&amp;stripe_target);
+	if (r &lt; 0)
+		DMWARN("striped target registration failed");
+
+	return r;
+}
+
+void dm_stripe_exit(void)
+{
+	if (dm_unregister_target(&amp;stripe_target))
+		DMWARN("striped target unregistration failed");
+
+	return;
+}
--- linux-2.4.26-rc1/drivers/md/dm-table.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-table.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,679 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+#include &lt;linux/blkdev.h&gt;
+#include &lt;linux/ctype.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;asm/atomic.h&gt;
+
+#define MAX_DEPTH 16
+#define NODE_SIZE L1_CACHE_BYTES
+#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
+#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
+
+struct dm_table {
+	atomic_t holders;
+
+	/* btree table */
+	unsigned int depth;
+	unsigned int counts[MAX_DEPTH];	/* in nodes */
+	sector_t *index[MAX_DEPTH];
+
+	unsigned int num_targets;
+	unsigned int num_allocated;
+	sector_t *highs;
+	struct dm_target *targets;
+
+	/*
+	 * Indicates the rw permissions for the new logical
+	 * device.  This should be a combination of FMODE_READ
+	 * and FMODE_WRITE.
+	 */
+	int mode;
+
+	/* a list of devices used by this table */
+	struct list_head devices;
+
+	/* events get handed up using this callback */
+	void (*event_fn)(void *);
+	void *event_context;
+};
+
+/*
+ * Similar to ceiling(log_base(n))
+ */
+static unsigned int int_log(unsigned long n, unsigned long base)
+{
+	int result = 0;
+
+	while (n &gt; 1) {
+		n = dm_div_up(n, base);
+		result++;
+	}
+
+	return result;
+}
+
+/*
+ * Calculate the index of the k'th child of the n'th node.
+ */
+static inline unsigned int get_child(unsigned int n, unsigned int k)
+{
+	return (n * CHILDREN_PER_NODE) + k;
+}
+
+/*
+ * Return the n'th node of level l from table t.
+ */
+static inline sector_t *get_node(struct dm_table *t, unsigned int l,
+				 unsigned int n)
+{
+	return t-&gt;index[l] + (n * KEYS_PER_NODE);
+}
+
+/*
+ * Return the highest key that you could lookup from the n'th
+ * node on level l of the btree.
+ */
+static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
+{
+	for (; l &lt; t-&gt;depth - 1; l++)
+		n = get_child(n, CHILDREN_PER_NODE - 1);
+
+	if (n &gt;= t-&gt;counts[l])
+		return (sector_t) - 1;
+
+	return get_node(t, l, n)[KEYS_PER_NODE - 1];
+}
+
+/*
+ * Fills in a level of the btree based on the highs of the level
+ * below it.
+ */
+static int setup_btree_index(unsigned int l, struct dm_table *t)
+{
+	unsigned int n, k;
+	sector_t *node;
+
+	for (n = 0U; n &lt; t-&gt;counts[l]; n++) {
+		node = get_node(t, l, n);
+
+		for (k = 0U; k &lt; KEYS_PER_NODE; k++)
+			node[k] = high(t, l + 1, get_child(n, k));
+	}
+
+	return 0;
+}
+
+
+
+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
+{
+	struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
+
+	if (!t)
+		return -ENOMEM;
+
+	memset(t, 0, sizeof(*t));
+	INIT_LIST_HEAD(&amp;t-&gt;devices);
+	atomic_set(&amp;t-&gt;holders, 1);
+
+	num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
+
+	/* Allocate both the target array and offset array at once. */
+	t-&gt;highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
+					sizeof(sector_t), num_targets);
+	if (!t-&gt;highs) {
+		kfree(t);
+		return -ENOMEM;
+	}
+
+	memset(t-&gt;highs, -1, sizeof(*t-&gt;highs) * num_targets);
+
+	t-&gt;targets = (struct dm_target *) (t-&gt;highs + num_targets);
+	t-&gt;num_allocated = num_targets;
+	t-&gt;mode = mode;
+	*result = t;
+	return 0;
+}
+
+static void free_devices(struct list_head *devices)
+{
+	struct list_head *tmp, *next;
+
+	for (tmp = devices-&gt;next; tmp != devices; tmp = next) {
+		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+		next = tmp-&gt;next;
+		kfree(dd);
+	}
+}
+
+void table_destroy(struct dm_table *t)
+{
+	unsigned int i;
+
+	/* free the indexes (see dm_table_complete) */
+	if (t-&gt;depth &gt;= 2)
+		vfree(t-&gt;index[t-&gt;depth - 2]);
+
+	/* free the targets */
+	for (i = 0; i &lt; t-&gt;num_targets; i++) {
+		struct dm_target *tgt = t-&gt;targets + i;
+
+		if (tgt-&gt;type-&gt;dtr)
+			tgt-&gt;type-&gt;dtr(tgt);
+
+		dm_put_target_type(tgt-&gt;type);
+	}
+
+	vfree(t-&gt;highs);
+
+	/* free the device list */
+	if (t-&gt;devices.next != &amp;t-&gt;devices) {
+		DMWARN("devices still present during destroy: "
+		       "dm_table_remove_device calls missing");
+
+		free_devices(&amp;t-&gt;devices);
+	}
+
+	kfree(t);
+}
+
+void dm_table_get(struct dm_table *t)
+{
+	atomic_inc(&amp;t-&gt;holders);
+}
+
+void dm_table_put(struct dm_table *t)
+{
+	if (atomic_dec_and_test(&amp;t-&gt;holders))
+		table_destroy(t);
+}
+
+/*
+ * Convert a device path to a kdev_t.
+ */
+static int lookup_device(const char *path, kdev_t *dev)
+{
+	int r;
+	struct nameidata nd;
+	struct inode *inode;
+
+	if (!path_init(path, LOOKUP_FOLLOW, &amp;nd))
+		return 0;
+
+	if ((r = path_walk(path, &amp;nd)))
+		goto out;
+
+	inode = nd.dentry-&gt;d_inode;
+	if (!inode) {
+		r = -ENOENT;
+		goto out;
+	}
+
+	if (!S_ISBLK(inode-&gt;i_mode)) {
+		r = -ENOTBLK;
+		goto out;
+	}
+
+	*dev = inode-&gt;i_rdev;
+
+      out:
+	path_release(&amp;nd);
+	return r;
+}
+
+/*
+ * See if we've already got a device in the list.
+ */
+static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
+{
+	struct list_head *tmp;
+
+	list_for_each(tmp, l) {
+		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+		if (kdev_same(dd-&gt;dev, dev))
+			return dd;
+	}
+
+	return NULL;
+}
+
+/*
+ * Open a device so we can use it as a map destination.
+ */
+static int open_dev(struct dm_dev *dd)
+{
+	if (dd-&gt;bdev)
+		BUG();
+
+	dd-&gt;bdev = bdget(kdev_t_to_nr(dd-&gt;dev));
+	if (!dd-&gt;bdev)
+		return -ENOMEM;
+
+	return blkdev_get(dd-&gt;bdev, dd-&gt;mode, 0, BDEV_RAW);
+}
+
+/*
+ * Close a device that we've been using.
+ */
+static void close_dev(struct dm_dev *dd)
+{
+	if (!dd-&gt;bdev)
+		return;
+
+	blkdev_put(dd-&gt;bdev, BDEV_RAW);
+	dd-&gt;bdev = NULL;
+}
+
+/*
+ * If possible (ie. blk_size[major] is set), this checks that an area
+ * of a destination device is valid.
+ */
+static int check_device_area(kdev_t dev, sector_t start, sector_t len)
+{
+	int *sizes;
+	sector_t dev_size;
+
+	if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
+		/* we don't know the device details,
+		 * so give the benefit of the doubt */
+		return 1;
+
+	/* convert to 512-byte sectors */
+	dev_size &lt;&lt;= 1;
+
+	return ((start &lt; dev_size) &amp;&amp; (len &lt;= (dev_size - start)));
+}
+
+/*
+ * This upgrades the mode on an already open dm_dev.  Being
+ * careful to leave things as they were if we fail to reopen the
+ * device.
+ */
+static int upgrade_mode(struct dm_dev *dd, int new_mode)
+{
+	int r;
+	struct dm_dev dd_copy;
+
+	memcpy(&amp;dd_copy, dd, sizeof(dd_copy));
+
+	dd-&gt;mode |= new_mode;
+	dd-&gt;bdev = NULL;
+	r = open_dev(dd);
+	if (!r)
+		close_dev(&amp;dd_copy);
+	else
+		memcpy(dd, &amp;dd_copy, sizeof(dd_copy));
+
+	return r;
+}
+
+/*
+ * Add a device to the list, or just increment the usage count if
+ * it's already present.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+		  sector_t len, int mode, struct dm_dev **result)
+{
+	int r;
+	kdev_t dev;
+	struct dm_dev *dd;
+	unsigned major, minor;
+	struct dm_table *t = ti-&gt;table;
+
+	if (!t)
+		BUG();
+
+	if (sscanf(path, "%u:%u", &amp;major, &amp;minor) == 2) {
+		/* Extract the major/minor numbers */
+		dev = mk_kdev(major, minor);
+	} else {
+		/* convert the path to a device */
+		if ((r = lookup_device(path, &amp;dev)))
+			return r;
+	}
+
+	dd = find_device(&amp;t-&gt;devices, dev);
+	if (!dd) {
+		dd = kmalloc(sizeof(*dd), GFP_KERNEL);
+		if (!dd)
+			return -ENOMEM;
+
+		dd-&gt;dev = dev;
+		dd-&gt;mode = mode;
+		dd-&gt;bdev = NULL;
+
+		if ((r = open_dev(dd))) {
+			kfree(dd);
+			return r;
+		}
+
+		atomic_set(&amp;dd-&gt;count, 0);
+		list_add(&amp;dd-&gt;list, &amp;t-&gt;devices);
+
+	} else if (dd-&gt;mode != (mode | dd-&gt;mode)) {
+		r = upgrade_mode(dd, mode);
+		if (r)
+			return r;
+	}
+	atomic_inc(&amp;dd-&gt;count);
+
+	if (!check_device_area(dd-&gt;dev, start, len)) {
+		DMWARN("device %s too small for target", path);
+		dm_put_device(ti, dd);
+		return -EINVAL;
+	}
+
+	*result = dd;
+
+	return 0;
+}
+
+/*
+ * Decrement a device's use count and remove it if necessary.
+ */
+void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
+{
+	if (atomic_dec_and_test(&amp;dd-&gt;count)) {
+		close_dev(dd);
+		list_del(&amp;dd-&gt;list);
+		kfree(dd);
+	}
+}
+
+/*
+ * Checks to see if the target joins onto the end of the table.
+ */
+static int adjoin(struct dm_table *table, struct dm_target *ti)
+{
+	struct dm_target *prev;
+
+	if (!table-&gt;num_targets)
+		return !ti-&gt;begin;
+
+	prev = &amp;table-&gt;targets[table-&gt;num_targets - 1];
+	return (ti-&gt;begin == (prev-&gt;begin + prev-&gt;len));
+}
+
+/*
+ * Used to dynamically allocate the arg array.
+ */
+static char **realloc_argv(unsigned *array_size, char **old_argv)
+{
+	char **argv;
+	unsigned new_size;
+
+	new_size = *array_size ? *array_size * 2 : 64;
+	argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
+	if (argv) {
+		memcpy(argv, old_argv, *array_size * sizeof(*argv));
+		*array_size = new_size;
+	}
+
+	kfree(old_argv);
+	return argv;
+}
+
+/*
+ * Destructively splits up the argument list to pass to ctr.
+ */
+static int split_args(int *argc, char ***argvp, char *input)
+{
+	char *start, *end = input, *out, **argv = NULL;
+	unsigned array_size = 0;
+
+	*argc = 0;
+	argv = realloc_argv(&amp;array_size, argv);
+	if (!argv)
+		return -ENOMEM;
+
+	while (1) {
+		start = end;
+
+		/* Skip whitespace */
+		while (*start &amp;&amp; isspace(*start))
+			start++;
+
+		if (!*start)
+			break;	/* success, we hit the end */
+
+		/* 'out' is used to remove the escaping backslashes */
+		end = out = start;
+		while (*end) {
+			/* Everything apart from '\0' can be quoted */
+			if (*end == '\\' &amp;&amp; *(end + 1)) {
+				*out++ = *(end + 1);
+				end += 2;
+				continue;
+			}
+
+			if (isspace(*end))
+				break;	/* end of token */
+
+			*out++ = *end++;
+		}
+
+		/* have we already filled the array ? */
+		if ((*argc + 1) &gt; array_size) {
+			argv = realloc_argv(&amp;array_size, argv);
+			if (!argv)
+				return -ENOMEM;
+		}
+
+		/* we know this is whitespace */
+		if (*end)
+			end++;
+
+		/* terminate the string and put it in the array */
+		*out = '\0';
+		argv[*argc] = start;
+		(*argc)++;
+	}
+
+	*argvp = argv;
+	return 0;
+}
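+
+/*
+ * e.g. the parameter string "/dev/hda1 0" is split into
+ * argv[0] = "/dev/hda1", argv[1] = "0" with *argc = 2.  A backslash
+ * escapes the character that follows it, so "a\ b" becomes the single
+ * argument "a b".
+ */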
+
+int dm_table_add_target(struct dm_table *t, const char *type,
+			sector_t start, sector_t len, char *params)
+{
+	int r = -EINVAL, argc;
+	char **argv;
+	struct dm_target *tgt;
+
+	if (t-&gt;num_targets &gt;= t-&gt;num_allocated)
+		return -ENOMEM;
+
+	tgt = t-&gt;targets + t-&gt;num_targets;
+	memset(tgt, 0, sizeof(*tgt));
+
+	tgt-&gt;type = dm_get_target_type(type);
+	if (!tgt-&gt;type) {
+		tgt-&gt;error = "unknown target type";
+		return -EINVAL;
+	}
+
+	tgt-&gt;table = t;
+	tgt-&gt;begin = start;
+	tgt-&gt;len = len;
+	tgt-&gt;error = "Unknown error";
+
+	/*
+	 * Does this target adjoin the previous one ?
+	 */
+	if (!adjoin(t, tgt)) {
+		tgt-&gt;error = "Gap in table";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	r = split_args(&amp;argc, &amp;argv, params);
+	if (r) {
+		tgt-&gt;error = "couldn't split parameters (insufficient memory)";
+		goto bad;
+	}
+
+	r = tgt-&gt;type-&gt;ctr(tgt, argc, argv);
+	kfree(argv);
+	if (r)
+		goto bad;
+
+	t-&gt;highs[t-&gt;num_targets++] = tgt-&gt;begin + tgt-&gt;len - 1;
+	return 0;
+
+      bad:
+	printk(KERN_ERR DM_NAME ": %s\n", tgt-&gt;error);
+	dm_put_target_type(tgt-&gt;type);
+	return r;
+}
+
+static int setup_indexes(struct dm_table *t)
+{
+	int i;
+	unsigned int total = 0;
+	sector_t *indexes;
+
+	/* allocate the space for *all* the indexes */
+	for (i = t-&gt;depth - 2; i &gt;= 0; i--) {
+		t-&gt;counts[i] = dm_div_up(t-&gt;counts[i + 1], CHILDREN_PER_NODE);
+		total += t-&gt;counts[i];
+	}
+
+	indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
+	if (!indexes)
+		return -ENOMEM;
+
+	/* set up internal nodes, bottom-up */
+	for (i = t-&gt;depth - 2, total = 0; i &gt;= 0; i--) {
+		t-&gt;index[i] = indexes;
+		indexes += (KEYS_PER_NODE * t-&gt;counts[i]);
+		setup_btree_index(i, t);
+	}
+
+	return 0;
+}
+
+/*
+ * Builds the btree to index the map.
+ */
+int dm_table_complete(struct dm_table *t)
+{
+	int r = 0;
+	unsigned int leaf_nodes;
+
+	/* how many indexes will the btree have ? */
+	leaf_nodes = dm_div_up(t-&gt;num_targets, KEYS_PER_NODE);
+	t-&gt;depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
+
+	/* leaf layer has already been set up */
+	t-&gt;counts[t-&gt;depth - 1] = leaf_nodes;
+	t-&gt;index[t-&gt;depth - 1] = t-&gt;highs;
+
+	if (t-&gt;depth &gt;= 2)
+		r = setup_indexes(t);
+
+	return r;
+}
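+
+/*
+ * Sizing example (assuming a 64-byte cache line and an 8-byte
+ * sector_t, so KEYS_PER_NODE = 8 and CHILDREN_PER_NODE = 9): a table
+ * with 100 targets has dm_div_up(100, 8) = 13 leaf nodes and
+ * depth = 1 + int_log(13, 9) = 3, i.e. one root node, two internal
+ * nodes and the 13 leaves.
+ */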
+
+static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
+void dm_table_event_callback(struct dm_table *t,
+			     void (*fn)(void *), void *context)
+{
+	spin_lock_irq(&amp;_event_lock);
+	t-&gt;event_fn = fn;
+	t-&gt;event_context = context;
+	spin_unlock_irq(&amp;_event_lock);
+}
+
+void dm_table_event(struct dm_table *t)
+{
+	spin_lock(&amp;_event_lock);
+	if (t-&gt;event_fn)
+		t-&gt;event_fn(t-&gt;event_context);
+	spin_unlock(&amp;_event_lock);
+}
+
+sector_t dm_table_get_size(struct dm_table *t)
+{
+	return t-&gt;num_targets ? (t-&gt;highs[t-&gt;num_targets - 1] + 1) : 0;
+}
+
+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
+{
+	if (index &gt;= t-&gt;num_targets)
+		return NULL;
+
+	return t-&gt;targets + index;
+}
+
+/*
+ * Search the btree for the correct target.
+ */
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
+{
+	unsigned int l, n = 0, k = 0;
+	sector_t *node;
+
+	for (l = 0; l &lt; t-&gt;depth; l++) {
+		n = get_child(n, k);
+		node = get_node(t, l, n);
+
+		for (k = 0; k &lt; KEYS_PER_NODE; k++)
+			if (node[k] &gt;= sector)
+				break;
+	}
+
+	return &amp;t-&gt;targets[(KEYS_PER_NODE * n) + k];
+}
+
+unsigned int dm_table_get_num_targets(struct dm_table *t)
+{
+	return t-&gt;num_targets;
+}
+
+struct list_head *dm_table_get_devices(struct dm_table *t)
+{
+	return &amp;t-&gt;devices;
+}
+
+int dm_table_get_mode(struct dm_table *t)
+{
+	return t-&gt;mode;
+}
+
+void dm_table_suspend_targets(struct dm_table *t)
+{
+	int i;
+
+	for (i = 0; i &lt; t-&gt;num_targets; i++) {
+		struct dm_target *ti = t-&gt;targets + i;
+
+		if (ti-&gt;type-&gt;suspend)
+			ti-&gt;type-&gt;suspend(ti);
+	}
+}
+
+void dm_table_resume_targets(struct dm_table *t)
+{
+	int i;
+
+	for (i = 0; i &lt; t-&gt;num_targets; i++) {
+		struct dm_target *ti = t-&gt;targets + i;
+
+		if (ti-&gt;type-&gt;resume)
+			ti-&gt;type-&gt;resume(ti);
+	}
+}
+
+EXPORT_SYMBOL(dm_get_device);
+EXPORT_SYMBOL(dm_put_device);
+EXPORT_SYMBOL(dm_table_event);
+EXPORT_SYMBOL(dm_table_get_mode);
--- linux-2.4.26-rc1/drivers/md/dm-target.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm-target.c	Wed Mar 31 13:54:19 2004
@@ -0,0 +1,203 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+
+#include &lt;linux/module.h&gt;
+#include &lt;linux/kmod.h&gt;
+#include &lt;linux/slab.h&gt;
+
+struct tt_internal {
+	struct target_type tt;
+
+	struct list_head list;
+	long use;
+};
+
+static LIST_HEAD(_targets);
+static DECLARE_RWSEM(_lock);
+
+#define DM_MOD_NAME_SIZE 32
+
+static inline struct tt_internal *__find_target_type(const char *name)
+{
+	struct list_head *tih;
+	struct tt_internal *ti;
+
+	list_for_each(tih, &amp;_targets) {
+		ti = list_entry(tih, struct tt_internal, list);
+
+		if (!strcmp(name, ti-&gt;tt.name))
+			return ti;
+	}
+
+	return NULL;
+}
+
+static struct tt_internal *get_target_type(const char *name)
+{
+	struct tt_internal *ti;
+
+	down_read(&amp;_lock);
+	ti = __find_target_type(name);
+
+	if (ti) {
+		if (ti-&gt;use == 0 &amp;&amp; ti-&gt;tt.module)
+			__MOD_INC_USE_COUNT(ti-&gt;tt.module);
+		ti-&gt;use++;
+	}
+	up_read(&amp;_lock);
+
+	return ti;
+}
+
+static void load_module(const char *name)
+{
+	char module_name[DM_MOD_NAME_SIZE] = "dm-";
+
+	/* Length check for strcat() below */
+	if (strlen(name) &gt; (DM_MOD_NAME_SIZE - 4))
+		return;
+
+	strcat(module_name, name);
+	request_module(module_name);
+}
+
+struct target_type *dm_get_target_type(const char *name)
+{
+	struct tt_internal *ti = get_target_type(name);
+
+	if (!ti) {
+		load_module(name);
+		ti = get_target_type(name);
+	}
+
+	return ti ? &amp;ti-&gt;tt : NULL;
+}
+
+void dm_put_target_type(struct target_type *t)
+{
+	struct tt_internal *ti = (struct tt_internal *) t;
+
+	down_read(&amp;_lock);
+	if (--ti-&gt;use == 0 &amp;&amp; ti-&gt;tt.module)
+		__MOD_DEC_USE_COUNT(ti-&gt;tt.module);
+
+	if (ti-&gt;use &lt; 0)
+		BUG();
+	up_read(&amp;_lock);
+
+	return;
+}
+
+static struct tt_internal *alloc_target(struct target_type *t)
+{
+	struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+
+	if (ti) {
+		memset(ti, 0, sizeof(*ti));
+		ti-&gt;tt = *t;
+	}
+
+	return ti;
+}
+
+
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+					void *param), void *param)
+{
+	struct tt_internal *ti;
+
+	down_read(&amp;_lock);
+	list_for_each_entry (ti, &amp;_targets, list)
+		iter_func(&amp;ti-&gt;tt, param);
+	up_read(&amp;_lock);
+
+	return 0;
+}
+
+int dm_register_target(struct target_type *t)
+{
+	int rv = 0;
+	struct tt_internal *ti = alloc_target(t);
+
+	if (!ti)
+		return -ENOMEM;
+
+	down_write(&amp;_lock);
+	if (__find_target_type(t-&gt;name)) {
+		kfree(ti);
+		rv = -EEXIST;
+	} else
+		list_add(&amp;ti-&gt;list, &amp;_targets);
+
+	up_write(&amp;_lock);
+	return rv;
+}
+
+int dm_unregister_target(struct target_type *t)
+{
+	struct tt_internal *ti;
+
+	down_write(&amp;_lock);
+	if (!(ti = __find_target_type(t-&gt;name))) {
+		up_write(&amp;_lock);
+		return -EINVAL;
+	}
+
+	if (ti-&gt;use) {
+		up_write(&amp;_lock);
+		return -ETXTBSY;
+	}
+
+	list_del(&amp;ti-&gt;list);
+	kfree(ti);
+
+	up_write(&amp;_lock);
+	return 0;
+}
+
+/*
+ * io-err: always fails an io, useful for bringing
+ * up LVs that have holes in them.
+ */
+static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
+{
+	return 0;
+}
+
+static void io_err_dtr(struct dm_target *ti)
+{
+	/* empty */
+}
+
+static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
+		      union map_info *map_context)
+{
+	return -EIO;
+}
+
+static struct target_type error_target = {
+	.name = "error",
+	.version = {1, 0, 1},
+	.ctr  = io_err_ctr,
+	.dtr  = io_err_dtr,
+	.map  = io_err_map,
+};
+
+int dm_target_init(void)
+{
+	return dm_register_target(&amp;error_target);
+}
+
+void dm_target_exit(void)
+{
+	if (dm_unregister_target(&amp;error_target))
+		DMWARN("error target unregistration failed");
+}
+
+EXPORT_SYMBOL(dm_register_target);
+EXPORT_SYMBOL(dm_unregister_target);
--- linux-2.4.26-rc1/drivers/md/dm.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,1115 @@
+/*
+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm.h"
+#include "kcopyd.h"
+
+#include &lt;linux/init.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/blk.h&gt;
+#include &lt;linux/blkpg.h&gt;
+#include &lt;linux/mempool.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/major.h&gt;
+#include &lt;linux/kdev_t.h&gt;
+#include &lt;linux/lvm.h&gt;
+
+#include &lt;asm/uaccess.h&gt;
+
+static const char *_name = DM_NAME;
+#define DEFAULT_READ_AHEAD 64
+
+struct dm_io {
+	struct mapped_device *md;
+
+	struct dm_target *ti;
+	int rw;
+	union map_info map_context;
+	void (*end_io) (struct buffer_head * bh, int uptodate);
+	void *context;
+};
+
+struct deferred_io {
+	int rw;
+	struct buffer_head *bh;
+	struct deferred_io *next;
+};
+
+/*
+ * Bits for the md-&gt;flags field.
+ */
+#define DMF_BLOCK_IO 0
+#define DMF_SUSPENDED 1
+
+struct mapped_device {
+	struct rw_semaphore lock;
+	atomic_t holders;
+
+	kdev_t dev;
+	unsigned long flags;
+
+	/*
+	 * A list of ios that arrived while we were suspended.
+	 */
+	atomic_t pending;
+	wait_queue_head_t wait;
+	struct deferred_io *deferred;
+
+	/*
+	 * The current mapping.
+	 */
+	struct dm_table *map;
+
+	/*
+	 * io objects are allocated from here.
+	 */
+	mempool_t *io_pool;
+
+	/*
+	 * Event handling.
+	 */
+	uint32_t event_nr;
+	wait_queue_head_t eventq;
+};
+
+#define MIN_IOS 256
+static kmem_cache_t *_io_cache;
+
+static struct mapped_device *get_kdev(kdev_t dev);
+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
+
+/*-----------------------------------------------------------------
+ * In order to avoid the 256 minor number limit we are going to
+ * register more major numbers as necessary.
+ *---------------------------------------------------------------*/
+#define MAX_MINORS (1 &lt;&lt; MINORBITS)
+
+struct major_details {
+	unsigned int major;
+
+	int transient;
+	struct list_head transient_list;
+
+	unsigned int first_free_minor;
+	int nr_free_minors;
+
+	struct mapped_device *mds[MAX_MINORS];
+	int blk_size[MAX_MINORS];
+	int blksize_size[MAX_MINORS];
+	int hardsect_size[MAX_MINORS];
+};
+
+static struct rw_semaphore _dev_lock;
+static struct major_details *_majors[MAX_BLKDEV];
+
+/*
+ * This holds a list of majors that non-specified device numbers
+ * may be allocated from.  Only majors with free minors appear on
+ * this list.
+ */
+static LIST_HEAD(_transients_free);
+
+static int __alloc_major(unsigned int major, struct major_details **result)
+{
+	int r;
+	unsigned int transient = !major;
+	struct major_details *maj;
+
+	/* Major already allocated? */
+	if (major &amp;&amp; _majors[major])
+		return 0;
+
+	maj = kmalloc(sizeof(*maj), GFP_KERNEL);
+	if (!maj)
+		return -ENOMEM;
+
+	memset(maj, 0, sizeof(*maj));
+	INIT_LIST_HEAD(&amp;maj-&gt;transient_list);
+
+	maj-&gt;nr_free_minors = MAX_MINORS;
+
+	r = register_blkdev(major, _name, &amp;dm_blk_dops);
+	if (r &lt; 0) {
+		DMERR("register_blkdev failed for %d", major);
+		kfree(maj);
+		return r;
+	}
+	if (r &gt; 0)
+		major = r;
+
+	maj-&gt;major = major;
+
+	if (transient) {
+		maj-&gt;transient = transient;
+		list_add_tail(&amp;maj-&gt;transient_list, &amp;_transients_free);
+	}
+
+	_majors[major] = maj;
+
+	blk_size[major] = maj-&gt;blk_size;
+	blksize_size[major] = maj-&gt;blksize_size;
+	hardsect_size[major] = maj-&gt;hardsect_size;
+	read_ahead[major] = DEFAULT_READ_AHEAD;
+
+	blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
+
+	*result = maj;
+	return 0;
+}
+
+static void __free_major(struct major_details *maj)
+{
+	unsigned int major = maj-&gt;major;
+
+	list_del(&amp;maj-&gt;transient_list);
+
+	read_ahead[major] = 0;
+	blk_size[major] = NULL;
+	blksize_size[major] = NULL;
+	hardsect_size[major] = NULL;
+
+	_majors[major] = NULL;
+	kfree(maj);
+
+	if (unregister_blkdev(major, _name) &lt; 0)
+		DMERR("unregister_blkdev failed");
+}
+
+static void free_all_majors(void)
+{
+	unsigned int major = ARRAY_SIZE(_majors);
+
+	down_write(&amp;_dev_lock);
+
+	while (major--)
+		if (_majors[major])
+			__free_major(_majors[major]);
+
+	up_write(&amp;_dev_lock);
+}
+
+static void free_dev(kdev_t dev)
+{
+	unsigned int major = major(dev);
+	unsigned int minor = minor(dev);
+	struct major_details *maj;
+
+	down_write(&amp;_dev_lock);
+
+	maj = _majors[major];
+	if (!maj)
+		goto out;
+
+	maj-&gt;mds[minor] = NULL;
+	maj-&gt;nr_free_minors++;
+
+	if (maj-&gt;nr_free_minors == MAX_MINORS) {
+		__free_major(maj);
+		goto out;
+	}
+
+	if (!maj-&gt;transient)
+		goto out;
+
+	if (maj-&gt;nr_free_minors == 1)
+		list_add_tail(&amp;maj-&gt;transient_list, &amp;_transients_free);
+
+	if (minor &lt; maj-&gt;first_free_minor)
+		maj-&gt;first_free_minor = minor;
+
+      out:
+	up_write(&amp;_dev_lock);
+}
+
+static void __alloc_minor(struct major_details *maj, unsigned int minor,
+			  struct mapped_device *md)
+{
+	maj-&gt;mds[minor] = md;
+	md-&gt;dev = mk_kdev(maj-&gt;major, minor);
+	maj-&gt;nr_free_minors--;
+
+	if (maj-&gt;transient &amp;&amp; !maj-&gt;nr_free_minors)
+		list_del_init(&amp;maj-&gt;transient_list);
+}
+
+/*
+ * See if requested kdev_t is available.
+ */
+static int specific_dev(kdev_t dev, struct mapped_device *md)
+{
+	int r = 0;
+	unsigned int major = major(dev);
+	unsigned int minor = minor(dev);
+	struct major_details *maj;
+
+	if (!major || (major &gt;= MAX_BLKDEV) || (minor &gt;= MAX_MINORS)) {
+		DMWARN("device number requested out of range (%d, %d)",
+		       major, minor);
+		return -EINVAL;
+	}
+
+	down_write(&amp;_dev_lock);
+	maj = _majors[major];
+
+	/* Register requested major? */
+	if (!maj) {
+		r = __alloc_major(major, &amp;maj);
+		if (r)
+			goto out;
+
+		major = maj-&gt;major;
+	}
+
+	if (maj-&gt;mds[minor]) {
+		r = -EBUSY;
+		goto out;
+	}
+
+	__alloc_minor(maj, minor, md);
+
+      out:
+	up_write(&amp;_dev_lock);
+
+	return r;
+}
+
+/*
+ * Find first unused device number, requesting a new major number if required.
+ */
+static int first_free_dev(struct mapped_device *md)
+{
+	int r = 0;
+	struct major_details *maj;
+
+	down_write(&amp;_dev_lock);
+
+	if (list_empty(&amp;_transients_free)) {
+		r = __alloc_major(0, &amp;maj);
+		if (r)
+			goto out;
+	} else
+		maj = list_entry(_transients_free.next, struct major_details,
+				 transient_list);
+
+	while (maj-&gt;mds[maj-&gt;first_free_minor++])
+		;
+
+	__alloc_minor(maj, maj-&gt;first_free_minor - 1, md);
+
+      out:
+	up_write(&amp;_dev_lock);
+
+	return r;
+}
+
+static struct mapped_device *get_kdev(kdev_t dev)
+{
+	struct mapped_device *md;
+	struct major_details *maj;
+
+	down_read(&amp;_dev_lock);
+	maj = _majors[major(dev)];
+	if (!maj) {
+		md = NULL;
+		goto out;
+	}
+	md = maj-&gt;mds[minor(dev)];
+	if (md)
+		dm_get(md);
+      out:
+	up_read(&amp;_dev_lock);
+
+	return md;
+}
+
+/*-----------------------------------------------------------------
+ * init/exit code
+ *---------------------------------------------------------------*/
+
+static __init int local_init(void)
+{
+	init_rwsem(&amp;_dev_lock);
+
+	/* allocate a slab for the dm_ios */
+	_io_cache = kmem_cache_create("dm io",
+				      sizeof(struct dm_io), 0, 0, NULL, NULL);
+
+	if (!_io_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void local_exit(void)
+{
+	kmem_cache_destroy(_io_cache);
+	free_all_majors();
+
+	DMINFO("cleaned up");
+}
+
+/*
+ * We have a lot of init/exit functions, so it seems easier to
+ * store them in an array.  The disposable macro 'xx'
+ * expands a prefix into a pair of function names.
+ */
+static struct {
+	int (*init) (void);
+	void (*exit) (void);
+
+} _inits[] = {
+#define xx(n) {n ## _init, n ## _exit},
+	xx(local)
+	xx(kcopyd)
+	xx(dm_target)
+	xx(dm_linear)
+	xx(dm_stripe)
+	xx(dm_snapshot)
+	xx(dm_interface)
+#undef xx
+};
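+
+/*
+ * For example, xx(local) above expands to { local_init, local_exit },
+ * so dm_init()/dm_exit() below simply walk this table in order.
+ */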
+
+static int __init dm_init(void)
+{
+	const int count = ARRAY_SIZE(_inits);
+
+	int r, i;
+
+	for (i = 0; i &lt; count; i++) {
+		r = _inits[i].init();
+		if (r)
+			goto bad;
+	}
+
+	return 0;
+
+      bad:
+	while (i--)
+		_inits[i].exit();
+
+	return r;
+}
+
+static void __exit dm_exit(void)
+{
+	int i = ARRAY_SIZE(_inits);
+
+	while (i--)
+		_inits[i].exit();
+}
+
+/*
+ * Block device functions
+ */
+static int dm_blk_open(struct inode *inode, struct file *file)
+{
+	struct mapped_device *md;
+
+	md = get_kdev(inode-&gt;i_rdev);
+	if (!md)
+		return -ENXIO;
+
+	return 0;
+}
+
+static int dm_blk_close(struct inode *inode, struct file *file)
+{
+	struct mapped_device *md;
+
+	md = get_kdev(inode-&gt;i_rdev);
+	dm_put(md);		/* put the reference gained by dm_blk_open */
+	dm_put(md);
+	return 0;
+}
+
+static inline struct dm_io *alloc_io(struct mapped_device *md)
+{
+	return mempool_alloc(md-&gt;io_pool, GFP_NOIO);
+}
+
+static inline void free_io(struct mapped_device *md, struct dm_io *io)
+{
+	mempool_free(io, md-&gt;io_pool);
+}
+
+static inline struct deferred_io *alloc_deferred(void)
+{
+	return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
+}
+
+static inline void free_deferred(struct deferred_io *di)
+{
+	kfree(di);
+}
+
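+/*
+ * blk_size[] holds device sizes in 1k units (see __bind() below),
+ * so shifting left by one gives the size in 512 byte sectors.
+ */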
+static inline sector_t volume_size(kdev_t dev)
+{
+	return blk_size[major(dev)][minor(dev)] &lt;&lt; 1;
+}
+
+/* FIXME: check this */
+static int dm_blk_ioctl(struct inode *inode, struct file *file,
+			unsigned int command, unsigned long a)
+{
+	kdev_t dev = inode-&gt;i_rdev;
+	long size;
+
+	switch (command) {
+	case BLKROSET:
+	case BLKROGET:
+	case BLKRASET:
+	case BLKRAGET:
+	case BLKFLSBUF:
+	case BLKSSZGET:
+		//case BLKRRPART: /* Re-read partition tables */
+		//case BLKPG:
+	case BLKELVGET:
+	case BLKELVSET:
+	case BLKBSZGET:
+	case BLKBSZSET:
+		return blk_ioctl(dev, command, a);
+		break;
+
+	case BLKGETSIZE:
+		size = volume_size(dev);
+		if (copy_to_user((void *) a, &amp;size, sizeof(long)))
+			return -EFAULT;
+		break;
+
+	case BLKGETSIZE64:
+		size = volume_size(dev);
+		if (put_user((u64) ((u64) size) &lt;&lt; 9, (u64 *) a))
+			return -EFAULT;
+		break;
+
+	case BLKRRPART:
+		return -ENOTTY;
+
+	case LV_BMAP:
+		return dm_user_bmap(inode, (struct lv_bmap *) a);
+
+	default:
+		DMWARN("unknown block ioctl 0x%x", command);
+		return -ENOTTY;
+	}
+
+	return 0;
+}
+
+/*
+ * Add the buffer to the list of deferred io.
+ */
+static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
+{
+	struct deferred_io *di;
+
+	di = alloc_deferred();
+	if (!di)
+		return -ENOMEM;
+
+	down_write(&amp;md-&gt;lock);
+
+	if (!test_bit(DMF_BLOCK_IO, &amp;md-&gt;flags)) {
+		up_write(&amp;md-&gt;lock);
+		free_deferred(di);
+		return 1;
+	}
+
+	di-&gt;bh = bh;
+	di-&gt;rw = rw;
+	di-&gt;next = md-&gt;deferred;
+	md-&gt;deferred = di;
+
+	up_write(&amp;md-&gt;lock);
+	return 0;		/* deferred successfully */
+}
+
+/*
+ * bh-&gt;b_end_io routine that decrements the pending count
+ * and then calls the original bh-&gt;b_end_io fn.
+ */
+static void dec_pending(struct buffer_head *bh, int uptodate)
+{
+	int r;
+	struct dm_io *io = bh-&gt;b_private;
+	dm_endio_fn endio = io-&gt;ti-&gt;type-&gt;end_io;
+
+	if (endio) {
+		r = endio(io-&gt;ti, bh, io-&gt;rw, uptodate ? 0 : -EIO,
+			  &amp;io-&gt;map_context);
+		if (r &lt; 0)
+			uptodate = 0;
+
+		else if (r &gt; 0)
+			/* the target wants another shot at the io */
+			return;
+	}
+
+	if (atomic_dec_and_test(&amp;io-&gt;md-&gt;pending))
+		/* nudge anyone waiting on suspend queue */
+		wake_up(&amp;io-&gt;md-&gt;wait);
+
+	bh-&gt;b_end_io = io-&gt;end_io;
+	bh-&gt;b_private = io-&gt;context;
+	free_io(io-&gt;md, io);
+
+	bh-&gt;b_end_io(bh, uptodate);
+}
+
+/*
+ * Do the bh mapping for a given leaf
+ */
+static inline int __map_buffer(struct mapped_device *md, int rw,
+			       struct buffer_head *bh, struct dm_io *io)
+{
+	struct dm_target *ti;
+
+	if (!md-&gt;map)
+		return -EINVAL;
+
+	ti = dm_table_find_target(md-&gt;map, bh-&gt;b_rsector);
+	if (!ti-&gt;type)
+		return -EINVAL;
+
+	/* hook the end io request fn */
+	atomic_inc(&amp;md-&gt;pending);
+	io-&gt;md = md;
+	io-&gt;ti = ti;
+	io-&gt;rw = rw;
+	io-&gt;end_io = bh-&gt;b_end_io;
+	io-&gt;context = bh-&gt;b_private;
+	bh-&gt;b_end_io = dec_pending;
+	bh-&gt;b_private = io;
+
+	return ti-&gt;type-&gt;map(ti, bh, rw, &amp;io-&gt;map_context);
+}
+
+/*
+ * Checks to see if we should be deferring io; if so, it queues the
+ * buffer and returns 1.
+ */
+static inline int __deferring(struct mapped_device *md, int rw,
+			      struct buffer_head *bh)
+{
+	int r;
+
+	/*
+	 * If we're suspended we have to queue this io for later.
+	 */
+	while (test_bit(DMF_BLOCK_IO, &amp;md-&gt;flags)) {
+		up_read(&amp;md-&gt;lock);
+
+		/*
+		 * There's no point deferring a read ahead
+		 * request, just drop it.
+		 */
+		if (rw == READA) {
+			down_read(&amp;md-&gt;lock);
+			return -EIO;
+		}
+
+		r = queue_io(md, bh, rw);
+		down_read(&amp;md-&gt;lock);
+
+		if (r &lt; 0)
+			return r;
+
+		if (r == 0)
+			return 1;	/* deferred successfully */
+
+	}
+
+	return 0;
+}
+
+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
+{
+	int r;
+	struct dm_io *io;
+	struct mapped_device *md;
+
+	md = get_kdev(bh-&gt;b_rdev);
+	if (!md) {
+		buffer_IO_error(bh);
+		return 0;
+	}
+
+	io = alloc_io(md);
+	down_read(&amp;md-&gt;lock);
+
+	r = __deferring(md, rw, bh);
+	if (r &lt; 0)
+		goto bad;
+
+	else if (!r) {
+		/* not deferring */
+		r = __map_buffer(md, rw, bh, io);
+		if (r &lt; 0)
+			goto bad;
+	} else {
+		/* deferred: the io won't be mapped, so release it */
+		free_io(md, io);
+		r = 0;
+	}
+
+	up_read(&amp;md-&gt;lock);
+	dm_put(md);
+	return r;
+
+      bad:
+	buffer_IO_error(bh);
+	up_read(&amp;md-&gt;lock);
+	dm_put(md);
+	return 0;
+}
+
+static int check_dev_size(kdev_t dev, unsigned long block)
+{
+	unsigned int major = major(dev);
+	unsigned int minor = minor(dev);
+
+	/* FIXME: check this */
+	unsigned long max_sector = (blk_size[major][minor] &lt;&lt; 1) + 1;
+	unsigned long sector = (block + 1) * (blksize_size[major][minor] &gt;&gt; 9);
+
+	return (sector &gt; max_sector) ? 0 : 1;
+}
+
+/*
+ * Creates a dummy buffer head and maps it (for lilo).
+ */
+static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
+		  kdev_t *r_dev, unsigned long *r_block)
+{
+	struct buffer_head bh;
+	struct dm_target *ti;
+	union map_info map_context;
+	int r;
+
+	if (test_bit(DMF_BLOCK_IO, &amp;md-&gt;flags)) {
+		return -EPERM;
+	}
+
+	if (!check_dev_size(dev, block)) {
+		return -EINVAL;
+	}
+
+	if (!md-&gt;map)
+		return -EINVAL;
+
+	/* setup dummy bh */
+	memset(&amp;bh, 0, sizeof(bh));
+	bh.b_blocknr = block;
+	bh.b_dev = bh.b_rdev = dev;
+	bh.b_size = blksize_size[major(dev)][minor(dev)];
+	bh.b_rsector = block * (bh.b_size &gt;&gt; 9);
+
+	/* find target */
+	ti = dm_table_find_target(md-&gt;map, bh.b_rsector);
+
+	/* do the mapping */
+	r = ti-&gt;type-&gt;map(ti, &amp;bh, READ, &amp;map_context);
+	ti-&gt;type-&gt;end_io(ti, &amp;bh, READ, 0, &amp;map_context);
+
+	if (!r) {
+		*r_dev = bh.b_rdev;
+		*r_block = bh.b_rsector / (bh.b_size &gt;&gt; 9);
+	}
+
+	return r;
+}
+
+/*
+ * Marshals arguments and results between user and kernel space.
+ */
+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
+{
+	struct mapped_device *md;
+	unsigned long block, r_block;
+	kdev_t r_dev;
+	int r;
+
+	if (get_user(block, &amp;lvb-&gt;lv_block))
+		return -EFAULT;
+
+	md = get_kdev(inode-&gt;i_rdev);
+	if (!md)
+		return -ENXIO;
+
+	down_read(&amp;md-&gt;lock);
+	r = __bmap(md, inode-&gt;i_rdev, block, &amp;r_dev, &amp;r_block);
+	up_read(&amp;md-&gt;lock);
+	dm_put(md);
+
+	if (!r &amp;&amp; (put_user(kdev_t_to_nr(r_dev), &amp;lvb-&gt;lv_dev) ||
+		   put_user(r_block, &amp;lvb-&gt;lv_block)))
+		r = -EFAULT;
+
+	return r;
+}
+
+static void free_md(struct mapped_device *md)
+{
+	free_dev(md-&gt;dev);
+	mempool_destroy(md-&gt;io_pool);
+	kfree(md);
+}
+
+/*
+ * Allocate and initialise a blank device with a given minor.
+ */
+static struct mapped_device *alloc_md(kdev_t dev)
+{
+	int r;
+	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
+
+	if (!md) {
+		DMWARN("unable to allocate device, out of memory.");
+		return NULL;
+	}
+
+	memset(md, 0, sizeof(*md));
+
+	/* Allocate suitable device number */
+	if (!dev)
+		r = first_free_dev(md);
+	else
+		r = specific_dev(dev, md);
+
+	if (r) {
+		kfree(md);
+		return NULL;
+	}
+
+	md-&gt;io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
+				     mempool_free_slab, _io_cache);
+	if (!md-&gt;io_pool) {
+		/* not free_md(): that would kfree(md) a second time */
+		free_dev(md-&gt;dev);
+		kfree(md);
+		return NULL;
+	}
+
+	init_rwsem(&amp;md-&gt;lock);
+	atomic_set(&amp;md-&gt;holders, 1);
+	atomic_set(&amp;md-&gt;pending, 0);
+	init_waitqueue_head(&amp;md-&gt;wait);
+	init_waitqueue_head(&amp;md-&gt;eventq);
+
+	return md;
+}
+
+/*
+ * The hardsect size for a mapped device is the largest hardsect size
+ * from the devices it maps onto.
+ */
+static int __find_hardsect_size(struct list_head *devices)
+{
+	int result = 512, size;
+	struct list_head *tmp;
+
+	list_for_each (tmp, devices) {
+		struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
+		size = get_hardsect_size(dd-&gt;dev);
+		if (size &gt; result)
+			result = size;
+	}
+
+	return result;
+}
+
+/*
+ * Bind a table to the device.
+ */
+static void event_callback(void *context)
+{
+	struct mapped_device *md = (struct mapped_device *) context;
+
+	down_write(&amp;md-&gt;lock);
+	md-&gt;event_nr++;
+	wake_up_interruptible(&amp;md-&gt;eventq);
+	up_write(&amp;md-&gt;lock);
+}
+
+static int __bind(struct mapped_device *md, struct dm_table *t)
+{
+	unsigned int minor = minor(md-&gt;dev);
+	unsigned int major = major(md-&gt;dev);
+	md-&gt;map = t;
+
+	/* in k */
+	blk_size[major][minor] = dm_table_get_size(t) &gt;&gt; 1;
+	blksize_size[major][minor] = BLOCK_SIZE;
+	hardsect_size[major][minor] =
+	    __find_hardsect_size(dm_table_get_devices(t));
+	register_disk(NULL, md-&gt;dev, 1, &amp;dm_blk_dops, blk_size[major][minor]);
+
+	dm_table_event_callback(md-&gt;map, event_callback, md);
+	dm_table_get(t);
+	return 0;
+}
+
+static void __unbind(struct mapped_device *md)
+{
+	unsigned int minor = minor(md-&gt;dev);
+	unsigned int major = major(md-&gt;dev);
+
+	if (md-&gt;map) {
+		dm_table_event_callback(md-&gt;map, NULL, NULL);
+		dm_table_put(md-&gt;map);
+		md-&gt;map = NULL;
+
+	}
+
+	blk_size[major][minor] = 0;
+	blksize_size[major][minor] = 0;
+	hardsect_size[major][minor] = 0;
+}
+
+/*
+ * Constructor for a new device.
+ */
+int dm_create(kdev_t dev, struct mapped_device **result)
+{
+	struct mapped_device *md;
+
+	md = alloc_md(dev);
+	if (!md)
+		return -ENXIO;
+
+	__unbind(md);	/* Ensure zero device size */
+
+	*result = md;
+	return 0;
+}
+
+void dm_get(struct mapped_device *md)
+{
+	atomic_inc(&amp;md-&gt;holders);
+}
+
+void dm_put(struct mapped_device *md)
+{
+	if (atomic_dec_and_test(&amp;md-&gt;holders)) {
+		if (md-&gt;map)
+			dm_table_suspend_targets(md-&gt;map);
+		__unbind(md);
+		free_md(md);
+	}
+}
+
+/*
+ * Requeue the deferred io by calling generic_make_request.
+ */
+static void flush_deferred_io(struct deferred_io *c)
+{
+	struct deferred_io *n;
+
+	while (c) {
+		n = c-&gt;next;
+		generic_make_request(c-&gt;rw, c-&gt;bh);
+		free_deferred(c);
+		c = n;
+	}
+}
+
+/*
+ * Swap in a new table (destroying old one).
+ */
+int dm_swap_table(struct mapped_device *md, struct dm_table *table)
+{
+	int r;
+
+	down_write(&amp;md-&gt;lock);
+
+	/*
+	 * The device must be suspended, or have no table bound yet.
+	 */
+	if (md-&gt;map &amp;&amp; !test_bit(DMF_SUSPENDED, &amp;md-&gt;flags)) {
+		up_write(&amp;md-&gt;lock);
+		return -EPERM;
+	}
+
+	__unbind(md);
+	r = __bind(md, table);
+	if (r) {
+		up_write(&amp;md-&gt;lock);
+		return r;
+	}
+
+	up_write(&amp;md-&gt;lock);
+	return 0;
+}
+
+/*
+ * We need to be able to change a mapping table under a mounted
+ * filesystem.  For example we might want to move some data in
+ * the background.  Before the table can be swapped with
+ * dm_swap_table, dm_suspend must be called to flush any in
+ * flight io and ensure that any further io gets deferred.
+ */
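+/*
+ * A typical table swap therefore looks like (error handling omitted):
+ *
+ *	dm_suspend(md);
+ *	dm_swap_table(md, new_table);
+ *	dm_resume(md);
+ */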
+int dm_suspend(struct mapped_device *md)
+{
+	int r = 0;
+	DECLARE_WAITQUEUE(wait, current);
+
+	down_write(&amp;md-&gt;lock);
+
+	/*
+	 * First we set the BLOCK_IO flag so no more ios will be
+	 * mapped.
+	 */
+	if (test_bit(DMF_BLOCK_IO, &amp;md-&gt;flags)) {
+		up_write(&amp;md-&gt;lock);
+		return -EINVAL;
+	}
+
+	set_bit(DMF_BLOCK_IO, &amp;md-&gt;flags);
+	add_wait_queue(&amp;md-&gt;wait, &amp;wait);
+	up_write(&amp;md-&gt;lock);
+
+	/*
+	 * Then we wait for the already mapped ios to
+	 * complete.
+	 */
+	run_task_queue(&amp;tq_disk);
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if (!atomic_read(&amp;md-&gt;pending) || signal_pending(current))
+			break;
+
+		schedule();
+	}
+	set_current_state(TASK_RUNNING);
+
+	down_write(&amp;md-&gt;lock);
+	remove_wait_queue(&amp;md-&gt;wait, &amp;wait);
+
+	/* did we flush everything ? */
+	if (atomic_read(&amp;md-&gt;pending)) {
+		clear_bit(DMF_BLOCK_IO, &amp;md-&gt;flags);
+		r = -EINTR;
+	} else {
+		set_bit(DMF_SUSPENDED, &amp;md-&gt;flags);
+		if (md-&gt;map)
+			dm_table_suspend_targets(md-&gt;map);
+	}
+	up_write(&amp;md-&gt;lock);
+
+	return r;
+}
+
+int dm_resume(struct mapped_device *md)
+{
+	struct deferred_io *def;
+
+	down_write(&amp;md-&gt;lock);
+	if (!test_bit(DMF_SUSPENDED, &amp;md-&gt;flags)) {
+		up_write(&amp;md-&gt;lock);
+		return -EINVAL;
+	}
+
+	if (md-&gt;map)
+		dm_table_resume_targets(md-&gt;map);
+
+	clear_bit(DMF_SUSPENDED, &amp;md-&gt;flags);
+	clear_bit(DMF_BLOCK_IO, &amp;md-&gt;flags);
+	def = md-&gt;deferred;
+	md-&gt;deferred = NULL;
+	up_write(&amp;md-&gt;lock);
+
+	flush_deferred_io(def);
+	run_task_queue(&amp;tq_disk);
+
+	return 0;
+}
+
+struct dm_table *dm_get_table(struct mapped_device *md)
+{
+	struct dm_table *t;
+
+	down_read(&amp;md-&gt;lock);
+	t = md-&gt;map;
+	if (t)
+		dm_table_get(t);
+	up_read(&amp;md-&gt;lock);
+
+	return t;
+}
+
+/*-----------------------------------------------------------------
+ * Event notification.
+ *---------------------------------------------------------------*/
+uint32_t dm_get_event_nr(struct mapped_device *md)
+{
+	uint32_t r;
+
+	down_read(&amp;md-&gt;lock);
+	r = md-&gt;event_nr;
+	up_read(&amp;md-&gt;lock);
+
+	return r;
+}
+
+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
+		      uint32_t event_nr)
+{
+	down_write(&amp;md-&gt;lock);
+	if (event_nr != md-&gt;event_nr) {
+		up_write(&amp;md-&gt;lock);
+		return 1;
+	}
+
+	add_wait_queue(&amp;md-&gt;eventq, wq);
+	up_write(&amp;md-&gt;lock);
+
+	return 0;
+}
+
+const char *dm_kdevname(kdev_t dev)
+{
+	static char buffer[32];
+	sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
+	return buffer;
+}
+
+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
+{
+	down_write(&amp;md-&gt;lock);
+	remove_wait_queue(&amp;md-&gt;eventq, wq);
+	up_write(&amp;md-&gt;lock);
+}
+
+kdev_t dm_kdev(struct mapped_device *md)
+{
+	kdev_t dev;
+
+	down_read(&amp;md-&gt;lock);
+	dev = md-&gt;dev;
+	up_read(&amp;md-&gt;lock);
+
+	return dev;
+}
+
+int dm_suspended(struct mapped_device *md)
+{
+	return test_bit(DMF_SUSPENDED, &amp;md-&gt;flags);
+}
+
+struct block_device_operations dm_blk_dops = {
+	.open = dm_blk_open,
+	.release = dm_blk_close,
+	.ioctl = dm_blk_ioctl,
+	.owner = THIS_MODULE
+};
+
+/*
+ * module hooks
+ */
+module_init(dm_init);
+module_exit(dm_exit);
+
+MODULE_DESCRIPTION(DM_NAME " driver");
+MODULE_AUTHOR("Joe Thornber &lt;thornber@sistina.com&gt;");
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(dm_kdevname);
--- linux-2.4.26-rc1/drivers/md/dm.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/dm.h	Wed Mar 31 13:49:22 2004
@@ -0,0 +1,177 @@
+/*
+ * Internal header file for device mapper
+ *
+ * Copyright (C) 2001, 2002 Sistina Software
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef DM_INTERNAL_H
+#define DM_INTERNAL_H
+
+#include &lt;linux/fs.h&gt;
+#include &lt;linux/device-mapper.h&gt;
+#include &lt;linux/list.h&gt;
+#include &lt;linux/blkdev.h&gt;
+
+#define DM_NAME "device-mapper"
+#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
+#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
+#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
+
+/*
+ * FIXME: I think this should be with the definition of sector_t
+ * in types.h.
+ */
+#ifdef CONFIG_LBD
+#define SECTOR_FORMAT "%Lu"
+#else
+#define SECTOR_FORMAT "%lu"
+#endif
+
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1 &lt;&lt; SECTOR_SHIFT)
+
+extern struct block_device_operations dm_blk_dops;
+
+/*
+ * List of devices that a metadevice uses and should open/close.
+ */
+struct dm_dev {
+	struct list_head list;
+
+	atomic_t count;
+	int mode;
+	kdev_t dev;
+	struct block_device *bdev;
+};
+
+struct dm_table;
+struct mapped_device;
+
+/*-----------------------------------------------------------------
+ * Functions for manipulating a struct mapped_device.
+ * Drop the reference with dm_put when you finish with the object.
+ *---------------------------------------------------------------*/
+int dm_create(kdev_t dev, struct mapped_device **md);
+
+/*
+ * Reference counting for md.
+ */
+void dm_get(struct mapped_device *md);
+void dm_put(struct mapped_device *md);
+
+/*
+ * A device can still be used while suspended, but I/O is deferred.
+ */
+int dm_suspend(struct mapped_device *md);
+int dm_resume(struct mapped_device *md);
+
+/*
+ * The device must be suspended before calling this method.
+ */
+int dm_swap_table(struct mapped_device *md, struct dm_table *t);
+
+/*
+ * Drop a reference on the table when you've finished with the
+ * result.
+ */
+struct dm_table *dm_get_table(struct mapped_device *md);
+
+/*
+ * Event functions.
+ */
+uint32_t dm_get_event_nr(struct mapped_device *md);
+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
+		      uint32_t event_nr);
+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
+
+/*
+ * Info functions.
+ */
+kdev_t dm_kdev(struct mapped_device *md);
+int dm_suspended(struct mapped_device *md);
+
+/*-----------------------------------------------------------------
+ * Functions for manipulating a table.  Tables are also reference
+ * counted.
+ *---------------------------------------------------------------*/
+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
+
+void dm_table_get(struct dm_table *t);
+void dm_table_put(struct dm_table *t);
+
+int dm_table_add_target(struct dm_table *t, const char *type,
+			sector_t start,	sector_t len, char *params);
+int dm_table_complete(struct dm_table *t);
+void dm_table_event_callback(struct dm_table *t,
+			     void (*fn)(void *), void *context);
+void dm_table_event(struct dm_table *t);
+sector_t dm_table_get_size(struct dm_table *t);
+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
+unsigned int dm_table_get_num_targets(struct dm_table *t);
+struct list_head *dm_table_get_devices(struct dm_table *t);
+int dm_table_get_mode(struct dm_table *t);
+void dm_table_suspend_targets(struct dm_table *t);
+void dm_table_resume_targets(struct dm_table *t);
+
+/*-----------------------------------------------------------------
+ * A registry of target types.
+ *---------------------------------------------------------------*/
+int dm_target_init(void);
+void dm_target_exit(void);
+struct target_type *dm_get_target_type(const char *name);
+void dm_put_target_type(struct target_type *t);
+int dm_target_iterate(void (*iter_func)(struct target_type *tt,
+					void *param), void *param);
+
+
+/*-----------------------------------------------------------------
+ * Useful inlines.
+ *---------------------------------------------------------------*/
+static inline int array_too_big(unsigned long fixed, unsigned long obj,
+				unsigned long num)
+{
+	return (num &gt; (ULONG_MAX - fixed) / obj);
+}
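+
+/*
+ * For example, a target constructor allocating a header followed by
+ * 'num' fixed-size entries would check
+ * array_too_big(sizeof(header), sizeof(entry), num) before doing
+ * the kmalloc() (names illustrative only).
+ */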
+
+/*
+ * ceiling(n / size) * size
+ */
+static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
+{
+	unsigned long r = n % size;
+	return n + (r ? (size - r) : 0);
+}
+
+/*
+ * Ceiling(n / size)
+ */
+static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
+{
+	return dm_round_up(n, size) / size;
+}
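+
+/*
+ * e.g. dm_round_up(1000, 512) == 1024 and dm_div_up(1000, 512) == 2.
+ */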
+
+const char *dm_kdevname(kdev_t dev);
+
+/*
+ * The device-mapper can be driven through one of two interfaces;
+ * ioctl or filesystem, depending which patch you have applied.
+ */
+int dm_interface_init(void);
+void dm_interface_exit(void);
+
+/*
+ * Targets for linear and striped mappings
+ */
+int dm_linear_init(void);
+void dm_linear_exit(void);
+
+int dm_stripe_init(void);
+void dm_stripe_exit(void);
+
+int dm_snapshot_init(void);
+void dm_snapshot_exit(void);
+
+#endif
--- linux-2.4.26-rc1/drivers/md/kcopyd.c	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/kcopyd.c	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,666 @@
+/*
+ * Copyright (C) 2002 Sistina Software (UK) Limited.
+ *
+ * This file is released under the GPL.
+ */
+
+#include &lt;asm/atomic.h&gt;
+
+#include &lt;linux/blkdev.h&gt;
+#include &lt;linux/config.h&gt;
+#include &lt;linux/device-mapper.h&gt;
+#include &lt;linux/fs.h&gt;
+#include &lt;linux/init.h&gt;
+#include &lt;linux/list.h&gt;
+#include &lt;linux/locks.h&gt;
+#include &lt;linux/mempool.h&gt;
+#include &lt;linux/module.h&gt;
+#include &lt;linux/pagemap.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/vmalloc.h&gt;
+
+#include "kcopyd.h"
+#include "dm-daemon.h"
+
+/* FIXME: this is only needed for the DMERR macros */
+#include "dm.h"
+
+static struct dm_daemon _kcopyd;
+
+#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
+#define SUB_JOB_SIZE 128
+#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE)
+#define SUB_JOB_COUNT 8
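+
+/*
+ * SUB_JOB_SIZE is in 512 byte sectors, so with 4k pages each sub job
+ * covers 64k and needs PAGES_PER_SUB_JOB == 16 pages.
+ */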
+
+/*-----------------------------------------------------------------
+ * Each kcopyd client has its own little pool of preallocated
+ * pages for kcopyd io.
+ *---------------------------------------------------------------*/
+struct kcopyd_client {
+	struct list_head list;
+
+	spinlock_t lock;
+	struct list_head pages;
+	unsigned int nr_pages;
+	unsigned int nr_free_pages;
+	unsigned int max_split;
+};
+
+static inline void __push_page(struct kcopyd_client *kc, struct page *p)
+{
+	list_add(&amp;p-&gt;list, &amp;kc-&gt;pages);
+	kc-&gt;nr_free_pages++;
+}
+
+static inline struct page *__pop_page(struct kcopyd_client *kc)
+{
+	struct page *p;
+
+	p = list_entry(kc-&gt;pages.next, struct page, list);
+	list_del(&amp;p-&gt;list);
+	kc-&gt;nr_free_pages--;
+
+	return p;
+}
+
+static int kcopyd_get_pages(struct kcopyd_client *kc,
+			    unsigned int nr, struct list_head *pages)
+{
+	struct page *p;
+	INIT_LIST_HEAD(pages);
+
+	spin_lock(&amp;kc-&gt;lock);
+	if (kc-&gt;nr_free_pages &lt; nr) {
+		spin_unlock(&amp;kc-&gt;lock);
+		return -ENOMEM;
+	}
+
+	while (nr--) {
+		p = __pop_page(kc);
+		list_add(&amp;p-&gt;list, pages);
+	}
+	spin_unlock(&amp;kc-&gt;lock);
+
+	return 0;
+}
+
+static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
+{
+	struct list_head *tmp, *tmp2;
+
+	spin_lock(&amp;kc-&gt;lock);
+	list_for_each_safe (tmp, tmp2, pages)
+		__push_page(kc, list_entry(tmp, struct page, list));
+	spin_unlock(&amp;kc-&gt;lock);
+}
+
+/*
+ * These three functions resize the page pool.
+ */
+static void release_pages(struct list_head *pages)
+{
+	struct page *p;
+	struct list_head *tmp, *tmp2;
+
+	list_for_each_safe (tmp, tmp2, pages) {
+		p = list_entry(tmp, struct page, list);
+		UnlockPage(p);
+		__free_page(p);
+	}
+}
+
+static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
+{
+	unsigned int i;
+	struct page *p;
+	LIST_HEAD(new);
+
+	for (i = 0; i &lt; nr; i++) {
+		p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			release_pages(&amp;new);
+			return -ENOMEM;
+		}
+
+		LockPage(p);
+		list_add(&amp;p-&gt;list, &amp;new);
+	}
+
+	kcopyd_put_pages(kc, &amp;new);
+	kc-&gt;nr_pages += nr;
+	kc-&gt;max_split = kc-&gt;nr_pages / PAGES_PER_SUB_JOB;
+	if (kc-&gt;max_split &gt; SUB_JOB_COUNT)
+		kc-&gt;max_split = SUB_JOB_COUNT;
+
+	return 0;
+}
+
+static void client_free_pages(struct kcopyd_client *kc)
+{
+	BUG_ON(kc-&gt;nr_free_pages != kc-&gt;nr_pages);
+	release_pages(&amp;kc-&gt;pages);
+	kc-&gt;nr_free_pages = kc-&gt;nr_pages = 0;
+}
+
+/*-----------------------------------------------------------------
+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
+ * for this reason we use a mempool to prevent the client from
+ * ever having to do io (which could cause a deadlock).
+ *---------------------------------------------------------------*/
+struct kcopyd_job {
+	struct kcopyd_client *kc;
+	struct list_head list;
+	unsigned int flags;
+
+	/*
+	 * Error state of the job.
+	 */
+	int read_err;
+	unsigned int write_err;
+
+	/*
+	 * Either READ or WRITE
+	 */
+	int rw;
+	struct io_region source;
+
+	/*
+	 * The destinations for the transfer.
+	 */
+	unsigned int num_dests;
+	struct io_region dests[KCOPYD_MAX_REGIONS];
+
+	sector_t offset;
+	unsigned int nr_pages;
+	struct list_head pages;
+
+	/*
+	 * Set this to ensure you are notified when the job has
+	 * completed.  'context' is for callback to use.
+	 */
+	kcopyd_notify_fn fn;
+	void *context;
+
+	/*
+	 * These fields are only used if the job has been split
+	 * into more manageable parts.
+	 */
+	struct semaphore lock;
+	atomic_t sub_jobs;
+	sector_t progress;
+};
+
+/* FIXME: this should scale with the number of pages */
+#define MIN_JOBS 512
+
+static kmem_cache_t *_job_cache;
+static mempool_t *_job_pool;
+
+/*
+ * We maintain three lists of jobs:
+ *
+ * i)   jobs waiting for pages
+ * ii)  jobs that have pages, and are waiting for the io to be issued.
+ * iii) jobs that have completed.
+ *
+ * All three of these are protected by job_lock.
+ */
+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(_complete_jobs);
+static LIST_HEAD(_io_jobs);
+static LIST_HEAD(_pages_jobs);
+
+static int jobs_init(void)
+{
+	INIT_LIST_HEAD(&amp;_complete_jobs);
+	INIT_LIST_HEAD(&amp;_io_jobs);
+	INIT_LIST_HEAD(&amp;_pages_jobs);
+
+	_job_cache = kmem_cache_create("kcopyd-jobs",
+				       sizeof(struct kcopyd_job),
+				       __alignof__(struct kcopyd_job),
+				       0, NULL, NULL);
+	if (!_job_cache)
+		return -ENOMEM;
+
+	_job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
+				   mempool_free_slab, _job_cache);
+	if (!_job_pool) {
+		kmem_cache_destroy(_job_cache);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void jobs_exit(void)
+{
+	BUG_ON(!list_empty(&amp;_complete_jobs));
+	BUG_ON(!list_empty(&amp;_io_jobs));
+	BUG_ON(!list_empty(&amp;_pages_jobs));
+
+	mempool_destroy(_job_pool);
+	kmem_cache_destroy(_job_cache);
+}
+
+/*
+ * Functions to push a job onto the tail of a given job list and
+ * to pop one off its head.
+ */
+static inline struct kcopyd_job *pop(struct list_head *jobs)
+{
+	struct kcopyd_job *job = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&amp;_job_lock, flags);
+
+	if (!list_empty(jobs)) {
+		job = list_entry(jobs-&gt;next, struct kcopyd_job, list);
+		list_del(&amp;job-&gt;list);
+	}
+	spin_unlock_irqrestore(&amp;_job_lock, flags);
+
+	return job;
+}
+
+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&amp;_job_lock, flags);
+	list_add_tail(&amp;job-&gt;list, jobs);
+	spin_unlock_irqrestore(&amp;_job_lock, flags);
+}
+
+/*
+ * These three functions process 1 item from the corresponding
+ * job list.
+ *
+ * They return:
+ * &lt; 0: error
+ *   0: success
+ * &gt; 0: can't process yet.
+ */
+static int run_complete_job(struct kcopyd_job *job)
+{
+	void *context = job-&gt;context;
+	int read_err = job-&gt;read_err;
+	unsigned int write_err = job-&gt;write_err;
+	kcopyd_notify_fn fn = job-&gt;fn;
+
+	kcopyd_put_pages(job-&gt;kc, &amp;job-&gt;pages);
+	mempool_free(job, _job_pool);
+	fn(read_err, write_err, context);
+	return 0;
+}
+
+static void complete_io(unsigned int error, void *context)
+{
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	if (error) {
+		if (job-&gt;rw == WRITE)
+			job-&gt;write_err |= error;
+		else
+			job-&gt;read_err = 1;
+
+		if (!test_bit(KCOPYD_IGNORE_ERROR, &amp;job-&gt;flags)) {
+			push(&amp;_complete_jobs, job);
+			dm_daemon_wake(&amp;_kcopyd);
+			return;
+		}
+	}
+
+	if (job-&gt;rw == WRITE)
+		push(&amp;_complete_jobs, job);
+
+	else {
+		job-&gt;rw = WRITE;
+		push(&amp;_io_jobs, job);
+	}
+
+	dm_daemon_wake(&amp;_kcopyd);
+}
+
+/*
+ * Request io on as many buffer heads as we can currently get for
+ * a particular job.
+ */
+static int run_io_job(struct kcopyd_job *job)
+{
+	int r;
+
+	if (job-&gt;rw == READ)
+		r = dm_io_async(1, &amp;job-&gt;source, job-&gt;rw,
+				list_entry(job-&gt;pages.next, struct page, list),
+				job-&gt;offset, complete_io, job);
+
+	else
+		r = dm_io_async(job-&gt;num_dests, job-&gt;dests, job-&gt;rw,
+				list_entry(job-&gt;pages.next, struct page, list),
+				job-&gt;offset, complete_io, job);
+
+	return r;
+}
+
+static int run_pages_job(struct kcopyd_job *job)
+{
+	int r;
+
+	job-&gt;nr_pages = dm_div_up(job-&gt;dests[0].count + job-&gt;offset,
+				  SECTORS_PER_PAGE);
+	r = kcopyd_get_pages(job-&gt;kc, job-&gt;nr_pages, &amp;job-&gt;pages);
+	if (!r) {
+		/* this job is ready for io */
+		push(&amp;_io_jobs, job);
+		return 0;
+	}
+
+	if (r == -ENOMEM)
+		/* can't complete now */
+		return 1;
+
+	return r;
+}
+
+/*
+ * Run through a list for as long as possible.  Returns the count
+ * of successful jobs.
+ */
+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
+{
+	struct kcopyd_job *job;
+	int r, count = 0;
+
+	while ((job = pop(jobs))) {
+
+		r = fn(job);
+
+		if (r &lt; 0) {
+			/* error this rogue job */
+			if (job-&gt;rw == WRITE)
+				job-&gt;write_err = (unsigned int) -1;
+			else
+				job-&gt;read_err = 1;
+			push(&amp;_complete_jobs, job);
+			break;
+		}
+
+		if (r &gt; 0) {
+			/*
+			 * We couldn't service this job ATM, so
+			 * push this job back onto the list.
+			 */
+			push(jobs, job);
+			break;
+		}
+
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * kcopyd does this every time it's woken up.
+ */
+static void do_work(void)
+{
+	/*
+	 * The order that these are called is *very* important.
+	 * complete jobs can free some pages for pages jobs.
+	 * Pages jobs when successful will jump onto the io jobs
+	 * list.  io jobs call wake when they complete and it all
+	 * starts again.
+	 */
+	process_jobs(&amp;_complete_jobs, run_complete_job);
+	process_jobs(&amp;_pages_jobs, run_pages_job);
+	process_jobs(&amp;_io_jobs, run_io_job);
+	run_task_queue(&amp;tq_disk);
+}
+
+/*
+ * If we are copying a small region we just dispatch a single job
+ * to do the copy, otherwise the io has to be split up into many
+ * jobs.
+ */
+static void dispatch_job(struct kcopyd_job *job)
+{
+	push(&amp;_pages_jobs, job);
+	dm_daemon_wake(&amp;_kcopyd);
+}
+
+static void segment_complete(int read_err,
+			     unsigned int write_err, void *context)
+{
+	/* FIXME: tidy this function */
+	sector_t progress = 0;
+	sector_t count = 0;
+	struct kcopyd_job *job = (struct kcopyd_job *) context;
+
+	down(&amp;job-&gt;lock);
+
+	/* update the error */
+	if (read_err)
+		job-&gt;read_err = 1;
+
+	if (write_err)
+		job-&gt;write_err |= write_err;
+
+	/*
+	 * Only dispatch more work if there hasn't been an error.
+	 */
+	if ((!job-&gt;read_err &amp;&amp; !job-&gt;write_err) ||
+	    test_bit(KCOPYD_IGNORE_ERROR, &amp;job-&gt;flags)) {
+		/* get the next chunk of work */
+		progress = job-&gt;progress;
+		count = job-&gt;source.count - progress;
+		if (count) {
+			if (count &gt; SUB_JOB_SIZE)
+				count = SUB_JOB_SIZE;
+
+			job-&gt;progress += count;
+		}
+	}
+	up(&amp;job-&gt;lock);
+
+	if (count) {
+		int i;
+		struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
+
+		memcpy(sub_job, job, sizeof(*job));
+		sub_job-&gt;source.sector += progress;
+		sub_job-&gt;source.count = count;
+
+		for (i = 0; i &lt; job-&gt;num_dests; i++) {
+			sub_job-&gt;dests[i].sector += progress;
+			sub_job-&gt;dests[i].count = count;
+		}
+
+		sub_job-&gt;fn = segment_complete;
+		sub_job-&gt;context = job;
+		dispatch_job(sub_job);
+
+	} else if (atomic_dec_and_test(&amp;job-&gt;sub_jobs)) {
+
+		/*
+		 * To avoid a race we must keep the job around
+		 * until after the notify function has completed.
+		 * Otherwise the client may try and stop the job
+		 * after we've completed.
+		 */
+		job-&gt;fn(read_err, write_err, job-&gt;context);
+		mempool_free(job, _job_pool);
+	}
+}
+
+/*
+ * Split the job into a number of smaller sub-jobs that between
+ * them perform the whole copy.
+ */
+static void split_job(struct kcopyd_job *job)
+{
+	int nr;
+
+	nr = dm_div_up(job-&gt;source.count, SUB_JOB_SIZE);
+	if (nr &gt; job-&gt;kc-&gt;max_split)
+		nr = job-&gt;kc-&gt;max_split;
+
+	atomic_set(&amp;job-&gt;sub_jobs, nr);
+	while (nr--)
+		segment_complete(0, 0u, job);
+}
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context)
+{
+	struct kcopyd_job *job;
+
+	/*
+	 * Allocate a new job.
+	 */
+	job = mempool_alloc(_job_pool, GFP_NOIO);
+
+	/*
+	 * set up for the read.
+	 */
+	job-&gt;kc = kc;
+	job-&gt;flags = flags;
+	job-&gt;read_err = 0;
+	job-&gt;write_err = 0;
+	job-&gt;rw = READ;
+
+	memcpy(&amp;job-&gt;source, from, sizeof(*from));
+
+	job-&gt;num_dests = num_dests;
+	memcpy(&amp;job-&gt;dests, dests, sizeof(*dests) * num_dests);
+
+	job-&gt;offset = 0;
+	job-&gt;nr_pages = 0;
+	INIT_LIST_HEAD(&amp;job-&gt;pages);
+
+	job-&gt;fn = fn;
+	job-&gt;context = context;
+
+	if (job-&gt;source.count &lt; SUB_JOB_SIZE)
+		dispatch_job(job);
+
+	else {
+		init_MUTEX(&amp;job-&gt;lock);
+		job-&gt;progress = 0;
+		split_job(job);
+	}
+
+	return 0;
+}
+
+/*
+ * Cancels a kcopyd job, eg. someone might be deactivating a
+ * mirror.
+ */
+int kcopyd_cancel(struct kcopyd_job *job, int block)
+{
+	/* FIXME: finish */
+	return -1;
+}
+
+/*-----------------------------------------------------------------
+ * Unit setup
+ *---------------------------------------------------------------*/
+static DECLARE_MUTEX(_client_lock);
+static LIST_HEAD(_clients);
+
+static int client_add(struct kcopyd_client *kc)
+{
+	down(&amp;_client_lock);
+	list_add(&amp;kc-&gt;list, &amp;_clients);
+	up(&amp;_client_lock);
+	return 0;
+}
+
+static void client_del(struct kcopyd_client *kc)
+{
+	down(&amp;_client_lock);
+	list_del(&amp;kc-&gt;list);
+	up(&amp;_client_lock);
+}
+
+int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
+{
+	int r = 0;
+	struct kcopyd_client *kc;
+
+	if (nr_pages * SECTORS_PER_PAGE &lt; SUB_JOB_SIZE) {
+		DMERR("kcopyd client requested %u pages: minimum is %lu",
+		      nr_pages, SUB_JOB_SIZE / SECTORS_PER_PAGE);
+		return -ENOMEM;
+	}
+
+	kc = kmalloc(sizeof(*kc), GFP_KERNEL);
+	if (!kc)
+		return -ENOMEM;
+
+	kc-&gt;lock = SPIN_LOCK_UNLOCKED;
+	INIT_LIST_HEAD(&amp;kc-&gt;pages);
+	kc-&gt;nr_pages = kc-&gt;nr_free_pages = 0;
+	r = client_alloc_pages(kc, nr_pages);
+	if (r) {
+		kfree(kc);
+		return r;
+	}
+
+	r = dm_io_get(nr_pages);
+	if (r) {
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	r = client_add(kc);
+	if (r) {
+		dm_io_put(nr_pages);
+		client_free_pages(kc);
+		kfree(kc);
+		return r;
+	}
+
+	*result = kc;
+	return 0;
+}
+
+void kcopyd_client_destroy(struct kcopyd_client *kc)
+{
+	dm_io_put(kc-&gt;nr_pages);
+	client_free_pages(kc);
+	client_del(kc);
+	kfree(kc);
+}
+
+
+int __init kcopyd_init(void)
+{
+	int r;
+
+	r = jobs_init();
+	if (r)
+		return r;
+
+	r = dm_daemon_start(&amp;_kcopyd, "kcopyd", do_work);
+	if (r)
+		jobs_exit();
+
+	return r;
+}
+
+void kcopyd_exit(void)
+{
+	jobs_exit();
+	dm_daemon_stop(&amp;_kcopyd);
+}
+
+EXPORT_SYMBOL(kcopyd_client_create);
+EXPORT_SYMBOL(kcopyd_client_destroy);
+EXPORT_SYMBOL(kcopyd_copy);
+EXPORT_SYMBOL(kcopyd_cancel);
--- linux-2.4.26-rc1/drivers/md/kcopyd.h	Thu Jan  1 01:00:00 1970
+++ linux/drivers/md/kcopyd.h	Wed Mar 31 13:48:32 2004
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2001 Sistina Software
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_KCOPYD_H
+#define DM_KCOPYD_H
+
+/*
+ * Needed for the definition of sector_t.
+ */
+#include &lt;linux/device-mapper.h&gt;
+#include &lt;linux/iobuf.h&gt;
+
+#include "dm-io.h"
+
+int kcopyd_init(void);
+void kcopyd_exit(void);
+
+/* FIXME: make this configurable */
+#define KCOPYD_MAX_REGIONS 8
+
+#define KCOPYD_IGNORE_ERROR 1
+
+/*
+ * To use kcopyd you must first create a kcopyd client object.
+ */
+struct kcopyd_client;
+int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
+void kcopyd_client_destroy(struct kcopyd_client *kc);
+
+/*
+ * Submit a copy job to kcopyd.  The notify function is called once
+ * the whole copy completes (or fails).
+ *
+ * read_err is a boolean,
+ * write_err is a bitset, with 1 bit for each destination region
+ */
+typedef void (*kcopyd_notify_fn)(int read_err,
+				 unsigned int write_err, void *context);
+
+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
+		unsigned int num_dests, struct io_region *dests,
+		unsigned int flags, kcopyd_notify_fn fn, void *context);
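+
+/*
+ * Minimal usage sketch (illustrative only; error handling omitted and
+ * the io_region layout of dev/sector/count assumed from dm-io.h):
+ *
+ *	struct kcopyd_client *kc;
+ *	struct io_region from, to;
+ *
+ *	kcopyd_client_create(64, &amp;kc);
+ *	from.dev = src_dev;  from.sector = 0;  from.count = nr_sectors;
+ *	to = from;  to.dev = dest_dev;
+ *	kcopyd_copy(kc, &amp;from, 1, &amp;to, 0, copy_complete, context);
+ *	...
+ *	kcopyd_client_destroy(kc);
+ *
+ * where copy_complete() is the caller's kcopyd_notify_fn.
+ */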
+
+#endif
--- linux-2.4.26-rc1/include/linux/device-mapper.h	Thu Jan  1 01:00:00 1970
+++ linux/include/linux/device-mapper.h	Wed Mar 31 14:05:02 2004
@@ -0,0 +1,105 @@
+/*
+ * Copyright (C) 2001 Sistina Software (UK) Limited.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef _LINUX_DEVICE_MAPPER_H
+#define _LINUX_DEVICE_MAPPER_H
+
+typedef unsigned long sector_t;
+
+struct dm_target;
+struct dm_table;
+struct dm_dev;
+
+typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
+
+union map_info {
+	void *ptr;
+	unsigned long long ll;
+};
+
+/*
+ * In the constructor the target parameter will already have the
+ * table, type, begin and len fields filled in.
+ */
+typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
+			  char **argv);
+
+/*
+ * The destructor doesn't need to free the dm_target, just
+ * anything hidden in ti-&gt;private.
+ */
+typedef void (*dm_dtr_fn) (struct dm_target * ti);
+
+/*
+ * The map function must return:
+ * &lt; 0: error
+ * = 0: The target will handle the io by resubmitting it later
+ * &gt; 0: simple remap complete
+ */
+typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
+			  int rw, union map_info *map_context);
+
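+/*
+ * A simple remapping target's map function, for instance, just
+ * rewrites the buffer head and returns 1 (names illustrative):
+ *
+ *	static int example_map(struct dm_target *ti, struct buffer_head *bh,
+ *			       int rw, union map_info *map_context)
+ *	{
+ *		struct example_c *ec = ti-&gt;private;
+ *
+ *		bh-&gt;b_rdev = ec-&gt;dev-&gt;dev;
+ *		bh-&gt;b_rsector = ec-&gt;start + (bh-&gt;b_rsector - ti-&gt;begin);
+ *		return 1;
+ *	}
+ */
+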
+/*
+ * Returns:
+ * &lt; 0 : error (currently ignored)
+ * 0   : ended successfully
+ * 1   : for some reason the io has still not completed (eg,
+ *       multipath target might want to requeue a failed io).
+ */
+typedef int (*dm_endio_fn) (struct dm_target * ti,
+			    struct buffer_head * bh, int rw, int error,
+			    union map_info *map_context);
+typedef void (*dm_suspend_fn) (struct dm_target *ti);
+typedef void (*dm_resume_fn) (struct dm_target *ti);
+typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
+			     char *result, unsigned int maxlen);
+
+void dm_error(const char *message);
+
+/*
+ * Constructors should call these functions to ensure destination devices
+ * are opened/closed correctly.
+ * FIXME: too many arguments.
+ */
+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
+		  sector_t len, int mode, struct dm_dev **result);
+void dm_put_device(struct dm_target *ti, struct dm_dev *d);
+
+/*
+ * Information about a target type
+ */
+struct target_type {
+	const char *name;
+	struct module *module;
+	unsigned version[3];
+	dm_ctr_fn ctr;
+	dm_dtr_fn dtr;
+	dm_map_fn map;
+	dm_endio_fn end_io;
+	dm_suspend_fn suspend;
+	dm_resume_fn resume;
+	dm_status_fn status;
+};
+
+struct dm_target {
+	struct dm_table *table;
+	struct target_type *type;
+
+	/* target limits */
+	sector_t begin;
+	sector_t len;
+
+	/* target specific data */
+	void *private;
+
+	/* Used to provide an error string from the ctr */
+	char *error;
+};
+
+int dm_register_target(struct target_type *t);
+int dm_unregister_target(struct target_type *t);
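+
+/*
+ * A target module typically fills in a static target_type and
+ * registers it from its module init (names and version illustrative):
+ *
+ *	static struct target_type example_target = {
+ *		.name    = "example",
+ *		.module  = THIS_MODULE,
+ *		.version = {1, 0, 0},
+ *		.ctr     = example_ctr,
+ *		.dtr     = example_dtr,
+ *		.map     = example_map,
+ *	};
+ *
+ * then calls dm_register_target(&amp;example_target) on load and
+ * dm_unregister_target(&amp;example_target) on unload.
+ */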
+
+#endif				/* _LINUX_DEVICE_MAPPER_H */
--- linux-2.4.26-rc1/include/linux/dm-ioctl.h	Thu Jan  1 01:00:00 1970
+++ linux/include/linux/dm-ioctl.h	Wed Mar 31 14:06:21 2004
@@ -0,0 +1,252 @@
+/*
+ * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef _LINUX_DM_IOCTL_H
+#define _LINUX_DM_IOCTL_H
+
+#include &lt;linux/types.h&gt;
+
+#define DM_DIR "mapper"		/* Slashes not supported */
+#define DM_MAX_TYPE_NAME 16
+#define DM_NAME_LEN 128
+#define DM_UUID_LEN 129
+
+/*
+ * A traditional ioctl interface for the device mapper.
+ *
+ * Each device can have two tables associated with it, an
+ * 'active' table which is the one currently used by io passing
+ * through the device, and an 'inactive' one which is a table
+ * that is being prepared as a replacement for the 'active' one.
+ *
+ * DM_VERSION:
+ * Just get the version information for the ioctl interface.
+ *
+ * DM_REMOVE_ALL:
+ * Remove all dm devices, destroy all tables.  Only really used
+ * for debug.
+ *
+ * DM_LIST_DEVICES:
+ * Get a list of all the dm device names.
+ *
+ * DM_DEV_CREATE:
+ * Create a new device; neither the 'active' nor the 'inactive' table
+ * slot will be filled.  The device will be in suspended state
+ * after creation; however, any io to the device will get errored
+ * since it will be out-of-bounds.
+ *
+ * DM_DEV_REMOVE:
+ * Remove a device, destroy any tables.
+ *
+ * DM_DEV_RENAME:
+ * Rename a device.
+ *
+ * DM_SUSPEND:
+ * This performs both suspend and resume, depending which flag is
+ * passed in.
+ * Suspend: This command will not return until all pending io to
+ * the device has completed.  Further io will be deferred until
+ * the device is resumed.
+ * Resume: It is no longer an error to issue this command on an
+ * unsuspended device.  If a table is present in the 'inactive'
+ * slot, it will be moved to the active slot, then the old table
+ * from the active slot will be _destroyed_.  Finally the device
+ * is resumed.
+ *
+ * DM_DEV_STATUS:
+ * Retrieves the status for the table in the 'active' slot.
+ *
+ * DM_DEV_WAIT:
+ * Wait for a significant event to occur to the device.  This
+ * could either be caused by an event triggered by one of the
+ * targets of the table in the 'active' slot, or a table change.
+ *
+ * DM_TABLE_LOAD:
+ * Load a table into the 'inactive' slot for the device.  The
+ * device does _not_ need to be suspended prior to this command.
+ *
+ * DM_TABLE_CLEAR:
+ * Destroy any table in the 'inactive' slot (ie. abort).
+ *
+ * DM_TABLE_DEPS:
+ * Return a set of device dependencies for the 'active' table.
+ *
+ * DM_TABLE_STATUS:
+ * Return the targets status for the 'active' table.
+ */
+
+/*
+ * All ioctl arguments consist of a single chunk of memory, with
+ * this structure at the start.  If a uuid is specified any
+ * lookup (eg. for a DM_INFO) will be done on that, *not* the
+ * name.
+ */
+struct dm_ioctl {
+	/*
+	 * The version number is made up of three parts:
+	 * major - no backward or forward compatibility,
+	 * minor - only backwards compatible,
+	 * patch - both backwards and forwards compatible.
+	 *
+	 * All clients of the ioctl interface should fill in the
+	 * version number of the interface that they were
+	 * compiled with.
+	 *
+	 * All recognised ioctl commands (ie. those that don't
+	 * return -ENOTTY) fill out this field, even if the
+	 * command failed.
+	 */
+	uint32_t version[3];	/* in/out */
+	uint32_t data_size;	/* total size of data passed in
+				 * including this struct */
+
+	uint32_t data_start;	/* offset to start of data
+				 * relative to start of this struct */
+
+	uint32_t target_count;	/* in/out */
+	int32_t open_count;	/* out */
+	uint32_t flags;		/* in/out */
+	uint32_t event_nr;      /* in/out */
+	uint32_t padding;
+
+	uint64_t dev;		/* in/out */
+
+	char name[DM_NAME_LEN];	/* device name */
+	char uuid[DM_UUID_LEN];	/* unique identifier for
+				 * the block device */
+};
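+
+/*
+ * Callers are expected to fill in the version from the DM_VERSION_*
+ * macros below before issuing any command; with 'dmi' a struct
+ * dm_ioctl this is simply:
+ *
+ *	dmi.version[0] = DM_VERSION_MAJOR;
+ *	dmi.version[1] = DM_VERSION_MINOR;
+ *	dmi.version[2] = DM_VERSION_PATCHLEVEL;
+ *
+ * so the kernel can check interface compatibility.
+ */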
+
+/*
+ * Used to specify tables.  These structures appear after the
+ * dm_ioctl.
+ */
+struct dm_target_spec {
+	uint64_t sector_start;
+	uint64_t length;
+	int32_t status;		/* used when reading from kernel only */
+
+	/*
+	 * Offset in bytes (from the start of this struct) to
+	 * next target_spec.
+	 */
+	uint32_t next;
+
+	char target_type[DM_MAX_TYPE_NAME];
+
+	/*
+	 * Parameter string starts immediately after this object.
+	 * Be careful to add padding after string to ensure correct
+	 * alignment of subsequent dm_target_spec.
+	 */
+};
+
+/*
+ * Used to retrieve the target dependencies.
+ */
+struct dm_target_deps {
+	uint32_t count;		/* Array size */
+	uint32_t padding;	/* unused */
+	uint64_t dev[0];	/* out */
+};
+
+/*
+ * Used to get a list of all dm devices.
+ */
+struct dm_name_list {
+	uint64_t dev;
+	uint32_t next;		/* offset to the next record from
+				   the _start_ of this */
+	char name[0];
+};
+
+/*
+ * Used to retrieve the target versions
+ */
+struct dm_target_versions {
+	uint32_t next;
+	uint32_t version[3];
+
+	char name[0];
+};
+
+/*
+ * If you change this make sure you make the corresponding change
+ * to dm-ioctl.c:lookup_ioctl()
+ */
+enum {
+	/* Top level cmds */
+	DM_VERSION_CMD = 0,
+	DM_REMOVE_ALL_CMD,
+	DM_LIST_DEVICES_CMD,
+
+	/* device level cmds */
+	DM_DEV_CREATE_CMD,
+	DM_DEV_REMOVE_CMD,
+	DM_DEV_RENAME_CMD,
+	DM_DEV_SUSPEND_CMD,
+	DM_DEV_STATUS_CMD,
+	DM_DEV_WAIT_CMD,
+
+	/* Table level cmds */
+	DM_TABLE_LOAD_CMD,
+	DM_TABLE_CLEAR_CMD,
+	DM_TABLE_DEPS_CMD,
+	DM_TABLE_STATUS_CMD,
+
+	/* Added later */
+	DM_LIST_VERSIONS_CMD,
+};
+
+#define DM_IOCTL 0xfd
+
+#define DM_VERSION       _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
+#define DM_REMOVE_ALL    _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
+#define DM_LIST_DEVICES  _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
+
+#define DM_DEV_CREATE    _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
+#define DM_DEV_REMOVE    _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
+#define DM_DEV_RENAME    _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
+#define DM_DEV_SUSPEND   _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
+#define DM_DEV_STATUS    _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
+#define DM_DEV_WAIT      _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
+
+#define DM_TABLE_LOAD    _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
+#define DM_TABLE_CLEAR   _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
+#define DM_TABLE_DEPS    _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
+#define DM_TABLE_STATUS  _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
+
+#define DM_LIST_VERSIONS _IOWR(DM_IOCTL, DM_LIST_VERSIONS_CMD, struct dm_ioctl)
+
+#define DM_VERSION_MAJOR	4
+#define DM_VERSION_MINOR	1
+#define DM_VERSION_PATCHLEVEL	1
+#define DM_VERSION_EXTRA	"-ioctl (2004-04-07)"
+
+/* Status bits */
+#define DM_READONLY_FLAG	(1 &lt;&lt; 0) /* In/Out */
+#define DM_SUSPEND_FLAG		(1 &lt;&lt; 1) /* In/Out */
+#define DM_PERSISTENT_DEV_FLAG	(1 &lt;&lt; 3) /* In */
+
+/*
+ * Flag passed into ioctl STATUS command to get table information
+ * rather than current status.
+ */
+#define DM_STATUS_TABLE_FLAG	(1 &lt;&lt; 4) /* In */
+
+/*
+ * Flags that indicate whether a table is present in either of
+ * the two table slots that a device has.
+ */
+#define DM_ACTIVE_PRESENT_FLAG   (1 &lt;&lt; 5) /* Out */
+#define DM_INACTIVE_PRESENT_FLAG (1 &lt;&lt; 6) /* Out */
+
+/*
+ * Indicates that the buffer passed in wasn't big enough for the
+ * results.
+ */
+#define DM_BUFFER_FULL_FLAG	(1 &lt;&lt; 8) /* Out */
+
+#endif				/* _LINUX_DM_IOCTL_H */
</pre></body></html>