FIXME - explain   #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * 8)
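
A plausible explanation, inferred from the metadata layout rather than stated
anywhere in this patch: the space map index for the metadata device is a single
block holding 255 entries, each entry describing 1 << 14 metadata blocks, and
each 4096-byte metadata block (THIN_METADATA_BLOCK_SIZE) spans 8 512-byte
sectors.  In other words, something like the following (the names below are
invented for illustration and do not appear in the patch):

/* Presumed derivation of the limit; illustrative names only. */
#define SECTORS_PER_METADATA_BLOCK	(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT) /* 4096 >> 9 == 8 */
#define ENTRIES_PER_INDEX_BLOCK		255		/* one block of index */
#define METADATA_BLOCKS_PER_ENTRY	(1 << 14)	/* blocks described per entry */

#define METADATA_DEV_MAX_SECTORS \
	(ENTRIES_PER_INDEX_BLOCK * METADATA_BLOCKS_PER_ENTRY * SECTORS_PER_METADATA_BLOCK)

That works out to 255 * 16384 * 8 = 33423360 sectors, i.e. the metadata device
is capped at just under 16GiB.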

---
 Documentation/device-mapper/thin-provisioning.txt |   25 -
 drivers/md/dm-thin-metadata.c                     |  303 ++++++--------
 drivers/md/dm-thin-metadata.h                     |   11 
 drivers/md/dm-thin.c                              |  474 ++++++++++++++--------
 4 files changed, 463 insertions(+), 350 deletions(-)

Index: linux-3.1-rc9/Documentation/device-mapper/thin-provisioning.txt
===================================================================
--- linux-3.1-rc9.orig/Documentation/device-mapper/thin-provisioning.txt
+++ linux-3.1-rc9/Documentation/device-mapper/thin-provisioning.txt
@@ -88,16 +88,15 @@ Using an existing pool device
 		 $data_block_size $low_water_mark"
 
 $data_block_size gives the smallest unit of disk space that can be
-allocated at a time.  As with all sizes passed to device-mapper, this
-is expressed in units of 512-byte sectors.  People primarily
-interested in thin provisioning may want to use a value such as 1024.
-People doing lots of snapshotting may want a smaller value such as
-128.  $data_block_size must be the same for the lifetime of the
+allocated at a time, expressed in units of 512-byte sectors.  People
+primarily interested in thin provisioning may want to use a value such
+as 1024.  People doing lots of snapshotting may want a smaller value
+such as 128.  $data_block_size must be the same for the lifetime of the
 metadata device.
 
-$low_water_mark is expressed in 512-byte sectors.  If free space on
-the data device drops below this level then a dm event will be
-triggered which a userspace daemon should catch allowing it to
+$low_water_mark is expressed in blocks of size $data_block_size.  If
+free space on the data device drops below this level, a dm event will
+be triggered, which a userspace daemon should catch, allowing it to
 extend the pool device.  Only one such event will be sent.
 
 FIXME - Do we get a second event after a table reload when you're
@@ -177,7 +176,7 @@ Reference
 i) Constructor
 
     thin-pool <metadata dev> <data dev> <data block size (sectors)> \
-	      <low water mark (sectors)> [<number of feature args> [<arg>]*]
+	      <low water mark (blocks)> [<number of feature args> [<arg>]*]
 
     Optional feature arguments:
     - 'skip_block_zeroing': skips the zeroing of newly-provisioned blocks.
@@ -187,16 +186,16 @@ i) Constructor
 
 ii) Status
 
-    <transaction id> <used metadata sectors>/<total metadata sectors>
-    <used data sectors>/<total data sectors> <held metadata root>
+    <transaction id> <used metadata blocks>/<total metadata blocks>
+    <used data blocks>/<total data blocks> <held metadata root>
 
 
     transaction id:
 	A 64-bit number used by userspace to help synchronise with metadata
 	from volume managers.
 
-    used data sectors / total data sectors
-	If the number of free sectors drops below the pool's low water mark a
+    used data blocks / total data blocks
+	If the number of free blocks drops below the pool's low water mark, a
 	dm event will be sent to userspace.  This event is edge-triggered and
 	it will occur only once after each resume so volume manager writers
 	should register for the event and then check the target's status.
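
A worked example of the new block-based units (device names and sizes below
are hypothetical, not from the document): with $data_block_size = 128 (64KiB
blocks), requesting an event when free space falls below 1GiB means
$low_water_mark = 1GiB / 64KiB = 16384 blocks, so a 10GiB (20971520-sector)
pool would be created with:

  dmsetup create pool \
	--table "0 20971520 thin-pool $metadata_dev $data_dev 128 16384"

The status line then reports usage in the same units, e.g. 348/163840 for the
data device above (163840 = 20971520 / 128 blocks in total).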
Index: linux-3.1-rc9/drivers/md/dm-thin-metadata.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/dm-thin-metadata.c
+++ linux-3.1-rc9/drivers/md/dm-thin-metadata.c
@@ -77,7 +77,6 @@
 #define THIN_SUPERBLOCK_MAGIC 27022010
 #define THIN_SUPERBLOCK_LOCATION 0
 #define THIN_VERSION 1
-#define THIN_METADATA_BLOCK_SIZE 4096
 #define THIN_METADATA_CACHE_SIZE 64
 #define SECTOR_TO_BLOCK_SHIFT 3
 
@@ -174,7 +173,6 @@ struct dm_pool_metadata {
 	struct rw_semaphore root_lock;
 	uint32_t time;
 	int need_commit;
-	struct dm_block *sblock;
 	dm_block_t root;
 	dm_block_t details_root;
 	struct list_head thin_devices;
@@ -200,6 +198,8 @@ struct dm_thin_device {
  * superblock validator
  *--------------------------------------------------------------*/
 
+#define SUPERBLOCK_CSUM_XOR 160774
+
 static void sb_prepare_for_write(struct dm_block_validator *v,
 				 struct dm_block *b,
 				 size_t block_size)
@@ -207,7 +207,9 @@ static void sb_prepare_for_write(struct 
 	struct thin_disk_superblock *disk_super = dm_block_data(b);
 
 	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
-	disk_super->csum = cpu_to_le32(dm_block_csum_data(&disk_super->flags, sizeof(*disk_super) - sizeof(__le32)));
+	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+						      block_size - sizeof(__le32),
+						      SUPERBLOCK_CSUM_XOR));
 }
 
 static int sb_check(struct dm_block_validator *v,
@@ -231,7 +233,9 @@ static int sb_check(struct dm_block_vali
 		return -EILSEQ;
 	}
 
-	csum_le = cpu_to_le32(dm_block_csum_data(&disk_super->flags, sizeof(*disk_super) - sizeof(__le32)));
+	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
+					     block_size - sizeof(__le32),
+					     SUPERBLOCK_CSUM_XOR));
 	if (csum_le != disk_super->csum) {
 		DMERR("sb_check failed: csum %u: wanted %u",
 		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
@@ -319,7 +323,7 @@ static void subtree_dec(void *context, v
 
 	memcpy(&root_le, value, sizeof(root_le));
 	root = le64_to_cpu(root_le);
-	if (dm_btree_destroy(info, root))
+	if (dm_btree_del(info, root))
 		DMERR("btree delete failed\n");
 }
 
@@ -361,13 +365,13 @@ static int superblock_all_zeroes(struct 
 	return dm_bm_unlock(b);
 }
 
-static struct dm_pool_metadata *alloc_pmd(struct dm_block_manager *bm,
-					  dm_block_t nr_blocks, int create)
+static int init_pmd(struct dm_pool_metadata *pmd,
+		    struct dm_block_manager *bm,
+		    dm_block_t nr_blocks, int create)
 {
 	int r;
 	struct dm_space_map *sm, *data_sm;
 	struct dm_transaction_manager *tm;
-	struct dm_pool_metadata *pmd = NULL;
 	struct dm_block *sblock;
 
 	if (create) {
@@ -375,7 +379,7 @@ static struct dm_pool_metadata *alloc_pm
 					 &sb_validator, &tm, &sm, &sblock);
 		if (r < 0) {
 			DMERR("tm_create_with_sm failed");
-			return ERR_PTR(r);
+			return r;
 		}
 
 		data_sm = dm_sm_disk_create(tm, nr_blocks);
@@ -384,18 +388,6 @@ static struct dm_pool_metadata *alloc_pm
 			r = PTR_ERR(data_sm);
 			goto bad;
 		}
-
-		r = dm_tm_pre_commit(tm);
-		if (r < 0) {
-			DMERR("couldn't pre commit");
-			goto bad_data_sm;
-		}
-
-		r = dm_tm_commit(tm, sblock);
-		if (r < 0) {
-			DMERR("couldn't commit");
-			goto bad_data_sm;
-		}
 	} else {
 		struct thin_disk_superblock *disk_super = NULL;
 		size_t space_map_root_offset =
@@ -406,7 +398,7 @@ static struct dm_pool_metadata *alloc_pm
 				       SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock);
 		if (r < 0) {
 			DMERR("tm_open_with_sm failed");
-			return ERR_PTR(r);
+			return r;
 		}
 
 		disk_super = dm_block_data(sblock);
@@ -417,14 +409,11 @@ static struct dm_pool_metadata *alloc_pm
 			r = PTR_ERR(data_sm);
 			goto bad;
 		}
-
-		dm_tm_unlock(tm, sblock);
 	}
 
-	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
-	if (!pmd) {
-		DMERR("could not allocate metadata struct");
-		r = -ENOMEM;
+	r = dm_tm_unlock(tm, sblock);
+	if (r < 0) {
+		DMERR("couldn't unlock superblock");
 		goto bad_data_sm;
 	}
 
@@ -436,11 +426,9 @@ static struct dm_pool_metadata *alloc_pm
 	if (!pmd->nb_tm) {
 		DMERR("could not create clone tm");
 		r = -ENOMEM;
-		goto bad_pmd;
+		goto bad_data_sm;
 	}
 
-	pmd->sblock = NULL;
-
 	pmd->info.tm = tm;
 	pmd->info.levels = 2;
 	pmd->info.value_type.context = pmd->data_sm;
@@ -484,17 +472,15 @@ static struct dm_pool_metadata *alloc_pm
 	pmd->details_root = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
 
-	return pmd;
+	return 0;
 
-bad_pmd:
-	kfree(pmd);
 bad_data_sm:
 	dm_sm_destroy(data_sm);
 bad:
 	dm_tm_destroy(tm);
 	dm_sm_destroy(sm);
 
-	return ERR_PTR(r);
+	return r;
 }
 
 static int __begin_transaction(struct dm_pool_metadata *pmd)
@@ -502,22 +488,23 @@ static int __begin_transaction(struct dm
 	int r;
 	u32 features;
 	struct thin_disk_superblock *disk_super;
+	struct dm_block *sblock;
 
 	/*
 	 * __maybe_commit_transaction() resets these
 	 */
-	WARN_ON(pmd->sblock);
 	WARN_ON(pmd->need_commit);
 
 	/*
-	 * superblock is unlocked via dm_tm_commit()
+	 * We re-read the superblock every time.  Shouldn't need to do this
+	 * really.
 	 */
-	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
-			     &sb_validator, &pmd->sblock);
+	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			    &sb_validator, &sblock);
 	if (r)
 		return r;
 
-	disk_super = dm_block_data(pmd->sblock);
+	disk_super = dm_block_data(sblock);
 	pmd->time = le32_to_cpu(disk_super->time);
 	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
 	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
@@ -530,24 +517,27 @@ static int __begin_transaction(struct dm
 		DMERR("could not access metadata due to "
 		      "unsupported optional features (%lx).",
 		      (unsigned long)features);
-		return -EINVAL;
+		r = -EINVAL;
+		goto out;
 	}
 
 	/*
 	 * Check for read-only metadata to skip the following RDWR checks.
 	 */
 	if (get_disk_ro(pmd->bdev->bd_disk))
-		return 0;
+		goto out;
 
 	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
 	if (features) {
 		DMERR("could not access metadata RDWR due to "
 		      "unsupported optional features (%lx).",
 		      (unsigned long)features);
-		return -EINVAL;
+		r = -EINVAL;
 	}
 
-	return 0;
+out:
+	dm_bm_unlock(sblock);
+	return r;
 }
 
 static int __write_changed_details(struct dm_pool_metadata *pmd)
@@ -587,21 +577,18 @@ static int __write_changed_details(struc
 	return 0;
 }
 
-/*
- * If there is data waiting to be committed, commit it.
- * Returns 1 if commit took place, 0 if not, or < 0 on error.
- */
-static int __maybe_commit_transaction(struct dm_pool_metadata *pmd)
+static int __commit_transaction(struct dm_pool_metadata *pmd)
 {
 	/*
 	 * FIXME: Associated pool should be made read-only on failure.
 	 */
 	int r;
-	size_t len;
+	size_t metadata_len, data_len;
 	struct thin_disk_superblock *disk_super;
+	struct dm_block *sblock;
 
 	/*
-	 * thin_disk_superblock is assumed not to exceed a 512-byte sector.
+	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
 	 */
 	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
 
@@ -612,57 +599,53 @@ static int __maybe_commit_transaction(st
 	if (!pmd->need_commit)
 		goto out;
 
+	r = dm_sm_commit(pmd->data_sm);
+	if (r < 0)
+		goto out;
+
 	r = dm_tm_pre_commit(pmd->tm);
 	if (r < 0)
 		goto out;
 
-	r = dm_sm_root_size(pmd->metadata_sm, &len);
+	r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
 	if (r < 0)
 		goto out;
 
-	disk_super = dm_block_data(pmd->sblock);
+	r = dm_sm_root_size(pmd->data_sm, &data_len);
+	if (r < 0)
+		goto out;
+
+	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			     &sb_validator, &sblock);
+	if (r)
+		goto out;
+
+	disk_super = dm_block_data(sblock);
 	disk_super->time = cpu_to_le32(pmd->time);
 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
 	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
 	disk_super->flags = cpu_to_le32(pmd->flags);
 
-	r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, len);
+	r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
+			    metadata_len);
 	if (r < 0)
-		goto out;
+		goto out_locked;
 
-	r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, len);
+	r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
+			    data_len);
 	if (r < 0)
-		goto out;
+		goto out_locked;
 
-	r = dm_tm_commit(pmd->tm, pmd->sblock);
-	if (!r) {
-		r = 1;
-		pmd->sblock = NULL;
+	r = dm_tm_commit(pmd->tm, sblock);
+	if (!r)
 		pmd->need_commit = 0;
-	}
 
 out:
 	return r;
-}
-
-int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
-{
-	int r;
 
-	down_write(&pmd->root_lock);
-
-	r = __maybe_commit_transaction(pmd);
-	if (r <= 0)
-		goto out;
-
-	/*
-	 * Open the next transaction.
-	 */
-	r = __begin_transaction(pmd);
-
-out:
-	up_write(&pmd->root_lock);
+out_locked:
+	dm_bm_unlock(sblock);
 	return r;
 }
 
@@ -675,24 +658,39 @@ struct dm_pool_metadata *dm_pool_metadat
 	sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 	struct dm_block_manager *bm;
 	int create;
+	struct dm_block *sblock;
 
+	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
+	if (!pmd) {
+		DMERR("could not allocate metadata struct");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * Max held locks:
+	 *  3 for btree insert +
+	 *  2 for btree lookup used within space map
+	 */
 	bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE,
-				     THIN_METADATA_CACHE_SIZE, 6);
+				     THIN_METADATA_CACHE_SIZE, 5);
 	if (!bm) {
 		DMERR("could not create block manager");
+		kfree(pmd);
 		return ERR_PTR(-ENOMEM);
 	}
 
 	r = superblock_all_zeroes(bm, &create);
 	if (r) {
 		dm_block_manager_destroy(bm);
+		kfree(pmd);
 		return ERR_PTR(r);
 	}
 
-	pmd = alloc_pmd(bm, 0, create);
-	if (IS_ERR(pmd)) {
+	r = init_pmd(pmd, bm, 0, create);
+	if (r) {
 		dm_block_manager_destroy(bm);
-		return pmd;
+		kfree(pmd);
+		return ERR_PTR(r);
 	}
 	pmd->bdev = bdev;
 
@@ -706,13 +705,12 @@ struct dm_pool_metadata *dm_pool_metadat
 	/*
 	 * Create.
 	 */
-	if (!pmd->sblock) {
-		r = __begin_transaction(pmd);
-		if (r < 0)
-			goto bad;
-	}
+	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			     &sb_validator, &sblock);
+	if (r)
+		goto bad;
 
-	disk_super = dm_block_data(pmd->sblock);
+	disk_super = dm_block_data(sblock);
 	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
 	disk_super->version = cpu_to_le32(THIN_VERSION);
 	disk_super->time = 0;
@@ -720,11 +718,15 @@ struct dm_pool_metadata *dm_pool_metadat
 	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 	disk_super->data_block_size = cpu_to_le32(data_block_size);
 
-	r = dm_btree_create(&pmd->info, &pmd->root);
+	r = dm_bm_unlock(sblock);
 	if (r < 0)
 		goto bad;
 
-	r = dm_btree_create(&pmd->details_info, &pmd->details_root);
+	r = dm_btree_empty(&pmd->info, &pmd->root);
+	if (r < 0)
+		goto bad;
+
+	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
 	if (r < 0) {
 		DMERR("couldn't create devices root");
 		goto bad;
@@ -770,14 +772,10 @@ int dm_pool_metadata_close(struct dm_poo
 		return -EBUSY;
 	}
 
-	if (pmd->sblock) {
-		r = __maybe_commit_transaction(pmd);
-		if (r < 0)
-			DMWARN("%s: __maybe_commit_transaction() failed, error = %d",
-			       __func__, r);
-		if (pmd->sblock)
-			dm_tm_unlock(pmd->tm, pmd->sblock);
-	}
+	r = __commit_transaction(pmd);
+	if (r < 0)
+		DMWARN("%s: __commit_transaction() failed, error = %d",
+		       __func__, r);
 
 	dm_tm_destroy(pmd->tm);
 	dm_tm_destroy(pmd->nb_tm);
@@ -805,7 +803,7 @@ static int __open_device(struct dm_pool_
 	struct disk_device_details details_le;
 
 	/*
-	 * If the device is already open, just increment its open_count.
+	 * Check the device isn't already open.
 	 */
 	list_for_each_entry(td2, &pmd->thin_devices, list)
 		if (td2->id == dev) {
@@ -823,9 +821,6 @@ static int __open_device(struct dm_pool_
 		if (r != -ENODATA || !create)
 			return r;
 
-		/*
-		 * New device.
-		 */
 		changed = 1;
 		details_le.mapped_blocks = 0;
 		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -874,7 +869,7 @@ static int __create_thin(struct dm_pool_
 	/*
 	 * Create an empty btree for the mappings.
 	 */
-	r = dm_btree_create(&pmd->bl_info, &dev_root);
+	r = dm_btree_empty(&pmd->bl_info, &dev_root);
 	if (r)
 		return r;
 
@@ -885,7 +880,7 @@ static int __create_thin(struct dm_pool_
 	__dm_bless_for_disk(&value);
 	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
 	if (r) {
-		dm_btree_destroy(&pmd->bl_info, dev_root);
+		dm_btree_del(&pmd->bl_info, dev_root);
 		return r;
 	}
 
@@ -893,7 +888,7 @@ static int __create_thin(struct dm_pool_
 	if (r) {
 		__close_device(td);
 		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
-		dm_btree_destroy(&pmd->bl_info, dev_root);
+		dm_btree_del(&pmd->bl_info, dev_root);
 		return r;
 	}
 	td->changed = 1;
@@ -938,7 +933,7 @@ static int __create_snap(struct dm_pool_
 			 dm_thin_id dev, dm_thin_id origin)
 {
 	int r;
-	dm_block_t origin_root, snap_root;
+	dm_block_t origin_root;
 	uint64_t key = origin, dev_key = dev;
 	struct dm_thin_device *td;
 	struct disk_device_details details_le;
@@ -956,18 +951,16 @@ static int __create_snap(struct dm_pool_
 		return r;
 	origin_root = le64_to_cpu(value);
 
-	/* clone the origin */
-	r = dm_btree_clone(&pmd->bl_info, origin_root, &snap_root);
-	if (r)
-		return r;
+	/* clone the origin, an inc will do */
+	dm_tm_inc(pmd->tm, origin_root);
 
 	/* insert into the main mapping tree */
-	value = cpu_to_le64(snap_root);
+	value = cpu_to_le64(origin_root);
 	__dm_bless_for_disk(&value);
 	key = dev;
 	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
 	if (r) {
-		dm_btree_destroy(&pmd->bl_info, snap_root);
+		dm_tm_dec(pmd->tm, origin_root);
 		return r;
 	}
 
@@ -1049,46 +1042,6 @@ int dm_pool_delete_thin_device(struct dm
 	return r;
 }
 
-static int __trim_thin_dev(struct dm_thin_device *td, sector_t new_size)
-{
-	struct dm_pool_metadata *pmd = td->pmd;
-	/* FIXME: convert new size to blocks */
-	uint64_t key[2] = { td->id, new_size - 1 };
-
-	td->changed = 1;
-
-	/*
-	 * We need to truncate all the extraneous mappings.
-	 *
-	 * FIXME: We have to be careful to do this atomically.
-	 * Perhaps clone the bottom layer first so we can revert?
-	 */
-	return dm_btree_delete_gt(&pmd->info, pmd->root, key, &pmd->root);
-}
-
-// FIXME Incomplete implementation. Finish or remove it before final submission.
-int dm_pool_trim_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
-			     sector_t new_size)
-{
-	int r;
-	struct dm_thin_device *td;
-
-	down_write(&pmd->root_lock);
-	r = __open_device(pmd, dev, 1, &td);
-	if (r)
-		DMERR("couldn't open virtual device");
-	else {
-		r = __trim_thin_dev(td, new_size);
-		__close_device(td);
-	}
-
-	/* FIXME: update mapped_blocks */
-
-	up_write(&pmd->root_lock);
-
-	return r;
-}
-
 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
 					uint64_t current_id,
 					uint64_t new_id)
@@ -1117,17 +1070,34 @@ int dm_pool_get_metadata_transaction_id(
 	return 0;
 }
 
+static int __get_held_metadata_root(struct dm_pool_metadata *pmd,
+				    dm_block_t *result)
+{
+	int r;
+	struct thin_disk_superblock *disk_super;
+	struct dm_block *sblock;
+
+	r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
+			     &sb_validator, &sblock);
+	if (r)
+		return r;
+
+	disk_super = dm_block_data(sblock);
+	*result = le64_to_cpu(disk_super->held_root);
+
+	return dm_bm_unlock(sblock);
+}
+
 int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd,
 				   dm_block_t *result)
 {
-	struct thin_disk_superblock *disk_super;
+	int r;
 
 	down_read(&pmd->root_lock);
-	disk_super = dm_block_data(pmd->sblock);
-	*result = le64_to_cpu(disk_super->held_root);
+	r = __get_held_metadata_root(pmd, result);
 	up_read(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
@@ -1275,6 +1245,25 @@ int dm_pool_alloc_data_block(struct dm_p
 	return r;
 }
 
+int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
+{
+	int r;
+
+	down_write(&pmd->root_lock);
+
+	r = __commit_transaction(pmd);
+	if (r <= 0)
+		goto out;
+
+	/*
+	 * Open the next transaction.
+	 */
+	r = __begin_transaction(pmd);
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
 	int r;
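
A note on the validator changes above: the superblock checksum now covers
everything after the csum field to the end of the 4096-byte block, rather than
just sizeof(*disk_super), and it is seeded with SUPERBLOCK_CSUM_XOR,
presumably so that a block which validates as a superblock cannot also
validate as some other persistent-data structure.  For reference,
dm_bm_checksum() is assumed to be a crc32c with the seed xor'd in, roughly:

/* Assumed shape of the persistent-data helper; illustration only. */
u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
{
	return crc32c(~(u32) 0, data, len) ^ init_xor;
}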
Index: linux-3.1-rc9/drivers/md/dm-thin-metadata.h
===================================================================
--- linux-3.1-rc9.orig/drivers/md/dm-thin-metadata.h
+++ linux-3.1-rc9/drivers/md/dm-thin-metadata.h
@@ -9,6 +9,9 @@
 
 #include "persistent-data/dm-block-manager.h"
 
+/* FIXME: need metadata blocksize function later... */
+#define THIN_METADATA_BLOCK_SIZE 4096
+
 /*----------------------------------------------------------------*/
 
 struct dm_pool_metadata;
@@ -64,14 +67,6 @@ int dm_pool_delete_thin_device(struct dm
 			       dm_thin_id dev);
 
 /*
- * Thin devices don't have a size, however they do keep track of the
- * highest mapped block.  This trimming function allows the user to remove
- * mappings above a certain virtual block.
- */
-int dm_pool_trim_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
-			     sector_t new_size);
-
-/*
  * Commits _all_ metadata changes: device creation, deletion, mapping
  * updates.
  */
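
With the superblock no longer pinned between transactions,
dm_pool_commit_metadata() above is the single point at which outstanding
mappings become durable, so a caller must commit before acknowledging anything
that implies persistence.  A hypothetical caller (names invented for
illustration) would look like:

/* Hypothetical caller: persist outstanding mappings, then ack the flush. */
static void commit_then_ack(struct dm_pool_metadata *pmd, struct bio *flush_bio)
{
	int r = dm_pool_commit_metadata(pmd);

	if (r < 0)
		bio_io_error(flush_bio);	/* never ack a flush that wasn't persisted */
	else
		generic_make_request(flush_bio);
}

This is exactly the pattern the FLUSH/FUA batching in dm-thin.c below
implements via the deferred_flush_bios list.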
Index: linux-3.1-rc9/drivers/md/dm-thin.c
===================================================================
--- linux-3.1-rc9.orig/drivers/md/dm-thin.c
+++ linux-3.1-rc9/drivers/md/dm-thin.c
@@ -468,15 +468,17 @@ struct pool {
 	struct dm_target *ti;	/* Only set if a pool target is bound */
 
 	struct mapped_device *pool_md;
+	struct block_device *md_dev;
 	struct dm_pool_metadata *pmd;
 
 	uint32_t sectors_per_block;
 	unsigned block_shift;
 	dm_block_t offset_mask;
-	dm_block_t low_water_mark;
+	dm_block_t low_water_blocks;
 
 	unsigned zero_new_blocks:1;
 	unsigned low_water_triggered:1;	/* A dm event has been sent */
+	unsigned no_free_space:1;	/* An ENOSPC warning has been issued */
 
 	struct bio_prison *prison;
 	struct dm_kcopyd_client *copier;
@@ -484,8 +486,11 @@ struct pool {
 	struct workqueue_struct *wq;
 	struct work_struct worker;
 
+	unsigned ref_count;
+
 	spinlock_t lock;
 	struct bio_list deferred_bios;
+	struct bio_list deferred_flush_bios;
 	struct list_head prepared_mappings;
 
 	struct bio_list retry_on_resume_list;
@@ -493,11 +498,8 @@ struct pool {
 	struct deferred_set ds;	/* FIXME: move to thin_c */
 
 	struct new_mapping *next_mapping;
-
 	mempool_t *mapping_pool;
 	mempool_t *endio_hook_pool;
-
-	atomic_t ref_count;
 };
 
 /*
@@ -510,7 +512,7 @@ struct pool_c {
 	struct dm_dev *metadata_dev;
 	struct dm_target_callbacks callbacks;
 
-	sector_t low_water_mark;
+	dm_block_t low_water_blocks;
 	unsigned zero_new_blocks:1;
 };
 
@@ -528,45 +530,59 @@ struct thin_c {
 /*----------------------------------------------------------------*/
 
 /*
- * A global list that uses a struct mapped_device as a key.
+ * A global list of pools that uses a struct mapped_device as a key.
  */
 static struct dm_thin_pool_table {
-	spinlock_t lock;
+	struct mutex mutex;
 	struct list_head pools;
 } dm_thin_pool_table;
 
 static void pool_table_init(void)
 {
-	spin_lock_init(&dm_thin_pool_table.lock);
-
+	mutex_init(&dm_thin_pool_table.mutex);
 	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 }
 
-static void pool_table_insert(struct pool *pool)
+static void __pool_table_insert(struct pool *pool)
 {
-	spin_lock(&dm_thin_pool_table.lock);
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 	list_add(&pool->list, &dm_thin_pool_table.pools);
-	spin_unlock(&dm_thin_pool_table.lock);
 }
 
-static void pool_table_remove(struct pool *pool)
+static void __pool_table_remove(struct pool *pool)
 {
-	spin_lock(&dm_thin_pool_table.lock);
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 	list_del(&pool->list);
-	spin_unlock(&dm_thin_pool_table.lock);
 }
 
-static struct pool *pool_table_lookup(struct mapped_device *md)
+static struct pool *__pool_table_lookup(struct mapped_device *md)
 {
 	struct pool *pool = NULL, *tmp;
 
-	spin_lock(&dm_thin_pool_table.lock);
-	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list)
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 		if (tmp->pool_md == md) {
 			pool = tmp;
 			break;
 		}
-	spin_unlock(&dm_thin_pool_table.lock);
+	}
+
+	return pool;
+}
+
+static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
+{
+	struct pool *pool = NULL, *tmp;
+
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+
+	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
+		if (tmp->md_dev == md_dev) {
+			pool = tmp;
+			break;
+		}
+	}
 
 	return pool;
 }
@@ -597,23 +613,27 @@ static void remap(struct thin_c *tc, str
 static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 			    dm_block_t block)
 {
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
-		int r = dm_pool_commit_metadata(tc->pool->pmd);
-		if (r) {
-			DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
-			      __func__, r);
-			bio_io_error(bio);
-			return;
-		}
-	}
+	struct pool *pool = tc->pool;
+	unsigned long flags;
 
 	remap(tc, bio, block);
-	generic_make_request(bio);
+
+	/*
+	 * Batch together any FUA/FLUSH bios we find and then issue
+	 * a single commit for them in process_deferred_bios().
+	 */
+	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
+		spin_lock_irqsave(&pool->lock, flags);
+		bio_list_add(&pool->deferred_flush_bios, bio);
+		spin_unlock_irqrestore(&pool->lock, flags);
+	} else
+		generic_make_request(bio);
 }
 
 /*
- * wake_worker() is used by thin_defer_bio and pool_preresume to continue
- * deferred IO processing after pool resume.
+ * wake_worker() is used when new work is queued and when
+ * pool_resume is ready to continue deferred IO
+ * processing.
  */
 static void wake_worker(struct pool *pool)
 {
@@ -625,7 +645,6 @@ static void wake_worker(struct pool *poo
 /*
  * Bio endio functions.
  */
-
 struct endio_hook {
 	struct thin_c *tc;
 	bio_end_io_t *saved_bi_end_io;
@@ -737,14 +756,14 @@ static void cell_defer(struct thin_c *tc
 
 	spin_lock_irqsave(&pool->lock, flags);
 	cell_release(cell, &pool->deferred_bios);
-	spin_unlock_irqrestore(&tc->pool->lock, flags);
+	spin_unlock_irqrestore(&pool->lock, flags);
 
 	wake_worker(pool);
 }
 
 /*
- * As above, but ignoring @exception (a write bio that covers
- * the block) because it has already been processed.
+ * Same as cell_defer above, except it omits one particular detainee,
+ * a write bio that covers the block and has already been processed.
  */
 static void cell_defer_except(struct thin_c *tc, struct cell *cell,
 			      struct bio *exception)
@@ -805,6 +824,7 @@ static void process_prepared_mapping(str
 	} else
 		cell_defer(tc, m->cell, m->data_block);
 
+	list_del(&m->list);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
@@ -812,14 +832,14 @@ static void process_prepared_mappings(st
 {
 	unsigned long flags;
 	struct list_head maps;
-	struct new_mapping *m;
+	struct new_mapping *m, *tmp;
 
 	INIT_LIST_HEAD(&maps);
 	spin_lock_irqsave(&pool->lock, flags);
 	list_splice_init(&pool->prepared_mappings, &maps);
 	spin_unlock_irqrestore(&pool->lock, flags);
 
-	list_for_each_entry(m, &maps, list)
+	list_for_each_entry_safe(m, tmp, &maps, list)
 		process_prepared_mapping(m);
 }
 
@@ -935,11 +955,13 @@ static void schedule_zero(struct thin_c 
 	 */
 	if (!pool->zero_new_blocks)
 		process_prepared_mapping(m);
+
 	else if (io_overwrites_block(pool, bio)) {
 		m->bio = bio;
 		save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 		dm_get_mapinfo(bio)->ptr = m;
 		remap_and_issue(tc, bio, data_block);
+
 	} else {
 		int r;
 		struct dm_io_region to;
@@ -957,21 +979,6 @@ static void schedule_zero(struct thin_c 
 	}
 }
 
-/*
- * If we have run out of space, queue bios until the device is
- * resumed, presumably after having been reloaded with more space.
- */
-static void retry_when_resumed(struct bio *bio)
-{
-	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
-	struct pool *pool = tc->pool;
-	unsigned long flags;
-
-	spin_lock_irqsave(&pool->lock, flags);
-	bio_list_add(&pool->retry_on_resume_list, bio);
-	spin_unlock_irqrestore(&pool->lock, flags);
-}
-
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
 	int r;
@@ -983,13 +990,49 @@ static int alloc_data_block(struct thin_
 	if (r)
 		return r;
 
-	if (free_blocks <= pool->low_water_mark && !pool->low_water_triggered) {
+	if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
+		DMWARN("%s: reached low water mark, sending event.",
+		       dm_device_name(pool->pool_md));
 		spin_lock_irqsave(&pool->lock, flags);
 		pool->low_water_triggered = 1;
 		spin_unlock_irqrestore(&pool->lock, flags);
 		dm_table_event(pool->ti->table);
 	}
 
+	if (!free_blocks) {
+		if (pool->no_free_space)
+			return -ENOSPC;
+		else {
+			/*
+			 * Try to commit to see if that will free up some
+			 * more space.
+			 */
+			r = dm_pool_commit_metadata(pool->pmd);
+			if (r) {
+				DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+				      __func__, r);
+				return r;
+			}
+
+			r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
+			if (r)
+				return r;
+
+			/*
+			 * If we still have no space we set a flag to avoid
+			 * doing all this checking and return -ENOSPC.
+			 */
+			if (!free_blocks) {
+				DMWARN("%s: no free space available.",
+				       dm_device_name(pool->pool_md));
+				spin_lock_irqsave(&pool->lock, flags);
+				pool->no_free_space = 1;
+				spin_unlock_irqrestore(&pool->lock, flags);
+				return -ENOSPC;
+			}
+		}
+	}
+
 	r = dm_pool_alloc_data_block(pool->pmd, result);
 	if (r)
 		return r;
@@ -997,6 +1040,21 @@ static int alloc_data_block(struct thin_
 	return 0;
 }
 
+/*
+ * If we have run out of space, queue bios until the device is
+ * resumed, presumably after having been reloaded with more space.
+ */
+static void retry_on_resume(struct bio *bio)
+{
+	struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_add(&pool->retry_on_resume_list, bio);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
 static void no_space(struct cell *cell)
 {
 	struct bio *bio;
@@ -1006,7 +1064,7 @@ static void no_space(struct cell *cell)
 	cell_release(cell, &bios);
 
 	while ((bio = bio_list_pop(&bios)))
-		retry_when_resumed(bio);
+		retry_on_resume(bio);
 }
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -1040,8 +1098,8 @@ static void process_shared_bio(struct th
 			       struct dm_thin_lookup_result *lookup_result)
 {
 	struct cell *cell;
-	struct cell_key key;
 	struct pool *pool = tc->pool;
+	struct cell_key key;
 
 	/*
 	 * If cell is already occupied, then sharing is already in the process
@@ -1133,6 +1191,10 @@ static void process_bio(struct thin_c *t
 		 * one that puts bios into a cell, and we know there were
 		 * no preceding bios.
 		 */
+		/*
+		 * TODO: this will probably have to change when discard goes
+		 * back in.
+		 */
 		cell_release_singleton(cell, bio);
 
 		if (lookup_result.shared)
@@ -1157,6 +1219,7 @@ static void process_deferred_bios(struct
 	unsigned long flags;
 	struct bio *bio;
 	struct bio_list bios;
+	int r;
 
 	bio_list_init(&bios);
 
@@ -1167,22 +1230,46 @@ static void process_deferred_bios(struct
 
 	while ((bio = bio_list_pop(&bios))) {
 		struct thin_c *tc = dm_get_mapinfo(bio)->ptr;
-
 		/*
-		 * If we've got no free new_mapping structs, and processing this bio
-		 * might require one, we pause until there are some prepared mappings to
-		 * process.
+		 * If we've got no free new_mapping structs, and processing
+		 * this bio might require one, we pause until there are some
+		 * prepared mappings to process.
 		 */
 		if (ensure_next_mapping(pool)) {
 			spin_lock_irqsave(&pool->lock, flags);
+			bio_list_add(&pool->deferred_bios, bio);
 			bio_list_merge(&pool->deferred_bios, &bios);
 			spin_unlock_irqrestore(&pool->lock, flags);
 
-			return;
+			break;
 		}
-
 		process_bio(tc, bio);
 	}
+
+	/*
+	 * If there are any deferred flush bios, we must commit
+	 * the metadata before issuing them.
+	 */
+	bio_list_init(&bios);
+	spin_lock_irqsave(&pool->lock, flags);
+	bio_list_merge(&bios, &pool->deferred_flush_bios);
+	bio_list_init(&pool->deferred_flush_bios);
+	spin_unlock_irqrestore(&pool->lock, flags);
+
+	if (bio_list_empty(&bios))
+		return;
+
+	r = dm_pool_commit_metadata(pool->pmd);
+	if (r) {
+		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
+		      __func__, r);
+		while ((bio = bio_list_pop(&bios)))
+			bio_io_error(bio);
+		return;
+	}
+
+	while ((bio = bio_list_pop(&bios)))
+		generic_make_request(bio);
 }
 
 static void do_worker(struct work_struct *ws)
@@ -1215,8 +1301,7 @@ static void thin_defer_bio(struct thin_c
 }
 
 /*
- * Non-blocking function designed to be called from the target's map
- * function.
+ * Non-blocking function called from the thin target's map function.
  */
 static int thin_bio_map(struct dm_target *ti, struct bio *bio,
 			union map_info *map_context)
@@ -1256,7 +1341,7 @@ static int thin_bio_map(struct dm_target
 			 * ensure a consistent application view
 			 * (i.e. lockfs).
 			 *
-			 * More distant ancestors are irrelevant: the
+			 * More distant ancestors are irrelevant. The
 			 * shared flag will be set in their case.
 			 */
 			thin_defer_bio(tc, bio);
@@ -1309,8 +1394,7 @@ static int bind_control_target(struct po
 	struct pool_c *pt = ti->private;
 
 	pool->ti = ti;
-	pool->low_water_mark = dm_sector_div_up(pt->low_water_mark,
-						pool->sectors_per_block);
+	pool->low_water_blocks = pt->low_water_blocks;
 	pool->zero_new_blocks = pt->zero_new_blocks;
 	dm_pool_rebind_metadata_device(pool->pmd, pt->metadata_dev->bdev);
 
@@ -1326,8 +1410,10 @@ static void unbind_control_target(struct
 /*----------------------------------------------------------------
  * Pool creation
  *--------------------------------------------------------------*/
-static void pool_destroy(struct pool *pool)
+static void __pool_destroy(struct pool *pool)
 {
+	__pool_table_remove(pool);
+
 	if (dm_pool_metadata_close(pool->pmd) < 0)
 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
 
@@ -1339,13 +1425,13 @@ static void pool_destroy(struct pool *po
 
 	if (pool->next_mapping)
 		mempool_free(pool->next_mapping, pool->mapping_pool);
-
 	mempool_destroy(pool->mapping_pool);
 	mempool_destroy(pool->endio_hook_pool);
 	kfree(pool);
 }
 
-static struct pool *pool_create(struct block_device *metadata_dev,
+static struct pool *pool_create(struct mapped_device *pool_md,
+				struct block_device *metadata_dev,
 				unsigned long block_size, char **error)
 {
 	int r;
@@ -1370,7 +1456,7 @@ static struct pool *pool_create(struct b
 	pool->sectors_per_block = block_size;
 	pool->block_shift = ffs(block_size) - 1;
 	pool->offset_mask = block_size - 1;
-	pool->low_water_mark = 0;
+	pool->low_water_blocks = 0;
 	pool->zero_new_blocks = 1;
 	pool->prison = prison_create(PRISON_CELLS);
 	if (!pool->prison) {
@@ -1401,8 +1487,10 @@ static struct pool *pool_create(struct b
 	INIT_WORK(&pool->worker, do_worker);
 	spin_lock_init(&pool->lock);
 	bio_list_init(&pool->deferred_bios);
+	bio_list_init(&pool->deferred_flush_bios);
 	INIT_LIST_HEAD(&pool->prepared_mappings);
 	pool->low_water_triggered = 0;
+	pool->no_free_space = 0;
 	bio_list_init(&pool->retry_on_resume_list);
 	ds_init(&pool->ds);
 
@@ -1422,7 +1510,10 @@ static struct pool *pool_create(struct b
 		err_p = ERR_PTR(-ENOMEM);
 		goto bad_endio_hook_pool;
 	}
-	atomic_set(&pool->ref_count, 1);
+	pool->ref_count = 1;
+	pool->pool_md = pool_md;
+	pool->md_dev = metadata_dev;
+	__pool_table_insert(pool);
 
 	return pool;
 
@@ -1443,29 +1534,38 @@ bad_pool:
 	return err_p;
 }
 
-static void pool_inc(struct pool *pool)
+static void __pool_inc(struct pool *pool)
 {
-	atomic_inc(&pool->ref_count);
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	pool->ref_count++;
 }
 
-static void pool_dec(struct pool *pool)
+static void __pool_dec(struct pool *pool)
 {
-	if (atomic_dec_and_test(&pool->ref_count))
-		pool_destroy(pool);
+	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
+	BUG_ON(!pool->ref_count);
+	if (!--pool->ref_count)
+		__pool_destroy(pool);
 }
 
-static struct pool *pool_find(struct mapped_device *pool_md,
-			      struct block_device *metadata_dev,
-			      unsigned long block_size,
-			      char **error)
+static struct pool *__pool_find(struct mapped_device *pool_md,
+				struct block_device *metadata_dev,
+				unsigned long block_size, char **error)
 {
-	struct pool *pool;
+	struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
 
-	pool = pool_table_lookup(pool_md);
-	if (pool)
-		pool_inc(pool);
-	else
-		pool = pool_create(metadata_dev, block_size, error);
+	if (pool) {
+		if (pool->pool_md != pool_md)
+			return ERR_PTR(-EBUSY);
+		__pool_inc(pool);
+
+	} else {
+		pool = __pool_table_lookup(pool_md);
+		if (pool)
+			__pool_inc(pool);
+		else
+			pool = pool_create(pool_md, metadata_dev, block_size, error);
+	}
 
 	return pool;
 }
@@ -1534,12 +1634,12 @@ static int pool_is_congested(struct dm_t
 
 /*
  * thin-pool <metadata dev> <data dev>
- *           <data block size (sectors)>
- *           <low water mark (sectors)>
- *           [<#feature args> [<arg>]*]
+ *	     <data block size (sectors)>
+ *	     <low water mark (blocks)>
+ *	     [<#feature args> [<arg>]*]
  *
  * Optional feature arguments are:
- *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
+ *	     skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
  */
 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -1550,13 +1650,16 @@ static int pool_ctr(struct dm_target *ti
 	struct dm_arg_set as;
 	struct dm_dev *data_dev;
 	unsigned long block_size;
-	dm_block_t low_water;
+	dm_block_t low_water_blocks;
 	struct dm_dev *metadata_dev;
 	sector_t metadata_dev_size;
 
+	mutex_lock(&dm_thin_pool_table.mutex);
+
 	if (argc < 4) {
 		ti->error = "Invalid argument count";
-		return -EINVAL;
+		r = -EINVAL;
+		goto out_unlock;
 	}
 	as.argc = argc;
 	as.argv = argv;
@@ -1564,7 +1667,7 @@ static int pool_ctr(struct dm_target *ti
 	r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
 	if (r) {
 		ti->error = "Error opening metadata block device";
-		return r;
+		goto out_unlock;
 	}
 
 	metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
@@ -1589,8 +1692,7 @@ static int pool_ctr(struct dm_target *ti
 		goto out;
 	}
 
-	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water) ||
-	    !low_water) {
+	if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
 		ti->error = "Invalid low water mark";
 		r = -EINVAL;
 		goto out;
@@ -1607,39 +1709,44 @@ static int pool_ctr(struct dm_target *ti
 	if (r)
 		goto out;
 
-	pool = pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
-			 block_size, &ti->error);
-	if (IS_ERR(pool)) {
-		r = PTR_ERR(pool);
-		goto out;
-	}
-
 	pt = kzalloc(sizeof(*pt), GFP_KERNEL);
 	if (!pt) {
-		pool_destroy(pool);
 		r = -ENOMEM;
 		goto out;
 	}
+
+	pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
+			   block_size, &ti->error);
+	if (IS_ERR(pool)) {
+		r = PTR_ERR(pool);
+		goto out_free_pt;
+	}
+
 	pt->pool = pool;
 	pt->ti = ti;
 	pt->metadata_dev = metadata_dev;
 	pt->data_dev = data_dev;
-	pt->low_water_mark = low_water;
+	pt->low_water_blocks = low_water_blocks;
 	pt->zero_new_blocks = pf.zero_new_blocks;
 	ti->num_flush_requests = 1;
 	ti->num_discard_requests = 0;
-	ti->discards_supported = 0;
 	ti->private = pt;
 
 	pt->callbacks.congested_fn = pool_is_congested;
 	dm_table_add_target_callbacks(ti->table, &pt->callbacks);
 
+	mutex_unlock(&dm_thin_pool_table.mutex);
+
 	return 0;
 
+out_free_pt:
+	kfree(pt);
 out:
 	dm_put_device(ti, data_dev);
 out_metadata:
 	dm_put_device(ti, metadata_dev);
+out_unlock:
+	mutex_unlock(&dm_thin_pool_table.mutex);
 
 	return r;
 }
@@ -1648,13 +1755,15 @@ static void pool_dtr(struct dm_target *t
 {
 	struct pool_c *pt = ti->private;
 
-	unbind_control_target(pt->pool, ti);
-	pool_dec(pt->pool);
+	mutex_lock(&dm_thin_pool_table.mutex);
 
+	unbind_control_target(pt->pool, ti);
+	__pool_dec(pt->pool);
 	dm_put_device(ti, pt->metadata_dev);
 	dm_put_device(ti, pt->data_dev);
-
 	kfree(pt);
+
+	mutex_unlock(&dm_thin_pool_table.mutex);
 }
 
 static void __requeue_bios(struct pool *pool)
@@ -1680,7 +1789,6 @@ static int pool_preresume(struct dm_targ
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
 	dm_block_t data_size, sb_data_size;
-	unsigned long flags;
 
 	/*
 	 * Take control of the pool object.
@@ -1716,20 +1824,21 @@ static int pool_preresume(struct dm_targ
 		}
 	}
 
-	spin_lock_irqsave(&pool->lock, flags);
+	return 0;
+}
+
+static void pool_resume(struct dm_target *ti)
+{
+	struct pool_c *pt = ti->private;
+	struct pool *pool = pt->pool;
+
+	spin_lock_irq(&pool->lock);
 	pool->low_water_triggered = 0;
+	pool->no_free_space = 0;
 	__requeue_bios(pool);
-	spin_unlock_irqrestore(&pool->lock, flags);
+	spin_unlock_irq(&pool->lock);
 
 	wake_worker(pool);
-
-	/*
-	 * The pool object is only present if the pool is active.
-	 */
-	pool->pool_md = dm_table_get_md(ti->table);
-	pool_table_insert(pool);
-
-	return 0;
 }
 
 static void pool_postsuspend(struct dm_target *ti)
@@ -1741,14 +1850,11 @@ static void pool_postsuspend(struct dm_t
 	flush_workqueue(pool->wq);
 
 	r = dm_pool_commit_metadata(pool->pmd);
-	if (r) {
+	if (r < 0) {
 		DMERR("%s: dm_pool_commit_metadata() failed, error = %d",
 		      __func__, r);
 		/* FIXME: invalidate device? error the next FUA or FLUSH bio ?*/
 	}
-
-	pool_table_remove(pool);
-	pool->pool_md = NULL;
 }
 
 static int check_arg_count(unsigned argc, unsigned args_required)
@@ -1845,34 +1951,6 @@ static int process_delete_mesg(unsigned 
 	return r;
 }
 
-static int process_trim_mesg(unsigned argc, char **argv, struct pool *pool)
-{
-	dm_thin_id dev_id;
-	sector_t new_size;
-	int r;
-
-	r = check_arg_count(argc, 3);
-	if (r)
-		return r;
-
-	r = read_dev_id(argv[1], &dev_id, 1);
-	if (r)
-		return r;
-
-	if (kstrtoull(argv[2], 10, (unsigned long long *)&new_size)) {
-		DMWARN("trim device %s: Invalid new size: %s sectors.",
-		       argv[1], argv[2]);
-		return -EINVAL;
-	}
-
-	r = dm_pool_trim_thin_device(pool->pmd, dev_id,
-			dm_sector_div_up(new_size, pool->sectors_per_block));
-	if (r)
-		DMWARN("Attempt to trim thin device %s failed.", argv[1]);
-
-	return r;
-}
-
 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
 {
 	dm_thin_id old_id, new_id;
@@ -1925,9 +2003,6 @@ static int pool_message(struct dm_target
 	else if (!strcasecmp(argv[0], "delete"))
 		r = process_delete_mesg(argc, argv, pool);
 
-	else if (!strcasecmp(argv[0], "trim"))
-		r = process_trim_mesg(argc, argv, pool);
-
 	else if (!strcasecmp(argv[0], "set_transaction_id"))
 		r = process_set_transaction_id_mesg(argc, argv, pool);
 
@@ -1994,13 +2069,11 @@ static int pool_status(struct dm_target 
 		if (r)
 			return r;
 
-		DMEMIT("%llu %llu/%llu %llu/%llu", (unsigned long long)transaction_id,
-		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata) *
-					   pool->sectors_per_block,
-		       (unsigned long long)nr_blocks_metadata * pool->sectors_per_block,
-		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data) *
-					   pool->sectors_per_block,
-		       (unsigned long long)nr_blocks_data * pool->sectors_per_block);
+		DMEMIT("%llu %llu/%llu %llu/%llu ", (unsigned long long)transaction_id,
+		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
+		       (unsigned long long)nr_blocks_metadata,
+		       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
+		       (unsigned long long)nr_blocks_data);
 
 		if (held_root)
 			DMEMIT("%llu", held_root);
@@ -2014,7 +2087,7 @@ static int pool_status(struct dm_target 
 		       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
 		       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
 		       (unsigned long)pool->sectors_per_block,
-		       (unsigned long long)pt->low_water_mark);
+		       (unsigned long long)pt->low_water_blocks);
 
 		DMEMIT("%u ", !pool->zero_new_blocks);
 
@@ -2067,6 +2140,7 @@ static struct target_type pool_target = 
 	.map = pool_map,
 	.postsuspend = pool_postsuspend,
 	.preresume = pool_preresume,
+	.resume = pool_resume,
 	.message = pool_message,
 	.status = pool_status,
 	.merge = pool_merge,
@@ -2074,16 +2148,21 @@ static struct target_type pool_target = 
 	.io_hints = pool_io_hints,
 };
 
-/*----------------------------------------------------------------*/
-
+/*----------------------------------------------------------------
+ * Thin target methods
+ *--------------------------------------------------------------*/
 static void thin_dtr(struct dm_target *ti)
 {
 	struct thin_c *tc = ti->private;
 
-	pool_dec(tc->pool);
+	mutex_lock(&dm_thin_pool_table.mutex);
+
+	__pool_dec(tc->pool);
 	dm_pool_close_thin_device(tc->td);
 	dm_put_device(ti, tc->pool_dev);
 	kfree(tc);
+
+	mutex_unlock(&dm_thin_pool_table.mutex);
 }
 
 /*
@@ -2101,15 +2180,19 @@ static int thin_ctr(struct dm_target *ti
 	struct dm_dev *pool_dev;
 	struct mapped_device *pool_md;
 
+	mutex_lock(&dm_thin_pool_table.mutex);
+
 	if (argc != 2) {
 		ti->error = "Invalid argument count";
-		return -EINVAL;
+		r = -EINVAL;
+		goto out_unlock;
 	}
 
 	tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
 	if (!tc) {
 		ti->error = "Out of memory";
-		return -ENOMEM;
+		r = -ENOMEM;
+		goto out_unlock;
 	}
 
 	r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
@@ -2132,13 +2215,13 @@ static int thin_ctr(struct dm_target *ti
 		goto bad_common;
 	}
 
-	tc->pool = pool_table_lookup(pool_md);
+	tc->pool = __pool_table_lookup(pool_md);
 	if (!tc->pool) {
 		ti->error = "Couldn't find pool object";
 		r = -EINVAL;
 		goto bad_pool_lookup;
 	}
-	pool_inc(tc->pool);
+	__pool_inc(tc->pool);
 
 	r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
 	if (r) {
@@ -2153,16 +2236,20 @@ static int thin_ctr(struct dm_target *ti
 
 	dm_put(pool_md);
 
+	mutex_unlock(&dm_thin_pool_table.mutex);
+
 	return 0;
 
 bad_thin_open:
-	pool_dec(tc->pool);
+	__pool_dec(tc->pool);
 bad_pool_lookup:
 	dm_put(pool_md);
 bad_common:
 	dm_put_device(ti, tc->pool_dev);
 bad_pool_dev:
 	kfree(tc);
+out_unlock:
+	mutex_unlock(&dm_thin_pool_table.mutex);
 
 	return r;
 }
@@ -2175,6 +2262,40 @@ static int thin_map(struct dm_target *ti
 	return thin_bio_map(ti, bio, map_context);
 }
 
+static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
+{
+	struct bio *bio;
+	struct bio_list bios;
+
+	bio_list_init(&bios);
+	bio_list_merge(&bios, master);
+	bio_list_init(master);
+
+	while ((bio = bio_list_pop(&bios))) {
+		if (dm_get_mapinfo(bio)->ptr == tc)
+			bio_endio(bio, DM_ENDIO_REQUEUE);
+		else
+			bio_list_add(master, bio);
+	}
+}
+
+static void requeue_io(struct thin_c *tc)
+{
+	struct pool *pool = tc->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->lock, flags);
+	__requeue_bio_list(tc, &pool->deferred_bios);
+	__requeue_bio_list(tc, &pool->retry_on_resume_list);
+	spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static void thin_postsuspend(struct dm_target *ti)
+{
+	if (dm_noflush_suspending(ti))
+		requeue_io((struct thin_c *)ti->private);
+}
+
 /*
  * <nr mapped sectors> <highest mapped sector>
  */
@@ -2222,9 +2343,18 @@ static int thin_status(struct dm_target 
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
+	int r;
+	dm_block_t blocks;
 	struct thin_c *tc = ti->private;
 
-	return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block, data);
+	r = dm_pool_get_data_dev_size(tc->pool->pmd, &blocks);
+	if (r)
+		return r;
+
+	if (blocks)
+		return fn(ti, tc->pool_dev, 0, tc->pool->sectors_per_block * blocks, data);
+
+	return 0;
 }
 
 static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
@@ -2242,6 +2372,7 @@ static struct target_type thin_target = 
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
 	.map = thin_map,
+	.postsuspend = thin_postsuspend,
 	.status = thin_status,
 	.iterate_devices = thin_iterate_devices,
 	.io_hints = thin_io_hints,
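
For reference, the sectors_per_block / block_shift / offset_mask trio set up
in pool_create() relies on the block size being a power of two, so remap()
reduces to a shift and a mask.  A standalone sketch of the same arithmetic
(illustrative only; remap() itself is untouched by this patch):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t sectors_per_block = 128;	/* 64KiB data blocks */
	unsigned block_shift = __builtin_ffsll(sectors_per_block) - 1;	/* ffs(128) - 1 == 7 */
	uint64_t offset_mask = sectors_per_block - 1;
	uint64_t bi_sector = 1000003;		/* arbitrary bio start sector */

	/* block 7812, offset 67: 7812 * 128 + 67 == 1000003 */
	printf("block %llu, offset %llu\n",
	       (unsigned long long)(bi_sector >> block_shift),
	       (unsigned long long)(bi_sector & offset_mask));

	return 0;
}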
