From: Joe Thornber <ejt@redhat.com>

dm thin metadata: introduce dm_pool_abort_metadata

Aborts the current metadata transaction.  Generally this will only be
called when bad things are happening and dm-thin is trying to roll
back to a good state for read-only mode.

Complicated by the fact that the metadata device may have failed
completely causing the abort to be unable to read the old transaction.
In this case the metadata object is placed in a 'fail' mode and
everything fails apart from destroying it.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c |  143 ++++++++++++++++++++++++++++++++----------
 drivers/md/dm-thin-metadata.h |   12 +++
 2 files changed, 121 insertions(+), 34 deletions(-)

Index: linux-3.5/drivers/md/dm-thin-metadata.c
===================================================================
--- linux-3.5.orig/drivers/md/dm-thin-metadata.c
+++ linux-3.5/drivers/md/dm-thin-metadata.c
@@ -185,6 +185,7 @@ struct dm_pool_metadata {
 	unsigned long flags;
 	sector_t data_block_size;
 	bool read_only:1;
+	bool fail_io:1;
 };
 
 struct dm_thin_device {
@@ -193,7 +194,8 @@ struct dm_thin_device {
 	dm_thin_id id;
 
 	int open_count;
-	int changed;
+	bool changed:1;
+	bool aborted_with_changes:1;
 	uint64_t mapped_blocks;
 	uint64_t transaction_id;
 	uint32_t creation_time;
@@ -731,9 +733,6 @@ static int __write_changed_details(struc
 
 static int __commit_transaction(struct dm_pool_metadata *pmd)
 {
-	/*
-	 * FIXME: Associated pool should be made read-only on failure.
-	 */
 	int r;
 	size_t metadata_len, data_len;
 	struct thin_disk_superblock *disk_super;
@@ -809,6 +808,7 @@ struct dm_pool_metadata *dm_pool_metadat
 	pmd->time = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
 	pmd->read_only = false;
+	pmd->fail_io = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
 
@@ -851,16 +851,17 @@ int dm_pool_metadata_close(struct dm_poo
 		return -EBUSY;
 	}
 
-	if (!pmd->read_only) {
+	if (!pmd->read_only && !pmd->fail_io) {
 		r = __commit_transaction(pmd);
 		if (r < 0)
 			DMWARN("%s: __commit_transaction() failed, error = %d",
 			       __func__, r);
 	}
 
-	__destroy_persistent_data_objects(pmd);
-	kfree(pmd);
+	if (!pmd->fail_io)
+		__destroy_persistent_data_objects(pmd);
 
+	kfree(pmd);
 	return 0;
 }
 
@@ -921,6 +922,7 @@ static int __open_device(struct dm_pool_
 	(*td)->id = dev;
 	(*td)->open_count = 1;
 	(*td)->changed = changed;
+	(*td)->aborted_with_changes = false;
 	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
 	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
 	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
@@ -985,7 +987,7 @@ int dm_pool_create_thin(struct dm_pool_m
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __create_thin(pmd, dev);
+	r = pmd->fail_io ? -EINVAL : __create_thin(pmd, dev);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1075,7 +1077,7 @@ int dm_pool_create_snap(struct dm_pool_m
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __create_snap(pmd, dev, origin);
+	r = pmd->fail_io ? -EINVAL : __create_snap(pmd, dev, origin);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1117,7 +1119,7 @@ int dm_pool_delete_thin_device(struct dm
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __delete_device(pmd, dev);
+	r = pmd->fail_io ? -EINVAL : __delete_device(pmd, dev);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1127,27 +1129,40 @@ int dm_pool_set_metadata_transaction_id(
 					uint64_t current_id,
 					uint64_t new_id)
 {
+	int r;
+
 	down_write(&pmd->root_lock);
-	if (pmd->trans_id != current_id) {
-		up_write(&pmd->root_lock);
-		DMERR("mismatched transaction id");
-		return -EINVAL;
+	if (pmd->fail_io)
+		r = -EINVAL;
+	else {
+		if (pmd->trans_id != current_id) {
+			DMERR("mismatched transaction id");
+			r = -EINVAL;
+		} else {
+			pmd->trans_id = new_id;
+			r = 0;
+		}
 	}
-
-	pmd->trans_id = new_id;
 	up_write(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
 					uint64_t *result)
 {
+	int r;
+
 	down_read(&pmd->root_lock);
-	*result = pmd->trans_id;
+	if (pmd->fail_io)
+		r = -EINVAL;
+	else {
+		*result = pmd->trans_id;
+		r = 0;
+	}
 	up_read(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
@@ -1214,7 +1229,7 @@ int dm_pool_reserve_metadata_snap(struct
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __reserve_metadata_snap(pmd);
+	r = pmd->fail_io ? -EINVAL : __reserve_metadata_snap(pmd);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1259,7 +1274,7 @@ int dm_pool_release_metadata_snap(struct
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __release_metadata_snap(pmd);
+	r = pmd->fail_io ? -EINVAL : __release_metadata_snap(pmd);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1289,7 +1304,7 @@ int dm_pool_get_metadata_snap(struct dm_
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = __get_metadata_snap(pmd, result);
+	r = pmd->fail_io ? -EINVAL : __get_metadata_snap(pmd, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1301,7 +1316,7 @@ int dm_pool_open_thin_device(struct dm_p
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __open_device(pmd, dev, 0, td);
+	r = pmd->fail_io ? -EINVAL : __open_device(pmd, dev, 0, td);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1337,12 +1352,22 @@ int dm_thin_find_block(struct dm_thin_de
 
 	if (can_block) {
 		down_read(&pmd->root_lock);
+		if (pmd->fail_io) {
+			up_read(&pmd->root_lock);
+			return -EINVAL;
+		}
+
 		r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value);
 		if (!r)
 			block_time = le64_to_cpu(value);
 		up_read(&pmd->root_lock);
 
 	} else if (down_read_trylock(&pmd->root_lock)) {
+		if (pmd->fail_io) {
+			up_read(&pmd->root_lock);
+			return -EINVAL;
+		}
+
 		r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value);
 		if (!r)
 			block_time = le64_to_cpu(value);
@@ -1392,7 +1417,7 @@ int dm_thin_insert_block(struct dm_thin_
 	int r;
 
 	down_write(&td->pmd->root_lock);
-	r = __insert(td, block, data_block);
+	r = td->pmd->fail_io ? -EINVAL : __insert(td, block, data_block);
 	up_write(&td->pmd->root_lock);
 
 	return r;
@@ -1419,7 +1444,7 @@ int dm_thin_remove_block(struct dm_thin_
 	int r;
 
 	down_write(&td->pmd->root_lock);
-	r = __remove(td, block);
+	r = td->pmd->fail_io ? -EINVAL : __remove(td, block);
 	up_write(&td->pmd->root_lock);
 
 	return r;
@@ -1436,12 +1461,23 @@ bool dm_thin_changed_this_transaction(st
 	return r;
 }
 
+bool dm_thin_aborted_changes(struct dm_thin_device *td)
+{
+	int r;
+
+	down_read(&td->pmd->root_lock);
+	r = td->aborted_with_changes;
+	up_read(&td->pmd->root_lock);
+
+	return r;
+}
+
 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = dm_sm_new_block(pmd->data_sm, result);
+	r = pmd->fail_io ? -EINVAL : dm_sm_new_block(pmd->data_sm, result);
 	up_write(&pmd->root_lock);
 
 	return r;
@@ -1452,6 +1488,10 @@ int dm_pool_commit_metadata(struct dm_po
 	int r;
 
 	down_write(&pmd->root_lock);
+	if (pmd->fail_io) {
+		r = -EINVAL;
+		goto out;
+	}
 
 	r = __commit_transaction(pmd);
 	if (r <= 0)
@@ -1466,12 +1506,41 @@ out:
 	return r;
 }
 
+static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
+{
+	struct dm_thin_device *td;
+
+	list_for_each_entry(td, &pmd->thin_devices, list)
+		td->aborted_with_changes = td->changed;
+}
+
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
+{
+	int r;
+
+	down_write(&pmd->root_lock);
+	if (pmd->fail_io) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	__set_abort_with_changes_flags(pmd);
+	__destroy_persistent_data_objects(pmd);
+	r = __create_persistent_data_objects(pmd, false);
+	if (r)
+		pmd->fail_io = true;
+
+out:
+	up_write(&pmd->root_lock);
+	return r;
+}
+
 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
 {
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_free(pmd->data_sm, result);
+	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_free(pmd->data_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1483,7 +1552,7 @@ int dm_pool_get_free_metadata_block_coun
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_free(pmd->metadata_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1495,7 +1564,7 @@ int dm_pool_get_metadata_dev_size(struct
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
+	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_blocks(pmd->metadata_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1515,7 +1584,7 @@ int dm_pool_get_data_dev_size(struct dm_
 	int r;
 
 	down_read(&pmd->root_lock);
-	r = dm_sm_get_nr_blocks(pmd->data_sm, result);
+	r = pmd->fail_io ? -EINVAL : dm_sm_get_nr_blocks(pmd->data_sm, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1523,13 +1592,19 @@ int dm_pool_get_data_dev_size(struct dm_
 
 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
 {
+	int r;
 	struct dm_pool_metadata *pmd = td->pmd;
 
 	down_read(&pmd->root_lock);
-	*result = td->mapped_blocks;
+	if (pmd->fail_io)
+		r = -EINVAL;
+	else {
+		*result = td->mapped_blocks;
+		r = 0;
+	}
 	up_read(&pmd->root_lock);
 
-	return 0;
+	return r;
 }
 
 static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
@@ -1555,7 +1630,7 @@ int dm_thin_get_highest_mapped_block(str
 	struct dm_pool_metadata *pmd = td->pmd;
 
 	down_read(&pmd->root_lock);
-	r = __highest_block(td, result);
+	r = pmd->fail_io ? -EINVAL : __highest_block(td, result);
 	up_read(&pmd->root_lock);
 
 	return r;
@@ -1586,7 +1661,7 @@ int dm_pool_resize_data_dev(struct dm_po
 	int r;
 
 	down_write(&pmd->root_lock);
-	r = __resize_data_dev(pmd, new_count);
+	r = pmd->fail_io ? -EINVAL : __resize_data_dev(pmd, new_count);
 	up_write(&pmd->root_lock);
 
 	return r;
Index: linux-3.5/drivers/md/dm-thin-metadata.h
===================================================================
--- linux-3.5.orig/drivers/md/dm-thin-metadata.h
+++ linux-3.5/drivers/md/dm-thin-metadata.h
@@ -80,6 +80,16 @@ int dm_pool_delete_thin_device(struct dm
 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd);
 
 /*
+ * Discards all uncommitted changes.  Rereads the superblock, rolling back
+ * to the last good transaction.  Thin devices remain open, if they had
+ * uncommitted changes dm_thin_aborted_changes() will tell you.
+ *
+ * If this call fails it's only useful to call dm_pool_metadata_close().
+ * All other methods will fail with -EINVAL.
+ */
+int dm_pool_abort_metadata(struct dm_pool_metadata *pmd);
+
+/*
  * Set/get userspace transaction id.
  */
 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
@@ -150,6 +160,8 @@ int dm_thin_remove_block(struct dm_thin_
  */
 bool dm_thin_changed_this_transaction(struct dm_thin_device *td);
 
+bool dm_thin_aborted_changes(struct dm_thin_device *td);
+
 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
 				     dm_block_t *highest_mapped);
 
