From: Mikulas Patocka <mpatocka@redhat.com>

Originally developed by Jim Ramsay. Simplified by Mikulas Patocka.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Jim Ramsay <jim_ramsay@dell.com>

FIXME Needs patch header, Kconfig help text, reformatted new-style documentation file, full code review, rename page to region / improve related var names, pull out inlined fns

---
 Documentation/device-mapper/switch.txt |   37 ++
 drivers/md/Kconfig                     |   11 
 drivers/md/Makefile                    |    1 
 drivers/md/dm-switch.c                 |  559 +++++++++++++++++++++++++++++++++
 4 files changed, 608 insertions(+)

Index: linux/Documentation/device-mapper/switch.txt
===================================================================
--- /dev/null
+++ linux/Documentation/device-mapper/switch.txt
@@ -0,0 +1,37 @@
+The dm-switch target is suitable for the Dell EqualLogic storage system.
+
+The EqualLogic storage consists of several nodes. Each host is connected
+to each node. The host may send I/O requests to any node; the node that
+receives a request forwards it to the node where the data is stored.
+
+However, there is a performance advantage in sending I/O requests
+directly to the node where the data is stored, avoiding the forwarding
+step. The dm-switch target was created to exploit this performance
+advantage.
+
+The dm-switch target splits the device into fixed-size pages. It maintains
+a page table that maps pages to storage nodes. Every request is
+forwarded to the corresponding storage node specified in the page table.
+The table may be changed with messages while the dm-switch target is
+running.
+
+
+DM table arguments:
+- number of paths
+- region size
+- number of optional arguments (must be 0 currently)
+  - optional arguments (none accepted)
+for every path
+- the underlying device
+- offset to the start of data in 512-byte sectors
+
+
+
+DM message:
+
+set_region_mappings index1:node1 index2:node2 index3:node3 ...
+- modify page table, set values at index to point to the specific node.
+  Index and node numbers are hexadecimal. You can omit the index number,
+  in which case previous index plus 1 is used.
+
+No status line
+
Index: linux/drivers/md/Kconfig
===================================================================
--- linux.orig/drivers/md/Kconfig
+++ linux/drivers/md/Kconfig
@@ -389,4 +389,15 @@ config DM_VERITY
 
 	  If unsure, say N.
 
+config DM_SWITCH
+	tristate "Switch target support (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	  This device-mapper target creates a device that supports an
+	  arbitrary mapping of fixed-size regions of I/O across a fixed
+	  set of paths. The path used for any specific region can be
+	  switched dynamically by sending the target a message.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-switch.
+
+	  If unsure, say N.
+
 endif # MD
Index: linux/drivers/md/Makefile
===================================================================
--- linux.orig/drivers/md/Makefile
+++ linux/drivers/md/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 obj-$(CONFIG_DM_RAID)	+= dm-raid.o
 obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
+obj-$(CONFIG_DM_SWITCH)		+= dm-switch.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
Index: linux/drivers/md/dm-switch.c
===================================================================
--- /dev/null
+++ linux/drivers/md/dm-switch.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
+ * Copyright (C) 2011-2012 Red Hat, Inc.
+ *
+ * This file is released under the GPL.
+ *
+ * Description:
+ *
+ *     file:    dm-switch.c
+ *     authors: Kevin_OKelley@dell.com
+ *              Jim_Ramsay@dell.com
+ *              Narendran_Ganapathy@dell.com
+ *		mpatocka@redhat.com
+ *
+ * This file implements a "switch" target which efficiently implements a
+ * mapping of IOs to underlying block devices in scenarios where there are:
+ *   (1) a large number of address regions
+ *   (2) a fixed size equal across all address regions
+ *   (3) no pattern than allows for a compact description with something like
+ *       the dm-stripe target.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+
+#define DM_MSG_PREFIX "switch"
+
+/*
+ * One path (underlying device) the switch target can redirect I/O to.
+ * A new array of these is created for each dm device; each entry holds
+ * a device reference taken with dm_get_device().
+ */
+struct switch_dev {
+	struct dm_dev *dmdev;	/* referenced underlying device */
+	sector_t start;		/* offset of the data on that device, in 512B sectors */
+};
+
+/*
+ * One machine word of the packed region table.  It holds a variable
+ * number (pte_fields) of page table entries, each a fixed number of
+ * bits (pte_bits) in size.
+ */
+typedef unsigned long pt_entries;
+
+/*
+ * Switch context header: per-target state (ti->private).
+ *
+ * The region table is a bit-packed array: each entry is pte_bits wide
+ * and pte_fields of them are packed into one pt_entries word.
+ */
+struct switch_ctx {
+	struct dm_target *ti;
+
+	unsigned nr_paths;		/* Number of paths */
+	unsigned region_size;		/* Region (page) size in 512B sectors */
+	unsigned long nr_regions;		/* Number of regions making up the device */
+	signed char region_size_bits;	/* log2 of region_size, or -1 if not a power of 2 */
+
+	/* FIXME: continue renaming "page" to "region" throughout */
+	unsigned char pte_bits;		/* Number of bits in one page table entry */
+	unsigned char pte_fields;	/* Number of entries that fit into one pt_entry */
+	signed char pte_fields_bits;	/* log2 of pte_fields, or -1 if not a power of 2 */
+	pt_entries *region_table;	/* Region table */
+
+	/*
+	 * Array of dm devices to switch between.
+	 */
+	struct switch_dev dev_list[0];
+};
+
+/*
+ * Allocate and zero-initialise the per-target context, including the
+ * trailing dev_list[] array sized for nr_paths entries.
+ *
+ * Zeroing matters: parse_path() counts paths up from sctx->nr_paths,
+ * and switch_dtr() trusts nr_paths and region_table on error paths,
+ * so they must start out as 0/NULL rather than kmalloc garbage.
+ */
+static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned nr_paths,
+					   unsigned region_size)
+{
+	struct switch_ctx *sctx;
+
+	sctx = kzalloc(sizeof(struct switch_ctx) + nr_paths * sizeof(struct switch_dev),
+		       GFP_KERNEL);
+	if (!sctx)
+		return NULL;
+
+	sctx->ti = ti;
+	sctx->region_size = region_size;
+
+	ti->private = sctx;
+
+	return sctx;
+}
+
+/*
+ * Locate one region's entry within the packed region table: compute
+ * the pt_entries word index and the bit offset inside that word.
+ */
+static void switch_get_position(struct switch_ctx *sctx, unsigned long page,
+				unsigned long *region_index, unsigned *bit)
+{
+	unsigned field;
+
+	if (sctx->pte_fields_bits < 0) {
+		/* pte_fields is not a power of two: divide the slow way. */
+		*region_index = page / sctx->pte_fields;
+		field = page % sctx->pte_fields;
+	} else {
+		*region_index = page >> sctx->pte_fields_bits;
+		field = page & (sctx->pte_fields - 1);
+	}
+
+	*bit = field * sctx->pte_bits;
+}
+
+/*
+ * Find which path to use at given offset.
+ *
+ * Called on the I/O fast path without a lock: the region table word is
+ * read with ACCESS_ONCE, so a concurrent update from switch_message is
+ * observed as a single word load.
+ */
+static unsigned switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
+{
+	unsigned long region_index;
+	unsigned bit, path_nr;
+	sector_t p;
+
+	/* Convert the sector offset into a region number. */
+	p = offset;
+	if (sctx->region_size_bits >= 0)
+		p >>= sctx->region_size_bits;
+	else
+		sector_div(p, sctx->region_size);
+
+	/* Extract the pte_bits-wide path number for that region. */
+	switch_get_position(sctx, p, &region_index, &bit);
+	path_nr = (ACCESS_ONCE(sctx->region_table[region_index]) >> bit) &
+	       ((1 << sctx->pte_bits) - 1);
+
+	/* This can only happen if the processor uses non-atomic stores. */
+	if (unlikely(path_nr >= sctx->nr_paths))
+		path_nr = 0;
+
+	return path_nr;
+}
+
+/*
+ * Store the path number for one region into the packed region table.
+ * Writers are serialised by the caller (see message_mutex in
+ * switch_message; the constructor writes before the device is live).
+ */
+static void switch_region_table_write(struct switch_ctx *sctx, unsigned long page,
+				      unsigned value)
+{
+	unsigned long region_index;
+	unsigned bit;
+	pt_entries mask, word;
+
+	switch_get_position(sctx, page, &region_index, &bit);
+
+	mask = (((pt_entries)1 << sctx->pte_bits) - 1) << bit;
+	word = sctx->region_table[region_index];
+	word = (word & ~mask) | ((pt_entries)value << bit);
+	sctx->region_table[region_index] = word;
+}
+
+/*
+ * Fill the region table with an initial round robin pattern.
+ *
+ * NOTE: for a true round robin this relies on sctx->nr_paths being
+ * final when it is called (i.e. after all paths have been parsed).
+ */
+static void initialise_switch_region_table(struct switch_ctx *sctx)
+{
+	unsigned path_nr = 0;
+	unsigned long region_nr;
+
+	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
+		switch_region_table_write(sctx, region_nr, path_nr);
+		/*
+		 * Advance to the next path, wrapping around.  The previous
+		 * code incremented region_nr here instead of path_nr, which
+		 * skipped (left uninitialised) every other region and mapped
+		 * all written regions to path 0.
+		 */
+		path_nr++;
+		if (path_nr >= sctx->nr_paths)
+			path_nr = 0;
+	}
+}
+
+/*
+ * Parse one "<dev_path> <offset>" pair from the argument set, taking a
+ * reference on the underlying device and recording its start offset.
+ * Consumes exactly two arguments from @as on success.
+ */
+static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned long long start;
+	int r;
+
+	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+			  &sctx->dev_list[sctx->nr_paths].dmdev);
+	if (r) {
+		ti->error = "Device lookup failed";
+		return r;
+	}
+
+	/*
+	 * Shift the offset argument out of the set as well.  The previous
+	 * code decremented as->argc without advancing as->argv, so every
+	 * path after the first saw the preceding offset as its device path.
+	 */
+	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
+		ti->error = "Invalid device starting offset";
+		dm_put_device(ti, sctx->dev_list[sctx->nr_paths].dmdev);
+		return -EINVAL;
+	}
+
+	sctx->dev_list[sctx->nr_paths].start = start;
+
+	sctx->nr_paths++;
+
+	return 0;
+}
+
+/*
+ * Destructor: drop every device reference and free the context.
+ * Don't free the dm_target, just the ti->private data (if any).
+ */
+static void switch_dtr(struct dm_target *ti)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned n = sctx->nr_paths;
+
+	while (n) {
+		n--;
+		dm_put_device(ti, sctx->dev_list[n].dmdev);
+	}
+
+	vfree(sctx->region_table);
+	kfree(sctx);
+}
+
+/*
+ * Constructor arguments:
+ *   <num paths> <region size> <num optional args> [<optional args>...]
+ *   [<dev_path> <offset>]+
+ *
+ * Optional args are to allow for future extension: currently this
+ * parameter must be 0.
+ */
+static int switch_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static struct dm_arg _args[] = {
+		{0, UINT_MAX, "invalid number of paths"},
+		{1, UINT_MAX, "invalid page size"},
+		{0, 0, "invalid number of optional args"},
+	};
+
+	int r;
+	struct switch_ctx *sctx;
+	struct dm_arg_set as;
+	unsigned nr_paths, region_size, nr_optional_args;
+
+	sector_t dev_size;
+
+	if (argc < 5) {
+		ti->error = "Insufficient arguments";
+		return -EINVAL;
+	}
+
+	as.argc = argc;
+	as.argv = argv;
+
+	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	/*
+	 * Three fixed arguments (num paths, region size, num optional
+	 * args) plus two per path.  The previous "+ 2" was off by one and
+	 * rejected every well-formed table.
+	 */
+	if (argc != nr_paths * 2 + 3) {
+		ti->error = "Incorrect number of path arguments";
+		return -EINVAL;
+	}
+
+	if (nr_paths > (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_dev)) {
+		ti->error = "Too many devices for system";
+		return -EINVAL;
+	}
+
+	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
+	if (r)
+		return -EINVAL;
+
+	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
+	if (!sctx) {
+		ti->error = "Cannot allocate redirection context";
+		return -ENOMEM;
+	}
+
+	/* FIXME: move these derived-parameter computations into the alloc fn? */
+	if (!(sctx->region_size & (sctx->region_size - 1)))
+		sctx->region_size_bits = __ffs(sctx->region_size);
+	else
+		sctx->region_size_bits = -1;
+
+	/* Smallest entry width that can hold a path number 0..nr_paths-1. */
+	sctx->pte_bits = 1;
+	while (sctx->pte_bits < sizeof(pt_entries) * 8 &&
+	       (pt_entries)1 << sctx->pte_bits < nr_paths)
+		sctx->pte_bits++;
+
+	sctx->pte_fields = (sizeof(pt_entries) * 8) / sctx->pte_bits;
+	if (!(sctx->pte_fields & (sctx->pte_fields - 1)))
+		sctx->pte_fields_bits = __ffs(sctx->pte_fields);
+	else
+		sctx->pte_fields_bits = -1;
+
+	/* Number of regions = target length / region size, rounded up. */
+	dev_size = ti->len;
+	if (sector_div(dev_size, sctx->region_size))
+		dev_size++;
+
+	sctx->nr_regions = dev_size;
+	if (sctx->nr_regions != dev_size || sctx->nr_regions >= ULONG_MAX) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	/* Number of pt_entries words needed to hold all regions. */
+	if (sector_div(dev_size, sctx->pte_fields))
+		dev_size++;
+
+	if (dev_size > ULONG_MAX / sizeof(pt_entries)) {
+		ti->error = "Too long page table";
+		r = -EINVAL;
+		goto error_kfree;
+	}
+
+	r = dm_set_target_max_io_len(ti, sctx->region_size);
+	if (r)
+		goto error_kfree;
+
+	sctx->region_table = vmalloc(dev_size * sizeof(pt_entries));
+	if (!sctx->region_table) {
+		ti->error = "Cannot allocate page table";
+		r = -ENOMEM;
+		goto error_kfree;
+	}
+
+	while (as.argc) {
+		r = parse_path(&as, ti);
+		if (r) {
+			switch_dtr(ti);
+			return r;
+		}
+	}
+
+	/*
+	 * Initialise the table only now that parse_path has counted the
+	 * paths: the round robin pattern needs the final nr_paths value,
+	 * so calling this before the parse loop was too early.
+	 */
+	initialise_switch_region_table(sctx);
+
+	/* For UNMAP, sending the request down any path is sufficient */
+	ti->num_discard_requests = 1;
+
+	return 0;
+
+error_kfree:
+	kfree(sctx);
+
+	return r;
+}
+
+/*
+ * Map a bio: look up the path for its starting sector and redirect the
+ * bio to that path's device, rebased to the path's start offset.
+ */
+static int switch_map(struct dm_target *ti, struct bio *bio,
+		      union map_info *map_context)
+{
+	struct switch_ctx *sctx = ti->private;
+	sector_t offset = dm_target_offset(ti, bio->bi_sector);
+	struct switch_dev *sdev = &sctx->dev_list[switch_get_path_nr(sctx, offset)];
+
+	bio->bi_bdev = sdev->dmdev->bdev;
+	bio->bi_sector = sdev->start + offset;
+
+	return DM_MAPIO_REMAPPED;
+}
+
+/*
+ * We need to parse hex numbers as quickly as possible.
+ * Message is used to load the whole table.
+ *
+ * This table-based hex parser improves performance.
+ * It improves a time to load 1000000 entries compared to the condition-based
+ * parser.
+ *		table-based parser	condition-based parser
+ * PA-RISC	0.29s			0.31s
+ * Opteron	0.0495s			0.0498s
+ */
+
+/*
+ * Maps each byte to its hex digit value (0-15 for '0'-'9', 'A'-'F',
+ * 'a'-'f'); 255 marks a non-hex-digit byte and terminates parsing.
+ */
+static const unsigned char hex_table[256] = {
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
+};
+
+/*
+ * Parse a run of hex digits starting at @string.  Stores the value in
+ * *result and leaves *end pointing at the first non-hex-digit byte.
+ * A run of zero digits yields *result == 0 with *end == string.
+ */
+static void parse_hex(const char *string, sector_t *result, const char **end)
+{
+	sector_t value = 0;
+	unsigned char digit;
+
+	for (;;) {
+		digit = hex_table[(unsigned char)*string];
+		if (digit > 15)
+			break;
+		value = (value << 4) | digit;
+		string++;
+	}
+
+	*end = string;
+	*result = value;
+}
+
+/*
+ * Process a "set_region_mappings" message.
+ *
+ * Each argument is "<region>:<path>" or ":<path>" (hex; the latter form
+ * means previous region + 1).  The DMWARNs now identify the offending
+ * argument, resolving the earlier FIXME about uninformative warnings.
+ */
+static int process_set_region_mappings(struct switch_ctx *sctx,
+			     unsigned argc, char **argv)
+{
+	unsigned i;
+	sector_t region_index = 0;
+
+	for (i = 1; i < argc; i++) {
+		sector_t device;
+		const char *string = argv[i];
+
+		if (*string == ':')
+			region_index++;
+		else {
+			parse_hex(string, &region_index, &string);
+			if (unlikely(*string != ':')) {
+				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+				return -EINVAL;
+			}
+		}
+
+		string++;
+		if (unlikely(!*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+
+		parse_hex(string, &device, &string);
+		if (unlikely(*string)) {
+			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
+			return -EINVAL;
+		}
+		if (unlikely(region_index >= sctx->nr_regions)) {
+			DMWARN("invalid set_region_mappings region: %llu >= %lu",
+			       (unsigned long long)region_index, sctx->nr_regions);
+			return -EINVAL;
+		}
+		if (unlikely(device >= sctx->nr_paths)) {
+			DMWARN("invalid set_region_mappings device: %llu >= %u",
+			       (unsigned long long)device, sctx->nr_paths);
+			return -EINVAL;
+		}
+
+		switch_region_table_write(sctx, region_index, device);
+	}
+
+	return 0;
+}
+
+/*
+ * Messages are processed one-at-a-time, serialised by message_mutex.
+ *
+ * Only set_region_mappings is supported.
+ */
+static int switch_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	static DEFINE_MUTEX(message_mutex);
+	struct switch_ctx *sctx = ti->private;
+	int r;
+
+	mutex_lock(&message_mutex);
+	if (strcasecmp(argv[0], "set_region_mappings") == 0) {
+		r = process_set_region_mappings(sctx, argc, argv);
+	} else {
+		DMWARN("Unrecognised message received.");
+		r = -EINVAL;
+	}
+	mutex_unlock(&message_mutex);
+
+	return r;
+}
+
+/*
+ * Report status.  INFO has no output ("No status line"); TABLE
+ * reconstructs the constructor line: paths, region size, zero optional
+ * args, then each device name and start offset.
+ */
+static int switch_status(struct dm_target *ti, status_type_t type,
+			 unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned sz = 0;
+	unsigned path_nr;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		result[0] = '\0';
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
+		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+			struct switch_dev *sdev = &sctx->dev_list[path_nr];
+
+			DMEMIT(" %s %llu", sdev->dmdev->name,
+			       (unsigned long long)sdev->start);
+		}
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Switch ioctl:
+ *
+ * Passthrough all ioctls to the path for sector 0
+ */
+static int switch_ioctl(struct dm_target *ti, unsigned cmd,
+			unsigned long arg)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned path_nr = switch_get_path_nr(sctx, 0);
+	struct switch_dev *sdev = &sctx->dev_list[path_nr];
+	struct block_device *bdev = sdev->dmdev->bdev;
+	fmode_t mode = sdev->dmdev->mode;
+	int r = 0;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len + sdev->start != i_size_read(bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+}
+
+/*
+ * Call @fn for every underlying device; stop and return its result on
+ * the first non-zero return.
+ */
+static int switch_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct switch_ctx *sctx = ti->private;
+	unsigned path_nr;
+	int r;	/* was used undeclared: a compile error in the original */
+
+	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
+		r = fn(ti, sctx->dev_list[path_nr].dmdev,
+		       sctx->dev_list[path_nr].start, ti->len, data);
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/* Target registration record: wires the switch callbacks into device-mapper. */
+static struct target_type switch_target = {
+	.name = "switch",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr = switch_ctr,
+	.dtr = switch_dtr,
+	.map = switch_map,
+	.message = switch_message,
+	.status = switch_status,
+	.ioctl = switch_ioctl,
+	.iterate_devices = switch_iterate_devices,
+};
+
+/* Module init: register the "switch" target with device-mapper. */
+static int __init dm_switch_init(void)
+{
+	int r = dm_register_target(&switch_target);
+
+	if (r < 0)
+		DMERR("dm_register_target() failed %d", r);
+
+	return r;
+}
+
+/* Module exit: unregister the "switch" target. */
+static void __exit dm_switch_exit(void)
+{
+	dm_unregister_target(&switch_target);
+}
+
+module_init(dm_switch_init);
+module_exit(dm_switch_exit);
+
+MODULE_DESCRIPTION(DM_NAME " fixed-size address-region-mapping throughput-oriented path selector");
+MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
+MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");
