1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2010-2012 by Dell Inc.  All rights reserved.
4 * Copyright (C) 2011-2013 Red Hat, Inc.
5 *
6 * This file is released under the GPL.
7 *
8 * dm-switch is a device-mapper target that maps IO to underlying block
9 * devices efficiently when there are a large number of fixed-sized
10 * address regions but there is no simple pattern to allow for a compact
11 * mapping representation such as dm-stripe.
12 */
13
14#include <linux/device-mapper.h>
15
16#include <linux/module.h>
17#include <linux/init.h>
18#include <linux/vmalloc.h>
19
20#define DM_MSG_PREFIX "switch"
21
/*
 * Storage unit of the region table.  Every slot packs
 * <region_entries_per_slot> entries, each <region_table_entry_bits>
 * bits wide.
 */
typedef unsigned long region_table_slot_t;
27
28/*
29 * A device with the offset to its start sector.
30 */
31struct switch_path {
32	struct dm_dev *dmdev;
33	sector_t start;
34};
35
36/*
37 * Context block for a dm switch device.
38 */
39struct switch_ctx {
40	struct dm_target *ti;
41
42	unsigned int nr_paths;		/* Number of paths in path_list. */
43
44	unsigned int region_size;		/* Region size in 512-byte sectors */
45	unsigned long nr_regions;	/* Number of regions making up the device */
46	signed char region_size_bits;	/* log2 of region_size or -1 */
47
48	unsigned char region_table_entry_bits;	/* Number of bits in one region table entry */
49	unsigned char region_entries_per_slot;	/* Number of entries in one region table slot */
50	signed char region_entries_per_slot_bits;	/* log2 of region_entries_per_slot or -1 */
51
52	region_table_slot_t *region_table;	/* Region table */
53
54	/*
55	 * Array of dm devices to switch between.
56	 */
57	struct switch_path path_list[];
58};
59
60static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned int nr_paths,
61					   unsigned int region_size)
62{
63	struct switch_ctx *sctx;
64
65	sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL);
66	if (!sctx)
67		return NULL;
68
69	sctx->ti = ti;
70	sctx->region_size = region_size;
71
72	ti->private = sctx;
73
74	return sctx;
75}
76
77static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths)
78{
79	struct switch_ctx *sctx = ti->private;
80	sector_t nr_regions = ti->len;
81	sector_t nr_slots;
82
83	if (!(sctx->region_size & (sctx->region_size - 1)))
84		sctx->region_size_bits = __ffs(sctx->region_size);
85	else
86		sctx->region_size_bits = -1;
87
88	sctx->region_table_entry_bits = 1;
89	while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 &&
90	       (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths)
91		sctx->region_table_entry_bits++;
92
93	sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits;
94	if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1)))
95		sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot);
96	else
97		sctx->region_entries_per_slot_bits = -1;
98
99	if (sector_div(nr_regions, sctx->region_size))
100		nr_regions++;
101
102	if (nr_regions >= ULONG_MAX) {
103		ti->error = "Region table too large";
104		return -EINVAL;
105	}
106	sctx->nr_regions = nr_regions;
107
108	nr_slots = nr_regions;
109	if (sector_div(nr_slots, sctx->region_entries_per_slot))
110		nr_slots++;
111
112	if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) {
113		ti->error = "Region table too large";
114		return -EINVAL;
115	}
116
117	sctx->region_table = vmalloc(array_size(nr_slots,
118						sizeof(region_table_slot_t)));
119	if (!sctx->region_table) {
120		ti->error = "Cannot allocate region table";
121		return -ENOMEM;
122	}
123
124	return 0;
125}
126
127static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr,
128				unsigned long *region_index, unsigned int *bit)
129{
130	if (sctx->region_entries_per_slot_bits >= 0) {
131		*region_index = region_nr >> sctx->region_entries_per_slot_bits;
132		*bit = region_nr & (sctx->region_entries_per_slot - 1);
133	} else {
134		*region_index = region_nr / sctx->region_entries_per_slot;
135		*bit = region_nr % sctx->region_entries_per_slot;
136	}
137
138	*bit *= sctx->region_table_entry_bits;
139}
140
141static unsigned int switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr)
142{
143	unsigned long region_index;
144	unsigned int bit;
145
146	switch_get_position(sctx, region_nr, &region_index, &bit);
147
148	return (READ_ONCE(sctx->region_table[region_index]) >> bit) &
149		((1 << sctx->region_table_entry_bits) - 1);
150}
151
152/*
153 * Find which path to use at given offset.
154 */
155static unsigned int switch_get_path_nr(struct switch_ctx *sctx, sector_t offset)
156{
157	unsigned int path_nr;
158	sector_t p;
159
160	p = offset;
161	if (sctx->region_size_bits >= 0)
162		p >>= sctx->region_size_bits;
163	else
164		sector_div(p, sctx->region_size);
165
166	path_nr = switch_region_table_read(sctx, p);
167
168	/* This can only happen if the processor uses non-atomic stores. */
169	if (unlikely(path_nr >= sctx->nr_paths))
170		path_nr = 0;
171
172	return path_nr;
173}
174
175static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr,
176				      unsigned int value)
177{
178	unsigned long region_index;
179	unsigned int bit;
180	region_table_slot_t pte;
181
182	switch_get_position(sctx, region_nr, &region_index, &bit);
183
184	pte = sctx->region_table[region_index];
185	pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit);
186	pte |= (region_table_slot_t)value << bit;
187	sctx->region_table[region_index] = pte;
188}
189
190/*
191 * Fill the region table with an initial round robin pattern.
192 */
193static void initialise_region_table(struct switch_ctx *sctx)
194{
195	unsigned int path_nr = 0;
196	unsigned long region_nr;
197
198	for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) {
199		switch_region_table_write(sctx, region_nr, path_nr);
200		if (++path_nr >= sctx->nr_paths)
201			path_nr = 0;
202	}
203}
204
205static int parse_path(struct dm_arg_set *as, struct dm_target *ti)
206{
207	struct switch_ctx *sctx = ti->private;
208	unsigned long long start;
209	int r;
210
211	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
212			  &sctx->path_list[sctx->nr_paths].dmdev);
213	if (r) {
214		ti->error = "Device lookup failed";
215		return r;
216	}
217
218	if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) {
219		ti->error = "Invalid device starting offset";
220		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
221		return -EINVAL;
222	}
223
224	sctx->path_list[sctx->nr_paths].start = start;
225
226	sctx->nr_paths++;
227
228	return 0;
229}
230
231/*
232 * Destructor: Don't free the dm_target, just the ti->private data (if any).
233 */
234static void switch_dtr(struct dm_target *ti)
235{
236	struct switch_ctx *sctx = ti->private;
237
238	while (sctx->nr_paths--)
239		dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev);
240
241	vfree(sctx->region_table);
242	kfree(sctx);
243}
244
245/*
246 * Constructor arguments:
247 *   <num_paths> <region_size> <num_optional_args> [<optional_args>...]
248 *   [<dev_path> <offset>]+
249 *
250 * Optional args are to allow for future extension: currently this
251 * parameter must be 0.
252 */
253static int switch_ctr(struct dm_target *ti, unsigned int argc, char **argv)
254{
255	static const struct dm_arg _args[] = {
256		{1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"},
257		{1, UINT_MAX, "Invalid region size"},
258		{0, 0, "Invalid number of optional args"},
259	};
260
261	struct switch_ctx *sctx;
262	struct dm_arg_set as;
263	unsigned int nr_paths, region_size, nr_optional_args;
264	int r;
265
266	as.argc = argc;
267	as.argv = argv;
268
269	r = dm_read_arg(_args, &as, &nr_paths, &ti->error);
270	if (r)
271		return -EINVAL;
272
273	r = dm_read_arg(_args + 1, &as, &region_size, &ti->error);
274	if (r)
275		return r;
276
277	r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error);
278	if (r)
279		return r;
280	/* parse optional arguments here, if we add any */
281
282	if (as.argc != nr_paths * 2) {
283		ti->error = "Incorrect number of path arguments";
284		return -EINVAL;
285	}
286
287	sctx = alloc_switch_ctx(ti, nr_paths, region_size);
288	if (!sctx) {
289		ti->error = "Cannot allocate redirection context";
290		return -ENOMEM;
291	}
292
293	r = dm_set_target_max_io_len(ti, region_size);
294	if (r)
295		goto error;
296
297	while (as.argc) {
298		r = parse_path(&as, ti);
299		if (r)
300			goto error;
301	}
302
303	r = alloc_region_table(ti, nr_paths);
304	if (r)
305		goto error;
306
307	initialise_region_table(sctx);
308
309	/* For UNMAP, sending the request down any path is sufficient */
310	ti->num_discard_bios = 1;
311
312	return 0;
313
314error:
315	switch_dtr(ti);
316
317	return r;
318}
319
320static int switch_map(struct dm_target *ti, struct bio *bio)
321{
322	struct switch_ctx *sctx = ti->private;
323	sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
324	unsigned int path_nr = switch_get_path_nr(sctx, offset);
325
326	bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev);
327	bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset;
328
329	return DM_MAPIO_REMAPPED;
330}
331
/*
 * Hex numbers in messages need to be parsed as quickly as possible, so
 * digits are decoded with a lookup table.
 *
 * The table is indexed by character value: '0'-'9', 'A'-'F' and 'a'-'f'
 * give the digit value 0-15, every other character gives 255.
 *
 * The table-based parser is faster than a condition-based one.  Time to
 * load 1000000 entries:
 *
 *		table-based parser	condition-based parser
 * PA-RISC	0.29s			0.31s
 * Opteron	0.0495s			0.0498s
 */
static const unsigned char hex_table[256] = {
	/* 0x00 - 0x0f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x10 - 0x1f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x20 - 0x2f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x30 - 0x3f: '0' - '9' */
	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255,
	/* 0x40 - 0x4f: 'A' - 'F' */
	255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x50 - 0x5f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x60 - 0x6f: 'a' - 'f' */
	255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x70 - 0x7f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x80 - 0x8f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0x90 - 0x9f */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xa0 - 0xaf */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xb0 - 0xbf */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xc0 - 0xcf */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xd0 - 0xdf */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xe0 - 0xef */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
	/* 0xf0 - 0xff */
	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
};
360
361static __always_inline unsigned long parse_hex(const char **string)
362{
363	unsigned char d;
364	unsigned long r = 0;
365
366	while ((d = hex_table[(unsigned char)**string]) < 16) {
367		r = (r << 4) | d;
368		(*string)++;
369	}
370
371	return r;
372}
373
374static int process_set_region_mappings(struct switch_ctx *sctx,
375				       unsigned int argc, char **argv)
376{
377	unsigned int i;
378	unsigned long region_index = 0;
379
380	for (i = 1; i < argc; i++) {
381		unsigned long path_nr;
382		const char *string = argv[i];
383
384		if ((*string & 0xdf) == 'R') {
385			unsigned long cycle_length, num_write;
386
387			string++;
388			if (unlikely(*string == ',')) {
389				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
390				return -EINVAL;
391			}
392			cycle_length = parse_hex(&string);
393			if (unlikely(*string != ',')) {
394				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
395				return -EINVAL;
396			}
397			string++;
398			if (unlikely(!*string)) {
399				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
400				return -EINVAL;
401			}
402			num_write = parse_hex(&string);
403			if (unlikely(*string)) {
404				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
405				return -EINVAL;
406			}
407
408			if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) {
409				DMWARN("invalid set_region_mappings cycle length: %lu > %lu",
410				       cycle_length - 1, region_index);
411				return -EINVAL;
412			}
413			if (unlikely(region_index + num_write < region_index) ||
414			    unlikely(region_index + num_write >= sctx->nr_regions)) {
415				DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu",
416				       region_index, num_write, sctx->nr_regions);
417				return -EINVAL;
418			}
419
420			while (num_write--) {
421				region_index++;
422				path_nr = switch_region_table_read(sctx, region_index - cycle_length);
423				switch_region_table_write(sctx, region_index, path_nr);
424			}
425
426			continue;
427		}
428
429		if (*string == ':')
430			region_index++;
431		else {
432			region_index = parse_hex(&string);
433			if (unlikely(*string != ':')) {
434				DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
435				return -EINVAL;
436			}
437		}
438
439		string++;
440		if (unlikely(!*string)) {
441			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
442			return -EINVAL;
443		}
444
445		path_nr = parse_hex(&string);
446		if (unlikely(*string)) {
447			DMWARN("invalid set_region_mappings argument: '%s'", argv[i]);
448			return -EINVAL;
449		}
450		if (unlikely(region_index >= sctx->nr_regions)) {
451			DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions);
452			return -EINVAL;
453		}
454		if (unlikely(path_nr >= sctx->nr_paths)) {
455			DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths);
456			return -EINVAL;
457		}
458
459		switch_region_table_write(sctx, region_index, path_nr);
460	}
461
462	return 0;
463}
464
465/*
466 * Messages are processed one-at-a-time.
467 *
468 * Only set_region_mappings is supported.
469 */
470static int switch_message(struct dm_target *ti, unsigned int argc, char **argv,
471			  char *result, unsigned int maxlen)
472{
473	static DEFINE_MUTEX(message_mutex);
474
475	struct switch_ctx *sctx = ti->private;
476	int r = -EINVAL;
477
478	mutex_lock(&message_mutex);
479
480	if (!strcasecmp(argv[0], "set_region_mappings"))
481		r = process_set_region_mappings(sctx, argc, argv);
482	else
483		DMWARN("Unrecognised message received.");
484
485	mutex_unlock(&message_mutex);
486
487	return r;
488}
489
490static void switch_status(struct dm_target *ti, status_type_t type,
491			  unsigned int status_flags, char *result, unsigned int maxlen)
492{
493	struct switch_ctx *sctx = ti->private;
494	unsigned int sz = 0;
495	int path_nr;
496
497	switch (type) {
498	case STATUSTYPE_INFO:
499		result[0] = '\0';
500		break;
501
502	case STATUSTYPE_TABLE:
503		DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size);
504		for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++)
505			DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name,
506			       (unsigned long long)sctx->path_list[path_nr].start);
507		break;
508
509	case STATUSTYPE_IMA:
510		result[0] = '\0';
511		break;
512	}
513}
514
515/*
516 * Switch ioctl:
517 *
518 * Passthrough all ioctls to the path for sector 0
519 */
520static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
521{
522	struct switch_ctx *sctx = ti->private;
523	unsigned int path_nr;
524
525	path_nr = switch_get_path_nr(sctx, 0);
526
527	*bdev = sctx->path_list[path_nr].dmdev->bdev;
528
529	/*
530	 * Only pass ioctls through if the device sizes match exactly.
531	 */
532	if (ti->len + sctx->path_list[path_nr].start !=
533	    bdev_nr_sectors((*bdev)))
534		return 1;
535	return 0;
536}
537
538static int switch_iterate_devices(struct dm_target *ti,
539				  iterate_devices_callout_fn fn, void *data)
540{
541	struct switch_ctx *sctx = ti->private;
542	int path_nr;
543	int r;
544
545	for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) {
546		r = fn(ti, sctx->path_list[path_nr].dmdev,
547			 sctx->path_list[path_nr].start, ti->len, data);
548		if (r)
549			return r;
550	}
551
552	return 0;
553}
554
555static struct target_type switch_target = {
556	.name = "switch",
557	.version = {1, 1, 0},
558	.features = DM_TARGET_NOWAIT,
559	.module = THIS_MODULE,
560	.ctr = switch_ctr,
561	.dtr = switch_dtr,
562	.map = switch_map,
563	.message = switch_message,
564	.status = switch_status,
565	.prepare_ioctl = switch_prepare_ioctl,
566	.iterate_devices = switch_iterate_devices,
567};
568module_dm(switch);
569
570MODULE_DESCRIPTION(DM_NAME " dynamic path switching target");
571MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>");
572MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>");
573MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>");
574MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
575MODULE_LICENSE("GPL");
576