1// SPDX-License-Identifier: GPL-2.0-only 2/* 3 * Copyright (C) 2010-2012 by Dell Inc. All rights reserved. 4 * Copyright (C) 2011-2013 Red Hat, Inc. 5 * 6 * This file is released under the GPL. 7 * 8 * dm-switch is a device-mapper target that maps IO to underlying block 9 * devices efficiently when there are a large number of fixed-sized 10 * address regions but there is no simple pattern to allow for a compact 11 * mapping representation such as dm-stripe. 12 */ 13 14#include <linux/device-mapper.h> 15 16#include <linux/module.h> 17#include <linux/init.h> 18#include <linux/vmalloc.h> 19 20#define DM_MSG_PREFIX "switch" 21 22/* 23 * One region_table_slot_t holds <region_entries_per_slot> region table 24 * entries each of which is <region_table_entry_bits> in size. 25 */ 26typedef unsigned long region_table_slot_t; 27 28/* 29 * A device with the offset to its start sector. 30 */ 31struct switch_path { 32 struct dm_dev *dmdev; 33 sector_t start; 34}; 35 36/* 37 * Context block for a dm switch device. 38 */ 39struct switch_ctx { 40 struct dm_target *ti; 41 42 unsigned int nr_paths; /* Number of paths in path_list. */ 43 44 unsigned int region_size; /* Region size in 512-byte sectors */ 45 unsigned long nr_regions; /* Number of regions making up the device */ 46 signed char region_size_bits; /* log2 of region_size or -1 */ 47 48 unsigned char region_table_entry_bits; /* Number of bits in one region table entry */ 49 unsigned char region_entries_per_slot; /* Number of entries in one region table slot */ 50 signed char region_entries_per_slot_bits; /* log2 of region_entries_per_slot or -1 */ 51 52 region_table_slot_t *region_table; /* Region table */ 53 54 /* 55 * Array of dm devices to switch between. 56 */ 57 struct switch_path path_list[]; 58}; 59 60static struct switch_ctx *alloc_switch_ctx(struct dm_target *ti, unsigned int nr_paths, 61 unsigned int region_size) 62{ 63 struct switch_ctx *sctx; 64 65 sctx = kzalloc(struct_size(sctx, path_list, nr_paths), GFP_KERNEL); 66 if (!sctx) 67 return NULL; 68 69 sctx->ti = ti; 70 sctx->region_size = region_size; 71 72 ti->private = sctx; 73 74 return sctx; 75} 76 77static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths) 78{ 79 struct switch_ctx *sctx = ti->private; 80 sector_t nr_regions = ti->len; 81 sector_t nr_slots; 82 83 if (!(sctx->region_size & (sctx->region_size - 1))) 84 sctx->region_size_bits = __ffs(sctx->region_size); 85 else 86 sctx->region_size_bits = -1; 87 88 sctx->region_table_entry_bits = 1; 89 while (sctx->region_table_entry_bits < sizeof(region_table_slot_t) * 8 && 90 (region_table_slot_t)1 << sctx->region_table_entry_bits < nr_paths) 91 sctx->region_table_entry_bits++; 92 93 sctx->region_entries_per_slot = (sizeof(region_table_slot_t) * 8) / sctx->region_table_entry_bits; 94 if (!(sctx->region_entries_per_slot & (sctx->region_entries_per_slot - 1))) 95 sctx->region_entries_per_slot_bits = __ffs(sctx->region_entries_per_slot); 96 else 97 sctx->region_entries_per_slot_bits = -1; 98 99 if (sector_div(nr_regions, sctx->region_size)) 100 nr_regions++; 101 102 if (nr_regions >= ULONG_MAX) { 103 ti->error = "Region table too large"; 104 return -EINVAL; 105 } 106 sctx->nr_regions = nr_regions; 107 108 nr_slots = nr_regions; 109 if (sector_div(nr_slots, sctx->region_entries_per_slot)) 110 nr_slots++; 111 112 if (nr_slots > ULONG_MAX / sizeof(region_table_slot_t)) { 113 ti->error = "Region table too large"; 114 return -EINVAL; 115 } 116 117 sctx->region_table = vmalloc(array_size(nr_slots, 118 sizeof(region_table_slot_t))); 119 if (!sctx->region_table) { 120 ti->error = "Cannot allocate region table"; 121 return -ENOMEM; 122 } 123 124 return 0; 125} 126 127static void switch_get_position(struct switch_ctx *sctx, unsigned long region_nr, 128 unsigned long *region_index, unsigned int *bit) 129{ 130 if (sctx->region_entries_per_slot_bits >= 0) { 131 *region_index = region_nr >> sctx->region_entries_per_slot_bits; 132 *bit = region_nr & (sctx->region_entries_per_slot - 1); 133 } else { 134 *region_index = region_nr / sctx->region_entries_per_slot; 135 *bit = region_nr % sctx->region_entries_per_slot; 136 } 137 138 *bit *= sctx->region_table_entry_bits; 139} 140 141static unsigned int switch_region_table_read(struct switch_ctx *sctx, unsigned long region_nr) 142{ 143 unsigned long region_index; 144 unsigned int bit; 145 146 switch_get_position(sctx, region_nr, ®ion_index, &bit); 147 148 return (READ_ONCE(sctx->region_table[region_index]) >> bit) & 149 ((1 << sctx->region_table_entry_bits) - 1); 150} 151 152/* 153 * Find which path to use at given offset. 154 */ 155static unsigned int switch_get_path_nr(struct switch_ctx *sctx, sector_t offset) 156{ 157 unsigned int path_nr; 158 sector_t p; 159 160 p = offset; 161 if (sctx->region_size_bits >= 0) 162 p >>= sctx->region_size_bits; 163 else 164 sector_div(p, sctx->region_size); 165 166 path_nr = switch_region_table_read(sctx, p); 167 168 /* This can only happen if the processor uses non-atomic stores. */ 169 if (unlikely(path_nr >= sctx->nr_paths)) 170 path_nr = 0; 171 172 return path_nr; 173} 174 175static void switch_region_table_write(struct switch_ctx *sctx, unsigned long region_nr, 176 unsigned int value) 177{ 178 unsigned long region_index; 179 unsigned int bit; 180 region_table_slot_t pte; 181 182 switch_get_position(sctx, region_nr, ®ion_index, &bit); 183 184 pte = sctx->region_table[region_index]; 185 pte &= ~((((region_table_slot_t)1 << sctx->region_table_entry_bits) - 1) << bit); 186 pte |= (region_table_slot_t)value << bit; 187 sctx->region_table[region_index] = pte; 188} 189 190/* 191 * Fill the region table with an initial round robin pattern. 192 */ 193static void initialise_region_table(struct switch_ctx *sctx) 194{ 195 unsigned int path_nr = 0; 196 unsigned long region_nr; 197 198 for (region_nr = 0; region_nr < sctx->nr_regions; region_nr++) { 199 switch_region_table_write(sctx, region_nr, path_nr); 200 if (++path_nr >= sctx->nr_paths) 201 path_nr = 0; 202 } 203} 204 205static int parse_path(struct dm_arg_set *as, struct dm_target *ti) 206{ 207 struct switch_ctx *sctx = ti->private; 208 unsigned long long start; 209 int r; 210 211 r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table), 212 &sctx->path_list[sctx->nr_paths].dmdev); 213 if (r) { 214 ti->error = "Device lookup failed"; 215 return r; 216 } 217 218 if (kstrtoull(dm_shift_arg(as), 10, &start) || start != (sector_t)start) { 219 ti->error = "Invalid device starting offset"; 220 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); 221 return -EINVAL; 222 } 223 224 sctx->path_list[sctx->nr_paths].start = start; 225 226 sctx->nr_paths++; 227 228 return 0; 229} 230 231/* 232 * Destructor: Don't free the dm_target, just the ti->private data (if any). 233 */ 234static void switch_dtr(struct dm_target *ti) 235{ 236 struct switch_ctx *sctx = ti->private; 237 238 while (sctx->nr_paths--) 239 dm_put_device(ti, sctx->path_list[sctx->nr_paths].dmdev); 240 241 vfree(sctx->region_table); 242 kfree(sctx); 243} 244 245/* 246 * Constructor arguments: 247 * <num_paths> <region_size> <num_optional_args> [<optional_args>...] 248 * [<dev_path> <offset>]+ 249 * 250 * Optional args are to allow for future extension: currently this 251 * parameter must be 0. 252 */ 253static int switch_ctr(struct dm_target *ti, unsigned int argc, char **argv) 254{ 255 static const struct dm_arg _args[] = { 256 {1, (KMALLOC_MAX_SIZE - sizeof(struct switch_ctx)) / sizeof(struct switch_path), "Invalid number of paths"}, 257 {1, UINT_MAX, "Invalid region size"}, 258 {0, 0, "Invalid number of optional args"}, 259 }; 260 261 struct switch_ctx *sctx; 262 struct dm_arg_set as; 263 unsigned int nr_paths, region_size, nr_optional_args; 264 int r; 265 266 as.argc = argc; 267 as.argv = argv; 268 269 r = dm_read_arg(_args, &as, &nr_paths, &ti->error); 270 if (r) 271 return -EINVAL; 272 273 r = dm_read_arg(_args + 1, &as, ®ion_size, &ti->error); 274 if (r) 275 return r; 276 277 r = dm_read_arg_group(_args + 2, &as, &nr_optional_args, &ti->error); 278 if (r) 279 return r; 280 /* parse optional arguments here, if we add any */ 281 282 if (as.argc != nr_paths * 2) { 283 ti->error = "Incorrect number of path arguments"; 284 return -EINVAL; 285 } 286 287 sctx = alloc_switch_ctx(ti, nr_paths, region_size); 288 if (!sctx) { 289 ti->error = "Cannot allocate redirection context"; 290 return -ENOMEM; 291 } 292 293 r = dm_set_target_max_io_len(ti, region_size); 294 if (r) 295 goto error; 296 297 while (as.argc) { 298 r = parse_path(&as, ti); 299 if (r) 300 goto error; 301 } 302 303 r = alloc_region_table(ti, nr_paths); 304 if (r) 305 goto error; 306 307 initialise_region_table(sctx); 308 309 /* For UNMAP, sending the request down any path is sufficient */ 310 ti->num_discard_bios = 1; 311 312 return 0; 313 314error: 315 switch_dtr(ti); 316 317 return r; 318} 319 320static int switch_map(struct dm_target *ti, struct bio *bio) 321{ 322 struct switch_ctx *sctx = ti->private; 323 sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector); 324 unsigned int path_nr = switch_get_path_nr(sctx, offset); 325 326 bio_set_dev(bio, sctx->path_list[path_nr].dmdev->bdev); 327 bio->bi_iter.bi_sector = sctx->path_list[path_nr].start + offset; 328 329 return DM_MAPIO_REMAPPED; 330} 331 332/* 333 * We need to parse hex numbers in the message as quickly as possible. 334 * 335 * This table-based hex parser improves performance. 336 * It improves a time to load 1000000 entries compared to the condition-based 337 * parser. 338 * table-based parser condition-based parser 339 * PA-RISC 0.29s 0.31s 340 * Opteron 0.0495s 0.0498s 341 */ 342static const unsigned char hex_table[256] = { 343255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 344255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 345255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 3460, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 347255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 348255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 349255, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255, 255, 255, 350255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 351255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 352255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 353255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 354255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 355255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 356255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 357255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 358255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 359}; 360 361static __always_inline unsigned long parse_hex(const char **string) 362{ 363 unsigned char d; 364 unsigned long r = 0; 365 366 while ((d = hex_table[(unsigned char)**string]) < 16) { 367 r = (r << 4) | d; 368 (*string)++; 369 } 370 371 return r; 372} 373 374static int process_set_region_mappings(struct switch_ctx *sctx, 375 unsigned int argc, char **argv) 376{ 377 unsigned int i; 378 unsigned long region_index = 0; 379 380 for (i = 1; i < argc; i++) { 381 unsigned long path_nr; 382 const char *string = argv[i]; 383 384 if ((*string & 0xdf) == 'R') { 385 unsigned long cycle_length, num_write; 386 387 string++; 388 if (unlikely(*string == ',')) { 389 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 390 return -EINVAL; 391 } 392 cycle_length = parse_hex(&string); 393 if (unlikely(*string != ',')) { 394 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 395 return -EINVAL; 396 } 397 string++; 398 if (unlikely(!*string)) { 399 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 400 return -EINVAL; 401 } 402 num_write = parse_hex(&string); 403 if (unlikely(*string)) { 404 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 405 return -EINVAL; 406 } 407 408 if (unlikely(!cycle_length) || unlikely(cycle_length - 1 > region_index)) { 409 DMWARN("invalid set_region_mappings cycle length: %lu > %lu", 410 cycle_length - 1, region_index); 411 return -EINVAL; 412 } 413 if (unlikely(region_index + num_write < region_index) || 414 unlikely(region_index + num_write >= sctx->nr_regions)) { 415 DMWARN("invalid set_region_mappings region number: %lu + %lu >= %lu", 416 region_index, num_write, sctx->nr_regions); 417 return -EINVAL; 418 } 419 420 while (num_write--) { 421 region_index++; 422 path_nr = switch_region_table_read(sctx, region_index - cycle_length); 423 switch_region_table_write(sctx, region_index, path_nr); 424 } 425 426 continue; 427 } 428 429 if (*string == ':') 430 region_index++; 431 else { 432 region_index = parse_hex(&string); 433 if (unlikely(*string != ':')) { 434 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 435 return -EINVAL; 436 } 437 } 438 439 string++; 440 if (unlikely(!*string)) { 441 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 442 return -EINVAL; 443 } 444 445 path_nr = parse_hex(&string); 446 if (unlikely(*string)) { 447 DMWARN("invalid set_region_mappings argument: '%s'", argv[i]); 448 return -EINVAL; 449 } 450 if (unlikely(region_index >= sctx->nr_regions)) { 451 DMWARN("invalid set_region_mappings region number: %lu >= %lu", region_index, sctx->nr_regions); 452 return -EINVAL; 453 } 454 if (unlikely(path_nr >= sctx->nr_paths)) { 455 DMWARN("invalid set_region_mappings device: %lu >= %u", path_nr, sctx->nr_paths); 456 return -EINVAL; 457 } 458 459 switch_region_table_write(sctx, region_index, path_nr); 460 } 461 462 return 0; 463} 464 465/* 466 * Messages are processed one-at-a-time. 467 * 468 * Only set_region_mappings is supported. 469 */ 470static int switch_message(struct dm_target *ti, unsigned int argc, char **argv, 471 char *result, unsigned int maxlen) 472{ 473 static DEFINE_MUTEX(message_mutex); 474 475 struct switch_ctx *sctx = ti->private; 476 int r = -EINVAL; 477 478 mutex_lock(&message_mutex); 479 480 if (!strcasecmp(argv[0], "set_region_mappings")) 481 r = process_set_region_mappings(sctx, argc, argv); 482 else 483 DMWARN("Unrecognised message received."); 484 485 mutex_unlock(&message_mutex); 486 487 return r; 488} 489 490static void switch_status(struct dm_target *ti, status_type_t type, 491 unsigned int status_flags, char *result, unsigned int maxlen) 492{ 493 struct switch_ctx *sctx = ti->private; 494 unsigned int sz = 0; 495 int path_nr; 496 497 switch (type) { 498 case STATUSTYPE_INFO: 499 result[0] = '\0'; 500 break; 501 502 case STATUSTYPE_TABLE: 503 DMEMIT("%u %u 0", sctx->nr_paths, sctx->region_size); 504 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) 505 DMEMIT(" %s %llu", sctx->path_list[path_nr].dmdev->name, 506 (unsigned long long)sctx->path_list[path_nr].start); 507 break; 508 509 case STATUSTYPE_IMA: 510 result[0] = '\0'; 511 break; 512 } 513} 514 515/* 516 * Switch ioctl: 517 * 518 * Passthrough all ioctls to the path for sector 0 519 */ 520static int switch_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) 521{ 522 struct switch_ctx *sctx = ti->private; 523 unsigned int path_nr; 524 525 path_nr = switch_get_path_nr(sctx, 0); 526 527 *bdev = sctx->path_list[path_nr].dmdev->bdev; 528 529 /* 530 * Only pass ioctls through if the device sizes match exactly. 531 */ 532 if (ti->len + sctx->path_list[path_nr].start != 533 bdev_nr_sectors((*bdev))) 534 return 1; 535 return 0; 536} 537 538static int switch_iterate_devices(struct dm_target *ti, 539 iterate_devices_callout_fn fn, void *data) 540{ 541 struct switch_ctx *sctx = ti->private; 542 int path_nr; 543 int r; 544 545 for (path_nr = 0; path_nr < sctx->nr_paths; path_nr++) { 546 r = fn(ti, sctx->path_list[path_nr].dmdev, 547 sctx->path_list[path_nr].start, ti->len, data); 548 if (r) 549 return r; 550 } 551 552 return 0; 553} 554 555static struct target_type switch_target = { 556 .name = "switch", 557 .version = {1, 1, 0}, 558 .features = DM_TARGET_NOWAIT, 559 .module = THIS_MODULE, 560 .ctr = switch_ctr, 561 .dtr = switch_dtr, 562 .map = switch_map, 563 .message = switch_message, 564 .status = switch_status, 565 .prepare_ioctl = switch_prepare_ioctl, 566 .iterate_devices = switch_iterate_devices, 567}; 568module_dm(switch); 569 570MODULE_DESCRIPTION(DM_NAME " dynamic path switching target"); 571MODULE_AUTHOR("Kevin D. O'Kelley <Kevin_OKelley@dell.com>"); 572MODULE_AUTHOR("Narendran Ganapathy <Narendran_Ganapathy@dell.com>"); 573MODULE_AUTHOR("Jim Ramsay <Jim_Ramsay@dell.com>"); 574MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); 575MODULE_LICENSE("GPL"); 576