/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/moduleparam.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/smp_lock.h>
#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_SPINLOCK(_minor_lock);
/*
 * For bio-based dm.
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	atomic_t io_count;
	struct bio *bio;
	unsigned long start_time;
	spinlock_t endio_lock;
};

/*
 * For bio-based dm.
 * One of these is allocated per target within a bio.  Hopefully
 * this will be simplified out one day.
 */
struct dm_target_io {
	struct dm_io *io;
	struct dm_target *ti;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per request.
 */
struct dm_rq_target_io {
	struct mapped_device *md;
	struct dm_target *ti;
	struct request *orig, clone;
	int error;
	union map_info info;
};

/*
 * For request-based dm.
 * One of these is allocated per bio.
 */
struct dm_rq_clone_bio_info {
	struct bio *orig;
	struct dm_rq_target_io *tio;
};

union map_info *dm_get_mapinfo(struct bio *bio)
{
	if (bio && bio->bi_private)
		return &((struct dm_target_io *)bio->bi_private)->info;
	return NULL;
}

union map_info *dm_get_rq_mapinfo(struct request *rq)
{
	if (rq && rq->end_io_data)
		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
	return NULL;
}
EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_QUEUE_IO_TO_THREAD 6

/*
 * Work processed by per-device workqueue.
 */
struct mapped_device {
	struct rw_semaphore io_lock;
	struct mutex suspend_lock;
	rwlock_t map_lock;
	atomic_t holders;
	atomic_t open_count;

	unsigned long flags;

	struct request_queue *queue;
	unsigned type;
	/* Protect queue and type against concurrent access. */
	struct mutex type_lock;

	struct gendisk *disk;
	char name[16];

	void *interface_ptr;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending[2];
	wait_queue_head_t wait;
	struct work_struct work;
	struct bio_list deferred;
	spinlock_t deferred_lock;

	/*
	 * An error from the barrier request currently being processed.
	 */
	int barrier_error;

	/*
	 * Protect barrier_error from concurrent endio processing
	 * in request-based dm.
	 */
	spinlock_t barrier_error_lock;

	/*
	 * Processing queue (flush/barriers)
	 */
	struct workqueue_struct *wq;
	struct work_struct barrier_work;

	/* A pointer to the currently processing pre/post flush request */
	struct request *flush_request;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
	mempool_t *tio_pool;

	struct bio_set *bs;

	/*
	 * Event handling.
	 */
	atomic_t event_nr;
	wait_queue_head_t eventq;
	atomic_t uevent_seq;
	struct list_head uevent_list;
	spinlock_t uevent_lock; /* Protect access to uevent_list */

	/*
	 * freeze/thaw support requires holding onto a super block
	 */
	struct super_block *frozen_sb;
	struct block_device *bdev;

	/* forced geometry settings */
	struct hd_geometry geometry;

	/* For saving the address of __make_request for request based dm */
	make_request_fn *saved_make_request_fn;

	/* sysfs handle */
	struct kobject kobj;

	/* zero-length barrier that will be cloned and submitted to targets */
	struct bio barrier_bio;
};

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	mempool_t *io_pool;
	mempool_t *tio_pool;
	struct bio_set *bs;
};

#define MIN_IOS 256
static struct kmem_cache *_io_cache;
static struct kmem_cache *_tio_cache;
static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_bio_info_cache;

static int __init local_init(void)
{
	int r = -ENOMEM;

	/* allocate a slab for the dm_ios */
	_io_cache = KMEM_CACHE(dm_io, 0);
	if (!_io_cache)
		return r;

	/* allocate a slab for the target ios */
	_tio_cache = KMEM_CACHE(dm_target_io, 0);
	if (!_tio_cache)
		goto out_free_io_cache;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		goto out_free_tio_cache;

	_rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
	if (!_rq_bio_info_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_bio_info_cache;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_uevent_exit;

	if (!_major)
		_major = r;

	return 0;

out_uevent_exit:
	dm_uevent_exit();
out_free_rq_bio_info_cache:
	kmem_cache_destroy(_rq_bio_info_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);
out_free_tio_cache:
	kmem_cache_destroy(_tio_cache);
out_free_io_cache:
	kmem_cache_destroy(_io_cache);

	return r;
}

static void local_exit(void)
{
	kmem_cache_destroy(_rq_bio_info_cache);
	kmem_cache_destroy(_rq_tio_cache);
	kmem_cache_destroy(_tio_cache);
	kmem_cache_destroy(_io_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	lock_kernel();
	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);

out:
	spin_unlock(&_minor_lock);
	unlock_kernel();

	return md ? 0 : -ENXIO;
}

static int dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md = disk->private_data;

	lock_kernel();
	atomic_dec(&md->open_count);
	dm_put(md);
	unlock_kernel();

	return 0;
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md))
		r = -EBUSY;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *map = dm_get_live_table(md);
	struct dm_target *tgt;
	int r = -ENOTTY;

	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	if (tgt->type->ioctl)
		r = tgt->type->ioctl(tgt, cmd, arg);

out:
	dm_table_put(map);

	return r;
}

static struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}

static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
{
	mempool_free(tio, md->tio_pool);
}

static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
					    gfp_t gfp_mask)
{
	return mempool_alloc(md->tio_pool, gfp_mask);
}

static void free_rq_tio(struct dm_rq_target_io *tio)
{
	mempool_free(tio, tio->md->tio_pool);
}

static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_ATOMIC);
}

static void free_bio_info(struct dm_rq_clone_bio_info *info)
{
	mempool_free(info, info->tio->md->io_pool);
}

static int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}
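
/*
 * Per-bio I/O accounting: start_io_acct()/end_io_acct() update the gendisk
 * statistics and the md->pending[] in-flight counters that md_in_flight()
 * reports and that suspend waits on via md->wait.
 */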
static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	int cpu;
	int rw = bio_data_dir(io->bio);

	io->start_time = jiffies;

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_unlock();
	dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->bio;
	unsigned long duration = jiffies - io->start_time;
	int pending, cpu;
	int rw = bio_data_dir(bio);

	cpu = part_stat_lock();
	part_round_stats(cpu, &dm_disk(md)->part0);
	part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
	part_stat_unlock();

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a barrier.
	 */
	dm_disk(md)->part0.in_flight[rw] = pending =
		atomic_dec_return(&md->pending[rw]);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	down_write(&md->io_lock);

	spin_lock_irq(&md->deferred_lock);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irq(&md->deferred_lock);

	if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
		queue_work(md->wq, &md->work);

	up_write(&md->io_lock);
}

/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_table_put() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md)
{
	struct dm_table *t;
	unsigned long flags;

	read_lock_irqsave(&md->map_lock, flags);
	t = md->map;
	if (t)
		dm_table_get(t);
	read_unlock_irqrestore(&md->map_lock, flags);

	return t;
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

/*-----------------------------------------------------------------
 * CRUD START:
 *   A more elegant soln is in the works that uses the queue
 *   merge fn, unfortunately there are a couple of changes to
 *   the block layer that I want to make for this.  So in the
 *   interests of getting something for people to use I give
 *   you this clearly demarcated crap.
 *---------------------------------------------------------------*/

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}
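
/*
 * Completion path for bio-based dm: each clone ends in clone_endio(), which
 * calls dec_pending() on the owning dm_io; once the last clone finishes, the
 * original bio is completed, deferred again (noflush suspend push-back) or,
 * for barriers, its error recorded in md->barrier_error.
 */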
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, int error)
{
	unsigned long flags;
	int io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->error > 0 && __noflush_suspending(md)))
			io->error = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->error == DM_ENDIO_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md)) {
				if (!(io->bio->bi_rw & REQ_HARDBARRIER))
					bio_list_add_head(&md->deferred,
							  io->bio);
			} else
				/* noflush suspend was interrupted. */
				io->error = -EIO;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->error;
		bio = io->bio;

		if (bio->bi_rw & REQ_HARDBARRIER) {
			/*
			 * There can be just one barrier request so we use
			 * a per-device variable for error reporting.
			 * Note that you can't touch the bio after end_io_acct
			 *
			 * We ignore -EOPNOTSUPP for empty flush reported by
			 * underlying devices. We assume that if the device
			 * doesn't support empty barriers, it doesn't need
			 * cache flushing commands.
			 */
			if (!md->barrier_error &&
			    !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
				md->barrier_error = io_error;
			end_io_acct(io);
			free_io(md, io);
		} else {
			end_io_acct(io);
			free_io(md, io);

			if (io_error != DM_ENDIO_REQUEUE) {
				trace_block_bio_complete(md->queue, bio);

				bio_endio(bio, io_error);
			}
		}
	}
}

static void clone_endio(struct bio *bio, int error)
{
	int r = 0;
	struct dm_target_io *tio = bio->bi_private;
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (!bio_flagged(bio, BIO_UPTODATE) && !error)
		error = -EIO;

	if (endio) {
		r = endio(tio->ti, bio, error, &tio->info);
		if (r < 0 || r == DM_ENDIO_REQUEUE)
			/*
			 * error and requeue request are handled
			 * in dec_pending().
			 */
			error = r;
		else if (r == DM_ENDIO_INCOMPLETE)
			/* The target will handle the io */
			return;
		else if (r) {
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	/*
	 * Store md for cleanup instead of tio which is about to get freed.
	 */
	bio->bi_private = md->bs;

	free_tio(md, tio);
	bio_put(bio);
	dec_pending(io, error);
}

/*
 * Partial completion handling for request-based dm
 */
static void end_clone_bio(struct bio *clone, int error)
{
	struct dm_rq_clone_bio_info *info = clone->bi_private;
	struct dm_rq_target_io *tio = info->tio;
	struct bio *bio = info->orig;
	unsigned int nr_bytes = info->orig->bi_size;

	bio_put(clone);

	if (tio->error)
		/*
		 * An error has already been detected on the request.
		 * Once an error occurred, just let clone->end_io() handle
		 * the remainder.
		 */
		return;
	else if (error) {
		/*
		 * Don't report the error to the upper layer yet.
		 * The error handling decision is made by the target driver,
		 * when the request is completed.
		 */
		tio->error = error;
		return;
	}

	/*
	 * I/O for the bio successfully completed.
	 * Report the data completion to the upper layer.
	 */

	/*
	 * bios are processed from the head of the list.
	 * So the completing bio should always be rq->bio.
	 * If it's not, something wrong is happening.
	 */
	if (tio->orig->bio != bio)
		DMERR("bio completion is going in the middle of the request");

	/*
	 * Update the original request.
	 * Do not use blk_end_request() here, because it may complete
	 * the original request before the clone, and break the ordering.
	 */
	blk_update_request(tio->orig, 0, nr_bytes);
}

static void store_barrier_error(struct mapped_device *md, int error)
{
	unsigned long flags;

	spin_lock_irqsave(&md->barrier_error_lock, flags);
	/*
	 * Basically, the first error is taken, but:
	 *   -EOPNOTSUPP supersedes any I/O error.
	 *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
	 */
	if (!md->barrier_error || error == -EOPNOTSUPP ||
	    (md->barrier_error != -EOPNOTSUPP &&
	     error == DM_ENDIO_REQUEUE))
		md->barrier_error = error;
	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
}

/*
 * Don't touch any member of the md after calling this function because
 * the md may be freed in dm_put() at the end of this function.
 * Or do dm_get() before calling this function and dm_put() later.
 */
static void rq_completed(struct mapped_device *md, int rw, int run_queue)
{
	atomic_dec(&md->pending[rw]);

	/* nudge anyone waiting on suspend queue */
	if (!md_in_flight(md))
		wake_up(&md->wait);

	if (run_queue)
		blk_run_queue(md->queue);

	/*
	 * dm_put() must be at the end of this function. See the comment above
	 */
	dm_put(md);
}

static void free_rq_clone(struct request *clone)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	blk_rq_unprep_clone(clone);
	free_rq_tio(tio);
}

/*
 * Complete the clone and the original request.
 * Must be called without queue lock.
 */
static void dm_end_request(struct request *clone, int error)
{
	int rw = rq_data_dir(clone);
	int run_queue = 1;
	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;

	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
		rq->errors = clone->errors;
		rq->resid_len = clone->resid_len;

		if (rq->sense)
			/*
			 * We are using the sense buffer of the original
			 * request.
			 * So setting the length of the sense data is enough.
			 */
			rq->sense_len = clone->sense_len;
	}

	free_rq_clone(clone);

	if (unlikely(is_barrier)) {
		if (unlikely(error))
			store_barrier_error(md, error);
		run_queue = 0;
	} else
		blk_end_request_all(rq, error);

	rq_completed(md, rw, run_queue);
}

static void dm_unprep_request(struct request *rq)
{
	struct request *clone = rq->special;

	rq->special = NULL;
	rq->cmd_flags &= ~REQ_DONTPREP;

	free_rq_clone(clone);
}

/*
 * Requeue the original request of a clone.
 */
void dm_requeue_unmapped_request(struct request *clone)
{
	int rw = rq_data_dir(clone);
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct mapped_device *md = tio->md;
	struct request *rq = tio->orig;
	struct request_queue *q = rq->q;
	unsigned long flags;

	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
		/*
		 * Barrier clones share an original request.
		 * Leave it to dm_end_request(), which handles this special
		 * case.
		 */
		dm_end_request(clone, DM_ENDIO_REQUEUE);
		return;
	}

	dm_unprep_request(rq);

	spin_lock_irqsave(q->queue_lock, flags);
	if (elv_queue_empty(q))
		blk_plug_device(q);
	blk_requeue_request(q, rq);
	spin_unlock_irqrestore(q->queue_lock, flags);

	rq_completed(md, rw, 0);
}
EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);

static void __stop_queue(struct request_queue *q)
{
	blk_stop_queue(q);
}

static void stop_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__stop_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void __start_queue(struct request_queue *q)
{
	if (blk_queue_stopped(q))
		blk_start_queue(q);
}

static void start_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__start_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
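
/*
 * Completion of a request-based clone happens in two steps:
 * end_clone_request() defers the work to softirq context via
 * dm_complete_request(), and dm_softirq_done()/dm_done() then consult the
 * target's rq_end_io() to decide whether to finish, requeue or leave the
 * original request to the target.
 */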
static void dm_done(struct request *clone, int error, bool mapped)
{
	int r = error;
	struct dm_rq_target_io *tio = clone->end_io_data;
	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;

	if (mapped && rq_end_io)
		r = rq_end_io(tio->ti, clone, error, &tio->info);

	if (r <= 0)
		/* The target wants to complete the I/O */
		dm_end_request(clone, r);
	else if (r == DM_ENDIO_INCOMPLETE)
		/* The target will handle the I/O */
		return;
	else if (r == DM_ENDIO_REQUEUE)
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
	else {
		DMWARN("unimplemented target endio return value: %d", r);
		BUG();
	}
}

/*
 * Request completion handler for request-based dm
 */
static void dm_softirq_done(struct request *rq)
{
	bool mapped = true;
	struct request *clone = rq->completion_data;
	struct dm_rq_target_io *tio = clone->end_io_data;

	if (rq->cmd_flags & REQ_FAILED)
		mapped = false;

	dm_done(clone, tio->error, mapped);
}

/*
 * Complete the clone and the original request with the error status
 * through softirq context.
 */
static void dm_complete_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
		/*
		 * Barrier clones share an original request.  So can't use
		 * softirq_done with the original.
		 * Pass the clone to dm_done() directly in this special case.
		 * It is safe (even if clone->q->queue_lock is held here)
		 * because there is no I/O dispatching during the completion
		 * of barrier clone.
		 */
		dm_done(clone, error, true);
		return;
	}

	tio->error = error;
	rq->completion_data = clone;
	blk_complete_request(rq);
}

/*
 * Complete the not-mapped clone and the original request with the error status
 * through softirq context.
 * Target's rq_end_io() function isn't called.
 * This may be used when the target's map_rq() function fails.
 */
void dm_kill_unmapped_request(struct request *clone, int error)
{
	struct dm_rq_target_io *tio = clone->end_io_data;
	struct request *rq = tio->orig;

	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
		/*
		 * Barrier clones share an original request.
		 * Leave it to dm_end_request(), which handles this special
		 * case.
		 */
		BUG_ON(error > 0);
		dm_end_request(clone, error);
		return;
	}

	rq->cmd_flags |= REQ_FAILED;
	dm_complete_request(clone, error);
}
EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);

/*
 * Called with the queue lock held
 */
static void end_clone_request(struct request *clone, int error)
{
	/*
	 * For just cleaning up the information of the queue in which
	 * the clone was dispatched.
	 * The clone is *NOT* actually freed here, because it was allocated
	 * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
	 */
	__blk_put_request(clone->q, clone);

	/*
	 * Actual request completion is done in a softirq context which doesn't
	 * hold the queue lock.  Otherwise, deadlock could occur because:
	 *     - another request may be submitted by the upper level driver
	 *       of the stacking during the completion
	 *     - the submission which requires queue lock may be done
	 *       against this queue
	 */
	dm_complete_request(clone, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		sector_t offset = dm_target_offset(ti, sector);
		boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
			   - offset;
		if (len > boundary)
			len = boundary;
	}

	return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone,
		      struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct mapped_device *md;

	clone->bi_end_io = clone_endio;
	clone->bi_private = tio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&tio->io->io_count);
	sector = clone->bi_sector;
	r = ti->type->map(ti, clone, &tio->info);
	if (r == DM_MAPIO_REMAPPED) {
		/* the bio has been remapped so dispatch it */

		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
				  tio->io->bio->bi_bdev->bd_dev, sector);

		generic_make_request(clone);
	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
		/* error the io and bail out, or requeue it if needed */
		md = tio->io->md;
		dec_pending(tio->io, r);
		/*
		 * Store bio_set for cleanup.
		 */
		clone->bi_private = md->bs;
		bio_put(clone);
		free_tio(md, tio);
	} else if (r) {
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}
}

struct clone_info {
	struct mapped_device *md;
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};

static void dm_bio_destructor(struct bio *bio)
{
	struct bio_set *bs = bio->bi_private;

	bio_free(bio, bs);
}

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      unsigned int len, struct bio_set *bs)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
	clone->bi_destructor = dm_bio_destructor;
	*clone->bi_io_vec = *bv;

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;
	clone->bi_flags |= 1 << BIO_CLONED;

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);
		bio_integrity_trim(clone,
				   bio_sector_offset(bio, idx, offset), len);
	}

	return clone;
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     unsigned int len, struct bio_set *bs)
{
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
	__bio_clone(clone, bio);
	clone->bi_rw &= ~REQ_HARDBARRIER;
	clone->bi_destructor = dm_bio_destructor;
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);
	clone->bi_flags &= ~(1 << BIO_SEG_VALID);

	if (bio_integrity(bio)) {
		bio_integrity_clone(clone, bio, GFP_NOIO, bs);

		if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
			bio_integrity_trim(clone,
					   bio_sector_offset(bio, idx, 0), len);
	}

	return clone;
}

static struct dm_target_io *alloc_tio(struct clone_info *ci,
				      struct dm_target *ti)
{
	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);

	tio->io = ci->io;
	tio->ti = ti;
	memset(&tio->info, 0, sizeof(tio->info));

	return tio;
}
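
/*
 * Empty barriers and discards are not split like regular bios; instead one
 * clone per ti->num_flush_requests (or ti->num_discard_requests) is issued
 * to the relevant targets by __issue_target_requests() below.
 */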
static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
				   unsigned request_nr, sector_t len)
{
	struct dm_target_io *tio = alloc_tio(ci, ti);
	struct bio *clone;

	tio->info.target_request_nr = request_nr;

	/*
	 * Discard requests require the bio's inline iovecs be initialized.
	 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
	 * and discard, so no need for concern about wasted bvec allocations.
	 */
	clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
	__bio_clone(clone, ci->bio);
	clone->bi_destructor = dm_bio_destructor;
	if (len) {
		clone->bi_sector = ci->sector;
		clone->bi_size = to_bytes(len);
	}

	__map_bio(ti, clone, tio);
}

static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
				    unsigned num_requests, sector_t len)
{
	unsigned request_nr;

	for (request_nr = 0; request_nr < num_requests; request_nr++)
		__issue_target_request(ci, ti, request_nr, len);
}

static int __clone_and_map_empty_barrier(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;

	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__issue_target_requests(ci, ti, ti->num_flush_requests, 0);

	ci->sector_count = 0;

	return 0;
}

/*
 * Perform all io with a single clone.
 */
static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target_io *tio;

	tio = alloc_tio(ci, ti);
	clone = clone_bio(bio, ci->sector, ci->idx,
			  bio->bi_vcnt - ci->idx, ci->sector_count,
			  ci->md->bs);
	__map_bio(ti, clone, tio);
	ci->sector_count = 0;
}

static int __clone_and_map_discard(struct clone_info *ci)
{
	struct dm_target *ti;
	sector_t len;

	do {
		ti = dm_table_find_target(ci->map, ci->sector);
		if (!dm_target_is_valid(ti))
			return -EIO;

		/*
		 * Even though the device advertised discard support,
		 * reconfiguration might have changed that since the
		 * check was performed.
		 */
		if (!ti->num_discard_requests)
			return -EOPNOTSUPP;

		len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));

		__issue_target_requests(ci, ti, ti->num_discard_requests, len);

		ci->sector += len;
	} while (ci->sector_count -= len);

	return 0;
}
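
/*
 * Clone and map one chunk of the bio: either the whole remainder with a
 * single clone, a run of complete bvecs that fits the current target, or a
 * single bvec split across two or more targets.
 */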
static int __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti;
	sector_t len = 0, max;
	struct dm_target_io *tio;

	if (unlikely(bio_empty_barrier(bio)))
		return __clone_and_map_empty_barrier(ci);

	if (unlikely(bio->bi_rw & REQ_DISCARD))
		return __clone_and_map_discard(ci);

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	max = max_io_len(ci->sector, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		__clone_and_map_simple(ci, ti);

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		tio = alloc_tio(ci, ti);
		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
				  ci->md->bs);
		__map_bio(ti, clone, tio);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Handle a bvec that must be split between two or more targets.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;
		sector_t remaining = to_sector(bv->bv_len);
		unsigned int offset = 0;

		do {
			if (offset) {
				ti = dm_table_find_target(ci->map, ci->sector);
				if (!dm_target_is_valid(ti))
					return -EIO;

				max = max_io_len(ci->sector, ti);
			}

			len = min(remaining, max);

			tio = alloc_tio(ci, ti);
			clone = split_bvec(bio, ci->sector, ci->idx,
					   bv->bv_offset + offset, len,
					   ci->md->bs);

			__map_bio(ti, clone, tio);

			ci->sector += len;
			ci->sector_count -= len;
			offset += to_bytes(len);
		} while (remaining -= len);

		ci->idx++;
	}

	return 0;
}

/*
 * Split the bio into several clones and submit it to targets.
 */
static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;
	int error = 0;

	ci.map = dm_get_live_table(md);
	if (unlikely(!ci.map)) {
		if (!(bio->bi_rw & REQ_HARDBARRIER))
			bio_io_error(bio);
		else
			if (!md->barrier_error)
				md->barrier_error = -EIO;
		return;
	}

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	spin_lock_init(&ci.io->endio_lock);
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	if (unlikely(bio_empty_barrier(bio)))
		ci.sector_count = 1;
	ci.idx = bio->bi_idx;

	start_io_acct(ci.io);
	while (ci.sector_count && !error)
		error = __clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, error);
	dm_table_put(ci.map);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

static int dm_merge_bvec(struct request_queue *q,
			 struct bvec_merge_data *bvm,
			 struct bio_vec *biovec)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table(md);
	struct dm_target *ti;
	sector_t max_sectors;
	int max_size = 0;

	if (unlikely(!map))
		goto out;

	ti = dm_table_find_target(map, bvm->bi_sector);
	if (!dm_target_is_valid(ti))
		goto out_table;

	/*
	 * Find maximum amount of I/O that won't need splitting
	 */
	max_sectors = min(max_io_len(bvm->bi_sector, ti),
			  (sector_t) BIO_MAX_SECTORS);
	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
	if (max_size < 0)
		max_size = 0;

	/*
	 * merge_bvec_fn() returns number of bytes
	 * it can accept at this offset
	 * max is precomputed maximal io size
	 */
	if (max_size && ti->type->merge)
		max_size = ti->type->merge(ti, bvm, biovec, max_size);
	/*
	 * If the target doesn't support merge method and some of the devices
	 * provided their merge_bvec method (we know this by looking at
	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
	 * entries.  So always set max_size to 0, and the code below allows
	 * just one page.
	 */
	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
		max_size = 0;

out_table:
	dm_table_put(map);

out:
	/*
	 * Always allow an entire first page
	 */
	if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
		max_size = biovec->bv_len;

	return max_size;
}

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int _dm_request(struct request_queue *q, struct bio *bio)
{
	int rw = bio_data_dir(bio);
	struct mapped_device *md = q->queuedata;
	int cpu;

	down_read(&md->io_lock);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
	part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
	part_stat_unlock();

	/*
	 * If we're suspended or the thread is processing barriers
	 * we have to queue this io for later.
	 */
	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
		up_read(&md->io_lock);

		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
		    bio_rw(bio) == READA) {
			bio_io_error(bio);
			return 0;
		}

		queue_io(md, bio);

		return 0;
	}

	__split_and_process_bio(md, bio);
	up_read(&md->io_lock);
	return 0;
}

static int dm_make_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	return md->saved_make_request_fn(q, bio); /* call __make_request() */
}

static int dm_request_based(struct mapped_device *md)
{
	return blk_queue_stackable(md->queue);
}

static int dm_request(struct request_queue *q, struct bio *bio)
{
	struct mapped_device *md = q->queuedata;

	if (dm_request_based(md))
		return dm_make_request(q, bio);

	return _dm_request(q, bio);
}

static bool dm_rq_is_flush_request(struct request *rq)
{
	if (rq->cmd_flags & REQ_FLUSH)
		return true;
	else
		return false;
}

void dm_dispatch_request(struct request *rq)
{
	int r;

	if (blk_queue_io_stat(rq->q))
		rq->cmd_flags |= REQ_IO_STAT;

	rq->start_time = jiffies;
	r = blk_insert_cloned_request(rq->q, rq);
	if (r)
		dm_complete_request(rq, r);
}
EXPORT_SYMBOL_GPL(dm_dispatch_request);

static void dm_rq_bio_destructor(struct bio *bio)
{
	struct dm_rq_clone_bio_info *info = bio->bi_private;
	struct mapped_device *md = info->tio->md;

	free_bio_info(info);
	bio_free(bio, md->bs);
}

static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
				 void *data)
{
	struct dm_rq_target_io *tio = data;
	struct mapped_device *md = tio->md;
	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);

	if (!info)
		return -ENOMEM;

	info->orig = bio_orig;
	info->tio = tio;
	bio->bi_end_io = end_clone_bio;
	bio->bi_private = info;
	bio->bi_destructor = dm_rq_bio_destructor;

	return 0;
}
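
/*
 * Set up the clone request: flush requests are rebuilt as empty barrier
 * writes, everything else has its bios cloned with blk_rq_prep_clone()
 * using the device's bio_set.
 */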
static int setup_clone(struct request *clone, struct request *rq,
		       struct dm_rq_target_io *tio)
{
	int r;

	if (dm_rq_is_flush_request(rq)) {
		blk_rq_init(NULL, clone);
		clone->cmd_type = REQ_TYPE_FS;
		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
	} else {
		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
				      dm_rq_bio_constructor, tio);
		if (r)
			return r;

		clone->cmd = rq->cmd;
		clone->cmd_len = rq->cmd_len;
		clone->sense = rq->sense;
		clone->buffer = rq->buffer;
	}

	clone->end_io = end_clone_request;
	clone->end_io_data = tio;

	return 0;
}

static struct request *clone_rq(struct request *rq, struct mapped_device *md,
				gfp_t gfp_mask)
{
	struct request *clone;
	struct dm_rq_target_io *tio;

	tio = alloc_rq_tio(md, gfp_mask);
	if (!tio)
		return NULL;

	tio->md = md;
	tio->ti = NULL;
	tio->orig = rq;
	tio->error = 0;
	memset(&tio->info, 0, sizeof(tio->info));

	clone = &tio->clone;
	if (setup_clone(clone, rq, tio)) {
		/* -ENOMEM */
		free_rq_tio(tio);
		return NULL;
	}

	return clone;
}

/*
 * Called with the queue lock held.
 */
static int dm_prep_fn(struct request_queue *q, struct request *rq)
{
	struct mapped_device *md = q->queuedata;
	struct request *clone;

	if (unlikely(dm_rq_is_flush_request(rq)))
		return BLKPREP_OK;

	if (unlikely(rq->special)) {
		DMWARN("Already has something in rq->special.");
		return BLKPREP_KILL;
	}

	clone = clone_rq(rq, md, GFP_ATOMIC);
	if (!clone)
		return BLKPREP_DEFER;

	rq->special = clone;
	rq->cmd_flags |= REQ_DONTPREP;

	return BLKPREP_OK;
}

/*
 * Returns:
 * 0  : the request has been processed (not requeued)
 * !0 : the request has been requeued
 */
static int map_request(struct dm_target *ti, struct request *clone,
		       struct mapped_device *md)
{
	int r, requeued = 0;
	struct dm_rq_target_io *tio = clone->end_io_data;

	/*
	 * Hold the md reference here for the in-flight I/O.
	 * We can't rely on the reference count by device opener,
	 * because the device may be closed during the request completion
	 * when all bios are completed.
	 * See the comment in rq_completed() too.
	 */
	dm_get(md);

	tio->ti = ti;
	r = ti->type->map_rq(ti, clone, &tio->info);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		/* The target has taken the I/O to submit by itself later */
		break;
	case DM_MAPIO_REMAPPED:
		/* The target has remapped the I/O so dispatch it */
		trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
				     blk_rq_pos(tio->orig));
		dm_dispatch_request(clone);
		break;
	case DM_MAPIO_REQUEUE:
		/* The target wants to requeue the I/O */
		dm_requeue_unmapped_request(clone);
		requeued = 1;
		break;
	default:
		if (r > 0) {
			DMWARN("unimplemented target map return value: %d", r);
			BUG();
		}

		/* The target wants to complete the I/O */
		dm_kill_unmapped_request(clone, r);
		break;
	}

	return requeued;
}
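
/*
 * dm_request_fn() below pulls requests off the queue with blk_peek_request()/
 * blk_start_request() and hands them to map_request(); flush requests are
 * instead deferred to md->barrier_work so they are processed outside the
 * queue lock.
 */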
/*
 * q->request_fn for request-based dm.
 * Called with the queue lock held.
 */
static void dm_request_fn(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table(md);
	struct dm_target *ti;
	struct request *rq, *clone;

	/*
	 * For suspend, check blk_queue_stopped() and increment
	 * ->pending within a single queue_lock so as not to increment the
	 * number of in-flight I/Os after the queue is stopped in
	 * dm_suspend().
	 */
	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
		rq = blk_peek_request(q);
		if (!rq)
			goto plug_and_out;

		if (unlikely(dm_rq_is_flush_request(rq))) {
			BUG_ON(md->flush_request);
			md->flush_request = rq;
			blk_start_request(rq);
			queue_work(md->wq, &md->barrier_work);
			goto out;
		}

		ti = dm_table_find_target(map, blk_rq_pos(rq));
		if (ti->type->busy && ti->type->busy(ti))
			goto plug_and_out;

		blk_start_request(rq);
		clone = rq->special;
		atomic_inc(&md->pending[rq_data_dir(clone)]);

		spin_unlock(q->queue_lock);
		if (map_request(ti, clone, md))
			goto requeued;

		spin_lock_irq(q->queue_lock);
	}

	goto out;

requeued:
	spin_lock_irq(q->queue_lock);

plug_and_out:
	if (!elv_queue_empty(q))
		/* Some requests still remain, retry later */
		blk_plug_device(q);

out:
	dm_table_put(map);

	return;
}

int dm_underlying_device_busy(struct request_queue *q)
{
	return blk_lld_busy(q);
}
EXPORT_SYMBOL_GPL(dm_underlying_device_busy);

static int dm_lld_busy(struct request_queue *q)
{
	int r;
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table(md);

	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
		r = 1;
	else
		r = dm_table_any_busy_target(map);

	dm_table_put(map);

	return r;
}

static void dm_unplug_all(struct request_queue *q)
{
	struct mapped_device *md = q->queuedata;
	struct dm_table *map = dm_get_live_table(md);

	if (map) {
		if (dm_request_based(md))
			generic_unplug_device(q);

		dm_table_unplug_all(map);
		dm_table_put(map);
	}
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		map = dm_get_live_table(md);
		if (map) {
			/*
			 * Request-based dm only cares about its own queue
			 * when queried for congestion status.
			 */
			if (dm_request_based(md))
				r = md->queue->backing_dev_info.state &
				    bdi_bits;
			else
				r = dm_table_any_congested(map, bdi_bits);

			dm_table_put(map);
		}
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static DEFINE_IDR(_minor_idr);

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r, m;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	if (idr_find(&_minor_idr, minor)) {
		r = -EBUSY;
		goto out;
	}

	r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
	if (r)
		goto out;

	if (m != minor) {
		idr_remove(&_minor_idr, m);
		r = -EBUSY;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);
	return r;
}

static int next_free_minor(int *minor)
{
	int r, m;

	r = idr_pre_get(&_minor_idr, GFP_KERNEL);
	if (!r)
		return -ENOMEM;

	spin_lock(&_minor_lock);

	r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
	if (r)
		goto out;

	if (m >= (1 << MINORBITS)) {
		idr_remove(&_minor_idr, m);
		r = -ENOSPC;
		goto out;
	}

	*minor = m;

out:
	spin_unlock(&_minor_lock);
	return r;
}

static const struct block_device_operations dm_blk_dops;

static void dm_wq_work(struct work_struct *work);
static void dm_rq_barrier_work(struct work_struct *work);

static void dm_init_md_queue(struct mapped_device *md)
{
	/*
	 * Request-based dm devices cannot be stacked on top of bio-based dm
	 * devices.  The type of this dm device has not been decided yet.
	 * The type is decided at the first table loading time.
	 * To prevent problematic device stacking, clear the queue flag
	 * for request stacking support until then.
	 *
	 * This queue is new, so no concurrency on the queue_flags.
	 */
	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);

	md->queue->queuedata = md;
	md->queue->backing_dev_info.congested_fn = dm_any_congested;
	md->queue->backing_dev_info.congested_data = md;
	blk_queue_make_request(md->queue, dm_request);
	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
	md->queue->unplug_fn = dm_unplug_all;
	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r;
	struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
	void *old_md;

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	md->type = DM_TYPE_NONE;
	init_rwsem(&md->io_lock);
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	spin_lock_init(&md->deferred_lock);
	spin_lock_init(&md->barrier_error_lock);
	rwlock_init(&md->map_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue(GFP_KERNEL);
	if (!md->queue)
		goto bad_queue;

	dm_init_md_queue(md);

	md->disk = alloc_disk(1);
	if (!md->disk)
		goto bad_disk;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
	init_waitqueue_head(&md->eventq);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = create_singlethread_workqueue("kdmflush");
	if (!md->wq)
		goto bad_thread;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad_bdev;

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad_bdev:
	destroy_workqueue(md->wq);
bad_thread:
	del_gendisk(md->disk);
	put_disk(md->disk);
bad_disk:
	blk_cleanup_queue(md->queue);
bad_queue:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);
	bdput(md->bdev);
	destroy_workqueue(md->wq);
	if (md->tio_pool)
		mempool_destroy(md->tio_pool);
	if (md->io_pool)
		mempool_destroy(md->io_pool);
	if (md->bs)
		bioset_free(md->bs);
	blk_integrity_unregister(md->disk);
	del_gendisk(md->disk);
	free_minor(minor);

	spin_lock(&_minor_lock);
	md->disk->private_data = NULL;
	spin_unlock(&_minor_lock);

	put_disk(md->disk);
	blk_cleanup_queue(md->queue);
	module_put(THIS_MODULE);
	kfree(md);
}
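
/*
 * Mempools travel with the table (see dm_table_get_md_mempools()) and are
 * handed over to the mapped_device on the first table bind; subsequent
 * binds keep the existing pools and free the ones carried by the new table.
 */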
static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p;

	if (md->io_pool && md->tio_pool && md->bs)
		/* the md already has necessary mempools */
		goto out;

	p = dm_table_get_md_mempools(t);
	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);

	md->io_pool = p->io_pool;
	p->io_pool = NULL;
	md->tio_pool = p->tio_pool;
	p->tio_pool = NULL;
	md->bs = p->bs;
	p->bs = NULL;

out:
	/* mempool bind completed, now no need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	sector_t size;
	unsigned long flags;

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != get_capacity(md->disk))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (dm_table_request_based(t) && !blk_queue_stopped(q))
		stop_queue(q);

	__bind_mempools(md, t);

	write_lock_irqsave(&md->map_lock, flags);
	old_map = md->map;
	md->map = t;
	dm_table_set_restrictions(t, q, limits);
	write_unlock_irqrestore(&md->map_lock, flags);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = md->map;
	unsigned long flags;

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	write_lock_irqsave(&md->map_lock, flags);
	md->map = NULL;
	write_unlock_irqrestore(&md->map_lock, flags);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	dm_sysfs_init(md);

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, unsigned type)
{
	md->type = type;
}

unsigned dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}

/*
 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
 */
static int dm_init_request_based_queue(struct mapped_device *md)
{
	struct request_queue *q = NULL;

	if (md->queue->elevator)
		return 1;

	/* Fully initialize the queue */
	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
	if (!q)
		return 0;

	md->queue = q;
	md->saved_make_request_fn = md->queue->make_request_fn;
	dm_init_md_queue(md);
	blk_queue_softirq_done(md->queue, dm_softirq_done);
	blk_queue_prep_rq(md->queue, dm_prep_fn);
	blk_queue_lld_busy(md->queue, dm_lld_busy);
	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH);

	elv_register_queue(md->queue);

	return 1;
}

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md)
{
	if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
	    !dm_init_request_based_queue(md)) {
		DMWARN("Cannot initialize queue for request-based mapped device");
		return -EINVAL;
	}

	return 0;
}

static struct mapped_device *dm_find_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (md && (md == MINOR_ALLOCED ||
		   (MINOR(disk_devt(dm_disk(md))) != minor) ||
		   dm_deleting_md(md) ||
		   test_bit(DMF_FREEING, &md->flags))) {
		md = NULL;
		goto out;
	}

out:
	spin_unlock(&_minor_lock);

	return md;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md = dm_find_md(dev);

	if (md)
		dm_get(md);

	return md;
}

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);
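
/*
 * Device teardown: __dm_destroy() unpublishes the minor, suspends the targets
 * if necessary and then either waits for the remaining holders (dm_destroy)
 * or just warns about them (dm_destroy_immediate) before freeing the md.
 */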
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_put(map);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
{
	int r = 0;
	DECLARE_WAITQUEUE(wait, current);

	dm_unplug_all(md->queue);

	add_wait_queue(&md->wait, &wait);

	while (1) {
		set_current_state(interruptible);

		smp_mb();
		if (!md_in_flight(md))
			break;

		if (interruptible == TASK_INTERRUPTIBLE &&
		    signal_pending(current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	set_current_state(TASK_RUNNING);

	remove_wait_queue(&md->wait, &wait);

	return r;
}

static void dm_flush(struct mapped_device *md)
{
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);

	bio_init(&md->barrier_bio);
	md->barrier_bio.bi_bdev = md->bdev;
	md->barrier_bio.bi_rw = WRITE_BARRIER;
	__split_and_process_bio(md, &md->barrier_bio);

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}

static void process_barrier(struct mapped_device *md, struct bio *bio)
{
	md->barrier_error = 0;

	dm_flush(md);

	if (!bio_empty_barrier(bio)) {
		__split_and_process_bio(md, bio);
		/*
		 * If the request isn't supported, don't waste time with
		 * the second flush.
		 */
		if (md->barrier_error != -EOPNOTSUPP)
			dm_flush(md);
	}

	if (md->barrier_error != DM_ENDIO_REQUEUE)
		bio_endio(bio, md->barrier_error);
	else {
		spin_lock_irq(&md->deferred_lock);
		bio_list_add_head(&md->deferred, bio);
		spin_unlock_irq(&md->deferred_lock);
	}
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;

	down_write(&md->io_lock);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c) {
			clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
			break;
		}

		up_write(&md->io_lock);

		if (dm_request_based(md))
			generic_make_request(c);
		else {
			if (c->bi_rw & REQ_HARDBARRIER)
				process_barrier(md, c);
			else
				__split_and_process_bio(md, c);
		}

		down_write(&md->io_lock);
	}

	up_write(&md->io_lock);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_clear_bit();
	queue_work(md->wq, &md->work);
}

static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
{
	struct dm_rq_target_io *tio = clone->end_io_data;

	tio->info.target_request_nr = request_nr;
}

/* Issue barrier requests to targets and wait for their completion. */
static int dm_rq_barrier(struct mapped_device *md)
{
	int i, j;
	struct dm_table *map = dm_get_live_table(md);
	unsigned num_targets = dm_table_get_num_targets(map);
	struct dm_target *ti;
	struct request *clone;

	md->barrier_error = 0;

	for (i = 0; i < num_targets; i++) {
		ti = dm_table_get_target(map, i);
		for (j = 0; j < ti->num_flush_requests; j++) {
			clone = clone_rq(md->flush_request, md, GFP_NOIO);
			dm_rq_set_target_request_nr(clone, j);
			atomic_inc(&md->pending[rq_data_dir(clone)]);
			map_request(ti, clone, md);
		}
	}

	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
	dm_table_put(map);

	return md->barrier_error;
}

static void dm_rq_barrier_work(struct work_struct *work)
{
	int error;
	struct mapped_device *md = container_of(work, struct mapped_device,
						barrier_work);
	struct request_queue *q = md->queue;
	struct request *rq;
	unsigned long flags;

	/*
	 * Hold an md reference here and release it only at the end, so that
	 * the md cannot be deleted by a device opener while the barrier
	 * request is completing.
	 */
	dm_get(md);

	error = dm_rq_barrier(md);

	rq = md->flush_request;
	md->flush_request = NULL;

	if (error == DM_ENDIO_REQUEUE) {
		spin_lock_irqsave(q->queue_lock, flags);
		blk_requeue_request(q, rq);
		spin_unlock_irqrestore(q->queue_lock, flags);
	} else
		blk_end_request_all(rq, error);

	blk_run_queue(q);

	dm_put(md);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
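/*
 * Illustrative call sequence, added for clarity (a sketch only, not part of
 * the original source): the ioctl layer is expected to drive a table swap
 * roughly as follows, with "md" and "new_table" prepared elsewhere:
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR(old_map))
 *		dm_table_destroy(old_map);
 *	dm_resume(md);
 */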
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	r = dm_calculate_queue_limits(table, &limits);
	if (r) {
		map = ERR_PTR(r);
		goto out;
	}

	map = __bind(md, table, &limits);

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in-flight
 * bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;
	int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
	int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;

	mutex_lock(&md->suspend_lock);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	map = dm_get_live_table(md);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);

	/* This does not get reverted if there's an error later. */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r)
			goto out;
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request, we set
	 * DMF_QUEUE_IO_TO_THREAD.
	 *
	 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
	 * and call flush_workqueue(md->wq).
	 * flush_workqueue will wait until dm_wq_work exits, and
	 * DMF_BLOCK_IO_FOR_SUSPEND will prevent any further calls to
	 * __split_and_process_bio from dm_wq_work.
	 */
	down_write(&md->io_lock);
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
	up_write(&md->io_lock);

	/*
	 * Request-based dm uses md->wq for barriers (dm_rq_barrier_work),
	 * which can still be kicked as long as md->queue is running.
	 * So stop md->queue before flushing md->wq.
	 */
	if (dm_request_based(md))
		stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);

	down_write(&md->io_lock);
	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	up_write(&md->io_lock);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			start_queue(md->queue);

		unlock_fs(md);
		goto out; /* pushback list is already flushed, so skip flush */
	}

	/*
	 * If dm_wait_for_completion returned 0, the device is completely
	 * quiescent now. There is no request-processing activity. All new
	 * requests are being added to the md->deferred list.
	 */

	set_bit(DMF_SUSPENDED, &md->flags);

	dm_table_postsuspend_targets(map);

out:
	dm_table_put(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

int dm_resume(struct mapped_device *md)
{
	int r = -EINVAL;
	struct dm_table *map = NULL;

	mutex_lock(&md->suspend_lock);
	if (!dm_suspended_md(md))
		goto out;

	map = dm_get_live_table(md);
	if (!map || !dm_table_get_size(map))
		goto out;

	r = dm_table_resume_targets(map);
	if (r)
		goto out;

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping by the targets can work correctly.
	 * Request-based dm queues its deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		start_queue(md->queue);

	unlock_fs(md);

	clear_bit(DMF_SUSPENDED, &md->flags);

	dm_table_unplug_all(map);
	r = 0;
out:
	dm_table_put(map);
	mutex_unlock(&md->suspend_lock);

	return r;
}
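/*
 * Note added for clarity (not from the original source): suspend_flags is a
 * bitmask. DM_SUSPEND_LOCKFS_FLAG asks dm_suspend() to freeze any mounted
 * filesystem via lock_fs(); DM_SUSPEND_NOFLUSH_FLAG requests that targets
 * requeue in-flight I/O rather than flush it, and it takes precedence over
 * the lockfs flag. A caller wanting noflush behaviour would do, for example:
 *
 *	dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
 */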
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj;
}

/*
 * struct mapped_device should not be exported outside of dm.c
 * so use this check to verify that kobj is part of md structure
 */
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj);
	if (&md->kobj != kobj)
		return NULL;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
{
	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);

	if (!pools)
		return NULL;

	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
	if (!pools->io_pool)
		goto free_pools_and_out;

	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
	if (!pools->tio_pool)
		goto free_io_pool_and_out;

	pools->bs = (type == DM_TYPE_BIO_BASED) ?
		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
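	/*
	 * (Added note, not in the original source: the sizes above are the
	 * ones this file already uses - a small 16-entry bioset for
	 * bio-based devices, and MIN_IOS-sized allocations for
	 * request-based devices, matching the clone mempools created
	 * earlier in this function.)
	 */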
	if (!pools->bs)
		goto free_tio_pool_and_out;

	return pools;

free_tio_pool_and_out:
	mempool_destroy(pools->tio_pool);

free_io_pool_and_out:
	mempool_destroy(pools->io_pool);

free_pools_and_out:
	kfree(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->io_pool)
		mempool_destroy(pools->io_pool);

	if (pools->tio_pool)
		mempool_destroy(pools->tio_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.owner = THIS_MODULE
};

EXPORT_SYMBOL(dm_get_mapinfo);

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
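/*
 * Usage note added for clarity (illustrative only, not part of the original
 * source, and assuming the usual dm_mod module name): the "major" parameter
 * requests a specific block major number at module load time, e.g.
 *
 *	modprobe dm_mod major=240
 *
 * With the default of 0, register_blkdev() picks a free major dynamically.
 */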