/*
 * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA
 *
 */
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/blktrace_api.h>
#include <scsi/sg.h>		/* for struct sg_iovec */

#define BIO_POOL_SIZE 2

static struct kmem_cache *bio_slab __read_mostly;

#define BIOVEC_NR_POOLS 6

/*
 * A small number of entries is fine; this pool is not performance critical.
 * Basically we just need to survive.
 */
#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;

struct biovec_slab {
	int nr_vecs;
	char *name;
	struct kmem_cache *slab;
};

/*
 * If you change this list, also change bvec_alloc or things will
 * break badly! Cannot be bigger than what you can fit into an
 * unsigned short.
 */

#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV

/*
 * bio_set is used to allow other portions of the IO system to
 * allocate their own private memory pools for bio and iovec structures.
 * These memory pools in turn all allocate from the bio_slab
 * and the bvec_slabs[].
 */
struct bio_set {
	mempool_t *bio_pool;
	mempool_t *bvec_pools[BIOVEC_NR_POOLS];
};

/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
static struct bio_set *fs_bio_set;

static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
	struct bio_vec *bvl;

	/*
	 * see comment near bvec_array define!
	 */
	switch (nr) {
		case 1		: *idx = 0; break;
		case 2 ... 4	: *idx = 1; break;
		case 5 ... 16	: *idx = 2; break;
		case 17 ... 64	: *idx = 3; break;
		case 65 ... 128	: *idx = 4; break;
		case 129 ... BIO_MAX_PAGES: *idx = 5; break;
		default:
			return NULL;
	}
	/*
	 * idx now points to the pool we want to allocate from
	 */

	bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
	if (bvl) {
		struct biovec_slab *bp = bvec_slabs + *idx;

		memset(bvl, 0, bp->nr_vecs * sizeof(struct bio_vec));
	}

	return bvl;
}

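/**
 * bio_free - release a bio back to its bio_set
 * @bio: bio to free
 * @bio_set: the bio_set the bio and its biovec were allocated from
 *
 * The biovec is returned to the bvec pool selected by the pool index
 * stored in the bio's flags, and the bio itself goes back to @bio_set's
 * bio pool.
 */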
void bio_free(struct bio *bio, struct bio_set *bio_set)
{
	const int pool_idx = BIO_POOL_IDX(bio);

	BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);

	mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
	mempool_free(bio, bio_set->bio_pool);
}

/*
 * default destructor for a bio allocated with bio_alloc_bioset()
 */
static void bio_fs_destructor(struct bio *bio)
{
	bio_free(bio, fs_bio_set);
}

void bio_init(struct bio *bio)
{
	bio->bi_next = NULL;
	bio->bi_bdev = NULL;
	bio->bi_flags = 1 << BIO_UPTODATE;
	bio->bi_rw = 0;
	bio->bi_vcnt = 0;
	bio->bi_idx = 0;
	bio->bi_phys_segments = 0;
	bio->bi_hw_segments = 0;
	bio->bi_hw_front_size = 0;
	bio->bi_hw_back_size = 0;
	bio->bi_size = 0;
	bio->bi_max_vecs = 0;
	bio->bi_end_io = NULL;
	atomic_set(&bio->bi_cnt, 1);
	bio->bi_private = NULL;
}

/**
 * bio_alloc_bioset - allocate a bio for I/O
 * @gfp_mask:	the GFP_ mask given to the slab allocator
 * @nr_iovecs:	number of iovecs to pre-allocate
 * @bs:		the bio_set to allocate from
 *
 * Description:
 *   bio_alloc_bioset will first try its own mempool to satisfy the allocation.
 *   If %__GFP_WAIT is set then we will block on the internal pool waiting
 *   for a &struct bio to become free.
 *
 *   The bio and iovecs are allocated from the memory pools specified by the
 *   bio_set structure.
 **/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);

	if (likely(bio)) {
		struct bio_vec *bvl = NULL;

		bio_init(bio);
		if (likely(nr_iovecs)) {
			unsigned long idx = 0; /* shut up gcc */

			bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
			if (unlikely(!bvl)) {
				mempool_free(bio, bs->bio_pool);
				bio = NULL;
				goto out;
			}
			bio->bi_flags |= idx << BIO_POOL_OFFSET;
			bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
		}
		bio->bi_io_vec = bvl;
	}
out:
	return bio;
}

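/*
 * bio_alloc() is the fs_bio_set based front end to bio_alloc_bioset(); the
 * returned bio is freed through the default bio_fs_destructor() on the
 * final bio_put().
 *
 * A minimal usage sketch (the bdev/sector/handler names are assumptions,
 * not taken from this file):
 *
 *	struct bio *bio = bio_alloc(GFP_NOIO, 1);
 *
 *	if (bio) {
 *		bio->bi_bdev = bdev;
 *		bio->bi_sector = sector;
 *		bio->bi_end_io = my_end_io;
 *		bio_add_page(bio, page, PAGE_SIZE, 0);
 *		submit_bio(READ, bio);
 *	}
 */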
struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
{
	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);

	if (bio)
		bio->bi_destructor = bio_fs_destructor;

	return bio;
}

void zero_fill_bio(struct bio *bio)
{
	unsigned long flags;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment(bv, bio, i) {
		char *data = bvec_kmap_irq(bv, &flags);
		memset(data, 0, bv->bv_len);
		flush_dcache_page(bv->bv_page);
		bvec_kunmap_irq(data, &flags);
	}
}
EXPORT_SYMBOL(zero_fill_bio);

/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc or bio_get. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}

inline int bio_phys_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_phys_segments;
}

inline int bio_hw_segments(request_queue_t *q, struct bio *bio)
{
	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
		blk_recount_segments(q, bio);

	return bio->bi_hw_segments;
}

/**
 *	__bio_clone	-	clone a bio
 *	@bio: destination bio
 *	@bio_src: bio to clone
 *
 *	Clone a &bio. The caller will own the destination bio, but not
 *	the actual data it points to. The reference count of the
 *	destination bio will be one.
 */
void __bio_clone(struct bio *bio, struct bio *bio_src)
{
	request_queue_t *q = bdev_get_queue(bio_src->bi_bdev);

	memcpy(bio->bi_io_vec, bio_src->bi_io_vec,
		bio_src->bi_max_vecs * sizeof(struct bio_vec));

	bio->bi_sector = bio_src->bi_sector;
	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_flags |= 1 << BIO_CLONED;
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_vcnt = bio_src->bi_vcnt;
	bio->bi_size = bio_src->bi_size;
	bio->bi_idx = bio_src->bi_idx;
	bio_phys_segments(q, bio);
	bio_hw_segments(q, bio);
}

/**
 *	bio_clone	-	clone a bio
 *	@bio: bio to clone
 *	@gfp_mask: allocation priority
 *
 *	Like __bio_clone, only also allocates the returned bio
 */
struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);

	if (b) {
		b->bi_destructor = bio_fs_destructor;
		__bio_clone(b, bio);
	}

	return b;
}

/**
 *	bio_get_nr_vecs		- return approx number of vecs
 *	@bdev:  I/O target
 *
 *	Return the approximate number of pages we can send to this target.
 *	There's no guarantee that you will be able to fit this number of pages
 *	into a bio; it does not account for dynamic restrictions that vary
 *	with offset.
 */
int bio_get_nr_vecs(struct block_device *bdev)
{
	request_queue_t *q = bdev_get_queue(bdev);
	int nr_pages;

	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	if (nr_pages > q->max_phys_segments)
		nr_pages = q->max_phys_segments;
	if (nr_pages > q->max_hw_segments)
		nr_pages = q->max_hw_segments;

	return nr_pages;
}

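/*
 * __bio_add_page - worker for bio_add_page() and bio_add_pc_page()
 *
 * Attempts to append @len bytes of @page starting at @offset to @bio,
 * subject to @max_sectors and the queue's segment limits.  Returns the
 * number of bytes actually added (@len on success, 0 on failure).
 */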
static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page
			  *page, unsigned int len, unsigned int offset,
			  unsigned short max_sectors)
{
	int retried_segments = 0;
	struct bio_vec *bvec;

	/*
	 * cloned bio must not modify vec list
	 */
	if (unlikely(bio_flagged(bio, BIO_CLONED)))
		return 0;

	if (((bio->bi_size + len) >> 9) > max_sectors)
		return 0;

	/*
	 * For filesystems with a blocksize smaller than the pagesize
	 * we will often be called with the same page as last time and
	 * a consecutive offset.  Optimize this special case.
	 */
	if (bio->bi_vcnt > 0) {
		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];

		if (page == prev->bv_page &&
		    offset == prev->bv_offset + prev->bv_len) {
			prev->bv_len += len;
			if (q->merge_bvec_fn &&
			    q->merge_bvec_fn(q, bio, prev) < len) {
				prev->bv_len -= len;
				return 0;
			}

			goto done;
		}
	}

	if (bio->bi_vcnt >= bio->bi_max_vecs)
		return 0;

	/*
	 * we might lose a segment or two here, but rather that than
	 * make this too complex.
	 */

	while (bio->bi_phys_segments >= q->max_phys_segments
	       || bio->bi_hw_segments >= q->max_hw_segments
	       || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) {

		if (retried_segments)
			return 0;

		retried_segments = 1;
		blk_recount_segments(q, bio);
	}

	/*
	 * setup the new entry, we might clear it again later if we
	 * cannot add the page
	 */
	bvec = &bio->bi_io_vec[bio->bi_vcnt];
	bvec->bv_page = page;
	bvec->bv_len = len;
	bvec->bv_offset = offset;

	/*
	 * if queue has other restrictions (eg varying max sector size
	 * depending on offset), it can specify a merge_bvec_fn in the
	 * queue to get further control
	 */
	if (q->merge_bvec_fn) {
		/*
		 * merge_bvec_fn() returns number of bytes it can accept
		 * at this offset
		 */
		if (q->merge_bvec_fn(q, bio, bvec) < len) {
			bvec->bv_page = NULL;
			bvec->bv_len = 0;
			bvec->bv_offset = 0;
			return 0;
		}
	}

	/* If we may be able to merge these biovecs, force a recount */
	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) ||
	    BIOVEC_VIRT_MERGEABLE(bvec-1, bvec)))
		bio->bi_flags &= ~(1 << BIO_SEG_VALID);

	bio->bi_vcnt++;
	bio->bi_phys_segments++;
	bio->bi_hw_segments++;
 done:
	bio->bi_size += len;
	return len;
}

/**
 *	bio_add_pc_page	-	attempt to add page to bio
 *	@q: the target queue
 *	@bio: destination bio
 *	@page: page to add
 *	@len: vec entry length
 *	@offset: vec entry offset
 *
 *	Attempt to add a page to the bio_vec maplist. This can fail for a
 *	number of reasons, such as the bio being full or target block
 *	device limitations. The target block device must allow bios
 *	smaller than PAGE_SIZE, so it is always possible to add a single
 *	page to an empty bio. This should only be used by REQ_PC bios.
 */
int bio_add_pc_page(request_queue_t *q, struct bio *bio, struct page *page,
		    unsigned int len, unsigned int offset)
{
	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
}

/**
 *	bio_add_page	-	attempt to add page to bio
 *	@bio: destination bio
 *	@page: page to add
 *	@len: vec entry length
 *	@offset: vec entry offset
 *
 *	Attempt to add a page to the bio_vec maplist. This can fail for a
 *	number of reasons, such as the bio being full or target block
 *	device limitations. The target block device must allow bios
 *	smaller than PAGE_SIZE, so it is always possible to add a single
 *	page to an empty bio.
 */
int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
		 unsigned int offset)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
}

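/*
 * bio_map_data records the user address and a snapshot of the biovec
 * lengths for a bounced bio, so that bio_uncopy_user() can copy the
 * data back to user space and free the bounce pages.
 */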
struct bio_map_data {
	struct bio_vec *iovecs;
	void __user *userptr;
};

static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio)
{
	memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
	bio->bi_private = bmd;
}

static void bio_free_map_data(struct bio_map_data *bmd)
{
	kfree(bmd->iovecs);
	kfree(bmd);
}

static struct bio_map_data *bio_alloc_map_data(int nr_segs)
{
	struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);

	if (!bmd)
		return NULL;

	bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
	if (bmd->iovecs)
		return bmd;

	kfree(bmd);
	return NULL;
}

/**
 *	bio_uncopy_user	-	finish previously mapped bio
 *	@bio: bio being terminated
 *
 *	Free pages allocated from bio_copy_user() and write back data
 *	to user space in case of a read.
 */
int bio_uncopy_user(struct bio *bio)
{
	struct bio_map_data *bmd = bio->bi_private;
	const int read = bio_data_dir(bio) == READ;
	struct bio_vec *bvec;
	int i, ret = 0;

	__bio_for_each_segment(bvec, bio, i, 0) {
		char *addr = page_address(bvec->bv_page);
		unsigned int len = bmd->iovecs[i].bv_len;

		if (read && !ret && copy_to_user(bmd->userptr, addr, len))
			ret = -EFAULT;

		__free_page(bvec->bv_page);
		bmd->userptr += len;
	}
	bio_free_map_data(bmd);
	bio_put(bio);
	return ret;
}

/**
 *	bio_copy_user	-	copy user data to bio
 *	@q: destination block queue
 *	@uaddr: start of user address
 *	@len: length in bytes
 *	@write_to_vm: bool indicating writing to pages or not
 *
 *	Prepares and returns a bio for indirect user io, bouncing data
 *	to/from kernel pages as necessary. Must be paired with a call to
 *	bio_uncopy_user() on io completion.
 */
struct bio *bio_copy_user(request_queue_t *q, unsigned long uaddr,
			  unsigned int len, int write_to_vm)
{
	unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = uaddr >> PAGE_SHIFT;
	struct bio_map_data *bmd;
	struct bio_vec *bvec;
	struct page *page;
	struct bio *bio;
	int i, ret;

	bmd = bio_alloc_map_data(end - start);
	if (!bmd)
		return ERR_PTR(-ENOMEM);

	bmd->userptr = (void __user *) uaddr;

	ret = -ENOMEM;
	bio = bio_alloc(GFP_KERNEL, end - start);
	if (!bio)
		goto out_bmd;

	bio->bi_rw |= (!write_to_vm << BIO_RW);

	ret = 0;
	while (len) {
		unsigned int bytes = PAGE_SIZE;

		if (bytes > len)
			bytes = len;

		page = alloc_page(q->bounce_gfp | GFP_KERNEL);
		if (!page) {
			ret = -ENOMEM;
			break;
		}

		if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
			break;

		len -= bytes;
	}

	if (ret)
		goto cleanup;

	/*
	 * success
	 */
	if (!write_to_vm) {
		char __user *p = (char __user *) uaddr;

		/*
		 * for a write, copy in data to kernel pages
		 */
		ret = -EFAULT;
		bio_for_each_segment(bvec, bio, i) {
			char *addr = page_address(bvec->bv_page);

			if (copy_from_user(addr, p, bvec->bv_len))
				goto cleanup;
			p += bvec->bv_len;
		}
	}

	bio_set_map_data(bmd, bio);
	return bio;
cleanup:
	bio_for_each_segment(bvec, bio, i)
		__free_page(bvec->bv_page);

	bio_put(bio);
out_bmd:
	bio_free_map_data(bmd);
	return ERR_PTR(ret);
}

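/*
 * Rough sketch of the expected bounce-buffer pairing (submission and
 * completion details are assumptions, not taken from this file):
 *
 *	bio = bio_copy_user(q, uaddr, len, reading);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	... submit the bio and wait for it to complete ...
 *	ret = bio_uncopy_user(bio);
 */
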
static struct bio *__bio_map_user_iov(request_queue_t *q,
				      struct block_device *bdev,
				      struct sg_iovec *iov, int iov_count,
				      int write_to_vm)
{
	int i, j;
	int nr_pages = 0;
	struct page **pages;
	struct bio *bio;
	int cur_page = 0;
	int ret, offset;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;

		nr_pages += end - start;
		/*
		 * buffer must be aligned to at least hardsector size for now
		 */
		if (uaddr & queue_dma_alignment(q))
			return ERR_PTR(-EINVAL);
	}

	if (!nr_pages)
		return ERR_PTR(-EINVAL);

	bio = bio_alloc(GFP_KERNEL, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
	if (!pages)
		goto out;

	for (i = 0; i < iov_count; i++) {
		unsigned long uaddr = (unsigned long)iov[i].iov_base;
		unsigned long len = iov[i].iov_len;
		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long start = uaddr >> PAGE_SHIFT;
		const int local_nr_pages = end - start;
		const int page_limit = cur_page + local_nr_pages;

		down_read(&current->mm->mmap_sem);
		ret = get_user_pages(current, current->mm, uaddr,
				     local_nr_pages,
				     write_to_vm, 0, &pages[cur_page], NULL);
		up_read(&current->mm->mmap_sem);

		if (ret < local_nr_pages) {
			ret = -EFAULT;
			goto out_unmap;
		}

		offset = uaddr & ~PAGE_MASK;
		for (j = cur_page; j < page_limit; j++) {
			unsigned int bytes = PAGE_SIZE - offset;

			if (len <= 0)
				break;

			if (bytes > len)
				bytes = len;

			/*
			 * sorry...
			 */
			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
			    bytes)
				break;

			len -= bytes;
			offset = 0;
		}

		cur_page = j;
		/*
		 * release the pages we didn't map into the bio, if any
		 */
		while (j < page_limit)
			page_cache_release(pages[j++]);
	}

	kfree(pages);

	/*
	 * set data direction, and check if mapped pages need bouncing
	 */
	if (!write_to_vm)
		bio->bi_rw |= (1 << BIO_RW);

	bio->bi_bdev = bdev;
	bio->bi_flags |= (1 << BIO_USER_MAPPED);
	return bio;

 out_unmap:
	for (i = 0; i < nr_pages; i++) {
		if (!pages[i])
			break;
		page_cache_release(pages[i]);
	}
 out:
	kfree(pages);
	bio_put(bio);
	return ERR_PTR(ret);
}

/**
 *	bio_map_user	-	map user address into bio
 *	@q: the request_queue_t for the bio
 *	@bdev: destination block device
 *	@uaddr: start of user address
 *	@len: length in bytes
 *	@write_to_vm: bool indicating writing to pages or not
 *
 *	Map the user space address into a bio suitable for io to a block
 *	device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
			 unsigned long uaddr, unsigned int len, int write_to_vm)
{
	struct sg_iovec iov;

	iov.iov_base = (void __user *)uaddr;
	iov.iov_len = len;

	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
}

/**
 *	bio_map_user_iov - map user sg_iovec table into bio
 *	@q: the request_queue_t for the bio
 *	@bdev: destination block device
 *	@iov: the iovec.
 *	@iov_count: number of elements in the iovec
 *	@write_to_vm: bool indicating writing to pages or not
 *
 *	Map the user space address into a bio suitable for io to a block
 *	device. Returns an error pointer in case of error.
 */
struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
			     struct sg_iovec *iov, int iov_count,
			     int write_to_vm)
{
	struct bio *bio;

	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);

	if (IS_ERR(bio))
		return bio;

	/*
	 * subtle -- if __bio_map_user() ended up bouncing a bio,
	 * it would normally disappear when its bi_end_io is run.
	 * however, we need it for the unmap, so grab an extra
	 * reference to it
	 */
	bio_get(bio);

	return bio;
}

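/*
 * Rough sketch of the zero-copy pairing, as opposed to the bounce path
 * above (the submission step is outside the scope of this file):
 *
 *	bio = bio_map_user(q, bdev, uaddr, len, reading);
 *	if (IS_ERR(bio))
 *		return PTR_ERR(bio);
 *	... submit the bio and wait for it to complete ...
 *	bio_unmap_user(bio);
 */
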
static void __bio_unmap_user(struct bio *bio)
{
	struct bio_vec *bvec;
	int i;

	/*
	 * make sure we dirty pages we wrote to
	 */
	__bio_for_each_segment(bvec, bio, i, 0) {
		if (bio_data_dir(bio) == READ)
			set_page_dirty_lock(bvec->bv_page);

		page_cache_release(bvec->bv_page);
	}

	bio_put(bio);
}

/**
 *	bio_unmap_user	-	unmap a bio
 *	@bio:		the bio being unmapped
 *
 *	Unmap a bio previously mapped by bio_map_user(). Must be called from
 *	process context.
 *
 *	bio_unmap_user() may sleep.
 */
void bio_unmap_user(struct bio *bio)
{
	__bio_unmap_user(bio);
	bio_put(bio);
}

static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
{
	if (bio->bi_size)
		return 1;

	bio_put(bio);
	return 0;
}


static struct bio *__bio_map_kern(request_queue_t *q, void *data,
				  unsigned int len, gfp_t gfp_mask)
{
	unsigned long kaddr = (unsigned long)data;
	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long start = kaddr >> PAGE_SHIFT;
	const int nr_pages = end - start;
	int offset, i;
	struct bio *bio;

	bio = bio_alloc(gfp_mask, nr_pages);
	if (!bio)
		return ERR_PTR(-ENOMEM);

	offset = offset_in_page(kaddr);
	for (i = 0; i < nr_pages; i++) {
		unsigned int bytes = PAGE_SIZE - offset;

		if (len <= 0)
			break;

		if (bytes > len)
			bytes = len;

		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
				    offset) < bytes)
			break;

		data += bytes;
		len -= bytes;
		offset = 0;
	}

	bio->bi_end_io = bio_map_kern_endio;
	return bio;
}

/**
 *	bio_map_kern	-	map kernel address into bio
 *	@q: the request_queue_t for the bio
 *	@data: pointer to buffer to map
 *	@len: length in bytes
 *	@gfp_mask: allocation flags for bio allocation
 *
 *	Map the kernel address into a bio suitable for io to a block
 *	device. Returns an error pointer in case of error.
 */
struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
			 gfp_t gfp_mask)
{
	struct bio *bio;

	bio = __bio_map_kern(q, data, len, gfp_mask);
	if (IS_ERR(bio))
		return bio;

	if (bio->bi_size == len)
		return bio;

	/*
	 * Don't support partial mappings.
	 */
	bio_put(bio);
	return ERR_PTR(-EINVAL);
}

/*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
 *
 * The problem is that we cannot run set_page_dirty() from interrupt context
 * because the required locks are not interrupt-safe.  So what we can do is to
 * mark the pages dirty _before_ performing IO.  And in interrupt context,
 * check that the pages are still dirty.  If so, fine.  If not, redirty them
 * in process context.
 *
 * We special-case compound pages here: normally this means reads into hugetlb
 * pages.  The logic in here doesn't really work right for compound pages
 * because the VM does not uniformly chase down the head page in all cases.
 * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
 * handle them at all.  So we skip compound pages here at an early stage.
 *
 * Note that this code is very hard to test under normal circumstances because
 * direct-io pins the pages with get_user_pages().  This makes
 * is_page_cache_freeable return false, and the VM will not clean the pages.
 * But other code (eg, pdflush) could clean the pages if they are mapped
 * pagecache.
 *
 * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
 * deferred bio dirtying paths.
 */

/*
 * bio_set_pages_dirty() will mark all the bio's pages as dirty.
 */
void bio_set_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page && !PageCompound(page))
			set_page_dirty_lock(page);
	}
}

void bio_release_pages(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (page)
			put_page(page);
	}
}

/*
 * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
 * If they are, then fine.  If, however, some pages are clean then they must
 * have been written out during the direct-IO read.  So we take another ref on
 * the BIO and the offending pages and re-dirty the pages in process context.
 *
 * It is expected that bio_check_pages_dirty() will wholly own the BIO from
 * here on.  It will run one page_cache_release() against each page and will
 * run one bio_put() against the BIO.
 */

static void bio_dirty_fn(struct work_struct *work);

static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
static DEFINE_SPINLOCK(bio_dirty_lock);
static struct bio *bio_dirty_list;

/*
 * This runs in process context
 */
static void bio_dirty_fn(struct work_struct *work)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&bio_dirty_lock, flags);
	bio = bio_dirty_list;
	bio_dirty_list = NULL;
	spin_unlock_irqrestore(&bio_dirty_lock, flags);

	while (bio) {
		struct bio *next = bio->bi_private;

		bio_set_pages_dirty(bio);
		bio_release_pages(bio);
		bio_put(bio);
		bio = next;
	}
}

void bio_check_pages_dirty(struct bio *bio)
{
	struct bio_vec *bvec = bio->bi_io_vec;
	int nr_clean_pages = 0;
	int i;

	for (i = 0; i < bio->bi_vcnt; i++) {
		struct page *page = bvec[i].bv_page;

		if (PageDirty(page) || PageCompound(page)) {
			page_cache_release(page);
			bvec[i].bv_page = NULL;
		} else {
			nr_clean_pages++;
		}
	}

	if (nr_clean_pages) {
		unsigned long flags;

		spin_lock_irqsave(&bio_dirty_lock, flags);
		bio->bi_private = bio_dirty_list;
		bio_dirty_list = bio;
		spin_unlock_irqrestore(&bio_dirty_lock, flags);
		schedule_work(&bio_dirty_work);
	} else {
		bio_put(bio);
	}
}

/**
 * bio_endio - end I/O on a bio
 * @bio:	bio
 * @bytes_done:	number of bytes completed
 * @error:	error, if any
 *
 * Description:
 *   bio_endio() will end I/O on @bytes_done number of bytes. This may be
 *   just part of the bio, or it may be the whole bio. bio_endio() is the
 *   preferred way to end I/O on a bio, it takes care of decrementing
 *   bi_size and clearing BIO_UPTODATE on error. @error is 0 on success
 *   and one of the established -Exxxx (-EIO, for instance) error values
 *   in case something went wrong. No one should call bi_end_io() directly
 *   on a bio unless they own it and thus know that it has an end_io
 *   function.
 **/
void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);

	if (unlikely(bytes_done > bio->bi_size)) {
		printk("%s: want %u bytes done, only %u left\n", __FUNCTION__,
						bytes_done, bio->bi_size);
		bytes_done = bio->bi_size;
	}

	bio->bi_size -= bytes_done;
	bio->bi_sector += (bytes_done >> 9);

	if (bio->bi_end_io)
		bio->bi_end_io(bio, bytes_done, error);
}

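/*
 * Completion handlers in this scheme (see bio_map_kern_endio() above and
 * bio_pair_end_1()/bio_pair_end_2() below) follow the partial-completion
 * convention: return 1 while bi_size is non-zero to say "not done yet",
 * and do the real completion work only once the whole bio has finished.
 * A minimal sketch of such a handler (the name and the use of a completion
 * in bi_private are assumptions, not taken from this file):
 *
 *	static int my_end_io(struct bio *bio, unsigned int bytes_done, int err)
 *	{
 *		if (bio->bi_size)
 *			return 1;
 *
 *		complete(bio->bi_private);
 *		return 0;
 *	}
 */
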
void bio_pair_release(struct bio_pair *bp)
{
	if (atomic_dec_and_test(&bp->cnt)) {
		struct bio *master = bp->bio1.bi_private;

		bio_endio(master, master->bi_size, bp->error);
		mempool_free(bp, bp->bio2.bi_private);
	}
}

static int bio_pair_end_1(struct bio * bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio1);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

static int bio_pair_end_2(struct bio * bi, unsigned int done, int err)
{
	struct bio_pair *bp = container_of(bi, struct bio_pair, bio2);

	if (err)
		bp->error = err;

	if (bi->bi_size)
		return 1;

	bio_pair_release(bp);
	return 0;
}

/*
 * split a bio - only worry about a bio with a single page in its iovec
 */
struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
{
	struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO);

	if (!bp)
		return bp;

	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
				bi->bi_sector + first_sectors);

	BUG_ON(bi->bi_vcnt != 1);
	BUG_ON(bi->bi_idx != 0);
	atomic_set(&bp->cnt, 3);
	bp->error = 0;
	bp->bio1 = *bi;
	bp->bio2 = *bi;
	bp->bio2.bi_sector += first_sectors;
	bp->bio2.bi_size -= first_sectors << 9;
	bp->bio1.bi_size = first_sectors << 9;

	bp->bv1 = bi->bi_io_vec[0];
	bp->bv2 = bi->bi_io_vec[0];
	bp->bv2.bv_offset += first_sectors << 9;
	bp->bv2.bv_len -= first_sectors << 9;
	bp->bv1.bv_len = first_sectors << 9;

	bp->bio1.bi_io_vec = &bp->bv1;
	bp->bio2.bi_io_vec = &bp->bv2;

	bp->bio1.bi_max_vecs = 1;
	bp->bio2.bi_max_vecs = 1;

	bp->bio1.bi_end_io = bio_pair_end_1;
	bp->bio2.bi_end_io = bio_pair_end_2;

	bp->bio1.bi_private = bi;
	bp->bio2.bi_private = pool;

	return bp;
}

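/*
 * How a stacking driver might use bio_split() when a bio straddles a
 * boundary it cannot service in one piece (sketch only; error handling
 * and the boundary calculation are the caller's problem):
 *
 *	struct bio_pair *bp = bio_split(bio, bio_split_pool, first_sectors);
 *
 *	generic_make_request(&bp->bio1);
 *	generic_make_request(&bp->bio2);
 *	bio_pair_release(bp);
 *
 * The original bio is completed from bio_pair_release() once both halves
 * have finished.
 */
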
/*
 * Create memory pools for the biovecs in a bio_set.
 * Use the global biovec slabs created for general use.
 */
static int biovec_create_pools(struct bio_set *bs, int pool_entries)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		struct biovec_slab *bp = bvec_slabs + i;
		mempool_t **bvp = bs->bvec_pools + i;

		*bvp = mempool_create_slab_pool(pool_entries, bp->slab);
		if (!*bvp)
			return -ENOMEM;
	}
	return 0;
}

static void biovec_free_pools(struct bio_set *bs)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		mempool_t *bvp = bs->bvec_pools[i];

		if (bvp)
			mempool_destroy(bvp);
	}

}

void bioset_free(struct bio_set *bs)
{
	if (bs->bio_pool)
		mempool_destroy(bs->bio_pool);

	biovec_free_pools(bs);

	kfree(bs);
}

struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
{
	struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);

	if (!bs)
		return NULL;

	bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
	if (!bs->bio_pool)
		goto bad;

	if (!biovec_create_pools(bs, bvec_pool_size))
		return bs;

bad:
	bioset_free(bs);
	return NULL;
}

static void __init biovec_init_slabs(void)
{
	int i;

	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
		int size;
		struct biovec_slab *bvs = bvec_slabs + i;

		size = bvs->nr_vecs * sizeof(struct bio_vec);
		bvs->slab = kmem_cache_create(bvs->name, size, 0,
						SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
	}
}

static int __init init_bio(void)
{
	bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);

	biovec_init_slabs();

	fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
	if (!fs_bio_set)
		panic("bio: can't allocate bios\n");

	bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
						     sizeof(struct bio_pair));
	if (!bio_split_pool)
		panic("bio: can't create split pool\n");

	return 0;
}

subsys_initcall(init_bio);

EXPORT_SYMBOL(bio_alloc);
EXPORT_SYMBOL(bio_put);
EXPORT_SYMBOL(bio_free);
EXPORT_SYMBOL(bio_endio);
EXPORT_SYMBOL(bio_init);
EXPORT_SYMBOL(__bio_clone);
EXPORT_SYMBOL(bio_clone);
EXPORT_SYMBOL(bio_phys_segments);
EXPORT_SYMBOL(bio_hw_segments);
EXPORT_SYMBOL(bio_add_page);
EXPORT_SYMBOL(bio_add_pc_page);
EXPORT_SYMBOL(bio_get_nr_vecs);
EXPORT_SYMBOL(bio_map_user);
EXPORT_SYMBOL(bio_unmap_user);
EXPORT_SYMBOL(bio_map_kern);
EXPORT_SYMBOL(bio_pair_release);
EXPORT_SYMBOL(bio_split);
EXPORT_SYMBOL(bio_split_pool);
EXPORT_SYMBOL(bio_copy_user);
EXPORT_SYMBOL(bio_uncopy_user);
EXPORT_SYMBOL(bioset_create);
EXPORT_SYMBOL(bioset_free);
EXPORT_SYMBOL(bio_alloc_bioset);