1/* 2 * linux/fs/block_dev.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE 6 */ 7 8#include <linux/init.h> 9#include <linux/mm.h> 10#include <linux/fcntl.h> 11#include <linux/slab.h> 12#include <linux/kmod.h> 13#include <linux/major.h> 14#include <linux/smp_lock.h> 15#include <linux/device_cgroup.h> 16#include <linux/highmem.h> 17#include <linux/blkdev.h> 18#include <linux/module.h> 19#include <linux/blkpg.h> 20#include <linux/buffer_head.h> 21#include <linux/pagevec.h> 22#include <linux/writeback.h> 23#include <linux/mpage.h> 24#include <linux/mount.h> 25#include <linux/uio.h> 26#include <linux/namei.h> 27#include <linux/log2.h> 28#include <linux/kmemleak.h> 29#include <asm/uaccess.h> 30#include "internal.h" 31 32struct bdev_inode { 33 struct block_device bdev; 34 struct inode vfs_inode; 35}; 36 37static const struct address_space_operations def_blk_aops; 38 39static inline struct bdev_inode *BDEV_I(struct inode *inode) 40{ 41 return container_of(inode, struct bdev_inode, vfs_inode); 42} 43 44inline struct block_device *I_BDEV(struct inode *inode) 45{ 46 return &BDEV_I(inode)->bdev; 47} 48 49EXPORT_SYMBOL(I_BDEV); 50 51static sector_t max_block(struct block_device *bdev) 52{ 53 sector_t retval = ~((sector_t)0); 54 loff_t sz = i_size_read(bdev->bd_inode); 55 56 if (sz) { 57 unsigned int size = block_size(bdev); 58 unsigned int sizebits = blksize_bits(size); 59 retval = (sz >> sizebits); 60 } 61 return retval; 62} 63 64/* Kill _all_ buffers and pagecache , dirty or not.. 
*/ 65static void kill_bdev(struct block_device *bdev) 66{ 67 if (bdev->bd_inode->i_mapping->nrpages == 0) 68 return; 69 invalidate_bh_lrus(); 70 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 71} 72 73int set_blocksize(struct block_device *bdev, int size) 74{ 75 /* Size must be a power of two, and between 512 and PAGE_SIZE */ 76 if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) 77 return -EINVAL; 78 79 /* Size cannot be smaller than the size supported by the device */ 80 if (size < bdev_logical_block_size(bdev)) 81 return -EINVAL; 82 83 /* Don't change the size if it is same as current */ 84 if (bdev->bd_block_size != size) { 85 sync_blockdev(bdev); 86 bdev->bd_block_size = size; 87 bdev->bd_inode->i_blkbits = blksize_bits(size); 88 kill_bdev(bdev); 89 } 90 return 0; 91} 92 93EXPORT_SYMBOL(set_blocksize); 94 95int sb_set_blocksize(struct super_block *sb, int size) 96{ 97 if (set_blocksize(sb->s_bdev, size)) 98 return 0; 99 /* If we get here, we know size is power of two 100 * and it's value is between 512 and PAGE_SIZE */ 101 sb->s_blocksize = size; 102 sb->s_blocksize_bits = blksize_bits(size); 103 return sb->s_blocksize; 104} 105 106EXPORT_SYMBOL(sb_set_blocksize); 107 108int sb_min_blocksize(struct super_block *sb, int size) 109{ 110 int minsize = bdev_logical_block_size(sb->s_bdev); 111 if (size < minsize) 112 size = minsize; 113 return sb_set_blocksize(sb, size); 114} 115 116EXPORT_SYMBOL(sb_min_blocksize); 117 118static int 119blkdev_get_block(struct inode *inode, sector_t iblock, 120 struct buffer_head *bh, int create) 121{ 122 if (iblock >= max_block(I_BDEV(inode))) { 123 if (create) 124 return -EIO; 125 126 /* 127 * for reads, we're just trying to fill a partial page. 
128 * return a hole, they will have to call get_block again 129 * before they can fill it, and they will get -EIO at that 130 * time 131 */ 132 return 0; 133 } 134 bh->b_bdev = I_BDEV(inode); 135 bh->b_blocknr = iblock; 136 set_buffer_mapped(bh); 137 return 0; 138} 139 140static int 141blkdev_get_blocks(struct inode *inode, sector_t iblock, 142 struct buffer_head *bh, int create) 143{ 144 sector_t end_block = max_block(I_BDEV(inode)); 145 unsigned long max_blocks = bh->b_size >> inode->i_blkbits; 146 147 if ((iblock + max_blocks) > end_block) { 148 max_blocks = end_block - iblock; 149 if ((long)max_blocks <= 0) { 150 if (create) 151 return -EIO; /* write fully beyond EOF */ 152 /* 153 * It is a read which is fully beyond EOF. We return 154 * a !buffer_mapped buffer 155 */ 156 max_blocks = 0; 157 } 158 } 159 160 bh->b_bdev = I_BDEV(inode); 161 bh->b_blocknr = iblock; 162 bh->b_size = max_blocks << inode->i_blkbits; 163 if (max_blocks) 164 set_buffer_mapped(bh); 165 return 0; 166} 167 168static ssize_t 169blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 170 loff_t offset, unsigned long nr_segs) 171{ 172 struct file *file = iocb->ki_filp; 173 struct inode *inode = file->f_mapping->host; 174 175 return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset, 176 nr_segs, blkdev_get_blocks, NULL, NULL, 0); 177} 178 179int __sync_blockdev(struct block_device *bdev, int wait) 180{ 181 if (!bdev) 182 return 0; 183 if (!wait) 184 return filemap_flush(bdev->bd_inode->i_mapping); 185 return filemap_write_and_wait(bdev->bd_inode->i_mapping); 186} 187 188/* 189 * Write out and wait upon all the dirty data associated with a block 190 * device via its mapping. Does not take the superblock lock. 191 */ 192int sync_blockdev(struct block_device *bdev) 193{ 194 return __sync_blockdev(bdev, 1); 195} 196EXPORT_SYMBOL(sync_blockdev); 197 198/* 199 * Write out and wait upon all dirty data associated with this 200 * device. 
Filesystem data as well as the underlying block 201 * device. Takes the superblock lock. 202 */ 203int fsync_bdev(struct block_device *bdev) 204{ 205 struct super_block *sb = get_super(bdev); 206 if (sb) { 207 int res = sync_filesystem(sb); 208 drop_super(sb); 209 return res; 210 } 211 return sync_blockdev(bdev); 212} 213EXPORT_SYMBOL(fsync_bdev); 214 215/** 216 * freeze_bdev -- lock a filesystem and force it into a consistent state 217 * @bdev: blockdevice to lock 218 * 219 * If a superblock is found on this device, we take the s_umount semaphore 220 * on it to make sure nobody unmounts until the snapshot creation is done. 221 * The reference counter (bd_fsfreeze_count) guarantees that only the last 222 * unfreeze process can unfreeze the frozen filesystem actually when multiple 223 * freeze requests arrive simultaneously. It counts up in freeze_bdev() and 224 * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze 225 * actually. 226 */ 227struct super_block *freeze_bdev(struct block_device *bdev) 228{ 229 struct super_block *sb; 230 int error = 0; 231 232 mutex_lock(&bdev->bd_fsfreeze_mutex); 233 if (++bdev->bd_fsfreeze_count > 1) { 234 /* 235 * We don't even need to grab a reference - the first call 236 * to freeze_bdev grab an active reference and only the last 237 * thaw_bdev drops it. 
238 */ 239 sb = get_super(bdev); 240 drop_super(sb); 241 mutex_unlock(&bdev->bd_fsfreeze_mutex); 242 return sb; 243 } 244 245 sb = get_active_super(bdev); 246 if (!sb) 247 goto out; 248 error = freeze_super(sb); 249 if (error) { 250 deactivate_super(sb); 251 bdev->bd_fsfreeze_count--; 252 mutex_unlock(&bdev->bd_fsfreeze_mutex); 253 return ERR_PTR(error); 254 } 255 deactivate_super(sb); 256 out: 257 sync_blockdev(bdev); 258 mutex_unlock(&bdev->bd_fsfreeze_mutex); 259 return sb; /* thaw_bdev releases s->s_umount */ 260} 261EXPORT_SYMBOL(freeze_bdev); 262 263/** 264 * thaw_bdev -- unlock filesystem 265 * @bdev: blockdevice to unlock 266 * @sb: associated superblock 267 * 268 * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 269 */ 270int thaw_bdev(struct block_device *bdev, struct super_block *sb) 271{ 272 int error = -EINVAL; 273 274 mutex_lock(&bdev->bd_fsfreeze_mutex); 275 if (!bdev->bd_fsfreeze_count) 276 goto out; 277 278 error = 0; 279 if (--bdev->bd_fsfreeze_count > 0) 280 goto out; 281 282 if (!sb) 283 goto out; 284 285 error = thaw_super(sb); 286 if (error) { 287 bdev->bd_fsfreeze_count++; 288 mutex_unlock(&bdev->bd_fsfreeze_mutex); 289 return error; 290 } 291out: 292 mutex_unlock(&bdev->bd_fsfreeze_mutex); 293 return 0; 294} 295EXPORT_SYMBOL(thaw_bdev); 296 297static int blkdev_writepage(struct page *page, struct writeback_control *wbc) 298{ 299 return block_write_full_page(page, blkdev_get_block, wbc); 300} 301 302static int blkdev_readpage(struct file * file, struct page * page) 303{ 304 return block_read_full_page(page, blkdev_get_block); 305} 306 307static int blkdev_write_begin(struct file *file, struct address_space *mapping, 308 loff_t pos, unsigned len, unsigned flags, 309 struct page **pagep, void **fsdata) 310{ 311 return block_write_begin(mapping, pos, len, flags, pagep, 312 blkdev_get_block); 313} 314 315static int blkdev_write_end(struct file *file, struct address_space *mapping, 316 loff_t pos, unsigned len, unsigned 
copied, 317 struct page *page, void *fsdata) 318{ 319 int ret; 320 ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); 321 322 unlock_page(page); 323 page_cache_release(page); 324 325 return ret; 326} 327 328/* 329 * private llseek: 330 * for a block special file file->f_path.dentry->d_inode->i_size is zero 331 * so we compute the size by hand (just as in block_read/write above) 332 */ 333static loff_t block_llseek(struct file *file, loff_t offset, int origin) 334{ 335 struct inode *bd_inode = file->f_mapping->host; 336 loff_t size; 337 loff_t retval; 338 339 mutex_lock(&bd_inode->i_mutex); 340 size = i_size_read(bd_inode); 341 342 switch (origin) { 343 case 2: 344 offset += size; 345 break; 346 case 1: 347 offset += file->f_pos; 348 } 349 retval = -EINVAL; 350 if (offset >= 0 && offset <= size) { 351 if (offset != file->f_pos) { 352 file->f_pos = offset; 353 } 354 retval = offset; 355 } 356 mutex_unlock(&bd_inode->i_mutex); 357 return retval; 358} 359 360int blkdev_fsync(struct file *filp, int datasync) 361{ 362 struct inode *bd_inode = filp->f_mapping->host; 363 struct block_device *bdev = I_BDEV(bd_inode); 364 int error; 365 366 /* 367 * There is no need to serialise calls to blkdev_issue_flush with 368 * i_mutex and doing so causes performance issues with concurrent 369 * O_SYNC writers to a block device. 
370 */ 371 mutex_unlock(&bd_inode->i_mutex); 372 373 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT); 374 if (error == -EOPNOTSUPP) 375 error = 0; 376 377 mutex_lock(&bd_inode->i_mutex); 378 379 return error; 380} 381EXPORT_SYMBOL(blkdev_fsync); 382 383/* 384 * pseudo-fs 385 */ 386 387static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); 388static struct kmem_cache * bdev_cachep __read_mostly; 389 390static struct inode *bdev_alloc_inode(struct super_block *sb) 391{ 392 struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); 393 if (!ei) 394 return NULL; 395 return &ei->vfs_inode; 396} 397 398static void bdev_destroy_inode(struct inode *inode) 399{ 400 struct bdev_inode *bdi = BDEV_I(inode); 401 402 kmem_cache_free(bdev_cachep, bdi); 403} 404 405static void init_once(void *foo) 406{ 407 struct bdev_inode *ei = (struct bdev_inode *) foo; 408 struct block_device *bdev = &ei->bdev; 409 410 memset(bdev, 0, sizeof(*bdev)); 411 mutex_init(&bdev->bd_mutex); 412 INIT_LIST_HEAD(&bdev->bd_inodes); 413 INIT_LIST_HEAD(&bdev->bd_list); 414#ifdef CONFIG_SYSFS 415 INIT_LIST_HEAD(&bdev->bd_holder_list); 416#endif 417 inode_init_once(&ei->vfs_inode); 418 /* Initialize mutex for freeze. */ 419 mutex_init(&bdev->bd_fsfreeze_mutex); 420} 421 422static inline void __bd_forget(struct inode *inode) 423{ 424 list_del_init(&inode->i_devices); 425 inode->i_bdev = NULL; 426 inode->i_mapping = &inode->i_data; 427} 428 429static void bdev_evict_inode(struct inode *inode) 430{ 431 struct block_device *bdev = &BDEV_I(inode)->bdev; 432 struct list_head *p; 433 truncate_inode_pages(&inode->i_data, 0); 434 invalidate_inode_buffers(inode); /* is it needed here? 
*/ 435 end_writeback(inode); 436 spin_lock(&bdev_lock); 437 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) { 438 __bd_forget(list_entry(p, struct inode, i_devices)); 439 } 440 list_del_init(&bdev->bd_list); 441 spin_unlock(&bdev_lock); 442} 443 444static const struct super_operations bdev_sops = { 445 .statfs = simple_statfs, 446 .alloc_inode = bdev_alloc_inode, 447 .destroy_inode = bdev_destroy_inode, 448 .drop_inode = generic_delete_inode, 449 .evict_inode = bdev_evict_inode, 450}; 451 452static int bd_get_sb(struct file_system_type *fs_type, 453 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 454{ 455 return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); 456} 457 458static struct file_system_type bd_type = { 459 .name = "bdev", 460 .get_sb = bd_get_sb, 461 .kill_sb = kill_anon_super, 462}; 463 464struct super_block *blockdev_superblock __read_mostly; 465 466void __init bdev_cache_init(void) 467{ 468 int err; 469 struct vfsmount *bd_mnt; 470 471 bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 472 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| 473 SLAB_MEM_SPREAD|SLAB_PANIC), 474 init_once); 475 err = register_filesystem(&bd_type); 476 if (err) 477 panic("Cannot register bdev pseudo-fs"); 478 bd_mnt = kern_mount(&bd_type); 479 if (IS_ERR(bd_mnt)) 480 panic("Cannot create bdev pseudo-fs"); 481 /* 482 * This vfsmount structure is only used to obtain the 483 * blockdev_superblock, so tell kmemleak not to report it. 484 */ 485 kmemleak_not_leak(bd_mnt); 486 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ 487} 488 489/* 490 * Most likely _very_ bad one - but then it's hardly critical for small 491 * /dev and can be fixed when somebody will need really large one. 492 * Keep in mind that it will be fed through icache hash function too. 
493 */ 494static inline unsigned long hash(dev_t dev) 495{ 496 return MAJOR(dev)+MINOR(dev); 497} 498 499static int bdev_test(struct inode *inode, void *data) 500{ 501 return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; 502} 503 504static int bdev_set(struct inode *inode, void *data) 505{ 506 BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; 507 return 0; 508} 509 510static LIST_HEAD(all_bdevs); 511 512struct block_device *bdget(dev_t dev) 513{ 514 struct block_device *bdev; 515 struct inode *inode; 516 517 inode = iget5_locked(blockdev_superblock, hash(dev), 518 bdev_test, bdev_set, &dev); 519 520 if (!inode) 521 return NULL; 522 523 bdev = &BDEV_I(inode)->bdev; 524 525 if (inode->i_state & I_NEW) { 526 bdev->bd_contains = NULL; 527 bdev->bd_inode = inode; 528 bdev->bd_block_size = (1 << inode->i_blkbits); 529 bdev->bd_part_count = 0; 530 bdev->bd_invalidated = 0; 531 inode->i_mode = S_IFBLK; 532 inode->i_rdev = dev; 533 inode->i_bdev = bdev; 534 inode->i_data.a_ops = &def_blk_aops; 535 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 536 inode->i_data.backing_dev_info = &default_backing_dev_info; 537 spin_lock(&bdev_lock); 538 list_add(&bdev->bd_list, &all_bdevs); 539 spin_unlock(&bdev_lock); 540 unlock_new_inode(inode); 541 } 542 return bdev; 543} 544 545EXPORT_SYMBOL(bdget); 546 547/** 548 * bdgrab -- Grab a reference to an already referenced block device 549 * @bdev: Block device to grab a reference to. 
550 */ 551struct block_device *bdgrab(struct block_device *bdev) 552{ 553 atomic_inc(&bdev->bd_inode->i_count); 554 return bdev; 555} 556 557long nr_blockdev_pages(void) 558{ 559 struct block_device *bdev; 560 long ret = 0; 561 spin_lock(&bdev_lock); 562 list_for_each_entry(bdev, &all_bdevs, bd_list) { 563 ret += bdev->bd_inode->i_mapping->nrpages; 564 } 565 spin_unlock(&bdev_lock); 566 return ret; 567} 568 569void bdput(struct block_device *bdev) 570{ 571 iput(bdev->bd_inode); 572} 573 574EXPORT_SYMBOL(bdput); 575 576static struct block_device *bd_acquire(struct inode *inode) 577{ 578 struct block_device *bdev; 579 580 spin_lock(&bdev_lock); 581 bdev = inode->i_bdev; 582 if (bdev) { 583 atomic_inc(&bdev->bd_inode->i_count); 584 spin_unlock(&bdev_lock); 585 return bdev; 586 } 587 spin_unlock(&bdev_lock); 588 589 bdev = bdget(inode->i_rdev); 590 if (bdev) { 591 spin_lock(&bdev_lock); 592 if (!inode->i_bdev) { 593 /* 594 * We take an additional bd_inode->i_count for inode, 595 * and it's released in clear_inode() of inode. 596 * So, we can access it via ->i_mapping always 597 * without igrab(). 598 */ 599 atomic_inc(&bdev->bd_inode->i_count); 600 inode->i_bdev = bdev; 601 inode->i_mapping = bdev->bd_inode->i_mapping; 602 list_add(&inode->i_devices, &bdev->bd_inodes); 603 } 604 spin_unlock(&bdev_lock); 605 } 606 return bdev; 607} 608 609/* Call when you free inode */ 610 611void bd_forget(struct inode *inode) 612{ 613 struct block_device *bdev = NULL; 614 615 spin_lock(&bdev_lock); 616 if (inode->i_bdev) { 617 if (!sb_is_blkdev_sb(inode->i_sb)) 618 bdev = inode->i_bdev; 619 __bd_forget(inode); 620 } 621 spin_unlock(&bdev_lock); 622 623 if (bdev) 624 iput(bdev->bd_inode); 625} 626 627/** 628 * bd_may_claim - test whether a block device can be claimed 629 * @bdev: block device of interest 630 * @whole: whole block device containing @bdev, may equal @bdev 631 * @holder: holder trying to claim @bdev 632 * 633 * Test whther @bdev can be claimed by @holder. 
634 * 635 * CONTEXT: 636 * spin_lock(&bdev_lock). 637 * 638 * RETURNS: 639 * %true if @bdev can be claimed, %false otherwise. 640 */ 641static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, 642 void *holder) 643{ 644 if (bdev->bd_holder == holder) 645 return true; /* already a holder */ 646 else if (bdev->bd_holder != NULL) 647 return false; /* held by someone else */ 648 else if (bdev->bd_contains == bdev) 649 return true; /* is a whole device which isn't held */ 650 651 else if (whole->bd_holder == bd_claim) 652 return true; /* is a partition of a device that is being partitioned */ 653 else if (whole->bd_holder != NULL) 654 return false; /* is a partition of a held device */ 655 else 656 return true; /* is a partition of an un-held device */ 657} 658 659/** 660 * bd_prepare_to_claim - prepare to claim a block device 661 * @bdev: block device of interest 662 * @whole: the whole device containing @bdev, may equal @bdev 663 * @holder: holder trying to claim @bdev 664 * 665 * Prepare to claim @bdev. This function fails if @bdev is already 666 * claimed by another holder and waits if another claiming is in 667 * progress. This function doesn't actually claim. On successful 668 * return, the caller has ownership of bd_claiming and bd_holder[s]. 669 * 670 * CONTEXT: 671 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab 672 * it multiple times. 673 * 674 * RETURNS: 675 * 0 if @bdev can be claimed, -EBUSY otherwise. 
 */
static int bd_prepare_to_claim(struct block_device *bdev,
			       struct block_device *whole, void *holder)
{
retry:
	/* if someone else claimed, fail */
	if (!bd_may_claim(bdev, whole, holder))
		return -EBUSY;

	/* if claiming is already in progress, wait for it to finish */
	if (whole->bd_claiming) {
		wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
		DEFINE_WAIT(wait);

		/*
		 * Queue ourselves before dropping bdev_lock, sleep, and
		 * re-evaluate everything from the top once woken: the
		 * claim state may have changed while the lock was released.
		 */
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&bdev_lock);
		schedule();
		finish_wait(wq, &wait);
		spin_lock(&bdev_lock);
		goto retry;
	}

	/* yay, all mine */
	return 0;
}

/**
 * bd_start_claiming - start claiming a block device
 * @bdev: block device of interest
 * @holder: holder trying to claim @bdev
 *
 * @bdev is about to be opened exclusively.  Check @bdev can be opened
 * exclusively and mark that an exclusive open is in progress.  Each
 * successful call to this function must be matched with a call to
 * either bd_finish_claiming() or bd_abort_claiming() (which do not
 * fail).
 *
 * This function is used to gain exclusive access to the block device
 * without actually causing other exclusive open attempts to fail. It
 * should be used when the open sequence itself requires exclusive
 * access but may subsequently fail.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * Pointer to the block device containing @bdev on success, ERR_PTR()
 * value on failure.
 */
static struct block_device *bd_start_claiming(struct block_device *bdev,
					      void *holder)
{
	struct gendisk *disk;
	struct block_device *whole;
	int partno, err;

	might_sleep();

	/*
	 * @bdev might not have been initialized properly yet, look up
	 * and grab the outer block device the hard way.
	 */
	disk = get_gendisk(bdev->bd_dev, &partno);
	if (!disk)
		return ERR_PTR(-ENXIO);

	whole = bdget_disk(disk, 0);
	/* NOTE(review): presumably releases what get_gendisk() took - confirm. */
	module_put(disk->fops->owner);
	put_disk(disk);
	if (!whole)
		return ERR_PTR(-ENOMEM);

	/* prepare to claim, if successful, mark claiming in progress */
	spin_lock(&bdev_lock);

	err = bd_prepare_to_claim(bdev, whole, holder);
	if (err == 0) {
		/* The reference on @whole is kept and returned to the caller. */
		whole->bd_claiming = holder;
		spin_unlock(&bdev_lock);
		return whole;
	} else {
		spin_unlock(&bdev_lock);
		bdput(whole);
		return ERR_PTR(err);
	}
}

/*
 * Clear the claiming-in-progress marker on @whole, wake anybody waiting
 * on it in bd_prepare_to_claim(), and drop the reference on @whole.
 * Must be entered with bdev_lock held.
 */
/* releases bdev_lock */
static void __bd_abort_claiming(struct block_device *whole, void *holder)
{
	BUG_ON(whole->bd_claiming != holder);
	whole->bd_claiming = NULL;
	wake_up_bit(&whole->bd_claiming, 0);

	spin_unlock(&bdev_lock);
	bdput(whole);
}

/**
 * bd_abort_claiming - abort claiming a block device
 * @whole: whole block device returned by bd_start_claiming()
 * @holder: holder trying to claim @bdev
 *
 * Abort a claiming block started by bd_start_claiming().  Note that
 * @whole is not the block device to be claimed but the whole device
 * returned by bd_start_claiming().
 *
 * CONTEXT:
 * Grabs and releases bdev_lock.
 */
static void bd_abort_claiming(struct block_device *whole, void *holder)
{
	spin_lock(&bdev_lock);
	__bd_abort_claiming(whole, holder);		/* releases bdev_lock */
}

/* increment holders when we have a legitimate claim.
requires bdev_lock */
static void __bd_claim(struct block_device *bdev, struct block_device *whole,
		       void *holder)
{
	/* note that for a whole device bd_holders
	 * will be incremented twice, and bd_holder will
	 * be set to bd_claim before being set to holder
	 */
	whole->bd_holders++;
	whole->bd_holder = bd_claim;
	/* Must come second so that @holder wins when @whole == @bdev. */
	bdev->bd_holders++;
	bdev->bd_holder = holder;
}

/**
 * bd_finish_claiming - finish claiming a block device
 * @bdev: block device of interest (passed to bd_start_claiming())
 * @whole: whole block device returned by bd_start_claiming()
 * @holder: holder trying to claim @bdev
 *
 * Finish a claiming block started by bd_start_claiming().
 *
 * CONTEXT:
 * Grabs and releases bdev_lock.
 */
static void bd_finish_claiming(struct block_device *bdev,
				struct block_device *whole, void *holder)
{
	spin_lock(&bdev_lock);
	BUG_ON(!bd_may_claim(bdev, whole, holder));
	__bd_claim(bdev, whole, holder);
	/*
	 * Reused only for its tail: clears bd_claiming, wakes waiters,
	 * releases bdev_lock and puts @whole.
	 */
	__bd_abort_claiming(whole, holder); /* not actually an abort */
}

/**
 * bd_claim - claim a block device
 * @bdev: block device to claim
 * @holder: holder trying to claim @bdev
 *
 * Try to claim @bdev which must have been opened successfully.
 *
 * CONTEXT:
 * Might sleep.
 *
 * RETURNS:
 * 0 if successful, -EBUSY if @bdev is already claimed.
838 */ 839int bd_claim(struct block_device *bdev, void *holder) 840{ 841 struct block_device *whole = bdev->bd_contains; 842 int res; 843 844 might_sleep(); 845 846 spin_lock(&bdev_lock); 847 res = bd_prepare_to_claim(bdev, whole, holder); 848 if (res == 0) 849 __bd_claim(bdev, whole, holder); 850 spin_unlock(&bdev_lock); 851 852 return res; 853} 854EXPORT_SYMBOL(bd_claim); 855 856void bd_release(struct block_device *bdev) 857{ 858 spin_lock(&bdev_lock); 859 if (!--bdev->bd_contains->bd_holders) 860 bdev->bd_contains->bd_holder = NULL; 861 if (!--bdev->bd_holders) 862 bdev->bd_holder = NULL; 863 spin_unlock(&bdev_lock); 864} 865 866EXPORT_SYMBOL(bd_release); 867 868#ifdef CONFIG_SYSFS 869/* 870 * Functions for bd_claim_by_kobject / bd_release_from_kobject 871 * 872 * If a kobject is passed to bd_claim_by_kobject() 873 * and the kobject has a parent directory, 874 * following symlinks are created: 875 * o from the kobject to the claimed bdev 876 * o from "holders" directory of the bdev to the parent of the kobject 877 * bd_release_from_kobject() removes these symlinks. 878 * 879 * Example: 880 * If /dev/dm-0 maps to /dev/sda, kobject corresponding to 881 * /sys/block/dm-0/slaves is passed to bd_claim_by_kobject(), then: 882 * /sys/block/dm-0/slaves/sda --> /sys/block/sda 883 * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 884 */ 885 886static int add_symlink(struct kobject *from, struct kobject *to) 887{ 888 if (!from || !to) 889 return 0; 890 return sysfs_create_link(from, to, kobject_name(to)); 891} 892 893static void del_symlink(struct kobject *from, struct kobject *to) 894{ 895 if (!from || !to) 896 return; 897 sysfs_remove_link(from, kobject_name(to)); 898} 899 900/* 901 * 'struct bd_holder' contains pointers to kobjects symlinked by 902 * bd_claim_by_kobject. 903 * It's connected to bd_holder_list which is protected by bdev->bd_sem. 
904 */ 905struct bd_holder { 906 struct list_head list; /* chain of holders of the bdev */ 907 int count; /* references from the holder */ 908 struct kobject *sdir; /* holder object, e.g. "/block/dm-0/slaves" */ 909 struct kobject *hdev; /* e.g. "/block/dm-0" */ 910 struct kobject *hdir; /* e.g. "/block/sda/holders" */ 911 struct kobject *sdev; /* e.g. "/block/sda" */ 912}; 913 914/* 915 * Get references of related kobjects at once. 916 * Returns 1 on success. 0 on failure. 917 * 918 * Should call bd_holder_release_dirs() after successful use. 919 */ 920static int bd_holder_grab_dirs(struct block_device *bdev, 921 struct bd_holder *bo) 922{ 923 if (!bdev || !bo) 924 return 0; 925 926 bo->sdir = kobject_get(bo->sdir); 927 if (!bo->sdir) 928 return 0; 929 930 bo->hdev = kobject_get(bo->sdir->parent); 931 if (!bo->hdev) 932 goto fail_put_sdir; 933 934 bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); 935 if (!bo->sdev) 936 goto fail_put_hdev; 937 938 bo->hdir = kobject_get(bdev->bd_part->holder_dir); 939 if (!bo->hdir) 940 goto fail_put_sdev; 941 942 return 1; 943 944fail_put_sdev: 945 kobject_put(bo->sdev); 946fail_put_hdev: 947 kobject_put(bo->hdev); 948fail_put_sdir: 949 kobject_put(bo->sdir); 950 951 return 0; 952} 953 954/* Put references of related kobjects at once. 
*/ 955static void bd_holder_release_dirs(struct bd_holder *bo) 956{ 957 kobject_put(bo->hdir); 958 kobject_put(bo->sdev); 959 kobject_put(bo->hdev); 960 kobject_put(bo->sdir); 961} 962 963static struct bd_holder *alloc_bd_holder(struct kobject *kobj) 964{ 965 struct bd_holder *bo; 966 967 bo = kzalloc(sizeof(*bo), GFP_KERNEL); 968 if (!bo) 969 return NULL; 970 971 bo->count = 1; 972 bo->sdir = kobj; 973 974 return bo; 975} 976 977static void free_bd_holder(struct bd_holder *bo) 978{ 979 kfree(bo); 980} 981 982/** 983 * find_bd_holder - find matching struct bd_holder from the block device 984 * 985 * @bdev: struct block device to be searched 986 * @bo: target struct bd_holder 987 * 988 * Returns matching entry with @bo in @bdev->bd_holder_list. 989 * If found, increment the reference count and return the pointer. 990 * If not found, returns NULL. 991 */ 992static struct bd_holder *find_bd_holder(struct block_device *bdev, 993 struct bd_holder *bo) 994{ 995 struct bd_holder *tmp; 996 997 list_for_each_entry(tmp, &bdev->bd_holder_list, list) 998 if (tmp->sdir == bo->sdir) { 999 tmp->count++; 1000 return tmp; 1001 } 1002 1003 return NULL; 1004} 1005 1006/** 1007 * add_bd_holder - create sysfs symlinks for bd_claim() relationship 1008 * 1009 * @bdev: block device to be bd_claimed 1010 * @bo: preallocated and initialized by alloc_bd_holder() 1011 * 1012 * Add @bo to @bdev->bd_holder_list, create symlinks. 1013 * 1014 * Returns 0 if symlinks are created. 1015 * Returns -ve if something fails. 
1016 */ 1017static int add_bd_holder(struct block_device *bdev, struct bd_holder *bo) 1018{ 1019 int err; 1020 1021 if (!bo) 1022 return -EINVAL; 1023 1024 if (!bd_holder_grab_dirs(bdev, bo)) 1025 return -EBUSY; 1026 1027 err = add_symlink(bo->sdir, bo->sdev); 1028 if (err) 1029 return err; 1030 1031 err = add_symlink(bo->hdir, bo->hdev); 1032 if (err) { 1033 del_symlink(bo->sdir, bo->sdev); 1034 return err; 1035 } 1036 1037 list_add_tail(&bo->list, &bdev->bd_holder_list); 1038 return 0; 1039} 1040 1041/** 1042 * del_bd_holder - delete sysfs symlinks for bd_claim() relationship 1043 * 1044 * @bdev: block device to be bd_claimed 1045 * @kobj: holder's kobject 1046 * 1047 * If there is matching entry with @kobj in @bdev->bd_holder_list 1048 * and no other bd_claim() from the same kobject, 1049 * remove the struct bd_holder from the list, delete symlinks for it. 1050 * 1051 * Returns a pointer to the struct bd_holder when it's removed from the list 1052 * and ready to be freed. 1053 * Returns NULL if matching claim isn't found or there is other bd_claim() 1054 * by the same kobject. 1055 */ 1056static struct bd_holder *del_bd_holder(struct block_device *bdev, 1057 struct kobject *kobj) 1058{ 1059 struct bd_holder *bo; 1060 1061 list_for_each_entry(bo, &bdev->bd_holder_list, list) { 1062 if (bo->sdir == kobj) { 1063 bo->count--; 1064 BUG_ON(bo->count < 0); 1065 if (!bo->count) { 1066 list_del(&bo->list); 1067 del_symlink(bo->sdir, bo->sdev); 1068 del_symlink(bo->hdir, bo->hdev); 1069 bd_holder_release_dirs(bo); 1070 return bo; 1071 } 1072 break; 1073 } 1074 } 1075 1076 return NULL; 1077} 1078 1079/** 1080 * bd_claim_by_kobject - bd_claim() with additional kobject signature 1081 * 1082 * @bdev: block device to be claimed 1083 * @holder: holder's signature 1084 * @kobj: holder's kobject 1085 * 1086 * Do bd_claim() and if it succeeds, create sysfs symlinks between 1087 * the bdev and the holder's kobject. 
1088 * Use bd_release_from_kobject() when relesing the claimed bdev. 1089 * 1090 * Returns 0 on success. (same as bd_claim()) 1091 * Returns errno on failure. 1092 */ 1093static int bd_claim_by_kobject(struct block_device *bdev, void *holder, 1094 struct kobject *kobj) 1095{ 1096 int err; 1097 struct bd_holder *bo, *found; 1098 1099 if (!kobj) 1100 return -EINVAL; 1101 1102 bo = alloc_bd_holder(kobj); 1103 if (!bo) 1104 return -ENOMEM; 1105 1106 mutex_lock(&bdev->bd_mutex); 1107 1108 err = bd_claim(bdev, holder); 1109 if (err) 1110 goto fail; 1111 1112 found = find_bd_holder(bdev, bo); 1113 if (found) 1114 goto fail; 1115 1116 err = add_bd_holder(bdev, bo); 1117 if (err) 1118 bd_release(bdev); 1119 else 1120 bo = NULL; 1121fail: 1122 mutex_unlock(&bdev->bd_mutex); 1123 free_bd_holder(bo); 1124 return err; 1125} 1126 1127/** 1128 * bd_release_from_kobject - bd_release() with additional kobject signature 1129 * 1130 * @bdev: block device to be released 1131 * @kobj: holder's kobject 1132 * 1133 * Do bd_release() and remove sysfs symlinks created by bd_claim_by_kobject(). 1134 */ 1135static void bd_release_from_kobject(struct block_device *bdev, 1136 struct kobject *kobj) 1137{ 1138 if (!kobj) 1139 return; 1140 1141 mutex_lock(&bdev->bd_mutex); 1142 bd_release(bdev); 1143 free_bd_holder(del_bd_holder(bdev, kobj)); 1144 mutex_unlock(&bdev->bd_mutex); 1145} 1146 1147/** 1148 * bd_claim_by_disk - wrapper function for bd_claim_by_kobject() 1149 * 1150 * @bdev: block device to be claimed 1151 * @holder: holder's signature 1152 * @disk: holder's gendisk 1153 * 1154 * Call bd_claim_by_kobject() with getting @disk->slave_dir. 
1155 */ 1156int bd_claim_by_disk(struct block_device *bdev, void *holder, 1157 struct gendisk *disk) 1158{ 1159 return bd_claim_by_kobject(bdev, holder, kobject_get(disk->slave_dir)); 1160} 1161EXPORT_SYMBOL_GPL(bd_claim_by_disk); 1162 1163/** 1164 * bd_release_from_disk - wrapper function for bd_release_from_kobject() 1165 * 1166 * @bdev: block device to be claimed 1167 * @disk: holder's gendisk 1168 * 1169 * Call bd_release_from_kobject() and put @disk->slave_dir. 1170 */ 1171void bd_release_from_disk(struct block_device *bdev, struct gendisk *disk) 1172{ 1173 bd_release_from_kobject(bdev, disk->slave_dir); 1174 kobject_put(disk->slave_dir); 1175} 1176EXPORT_SYMBOL_GPL(bd_release_from_disk); 1177#endif 1178 1179/* 1180 * Tries to open block device by device number. Use it ONLY if you 1181 * really do not have anything better - i.e. when you are behind a 1182 * truly sucky interface and all you are given is a device number. _Never_ 1183 * to be used for internal purposes. If you ever need it - reconsider 1184 * your API. 1185 */ 1186struct block_device *open_by_devnum(dev_t dev, fmode_t mode) 1187{ 1188 struct block_device *bdev = bdget(dev); 1189 int err = -ENOMEM; 1190 if (bdev) 1191 err = blkdev_get(bdev, mode); 1192 return err ? ERR_PTR(err) : bdev; 1193} 1194 1195EXPORT_SYMBOL(open_by_devnum); 1196 1197/** 1198 * flush_disk - invalidates all buffer-cache entries on a disk 1199 * 1200 * @bdev: struct block device to be flushed 1201 * 1202 * Invalidates all buffer-cache entries on a disk. It should be called 1203 * when a disk has been changed -- either by a media change or online 1204 * resize. 
1205 */ 1206static void flush_disk(struct block_device *bdev) 1207{ 1208 if (__invalidate_device(bdev)) { 1209 char name[BDEVNAME_SIZE] = ""; 1210 1211 if (bdev->bd_disk) 1212 disk_name(bdev->bd_disk, 0, name); 1213 printk(KERN_WARNING "VFS: busy inodes on changed media or " 1214 "resized disk %s\n", name); 1215 } 1216 1217 if (!bdev->bd_disk) 1218 return; 1219 if (disk_partitionable(bdev->bd_disk)) 1220 bdev->bd_invalidated = 1; 1221} 1222 1223/** 1224 * check_disk_size_change - checks for disk size change and adjusts bdev size. 1225 * @disk: struct gendisk to check 1226 * @bdev: struct bdev to adjust. 1227 * 1228 * This routine checks to see if the bdev size does not match the disk size 1229 * and adjusts it if it differs. 1230 */ 1231void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) 1232{ 1233 loff_t disk_size, bdev_size; 1234 1235 disk_size = (loff_t)get_capacity(disk) << 9; 1236 bdev_size = i_size_read(bdev->bd_inode); 1237 if (disk_size != bdev_size) { 1238 char name[BDEVNAME_SIZE]; 1239 1240 disk_name(disk, 0, name); 1241 printk(KERN_INFO 1242 "%s: detected capacity change from %lld to %lld\n", 1243 name, bdev_size, disk_size); 1244 i_size_write(bdev->bd_inode, disk_size); 1245 flush_disk(bdev); 1246 } 1247} 1248EXPORT_SYMBOL(check_disk_size_change); 1249 1250/** 1251 * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back 1252 * @disk: struct gendisk to be revalidated 1253 * 1254 * This routine is a wrapper for lower-level driver's revalidate_disk 1255 * call-backs. It is used to do common pre and post operations needed 1256 * for all revalidate_disk operations. 
1257 */ 1258int revalidate_disk(struct gendisk *disk) 1259{ 1260 struct block_device *bdev; 1261 int ret = 0; 1262 1263 if (disk->fops->revalidate_disk) 1264 ret = disk->fops->revalidate_disk(disk); 1265 1266 bdev = bdget_disk(disk, 0); 1267 if (!bdev) 1268 return ret; 1269 1270 mutex_lock(&bdev->bd_mutex); 1271 check_disk_size_change(disk, bdev); 1272 mutex_unlock(&bdev->bd_mutex); 1273 bdput(bdev); 1274 return ret; 1275} 1276EXPORT_SYMBOL(revalidate_disk); 1277 1278/* 1279 * This routine checks whether a removable media has been changed, 1280 * and invalidates all buffer-cache-entries in that case. This 1281 * is a relatively slow routine, so we have to try to minimize using 1282 * it. Thus it is called only upon a 'mount' or 'open'. This 1283 * is the best way of combining speed and utility, I think. 1284 * People changing diskettes in the middle of an operation deserve 1285 * to lose :-) 1286 */ 1287int check_disk_change(struct block_device *bdev) 1288{ 1289 struct gendisk *disk = bdev->bd_disk; 1290 const struct block_device_operations *bdops = disk->fops; 1291 1292 if (!bdops->media_changed) 1293 return 0; 1294 if (!bdops->media_changed(bdev->bd_disk)) 1295 return 0; 1296 1297 flush_disk(bdev); 1298 if (bdops->revalidate_disk) 1299 bdops->revalidate_disk(bdev->bd_disk); 1300 return 1; 1301} 1302 1303EXPORT_SYMBOL(check_disk_change); 1304 1305void bd_set_size(struct block_device *bdev, loff_t size) 1306{ 1307 unsigned bsize = bdev_logical_block_size(bdev); 1308 1309 bdev->bd_inode->i_size = size; 1310 while (bsize < PAGE_CACHE_SIZE) { 1311 if (size & bsize) 1312 break; 1313 bsize <<= 1; 1314 } 1315 bdev->bd_block_size = bsize; 1316 bdev->bd_inode->i_blkbits = blksize_bits(bsize); 1317} 1318EXPORT_SYMBOL(bd_set_size); 1319 1320static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); 1321 1322/* 1323 * bd_mutex locking: 1324 * 1325 * mutex_lock(part->bd_mutex) 1326 * mutex_lock_nested(whole->bd_mutex, 1) 1327 */ 1328 1329static int 
__blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) 1330{ 1331 struct gendisk *disk; 1332 int ret; 1333 int partno; 1334 int perm = 0; 1335 1336 if (mode & FMODE_READ) 1337 perm |= MAY_READ; 1338 if (mode & FMODE_WRITE) 1339 perm |= MAY_WRITE; 1340 /* 1341 * hooks: /n/, see "layering violations". 1342 */ 1343 if (!for_part) { 1344 ret = devcgroup_inode_permission(bdev->bd_inode, perm); 1345 if (ret != 0) { 1346 bdput(bdev); 1347 return ret; 1348 } 1349 } 1350 1351 restart: 1352 1353 ret = -ENXIO; 1354 disk = get_gendisk(bdev->bd_dev, &partno); 1355 if (!disk) 1356 goto out; 1357 1358 mutex_lock_nested(&bdev->bd_mutex, for_part); 1359 if (!bdev->bd_openers) { 1360 bdev->bd_disk = disk; 1361 bdev->bd_contains = bdev; 1362 if (!partno) { 1363 struct backing_dev_info *bdi; 1364 1365 ret = -ENXIO; 1366 bdev->bd_part = disk_get_part(disk, partno); 1367 if (!bdev->bd_part) 1368 goto out_clear; 1369 1370 if (disk->fops->open) { 1371 ret = disk->fops->open(bdev, mode); 1372 if (ret == -ERESTARTSYS) { 1373 /* Lost a race with 'disk' being 1374 * deleted, try again. 
1375 * See md.c 1376 */ 1377 disk_put_part(bdev->bd_part); 1378 bdev->bd_part = NULL; 1379 module_put(disk->fops->owner); 1380 put_disk(disk); 1381 bdev->bd_disk = NULL; 1382 mutex_unlock(&bdev->bd_mutex); 1383 goto restart; 1384 } 1385 if (ret) 1386 goto out_clear; 1387 } 1388 if (!bdev->bd_openers) { 1389 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1390 bdi = blk_get_backing_dev_info(bdev); 1391 if (bdi == NULL) 1392 bdi = &default_backing_dev_info; 1393 bdev->bd_inode->i_data.backing_dev_info = bdi; 1394 } 1395 if (bdev->bd_invalidated) 1396 rescan_partitions(disk, bdev); 1397 } else { 1398 struct block_device *whole; 1399 whole = bdget_disk(disk, 0); 1400 ret = -ENOMEM; 1401 if (!whole) 1402 goto out_clear; 1403 BUG_ON(for_part); 1404 ret = __blkdev_get(whole, mode, 1); 1405 if (ret) 1406 goto out_clear; 1407 bdev->bd_contains = whole; 1408 bdev->bd_inode->i_data.backing_dev_info = 1409 whole->bd_inode->i_data.backing_dev_info; 1410 bdev->bd_part = disk_get_part(disk, partno); 1411 if (!(disk->flags & GENHD_FL_UP) || 1412 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1413 ret = -ENXIO; 1414 goto out_clear; 1415 } 1416 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1417 } 1418 } else { 1419 module_put(disk->fops->owner); 1420 put_disk(disk); 1421 disk = NULL; 1422 if (bdev->bd_contains == bdev) { 1423 if (bdev->bd_disk->fops->open) { 1424 ret = bdev->bd_disk->fops->open(bdev, mode); 1425 if (ret) 1426 goto out_unlock_bdev; 1427 } 1428 if (bdev->bd_invalidated) 1429 rescan_partitions(bdev->bd_disk, bdev); 1430 } 1431 } 1432 bdev->bd_openers++; 1433 if (for_part) 1434 bdev->bd_part_count++; 1435 mutex_unlock(&bdev->bd_mutex); 1436 return 0; 1437 1438 out_clear: 1439 disk_put_part(bdev->bd_part); 1440 bdev->bd_disk = NULL; 1441 bdev->bd_part = NULL; 1442 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1443 if (bdev != bdev->bd_contains) 1444 __blkdev_put(bdev->bd_contains, mode, 1); 1445 bdev->bd_contains = NULL; 1446 
out_unlock_bdev: 1447 mutex_unlock(&bdev->bd_mutex); 1448 out: 1449 if (disk) 1450 module_put(disk->fops->owner); 1451 put_disk(disk); 1452 bdput(bdev); 1453 1454 return ret; 1455} 1456 1457int blkdev_get(struct block_device *bdev, fmode_t mode) 1458{ 1459 return __blkdev_get(bdev, mode, 0); 1460} 1461EXPORT_SYMBOL(blkdev_get); 1462 1463static int blkdev_open(struct inode * inode, struct file * filp) 1464{ 1465 struct block_device *whole = NULL; 1466 struct block_device *bdev; 1467 int res; 1468 1469 filp->f_flags |= O_LARGEFILE; 1470 1471 if (filp->f_flags & O_NDELAY) 1472 filp->f_mode |= FMODE_NDELAY; 1473 if (filp->f_flags & O_EXCL) 1474 filp->f_mode |= FMODE_EXCL; 1475 if ((filp->f_flags & O_ACCMODE) == 3) 1476 filp->f_mode |= FMODE_WRITE_IOCTL; 1477 1478 bdev = bd_acquire(inode); 1479 if (bdev == NULL) 1480 return -ENOMEM; 1481 1482 if (filp->f_mode & FMODE_EXCL) { 1483 whole = bd_start_claiming(bdev, filp); 1484 if (IS_ERR(whole)) { 1485 bdput(bdev); 1486 return PTR_ERR(whole); 1487 } 1488 } 1489 1490 filp->f_mapping = bdev->bd_inode->i_mapping; 1491 1492 res = blkdev_get(bdev, filp->f_mode); 1493 1494 if (whole) { 1495 if (res == 0) 1496 bd_finish_claiming(bdev, whole, filp); 1497 else 1498 bd_abort_claiming(whole, filp); 1499 } 1500 1501 return res; 1502} 1503 1504static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) 1505{ 1506 int ret = 0; 1507 struct gendisk *disk = bdev->bd_disk; 1508 struct block_device *victim = NULL; 1509 1510 mutex_lock_nested(&bdev->bd_mutex, for_part); 1511 if (for_part) 1512 bdev->bd_part_count--; 1513 1514 if (!--bdev->bd_openers) { 1515 sync_blockdev(bdev); 1516 kill_bdev(bdev); 1517 } 1518 if (bdev->bd_contains == bdev) { 1519 if (disk->fops->release) 1520 ret = disk->fops->release(disk, mode); 1521 } 1522 if (!bdev->bd_openers) { 1523 struct module *owner = disk->fops->owner; 1524 1525 put_disk(disk); 1526 module_put(owner); 1527 disk_put_part(bdev->bd_part); 1528 bdev->bd_part = NULL; 1529 
bdev->bd_disk = NULL; 1530 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; 1531 if (bdev != bdev->bd_contains) 1532 victim = bdev->bd_contains; 1533 bdev->bd_contains = NULL; 1534 } 1535 mutex_unlock(&bdev->bd_mutex); 1536 bdput(bdev); 1537 if (victim) 1538 __blkdev_put(victim, mode, 1); 1539 return ret; 1540} 1541 1542int blkdev_put(struct block_device *bdev, fmode_t mode) 1543{ 1544 return __blkdev_put(bdev, mode, 0); 1545} 1546EXPORT_SYMBOL(blkdev_put); 1547 1548static int blkdev_close(struct inode * inode, struct file * filp) 1549{ 1550 struct block_device *bdev = I_BDEV(filp->f_mapping->host); 1551 if (bdev->bd_holder == filp) 1552 bd_release(bdev); 1553 return blkdev_put(bdev, filp->f_mode); 1554} 1555 1556static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) 1557{ 1558 struct block_device *bdev = I_BDEV(file->f_mapping->host); 1559 fmode_t mode = file->f_mode; 1560 1561 /* 1562 * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have 1563 * to updated it before every ioctl. 1564 */ 1565 if (file->f_flags & O_NDELAY) 1566 mode |= FMODE_NDELAY; 1567 else 1568 mode &= ~FMODE_NDELAY; 1569 1570 return blkdev_ioctl(bdev, mode, cmd, arg); 1571} 1572 1573/* 1574 * Write data to the block device. Only intended for the block device itself 1575 * and the raw driver which basically is a fake block device. 1576 * 1577 * Does not take i_mutex for the write and thus is not for general purpose 1578 * use. 
1579 */ 1580ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, 1581 unsigned long nr_segs, loff_t pos) 1582{ 1583 struct file *file = iocb->ki_filp; 1584 ssize_t ret; 1585 1586 BUG_ON(iocb->ki_pos != pos); 1587 1588 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 1589 if (ret > 0 || ret == -EIOCBQUEUED) { 1590 ssize_t err; 1591 1592 err = generic_write_sync(file, pos, ret); 1593 if (err < 0 && ret > 0) 1594 ret = err; 1595 } 1596 return ret; 1597} 1598EXPORT_SYMBOL_GPL(blkdev_aio_write); 1599 1600/* 1601 * Try to release a page associated with block device when the system 1602 * is under memory pressure. 1603 */ 1604static int blkdev_releasepage(struct page *page, gfp_t wait) 1605{ 1606 struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; 1607 1608 if (super && super->s_op->bdev_try_to_free_page) 1609 return super->s_op->bdev_try_to_free_page(super, page, wait); 1610 1611 return try_to_free_buffers(page); 1612} 1613 1614static const struct address_space_operations def_blk_aops = { 1615 .readpage = blkdev_readpage, 1616 .writepage = blkdev_writepage, 1617 .sync_page = block_sync_page, 1618 .write_begin = blkdev_write_begin, 1619 .write_end = blkdev_write_end, 1620 .writepages = generic_writepages, 1621 .releasepage = blkdev_releasepage, 1622 .direct_IO = blkdev_direct_IO, 1623}; 1624 1625const struct file_operations def_blk_fops = { 1626 .open = blkdev_open, 1627 .release = blkdev_close, 1628 .llseek = block_llseek, 1629 .read = do_sync_read, 1630 .write = do_sync_write, 1631 .aio_read = generic_file_aio_read, 1632 .aio_write = blkdev_aio_write, 1633 .mmap = generic_file_mmap, 1634 .fsync = blkdev_fsync, 1635 .unlocked_ioctl = block_ioctl, 1636#ifdef CONFIG_COMPAT 1637 .compat_ioctl = compat_blkdev_ioctl, 1638#endif 1639 .splice_read = generic_file_splice_read, 1640 .splice_write = generic_file_splice_write, 1641}; 1642 1643int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) 1644{ 
1645 int res; 1646 mm_segment_t old_fs = get_fs(); 1647 set_fs(KERNEL_DS); 1648 res = blkdev_ioctl(bdev, 0, cmd, arg); 1649 set_fs(old_fs); 1650 return res; 1651} 1652 1653EXPORT_SYMBOL(ioctl_by_bdev); 1654 1655/** 1656 * lookup_bdev - lookup a struct block_device by name 1657 * @pathname: special file representing the block device 1658 * 1659 * Get a reference to the blockdevice at @pathname in the current 1660 * namespace if possible and return it. Return ERR_PTR(error) 1661 * otherwise. 1662 */ 1663struct block_device *lookup_bdev(const char *pathname) 1664{ 1665 struct block_device *bdev; 1666 struct inode *inode; 1667 struct path path; 1668 int error; 1669 1670 if (!pathname || !*pathname) 1671 return ERR_PTR(-EINVAL); 1672 1673 error = kern_path(pathname, LOOKUP_FOLLOW, &path); 1674 if (error) 1675 return ERR_PTR(error); 1676 1677 inode = path.dentry->d_inode; 1678 error = -ENOTBLK; 1679 if (!S_ISBLK(inode->i_mode)) 1680 goto fail; 1681 error = -EACCES; 1682 if (path.mnt->mnt_flags & MNT_NODEV) 1683 goto fail; 1684 error = -ENOMEM; 1685 bdev = bd_acquire(inode); 1686 if (!bdev) 1687 goto fail; 1688out: 1689 path_put(&path); 1690 return bdev; 1691fail: 1692 bdev = ERR_PTR(error); 1693 goto out; 1694} 1695EXPORT_SYMBOL(lookup_bdev); 1696 1697/** 1698 * open_bdev_exclusive - open a block device by name and set it up for use 1699 * 1700 * @path: special file representing the block device 1701 * @mode: FMODE_... combination to pass be used 1702 * @holder: owner for exclusion 1703 * 1704 * Open the blockdevice described by the special file at @path, claim it 1705 * for the @holder. 
1706 */ 1707struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1708{ 1709 struct block_device *bdev, *whole; 1710 int error; 1711 1712 bdev = lookup_bdev(path); 1713 if (IS_ERR(bdev)) 1714 return bdev; 1715 1716 whole = bd_start_claiming(bdev, holder); 1717 if (IS_ERR(whole)) { 1718 bdput(bdev); 1719 return whole; 1720 } 1721 1722 error = blkdev_get(bdev, mode); 1723 if (error) 1724 goto out_abort_claiming; 1725 1726 error = -EACCES; 1727 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1728 goto out_blkdev_put; 1729 1730 bd_finish_claiming(bdev, whole, holder); 1731 return bdev; 1732 1733out_blkdev_put: 1734 blkdev_put(bdev, mode); 1735out_abort_claiming: 1736 bd_abort_claiming(whole, holder); 1737 return ERR_PTR(error); 1738} 1739 1740EXPORT_SYMBOL(open_bdev_exclusive); 1741 1742/** 1743 * close_bdev_exclusive - close a blockdevice opened by open_bdev_exclusive() 1744 * 1745 * @bdev: blockdevice to close 1746 * @mode: mode, must match that used to open. 1747 * 1748 * This is the counterpart to open_bdev_exclusive(). 1749 */ 1750void close_bdev_exclusive(struct block_device *bdev, fmode_t mode) 1751{ 1752 bd_release(bdev); 1753 blkdev_put(bdev, mode); 1754} 1755 1756EXPORT_SYMBOL(close_bdev_exclusive); 1757 1758int __invalidate_device(struct block_device *bdev) 1759{ 1760 struct super_block *sb = get_super(bdev); 1761 int res = 0; 1762 1763 if (sb) { 1764 /* 1765 * no need to lock the super, get_super holds the 1766 * read mutex so the filesystem cannot go away 1767 * under us (->put_super runs with the write lock 1768 * hold). 1769 */ 1770 shrink_dcache_sb(sb); 1771 res = invalidate_inodes(sb); 1772 drop_super(sb); 1773 } 1774 invalidate_bdev(bdev); 1775 return res; 1776} 1777EXPORT_SYMBOL(__invalidate_device); 1778