1/* 2 * linux/fs/inode.c 3 * 4 * (C) 1997 Linus Torvalds 5 */ 6 7#include <linux/fs.h> 8#include <linux/mm.h> 9#include <linux/dcache.h> 10#include <linux/init.h> 11#include <linux/quotaops.h> 12#include <linux/slab.h> 13#include <linux/writeback.h> 14#include <linux/module.h> 15#include <linux/backing-dev.h> 16#include <linux/wait.h> 17#include <linux/hash.h> 18#include <linux/swap.h> 19#include <linux/security.h> 20#include <linux/pagemap.h> 21#include <linux/cdev.h> 22#include <linux/bootmem.h> 23#include <linux/inotify.h> 24#include <linux/mount.h> 25 26#include <linux/buffer_head.h> 27 28/* 29 * New inode.c implementation. 30 * 31 * This implementation has the basic premise of trying 32 * to be extremely low-overhead and SMP-safe, yet be 33 * simple enough to be "obviously correct". 34 * 35 * Famous last words. 36 */ 37 38/* inode dynamic allocation 1999, Andrea Arcangeli <andrea@suse.de> */ 39 40/* #define INODE_PARANOIA 1 */ 41/* #define INODE_DEBUG 1 */ 42 43/* 44 * Inode lookup is no longer as critical as it used to be: 45 * most of the lookups are going to be through the dcache. 46 */ 47#define I_HASHBITS i_hash_shift 48#define I_HASHMASK i_hash_mask 49 50static unsigned int i_hash_mask __read_mostly; 51static unsigned int i_hash_shift __read_mostly; 52 53/* 54 * Each inode can be on two separate lists. One is 55 * the hash list of the inode, used for lookups. The 56 * other linked list is the "type" list: 57 * "in_use" - valid inode, i_count > 0, i_nlink > 0 58 * "dirty" - as "in_use" but also dirty 59 * "unused" - valid inode, i_count = 0 60 * 61 * A "dirty" list is maintained for each super block, 62 * allowing for low-overhead inode sync() operations. 63 */ 64 65LIST_HEAD(inode_in_use); 66LIST_HEAD(inode_unused); 67static struct hlist_head *inode_hashtable __read_mostly; 68 69/* 70 * A simple spinlock to protect the list manipulations. 71 * 72 * NOTE! You also have to own the lock if you change 73 * the i_state of an inode while it is in use.. 
 */
DEFINE_SPINLOCK(inode_lock);

/*
 * iprune_mutex provides exclusion between the kswapd or try_to_free_pages
 * icache shrinking path, and the umount path.  Without this exclusion,
 * by the time prune_icache calls iput for the inode whose pages it has
 * been invalidating, or by the time it calls clear_inode & destroy_inode
 * from its final dispose_list, the struct super_block they refer to
 * (for inode->i_sb->s_op) may already have been freed and reused.
 */
static DEFINE_MUTEX(iprune_mutex);

/*
 * Statistics gathering..
 */
struct inodes_stat_t inodes_stat;

/* Slab cache backing struct inode allocations for filesystems that do
 * not provide their own ->alloc_inode()/->destroy_inode() methods. */
static struct kmem_cache * inode_cachep __read_mostly;

/*
 * alloc_inode - allocate and minimally initialise a new inode
 * @sb: superblock the inode will belong to
 *
 * Allocates via sb->s_op->alloc_inode() when the filesystem provides
 * one, otherwise from the generic inode_cachep slab.  Resets the
 * per-use fields (i_count, i_nlink, sizes, device pointers, ...) and
 * wires up inode->i_data as the inode's page-cache mapping.  Fields
 * that are idempotent across reuse are handled by inode_init_once()
 * at slab-constructor time instead.
 *
 * Returns the inode with i_count == 1, or NULL on allocation or
 * security failure.  The inode is NOT yet hashed or on any list.
 */
static struct inode *alloc_inode(struct super_block *sb)
{
	/* Shared empty-op tables: a fresh inode starts with no-op methods
	 * until the filesystem fills in real ones. */
	static const struct address_space_operations empty_aops;
	static struct inode_operations empty_iops;
	static const struct file_operations empty_fops;
	struct inode *inode;

	if (sb->s_op->alloc_inode)
		inode = sb->s_op->alloc_inode(sb);
	else
		inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);

	if (inode) {
		struct address_space * const mapping = &inode->i_data;

		inode->i_sb = sb;
		inode->i_blkbits = sb->s_blocksize_bits;
		inode->i_flags = 0;
		atomic_set(&inode->i_count, 1);
		inode->i_op = &empty_iops;
		inode->i_fop = &empty_fops;
		inode->i_nlink = 1;
		atomic_set(&inode->i_writecount, 0);
		inode->i_size = 0;
		inode->i_blocks = 0;
		inode->i_bytes = 0;
		inode->i_generation = 0;
#ifdef CONFIG_QUOTA
		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
		inode->i_pipe = NULL;
		inode->i_bdev = NULL;
		inode->i_cdev = NULL;
		inode->i_rdev = 0;
		inode->dirtied_when = 0;
		if (security_inode_alloc(inode)) {
			/* Undo the allocation the same way destroy_inode()
			 * would, via the fs hook when one exists. */
			if (inode->i_sb->s_op->destroy_inode)
				inode->i_sb->s_op->destroy_inode(inode);
			else
				kmem_cache_free(inode_cachep, (inode));
			return NULL;
		}

		mapping->a_ops = &empty_aops;
		mapping->host = inode;
		mapping->flags = 0;
		mapping_set_gfp_mask(mapping, GFP_HIGHUSER);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;

		/*
		 * If the block_device provides a backing_dev_info for client
		 * inodes then use that.  Otherwise the inode share the bdev's
		 * backing_dev_info.
		 */
		if (sb->s_bdev) {
			struct backing_dev_info *bdi;

			bdi = sb->s_bdev->bd_inode_backing_dev_info;
			if (!bdi)
				bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
			mapping->backing_dev_info = bdi;
		}
		inode->i_private = NULL;
		inode->i_mapping = mapping;
	}
	return inode;
}

/*
 * destroy_inode - free an inode that holds no more references
 * @inode: inode to free
 *
 * Counterpart of alloc_inode(): releases security state and returns the
 * memory either to the filesystem (->destroy_inode) or to inode_cachep.
 * The inode must have no attached buffers by this point.
 */
void destroy_inode(struct inode *inode)
{
	BUG_ON(inode_has_buffers(inode));
	security_inode_free(inode);
	if (inode->i_sb->s_op->destroy_inode)
		inode->i_sb->s_op->destroy_inode(inode);
	else
		kmem_cache_free(inode_cachep, (inode));
}


/*
 * These are initializations that only need to be done
 * once, because the fields are idempotent across use
 * of the inode, so let the slab aware of that.
 */
void inode_init_once(struct inode *inode)
{
	/* Zero everything, then (re)build the embedded locks, lists and
	 * trees that must be valid for the whole life of the slab object. */
	memset(inode, 0, sizeof(*inode));
	INIT_HLIST_NODE(&inode->i_hash);
	INIT_LIST_HEAD(&inode->i_dentry);
	INIT_LIST_HEAD(&inode->i_devices);
	mutex_init(&inode->i_mutex);
	init_rwsem(&inode->i_alloc_sem);
	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
	rwlock_init(&inode->i_data.tree_lock);
	spin_lock_init(&inode->i_data.i_mmap_lock);
	INIT_LIST_HEAD(&inode->i_data.private_list);
	spin_lock_init(&inode->i_data.private_lock);
	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
	spin_lock_init(&inode->i_lock);
	i_size_ordered_init(inode);
#ifdef CONFIG_INOTIFY
	INIT_LIST_HEAD(&inode->inotify_watches);
	mutex_init(&inode->inotify_mutex);
#endif
}

EXPORT_SYMBOL(inode_init_once);

/* Slab constructor for inode_cachep: runs once per slab object. */
static void init_once(void * foo, struct kmem_cache * cachep, unsigned long flags)
{
	struct inode * inode = (struct inode *) foo;

	inode_init_once(inode);
}

/*
 * inode_lock must be held
 */
void __iget(struct inode * inode)
{
	if (atomic_read(&inode->i_count)) {
		atomic_inc(&inode->i_count);
		return;
	}
	/*
	 * Count was zero: the inode was on the unused list.  Take the
	 * first reference and move it back to in_use, unless writeback
	 * owns its list position (I_DIRTY/I_LOCK).
	 */
	atomic_inc(&inode->i_count);
	if (!(inode->i_state & (I_DIRTY|I_LOCK)))
		list_move(&inode->i_list, &inode_in_use);
	inodes_stat.nr_unused--;
}

/**
 * clear_inode - clear an inode
 * @inode: inode to clear
 *
 * This is called by the filesystem to tell us
 * that the inode is no longer useful. We just
 * terminate it with extreme prejudice.
 */
void clear_inode(struct inode *inode)
{
	might_sleep();
	invalidate_inode_buffers(inode);

	/* Caller must have truncated the pagecache and set I_FREEING. */
	BUG_ON(inode->i_data.nrpages);
	BUG_ON(!(inode->i_state & I_FREEING));
	BUG_ON(inode->i_state & I_CLEAR);
	/* Let any in-flight writeback (I_LOCK) finish first. */
	wait_on_inode(inode);
	DQUOT_DROP(inode);
	if (inode->i_sb->s_op->clear_inode)
		inode->i_sb->s_op->clear_inode(inode);
	/* Detach from the block/char device alias lists, if any. */
	if (S_ISBLK(inode->i_mode) && inode->i_bdev)
		bd_forget(inode);
	if (S_ISCHR(inode->i_mode) && inode->i_cdev)
		cd_forget(inode);
	inode->i_state = I_CLEAR;
}

EXPORT_SYMBOL(clear_inode);

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * Dispose-list gets a local list with local inodes in it, so it doesn't
 * need to worry about list corruption and SMP locks.
 */
static void dispose_list(struct list_head *head)
{
	int nr_disposed = 0;

	while (!list_empty(head)) {
		struct inode *inode;

		inode = list_first_entry(head, struct inode, i_list);
		list_del(&inode->i_list);

		if (inode->i_data.nrpages)
			truncate_inode_pages(&inode->i_data, 0);
		clear_inode(inode);

		/* Unhash and unlink from the per-sb list under inode_lock;
		 * after this nobody can find the inode any more. */
		spin_lock(&inode_lock);
		hlist_del_init(&inode->i_hash);
		list_del_init(&inode->i_sb_list);
		spin_unlock(&inode_lock);

		wake_up_inode(inode);
		destroy_inode(inode);
		nr_disposed++;
	}
	/* Batch the statistics update to one lock round-trip. */
	spin_lock(&inode_lock);
	inodes_stat.nr_inodes -= nr_disposed;
	spin_unlock(&inode_lock);
}

/*
 * Invalidate all inodes for a device.
 */
static int invalidate_list(struct list_head *head, struct list_head *dispose)
{
	struct list_head *next;
	int busy = 0, count = 0;

	next = head->next;
	for (;;) {
		struct list_head * tmp = next;
		struct inode * inode;

		/*
		 * We can reschedule here without worrying about the list's
		 * consistency because the per-sb list of inodes must not
		 * change during umount anymore, and because iprune_mutex keeps
		 * shrink_icache_memory() away.
		 */
		cond_resched_lock(&inode_lock);

		next = next->next;
		if (tmp == head)
			break;
		inode = list_entry(tmp, struct inode, i_sb_list);
		invalidate_inode_buffers(inode);
		if (!atomic_read(&inode->i_count)) {
			/* Unreferenced: hand it to the caller's dispose list
			 * and mark it so nobody takes a new reference. */
			list_move(&inode->i_list, dispose);
			inode->i_state |= I_FREEING;
			count++;
			continue;
		}
		/* Still referenced - the whole invalidation fails. */
		busy = 1;
	}
	/* only unused inodes may be cached with i_count zero */
	inodes_stat.nr_unused -= count;
	return busy;
}

/**
 * invalidate_inodes	- discard the inodes on a device
 * @sb: superblock
 *
 * Discard all of the inodes for a given superblock. If the discard
 * fails because there are busy inodes then a non zero value is returned.
 * If the discard is successful all the inodes have been discarded.
 */
int invalidate_inodes(struct super_block * sb)
{
	int busy;
	LIST_HEAD(throw_away);

	/* iprune_mutex excludes prune_icache() - see comment above it. */
	mutex_lock(&iprune_mutex);
	spin_lock(&inode_lock);
	inotify_unmount_inodes(&sb->s_inodes);
	busy = invalidate_list(&sb->s_inodes, &throw_away);
	spin_unlock(&inode_lock);

	/* Actually free the collected inodes outside inode_lock. */
	dispose_list(&throw_away);
	mutex_unlock(&iprune_mutex);

	return busy;
}

EXPORT_SYMBOL(invalidate_inodes);

/*
 * can_unuse - is this unused inode still freeable?
 *
 * Re-check, after inode_lock was dropped and retaken, that the inode
 * is still clean, bufferless, unreferenced and has no pagecache.
 * Returns 1 if it may be freed, 0 otherwise.  Called with inode_lock
 * held.
 */
static int can_unuse(struct inode *inode)
{
	if (inode->i_state)
		return 0;
	if (inode_has_buffers(inode))
		return 0;
	if (atomic_read(&inode->i_count))
		return 0;
	if (inode->i_data.nrpages)
		return 0;
	return 1;
}

/*
 * Scan `goal' inodes on the unused list for freeable ones. They are moved to
 * a temporary list and then are freed outside inode_lock by dispose_list().
 *
 * Any inodes which are pinned purely because of attached pagecache have their
 * pagecache removed.  We expect the final iput() on that inode to add it to
 * the front of the inode_unused list.  So look for it there and if the
 * inode is still freeable, proceed.  The right inode is found 99.9% of the
 * time in testing on a 4-way.
 *
 * If the inode has metadata buffers attached to mapping->private_list then
 * try to remove them.
 */
static void prune_icache(int nr_to_scan)
{
	LIST_HEAD(freeable);
	int nr_pruned = 0;
	int nr_scanned;
	unsigned long reap = 0;	/* pages reclaimed, for vmstat accounting */

	mutex_lock(&iprune_mutex);
	spin_lock(&inode_lock);
	for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
		struct inode *inode;

		if (list_empty(&inode_unused))
			break;

		/* Scan from the tail: the coldest end of the LRU. */
		inode = list_entry(inode_unused.prev, struct inode, i_list);

		if (inode->i_state || atomic_read(&inode->i_count)) {
			/* Not freeable right now - rotate it to the head so
			 * we don't keep re-examining it. */
			list_move(&inode->i_list, &inode_unused);
			continue;
		}
		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
			/* Pinned by buffers/pagecache only: take a reference,
			 * drop the lock, and try to strip them. */
			__iget(inode);
			spin_unlock(&inode_lock);
			if (remove_inode_buffers(inode))
				reap += invalidate_mapping_pages(&inode->i_data,
								0, -1);
			iput(inode);
			spin_lock(&inode_lock);

			/* The final iput() should have re-queued it at the
			 * list head (see the comment above this function). */
			if (inode != list_entry(inode_unused.next,
						struct inode, i_list))
				continue;	/* wrong inode or list_empty */
			if (!can_unuse(inode))
				continue;
		}
		list_move(&inode->i_list, &freeable);
		inode->i_state |= I_FREEING;
		nr_pruned++;
	}
	inodes_stat.nr_unused -= nr_pruned;
	if (current_is_kswapd())
		__count_vm_events(KSWAPD_INODESTEAL, reap);
	else
		__count_vm_events(PGINODESTEAL, reap);
	spin_unlock(&inode_lock);

	dispose_list(&freeable);
	mutex_unlock(&iprune_mutex);
}

/*
 * shrink_icache_memory() will attempt to reclaim some unused inodes.  Here,
 * "unused" means that no dentries are referring to the inodes: the files are
 * not open and the dcache references to those inodes have already been
 * reclaimed.
 *
 * This function is passed the number of inodes to scan, and it returns the
 * total number of remaining possibly-reclaimable inodes.
 */
static int shrink_icache_memory(int nr, gfp_t gfp_mask)
{
	if (nr) {
		/*
		 * Nasty deadlock avoidance.  We may hold various FS locks,
		 * and we don't want to recurse into the FS that called us
		 * in clear_inode() and friends..
	 	 */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		prune_icache(nr);
	}
	return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
}

static void __wait_on_freeing_inode(struct inode *inode);
/*
 * Called with the inode lock held.
 * NOTE: we are not increasing the inode-refcount, you must call __iget()
 * by hand after calling find_inode now! This simplifies iunique and won't
 * add any additional branch in the common code.
 */
static struct inode * find_inode(struct super_block * sb, struct hlist_head *head, int (*test)(struct inode *, void *), void *data)
{
	struct hlist_node *node;
	struct inode * inode = NULL;

repeat:
	hlist_for_each (node, head) {
		inode = hlist_entry(node, struct inode, i_hash);
		if (inode->i_sb != sb)
			continue;
		if (!test(inode, data))
			continue;
		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
			/* Inode is being torn down; wait, then rescan the
			 * chain from scratch (the wait drops inode_lock). */
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		break;
	}
	/* node is NULL iff we fell off the end of the chain. */
	return node ? inode : NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head *head, unsigned long ino)
{
	struct hlist_node *node;
	struct inode * inode = NULL;

repeat:
	hlist_for_each (node, head) {
		inode = hlist_entry(node, struct inode, i_hash);
		if (inode->i_ino != ino)
			continue;
		if (inode->i_sb != sb)
			continue;
		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
			__wait_on_freeing_inode(inode);
			goto repeat;
		}
		break;
	}
	return node ? inode : NULL;
}

/**
 * new_inode 	- obtain an inode
 * @sb: superblock
 *
 * Allocates a new inode for given superblock.
 */
struct inode *new_inode(struct super_block *sb)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static unsigned int last_ino;
	struct inode * inode;

	spin_lock_prefetch(&inode_lock);

	inode = alloc_inode(sb);
	if (inode) {
		spin_lock(&inode_lock);
		inodes_stat.nr_inodes++;
		list_add(&inode->i_list, &inode_in_use);
		list_add(&inode->i_sb_list, &sb->s_inodes);
		/* Synthetic inode number; last_ino is protected by inode_lock. */
		inode->i_ino = ++last_ino;
		inode->i_state = 0;
		spin_unlock(&inode_lock);
	}
	return inode;
}

EXPORT_SYMBOL(new_inode);

void unlock_new_inode(struct inode *inode)
{
	/*
	 * This is special!  We do not need the spinlock
	 * when clearing I_LOCK, because we're guaranteed
	 * that nobody else tries to do anything about the
	 * state of the inode when it is locked, as we
	 * just created it (so there can be no old holders
	 * that haven't tested I_LOCK).
	 */
	inode->i_state &= ~(I_LOCK|I_NEW);
	wake_up_inode(inode);
}

EXPORT_SYMBOL(unlock_new_inode);

/*
 * This is called without the inode lock held.. Be careful.
 *
 * We no longer cache the sb_flags in i_flags - see fs.h
 *	-- rmk@arm.uk.linux.org
 */
static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *head, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data)
{
	struct inode * inode;

	/* Allocate optimistically before taking the lock. */
	inode = alloc_inode(sb);
	if (inode) {
		struct inode * old;

		spin_lock(&inode_lock);
		/* We released the lock, so.. */
		old = find_inode(sb, head, test, data);
		if (!old) {
			if (set(inode, data))
				goto set_failed;

			inodes_stat.nr_inodes++;
			list_add(&inode->i_list, &inode_in_use);
			list_add(&inode->i_sb_list, &sb->s_inodes);
			hlist_add_head(&inode->i_hash, head);
			inode->i_state = I_LOCK|I_NEW;
			spin_unlock(&inode_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		__iget(old);
		spin_unlock(&inode_lock);
		destroy_inode(inode);
		inode = old;
		/* Wait until the winner has finished initialising it. */
		wait_on_inode(inode);
	}
	return inode;

set_failed:
	spin_unlock(&inode_lock);
	destroy_inode(inode);
	return NULL;
}

/*
 * get_new_inode_fast is the fast path version of get_new_inode, see the
 * comment at iget_locked for details.
 */
static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_head *head, unsigned long ino)
{
	struct inode * inode;

	inode = alloc_inode(sb);
	if (inode) {
		struct inode * old;

		spin_lock(&inode_lock);
		/* We released the lock, so.. */
		old = find_inode_fast(sb, head, ino);
		if (!old) {
			inode->i_ino = ino;
			inodes_stat.nr_inodes++;
			list_add(&inode->i_list, &inode_in_use);
			list_add(&inode->i_sb_list, &sb->s_inodes);
			hlist_add_head(&inode->i_hash, head);
			inode->i_state = I_LOCK|I_NEW;
			spin_unlock(&inode_lock);

			/* Return the locked inode with I_NEW set, the
			 * caller is responsible for filling in the contents
			 */
			return inode;
		}

		/*
		 * Uhhuh, somebody else created the same inode under
		 * us. Use the old inode instead of the one we just
		 * allocated.
		 */
		__iget(old);
		spin_unlock(&inode_lock);
		destroy_inode(inode);
		inode = old;
		wait_on_inode(inode);
	}
	return inode;
}

/*
 * hash - map (superblock, hashval) to an inode_hashtable bucket index
 *
 * Mixes the superblock pointer into the hash so equal inode numbers on
 * different filesystems land in different chains.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
	unsigned long tmp;

	tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
			L1_CACHE_BYTES;
	tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
	return tmp & I_HASHMASK;
}

/**
 *	iunique - get a unique inode number
 *	@sb: superblock
 *	@max_reserved: highest reserved inode number
 *
 *	Obtain an inode number that is unique on the system for a given
 *	superblock. This is used by file systems that have no natural
 *	permanent inode numbering system. An inode number is returned that
 *	is higher than the reserved limit but unique.
 *
 *	BUGS:
 *	With a large number of inodes live on the file system this function
 *	currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
	/*
	 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
	 * error if st_ino won't fit in target struct field. Use 32bit counter
	 * here to attempt to avoid that.
	 */
	static unsigned int counter;
	struct inode *inode;
	struct hlist_head *head;
	ino_t res;

	spin_lock(&inode_lock);
	/* Keep advancing the counter until the number is not in the hash. */
	do {
		if (counter <= max_reserved)
			counter = max_reserved + 1;
		res = counter++;
		head = inode_hashtable + hash(sb, res);
		inode = find_inode_fast(sb, head, res);
	} while (inode != NULL);
	spin_unlock(&inode_lock);

	return res;
}
EXPORT_SYMBOL(iunique);

/*
 * igrab - take a reference on an inode, unless it is being freed
 * @inode: inode to grab
 *
 * Returns the inode with an extra reference, or NULL if the inode is
 * already on its way out (I_FREEING/I_CLEAR/I_WILL_FREE).
 */
struct inode *igrab(struct inode *inode)
{
	spin_lock(&inode_lock);
	if (!(inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)))
		__iget(inode);
	else
		/*
		 * Handle the case where s_op->clear_inode is not been
		 * called yet, and somebody is calling igrab
		 * while the inode is getting freed.
		 */
		inode = NULL;
	spin_unlock(&inode_lock);
	return inode;
}

EXPORT_SYMBOL(igrab);

/**
 * ifind - internal function, you want ilookup5() or iget5().
 * @sb:		super block of file system to search
 * @head:       the head of the list to search
 * @test:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @test
 * @wait:	if true wait for the inode to be unlocked, if false do not
 *
 * ifind() searches for the inode specified by @data in the inode
 * cache. This is a generalized version of ifind_fast() for file systems where
 * the inode number is not sufficient for unique identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 *
 * Note, @test is called with the inode_lock held, so can't sleep.
 */
static struct inode *ifind(struct super_block *sb,
		struct hlist_head *head, int (*test)(struct inode *, void *),
		void *data, const int wait)
{
	struct inode *inode;

	spin_lock(&inode_lock);
	inode = find_inode(sb, head, test, data);
	if (inode) {
		__iget(inode);
		spin_unlock(&inode_lock);
		if (likely(wait))
			wait_on_inode(inode);
		return inode;
	}
	spin_unlock(&inode_lock);
	return NULL;
}

/**
 * ifind_fast - internal function, you want ilookup() or iget().
 * @sb:		super block of file system to search
 * @head:       head of the list to search
 * @ino:	inode number to search for
 *
 * ifind_fast() searches for the inode @ino in the inode cache. This is for
 * file systems where the inode number is sufficient for unique identification
 * of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 */
static struct inode *ifind_fast(struct super_block *sb,
		struct hlist_head *head, unsigned long ino)
{
	struct inode *inode;

	spin_lock(&inode_lock);
	inode = find_inode_fast(sb, head, ino);
	if (inode) {
		/* find_inode_fast() does not take a reference - do it here,
		 * then wait (outside the lock) for any I_LOCK holder. */
		__iget(inode);
		spin_unlock(&inode_lock);
		wait_on_inode(inode);
		return inode;
	}
	spin_unlock(&inode_lock);
	return NULL;
}

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @test:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @test
 *
 * ilookup5() uses ifind() to search for the inode specified by @hashval and
 * @data in the inode cache. This is a generalized version of ilookup() for
 * file systems where the inode number is not sufficient for unique
 * identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.  Note, the inode lock is not waited upon so you have to be
 * very careful what you do with the returned inode.  You probably should be
 * using ilookup5() instead.
 *
 * Otherwise NULL is returned.
 *
 * Note, @test is called with the inode_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);

	return ifind(sb, head, test, data, 0);
}

EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @hashval:	hash value (usually inode number) to search for
 * @test:	callback used for comparisons between inodes
 * @data:	opaque data pointer to pass to @test
 *
 * ilookup5() uses ifind() to search for the inode specified by @hashval and
 * @data in the inode cache. This is a generalized version of ilookup() for
 * file systems where the inode number is not sufficient for unique
 * identification of an inode.
 *
 * If the inode is in the cache, the inode lock is waited upon and the inode is
 * returned with an incremented reference count.
 *
 * Otherwise NULL is returned.
 *
 * Note, @test is called with the inode_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);

	return ifind(sb, head, test, data, 1);
}

EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:		super block of file system to search
 * @ino:	inode number to search for
 *
 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache.
 * This is for file systems where the inode number is sufficient for unique
 * identification of an inode.
 *
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Otherwise NULL is returned.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);

	return ifind_fast(sb, head, ino);
}

EXPORT_SYMBOL(ilookup);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @hashval:	hash value (usually inode number) to get
 * @test:	callback used for comparisons between inodes
 * @set:	callback used to initialize a new struct inode
 * @data:	opaque data pointer to pass to @test and @set
 *
 * This is iget() without the read_inode() portion of get_new_inode().
 *
 * iget5_locked() uses ifind() to search for the inode specified by @hashval
 * and @data in the inode cache and if present it is returned with an increased
 * reference count. This is a generalized version of iget_locked() for file
 * systems where the inode number is not sufficient for unique identification
 * of an inode.
 *
 * If the inode is not in cache, get_new_inode() is called to allocate a new
 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
 * file system gets to fill it in before unlocking it via unlock_new_inode().
 *
 * Note both @test and @set are called with the inode_lock held, so can't sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
		int (*test)(struct inode *, void *),
		int (*set)(struct inode *, void *), void *data)
{
	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
	struct inode *inode;

	inode = ifind(sb, head, test, data, 1);
	if (inode)
		return inode;
	/*
	 * get_new_inode() will do the right thing, re-trying the search
	 * in case it had to block at any point.
	 */
	return get_new_inode(sb, head, test, set, data);
}

EXPORT_SYMBOL(iget5_locked);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:		super block of file system
 * @ino:	inode number to get
 *
 * This is iget() without the read_inode() portion of get_new_inode_fast().
 *
 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
 * the inode cache and if present it is returned with an increased reference
 * count. This is for file systems where the inode number is sufficient for
 * unique identification of an inode.
 *
 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
 * The file system gets to fill it in before unlocking it via
 * unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
	struct hlist_head *head = inode_hashtable + hash(sb, ino);
	struct inode *inode;

	inode = ifind_fast(sb, head, ino);
	if (inode)
		return inode;
	/*
	 * get_new_inode_fast() will do the right thing, re-trying the search
	 * in case it had to block at any point.
	 */
	return get_new_inode_fast(sb, head, ino);
}

EXPORT_SYMBOL(iget_locked);

/**
 *	__insert_inode_hash - hash an inode
 *	@inode: unhashed inode
 *	@hashval: unsigned long value used to locate this object in the
 *		inode_hashtable.
 *
 *	Add an inode to the inode hash for this superblock.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
	struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
	spin_lock(&inode_lock);
	hlist_add_head(&inode->i_hash, head);
	spin_unlock(&inode_lock);
}

EXPORT_SYMBOL(__insert_inode_hash);

/**
 *	remove_inode_hash - remove an inode from the hash
 *	@inode: inode to unhash
 *
 *	Remove an inode from the superblock.
 */
void remove_inode_hash(struct inode *inode)
{
	spin_lock(&inode_lock);
	hlist_del_init(&inode->i_hash);
	spin_unlock(&inode_lock);
}

EXPORT_SYMBOL(remove_inode_hash);

/*
 * Tell the filesystem that this inode is no longer of any interest and should
 * be completely destroyed.
 *
 * We leave the inode in the inode hash table until *after* the filesystem's
 * ->delete_inode completes.  This ensures that an iget (such as nfsd might
 * instigate) will always find up-to-date information either in the hash or on
 * disk.
 *
 * I_FREEING is set so that no-one will take a new reference to the inode while
 * it is being deleted.
 */
void generic_delete_inode(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;

	/* Entered with inode_lock held (from iput_final). */
	list_del_init(&inode->i_list);
	list_del_init(&inode->i_sb_list);
	inode->i_state |= I_FREEING;
	inodes_stat.nr_inodes--;
	spin_unlock(&inode_lock);

	security_inode_delete(inode);

	if (op->delete_inode) {
		void (*delete)(struct inode *) = op->delete_inode;
		if (!is_bad_inode(inode))
			DQUOT_INIT(inode);
		/* Filesystems implementing their own
		 * s_op->delete_inode are required to call
		 * truncate_inode_pages and clear_inode()
		 * internally */
		delete(inode);
	} else {
		truncate_inode_pages(&inode->i_data, 0);
		clear_inode(inode);
	}
	/* Only now unhash - see the comment above this function. */
	spin_lock(&inode_lock);
	hlist_del_init(&inode->i_hash);
	spin_unlock(&inode_lock);
	wake_up_inode(inode);
	BUG_ON(inode->i_state != I_CLEAR);
	destroy_inode(inode);
}

EXPORT_SYMBOL(generic_delete_inode);

/*
 * generic_forget_inode - last reference dropped on a still-linked inode
 *
 * Called with inode_lock held (from iput_final) and drops it.  While the
 * superblock stays mounted (MS_ACTIVE) the inode is merely parked on the
 * unused list for possible reuse; otherwise it is written back if dirty
 * and then torn down completely.
 */
static void generic_forget_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (!hlist_unhashed(&inode->i_hash)) {
		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
			list_move(&inode->i_list, &inode_unused);
		inodes_stat.nr_unused++;
		if (sb->s_flags & MS_ACTIVE) {
			/* Keep it cached. */
			spin_unlock(&inode_lock);
			return;
		}
		/* Unmounting: flush it out, guarded against igrab by
		 * I_WILL_FREE while the lock is dropped. */
		inode->i_state |= I_WILL_FREE;
		spin_unlock(&inode_lock);
		write_inode_now(inode, 1);
		spin_lock(&inode_lock);
		inode->i_state &= ~I_WILL_FREE;
		inodes_stat.nr_unused--;
		hlist_del_init(&inode->i_hash);
	}
	list_del_init(&inode->i_list);
	list_del_init(&inode->i_sb_list);
	inode->i_state |= I_FREEING;
	inodes_stat.nr_inodes--;
	spin_unlock(&inode_lock);
	if (inode->i_data.nrpages)
		truncate_inode_pages(&inode->i_data, 0);
	clear_inode(inode);
	wake_up_inode(inode);
	destroy_inode(inode);
}

/*
 * Normal UNIX filesystem behaviour: delete
 * the
 * inode when the usage count drops to zero, and
 * i_nlink is zero.
 */
void generic_drop_inode(struct inode *inode)
{
	if (!inode->i_nlink)
		generic_delete_inode(inode);
	else
		generic_forget_inode(inode);
}

EXPORT_SYMBOL_GPL(generic_drop_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop()" function, defaulting to
 * the legacy UNIX filesystem behaviour..
 *
 * NOTE! NOTE! NOTE! We're called with the inode lock
 * held, and the drop function is supposed to release
 * the lock!
 */
static inline void iput_final(struct inode *inode)
{
	const struct super_operations *op = inode->i_sb->s_op;
	void (*drop)(struct inode *) = generic_drop_inode;

	if (op && op->drop_inode)
		drop = op->drop_inode;
	drop(inode);
}

/**
 *	iput	- put an inode
 *	@inode: inode to put
 *
 *	Puts an inode, dropping its usage count. If the inode use count hits
 *	zero, the inode is then freed and may also be destroyed.
 *
 *	Consequently, iput() can sleep.
 */
void iput(struct inode *inode)
{
	if (inode) {
		const struct super_operations *op = inode->i_sb->s_op;

		BUG_ON(inode->i_state == I_CLEAR);

		/* ->put_inode is invoked on every iput, not just the last. */
		if (op && op->put_inode)
			op->put_inode(inode);

		/* Take inode_lock only when the count actually hits zero;
		 * iput_final() is entered with the lock held. */
		if (atomic_dec_and_lock(&inode->i_count, &inode_lock))
			iput_final(inode);
	}
}

EXPORT_SYMBOL(iput);

/**
 *	bmap	- find a block number in a file
 *	@inode: inode of file
 *	@block: block to find
 *
 *	Returns the block number on the device holding the inode that
 *	is the disk block number for the block of the file requested.
 *	That is, asked for block 4 of inode 1 the function will return the
 *	disk block relative to the disk start that holds that block of the
 *	file.  Returns 0 if the mapping has no ->bmap method.
 */
sector_t bmap(struct inode * inode, sector_t block)
{
	sector_t res = 0;
	if (inode->i_mapping->a_ops->bmap)
		res = inode->i_mapping->a_ops->bmap(inode->i_mapping, block);
	return res;
}
EXPORT_SYMBOL(bmap);

/**
 *	touch_atime	-	update the access time
 *	@mnt: mount the inode is accessed on
 *	@dentry: dentry accessed
 *
 *	Update the accessed time on an inode and mark it for writeback.
 *	This function automatically handles read only file systems and media,
 *	as well as the "noatime" flag and inode specific "noatime" markers.
 */
void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;
	struct timespec now;

	/* Honour the per-inode and per-sb noatime/nodiratime settings. */
	if (inode->i_flags & S_NOATIME)
		return;
	if (IS_NOATIME(inode))
		return;
	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return;

	/*
	 * We may have a NULL vfsmount when coming from NFSD
	 */
	if (mnt) {
		if (mnt->mnt_flags & MNT_NOATIME)
			return;
		if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
			return;

		if (mnt->mnt_flags & MNT_RELATIME) {
			/*
			 * With relative atime, only update atime if the
			 * previous atime is earlier than either the ctime or
			 * mtime.
			 */
			if (timespec_compare(&inode->i_mtime,
						&inode->i_atime) < 0 &&
			    timespec_compare(&inode->i_ctime,
						&inode->i_atime) < 0)
				return;
		}
	}

	now = current_fs_time(inode->i_sb);
	/* Skip the dirtying when atime is already current. */
	if (timespec_equal(&inode->i_atime, &now))
		return;

	inode->i_atime = now;
	mark_inode_dirty_sync(inode);
}
EXPORT_SYMBOL(touch_atime);

/**
 *	file_update_time	-	update mtime and ctime time
 *	@file: file accessed
 *
 *	Update the mtime and ctime members of an inode and mark the inode
 *	for writeback.
Note that this function is meant exclusively for 1205 * usage in the file write path of filesystems, and filesystems may 1206 * choose to explicitly ignore update via this function with the 1207 * S_NOCTIME inode flag, e.g. for network filesystem where these 1208 * timestamps are handled by the server. 1209 */ 1210 1211void file_update_time(struct file *file) 1212{ 1213 struct inode *inode = file->f_path.dentry->d_inode; 1214 struct timespec now; 1215 int sync_it = 0; 1216 1217 if (IS_NOCMTIME(inode)) 1218 return; 1219 if (IS_RDONLY(inode)) 1220 return; 1221 1222 now = current_fs_time(inode->i_sb); 1223 if (!timespec_equal(&inode->i_mtime, &now)) { 1224 inode->i_mtime = now; 1225 sync_it = 1; 1226 } 1227 1228 if (!timespec_equal(&inode->i_ctime, &now)) { 1229 inode->i_ctime = now; 1230 sync_it = 1; 1231 } 1232 1233 if (sync_it) 1234 mark_inode_dirty_sync(inode); 1235} 1236 1237EXPORT_SYMBOL(file_update_time); 1238 1239int inode_needs_sync(struct inode *inode) 1240{ 1241 if (IS_SYNC(inode)) 1242 return 1; 1243 if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) 1244 return 1; 1245 return 0; 1246} 1247 1248EXPORT_SYMBOL(inode_needs_sync); 1249 1250int inode_wait(void *word) 1251{ 1252 schedule(); 1253 return 0; 1254} 1255 1256/* 1257 * If we try to find an inode in the inode hash while it is being 1258 * deleted, we have to wait until the filesystem completes its 1259 * deletion before reporting that it isn't found. This function waits 1260 * until the deletion _might_ have completed. Callers are responsible 1261 * to recheck inode state. 1262 * 1263 * It doesn't matter if I_LOCK is not set initially, a call to 1264 * wake_up_inode() after removing from the hash list will DTRT. 1265 * 1266 * This is called with inode_lock held. 
 */
static void __wait_on_freeing_inode(struct inode *inode)
{
	wait_queue_head_t *wq;
	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_LOCK);
	wq = bit_waitqueue(&inode->i_state, __I_LOCK);
	/* Queue ourselves on the waitqueue BEFORE dropping inode_lock, so
	 * we cannot miss the wake_up_inode() issued once the inode has
	 * been unhashed. */
	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&inode_lock);
	schedule();
	finish_wait(wq, &wait.wait);
	spin_lock(&inode_lock);
}

/* Wake anyone sleeping in __wait_on_freeing_inode()/I_LOCK waiters. */
void wake_up_inode(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_LOCK);
}

/*
 * We rarely want to lock two inodes that do not have a parent/child
 * relationship (such as directory, child inode) simultaneously. The
 * vast majority of file systems should be able to get along fine
 * without this. Do not use these functions except as a last resort.
 */
void inode_double_lock(struct inode *inode1, struct inode *inode2)
{
	/* Degenerate cases: at most one distinct inode to lock. */
	if (inode1 == NULL || inode2 == NULL || inode1 == inode2) {
		if (inode1)
			mutex_lock(&inode1->i_mutex);
		else if (inode2)
			mutex_lock(&inode2->i_mutex);
		return;
	}

	/* Lock in address order to avoid ABBA deadlock; tell lockdep the
	 * lower-addressed inode plays the "parent" role. */
	if (inode1 < inode2) {
		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
	} else {
		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
	}
}
EXPORT_SYMBOL(inode_double_lock);

/* Counterpart of inode_double_lock(); tolerates NULL and equal inodes. */
void inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
	if (inode1)
		mutex_unlock(&inode1->i_mutex);

	if (inode2 && inode2 != inode1)
		mutex_unlock(&inode2->i_mutex);
}
EXPORT_SYMBOL(inode_double_unlock);

/* Boot-time override for the inode hash size: "ihash_entries=N". */
static __initdata unsigned long ihash_entries;
static int __init set_ihash_entries(char *str)
{
	if (!str)
		return 0;
	ihash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
1333__setup("ihash_entries=", set_ihash_entries); 1334 1335/* 1336 * Initialize the waitqueues and inode hash table. 1337 */ 1338void __init inode_init_early(void) 1339{ 1340 int loop; 1341 1342 /* If hashes are distributed across NUMA nodes, defer 1343 * hash allocation until vmalloc space is available. 1344 */ 1345 if (hashdist) 1346 return; 1347 1348 inode_hashtable = 1349 alloc_large_system_hash("Inode-cache", 1350 sizeof(struct hlist_head), 1351 ihash_entries, 1352 14, 1353 HASH_EARLY, 1354 &i_hash_shift, 1355 &i_hash_mask, 1356 0); 1357 1358 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1359 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1360} 1361 1362void __init inode_init(unsigned long mempages) 1363{ 1364 int loop; 1365 1366 /* inode slab cache */ 1367 inode_cachep = kmem_cache_create("inode_cache", 1368 sizeof(struct inode), 1369 0, 1370 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| 1371 SLAB_MEM_SPREAD), 1372 init_once, 1373 NULL); 1374 set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); 1375 1376 /* Hash may have been set up in inode_init_early */ 1377 if (!hashdist) 1378 return; 1379 1380 inode_hashtable = 1381 alloc_large_system_hash("Inode-cache", 1382 sizeof(struct hlist_head), 1383 ihash_entries, 1384 14, 1385 0, 1386 &i_hash_shift, 1387 &i_hash_mask, 1388 0); 1389 1390 for (loop = 0; loop < (1 << i_hash_shift); loop++) 1391 INIT_HLIST_HEAD(&inode_hashtable[loop]); 1392} 1393 1394void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) 1395{ 1396 inode->i_mode = mode; 1397 if (S_ISCHR(mode)) { 1398 inode->i_fop = &def_chr_fops; 1399 inode->i_rdev = rdev; 1400 } else if (S_ISBLK(mode)) { 1401 inode->i_fop = &def_blk_fops; 1402 inode->i_rdev = rdev; 1403 } else if (S_ISFIFO(mode)) 1404 inode->i_fop = &def_fifo_fops; 1405 else if (S_ISSOCK(mode)) 1406 inode->i_fop = &bad_sock_fops; 1407 else 1408 printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o)\n", 1409 mode); 1410} 1411EXPORT_SYMBOL(init_special_inode); 1412