/*
 *	linux/mm/mmap.c
 *
 * Written by obz.
 */
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>

/*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
 */
#undef DEBUG_MM_RB

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};

int sysctl_overcommit_memory;
int max_map_count = DEFAULT_MAX_MAP_COUNT;

/* Check that a process has enough memory to allocate a
 * new virtual mapping.
 */
int vm_enough_memory(long pages)
{
	/* Stupid algorithm to decide if we have enough memory: while
	 * simple, it hopefully works in most obvious cases.. Easy to
	 * fool it, but this should catch most mistakes.
	 */
	/* 23/11/98 NJC: Somewhat less stupid version of algorithm,
	 * which tries to do "TheRightThing".  Instead of using half of
	 * (buffers+cache), use the minimum values.  Allow an extra 2%
	 * of num_physpages for safety margin.
	 */

	unsigned long free;

	/* Sometimes we want to use more memory than we have. */
	if (sysctl_overcommit_memory)
		return 1;

	/* The page cache contains buffer pages these days.. */
	free = atomic_read(&page_cache_size);
	free += nr_free_pages();
	free += nr_swap_pages;

	/*
	 * This double-counts: the nrpages are both in the page-cache
	 * and in the swapper space.  At the same time, this compensates
	 * for the swap-space over-allocation (ie "nr_swap_pages" being
	 * too small).
	 */
	free += swapper_space.nrpages;

	/*
	 * The code below doesn't account for free space in the inode
	 * and dentry slab cache, slab cache fragmentation, inodes and
	 * dentries which will become freeable under VM load, etc.
	 * Let's just hope all these (complex) factors balance out...
	 */
	free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT;
	free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT;

	return free > pages;
}

/* Remove one vm structure from the inode's i_mapping address space. */
static inline void __remove_shared_vm_struct(struct vm_area_struct *vma)
{
	struct file * file = vma->vm_file;

	if (file) {
		struct inode *inode = file->f_dentry->d_inode;
		if (vma->vm_flags & VM_DENYWRITE)
			atomic_inc(&inode->i_writecount);
		if (vma->vm_next_share)
			vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
		*vma->vm_pprev_share = vma->vm_next_share;
	}
}

static inline void remove_shared_vm_struct(struct vm_area_struct *vma)
{
	lock_vma_mappings(vma);
	__remove_shared_vm_struct(vma);
	unlock_vma_mappings(vma);
}

void lock_vma_mappings(struct vm_area_struct *vma)
{
	struct address_space *mapping;

	mapping = NULL;
	if (vma->vm_file)
		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
	if (mapping)
		spin_lock(&mapping->i_shared_lock);
}

void unlock_vma_mappings(struct vm_area_struct *vma)
{
	struct address_space *mapping;

	mapping = NULL;
	if (vma->vm_file)
		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
	if (mapping)
		spin_unlock(&mapping->i_shared_lock);
}

/*
 * sys_brk() for the most part doesn't need the global kernel
 * lock, except when an application is doing something nasty
 * like trying to un-brk an area that has already been mapped
 * to a regular file.  in this case, the unmapping will need
 * to invoke file system routines that need the global lock.
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);

	if (brk < mm->end_code)
		goto out;
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against rlimit.. */
	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
		goto out;

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Check if we have enough memory.. */
	if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}

/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
 * internally.  Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 * into "VM_xxx".
 */
static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)

	unsigned long prot_bits, flag_bits;
	prot_bits =
		_trans(prot, PROT_READ, VM_READ) |
		_trans(prot, PROT_WRITE, VM_WRITE) |
		_trans(prot, PROT_EXEC, VM_EXEC);
	flag_bits =
		_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
		_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
		_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
	return prot_bits | flag_bits;
#undef _trans
}

#ifdef DEBUG_MM_RB
static int browse_rb(rb_node_t * rb_node) {
	int i = 0;
	if (rb_node) {
		i++;
		i += browse_rb(rb_node->rb_left);
		i += browse_rb(rb_node->rb_right);
	}
	return i;
}

static void validate_mm(struct mm_struct * mm) {
	int bug = 0;
	int i = 0;
	struct vm_area_struct * tmp = mm->mmap;
	while (tmp) {
		tmp = tmp->vm_next;
		i++;
	}
	if (i != mm->map_count)
		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
	i = browse_rb(mm->mm_rb.rb_node);
	if (i != mm->map_count)
		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
	if (bug)
		BUG();
}
#else
#define validate_mm(mm) do { } while (0)
#endif

/* Like find_vma(), but also report the predecessor VMA and the rbtree
 * link/parent under which a new VMA covering addr would be inserted.
 */
static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
						struct vm_area_struct ** pprev,
						rb_node_t *** rb_link, rb_node_t ** rb_parent)
{
	struct vm_area_struct * vma;
	rb_node_t ** __rb_link, * __rb_parent, * rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;
	vma = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			vma = vma_tmp;
			if (vma_tmp->vm_start <= addr)
				return vma;
			__rb_link = &__rb_parent->rb_left;
		} else {
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return vma;
}

static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
				   rb_node_t * rb_parent)
{
	if (prev) {
		vma->vm_next = prev->vm_next;
		prev->vm_next = vma;
	} else {
		mm->mmap = vma;
		if (rb_parent)
			vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
		else
			vma->vm_next = NULL;
	}
}

static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
				 rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}

static inline void __vma_link_file(struct vm_area_struct * vma)
{
	struct file * file;

	file = vma->vm_file;
	if (file) {
		struct inode * inode = file->f_dentry->d_inode;
		struct address_space *mapping = inode->i_mapping;
		struct vm_area_struct **head;

		if (vma->vm_flags & VM_DENYWRITE)
			atomic_dec(&inode->i_writecount);

		head = &mapping->i_mmap;
		if (vma->vm_flags & VM_SHARED)
			head = &mapping->i_mmap_shared;

		/* insert vma into inode's share list */
		if ((vma->vm_next_share = *head) != NULL)
			(*head)->vm_pprev_share = &vma->vm_next_share;
		*head = vma;
		vma->vm_pprev_share = head;
	}
}

static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
		       rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	__vma_link_file(vma);
}

static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
			    rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	lock_vma_mappings(vma);
	spin_lock(&mm->page_table_lock);
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(vma);

	mm->map_count++;
	validate_mm(mm);
}

/* Try to absorb the new [addr, end) range into a neighbouring VMA with
 * compatible flags instead of creating a new one; returns 1 on success.
 */
static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
		     rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
{
	spinlock_t * lock = &mm->page_table_lock;
	if (!prev) {
		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
		goto merge_next;
	}
	if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) {
		struct vm_area_struct * next;

		spin_lock(lock);
		prev->vm_end = end;
		next = prev->vm_next;
		if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) {
			prev->vm_end = next->vm_end;
			__vma_unlink(mm, next, prev);
			spin_unlock(lock);

			mm->map_count--;
			kmem_cache_free(vm_area_cachep, next);
			return 1;
		}
		spin_unlock(lock);
		return 1;
	}

	prev = prev->vm_next;
	if (prev) {
 merge_next:
		if (!can_vma_merge(prev, vm_flags))
			return 0;
		if (end == prev->vm_start) {
			spin_lock(lock);
			prev->vm_start = addr;
			spin_unlock(lock);
			return 1;
		}
	}

	return 0;
}

unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned int vm_flags;
	int correct_wcount = 0;
	int error;
	rb_node_t ** rb_link, * rb_parent;

	if (file && (!file->f_op || !file->f_op->mmap))
		return -ENODEV;

	if ((len = PAGE_ALIGN(len)) == 0)
		return addr;

	if (len > TASK_SIZE)
		return -EINVAL;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EINVAL;

	/* Too many mappings? */
	if (mm->map_count > max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	/* mlock MCL_FUTURE? */
	if (vm_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	if (file) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
				return -EACCES;

			/* Make sure we don't allow writing to an append-only file.. */
			if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/* make sure there are no mandatory locks on the file. */
			if (locks_verify_locked(file->f_dentry->d_inode))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			break;

		default:
			return -EINVAL;
		}
	} else {
		vm_flags |= VM_SHARED | VM_MAYSHARE;
		switch (flags & MAP_TYPE) {
		default:
			return -EINVAL;
		case MAP_PRIVATE:
			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
			/* fall through */
		case MAP_SHARED:
			break;
		}
	}

	/* Clear old maps */
munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	/* Private writable mapping? Check memory availability.. */
	if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
	    !(flags & MAP_NORESERVE) &&
	    !vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	/* Can we just expand an old anonymous mapping? */
	if (!file && !(vm_flags & VM_SHARED) && rb_parent)
		if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
			goto out;

	/* Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = pgoff;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;
	vma->vm_raend = 0;

	if (file) {
		error = -EINVAL;
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;
		get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
	} else if (flags & MAP_SHARED) {
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	if (addr != vma->vm_start) {
		struct vm_area_struct * stale_vma;
		/* Since addr changed, we rely on the mmap op to prevent
		 * collisions with existing vmas and just use find_vma_prepare
		 * to update the tree pointers.
		 */
		addr = vma->vm_start;
		stale_vma = find_vma_prepare(mm, addr, &prev,
					     &rb_link, &rb_parent);
		/*
		 * Make sure the lowlevel driver did its job right.
		 */
		if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) {
			printk(KERN_ERR "buggy mmap operation: [<%p>]\n",
				file ? file->f_op->mmap : NULL);
			BUG();
		}
	}

	vma_link(mm, vma, prev, rb_link, rb_parent);
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);

out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
#ifdef CONFIG_HND_BMIPS3300_PROF
	if (vm_flags & VM_EXEC) {
		extern void sb1250_prof_mm_changed(struct task_struct *task, int sem);
		sb1250_prof_mm_changed(current, 1);
	}
#endif /* CONFIG_HND_BMIPS3300_PROF */
	return addr;

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
	return error;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA
static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct vm_area_struct *vma;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(current->mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);

	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr)
			return -ENOMEM;
		if (!vma || addr + len <= vma->vm_start)
			return addr;
		addr = vma->vm_end;
	}
}
#else
extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
#endif

unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
	if (flags & MAP_FIXED) {
		if (addr > TASK_SIZE - len)
			return -ENOMEM;
		if (addr & ~PAGE_MASK)
			return -EINVAL;
		return addr;
	}

	if (file && file->f_op && file->f_op->get_unmapped_area)
		return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags);

	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}

/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			rb_node_t * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
				      struct vm_area_struct **pprev)
{
	if (mm) {
		/* Go through the RB tree quickly. */
		struct vm_area_struct * vma;
		rb_node_t * rb_node, * rb_last_right, * rb_prev;

		rb_node = mm->mm_rb.rb_node;
		rb_last_right = rb_prev = NULL;
		vma = NULL;

		while (rb_node) {
			struct vm_area_struct * vma_tmp;

			vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

			if (vma_tmp->vm_end > addr) {
				vma = vma_tmp;
				rb_prev = rb_last_right;
				if (vma_tmp->vm_start <= addr)
					break;
				rb_node = rb_node->rb_left;
			} else {
				rb_last_right = rb_node;
				rb_node = rb_node->rb_right;
			}
		}
		if (vma) {
			if (vma->vm_rb.rb_left) {
				rb_prev = vma->vm_rb.rb_left;
				while (rb_prev->rb_right)
					rb_prev = rb_prev->rb_right;
			}
			*pprev = NULL;
			if (rb_prev)
				*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
			if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma)
				BUG();
			return vma;
		}
	}
	*pprev = NULL;
	return NULL;
}

struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct * vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(mm,addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED) {
		make_pages_present(addr, start);
	}
	return vma;
}

/* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
 * This function works out what part of an area is affected and
 * adjusts the mapping information.  Since the actual page
 * manipulation is done in do_mmap(), none need be done here,
 * though it would probably be more appropriate.
 *
 * By the time this function is called, the area struct has been
 * removed from the process mapping list, so it needs to be
 * reinserted if necessary.
 *
 * The 4 main cases are:
 *	Unmapping the whole area
 *	Unmapping from the start of the segment to a point in it
 *	Unmapping from an intermediate point to the end
 *	Unmapping between two intermediate points, making a hole.
 *
 * Case 4 involves the creation of 2 new areas, for each side of
 * the hole.  If possible, we reuse the existing area rather than
 * allocate a new one, and the return indicates whether the old
 * area was reused.
 */
static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
	struct vm_area_struct *area, unsigned long addr, size_t len,
	struct vm_area_struct *extra)
{
	struct vm_area_struct *mpnt;
	unsigned long end = addr + len;

	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
	if (area->vm_flags & VM_LOCKED)
		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;

	/* Unmapping the whole area. */
	if (addr == area->vm_start && end == area->vm_end) {
		if (area->vm_ops && area->vm_ops->close)
			area->vm_ops->close(area);
		if (area->vm_file)
			fput(area->vm_file);
		kmem_cache_free(vm_area_cachep, area);
		return extra;
	}

	/* Work out to one of the ends. */
	if (end == area->vm_end) {
		/*
		 * here area isn't visible to the semaphore-less readers
		 * so we don't need to update it under the spinlock.
		 */
		area->vm_end = addr;
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
	} else if (addr == area->vm_start) {
		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
		/* same locking considerations of the above case */
		area->vm_start = end;
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
	} else {
		/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
		/* Add end mapping -- leave beginning for below */
		mpnt = extra;
		extra = NULL;

		mpnt->vm_mm = area->vm_mm;
		mpnt->vm_start = end;
		mpnt->vm_end = area->vm_end;
		mpnt->vm_page_prot = area->vm_page_prot;
		mpnt->vm_flags = area->vm_flags;
		mpnt->vm_raend = 0;
		mpnt->vm_ops = area->vm_ops;
		mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT);
		mpnt->vm_file = area->vm_file;
		mpnt->vm_private_data = area->vm_private_data;
		if (mpnt->vm_file)
			get_file(mpnt->vm_file);
		if (mpnt->vm_ops && mpnt->vm_ops->open)
			mpnt->vm_ops->open(mpnt);
		area->vm_end = addr;	/* Truncate area */

		/* Because mpnt->vm_file == area->vm_file this locks
		 * things correctly.
		 */
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
		__insert_vm_struct(mm, mpnt);
	}

	__insert_vm_struct(mm, area);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(area);
	return extra;
}

/*
 * Try to free as many page directory entries as we can,
 * without having to work very hard at actually scanning
 * the page tables themselves.
 *
 * Right now we try to free page tables if we have a nice
 * PGDIR-aligned area that got free'd up.  We could be more
 * granular if we want to, but this is fast and simple,
 * and covers the bad cases.
 *
 * "prev", if it exists, points to a vma before the one
 * we just free'd - but there's no telling how much before.
 */
static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
	unsigned long start, unsigned long end)
{
	unsigned long first = start & PGDIR_MASK;
	unsigned long last = end + PGDIR_SIZE - 1;
	unsigned long start_index, end_index;

	if (!prev) {
		prev = mm->mmap;
		if (!prev)
			goto no_mmaps;
		if (prev->vm_end > start) {
			if (last > prev->vm_start)
				last = prev->vm_start;
			goto no_mmaps;
		}
	}
	for (;;) {
		struct vm_area_struct *next = prev->vm_next;

		if (next) {
			if (next->vm_start < start) {
				prev = next;
				continue;
			}
			if (last > next->vm_start)
				last = next->vm_start;
		}
		if (prev->vm_end > first)
			first = prev->vm_end + PGDIR_SIZE - 1;
		break;
	}
no_mmaps:
	/*
	 * If the PGD bits are not consecutive in the virtual address, the
	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
	 */
	start_index = pgd_index(first);
	end_index = pgd_index(last);
	if (end_index > start_index) {
		clear_page_tables(mm, start_index, end_index - start_index);
		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
	}
}

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardine <jeremy@sw.oz.au>
 */
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
	struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;

	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
		return -EINVAL;

	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/* Check if this memory area is ok - put it on the temporary
	 * list if so..  The checks here are pretty simple --
	 * every area affected in some way (by any overlap) is put
	 * on the list.  If nothing is put on, nothing is affected.
	 */
	mpnt = find_vma_prev(mm, addr, &prev);
	if (!mpnt)
		return 0;
	/* we have addr < mpnt->vm_end */

	if (mpnt->vm_start >= addr+len)
		return 0;

	/* If we'll make "hole", check the vm areas limit */
	if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len)
	    && mm->map_count >= max_map_count)
		return -ENOMEM;

	/*
	 * We may need one additional vma to fix up the mappings ...
	 * and this is the last chance for an easy error exit.
	 */
	extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!extra)
		return -ENOMEM;

	npp = (prev ? &prev->vm_next : &mm->mmap);
	free = NULL;
	spin_lock(&mm->page_table_lock);
	for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
		*npp = mpnt->vm_next;
		mpnt->vm_next = free;
		free = mpnt;
		rb_erase(&mpnt->vm_rb, &mm->mm_rb);
	}
	mm->mmap_cache = NULL;	/* Kill the cache. */
	spin_unlock(&mm->page_table_lock);

	/* Ok - we have the memory areas we should free on the 'free' list,
	 * so release them, and unmap the page range..
	 * If one of the segments is only being partially unmapped,
	 * it will put new vm_area_struct(s) into the address space.
	 * In that case we have to be careful with VM_DENYWRITE.
	 */
	while ((mpnt = free) != NULL) {
		unsigned long st, end, size;
		struct file *file = NULL;

		free = free->vm_next;

		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
		end = addr+len;
		end = end > mpnt->vm_end ? mpnt->vm_end : end;
		size = end - st;

		if (mpnt->vm_flags & VM_DENYWRITE &&
		    (st != mpnt->vm_start || end != mpnt->vm_end) &&
		    (file = mpnt->vm_file) != NULL) {
			atomic_dec(&file->f_dentry->d_inode->i_writecount);
		}
		remove_shared_vm_struct(mpnt);
		mm->map_count--;

		zap_page_range(mm, st, size);

		/*
		 * Fix the mapping, and free the old area if it wasn't reused.
		 */
		extra = unmap_fixup(mm, mpnt, st, size, extra);
		if (file)
			atomic_inc(&file->f_dentry->d_inode->i_writecount);
	}
	validate_mm(mm);

	/* Release the extra vma struct if it wasn't used */
	if (extra)
		kmem_cache_free(vm_area_cachep, extra);

	free_pgtables(mm, prev, addr, addr+len);

	return 0;
}

asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
	int ret;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);
	ret = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);
	return ret;
}

/*
 * this is really a simplified "do_mmap".  it only handles
 * anonymous maps.  eventually we may be able to do some
 * brk-specific accounting here.
 */
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned long flags;
	rb_node_t ** rb_link, * rb_parent;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	/*
	 * mlock MCL_FUTURE?
	 */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	/*
	 * Clear old maps.  this also does some error checking for us
	 */
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	if (mm->map_count > max_map_count)
		return -ENOMEM;

	if (!vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags;

	/* Can we just expand an old anonymous mapping? */
	if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags))
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = flags;
	vma->vm_page_prot = protection_map[flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = 0;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	vma_link(mm, vma, prev, rb_link, rb_parent);

out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;
}

/* Build the RB tree corresponding to the VMA list. */
void build_mmap_rb(struct mm_struct * mm)
{
	struct vm_area_struct * vma;
	rb_node_t ** rb_link, * rb_parent;

	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		__vma_link_rb(mm, vma, rb_link, rb_parent);
		rb_parent = &vma->vm_rb;
		rb_link = &rb_parent->rb_right;
	}
}

/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt;

	release_segments(mm);
	spin_lock(&mm->page_table_lock);
	mpnt = mm->mmap;
	mm->mmap = mm->mmap_cache = NULL;
	mm->mm_rb = RB_ROOT;
	mm->rss = 0;
	spin_unlock(&mm->page_table_lock);
	mm->total_vm = 0;
	mm->locked_vm = 0;

	flush_cache_mm(mm);
	while (mpnt) {
		struct vm_area_struct * next = mpnt->vm_next;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long size = end - start;

		if (mpnt->vm_ops) {
			if (mpnt->vm_ops->close)
				mpnt->vm_ops->close(mpnt);
		}
		mm->map_count--;
		remove_shared_vm_struct(mpnt);
		zap_page_range(mm, start, size);
		if (mpnt->vm_file)
			fput(mpnt->vm_file);
		kmem_cache_free(vm_area_cachep, mpnt);
		mpnt = next;
	}

	/* This is just debugging */
	if (mm->map_count)
		BUG();

	clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);

	flush_tlb_mm(mm);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap ring.  If vm_file is non-NULL
 * then the i_shared_lock must be held here.
 */
void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	rb_node_t ** rb_link, * rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
	validate_mm(mm);
}

void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	rb_node_t ** rb_link, * rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	vma_link(mm, vma, prev, rb_link, rb_parent);
	validate_mm(mm);
}