/*
 *	linux/mm/mmap.c
 *
 * Written by obz.
 */
#include <linux/slab.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>

#include <asm/uaccess.h>
#include <asm/pgalloc.h>

/*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
 */
#undef DEBUG_MM_RB

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type	prot
 *		PROT_NONE	PROT_READ	PROT_WRITE	PROT_EXEC
 * MAP_SHARED	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (yes) yes	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 * MAP_PRIVATE	r: (no) no	r: (yes) yes	r: (no) yes	r: (no) yes
 *		w: (no) no	w: (no) no	w: (copy) copy	w: (no) no
 *		x: (no) no	x: (no) yes	x: (no) yes	x: (yes) yes
 *
 */
pgprot_t protection_map[16] = {
	__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
	__S000, __S001, __S010, __S011, __S100, __S101, __S110, __S111
};
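
/* protection_map is indexed by the low four vm_flags bits (VM_READ,
 * VM_WRITE, VM_EXEC, VM_SHARED), matching the "vm_flags & 0x0f" lookups
 * below: for example a private read/write mapping (VM_READ|VM_WRITE)
 * picks entry 3, __P011, while the same protections on a shared mapping
 * pick __S011.
 */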

int sysctl_overcommit_memory;
int max_map_count = DEFAULT_MAX_MAP_COUNT;

/* Check that a process has enough memory to allocate a
 * new virtual mapping.
 */
int vm_enough_memory(long pages)
{
	/* Stupid algorithm to decide if we have enough memory: while
	 * simple, it hopefully works in most obvious cases.. Easy to
	 * fool it, but this should catch most mistakes.
	 */
	/* 23/11/98 NJC: Somewhat less stupid version of algorithm,
	 * which tries to do "TheRightThing".  Instead of using half of
	 * (buffers+cache), use the minimum values.  Allow an extra 2%
	 * of num_physpages for safety margin.
	 */

	unsigned long free;

	/* Sometimes we want to use more memory than we have. */
	if (sysctl_overcommit_memory)
		return 1;

	/* The page cache contains buffer pages these days.. */
	free = atomic_read(&page_cache_size);
	free += nr_free_pages();
	free += nr_swap_pages;

	/*
	 * This double-counts: the nrpages are both in the page-cache
	 * and in the swapper space. At the same time, this compensates
	 * for the swap-space over-allocation (ie "nr_swap_pages" being
	 * too small).
	 */
	free += swapper_space.nrpages;

	/*
	 * The code below doesn't account for free space in the inode
	 * and dentry slab cache, slab cache fragmentation, inodes and
	 * dentries which will become freeable under VM load, etc.
	 * Let's just hope all these (complex) factors balance out...
	 */
	free += (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT;
	free += (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT;

	return free > pages;
}

/* Remove one vm structure from the inode's i_mapping address space. */
static inline void __remove_shared_vm_struct(struct vm_area_struct *vma)
{
	struct file * file = vma->vm_file;

	if (file) {
		struct inode *inode = file->f_dentry->d_inode;
		if (vma->vm_flags & VM_DENYWRITE)
			atomic_inc(&inode->i_writecount);
		if (vma->vm_next_share)
			vma->vm_next_share->vm_pprev_share = vma->vm_pprev_share;
		*vma->vm_pprev_share = vma->vm_next_share;
	}
}

static inline void remove_shared_vm_struct(struct vm_area_struct *vma)
{
	lock_vma_mappings(vma);
	__remove_shared_vm_struct(vma);
	unlock_vma_mappings(vma);
}

void lock_vma_mappings(struct vm_area_struct *vma)
{
	struct address_space *mapping;

	mapping = NULL;
	if (vma->vm_file)
		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
	if (mapping)
		spin_lock(&mapping->i_shared_lock);
}

void unlock_vma_mappings(struct vm_area_struct *vma)
{
	struct address_space *mapping;

	mapping = NULL;
	if (vma->vm_file)
		mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
	if (mapping)
		spin_unlock(&mapping->i_shared_lock);
}

/*
 *  sys_brk() for the most part doesn't need the global kernel
 *  lock, except when an application is doing something nasty
 *  like trying to un-brk an area that has already been mapped
 *  to a regular file.  In this case, the unmapping will need
 *  to invoke file system routines that need the global lock.
 */
asmlinkage unsigned long sys_brk(unsigned long brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);

	if (brk < mm->end_code)
		goto out;
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/* Always allow shrinking brk. */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against rlimit.. */
	rlim = current->rlim[RLIMIT_DATA].rlim_cur;
	if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim)
		goto out;

	/* Check against existing mmap mappings. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Check if we have enough memory.. */
	if (!vm_enough_memory((newbrk-oldbrk) >> PAGE_SHIFT))
		goto out;

	/* Ok, looks good - let it rip. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}

/* Combine the mmap "prot" and "flags" argument into one "vm_flags" used
 * internally. Essentially, translate the "PROT_xxx" and "MAP_xxx" bits
 * into "VM_xxx".
 */
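/* For example, calc_vm_flags(PROT_READ|PROT_WRITE, MAP_PRIVATE) comes out
 * as VM_READ | VM_WRITE: each PROT_/MAP_ bit is mapped onto the matching
 * VM_ bit by _trans(), which passes the bit straight through whenever the
 * two flag values happen to be identical.
 */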
static inline unsigned long calc_vm_flags(unsigned long prot, unsigned long flags)
{
#define _trans(x,bit1,bit2) \
((bit1==bit2)?(x&bit1):(x&bit1)?bit2:0)

	unsigned long prot_bits, flag_bits;
	prot_bits =
		_trans(prot, PROT_READ, VM_READ) |
		_trans(prot, PROT_WRITE, VM_WRITE) |
		_trans(prot, PROT_EXEC, VM_EXEC);
	flag_bits =
		_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN) |
		_trans(flags, MAP_DENYWRITE, VM_DENYWRITE) |
		_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE);
	return prot_bits | flag_bits;
#undef _trans
}

#ifdef DEBUG_MM_RB
static int browse_rb(rb_node_t * rb_node) {
	int i = 0;
	if (rb_node) {
		i++;
		i += browse_rb(rb_node->rb_left);
		i += browse_rb(rb_node->rb_right);
	}
	return i;
}

static void validate_mm(struct mm_struct * mm) {
	int bug = 0;
	int i = 0;
	struct vm_area_struct * tmp = mm->mmap;
	while (tmp) {
		tmp = tmp->vm_next;
		i++;
	}
	if (i != mm->map_count)
		printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1;
	i = browse_rb(mm->mm_rb.rb_node);
	if (i != mm->map_count)
		printk("map_count %d rb %d\n", mm->map_count, i), bug = 1;
	if (bug)
		BUG();
}
#else
#define validate_mm(mm) do { } while (0)
#endif

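/* Walk the rbtree for the first vma ending above addr.  On return,
 * *pprev points to the vma immediately preceding that position (or is
 * NULL) and *rb_link/*rb_parent identify the tree slot where a vma
 * starting at addr would be linked in.  If an existing vma already
 * contains addr it is returned straight away without the link
 * information being filled in; callers only rely on it once the range
 * is known to be free.
 */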
static struct vm_area_struct * find_vma_prepare(struct mm_struct * mm, unsigned long addr,
						struct vm_area_struct ** pprev,
						rb_node_t *** rb_link, rb_node_t ** rb_parent)
{
	struct vm_area_struct * vma;
	rb_node_t ** __rb_link, * __rb_parent, * rb_prev;

	__rb_link = &mm->mm_rb.rb_node;
	rb_prev = __rb_parent = NULL;
	vma = NULL;

	while (*__rb_link) {
		struct vm_area_struct *vma_tmp;

		__rb_parent = *__rb_link;
		vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);

		if (vma_tmp->vm_end > addr) {
			vma = vma_tmp;
			if (vma_tmp->vm_start <= addr)
				return vma;
			__rb_link = &__rb_parent->rb_left;
		} else {
			rb_prev = __rb_parent;
			__rb_link = &__rb_parent->rb_right;
		}
	}

	*pprev = NULL;
	if (rb_prev)
		*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
	*rb_link = __rb_link;
	*rb_parent = __rb_parent;
	return vma;
}

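/* A vma is kept on up to three structures at once: the per-mm sorted
 * list (mm->mmap), the per-mm rbtree (mm->mm_rb) and, for file-backed
 * mappings, the inode's i_mmap/i_mmap_shared share list.  The
 * __vma_link_* helpers below add a vma to each of these; __vma_link()
 * does all three and leaves the locking to its callers.
 */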
static inline void __vma_link_list(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
				   rb_node_t * rb_parent)
{
	if (prev) {
		vma->vm_next = prev->vm_next;
		prev->vm_next = vma;
	} else {
		mm->mmap = vma;
		if (rb_parent)
			vma->vm_next = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
		else
			vma->vm_next = NULL;
	}
}

static inline void __vma_link_rb(struct mm_struct * mm, struct vm_area_struct * vma,
				 rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
}

static inline void __vma_link_file(struct vm_area_struct * vma)
{
	struct file * file;

	file = vma->vm_file;
	if (file) {
		struct inode * inode = file->f_dentry->d_inode;
		struct address_space *mapping = inode->i_mapping;
		struct vm_area_struct **head;

		if (vma->vm_flags & VM_DENYWRITE)
			atomic_dec(&inode->i_writecount);

		head = &mapping->i_mmap;
		if (vma->vm_flags & VM_SHARED)
			head = &mapping->i_mmap_shared;

		/* insert vma into inode's share list */
		if ((vma->vm_next_share = *head) != NULL)
			(*head)->vm_pprev_share = &vma->vm_next_share;
		*head = vma;
		vma->vm_pprev_share = head;
	}
}

static void __vma_link(struct mm_struct * mm, struct vm_area_struct * vma,  struct vm_area_struct * prev,
		       rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	__vma_link_list(mm, vma, prev, rb_parent);
	__vma_link_rb(mm, vma, rb_link, rb_parent);
	__vma_link_file(vma);
}

static inline void vma_link(struct mm_struct * mm, struct vm_area_struct * vma, struct vm_area_struct * prev,
			    rb_node_t ** rb_link, rb_node_t * rb_parent)
{
	lock_vma_mappings(vma);
	spin_lock(&mm->page_table_lock);
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(vma);

	mm->map_count++;
	validate_mm(mm);
}

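/* Try to satisfy a new anonymous area [addr, end) by extending an
 * existing vma instead of allocating a new one: grow a compatible
 * predecessor forwards (absorbing the follower too if the gap closes
 * completely) or a compatible follower backwards.  Returns 1 if the
 * range was merged, 0 if the caller still needs a fresh vma.
 */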
static int vma_merge(struct mm_struct * mm, struct vm_area_struct * prev,
		     rb_node_t * rb_parent, unsigned long addr, unsigned long end, unsigned long vm_flags)
{
	spinlock_t * lock = &mm->page_table_lock;
	if (!prev) {
		prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb);
		goto merge_next;
	}
	if (prev->vm_end == addr && can_vma_merge(prev, vm_flags)) {
		struct vm_area_struct * next;

		spin_lock(lock);
		prev->vm_end = end;
		next = prev->vm_next;
		if (next && prev->vm_end == next->vm_start && can_vma_merge(next, vm_flags)) {
			prev->vm_end = next->vm_end;
			__vma_unlink(mm, next, prev);
			spin_unlock(lock);

			mm->map_count--;
			kmem_cache_free(vm_area_cachep, next);
			return 1;
		}
		spin_unlock(lock);
		return 1;
	}

	prev = prev->vm_next;
	if (prev) {
 merge_next:
		if (!can_vma_merge(prev, vm_flags))
			return 0;
		if (end == prev->vm_start) {
			spin_lock(lock);
			prev->vm_start = addr;
			spin_unlock(lock);
			return 1;
		}
	}

	return 0;
}

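/* The real work behind mmap(): validate the arguments, pick an address
 * with get_unmapped_area(), clear away any old mappings in the range,
 * then either merge the range into an existing anonymous vma or set up
 * a new vm_area_struct and, for file mappings, hand it to the file's
 * mmap operation.  Callers are expected to hold mm->mmap_sem.
 */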
unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
	unsigned long prot, unsigned long flags, unsigned long pgoff)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned int vm_flags;
	int correct_wcount = 0;
	int error;
	rb_node_t ** rb_link, * rb_parent;

	if (file && (!file->f_op || !file->f_op->mmap))
		return -ENODEV;

	if ((len = PAGE_ALIGN(len)) == 0)
		return addr;

	if (len > TASK_SIZE)
		return -EINVAL;

	/* offset overflow? */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EINVAL;

	/* Too many mappings? */
	if (mm->map_count > max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (addr & ~PAGE_MASK)
		return addr;

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	/* mlock MCL_FUTURE? */
	if (vm_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	if (file) {
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if ((prot & PROT_WRITE) && !(file->f_mode & FMODE_WRITE))
				return -EACCES;

			/* Make sure we don't allow writing to an append-only file.. */
			if (IS_APPEND(file->f_dentry->d_inode) && (file->f_mode & FMODE_WRITE))
				return -EACCES;

			/* make sure there are no mandatory locks on the file. */
			if (locks_verify_locked(file->f_dentry->d_inode))
				return -EAGAIN;

			vm_flags |= VM_SHARED | VM_MAYSHARE;
			if (!(file->f_mode & FMODE_WRITE))
				vm_flags &= ~(VM_MAYWRITE | VM_SHARED);

			/* fall through */
		case MAP_PRIVATE:
			if (!(file->f_mode & FMODE_READ))
				return -EACCES;
			break;

		default:
			return -EINVAL;
		}
	} else {
		vm_flags |= VM_SHARED | VM_MAYSHARE;
		switch (flags & MAP_TYPE) {
		default:
			return -EINVAL;
		case MAP_PRIVATE:
			vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
			/* fall through */
		case MAP_SHARED:
			break;
		}
	}

	/* Clear old maps */
munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limit. */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	/* Private writable mapping? Check memory availability.. */
	if ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
	    !(flags & MAP_NORESERVE)				 &&
	    !vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	/* Can we just expand an old anonymous mapping? */
	if (!file && !(vm_flags & VM_SHARED) && rb_parent)
		if (vma_merge(mm, prev, rb_parent, addr, addr + len, vm_flags))
			goto out;

	/* Determine the object being mapped and call the appropriate
	 * specific mapper.  The address has already been validated and
	 * any old mappings overlapping the range have been removed from
	 * the list above.
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = protection_map[vm_flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = pgoff;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;
	vma->vm_raend = 0;

	if (file) {
		error = -EINVAL;
		if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
			goto free_vma;
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
			correct_wcount = 1;
		}
		vma->vm_file = file;
		get_file(file);
		error = file->f_op->mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;
	} else if (flags & MAP_SHARED) {
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	}

	/* Can addr have changed??
	 *
	 * Answer: Yes, several device drivers can do it in their
	 *         f_op->mmap method. -DaveM
	 */
	if (addr != vma->vm_start) {
		struct vm_area_struct * stale_vma;
		/* Since addr changed, we rely on the mmap op to prevent
		 * collisions with existing vmas and just use find_vma_prepare
		 * to update the tree pointers.
		 */
		addr = vma->vm_start;
		stale_vma = find_vma_prepare(mm, addr, &prev,
						&rb_link, &rb_parent);
		/*
		 * Make sure the lowlevel driver did its job right.
		 */
		if (unlikely(stale_vma && stale_vma->vm_start < vma->vm_end)) {
			printk(KERN_ERR "buggy mmap operation: [<%p>]\n",
				file ? file->f_op->mmap : NULL);
			BUG();
		}
	}

	vma_link(mm, vma, prev, rb_link, rb_parent);
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);

out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (vm_flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
#ifdef CONFIG_HND_BMIPS3300_PROF
	if (vm_flags & VM_EXEC) {
		extern void sb1250_prof_mm_changed(struct task_struct *task, int sem);
		sb1250_prof_mm_changed(current, 1);
	}
#endif	/* CONFIG_HND_BMIPS3300_PROF */
	return addr;

unmap_and_free_vma:
	if (correct_wcount)
		atomic_inc(&file->f_dentry->d_inode->i_writecount);
	vma->vm_file = NULL;
	fput(file);

	/* Undo any partial mapping done by a device driver. */
	zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
free_vma:
	kmem_cache_free(vm_area_cachep, vma);
	return error;
}

/* Get an address range which is currently unmapped.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * Return value with the low bits set means error value,
 * ie
 *	if (ret & ~PAGE_MASK)
 *		error = ret;
 *
 * This function "knows" that -ENOMEM has the bits set.
 */
#ifndef HAVE_ARCH_UNMAPPED_AREA
static inline unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct vm_area_struct *vma;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(current->mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start))
			return addr;
	}
	addr = PAGE_ALIGN(TASK_UNMAPPED_BASE);

	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
		/* At this point:  (!vma || addr < vma->vm_end). */
		if (TASK_SIZE - len < addr)
			return -ENOMEM;
		if (!vma || addr + len <= vma->vm_start)
			return addr;
		addr = vma->vm_end;
	}
}
#else
extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
#endif

unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags)
{
	if (flags & MAP_FIXED) {
		if (addr > TASK_SIZE - len)
			return -ENOMEM;
		if (addr & ~PAGE_MASK)
			return -EINVAL;
		return addr;
	}

	if (file && file->f_op && file->f_op->get_unmapped_area)
		return file->f_op->get_unmapped_area(file, addr, len, pgoff, flags);

	return arch_get_unmapped_area(file, addr, len, pgoff, flags);
}

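/* The lookups below walk mm->mmap and mm->mm_rb without taking any lock
 * themselves; the caller has to keep the vma set stable, typically by
 * holding mm->mmap_sem.
 */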
/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			rb_node_t * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */
struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
				      struct vm_area_struct **pprev)
{
	if (mm) {
		/* Go through the RB tree quickly. */
		struct vm_area_struct * vma;
		rb_node_t * rb_node, * rb_last_right, * rb_prev;

		rb_node = mm->mm_rb.rb_node;
		rb_last_right = rb_prev = NULL;
		vma = NULL;

		while (rb_node) {
			struct vm_area_struct * vma_tmp;

			vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb);

			if (vma_tmp->vm_end > addr) {
				vma = vma_tmp;
				rb_prev = rb_last_right;
				if (vma_tmp->vm_start <= addr)
					break;
				rb_node = rb_node->rb_left;
			} else {
				rb_last_right = rb_node;
				rb_node = rb_node->rb_right;
			}
		}
		if (vma) {
			if (vma->vm_rb.rb_left) {
				rb_prev = vma->vm_rb.rb_left;
				while (rb_prev->rb_right)
					rb_prev = rb_prev->rb_right;
			}
			*pprev = NULL;
			if (rb_prev)
				*pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
			if ((rb_prev ? (*pprev)->vm_next : mm->mmap) != vma)
				BUG();
			return vma;
		}
	}
	*pprev = NULL;
	return NULL;
}

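/* Like find_vma(), but if addr falls just below a VM_GROWSDOWN vma the
 * vma is grown downwards via expand_stack() to cover it; for VM_LOCKED
 * vmas the newly exposed pages are faulted in straight away.  Returns
 * NULL if the address cannot be covered.
 */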
struct vm_area_struct * find_extend_vma(struct mm_struct * mm, unsigned long addr)
{
	struct vm_area_struct * vma;
	unsigned long start;

	addr &= PAGE_MASK;
	vma = find_vma(mm,addr);
	if (!vma)
		return NULL;
	if (vma->vm_start <= addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
	if (vma->vm_flags & VM_LOCKED) {
		make_pages_present(addr, start);
	}
	return vma;
}

/* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
 * This function works out what part of an area is affected and
 * adjusts the mapping information.  Since the actual page
 * manipulation is done in do_munmap(), none need be done here,
 * though it would probably be more appropriate.
 *
 * By the time this function is called, the area struct has been
 * removed from the process mapping list, so it needs to be
 * reinserted if necessary.
 *
 * The 4 main cases are:
 *    Unmapping the whole area
 *    Unmapping from the start of the segment to a point in it
 *    Unmapping from an intermediate point to the end
 *    Unmapping between two intermediate points, making a hole.
 *
 * Case 4 leaves 2 areas, one on each side of the hole.  If possible,
 * we reuse the existing area rather than allocate a new one, and the
 * return value tells the caller whether the preallocated "extra" vma
 * is still available.
 */
static struct vm_area_struct * unmap_fixup(struct mm_struct *mm,
	struct vm_area_struct *area, unsigned long addr, size_t len,
	struct vm_area_struct *extra)
{
	struct vm_area_struct *mpnt;
	unsigned long end = addr + len;

	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
	if (area->vm_flags & VM_LOCKED)
		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;

	/* Unmapping the whole area. */
	if (addr == area->vm_start && end == area->vm_end) {
		if (area->vm_ops && area->vm_ops->close)
			area->vm_ops->close(area);
		if (area->vm_file)
			fput(area->vm_file);
		kmem_cache_free(vm_area_cachep, area);
		return extra;
	}

	/* Work out to one of the ends. */
	if (end == area->vm_end) {
		/*
		 * here area isn't visible to the semaphore-less readers
		 * so we don't need to update it under the spinlock.
		 */
		area->vm_end = addr;
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
	} else if (addr == area->vm_start) {
		area->vm_pgoff += (end - area->vm_start) >> PAGE_SHIFT;
		/* same locking considerations as the above case */
		area->vm_start = end;
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
	} else {
	/* Unmapping a hole: area->vm_start < addr <= end < area->vm_end */
		/* Add end mapping -- leave beginning for below */
		mpnt = extra;
		extra = NULL;

		mpnt->vm_mm = area->vm_mm;
		mpnt->vm_start = end;
		mpnt->vm_end = area->vm_end;
		mpnt->vm_page_prot = area->vm_page_prot;
		mpnt->vm_flags = area->vm_flags;
		mpnt->vm_raend = 0;
		mpnt->vm_ops = area->vm_ops;
		mpnt->vm_pgoff = area->vm_pgoff + ((end - area->vm_start) >> PAGE_SHIFT);
		mpnt->vm_file = area->vm_file;
		mpnt->vm_private_data = area->vm_private_data;
		if (mpnt->vm_file)
			get_file(mpnt->vm_file);
		if (mpnt->vm_ops && mpnt->vm_ops->open)
			mpnt->vm_ops->open(mpnt);
		area->vm_end = addr;	/* Truncate area */

		/* Because mpnt->vm_file == area->vm_file this locks
		 * things correctly.
		 */
		lock_vma_mappings(area);
		spin_lock(&mm->page_table_lock);
		__insert_vm_struct(mm, mpnt);
	}

	__insert_vm_struct(mm, area);
	spin_unlock(&mm->page_table_lock);
	unlock_vma_mappings(area);
	return extra;
}

/*
 * Try to free as many page directory entries as we can,
 * without having to work very hard at actually scanning
 * the page tables themselves.
 *
 * Right now we try to free page tables if we have a nice
 * PGDIR-aligned area that got free'd up. We could be more
 * granular if we want to, but this is fast and simple,
 * and covers the bad cases.
 *
 * "prev", if it exists, points to a vma before the one
 * we just free'd - but there's no telling how much before.
 */
static void free_pgtables(struct mm_struct * mm, struct vm_area_struct *prev,
	unsigned long start, unsigned long end)
{
	unsigned long first = start & PGDIR_MASK;
	unsigned long last = end + PGDIR_SIZE - 1;
	unsigned long start_index, end_index;

	if (!prev) {
		prev = mm->mmap;
		if (!prev)
			goto no_mmaps;
		if (prev->vm_end > start) {
			if (last > prev->vm_start)
				last = prev->vm_start;
			goto no_mmaps;
		}
	}
	for (;;) {
		struct vm_area_struct *next = prev->vm_next;

		if (next) {
			if (next->vm_start < start) {
				prev = next;
				continue;
			}
			if (last > next->vm_start)
				last = next->vm_start;
		}
		if (prev->vm_end > first)
			first = prev->vm_end + PGDIR_SIZE - 1;
		break;
	}
no_mmaps:
	/*
	 * If the PGD bits are not consecutive in the virtual address, the
	 * old method of shifting the VA >> by PGDIR_SHIFT doesn't work.
	 */
	start_index = pgd_index(first);
	end_index = pgd_index(last);
	if (end_index > start_index) {
		clear_page_tables(mm, start_index, end_index - start_index);
		flush_tlb_pgtables(mm, first & PGDIR_MASK, last & PGDIR_MASK);
	}
}

/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardine <jeremy@sw.oz.au>
 */
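/* Callers must hold mm->mmap_sem for writing: sys_munmap() below and
 * sys_brk() above both take it before calling in here.
 */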
int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
{
	struct vm_area_struct *mpnt, *prev, **npp, *free, *extra;

	if ((addr & ~PAGE_MASK) || addr > TASK_SIZE || len > TASK_SIZE-addr)
		return -EINVAL;

	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/* Check if this memory area is ok - put it on the temporary
	 * list if so..  The checks here are pretty simple --
	 * every area affected in some way (by any overlap) is put
	 * on the list.  If nothing is put on, nothing is affected.
	 */
	mpnt = find_vma_prev(mm, addr, &prev);
	if (!mpnt)
		return 0;
	/* we have  addr < mpnt->vm_end  */

	if (mpnt->vm_start >= addr+len)
		return 0;

	/* If we'll make a "hole", check the vm area count limit */
	if ((mpnt->vm_start < addr && mpnt->vm_end > addr+len)
	    && mm->map_count >= max_map_count)
		return -ENOMEM;

	/*
	 * We may need one additional vma to fix up the mappings ...
	 * and this is the last chance for an easy error exit.
	 */
	extra = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!extra)
		return -ENOMEM;

	npp = (prev ? &prev->vm_next : &mm->mmap);
	free = NULL;
	spin_lock(&mm->page_table_lock);
	for ( ; mpnt && mpnt->vm_start < addr+len; mpnt = *npp) {
		*npp = mpnt->vm_next;
		mpnt->vm_next = free;
		free = mpnt;
		rb_erase(&mpnt->vm_rb, &mm->mm_rb);
	}
	mm->mmap_cache = NULL;	/* Kill the cache. */
	spin_unlock(&mm->page_table_lock);

	/* Ok - we have the memory areas we should free on the 'free' list,
	 * so release them, and unmap the page range..
	 * If one of the segments is only being partially unmapped,
	 * it will put new vm_area_struct(s) into the address space.
	 * In that case we have to be careful with VM_DENYWRITE.
	 */
	while ((mpnt = free) != NULL) {
		unsigned long st, end, size;
		struct file *file = NULL;

		free = free->vm_next;

		st = addr < mpnt->vm_start ? mpnt->vm_start : addr;
		end = addr+len;
		end = end > mpnt->vm_end ? mpnt->vm_end : end;
		size = end - st;

		if (mpnt->vm_flags & VM_DENYWRITE &&
		    (st != mpnt->vm_start || end != mpnt->vm_end) &&
		    (file = mpnt->vm_file) != NULL) {
			atomic_dec(&file->f_dentry->d_inode->i_writecount);
		}
		remove_shared_vm_struct(mpnt);
		mm->map_count--;

		zap_page_range(mm, st, size);

		/*
		 * Fix the mapping, and free the old area if it wasn't reused.
		 */
		extra = unmap_fixup(mm, mpnt, st, size, extra);
		if (file)
			atomic_inc(&file->f_dentry->d_inode->i_writecount);
	}
	validate_mm(mm);

	/* Release the extra vma struct if it wasn't used */
	if (extra)
		kmem_cache_free(vm_area_cachep, extra);

	free_pgtables(mm, prev, addr, addr+len);

	return 0;
}

asmlinkage long sys_munmap(unsigned long addr, size_t len)
{
	int ret;
	struct mm_struct *mm = current->mm;

	down_write(&mm->mmap_sem);
	ret = do_munmap(mm, addr, len);
	up_write(&mm->mmap_sem);
	return ret;
}

/*
 *  This is really a simplified "do_mmap".  It only handles
 *  anonymous maps.  Eventually we may be able to do some
 *  brk-specific accounting here.
 */
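/* Like do_munmap(), this expects the caller to hold mm->mmap_sem for
 * writing (sys_brk() above takes it).  Returns the start address on
 * success, otherwise an error value (-EAGAIN or -ENOMEM).
 */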
unsigned long do_brk(unsigned long addr, unsigned long len)
{
	struct mm_struct * mm = current->mm;
	struct vm_area_struct * vma, * prev;
	unsigned long flags;
	rb_node_t ** rb_link, * rb_parent;

	len = PAGE_ALIGN(len);
	if (!len)
		return addr;

	/*
	 * mlock MCL_FUTURE?
	 */
	if (mm->def_flags & VM_LOCKED) {
		unsigned long locked = mm->locked_vm << PAGE_SHIFT;
		locked += len;
		if (locked > current->rlim[RLIMIT_MEMLOCK].rlim_cur)
			return -EAGAIN;
	}

	/*
	 * Clear old maps.  this also does some error checking for us
	 */
 munmap_back:
	vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
	if (vma && vma->vm_start < addr + len) {
		if (do_munmap(mm, addr, len))
			return -ENOMEM;
		goto munmap_back;
	}

	/* Check against address space limits *after* clearing old maps... */
	if ((mm->total_vm << PAGE_SHIFT) + len
	    > current->rlim[RLIMIT_AS].rlim_cur)
		return -ENOMEM;

	if (mm->map_count > max_map_count)
		return -ENOMEM;

	if (!vm_enough_memory(len >> PAGE_SHIFT))
		return -ENOMEM;

	flags = VM_DATA_DEFAULT_FLAGS | mm->def_flags;

	/* Can we just expand an old anonymous mapping? */
	if (rb_parent && vma_merge(mm, prev, rb_parent, addr, addr + len, flags))
		goto out;

	/*
	 * create a vma struct for an anonymous mapping
	 */
	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
	if (!vma)
		return -ENOMEM;

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = flags;
	vma->vm_page_prot = protection_map[flags & 0x0f];
	vma->vm_ops = NULL;
	vma->vm_pgoff = 0;
	vma->vm_file = NULL;
	vma->vm_private_data = NULL;

	vma_link(mm, vma, prev, rb_link, rb_parent);

out:
	mm->total_vm += len >> PAGE_SHIFT;
	if (flags & VM_LOCKED) {
		mm->locked_vm += len >> PAGE_SHIFT;
		make_pages_present(addr, addr + len);
	}
	return addr;
}

/* Build the RB tree corresponding to the VMA list. */
void build_mmap_rb(struct mm_struct * mm)
{
	struct vm_area_struct * vma;
	rb_node_t ** rb_link, * rb_parent;

	mm->mm_rb = RB_ROOT;
	rb_link = &mm->mm_rb.rb_node;
	rb_parent = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		__vma_link_rb(mm, vma, rb_link, rb_parent);
		rb_parent = &vma->vm_rb;
		rb_link = &rb_parent->rb_right;
	}
}

/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
{
	struct vm_area_struct * mpnt;

	release_segments(mm);
	spin_lock(&mm->page_table_lock);
	mpnt = mm->mmap;
	mm->mmap = mm->mmap_cache = NULL;
	mm->mm_rb = RB_ROOT;
	mm->rss = 0;
	spin_unlock(&mm->page_table_lock);
	mm->total_vm = 0;
	mm->locked_vm = 0;

	flush_cache_mm(mm);
	while (mpnt) {
		struct vm_area_struct * next = mpnt->vm_next;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long size = end - start;

		if (mpnt->vm_ops) {
			if (mpnt->vm_ops->close)
				mpnt->vm_ops->close(mpnt);
		}
		mm->map_count--;
		remove_shared_vm_struct(mpnt);
		zap_page_range(mm, start, size);
		if (mpnt->vm_file)
			fput(mpnt->vm_file);
		kmem_cache_free(vm_area_cachep, mpnt);
		mpnt = next;
	}

	/* This is just debugging */
	if (mm->map_count)
		BUG();

	clear_page_tables(mm, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);

	flush_tlb_mm(mm);
}

/* Insert vm structure into process list sorted by address
 * and into the inode's i_mmap ring.  If vm_file is non-NULL
 * then the i_shared_lock must be held here.
 */
void __insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	rb_node_t ** rb_link, * rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	__vma_link(mm, vma, prev, rb_link, rb_parent);
	mm->map_count++;
	validate_mm(mm);
}

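/* Locking variant of __insert_vm_struct(): vma_link() takes the
 * i_shared_lock and page_table_lock itself (and bumps map_count), so
 * this can be called without the mapping locks already held.
 */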
void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
{
	struct vm_area_struct * __vma, * prev;
	rb_node_t ** rb_link, * rb_parent;

	__vma = find_vma_prepare(mm, vma->vm_start, &prev, &rb_link, &rb_parent);
	if (__vma && __vma->vm_start < vma->vm_end)
		BUG();
	vma_link(mm, vma, prev, rb_link, rb_parent);
	validate_mm(mm);
}
