/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *		Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 * 		Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *		(Gerhard.Wichert@pdb.siemens.de)
 */

#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/swapctl.h>
#include <linux/iobuf.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/module.h>

#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>

unsigned long max_mapnr;
unsigned long num_physpages;
unsigned long num_mappedpages;
void * high_memory;
struct page *highmem_start_page;

/*
 * We special-case the C-O-W ZERO_PAGE, because it's such
 * a common occurrence (no need to read the page to know
 * that it's zero - better for the cache and memory subsystem).
 */
static inline void copy_cow_page(struct page * from, struct page * to, unsigned long address)
{
	if (from == ZERO_PAGE(address)) {
		clear_user_highpage(to, address);
		return;
	}
	copy_user_highpage(to, from, address);
}

mem_map_t * mem_map;

/*
 * Called by TLB shootdown
 */
void __free_pte(pte_t pte)
{
	struct page *page = pte_page(pte);
	if ((!VALID_PAGE(page)) || PageReserved(page))
		return;
	if (pte_dirty(pte))
		set_page_dirty(page);
	free_page_and_swap_cache(page);
}


/*
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
static inline void free_one_pmd(pmd_t * dir)
{
	pte_t * pte;

	if (pmd_none(*dir))
		return;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return;
	}
	pte = pte_offset(dir, 0);
	pmd_clear(dir);
	pte_free(pte);
}

static inline void free_one_pgd(pgd_t * dir)
{
	int j;
	pmd_t * pmd;

	if (pgd_none(*dir))
		return;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return;
	}
	pmd = pmd_offset(dir, 0);
	pgd_clear(dir);
	for (j = 0; j < PTRS_PER_PMD ; j++) {
		prefetchw(pmd+j+(PREFETCH_STRIDE/16));
		free_one_pmd(pmd+j);
	}
	pmd_free(pmd);
}

/* Low and high watermarks for the page table cache.
   The system should try to keep pgt_cache_water[0] <= cache elements <= pgt_cache_water[1].
 */
int pgt_cache_water[2] = { 25, 50 };

/* Returns the number of pages freed */
int check_pgt_cache(void)
{
	return do_check_pgt_cache(pgt_cache_water[0], pgt_cache_water[1]);
}


/*
 * This function clears all user-level page tables of a process - this
 * is needed by execve(), so that old pages aren't in the way.
 */
void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr)
{
	pgd_t * page_dir = mm->pgd;

	spin_lock(&mm->page_table_lock);
	page_dir += first;
	do {
		free_one_pgd(page_dir);
		page_dir++;
	} while (--nr);
	spin_unlock(&mm->page_table_lock);

	/* keep the page table cache within bounds */
	check_pgt_cache();
}

#define PTE_TABLE_MASK	((PTRS_PER_PTE-1) * sizeof(pte_t))
#define PMD_TABLE_MASK	((PTRS_PER_PMD-1) * sizeof(pmd_t))

/*
 * copy one vm_area from one task to the other. Assumes that the page
 * tables already present in the new task have been cleared in the whole
 * range covered by this vma.
 *
 * 08Jan98 Merged into one routine from several inline routines to reduce
 *         variable count and make things faster. -jj
 *
 * dst->page_table_lock is held on entry and exit,
 * but may be dropped within pmd_alloc() and pte_alloc().
 */
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
			struct vm_area_struct *vma)
{
	pgd_t * src_pgd, * dst_pgd;
	unsigned long address = vma->vm_start;
	unsigned long end = vma->vm_end;
	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
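	/*
	 * A mapping is copy-on-write when it may be written (VM_MAYWRITE)
	 * but is not shared (VM_SHARED clear): such private writable pages
	 * are write-protected in both parent and child further down.
	 */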

	src_pgd = pgd_offset(src, address)-1;
	dst_pgd = pgd_offset(dst, address)-1;

	for (;;) {
		pmd_t * src_pmd, * dst_pmd;

		src_pgd++; dst_pgd++;

		/* copy_pmd_range */

		if (pgd_none(*src_pgd))
			goto skip_copy_pmd_range;
		if (pgd_bad(*src_pgd)) {
			pgd_ERROR(*src_pgd);
			pgd_clear(src_pgd);
skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
			if (!address || (address >= end))
				goto out;
			continue;
		}

		src_pmd = pmd_offset(src_pgd, address);
		dst_pmd = pmd_alloc(dst, dst_pgd, address);
		if (!dst_pmd)
			goto nomem;

		do {
			pte_t * src_pte, * dst_pte;

			/* copy_pte_range */

			if (pmd_none(*src_pmd))
				goto skip_copy_pte_range;
			if (pmd_bad(*src_pmd)) {
				pmd_ERROR(*src_pmd);
				pmd_clear(src_pmd);
skip_copy_pte_range:		address = (address + PMD_SIZE) & PMD_MASK;
				if (address >= end)
					goto out;
				goto cont_copy_pmd_range;
			}

			src_pte = pte_offset(src_pmd, address);
			dst_pte = pte_alloc(dst, dst_pmd, address);
			if (!dst_pte)
				goto nomem;

			spin_lock(&src->page_table_lock);
			do {
				pte_t pte = *src_pte;
				struct page *ptepage;

				/* copy_one_pte */

				if (pte_none(pte))
					goto cont_copy_pte_range_noset;
				if (!pte_present(pte)) {
					swap_duplicate(pte_to_swp_entry(pte));
					goto cont_copy_pte_range;
				}
				ptepage = pte_page(pte);
				if ((!VALID_PAGE(ptepage)) ||
				    PageReserved(ptepage))
					goto cont_copy_pte_range;

				/* If it's a COW mapping, write protect it both in the parent and the child */
				if (cow && pte_write(pte)) {
					ptep_set_wrprotect(src_pte);
					pte = *src_pte;
				}

				/* If it's a shared mapping, mark it clean in the child */
				if (vma->vm_flags & VM_SHARED)
					pte = pte_mkclean(pte);
				pte = pte_mkold(pte);
				get_page(ptepage);
				dst->rss++;

cont_copy_pte_range:		set_pte(dst_pte, pte);
cont_copy_pte_range_noset:	address += PAGE_SIZE;
				if (address >= end)
					goto out_unlock;
				src_pte++;
				dst_pte++;
			} while ((unsigned long)src_pte & PTE_TABLE_MASK);
			spin_unlock(&src->page_table_lock);

cont_copy_pmd_range:	src_pmd++;
			dst_pmd++;
		} while ((unsigned long)src_pmd & PMD_TABLE_MASK);
	}
out_unlock:
	spin_unlock(&src->page_table_lock);
out:
	return 0;
nomem:
	return -ENOMEM;
}
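
/*
 * copy_page_range() is invoked at fork() time (from dup_mmap()) once
 * for each vma that the child inherits from the parent.
 */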

/*
 * forget_pte() checks that no old mapping existed at a pte that we
 * are about to overwrite.
 */
static inline void forget_pte(pte_t page)
{
	if (!pte_none(page)) {
		printk("forget_pte: old mapping existed!\n");
		BUG();
	}
}

/*
 * The return value is the number of pages freed, so that the caller
 * can adjust rss.
 */
static inline int zap_pte_range(mmu_gather_t *tlb, pmd_t * pmd, unsigned long address, unsigned long size)
{
	unsigned long offset;
	pte_t * ptep;
	int freed = 0;

	if (pmd_none(*pmd))
		return 0;
	if (pmd_bad(*pmd)) {
		pmd_ERROR(*pmd);
		pmd_clear(pmd);
		return 0;
	}
	ptep = pte_offset(pmd, address);
	offset = address & ~PMD_MASK;
	if (offset + size > PMD_SIZE)
		size = PMD_SIZE - offset;
	size &= PAGE_MASK;
	for (offset=0; offset < size; ptep++, offset += PAGE_SIZE) {
		pte_t pte = *ptep;
		if (pte_none(pte))
			continue;
		if (pte_present(pte)) {
			struct page *page = pte_page(pte);
			if (VALID_PAGE(page) && !PageReserved(page))
				freed ++;
			/* This will eventually call __free_pte on the pte. */
			tlb_remove_page(tlb, ptep, address + offset);
		} else {
			free_swap_and_cache(pte_to_swp_entry(pte));
			pte_clear(ptep);
		}
	}

	return freed;
}

static inline int zap_pmd_range(mmu_gather_t *tlb, pgd_t * dir, unsigned long address, unsigned long size)
{
	pmd_t * pmd;
	unsigned long end;
	int freed;

	if (pgd_none(*dir))
		return 0;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return 0;
	}
	pmd = pmd_offset(dir, address);
	end = address + size;
	if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
		end = ((address + PGDIR_SIZE) & PGDIR_MASK);
	freed = 0;
	do {
		freed += zap_pte_range(tlb, pmd, address, end - address);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address < end);
	return freed;
}

/*
 * remove user pages in a given range.
 */
void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size)
{
	mmu_gather_t *tlb;
	pgd_t * dir;
	unsigned long start = address, end = address + size;
	int freed = 0;

	dir = pgd_offset(mm, address);

	/*
	 * This is a long-lived spinlock. That's fine.
	 * There's no contention, because the page table
	 * lock only protects against kswapd anyway, and
	 * even if kswapd happened to be looking at this
	 * process we _want_ it to get stuck.
	 */
	if (address >= end)
		BUG();
	spin_lock(&mm->page_table_lock);
	flush_cache_range(mm, address, end);
	tlb = tlb_gather_mmu(mm);

	do {
		freed += zap_pmd_range(tlb, dir, address, end - address);
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (address && (address < end));

	/* this will flush any remaining tlb entries */
	tlb_finish_mmu(tlb, start, end);

	/*
	 * Update rss for the mm_struct (not necessarily current->mm)
	 * Notice that rss is an unsigned long.
	 */
	if (mm->rss > freed)
		mm->rss -= freed;
	else
		mm->rss = 0;
	spin_unlock(&mm->page_table_lock);
}
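
/*
 * zap_page_range() is the workhorse for tearing down a linear range of
 * user mappings: vmtruncate_list() below uses it when a mapped file is
 * truncated, and the munmap()/exit paths use it to drop whole vmas.
 */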

/*
 * Do a quick page-table lookup for a single page.
 */
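/*
 * For a write lookup the pte must be both writable and dirty: if it is
 * not, the lookup fails and the caller (get_user_pages) falls back to
 * handle_mm_fault(), which breaks COW and marks the page dirty for us.
 */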
static struct page * follow_page(struct mm_struct *mm, unsigned long address, int write)
{
	pgd_t *pgd;
	pmd_t *pmd;
	pte_t *ptep, pte;

	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || pgd_bad(*pgd))
		goto out;

	pmd = pmd_offset(pgd, address);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		goto out;

	ptep = pte_offset(pmd, address);
	if (!ptep)
		goto out;

	pte = *ptep;
	if (pte_present(pte)) {
		if (!write ||
		    (pte_write(pte) && pte_dirty(pte)))
			return pte_page(pte);
	}

out:
	return 0;
}

/*
 * Given a physical address, is there a useful struct page pointing to
 * it?  This may become more complex in the future if we start dealing
 * with IO-aperture pages in kiobufs.
 */

static inline struct page * get_page_map(struct page *page)
{
	if (!VALID_PAGE(page))
		return 0;
	return page;
}

/*
 * Please read Documentation/cachetlb.txt before using this function:
 * accessing foreign memory spaces can cause cache coherency problems.
 *
 * Accessing a VM_IO area is even more dangerous, therefore the function
 * fails if pages != NULL and a VM_IO area is found.
 */
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas)
{
	int i;
	unsigned int flags;

	/*
	 * Require read or write permissions.
	 * If 'force' is set, we only require the "MAY" flags.
	 */
	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	do {
		struct vm_area_struct *	vma;

		vma = find_extend_vma(mm, start);

		if ( !vma || (pages && vma->vm_flags & VM_IO) || !(flags & vma->vm_flags) )
			return i ? : -EFAULT;

		spin_lock(&mm->page_table_lock);
		do {
			struct page *map;
			while (!(map = follow_page(mm, start, write))) {
				spin_unlock(&mm->page_table_lock);
				switch (handle_mm_fault(mm, vma, start, write)) {
				case 1:
					tsk->min_flt++;
					break;
				case 2:
					tsk->maj_flt++;
					break;
				case 0:
					if (i) return i;
					return -EFAULT;
				default:
					if (i) return i;
					return -ENOMEM;
				}
				spin_lock(&mm->page_table_lock);
			}
			if (pages) {
				pages[i] = get_page_map(map);
				if (!pages[i])
					goto bad_page;
				page_cache_get(pages[i]);
			}
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			len--;
		} while(len && start < vma->vm_end);
		spin_unlock(&mm->page_table_lock);
	} while(len);
out:
	return i;

	/*
	 * We found an invalid page in the VMA.  Release all we have
	 * so far and fail.
	 */
bad_page:
	spin_unlock(&mm->page_table_lock);
	while (i--)
		page_cache_release(pages[i]);
	i = -EFAULT;
	goto out;
}

EXPORT_SYMBOL(get_user_pages);
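
/*
 * Sketch of a typical get_user_pages() call (illustrative only - the
 * variable names below are not part of this file).  The caller must
 * hold mm->mmap_sem and must drop the page references when done:
 *
 *	down_read(&current->mm->mmap_sem);
 *	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
 *			     nr_pages, 1, 0, pages, NULL);
 *	up_read(&current->mm->mmap_sem);
 *	if (ret > 0) {
 *		... do the I/O against pages[] ...
 *		while (ret--)
 *			page_cache_release(pages[ret]);
 *	}
 */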

/*
 * Force in an entire range of pages from the current process's user VA,
 * and pin them in physical memory.
 */
#define dprintk(x...)

int map_user_kiobuf(int rw, struct kiobuf *iobuf, unsigned long va, size_t len)
{
	int pgcount, err;
	struct mm_struct *	mm;

	/* Make sure the iobuf is not already mapped somewhere. */
	if (iobuf->nr_pages)
		return -EINVAL;

	mm = current->mm;
	dprintk ("map_user_kiobuf: begin\n");

	pgcount = (va + len + PAGE_SIZE - 1)/PAGE_SIZE - va/PAGE_SIZE;
	/* mapping 0 bytes is not permitted */
	if (!pgcount) BUG();
	err = expand_kiobuf(iobuf, pgcount);
	if (err)
		return err;

	iobuf->locked = 0;
	iobuf->offset = va & (PAGE_SIZE-1);
	iobuf->length = len;

	/* Try to fault in all of the necessary pages */
	down_read(&mm->mmap_sem);
	/* rw==READ means read from disk, write into memory area */
	err = get_user_pages(current, mm, va, pgcount,
			(rw==READ), 0, iobuf->maplist, NULL);
	up_read(&mm->mmap_sem);
	if (err < 0) {
		unmap_kiobuf(iobuf);
		dprintk ("map_user_kiobuf: end %d\n", err);
		return err;
	}
	iobuf->nr_pages = err;
	while (pgcount--) {
		flush_dcache_page(iobuf->maplist[pgcount]);
	}
	dprintk ("map_user_kiobuf: end OK\n");
	return 0;
}
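
/*
 * The usual kiobuf sequence, as used by the raw/direct I/O paths, is
 * (sketch only):
 *
 *	map_user_kiobuf(READ, iobuf, va, len);	pin the user pages
 *	... submit I/O against iobuf->maplist ...
 *	mark_dirty_kiobuf(iobuf, bytes);	for reads into memory
 *	unmap_kiobuf(iobuf);			drop the page references
 */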

/*
 * Mark all of the pages in a kiobuf as dirty
 *
 * We need to be able to deal with short reads from disk: if an IO error
 * occurs, the number of bytes read into memory may be less than the
 * size of the kiobuf, so we have to stop marking pages dirty once the
 * requested byte count has been reached.
 *
 * Must be called from process context - set_page_dirty() takes VFS locks.
 */

void mark_dirty_kiobuf(struct kiobuf *iobuf, int bytes)
{
	int index, offset, remaining;
	struct page *page;

	index = iobuf->offset >> PAGE_SHIFT;
	offset = iobuf->offset & ~PAGE_MASK;
	remaining = bytes;
	if (remaining > iobuf->length)
		remaining = iobuf->length;

	while (remaining > 0 && index < iobuf->nr_pages) {
		page = iobuf->maplist[index];

		if (!PageReserved(page))
			set_page_dirty(page);

		remaining -= (PAGE_SIZE - offset);
		offset = 0;
		index++;
	}
}

/*
 * Unmap all of the pages referenced by a kiobuf.  We release the pages,
 * and unlock them if they were locked.
 */

void unmap_kiobuf (struct kiobuf *iobuf)
{
	int i;
	struct page *map;

	for (i = 0; i < iobuf->nr_pages; i++) {
		map = iobuf->maplist[i];
		if (map) {
			if (iobuf->locked)
				UnlockPage(map);
			page_cache_release(map);
		}
	}

	iobuf->nr_pages = 0;
	iobuf->locked = 0;
}


/*
 * Lock down all of the pages of a kiovec for IO.
 *
 * If any page is mapped twice in the kiovec, we return the error -EINVAL.
 *
 * The optional wait parameter causes the lock call to block until all
 * pages can be locked if set.  If wait==0, the lock operation is
 * aborted if any locked pages are found and -EAGAIN is returned.
 */

int lock_kiovec(int nr, struct kiobuf *iovec[], int wait)
{
	struct kiobuf *iobuf;
	int i, j;
	struct page *page, **ppage;
	int doublepage = 0;
	int repeat = 0;

 repeat:

	for (i = 0; i < nr; i++) {
		iobuf = iovec[i];

		if (iobuf->locked)
			continue;

		ppage = iobuf->maplist;
		for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
			page = *ppage;
			if (!page)
				continue;

			if (TryLockPage(page)) {
				while (j--) {
					struct page *tmp = *--ppage;
					if (tmp)
						UnlockPage(tmp);
				}
				goto retry;
			}
		}
		iobuf->locked = 1;
	}

	return 0;

 retry:

	/*
	 * We couldn't lock one of the pages.  Undo the locking so far,
	 * wait on the page we got to, and try again.
	 */

	unlock_kiovec(nr, iovec);
	if (!wait)
		return -EAGAIN;

	/*
	 * Did the release also unlock the page we got stuck on?
	 */
	if (!PageLocked(page)) {
		/*
		 * If so, we may well have the page mapped twice
		 * in the IO address range.  Bad news.  Of
		 * course, it _might_ just be a coincidence,
		 * but if it happens more than once, chances
		 * are we have a double-mapped page.
		 */
		if (++doublepage >= 3)
			return -EINVAL;

		/* Try again...  */
		wait_on_page(page);
	}

	if (++repeat < 16)
		goto repeat;
	return -EAGAIN;
}

/*
 * Unlock all of the pages of a kiovec after IO.
 */

int unlock_kiovec(int nr, struct kiobuf *iovec[])
{
	struct kiobuf *iobuf;
	int i, j;
	struct page *page, **ppage;

	for (i = 0; i < nr; i++) {
		iobuf = iovec[i];

		if (!iobuf->locked)
			continue;
		iobuf->locked = 0;

		ppage = iobuf->maplist;
		for (j = 0; j < iobuf->nr_pages; ppage++, j++) {
			page = *ppage;
			if (!page)
				continue;
			UnlockPage(page);
		}
	}
	return 0;
}

static inline void zeromap_pte_range(pte_t * pte, unsigned long address,
                                     unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot));
		pte_t oldpage = ptep_get_and_clear(pte);
		set_pte(pte, zero_pte);
		forget_pte(oldpage);
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
}

static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address,
                                    unsigned long size, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	do {
		pte_t * pte = pte_alloc(mm, pmd, address);
		if (!pte)
			return -ENOMEM;
		zeromap_pte_range(pte, address, end - address, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

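/*
 * Map the ZERO_PAGE read-only over [address, address+size).  A later
 * write to the range faults into do_wp_page(), which gives the writer
 * a private zeroed page via the copy_cow_page() special case above.
 */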
int zeromap_page_range(unsigned long address, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = address;
	unsigned long end = address + size;
	struct mm_struct *mm = current->mm;

	dir = pgd_offset(mm, address);
	flush_cache_range(mm, beg, end);
	if (address >= end)
		BUG();

	spin_lock(&mm->page_table_lock);
	do {
		pmd_t *pmd = pmd_alloc(mm, dir, address);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = zeromap_pmd_range(mm, pmd, address, end - address, prot);
		if (error)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (address && (address < end));
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(mm, beg, end);
	return error;
}

/*
 * Maps a range of physical memory into the requested pages. The old
 * mappings are removed. Any references to nonexistent pages result
 * in null mappings (currently treated as "copy-on-access").
 */
static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PMD_MASK;
	end = address + size;
	if (end > PMD_SIZE)
		end = PMD_SIZE;
	do {
		struct page *page;
		pte_t oldpage;
		oldpage = ptep_get_and_clear(pte);

		page = virt_to_page(__va(phys_addr));
		if ((!VALID_PAGE(page)) || PageReserved(page))
			set_pte(pte, mk_pte_phys(phys_addr, prot));
		forget_pte(oldpage);
		address += PAGE_SIZE;
		phys_addr += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
}

static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size,
	unsigned long phys_addr, pgprot_t prot)
{
	unsigned long end;

	address &= ~PGDIR_MASK;
	end = address + size;
	if (end > PGDIR_SIZE)
		end = PGDIR_SIZE;
	phys_addr -= address;
	do {
		pte_t * pte = pte_alloc(mm, pmd, address);
		if (!pte)
			return -ENOMEM;
		remap_pte_range(pte, address, end - address, address + phys_addr, prot);
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return 0;
}

/*  Note: this is only safe if the mm semaphore is held when called. */
int remap_page_range(unsigned long from, unsigned long phys_addr, unsigned long size, pgprot_t prot)
{
	int error = 0;
	pgd_t * dir;
	unsigned long beg = from;
	unsigned long end = from + size;
	struct mm_struct *mm = current->mm;

	phys_addr -= from;
	dir = pgd_offset(mm, from);
	flush_cache_range(mm, beg, end);
	if (from >= end)
		BUG();

	spin_lock(&mm->page_table_lock);
	do {
		pmd_t *pmd = pmd_alloc(mm, dir, from);
		error = -ENOMEM;
		if (!pmd)
			break;
		error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot);
		if (error)
			break;
		from = (from + PGDIR_SIZE) & PGDIR_MASK;
		dir++;
	} while (from && (from < end));
	spin_unlock(&mm->page_table_lock);
	flush_tlb_range(mm, beg, end);
	return error;
}
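
/*
 * Illustrative driver usage (not part of this file): a character
 * driver's mmap() method typically just remaps its device memory,
 *
 *	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		unsigned long size = vma->vm_end - vma->vm_start;
 *
 *		if (remap_page_range(vma->vm_start, MYDRV_PHYS_BASE,
 *				     size, vma->vm_page_prot))
 *			return -EAGAIN;
 *		return 0;
 *	}
 *
 * mydrv_mmap and MYDRV_PHYS_BASE are placeholders for illustration.
 */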

/*
 * Establish a new mapping:
 *  - flush the old one
 *  - update the page tables
 *  - inform the TLB about the new one
 *
 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
 */
static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry)
{
	set_pte(page_table, entry);
	flush_tlb_page(vma, address);
	update_mmu_cache(vma, address, entry);
}

/*
 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
 */
static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
		pte_t *page_table)
{
	flush_page_to_ram(new_page);
	flush_cache_page(vma, address);
	establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
	flush_icache_page(vma, new_page);
}

/*
 * This routine handles present pages, when users try to write
 * to a shared page. It is done by copying the page to a new address
 * and decrementing the shared-page counter for the old page.
 *
 * Goto-purists beware: the only reason for gotos here is that they result
 * in better assembly code. The "default" path will see no jumps at all.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus we can safely just mark it writable once we've done any necessary
 * COW.
 *
 * We also mark the page dirty at this point even though the page will
 * change only once the write actually happens. This avoids a few races,
 * and potentially makes it more efficient.
 *
 * We hold the mm semaphore and the page_table_lock on entry; we exit
 * with the page_table_lock released.
 */
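/*
 * Return value: 1 for a minor fault that was handled, -1 if the pte
 * was bogus or no memory could be allocated for the copy.
 */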
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, pte_t *page_table, pte_t pte)
{
	struct page *old_page, *new_page;

	old_page = pte_page(pte);
	if (!VALID_PAGE(old_page))
		goto bad_wp_page;

	if (!TryLockPage(old_page)) {
		int reuse = can_share_swap_page(old_page);
		unlock_page(old_page);
		if (reuse) {
			flush_cache_page(vma, address);
			establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte))));
			spin_unlock(&mm->page_table_lock);
			return 1;	/* Minor fault */
		}
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	page_cache_get(old_page);
	spin_unlock(&mm->page_table_lock);

	new_page = alloc_page(GFP_HIGHUSER);
	if (!new_page)
		goto no_mem;
	copy_cow_page(old_page,new_page,address);

	/*
	 * Re-check the pte - we dropped the lock
	 */
	spin_lock(&mm->page_table_lock);
	if (pte_same(*page_table, pte)) {
		if (PageReserved(old_page))
			++mm->rss;
		break_cow(vma, new_page, address, page_table);
		lru_cache_add(new_page);

		/* Free the old page.. */
		new_page = old_page;
	}
	spin_unlock(&mm->page_table_lock);
	page_cache_release(new_page);
	page_cache_release(old_page);
	return 1;	/* Minor fault */

bad_wp_page:
	spin_unlock(&mm->page_table_lock);
	printk("do_wp_page: bogus page at address %08lx (page 0x%lx)\n",address,(unsigned long)old_page);
	return -1;
no_mem:
	page_cache_release(old_page);
	return -1;
}

static void vmtruncate_list(struct vm_area_struct *mpnt, unsigned long pgoff)
{
	do {
		struct mm_struct *mm = mpnt->vm_mm;
		unsigned long start = mpnt->vm_start;
		unsigned long end = mpnt->vm_end;
		unsigned long len = end - start;
		unsigned long diff;

		/* mapping wholly truncated? */
		if (mpnt->vm_pgoff >= pgoff) {
			zap_page_range(mm, start, len);
			continue;
		}

		/* mapping wholly unaffected? */
		len = len >> PAGE_SHIFT;
		diff = pgoff - mpnt->vm_pgoff;
		if (diff >= len)
			continue;

		/* Ok, partially affected.. */
		start += diff << PAGE_SHIFT;
		len = (len - diff) << PAGE_SHIFT;
		zap_page_range(mm, start, len);
	} while ((mpnt = mpnt->vm_next_share) != NULL);
}

/*
 * Handle all mappings that got truncated by a "truncate()"
 * system call.
 *
 * NOTE! We have to be ready to update the memory sharing
 * between the file and the memory map for a potential last
 * incomplete page.  Ugly, but necessary.
 */
int vmtruncate(struct inode * inode, loff_t offset)
{
	unsigned long pgoff;
	struct address_space *mapping = inode->i_mapping;
	unsigned long limit;

	if (inode->i_size < offset)
		goto do_expand;
	inode->i_size = offset;
	spin_lock(&mapping->i_shared_lock);
	if (!mapping->i_mmap && !mapping->i_mmap_shared)
		goto out_unlock;

	pgoff = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (mapping->i_mmap != NULL)
		vmtruncate_list(mapping->i_mmap, pgoff);
	if (mapping->i_mmap_shared != NULL)
		vmtruncate_list(mapping->i_mmap_shared, pgoff);

out_unlock:
	spin_unlock(&mapping->i_shared_lock);
	truncate_inode_pages(mapping, offset);
	goto out_truncate;

do_expand:
	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
	if (limit != RLIM_INFINITY && offset > limit)
		goto out_sig;
	if (offset > inode->i_sb->s_maxbytes)
		goto out;
	inode->i_size = offset;

out_truncate:
	if (inode->i_op && inode->i_op->truncate) {
		lock_kernel();
		inode->i_op->truncate(inode);
		unlock_kernel();
	}
	return 0;
out_sig:
	send_sig(SIGXFSZ, current, 0);
out:
	return -EFBIG;
}

/*
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 */
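/*
 * Example: with page_cluster == 4 the readahead window is an aligned
 * block of 1 << 4 == 16 swap entries, clipped by valid_swaphandles()
 * to the entries that are actually in use.
 */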
void swapin_readahead(swp_entry_t entry)
{
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		/* Ok, do the async read-ahead now */
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
		if (!new_page)
			break;
		page_cache_release(new_page);
	}
	return;
}

/*
 * We hold the mm semaphore and the page_table_lock on entry and
 * should release the pagetable lock on exit..
 */
static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, pte_t orig_pte, int write_access)
{
	struct page *page;
	swp_entry_t entry = pte_to_swp_entry(orig_pte);
	pte_t pte;
	int ret = 1;

	spin_unlock(&mm->page_table_lock);
	page = lookup_swap_cache(entry);
	if (!page) {
		swapin_readahead(entry);
		page = read_swap_cache_async(entry);
		if (!page) {
			/*
			 * Back out if somebody else faulted in this pte while
			 * we released the page table lock.
			 */
			int retval;
			spin_lock(&mm->page_table_lock);
			retval = pte_same(*page_table, orig_pte) ? -1 : 1;
			spin_unlock(&mm->page_table_lock);
			return retval;
		}

		/* Had to read the page from swap area: Major fault */
		ret = 2;
	}

	mark_page_accessed(page);

	lock_page(page);

	/*
	 * Back out if somebody else faulted in this pte while we
	 * released the page table lock.
	 */
	spin_lock(&mm->page_table_lock);
	if (!pte_same(*page_table, orig_pte)) {
		spin_unlock(&mm->page_table_lock);
		unlock_page(page);
		page_cache_release(page);
		return 1;
	}

	/* The page isn't present yet, go ahead with the fault. */

	swap_free(entry);
	if (vm_swap_full())
		remove_exclusive_swap_page(page);

	mm->rss++;
	pte = mk_pte(page, vma->vm_page_prot);
	if (write_access && can_share_swap_page(page))
		pte = pte_mkdirty(pte_mkwrite(pte));
	unlock_page(page);

	flush_page_to_ram(page);
	flush_icache_page(vma, page);
	set_pte(page_table, pte);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	spin_unlock(&mm->page_table_lock);
	return ret;
}

/*
 * We are called with the MM semaphore and page_table_lock
 * spinlock held to protect against concurrent faults in
 * multithreaded programs.
 */
static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, pte_t *page_table, int write_access, unsigned long addr)
{
	pte_t entry;

	/* Read-only mapping of ZERO_PAGE. */
	entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));

	/* ..except if it's a write access */
	if (write_access) {
		struct page *page;

		/* Allocate our own private page. */
		spin_unlock(&mm->page_table_lock);

		page = alloc_page(GFP_HIGHUSER);
		if (!page)
			goto no_mem;
		clear_user_highpage(page, addr);

		spin_lock(&mm->page_table_lock);
		if (!pte_none(*page_table)) {
			page_cache_release(page);
			spin_unlock(&mm->page_table_lock);
			return 1;
		}
		mm->rss++;
		flush_page_to_ram(page);
		entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
		lru_cache_add(page);
		mark_page_accessed(page);
	}

	set_pte(page_table, entry);

	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, addr, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;	/* Minor fault */

no_mem:
	return -1;
}

/*
 * do_no_page() tries to create a new page mapping. It aggressively
 * tries to share with existing pages, but makes a separate copy if
 * the "write_access" parameter is true in order to avoid the next
 * page fault.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * This is called with the MM semaphore held and the page table
 * spinlock held. Exit with the spinlock released.
 */
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
	unsigned long address, int write_access, pte_t *page_table)
{
	struct page * new_page;
	pte_t entry;

	if (!vma->vm_ops || !vma->vm_ops->nopage)
		return do_anonymous_page(mm, vma, page_table, write_access, address);
	spin_unlock(&mm->page_table_lock);

	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);

	if (new_page == NULL)	/* no page was available -- SIGBUS */
		return 0;
	if (new_page == NOPAGE_OOM)
		return -1;

	/*
	 * Should we do an early C-O-W break?
	 */
	if (write_access && !(vma->vm_flags & VM_SHARED)) {
		struct page * page = alloc_page(GFP_HIGHUSER);
		if (!page) {
			page_cache_release(new_page);
			return -1;
		}
		copy_user_highpage(page, new_page, address);
		page_cache_release(new_page);
		lru_cache_add(page);
		new_page = page;
	}

	spin_lock(&mm->page_table_lock);
	/*
	 * This silly early PAGE_DIRTY setting removes a race
	 * due to the bad i386 page protection. But it's valid
	 * for other architectures too.
	 *
	 * Note that if write_access is true, we either now have
	 * an exclusive copy of the page, or this is a shared mapping,
	 * so we can make it writable and dirty to avoid having to
	 * handle that later.
	 */
	/* Only go through if we didn't race with anybody else... */
	if (pte_none(*page_table)) {
		++mm->rss;
		flush_page_to_ram(new_page);
		flush_icache_page(vma, new_page);
		entry = mk_pte(new_page, vma->vm_page_prot);
		if (write_access)
			entry = pte_mkwrite(pte_mkdirty(entry));
		set_pte(page_table, entry);
	} else {
		/* One of our sibling threads was faster, back out. */
		page_cache_release(new_page);
		spin_unlock(&mm->page_table_lock);
		return 1;
	}

	/* no need to invalidate: a not-present page shouldn't be cached */
	update_mmu_cache(vma, address, entry);
	spin_unlock(&mm->page_table_lock);
	return 2;	/* Major fault */
}

/*
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * Note the "page_table_lock". It is to protect against kswapd removing
 * pages from under us. Note that kswapd only ever _removes_ pages, never
 * adds them. As such, once we have noticed that the page is not present,
 * we can drop the lock early.
 *
 * The adding of pages is protected by the MM semaphore (which we hold),
 * so we don't need to worry about a page suddenly being added into
 * our VM.
 *
 * We enter with the pagetable spinlock held, we are supposed to
 * release it when done.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	entry = *pte;
	if (!pte_present(entry)) {
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		if (pte_none(entry))
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, entry, write_access);
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

/*
 * By the time we get here, we already hold the mm semaphore
 */
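/*
 * Return values: 1 for a minor fault, 2 for a major fault (the page
 * had to be read from disk), 0 to signal SIGBUS, and -1 when we are
 * out of memory (see the switch in get_user_pages() above).
 */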
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	pgd_t *pgd;
	pmd_t *pmd;

	current->state = TASK_RUNNING;
	pgd = pgd_offset(mm, address);

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc(mm, pgd, address);

	if (pmd) {
		pte_t * pte = pte_alloc(mm, pmd, address);
		if (pte)
			return handle_pte_fault(mm, vma, address, write_access, pte);
	}
	spin_unlock(&mm->page_table_lock);
	return -1;
}

/*
 * Allocate page middle directory.
 *
 * We've already handled the fast-path in-line, and we own the
 * page table lock.
 *
 * On a two-level page table, this ends up actually being entirely
 * optimized away.
 */
pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	pmd_t *new;

	/* "fast" allocation can happen without dropping the lock.. */
	new = pmd_alloc_one_fast(mm, address);
	if (!new) {
		spin_unlock(&mm->page_table_lock);
		new = pmd_alloc_one(mm, address);
		spin_lock(&mm->page_table_lock);
		if (!new)
			return NULL;

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (!pgd_none(*pgd)) {
			pmd_free(new);
			goto out;
		}
	}
	pgd_populate(mm, pgd, new);
out:
	return pmd_offset(pgd, address);
}

/*
 * Allocate the page table directory.
 *
 * We've already handled the fast-path in-line, and we own the
 * page table lock.
 */
pte_t *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
	if (pmd_none(*pmd)) {
		pte_t *new;

		/* "fast" allocation can happen without dropping the lock.. */
		new = pte_alloc_one_fast(mm, address);
		if (!new) {
			spin_unlock(&mm->page_table_lock);
			new = pte_alloc_one(mm, address);
			spin_lock(&mm->page_table_lock);
			if (!new)
				return NULL;

			/*
			 * Because we dropped the lock, we should re-check the
			 * entry, as somebody else could have populated it..
			 */
			if (!pmd_none(*pmd)) {
				pte_free(new);
				goto out;
			}
		}
		pmd_populate(mm, pmd, new);
	}
out:
	return pte_offset(pmd, address);
}

int make_pages_present(unsigned long addr, unsigned long end)
{
	int ret, len, write;
	struct vm_area_struct * vma;

	vma = find_vma(current->mm, addr);
	write = (vma->vm_flags & VM_WRITE) != 0;
	if (addr >= end)
		BUG();
	if (end > vma->vm_end)
		BUG();
	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
	ret = get_user_pages(current, current->mm, addr,
			len, write, 0, NULL, NULL);
	return ret == len ? 0 : -1;
}

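/*
 * Walk the kernel page tables to find the struct page that backs a
 * vmalloc'ed (or otherwise kernel-mapped) address, or NULL if nothing
 * is mapped there.  Useful for code that needs to hand vmalloc memory
 * to interfaces which want a struct page, e.g. a vma nopage() method.
 */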
struct page * vmalloc_to_page(void * vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pmd_t *pmd;
	pte_t *pte;
	pgd_t *pgd;

	pgd = pgd_offset_k(addr);
	if (!pgd_none(*pgd)) {
		pmd = pmd_offset(pgd, addr);
		if (!pmd_none(*pmd)) {
			pte = pte_offset(pmd, addr);
			if (pte_present(*pte)) {
				page = pte_page(*pte);
			}
		}
	}
	return page;
}
