/*	vm_glue.c	(revision 1542) */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_glue.c	8.6 (Berkeley) 1/5/94
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/user.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>

#include <machine/cpu.h>

int	avefree = 0;		/* XXX */
unsigned maxdmap = MAXDSIZ;	/* XXX */
int	readbuffers = 0;	/* XXX allow kgdb to read kernel buffer pool */

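/*
 * kernacc:
 *
 * Check whether the kernel can access the range of kernel virtual
 * addresses [addr, addr+len) for the given transfer direction
 * (B_READ or B_WRITE).  Returns 1 if the access is permitted.
 */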
int
kernacc(addr, len, rw)
	caddr_t addr;
	int len, rw;
{
	boolean_t rv;
	vm_offset_t saddr, eaddr;
	vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE;

	saddr = trunc_page(addr);
	eaddr = round_page(addr+len);
	rv = vm_map_check_protection(kernel_map, saddr, eaddr, prot);
	/*
	 * XXX there are still some things (e.g. the buffer cache) that
	 * are managed behind the VM system's back so even though an
	 * address is accessible in the mind of the VM system, there may
	 * not be physical pages where the VM thinks there is.  This can
	 * lead to bogus allocation of pages in the kernel address space
	 * or worse, inconsistencies at the pmap level.  We only worry
	 * about the buffer cache for now.
	 */
	if (!readbuffers && rv && (eaddr > (vm_offset_t)buffers &&
		   saddr < (vm_offset_t)buffers + MAXBSIZE * nbuf))
		rv = FALSE;
	return(rv == TRUE);
}

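/*
 * useracc:
 *
 * Like kernacc, but checks the current process's address space rather
 * than the kernel map.  Returns 1 if the user address range is
 * accessible for the given transfer direction.
 */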
int
useracc(addr, len, rw)
	caddr_t addr;
	int len, rw;
{
	boolean_t rv;
	vm_prot_t prot = rw == B_READ ? VM_PROT_READ : VM_PROT_WRITE;

	rv = vm_map_check_protection(&curproc->p_vmspace->vm_map,
	    trunc_page(addr), round_page(addr+len), prot);
	return(rv == TRUE);
}

#ifdef KGDB
/*
 * Change protections on kernel pages from addr to addr+len
 * (presumably so the debugger can plant a breakpoint).
 *
 * We force the protection change at the pmap level.  If we were
 * to use vm_map_protect a change to allow writing would be lazily
 * applied, meaning we would still take a protection fault, something
 * we really don't want to do.  It would also fragment the kernel
 * map unnecessarily.  We cannot use pmap_protect since it also won't
 * enforce a write-enable request.  Using pmap_enter is the only way
 * we can ensure the change takes place properly.
 */
void
chgkprot(addr, len, rw)
	register caddr_t addr;
	int len, rw;
{
	vm_prot_t prot;
	vm_offset_t pa, sva, eva;

	prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE;
	eva = round_page(addr + len);
	for (sva = trunc_page(addr); sva < eva; sva += PAGE_SIZE) {
		/*
		 * Extract physical address for the page.
		 * We use a cheesy hack to differentiate physical
		 * page 0 from an invalid mapping, not that it
		 * really matters...
		 */
		pa = pmap_extract(kernel_pmap, sva|1);
		if (pa == 0)
			panic("chgkprot: invalid page");
		pmap_enter(kernel_pmap, sva, pa&~1, prot, TRUE);
	}
}
#endif

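/*
 * vslock:
 *
 * Wire down the user pages covering [addr, addr+len) in the current
 * process so they cannot be paged out (e.g. for the duration of
 * physical I/O).
 */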
void
vslock(addr, len)
	caddr_t	addr;
	u_int	len;
{
	vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr),
			round_page(addr+len), FALSE);
}

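/*
 * vsunlock:
 *
 * Undo vslock: unwire the user pages covering [addr, addr+len) in the
 * current process.  The "dirtied" argument is currently unused.
 */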
void
vsunlock(addr, len, dirtied)
	caddr_t	addr;
	u_int	len;
	int dirtied;
{
#ifdef	lint
	dirtied++;
#endif
	vm_map_pageable(&curproc->p_vmspace->vm_map, trunc_page(addr),
			round_page(addr+len), TRUE);
}

/*
 * Implement fork's actions on an address space.
 * Here we arrange for the address space to be copied or referenced,
 * allocate a user struct (pcb and kernel stack), then call the
 * machine-dependent layer to fill those in and make the new process
 * ready to run.
 * NOTE: the kernel stack may be at a different location in the child
 * process, and thus addresses of automatic variables may be invalid
 * after cpu_fork returns in the child process.  We do nothing here
 * after cpu_fork returns.
 */
int
vm_fork(p1, p2, isvfork)
	register struct proc *p1, *p2;
	int isvfork;
{
	register struct user *up;
	vm_offset_t addr;

#ifdef i386
	/*
	 * Avoid copying any of the parent's pagetables or other per-process
	 * objects that reside in the map by marking all of them
	 * non-inheritable.
	 */
	(void)vm_map_inherit(&p1->p_vmspace->vm_map,
		UPT_MIN_ADDRESS-UPAGES*NBPG, VM_MAX_ADDRESS, VM_INHERIT_NONE);
#endif
	p2->p_vmspace = vmspace_fork(p1->p_vmspace);

#ifdef SYSVSHM
	if (p1->p_vmspace->vm_shm)
		shmfork(p1, p2, isvfork);
#endif

#ifndef	i386
	/*
	 * Allocate a wired-down (for now) pcb and kernel stack for the process
	 */
	addr = kmem_alloc_pageable(kernel_map, ctob(UPAGES));
	if (addr == 0)
		panic("vm_fork: no more kernel virtual memory");
	vm_map_pageable(kernel_map, addr, addr + ctob(UPAGES), FALSE);
#else
	/*
	 * XXX somehow, on 386, occasionally pageout removes an active,
	 * wired-down kstack and pagetables WITHOUT going through
	 * vm_page_unwire!  Why this appears to work is not yet clear,
	 * yet it does...
	 */
	addr = kmem_alloc(kernel_map, ctob(UPAGES));
	if (addr == 0)
		panic("vm_fork: no more kernel virtual memory");
#endif
	up = (struct user *)addr;
	p2->p_addr = up;

	/*
	 * p_stats and p_sigacts currently point at fields
	 * in the user struct but not at &u, instead at p_addr.
	 * Copy p_sigacts and parts of p_stats; zero the rest
	 * of p_stats (statistics).
	 */
	p2->p_stats = &up->u_stats;
	p2->p_sigacts = &up->u_sigacts;
	up->u_sigacts = *p1->p_sigacts;
	bzero(&up->u_stats.pstat_startzero,
	    (unsigned) ((caddr_t)&up->u_stats.pstat_endzero -
	    (caddr_t)&up->u_stats.pstat_startzero));
	bcopy(&p1->p_stats->pstat_startcopy, &up->u_stats.pstat_startcopy,
	    ((caddr_t)&up->u_stats.pstat_endcopy -
	     (caddr_t)&up->u_stats.pstat_startcopy));

#ifdef i386
	{ u_int addr = UPT_MIN_ADDRESS - UPAGES*NBPG; struct vm_map *vp;

	vp = &p2->p_vmspace->vm_map;
	(void)vm_deallocate(vp, addr, UPT_MAX_ADDRESS - addr);
	(void)vm_allocate(vp, &addr, UPT_MAX_ADDRESS - addr, FALSE);
	(void)vm_map_inherit(vp, addr, UPT_MAX_ADDRESS, VM_INHERIT_NONE);
	}
#endif
	/*
	 * cpu_fork will copy and update the kernel stack and pcb,
	 * and make the child ready to run.  It marks the child
	 * so that it can return differently than the parent.
	 * It returns twice, once in the parent process and
	 * once in the child.
	 */
	return (cpu_fork(p1, p2));
}

/*
 * Set default limits for VM system.
 * Called for proc 0, and then inherited by all others.
 */
void
vm_init_limits(p)
	register struct proc *p;
{

	/*
	 * Set up the initial limits on process VM.
	 * Set the maximum resident set size to be all
	 * of (reasonably) available memory.  This causes
	 * any single, large process to start random page
	 * replacement once it fills memory.
	 */
	p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
	p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
	p->p_rlimit[RLIMIT_DATA].rlim_max = MAXDSIZ;
	p->p_rlimit[RLIMIT_RSS].rlim_cur = ptoa(cnt.v_free_count);
}

#include <vm/vm_pageout.h>

#ifdef DEBUG
int	enableswap = 1;
int	swapdebug = 0;
#define	SDB_FOLLOW	1
#define SDB_SWAPIN	2
#define SDB_SWAPOUT	4
#endif

/*
 * Brutally simple:
 *	1. Attempt to swap in every swapped-out, runnable process in
 *	   order of priority.
 *	2. If not enough memory, wake the pageout daemon and let it
 *	   clear some space.
 */
void
scheduler()
{
	register struct proc *p;
	register int pri;
	struct proc *pp;
	int ppri;
	vm_offset_t addr;
	vm_size_t size;

loop:
#ifdef DEBUG
	while (!enableswap)
		sleep((caddr_t)&proc0, PVM);
#endif
	pp = NULL;
	ppri = INT_MIN;
	for (p = (struct proc *)allproc; p != NULL; p = p->p_next) {
		if (p->p_stat == SRUN && (p->p_flag & P_INMEM) == 0) {
			pri = p->p_swtime + p->p_slptime - p->p_nice * 8;
			if (pri > ppri) {
				pp = p;
				ppri = pri;
			}
		}
	}
#ifdef DEBUG
	if (swapdebug & SDB_FOLLOW)
		printf("sched: running, procp %x pri %d\n", pp, ppri);
#endif
	/*
	 * Nothing to do, back to sleep
	 */
	if ((p = pp) == NULL) {
		sleep((caddr_t)&proc0, PVM);
		goto loop;
	}

	/*
	 * We would like to bring someone in.
	 * This part is really bogus because we could deadlock on memory
	 * despite our feeble check.
	 */
	size = round_page(ctob(UPAGES));
	addr = (vm_offset_t) p->p_addr;
	if (cnt.v_free_count > atop(size)) {
#ifdef DEBUG
		if (swapdebug & SDB_SWAPIN)
			printf("swapin: pid %d(%s)@%x, pri %d free %d\n",
			       p->p_pid, p->p_comm, p->p_addr,
			       ppri, cnt.v_free_count);
#endif
		vm_map_pageable(kernel_map, addr, addr+size, FALSE);
		/*
		 * Some architectures need to be notified when the
		 * user area has moved to new physical page(s) (e.g.
		 * see pmax/pmax/vm_machdep.c).
		 */
		cpu_swapin(p);
		(void) splstatclock();
		if (p->p_stat == SRUN)
			setrunqueue(p);
		p->p_flag |= P_INMEM;
		(void) spl0();
		p->p_swtime = 0;
		goto loop;
	}
	/*
	 * Not enough memory, jab the pageout daemon and wait until the
	 * coast is clear.
	 */
#ifdef DEBUG
	if (swapdebug & SDB_FOLLOW)
		printf("sched: no room for pid %d(%s), free %d\n",
		       p->p_pid, p->p_comm, cnt.v_free_count);
#endif
	(void) splhigh();
	VM_WAIT;
	(void) spl0();
#ifdef DEBUG
	if (swapdebug & SDB_FOLLOW)
		printf("sched: room again, free %d\n", cnt.v_free_count);
#endif
	goto loop;
}

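/*
 * A process is a candidate for swapping out only if it is resident
 * (P_INMEM) and none of the exclusion flags is set: it is not a system
 * process, is not marked no-swap, is not exiting and is not in the
 * middle of physical I/O.
 */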
#define	swappable(p)							\
	(((p)->p_flag &							\
	    (P_SYSTEM | P_INMEM | P_NOSWAP | P_WEXIT | P_PHYSIO)) == P_INMEM)

/*
 * Swapout is driven by the pageout daemon.  Very simple, we find eligible
 * procs and unwire their u-areas.  We try to always "swap" at least one
 * process in case we need the room for a swapin.
 * If any procs have been sleeping/stopped for at least maxslp seconds,
 * they are swapped.  Else, we swap the longest-sleeping or stopped process,
 * if any, otherwise the longest-resident process.
 */
void
swapout_threads()
{
	register struct proc *p;
	struct proc *outp, *outp2;
	int outpri, outpri2;
	int didswap = 0;
	extern int maxslp;

#ifdef DEBUG
	if (!enableswap)
		return;
#endif
	outp = outp2 = NULL;
	outpri = outpri2 = 0;
	for (p = (struct proc *)allproc; p != NULL; p = p->p_next) {
		if (!swappable(p))
			continue;
		switch (p->p_stat) {
		case SRUN:
			if (p->p_swtime > outpri2) {
				outp2 = p;
				outpri2 = p->p_swtime;
			}
			continue;

		case SSLEEP:
		case SSTOP:
			if (p->p_slptime >= maxslp) {
				swapout(p);
				didswap++;
			} else if (p->p_slptime > outpri) {
				outp = p;
				outpri = p->p_slptime;
			}
			continue;
		}
	}
	/*
	 * If we didn't get rid of any real duds, toss out the next most
	 * likely sleeping/stopped or running candidate.  We only do this
	 * if we are real low on memory since we don't gain much by doing
	 * it (UPAGES pages).
	 */
	if (didswap == 0 &&
	    cnt.v_free_count <= atop(round_page(ctob(UPAGES)))) {
		if ((p = outp) == 0)
			p = outp2;
#ifdef DEBUG
		if (swapdebug & SDB_SWAPOUT)
			printf("swapout_threads: no duds, try procp %x\n", p);
#endif
		if (p)
			swapout(p);
	}
}

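/*
 * swapout:
 *
 * Unwire the u-area (pcb and kernel stack) of the given process and mark
 * it as no longer resident, removing it from the run queue if necessary.
 * scheduler() brings it back in later.
 */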
void
swapout(p)
	register struct proc *p;
{
	vm_offset_t addr;
	vm_size_t size;

#ifdef DEBUG
	if (swapdebug & SDB_SWAPOUT)
		printf("swapout: pid %d(%s)@%x, stat %x pri %d free %d\n",
		       p->p_pid, p->p_comm, p->p_addr, p->p_stat,
		       p->p_slptime, cnt.v_free_count);
#endif
	size = round_page(ctob(UPAGES));
	addr = (vm_offset_t) p->p_addr;
#if defined(hp300) || defined(luna68k)
	/*
	 * Ugh!  u-area is double mapped to a fixed address behind the
	 * back of the VM system and accesses are usually through that
	 * address rather than the per-process address.  Hence reference
	 * and modify information are recorded at the fixed address and
	 * lost at context switch time.  We assume the u-struct and
	 * kernel stack are always accessed/modified and force it to be so.
	 */
	{
		register int i;
		volatile long tmp;

		for (i = 0; i < UPAGES; i++) {
			tmp = *(long *)addr; *(long *)addr = tmp;
			addr += NBPG;
		}
		addr = (vm_offset_t) p->p_addr;
	}
#endif
#ifdef mips
	/*
	 * Be sure to save the floating point coprocessor state before
	 * paging out the u-struct.
	 */
	{
		extern struct proc *machFPCurProcPtr;

		if (p == machFPCurProcPtr) {
			MachSaveCurFPState(p);
			machFPCurProcPtr = (struct proc *)0;
		}
	}
#endif
#ifndef	i386 /* temporary measure till we find spontaneous unwire of kstack */
	vm_map_pageable(kernel_map, addr, addr+size, TRUE);
	pmap_collect(vm_map_pmap(&p->p_vmspace->vm_map));
#endif
	(void) splhigh();
	p->p_flag &= ~P_INMEM;
	if (p->p_stat == SRUN)
		remrq(p);
	(void) spl0();
	p->p_swtime = 0;
}

/*
 * The rest of these routines fake thread handling
 */

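/*
 * Record the event the current process intends to sleep on.
 * The "ruptible" argument is ignored.
 */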
void
assert_wait(event, ruptible)
	int event;
	boolean_t ruptible;
{
#ifdef lint
	ruptible++;
#endif
	curproc->p_thread = event;
}

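/*
 * Sleep on the event previously recorded by assert_wait(), if any.
 */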
void
thread_block()
{
	int s = splhigh();

	if (curproc->p_thread)
		sleep((caddr_t)curproc->p_thread, PVM);
	splx(s);
}

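/*
 * Record the event, release the given simple lock and sleep on the
 * event, all at raised priority so no wakeup is lost in between.
 */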
void
thread_sleep(event, lock, ruptible)
	int event;
	simple_lock_t lock;
	boolean_t ruptible;
{
	int s = splhigh();

#ifdef lint
	ruptible++;
#endif
	curproc->p_thread = event;
	simple_unlock(lock);
	if (curproc->p_thread)
		sleep((caddr_t)event, PVM);
	splx(s);
}

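/*
 * Wake up all processes sleeping on the given event.
 */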
void
thread_wakeup(event)
	int event;
{
	int s = splhigh();

	wakeup((caddr_t)event);
	splx(s);
}

/*
 * DEBUG stuff
 */

int indent = 0;

#include <machine/stdarg.h>		/* see subr_prf.c */

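/*
 * Indented printf for the debugging code: emit `indent' columns of
 * whitespace (tabs then spaces) before formatting the message.
 */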
/*ARGSUSED2*/
void
#if __STDC__
iprintf(const char *fmt, ...)
#else
iprintf(fmt /* , va_alist */)
	char *fmt;
	/* va_dcl */
#endif
{
	register int i;
	va_list ap;

	for (i = indent; i >= 8; i -= 8)
		printf("\t");
	while (--i >= 0)
		printf(" ");
	va_start(ap, fmt);
	printf("%r", fmt, ap);
	va_end(ap);
}
