vm_machdep.c revision 1362
1/*-
2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * the Systems Programming Group of the University of Utah Computer
9 * Science Department, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
40 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
41 *	$Id: vm_machdep.c,v 1.18 1994/04/05 03:23:09 davidg Exp $
42 */
43
44#include "npx.h"
45#include "param.h"
46#include "systm.h"
47#include "proc.h"
48#include "malloc.h"
49#include "buf.h"
50#include "user.h"
51
52#include "../include/cpu.h"
53
54#include "vm/vm.h"
55#include "vm/vm_kern.h"
56
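/* cldisksort() sorts on b_cylin, which is overlaid on b_resid */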
57#define b_cylin b_resid
58
59#ifndef NOBOUNCE
60
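/*
 * Bounce buffer state: bouncememory/bouncepa/bouncepaend describe a pool of
 * physically contiguous pages allocated low enough for ISA DMA,
 * bounceallocarray is the allocation bitmap for that pool (bouncefree pages
 * remaining), io_map is the kernel submap used for bounce kva, and
 * bpwait/bmwait/bmfreeing are sleep and serialization flags for the page
 * and kva allocators.  A driver typically calls vm_bounce_alloc(bp) from
 * its strategy path before queueing the buffer; vm_bounce_free(bp) is
 * called from biodone().
 */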
61caddr_t		bouncememory;
62vm_offset_t	bouncepa, bouncepaend;
63int		bouncepages, bpwait;
64vm_map_t	io_map;
65int		bmwait, bmfreeing;
66
67#define BITS_IN_UNSIGNED (8*sizeof(unsigned))
68int		bounceallocarraysize;
69unsigned	*bounceallocarray;
70int		bouncefree;
71
72#define SIXTEENMEG (4096*4096)
73#define MAXBKVA 1024
74
75/* special list that can be used at interrupt time for eventual kva free */
76struct kvasfree {
77	vm_offset_t addr;
78	vm_offset_t size;
79} kvaf[MAXBKVA];
80
81int		kvasfreecnt;
82
83vm_offset_t vm_bounce_kva();
84/*
85 * get bounce buffer pages (count physically contiguous)
86 * (only 1 implemented now)
87 */
88vm_offset_t
89vm_bounce_page_find(count)
90	int count;
91{
92	int bit;
93	int s,i;
94
95	if (count != 1)
96		panic("vm_bounce_page_find -- no support for > 1 page yet!!!");
97
98	s = splbio();
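	/* scan the allocation bitmap for a free page; sleep and retry if none */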
99retry:
100	for (i = 0; i < bounceallocarraysize; i++) {
101		if (bounceallocarray[i] != 0xffffffff) {
102			if ((bit = ffs(~bounceallocarray[i])) != 0) {
103				bounceallocarray[i] |= 1 << (bit - 1);
104				bouncefree -= count;
105				splx(s);
106				return bouncepa + (i * BITS_IN_UNSIGNED + (bit - 1)) * NBPG;
107			}
108		}
109	}
110	bpwait = 1;
111	tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0);
112	goto retry;
113}
114
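/*
 * queue a bounce kva range for later freeing; usable from interrupt level.
 * if "now" is set, the deferred list is flushed immediately via
 * vm_bounce_kva(0, 0); otherwise anyone sleeping on io_map is awakened and
 * the range is reclaimed on the next vm_bounce_kva() call.
 */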
115void
116vm_bounce_kva_free(addr, size, now)
117	vm_offset_t addr;
118	vm_offset_t size;
119	int now;
120{
121	int s = splbio();
122	kvaf[kvasfreecnt].addr = addr;
123	kvaf[kvasfreecnt++].size = size;
124	if( now)
125		vm_bounce_kva(0,0);
126	else
127		wakeup((caddr_t) io_map);
128	splx(s);
129}
130
131/*
132 * free count bounce buffer pages
133 */
134void
135vm_bounce_page_free(pa, count)
136	vm_offset_t pa;
137	int count;
138{
139	int allocindex;
140	int index;
141	int bit;
142
143	if (count != 1)
144		panic("vm_bounce_page_free -- no support for > 1 page yet!!!\n");
145
146	index = (pa - bouncepa) / NBPG;
147
148	if ((index < 0) || (index >= bouncepages))
149		panic("vm_bounce_page_free -- bad index\n");
150
151	allocindex = index / BITS_IN_UNSIGNED;
152	bit = index % BITS_IN_UNSIGNED;
153
154	bounceallocarray[allocindex] &= ~(1 << bit);
155
156	bouncefree += count;
157	if (bpwait) {
158		bpwait = 0;
159		wakeup((caddr_t) &bounceallocarray);
160	}
161}
162
163/*
164 * allocate count bytes of bounce buffer kva (and flush the deferred kva free list)
165 */
166vm_offset_t
167vm_bounce_kva(count, waitok)
168	int count;
169	int waitok;
170{
171	int tofree;
172	int i;
173	int startfree;
174	vm_offset_t kva = 0;
175	int s = splbio();
176	int size = count;
177	startfree = 0;
178more:
179	if (!bmfreeing && (tofree = kvasfreecnt)) {
180		bmfreeing = 1;
181		for (i = startfree; i < kvasfreecnt; i++) {
182			/*
183			 * if we have a kva of the right size, no sense
184			 * in freeing/reallocating...
185			 * might affect fragmentation short term, but
186			 * as long as the io_map is significantly larger
187			 * than the maximum transfer size, I don't think
188			 * that it is a problem.
189			 */
190			pmap_remove(kernel_pmap,
191				kvaf[i].addr, kvaf[i].addr + kvaf[i].size);
192			if( size && !kva && kvaf[i].size == size) {
193				kva = kvaf[i].addr;
194			} else {
195				kmem_free_wakeup(io_map, kvaf[i].addr,
196					kvaf[i].size);
197			}
198		}
199		if (kvasfreecnt != tofree) {
200			startfree = i;
201			bmfreeing = 0;
202			goto more;
203		}
204		kvasfreecnt = 0;
205		bmfreeing = 0;
206	}
207
208	if( size == 0) {
209		splx(s);
210		return NULL;
211	}
212
213	if (!kva && !(kva = kmem_alloc_pageable(io_map, size))) {
214		if( !waitok) {
215			splx(s);
216			return NULL;
217		}
218		bmwait = 1;
219		tsleep((caddr_t) io_map, PRIBIO, "bmwait", 0);
220		goto more;
221	}
222	splx(s);
223
224	return kva;
225}
226
227/*
228 * do the things necessary to the struct buf to implement
229 * bounce buffers...  inserted before the disk sort
230 */
231void
232vm_bounce_alloc(bp)
233	struct buf *bp;
234{
235	int countvmpg;
236	vm_offset_t vastart, vaend;
237	vm_offset_t vapstart, vapend;
238	vm_offset_t va, kva;
239	vm_offset_t pa;
240	int dobounceflag = 0;
241	int bounceindex;
242	int i;
243	int s;
244
245	if (bouncepages == 0)
246		return;
247
248	if (bp->b_bufsize < bp->b_bcount) {
249		printf("vm_bounce_alloc: b_bufsize(%d) < b_bcount(%d) !!!!\n",
250			bp->b_bufsize, bp->b_bcount);
251		bp->b_bufsize = bp->b_bcount;
252	}
253
254	vastart = (vm_offset_t) bp->b_un.b_addr;
255	vaend = (vm_offset_t) bp->b_un.b_addr + bp->b_bufsize;
256
257	vapstart = i386_trunc_page(vastart);
258	vapend = i386_round_page(vaend);
259	countvmpg = (vapend - vapstart) / NBPG;
260
261/*
262 * if any page is above 16MB, then go into bounce-buffer mode
263 */
264	va = vapstart;
265	for (i = 0; i < countvmpg; i++) {
266		pa = pmap_kextract(va);
267		if (pa >= SIXTEENMEG)
268			++dobounceflag;
269		va += NBPG;
270	}
271	if (dobounceflag == 0)
272		return;
273
274	if (bouncepages < dobounceflag)
275		panic("Not enough bounce buffers!!!");
276
277/*
278 * allocate a replacement kva for b_addr
279 */
280	kva = vm_bounce_kva(countvmpg*NBPG, 1);
281	va = vapstart;
282	for (i = 0; i < countvmpg; i++) {
283		pa = pmap_kextract(va);
284		if (pa >= SIXTEENMEG) {
285			/*
286			 * allocate a replacement page
287			 */
288			vm_offset_t bpa = vm_bounce_page_find(1);
289			pmap_kenter(kva + (NBPG * i), bpa);
290			/*
291			 * if we are writing, then copy the data into the page
292			 */
293			if ((bp->b_flags & B_READ) == 0) {
294				pmap_update();
295				bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG);
296			}
297		} else {
298			/*
299			 * use original page
300			 */
301			pmap_kenter(kva + (NBPG * i), pa);
302		}
303		va += NBPG;
304	}
305	pmap_update();
306
307/*
308 * flag the buffer as being bounced
309 */
310	bp->b_flags |= B_BOUNCE;
311/*
312 * save the original buffer kva
313 */
314	bp->b_savekva = bp->b_un.b_addr;
315/*
316 * put our new kva into the buffer (offset by original offset)
317 */
318	bp->b_un.b_addr = (caddr_t) (((vm_offset_t) kva) |
319				((vm_offset_t) bp->b_savekva & (NBPG - 1)));
320	return;
321}
322
323/*
324 * hook into biodone to free bounce buffer
325 */
326void
327vm_bounce_free(bp)
328	struct buf *bp;
329{
330	int i;
331	vm_offset_t origkva, bouncekva;
332	vm_offset_t vastart, vaend;
333	vm_offset_t vapstart, vapend;
334	int countbounce = 0;
335	vm_offset_t firstbouncepa = 0;
336	int firstbounceindex;
337	int countvmpg;
338	vm_offset_t bcount;
339	int s;
340
341/*
342 * if this isn't a bounced buffer, then just return
343 */
344	if ((bp->b_flags & B_BOUNCE) == 0)
345		return;
346
347	origkva = (vm_offset_t) bp->b_savekva;
348	bouncekva = (vm_offset_t) bp->b_un.b_addr;
349
350	vastart = bouncekva;
351	vaend = bouncekva + bp->b_bufsize;
352	bcount = bp->b_bufsize;
353
354	vapstart = i386_trunc_page(vastart);
355	vapend = i386_round_page(vaend);
356
357	countvmpg = (vapend - vapstart) / NBPG;
358
359/*
360 * check every page in the kva space for b_addr
361 */
362	for (i = 0; i < countvmpg; i++) {
363		vm_offset_t mybouncepa;
364		vm_offset_t copycount;
365
366		copycount = i386_round_page(bouncekva + 1) - bouncekva;
367		mybouncepa = pmap_kextract(i386_trunc_page(bouncekva));
368
369/*
370 * if this is a bounced pa, then process as one
371 */
372		if ((mybouncepa >= bouncepa) && (mybouncepa < bouncepaend)) {
373			if (copycount > bcount)
374				copycount = bcount;
375/*
376 * if this is a read, then copy from bounce buffer into original buffer
377 */
378			if (bp->b_flags & B_READ)
379				bcopy((caddr_t) bouncekva, (caddr_t) origkva, copycount);
380/*
381 * free the bounce allocation
382 */
383			vm_bounce_page_free(i386_trunc_page(mybouncepa), 1);
384		}
385
386		origkva += copycount;
387		bouncekva += copycount;
388		bcount -= copycount;
389	}
390
391/*
392 * add the old kva into the "to free" list
393 */
394	bouncekva = i386_trunc_page((vm_offset_t) bp->b_un.b_addr);
395	vm_bounce_kva_free( bouncekva, countvmpg*NBPG, 0);
396	if (bmwait) {
397		/*
398		 * if anyone is waiting on the bounce-map, then wakeup
399		 */
400		wakeup((caddr_t) io_map);
401		bmwait = 0;
402	}
403
404	bp->b_un.b_addr = bp->b_savekva;
405	bp->b_savekva = 0;
406	bp->b_flags &= ~B_BOUNCE;
407
408	return;
409}
410
411#endif /* NOBOUNCE */
412
413/*
414 * init the bounce buffer system
415 */
416void
417vm_bounce_init()
418{
419	vm_offset_t minaddr, maxaddr;
420
421	io_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, MAXBKVA * NBPG, FALSE);
422	kvasfreecnt = 0;
423
424#ifndef NOBOUNCE
425	if (bouncepages == 0)
426		return;
427
428	bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED;
429	bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT);
430
431	if (!bounceallocarray)
432		panic("Cannot allocate bounce resource array\n");
433
434	bzero(bounceallocarray, bounceallocarraysize * sizeof(unsigned));
435
436
437	bouncepa = pmap_kextract((vm_offset_t) bouncememory);
438	bouncepaend = bouncepa + bouncepages * NBPG;
439	bouncefree = bouncepages;
440#endif
441
442}
443
444
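/*
 * map the physical pages behind two existing page-aligned transfers into a
 * single new kva range so that the two can be issued as one I/O.
 */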
445static void
446cldiskvamerge( kvanew, orig1, orig1cnt, orig2, orig2cnt)
447	vm_offset_t kvanew;
448	vm_offset_t orig1, orig1cnt;
449	vm_offset_t orig2, orig2cnt;
450{
451	int i;
452	vm_offset_t pa;
453/*
454 * enter the transfer physical addresses into the new kva
455 */
456	for(i=0;i<orig1cnt;i++) {
457		vm_offset_t pa;
458		pa = pmap_kextract((caddr_t) orig1 + i * PAGE_SIZE);
459		pmap_kenter(kvanew + i * PAGE_SIZE, pa);
460	}
461
462	for(i=0;i<orig2cnt;i++) {
463		vm_offset_t pa;
464		pa = pmap_kextract((caddr_t) orig2 + i * PAGE_SIZE);
465		pmap_kenter(kvanew + (i + orig1cnt) * PAGE_SIZE, pa);
466	}
467	pmap_update();
468}
469
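/*
 * sort a buffer onto a device queue using the classic one-way elevator
 * (ascending cylinder) algorithm and, when the transfers are page aligned
 * and contiguous on the disk, cluster the new buffer with its neighbor
 * into a single B_CLUSTER transfer of at most maxio bytes.
 */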
470void
471cldisksort(struct buf *dp, struct buf *bp, vm_offset_t maxio)
472{
473	register struct buf *ap, *newbp;
474	int i, trycount=0;
475	vm_offset_t orig1pages, orig2pages;
476	vm_offset_t orig1begin, orig2begin;
477	vm_offset_t kvanew, kvaorig;
478
479	/*
480	 * If nothing on the activity queue, then
481	 * we become the only thing.
482	 */
483	ap = dp->b_actf;
484	if(ap == NULL) {
485		dp->b_actf = bp;
486		dp->b_actl = bp;
487		bp->av_forw = NULL;
488		return;
489	}
490
491	/*
492	 * If we lie after the first (currently active)
493	 * request, then we must locate the second request list
494	 * and add ourselves to it.
495	 */
496
497	if (bp->b_cylin < ap->b_cylin) {
498		while (ap->av_forw) {
499			/*
500			 * Check for an ``inversion'' in the
501			 * normally ascending cylinder numbers,
502			 * indicating the start of the second request list.
503			 */
504			if (ap->av_forw->b_cylin < ap->b_cylin) {
505				/*
506				 * Search the second request list
507				 * for the first request at a larger
508				 * cylinder number.  We go before that;
509				 * if there is no such request, we go at end.
510				 */
511				do {
512					if (bp->b_cylin < ap->av_forw->b_cylin)
513						goto insert;
514					ap = ap->av_forw;
515				} while (ap->av_forw);
516				goto insert;		/* after last */
517			}
518			ap = ap->av_forw;
519		}
520		/*
521		 * No inversions... we will go after the last, and
522		 * be the first request in the second request list.
523		 */
524		goto insert;
525	}
526	/*
527	 * Request is at/after the current request...
528	 * sort in the first request list.
529	 */
530	while (ap->av_forw) {
531		/*
532		 * We want to go after the current request
533		 * if there is an inversion after it (i.e. it is
534		 * the end of the first request list), or if
535		 * the next request is a larger cylinder than our request.
536		 */
537		if (ap->av_forw->b_cylin < ap->b_cylin ||
538		    bp->b_cylin < ap->av_forw->b_cylin )
539			goto insert;
540		ap = ap->av_forw;
541	}
542
543insert:
544	/*
545	 * we currently only cluster I/O transfers that start at page-aligned
546	 * kvas and are a multiple of the page size in length.
547	 */
548	if(((bp->b_bcount & PAGE_MASK) == 0) &&
549		(((vm_offset_t) bp->b_un.b_addr & PAGE_MASK) == 0)) {
550		/*
551		 * merge with previous?
552		 * conditions:
553		 * 	1) We reside physically immediately after the previous block.
554		 *	2) The previous block is not first on the device queue because
555		 *	   such a block might be active.
556		 *  3) The mode of the two I/Os is identical.
557		 *  4) The previous kva is page aligned and the previous transfer
558		 *	   is a multiple of a page in length.
559		 *	5) And the total I/O size would be below the maximum.
560		 */
561		if( (ap->b_blkno + (ap->b_bcount / DEV_BSIZE) == bp->b_blkno) &&
562			(dp->b_actf != ap) &&
563			((ap->b_flags & ~B_CLUSTER) == bp->b_flags) &&
564			((ap->b_bcount & PAGE_MASK) == 0) &&
565			(((vm_offset_t) ap->b_un.b_addr & PAGE_MASK) == 0) &&
566			(ap->b_bcount + bp->b_bcount < maxio)) {
567
568			orig1begin = (vm_offset_t) ap->b_un.b_addr;
569			orig1pages = ap->b_bcount / PAGE_SIZE;
570
571			orig2begin = (vm_offset_t) bp->b_un.b_addr;
572			orig2pages = bp->b_bcount / PAGE_SIZE;
573			/*
574			 * see if we can allocate a kva; if we cannot, then don't
575			 * cluster.
576			 */
577			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
578			if( !kvanew) {
579				goto nocluster;
580			}
581
582
583			if( (ap->b_flags & B_CLUSTER) == 0) {
584
585				/*
586				 * get a physical buf pointer
587				 */
588				newbp = (struct buf *)trypbuf();
589				if( !newbp) {
590					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
591					goto nocluster;
592				}
593
594				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
595
596				/*
597				 * build the new bp to be handed off to the device
598				 */
599
600				*newbp = *ap;
601				newbp->b_flags |= B_CLUSTER;
602				newbp->b_un.b_addr = (caddr_t) kvanew;
603				newbp->b_bcount += bp->b_bcount;
604				newbp->b_bufsize = newbp->b_bcount;
605				newbp->b_clusterf = ap;
606				newbp->b_clusterl = bp;
607
608				/*
609				 * enter the new bp onto the device queue
610				 */
611				if( ap->av_forw)
612					ap->av_forw->av_back = newbp;
613				else
614					dp->b_actl = newbp;
615
616				if( dp->b_actf != ap )
617					ap->av_back->av_forw = newbp;
618				else
619					dp->b_actf = newbp;
620
621				/*
622				 * enter the previous bps onto the cluster queue
623				 */
624				ap->av_forw = bp;
625				bp->av_back = ap;
626
627				ap->av_back = NULL;
628				bp->av_forw = NULL;
629
630			} else {
631				vm_offset_t addr;
632
633				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
634				/*
635				 * free the old kva
636				 */
637				vm_bounce_kva_free( orig1begin, ap->b_bufsize, 0);
638
639				ap->b_un.b_addr = (caddr_t) kvanew;
640
641				ap->b_clusterl->av_forw = bp;
642				bp->av_forw = NULL;
643				bp->av_back = ap->b_clusterl;
644				ap->b_clusterl = bp;
645
646				ap->b_bcount += bp->b_bcount;
647				ap->b_bufsize = ap->b_bcount;
648			}
649			return;
650		/*
651		 * merge with next?
652		 * conditions:
653		 * 	1) We reside physically before the next block.
654		 *  3) The mode of the two I/Os is identical.
655		 *  4) The next kva is page aligned and the next transfer
656		 *	   is a multiple of a page in length.
657		 *	5) And the total I/O size would be below the maximum.
658		 */
659		} else if( ap->av_forw &&
660			(bp->b_blkno + (bp->b_bcount / DEV_BSIZE) == ap->av_forw->b_blkno) &&
661			(bp->b_flags == (ap->av_forw->b_flags & ~B_CLUSTER)) &&
662			((ap->av_forw->b_bcount & PAGE_MASK) == 0) &&
663			(((vm_offset_t) ap->av_forw->b_un.b_addr & PAGE_MASK) == 0) &&
664			(ap->av_forw->b_bcount + bp->b_bcount < maxio)) {
665
666			orig1begin = (vm_offset_t) bp->b_un.b_addr;
667			orig1pages = bp->b_bcount / PAGE_SIZE;
668
669			orig2begin = (vm_offset_t) ap->av_forw->b_un.b_addr;
670			orig2pages = ap->av_forw->b_bcount / PAGE_SIZE;
671
672			/*
673			 * see if we can allocate a kva; if we cannot, then don't
674			 * cluster.
675			 */
676			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
677			if( !kvanew) {
678				goto nocluster;
679			}
680
681
682			/*
683			 * if next isn't a cluster we need to create one
684			 */
685			if( (ap->av_forw->b_flags & B_CLUSTER) == 0) {
686
687				/*
688				 * get a physical buf pointer
689				 */
690				newbp = (struct buf *)trypbuf();
691				if( !newbp) {
692					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
693					goto nocluster;
694				}
695
696				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
697
698				pmap_update();
699
700				ap = ap->av_forw;
701				*newbp = *ap;
702				newbp->b_flags |= B_CLUSTER;
703				newbp->b_un.b_addr = (caddr_t) kvanew;
704				newbp->b_blkno = bp->b_blkno;
705				newbp->b_bcount += bp->b_bcount;
706				newbp->b_bufsize = newbp->b_bcount;
707				newbp->b_clusterf = bp;
708				newbp->b_clusterl = ap;
709
710				if( ap->av_forw)
711					ap->av_forw->av_back = newbp;
712				else
713					dp->b_actl = newbp;
714
715				if( dp->b_actf != ap )
716					ap->av_back->av_forw = newbp;
717				else
718					dp->b_actf = newbp;
719
720				bp->av_forw = ap;
721				ap->av_back = bp;
722
723				bp->av_back = NULL;
724				ap->av_forw = NULL;
725			} else {
726				vm_offset_t addr;
727
728				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
729				ap = ap->av_forw;
730				vm_bounce_kva_free( orig2begin, ap->b_bufsize, 0);
731
732				ap->b_un.b_addr = (caddr_t) kvanew;
733				bp->av_forw = ap->b_clusterf;
734				ap->b_clusterf->av_back = bp;
735				ap->b_clusterf = bp;
736				bp->av_back = NULL;
737
738				ap->b_blkno = bp->b_blkno;
739				ap->b_bcount += bp->b_bcount;
740				ap->b_bufsize = ap->b_bcount;
741
742			}
743			return;
744		}
745	}
746	/*
747	 * don't merge
748	 */
749nocluster:
750	bp->av_forw = ap->av_forw;
751	if( bp->av_forw)
752		bp->av_forw->av_back = bp;
753	else
754		dp->b_actl = bp;
755
756	ap->av_forw = bp;
757	bp->av_back = ap;
758}
759
760
761/*
762 * Finish a fork operation, with process p2 nearly set up.
763 * Copy and update the kernel stack and pcb, making the child
764 * ready to run, and marking it so that it can return differently
765 * than the parent.  Returns 1 in the child process, 0 in the parent.
766 * We currently double-map the user area so that the stack is at the same
767 * address in each process; in the future we will probably relocate
768 * the frame pointers on the stack after copying.
769 */
770int
771cpu_fork(p1, p2)
772	register struct proc *p1, *p2;
773{
774	register struct user *up = p2->p_addr;
775	int foo, offset, addr, i;
776	extern char kstack[];
777	extern int mvesp();
778
779	/*
780	 * Copy pcb and stack from proc p1 to p2.
781	 * We do this as cheaply as possible, copying only the active
782	 * part of the stack.  The stack and pcb need to agree;
783	 * this is tricky, as the final pcb is constructed by savectx,
784	 * but its frame isn't yet on the stack when the stack is copied.
785	 * swtch compensates for this when the child eventually runs.
786	 * This should be done differently, with a single call
787	 * that copies and updates the pcb+stack,
788	 * replacing the bcopy and savectx.
789	 */
790	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
791	offset = mvesp() - (int)kstack;
792	bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset,
793	    (unsigned) ctob(UPAGES) - offset);
794	p2->p_regs = p1->p_regs;
795
796	/*
797	 * Wire top of address space of child to its kstack.
798	 * First, fault in a page of pte's to map it.
799	 */
800#if 0
801        addr = trunc_page((u_int)vtopte(kstack));
802	vm_map_pageable(&p2->p_vmspace->vm_map, addr, addr+NBPG, FALSE);
803	for (i=0; i < UPAGES; i++)
804		pmap_enter(&p2->p_vmspace->vm_pmap, kstack+i*NBPG,
805			   pmap_extract(kernel_pmap, ((int)p2->p_addr)+i*NBPG),
806			   /*
807			    * The user area has to be mapped writable because
808			    * it contains the kernel stack (when CR0_WP is on
809			    * on a 486 there is no user-read/kernel-write
810			    * mode).  It is protected from user mode access
811			    * by the segment limits.
812			    */
813			   VM_PROT_READ|VM_PROT_WRITE, TRUE);
814#endif
815	pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb);
816
817	/*
818	 *
819	 * Arrange for a non-local goto when the new process
820	 * is started, to resume here, returning nonzero from setjmp.
821	 */
822	if (savectx(up, 1)) {
823		/*
824		 * Return 1 in child.
825		 */
826		return (1);
827	}
828	return (0);
829}
830
831#ifdef notyet
832/*
833 * cpu_exit is called as the last action during exit.
834 *
835 * We change to an inactive address space and a "safe" stack,
836 * passing thru an argument to the new stack. Now, safely isolated
837 * from the resources we're shedding, we release the address space
838 * and any remaining machine-dependent resources, including the
839 * memory for the user structure and kernel stack.
840 *
841 * Next, we assign a dummy context to be written over by swtch,
842 * calling it to send this process off to oblivion.
843 * [The nullpcb allows us to minimize cost in swtch() by not having
844 * a special case].
845 */
846struct proc *swtch_to_inactive();
847volatile void
848cpu_exit(p)
849	register struct proc *p;
850{
851	static struct pcb nullpcb;	/* pcb to overwrite on last swtch */
852
853#if NNPX > 0
854	npxexit(p);
855#endif	/* NNPX */
856
857	/* move to inactive space and stack, passing arg across */
858	p = swtch_to_inactive(p);
859
860	/* drop per-process resources */
861	vmspace_free(p->p_vmspace);
862	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
863
864	p->p_addr = (struct user *) &nullpcb;
865	splclock();
866	swtch();
867	/* NOTREACHED */
868}
869#else
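/*
 * cpu_exit is called as the last action during exit; release the FPU state
 * and switch away for the last time.  The user area and address space are
 * reclaimed later by cpu_wait().
 */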
870void
871cpu_exit(p)
872	register struct proc *p;
873{
874
875#if NNPX > 0
876	npxexit(p);
877#endif	/* NNPX */
878	splclock();
879	curproc = 0;
880	swtch();
881	/*
882	 * This is to shut up the compiler, and if swtch() failed I suppose
883	 * this would be a good thing.  This keeps gcc happy because panic
884	 * is a volatile void function as well.
885	 */
886	panic("cpu_exit");
887}
888
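/*
 * cpu_wait is called when the exited process is reaped; it releases the
 * kernel stack/user area pages and the dead process' vmspace.
 */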
889void
890cpu_wait(p) struct proc *p; {
891/*	extern vm_map_t upages_map; */
892	extern char kstack[];
893
894	/* drop per-process resources */
895 	pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr,
896		((vm_offset_t) p->p_addr) + ctob(UPAGES));
897	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
898	vmspace_free(p->p_vmspace);
899}
900#endif
901
902/*
903 * Set a red zone in the kernel stack after the u. area.
904 */
905void
906setredzone(pte, vaddr)
907	u_short *pte;
908	caddr_t vaddr;
909{
910/* eventually do this by setting up an expand-down stack segment
911   for ss0: selector, allowing stack access down to top of u.
912   this means though that protection violations need to be handled
913   thru a double fault exception that must do an integral task
914   switch to a known good context, within which a dump can be
915   taken. a sensible scheme might be to save the initial context
916   used by sched (that has physical memory mapped 1:1 at bottom)
917   and take the dump while still in mapped mode */
918}
919
920/*
921 * Convert kernel VA to physical address
922 */
923u_long
924kvtop(void *addr)
925{
926	vm_offset_t pa;
927
928	pa = pmap_kextract((vm_offset_t)addr);
929	if (pa == 0)
930		panic("kvtop: zero page frame");
931	return((u_long)pa);
932}
933
934extern vm_map_t phys_map;
935
936/*
937 * Map an IO request into kernel virtual address space.
938 *
939 * All requests are (re)mapped into kernel VA space.
940 * Notice that we use b_bufsize for the size of the buffer
941 * to be mapped.  b_bcount might be modified by the driver.
942 */
943void
944vmapbuf(bp)
945	register struct buf *bp;
946{
947	register int npf;
948	register caddr_t addr;
949	register long flags = bp->b_flags;
950	struct proc *p;
951	int off;
952	vm_offset_t kva;
953	register vm_offset_t pa;
954
955	if ((flags & B_PHYS) == 0)
956		panic("vmapbuf");
957	addr = bp->b_saveaddr = bp->b_un.b_addr;
958	off = (int)addr & PGOFSET;
959	p = bp->b_proc;
960	npf = btoc(round_page(bp->b_bufsize + off));
961	kva = kmem_alloc_wait(phys_map, ctob(npf));
962	bp->b_un.b_addr = (caddr_t) (kva + off);
963	while (npf--) {
964		pa = pmap_extract(&p->p_vmspace->vm_pmap, (vm_offset_t)addr);
965		if (pa == 0)
966			panic("vmapbuf: null page frame");
967		pmap_kenter(kva, trunc_page(pa));
968		addr += PAGE_SIZE;
969		kva += PAGE_SIZE;
970	}
971	pmap_update();
972}
973
974/*
975 * Free the io map PTEs associated with this IO operation.
976 * We also invalidate the TLB entries and restore the original b_addr.
977 */
978void
979vunmapbuf(bp)
980	register struct buf *bp;
981{
982	register int npf;
983	register caddr_t addr = bp->b_un.b_addr;
984	vm_offset_t kva;
985
986	if ((bp->b_flags & B_PHYS) == 0)
987		panic("vunmapbuf");
988	npf = btoc(round_page(bp->b_bufsize + ((int)addr & PGOFSET)));
989	kva = (vm_offset_t)((int)addr & ~PGOFSET);
990	kmem_free_wakeup(phys_map, kva, ctob(npf));
991	bp->b_un.b_addr = bp->b_saveaddr;
992	bp->b_saveaddr = NULL;
993}
994
995/*
996 * Force reset the processor by invalidating the entire address space!
997 */
998void
999cpu_reset() {
1000
1001	/* force a shutdown by unmapping entire address space ! */
1002	bzero((caddr_t) PTD, NBPG);
1003
1004	/* "good night, sweet prince .... <THUNK!>" */
1005	tlbflush();
1006	/* NOTREACHED */
1007	while(1);
1008}
1009
1010/*
1011 * Grow the user stack to allow for 'sp'. This version grows the stack in
1012 *	chunks of SGROWSIZ.
1013 */
1014int
1015grow(p, sp)
1016	struct proc *p;
1017	int sp;
1018{
1019	unsigned int nss;
1020	caddr_t v;
1021	struct vmspace *vm = p->p_vmspace;
1022
1023	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
1024	    return (1);
1025
1026	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);
1027
1028	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
1029		return (0);
1030
1031	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
1032	    SGROWSIZ) < nss) {
1033		int grow_amount;
1034		/*
1035		 * If necessary, grow the VM that the stack occupies
1036		 * to allow for the rlimit. This allows us to not have
1037		 * to allocate all of the VM up-front in execve (which
1038		 * is expensive).
1039		 * Grow the VM by the amount requested rounded up to
1040		 * the nearest SGROWSIZ to provide for some hysteresis.
1041		 */
1042		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
1043		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
1044		    SGROWSIZ) - grow_amount;
1045		/*
1046		 * If there isn't enough room to extend by SGROWSIZ, then
1047		 * just extend to the maximum size
1048		 */
1049		if (v < vm->vm_maxsaddr) {
1050			v = vm->vm_maxsaddr;
1051			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
1052		}
1053		if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
1054		    grow_amount, FALSE) != KERN_SUCCESS) {
1055			return (0);
1056		}
1057		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
1058	}
1059
1060	return (1);
1061}
1062