vm_machdep.c revision 1415
1/*-
2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * the Systems Programming Group of the University of Utah Computer
9 * Science Department, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
40 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
41 *	$Id: vm_machdep.c,v 1.20 1994/04/20 07:06:20 davidg Exp $
42 */
43
44#include "npx.h"
45#include "param.h"
46#include "systm.h"
47#include "proc.h"
48#include "malloc.h"
49#include "buf.h"
50#include "user.h"
51
52#include "../include/cpu.h"
53
54#include "vm/vm.h"
55#include "vm/vm_kern.h"
56
57#define b_cylin b_resid
58
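/*
 * histograms, indexed by transfer size in pages, of clustered and
 * incoming requests -- maintained by cldisksort() below.
 */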
59#define MAXCLSTATS 256
60int clstats[MAXCLSTATS];
61int rqstats[MAXCLSTATS];
62
63
64#ifndef NOBOUNCE
65
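/*
 * Bounce buffer support:  ISA DMA can only address the first 16MB of
 * physical memory, so transfers with pages above SIXTEENMEG are routed
 * through a pool of low physical pages (bouncememory).  Use of the pool
 * is tracked by the bounceallocarray bitmap.
 */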
66caddr_t		bouncememory;
67vm_offset_t	bouncepa, bouncepaend;
68int		bouncepages, bpwait;
69vm_map_t	io_map;
70int		bmwait, bmfreeing;
71
72#define BITS_IN_UNSIGNED (8*sizeof(unsigned))
73int		bounceallocarraysize;
74unsigned	*bounceallocarray;
75int		bouncefree;
76
77#define SIXTEENMEG (4096*4096)
78#define MAXBKVA 1024
79
80/* special list that can be used at interrupt time for eventual kva free */
81struct kvasfree {
82	vm_offset_t addr;
83	vm_offset_t size;
84} kvaf[MAXBKVA];
85
86int		kvasfreecnt;
87
88vm_offset_t vm_bounce_kva();
89/*
90 * get bounce buffer pages (count physically contiguous)
91 * (only 1 implemented now)
92 */
93vm_offset_t
94vm_bounce_page_find(count)
95	int count;
96{
97	int bit;
98	int s,i;
99
100	if (count != 1)
101		panic("vm_bounce_page_find -- no support for > 1 page yet!!!");
102
103	s = splbio();
104retry:
105	for (i = 0; i < bounceallocarraysize; i++) {
106		if (bounceallocarray[i] != 0xffffffff) {
107			if ((bit = ffs(~bounceallocarray[i])) != 0) {
108				bounceallocarray[i] |= 1 << (bit - 1) ;
109				bouncefree -= count;
110				splx(s);
111				return bouncepa + (i * BITS_IN_UNSIGNED + (bit - 1)) * NBPG;
112			}
113		}
114	}
115	bpwait = 1;
116	tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0);
117	goto retry;
118}
119
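/*
 * queue a kva range for deferred freeing; if "now" is set, process the
 * free list immediately via vm_bounce_kva(0, 0) (which also does the
 * wakeups), otherwise just wake up anyone sleeping on the bounce map.
 */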
120void
121vm_bounce_kva_free(addr, size, now)
122	vm_offset_t addr;
123	vm_offset_t size;
124	int now;
125{
126	int s = splbio();
127	kvaf[kvasfreecnt].addr = addr;
128	kvaf[kvasfreecnt++].size = size;
129	if( now) {
130		/*
131		 * this will do wakeups
132		 */
133		vm_bounce_kva(0,0);
134	} else {
135		if (bmwait) {
136		/*
137		 * if anyone is waiting on the bounce-map, then wakeup
138		 */
139			wakeup((caddr_t) io_map);
140			bmwait = 0;
141		}
142	}
143	splx(s);
144}
145
146/*
147 * free count bounce buffer pages
148 */
149void
150vm_bounce_page_free(pa, count)
151	vm_offset_t pa;
152	int count;
153{
154	int allocindex;
155	int index;
156	int bit;
157
158	if (count != 1)
159		panic("vm_bounce_page_free -- no support for > 1 page yet!!!\n");
160
161	index = (pa - bouncepa) / NBPG;
162
163	if ((index < 0) || (index >= bouncepages))
164		panic("vm_bounce_page_free -- bad index\n");
165
166	allocindex = index / BITS_IN_UNSIGNED;
167	bit = index % BITS_IN_UNSIGNED;
168
169	bounceallocarray[allocindex] &= ~(1 << bit);
170
171	bouncefree += count;
172	if (bpwait) {
173		bpwait = 0;
174		wakeup((caddr_t) &bounceallocarray);
175	}
176}
177
178/*
179 * allocate count bounce buffer kva pages
180 */
181vm_offset_t
182vm_bounce_kva(count, waitok)
183	int count;
184	int waitok;
185{
186	int tofree;
187	int i;
188	int startfree;
189	vm_offset_t kva = 0;
190	int s = splbio();
191	int size = count;
192	startfree = 0;
193more:
194	if (!bmfreeing && (tofree = kvasfreecnt)) {
195		bmfreeing = 1;
196		for (i = startfree; i < kvasfreecnt; i++) {
197			/*
198			 * if we have a kva of the right size, no sense
199			 * in freeing/reallocating...
200			 * might affect fragmentation short term, but
201			 * as long as the amount of io_map is
202			 * significantly more than the maximum transfer
203			 * size, I don't think that it is a problem.
204			 */
205			pmap_remove(kernel_pmap,
206				kvaf[i].addr, kvaf[i].addr + kvaf[i].size);
207			if( size && !kva && kvaf[i].size == size) {
208				kva = kvaf[i].addr;
209			} else {
210				kmem_free_wakeup(io_map, kvaf[i].addr,
211					kvaf[i].size);
212			}
213		}
214		if (kvasfreecnt != tofree) {
215			startfree = i;
216			bmfreeing = 0;
217			goto more;
218		}
219		kvasfreecnt = 0;
220		bmfreeing = 0;
221	}
222
223	if( size == 0) {
224		splx(s);
225		return NULL;
226	}
227
228	if (!kva && !(kva = kmem_alloc_pageable(io_map, size))) {
229		if( !waitok) {
230			splx(s);
231			return NULL;
232		}
233		bmwait = 1;
234		tsleep((caddr_t) io_map, PRIBIO, "bmwait", 0);
235		goto more;
236	}
237	splx(s);
238
239	return kva;
240}
241
242/*
243 * same as vm_bounce_kva -- but really allocate
244 */
245vm_offset_t
246vm_bounce_kva_alloc(count)
247	int count;
248{
249	int i;
250	vm_offset_t kva;
251	vm_offset_t pa;
252	if( bouncepages == 0) {
253		kva = (vm_offset_t) malloc(count*NBPG, M_TEMP, M_WAITOK);
254		return kva;
255	}
256	kva = vm_bounce_kva(count, 1);
257	for(i=0;i<count;i++) {
258		pa = vm_bounce_page_find(1);
259		pmap_kenter(kva + i * NBPG, pa);
260	}
261	return kva;
262}
263
264/*
265 * same as vm_bounce_kva_free -- but really free
266 */
267void
268vm_bounce_kva_alloc_free(kva, count)
269	vm_offset_t kva;
270	int count;
271{
272	int i;
273	vm_offset_t pa;
274	if( bouncepages == 0) {
275		free((caddr_t) kva, M_TEMP);
276		return;
277	}
278	for(i = 0; i < count; i++) {
279		pa = pmap_kextract(kva + i * NBPG);
280		vm_bounce_page_free(pa, 1);
281	}
282	vm_bounce_kva_free(kva, count);
283}
284
285/*
286 * do the things necessary to the struct buf to implement
287 * bounce buffers...  inserted before the disk sort
288 */
289void
290vm_bounce_alloc(bp)
291	struct buf *bp;
292{
293	int countvmpg;
294	vm_offset_t vastart, vaend;
295	vm_offset_t vapstart, vapend;
296	vm_offset_t va, kva;
297	vm_offset_t pa;
298	int dobounceflag = 0;
299	int bounceindex;
300	int i;
301	int s;
302
303	if (bouncepages == 0)
304		return;
305
306	if (bp->b_bufsize < bp->b_bcount) {
307		printf("vm_bounce_alloc: b_bufsize(%d) < b_bcount(%d) !!!!\n",
308			bp->b_bufsize, bp->b_bcount);
309		bp->b_bufsize = bp->b_bcount;
310	}
311
312	vastart = (vm_offset_t) bp->b_un.b_addr;
313	vaend = (vm_offset_t) bp->b_un.b_addr + bp->b_bufsize;
314
315	vapstart = i386_trunc_page(vastart);
316	vapend = i386_round_page(vaend);
317	countvmpg = (vapend - vapstart) / NBPG;
318
319/*
320 * if any page is above 16MB, then go into bounce-buffer mode
321 */
322	va = vapstart;
323	for (i = 0; i < countvmpg; i++) {
324		pa = pmap_kextract(va);
325		if (pa >= SIXTEENMEG)
326			++dobounceflag;
327		va += NBPG;
328	}
329	if (dobounceflag == 0)
330		return;
331
332	if (bouncepages < dobounceflag)
333		panic("Not enough bounce buffers!!!");
334
335/*
336 * allocate a replacement kva for b_addr
337 */
338	kva = vm_bounce_kva(countvmpg*NBPG, 1);
339	va = vapstart;
340	for (i = 0; i < countvmpg; i++) {
341		pa = pmap_kextract(va);
342		if (pa >= SIXTEENMEG) {
343			/*
344			 * allocate a replacement page
345			 */
346			vm_offset_t bpa = vm_bounce_page_find(1);
347			pmap_kenter(kva + (NBPG * i), bpa);
348			/*
349			 * if we are writing, then copy the data into the page
350			 */
351			if ((bp->b_flags & B_READ) == 0) {
352				pmap_update();
353				bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG);
354			}
355		} else {
356			/*
357			 * use original page
358			 */
359			pmap_kenter(kva + (NBPG * i), pa);
360		}
361		va += NBPG;
362	}
363	pmap_update();
364
365/*
366 * flag the buffer as being bounced
367 */
368	bp->b_flags |= B_BOUNCE;
369/*
370 * save the original buffer kva
371 */
372	bp->b_savekva = bp->b_un.b_addr;
373/*
374 * put our new kva into the buffer (offset by original offset)
375 */
376	bp->b_un.b_addr = (caddr_t) (((vm_offset_t) kva) |
377				((vm_offset_t) bp->b_savekva & (NBPG - 1)));
378	return;
379}
380
381/*
382 * hook into biodone to free bounce buffer
383 */
384void
385vm_bounce_free(bp)
386	struct buf *bp;
387{
388	int i;
389	vm_offset_t origkva, bouncekva;
390	vm_offset_t vastart, vaend;
391	vm_offset_t vapstart, vapend;
392	int countbounce = 0;
393	vm_offset_t firstbouncepa = 0;
394	int firstbounceindex;
395	int countvmpg;
396	vm_offset_t bcount;
397	int s;
398
399/*
400 * if this isn't a bounced buffer, then just return
401 */
402	if ((bp->b_flags & B_BOUNCE) == 0)
403		return;
404
405	origkva = (vm_offset_t) bp->b_savekva;
406	bouncekva = (vm_offset_t) bp->b_un.b_addr;
407
408	vastart = bouncekva;
409	vaend = bouncekva + bp->b_bufsize;
410	bcount = bp->b_bufsize;
411
412	vapstart = i386_trunc_page(vastart);
413	vapend = i386_round_page(vaend);
414
415	countvmpg = (vapend - vapstart) / NBPG;
416
417/*
418 * check every page in the kva space for b_addr
419 */
420	for (i = 0; i < countvmpg; i++) {
421		vm_offset_t mybouncepa;
422		vm_offset_t copycount;
423
424		copycount = i386_round_page(bouncekva + 1) - bouncekva;
425		mybouncepa = pmap_kextract(i386_trunc_page(bouncekva));
426
427/*
428 * if this is a bounced pa, then process as one
429 */
430		if ((mybouncepa >= bouncepa) && (mybouncepa < bouncepaend)) {
431			if (copycount > bcount)
432				copycount = bcount;
433/*
434 * if this is a read, then copy from bounce buffer into original buffer
435 */
436			if (bp->b_flags & B_READ)
437				bcopy((caddr_t) bouncekva, (caddr_t) origkva, copycount);
438/*
439 * free the bounce allocation
440 */
441			vm_bounce_page_free(i386_trunc_page(mybouncepa), 1);
442		}
443
444		origkva += copycount;
445		bouncekva += copycount;
446		bcount -= copycount;
447	}
448
449/*
450 * add the old kva into the "to free" list
451 */
452	bouncekva = i386_trunc_page((vm_offset_t) bp->b_un.b_addr);
453	vm_bounce_kva_free( bouncekva, countvmpg*NBPG, 0);
454	bp->b_un.b_addr = bp->b_savekva;
455	bp->b_savekva = 0;
456	bp->b_flags &= ~B_BOUNCE;
457
458	return;
459}
460
461#endif /* !NOBOUNCE */
462
463/*
464 * init the bounce buffer system
465 */
466void
467vm_bounce_init()
468{
469	vm_offset_t minaddr, maxaddr;
470
471	io_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, MAXBKVA * NBPG, FALSE);
472	kvasfreecnt = 0;
473
474#ifndef NOBOUNCE
475	if (bouncepages == 0)
476		return;
477
478	bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED;
479	bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT);
480
481	if (!bounceallocarray)
482		panic("Cannot allocate bounce resource array\n");
483
484	bzero(bounceallocarray, bounceallocarraysize * sizeof(unsigned));
485
486
487	bouncepa = pmap_kextract((vm_offset_t) bouncememory);
488	bouncepaend = bouncepa + bouncepages * NBPG;
489	bouncefree = bouncepages;
490#endif
491
492}
493
494
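/*
 * map the physical pages backing two page-aligned transfers into a
 * single contiguous kva range -- orig1's pages first, then orig2's.
 */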
495static void
496cldiskvamerge( kvanew, orig1, orig1cnt, orig2, orig2cnt)
497	vm_offset_t kvanew;
498	vm_offset_t orig1, orig1cnt;
499	vm_offset_t orig2, orig2cnt;
500{
501	int i;
503/*
504 * enter the transfer physical addresses into the new kva
505 */
506	for(i=0;i<orig1cnt;i++) {
507		vm_offset_t pa;
508		pa = pmap_kextract(orig1 + i * PAGE_SIZE);
509		pmap_kenter(kvanew + i * PAGE_SIZE, pa);
510	}
511
512	for(i=0;i<orig2cnt;i++) {
513		vm_offset_t pa;
514		pa = pmap_kextract(orig2 + i * PAGE_SIZE);
515		pmap_kenter(kvanew + (i + orig1cnt) * PAGE_SIZE, pa);
516	}
517	pmap_update();
518}
519
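/*
 * one-way elevator sort (ala disksort) that also tries to cluster
 * adjacent page-aligned write requests into a single B_CLUSTER buffer
 * remapped through a fresh kva, up to maxio bytes per cluster.
 */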
520void
521cldisksort(struct buf *dp, struct buf *bp, vm_offset_t maxio)
522{
523	register struct buf *ap, *newbp;
524	int i, trycount=0;
525	vm_offset_t orig1pages, orig2pages;
526	vm_offset_t orig1begin, orig2begin;
527	vm_offset_t kvanew, kvaorig;
528
529	if( bp->b_bcount < MAXCLSTATS*PAGE_SIZE)
530		++rqstats[bp->b_bcount/PAGE_SIZE];
531	/*
532	 * If nothing on the activity queue, then
533	 * we become the only thing.
534	 */
535	ap = dp->b_actf;
536	if(ap == NULL) {
537		dp->b_actf = bp;
538		dp->b_actl = bp;
539		bp->av_forw = NULL;
540		return;
541	}
542
543	/*
544	 * If we lie before the first (currently active) request, we
545	 * cannot be serviced in this pass, so we must locate the second
546	 * request list and add ourselves to it.
547	 */
548
549	if (bp->b_pblkno < ap->b_pblkno) {
550		while (ap->av_forw) {
551			/*
552			 * Check for an ``inversion'' in the
553			 * normally ascending block numbers,
554			 * indicating the start of the second request list.
555			 */
556			if (ap->av_forw->b_pblkno < ap->b_pblkno) {
557				/*
558				 * Search the second request list
559				 * for the first request at a larger
560				 * block number.  We go before that;
561				 * if there is no such request, we go at end.
562				 */
563				do {
564					if (bp->b_pblkno < ap->av_forw->b_pblkno)
565						goto insert;
566					ap = ap->av_forw;
567				} while (ap->av_forw);
568				goto insert;		/* after last */
569			}
570			ap = ap->av_forw;
571		}
572		/*
573		 * No inversions... we will go after the last, and
574		 * be the first request in the second request list.
575		 */
576		goto insert;
577	}
578	/*
579	 * Request is at/after the current request...
580	 * sort in the first request list.
581	 */
582	while (ap->av_forw) {
583		/*
584		 * We want to go after the current request
585		 * if there is an inversion after it (i.e. it is
586		 * the end of the first request list), or if
587		 * the next request is a larger block than our request.
588		 */
589		if (ap->av_forw->b_pblkno < ap->b_pblkno ||
590		    bp->b_pblkno < ap->av_forw->b_pblkno )
591			goto insert;
592		ap = ap->av_forw;
593	}
594
595insert:
596
597	/*
598	 * read clustering with new read-ahead disk drives mostly hurts, so
599	 * we don't bother...
600	 */
601	if( bp->b_flags & B_READ)
602		goto nocluster;
603	/*
604	 * we currently only cluster I/O transfers that are at page-aligned
605	 * kvas and transfers that are multiples of page lengths.
606	 */
607	if ((bp->b_flags & B_BAD) == 0 &&
608		((bp->b_bcount & PAGE_MASK) == 0) &&
609		(((vm_offset_t) bp->b_un.b_addr & PAGE_MASK) == 0)) {
610		if( maxio > MAXCLSTATS*PAGE_SIZE)
611			maxio = MAXCLSTATS*PAGE_SIZE;
612		/*
613		 * merge with previous?
614		 *	1) We reside physically immediately after the previous block.
615		 *	2) The previous block is not first on the device queue because
616		 *	   such a block might be active.
617		 *	3) The mode of the two I/Os is identical.
618		 *	4) The previous kva is page aligned and the previous transfer
619		 *	   is a multiple of a page in length.
620		 *	5) And the total I/O size would be below the maximum.
621		 *	5) And the total I/O size would be below the maximum.
622		 */
623		if( (ap->b_pblkno + (ap->b_bcount / DEV_BSIZE) == bp->b_pblkno) &&
624			(dp->b_actf != ap) &&
625			((ap->b_flags & ~B_CLUSTER) == bp->b_flags) &&
626			((ap->b_flags & B_BAD) == 0) &&
627			((ap->b_bcount & PAGE_MASK) == 0) &&
628			(((vm_offset_t) ap->b_un.b_addr & PAGE_MASK) == 0) &&
629			(ap->b_bcount + bp->b_bcount < maxio)) {
630
631			orig1begin = (vm_offset_t) ap->b_un.b_addr;
632			orig1pages = ap->b_bcount / PAGE_SIZE;
633
634			orig2begin = (vm_offset_t) bp->b_un.b_addr;
635			orig2pages = bp->b_bcount / PAGE_SIZE;
636			/*
637			 * see if we can allocate a kva; if we cannot, then don't
638			 * cluster.
639			 */
640			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
641			if( !kvanew) {
642				goto nocluster;
643			}
644
645
646			if( (ap->b_flags & B_CLUSTER) == 0) {
647
648				/*
649				 * get a physical buf pointer
650				 */
651				newbp = (struct buf *)trypbuf();
652				if( !newbp) {
653					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
654					goto nocluster;
655				}
656
657				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
658
659				/*
660				 * build the new bp to be handed off to the device
661				 */
662
663				--clstats[ap->b_bcount/PAGE_SIZE];
664				*newbp = *ap;
665				newbp->b_flags |= B_CLUSTER;
666				newbp->b_un.b_addr = (caddr_t) kvanew;
667				newbp->b_bcount += bp->b_bcount;
668				newbp->b_bufsize = newbp->b_bcount;
669				newbp->b_clusterf = ap;
670				newbp->b_clusterl = bp;
671				++clstats[newbp->b_bcount/PAGE_SIZE];
672
673				/*
674				 * enter the new bp onto the device queue
675				 */
676				if( ap->av_forw)
677					ap->av_forw->av_back = newbp;
678				else
679					dp->b_actl = newbp;
680
681				if( dp->b_actf != ap )
682					ap->av_back->av_forw = newbp;
683				else
684					dp->b_actf = newbp;
685
686				/*
687				 * enter the previous bps onto the cluster queue
688				 */
689				ap->av_forw = bp;
690				bp->av_back = ap;
691
692				ap->av_back = NULL;
693				bp->av_forw = NULL;
694
695			} else {
697
698				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
699				/*
700				 * free the old kva
701				 */
702				vm_bounce_kva_free( orig1begin, ap->b_bufsize, 0);
703				--clstats[ap->b_bcount/PAGE_SIZE];
704
705				ap->b_un.b_addr = (caddr_t) kvanew;
706
707				ap->b_clusterl->av_forw = bp;
708				bp->av_forw = NULL;
709				bp->av_back = ap->b_clusterl;
710				ap->b_clusterl = bp;
711
712				ap->b_bcount += bp->b_bcount;
713				ap->b_bufsize = ap->b_bcount;
714				++clstats[ap->b_bcount/PAGE_SIZE];
715			}
716			return;
717		/*
718		 * merge with next?
719		 * conditions:
720		 *	1) We reside physically immediately before the next block.
721		 *	2) The mode of the two I/Os is identical.
722		 *	3) The next kva is page aligned and the next transfer
723		 *	   is a multiple of a page in length.
724		 *	4) And the total I/O size would be below the maximum.
725		 */
726		} else if( ap->av_forw &&
727			(bp->b_pblkno + (bp->b_bcount / DEV_BSIZE) == ap->av_forw->b_pblkno) &&
728			(bp->b_flags == (ap->av_forw->b_flags & ~B_CLUSTER)) &&
729			((ap->av_forw->b_flags & B_BAD) == 0) &&
730			((ap->av_forw->b_bcount & PAGE_MASK) == 0) &&
731			(((vm_offset_t) ap->av_forw->b_un.b_addr & PAGE_MASK) == 0) &&
732			(ap->av_forw->b_bcount + bp->b_bcount < maxio)) {
733
734			orig1begin = (vm_offset_t) bp->b_un.b_addr;
735			orig1pages = bp->b_bcount / PAGE_SIZE;
736
737			orig2begin = (vm_offset_t) ap->av_forw->b_un.b_addr;
738			orig2pages = ap->av_forw->b_bcount / PAGE_SIZE;
739
740			/*
741			 * see if we can allocate a kva; if we cannot, then don't
742			 * cluster.
743			 */
744			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
745			if( !kvanew) {
746				goto nocluster;
747			}
748
749			/*
750			 * if next isn't a cluster we need to create one
751			 */
752			if( (ap->av_forw->b_flags & B_CLUSTER) == 0) {
753
754				/*
755				 * get a physical buf pointer
756				 */
757				newbp = (struct buf *)trypbuf();
758				if( !newbp) {
759					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
760					goto nocluster;
761				}
762
763				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
764				ap = ap->av_forw;
765				--clstats[ap->b_bcount/PAGE_SIZE];
766				*newbp = *ap;
767				newbp->b_flags |= B_CLUSTER;
768				newbp->b_un.b_addr = (caddr_t) kvanew;
769				newbp->b_blkno = bp->b_blkno;
770				newbp->b_pblkno = bp->b_pblkno;
771				newbp->b_bcount += bp->b_bcount;
772				newbp->b_bufsize = newbp->b_bcount;
773				newbp->b_clusterf = bp;
774				newbp->b_clusterl = ap;
775				++clstats[newbp->b_bcount/PAGE_SIZE];
776
777				if( ap->av_forw)
778					ap->av_forw->av_back = newbp;
779				else
780					dp->b_actl = newbp;
781
782				if( dp->b_actf != ap )
783					ap->av_back->av_forw = newbp;
784				else
785					dp->b_actf = newbp;
786
787				bp->av_forw = ap;
788				ap->av_back = bp;
789
790				bp->av_back = NULL;
791				ap->av_forw = NULL;
792			} else {
794
795				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
796				ap = ap->av_forw;
797				vm_bounce_kva_free( orig2begin, ap->b_bufsize, 0);
798
799				ap->b_un.b_addr = (caddr_t) kvanew;
800				bp->av_forw = ap->b_clusterf;
801				ap->b_clusterf->av_back = bp;
802				ap->b_clusterf = bp;
803				bp->av_back = NULL;
804				--clstats[ap->b_bcount/PAGE_SIZE];
805
806				ap->b_blkno = bp->b_blkno;
807				ap->b_pblkno = bp->b_pblkno;
808				ap->b_bcount += bp->b_bcount;
809				ap->b_bufsize = ap->b_bcount;
810				++clstats[ap->b_bcount/PAGE_SIZE];
811
812			}
813			return;
814		}
815	}
816	/*
817	 * don't merge
818	 */
819nocluster:
820	++clstats[bp->b_bcount/PAGE_SIZE];
821	bp->av_forw = ap->av_forw;
822	if( bp->av_forw)
823		bp->av_forw->av_back = bp;
824	else
825		dp->b_actl = bp;
826
827	ap->av_forw = bp;
828	bp->av_back = ap;
829}
830
831/*
832 * quick version of vm_fault -- fault a page in by touching it.  On the
 * 386 the write case does a real vm_fault, since kernel-mode writes are
 * not write-protect checked there (no CR0_WP).
833 */
834
835void
836vm_fault_quick( v, prot)
837	vm_offset_t v;
838	int prot;
839{
840	if( (cpu_class == CPUCLASS_386) &&
841		(prot & VM_PROT_WRITE))
842		vm_fault(&curproc->p_vmspace->vm_map, v,
843			VM_PROT_READ|VM_PROT_WRITE, FALSE);
844	else if( prot & VM_PROT_WRITE)
845		*(volatile char *)v += 0;
846	else
847		*(volatile char *)v;
848}
849
850
851/*
852 * Finish a fork operation, with process p2 nearly set up.
853 * Copy and update the kernel stack and pcb, making the child
854 * ready to run, and marking it so that it can return differently
855 * than the parent.  Returns 1 in the child process, 0 in the parent.
856 * We currently double-map the user area so that the stack is at the same
857 * address in each process; in the future we will probably relocate
858 * the frame pointers on the stack after copying.
859 */
860int
861cpu_fork(p1, p2)
862	register struct proc *p1, *p2;
863{
864	register struct user *up = p2->p_addr;
865	int foo, offset, addr, i;
866	extern char kstack[];
867	extern int mvesp();
868
869	/*
870	 * Copy pcb and stack from proc p1 to p2.
871	 * We do this as cheaply as possible, copying only the active
872	 * part of the stack.  The stack and pcb need to agree;
873	 * this is tricky, as the final pcb is constructed by savectx,
874	 * but its frame isn't yet on the stack when the stack is copied.
875	 * swtch compensates for this when the child eventually runs.
876	 * This should be done differently, with a single call
877	 * that copies and updates the pcb+stack,
878	 * replacing the bcopy and savectx.
879	 */
880	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
881	offset = mvesp() - (int)kstack;
882	bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset,
883	    (unsigned) ctob(UPAGES) - offset);
884	p2->p_regs = p1->p_regs;
885
886	/*
887	 * Wire top of address space of child to its kstack.
888	 * First, fault in a page of pte's to map it.
889	 */
890#if 0
891        addr = trunc_page((u_int)vtopte(kstack));
892	vm_map_pageable(&p2->p_vmspace->vm_map, addr, addr+NBPG, FALSE);
893	for (i=0; i < UPAGES; i++)
894		pmap_enter(&p2->p_vmspace->vm_pmap, kstack+i*NBPG,
895			   pmap_extract(kernel_pmap, ((int)p2->p_addr)+i*NBPG),
896			   /*
897			    * The user area has to be mapped writable because
898			    * it contains the kernel stack (when CR0_WP is on
899			    * on a 486 there is no user-read/kernel-write
900			    * mode).  It is protected from user mode access
901			    * by the segment limits.
902			    */
903			   VM_PROT_READ|VM_PROT_WRITE, TRUE);
904#endif
905	pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb);
906
907	/*
909	 * Arrange for a non-local goto when the new process
910	 * is started, to resume here, returning nonzero from setjmp.
911	 */
912	if (savectx(up, 1)) {
913		/*
914		 * Return 1 in child.
915		 */
916		return (1);
917	}
918	return (0);
919}
920
921#ifdef notyet
922/*
923 * cpu_exit is called as the last action during exit.
924 *
925 * We change to an inactive address space and a "safe" stack,
926 * passing thru an argument to the new stack. Now, safely isolated
927 * from the resources we're shedding, we release the address space
928 * and any remaining machine-dependent resources, including the
929 * memory for the user structure and kernel stack.
930 *
931 * Next, we assign a dummy context to be written over by swtch,
932 * calling it to send this process off to oblivion.
933 * [The nullpcb allows us to minimize cost in swtch() by not having
934 * a special case].
935 */
936struct proc *swtch_to_inactive();
937volatile void
938cpu_exit(p)
939	register struct proc *p;
940{
941	static struct pcb nullpcb;	/* pcb to overwrite on last swtch */
942
943#if NNPX > 0
944	npxexit(p);
945#endif	/* NNPX */
946
947	/* move to inactive space and stack, passing arg across */
948	p = swtch_to_inactive(p);
949
950	/* drop per-process resources */
951	vmspace_free(p->p_vmspace);
952	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
953
954	p->p_addr = (struct user *) &nullpcb;
955	splclock();
956	swtch();
957	/* NOTREACHED */
958}
959#else
960void
961cpu_exit(p)
962	register struct proc *p;
963{
964
965#if NNPX > 0
966	npxexit(p);
967#endif	/* NNPX */
968	splclock();
969	curproc = 0;
970	swtch();
971	/*
972	 * This is to shut up the compiler, and if swtch() failed I suppose
973	 * this would be a good thing.  This keeps gcc happy because panic
974	 * is a volatile void function as well.
975	 */
976	panic("cpu_exit");
977}
978
979void
980cpu_wait(p)
	struct proc *p;
{
981/*	extern vm_map_t upages_map; */
982	extern char kstack[];
983
984	/* drop per-process resources */
985 	pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr,
986		((vm_offset_t) p->p_addr) + ctob(UPAGES));
987	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
988	vmspace_free(p->p_vmspace);
989}
990#endif
991
992/*
993 * Set a red zone in the kernel stack after the u. area.
994 */
995void
996setredzone(pte, vaddr)
997	u_short *pte;
998	caddr_t vaddr;
999{
1000/* eventually do this by setting up an expand-down stack segment
1001   for ss0: selector, allowing stack access down to top of u.
1002   this means though that protection violations need to be handled
1003   thru a double fault exception that must do an integral task
1004   switch to a known good context, within which a dump can be
1005   taken. a sensible scheme might be to save the initial context
1006   used by sched (that has physical memory mapped 1:1 at bottom)
1007   and take the dump while still in mapped mode */
1008}
1009
1010/*
1011 * Convert kernel VA to physical address
1012 */
1013u_long
1014kvtop(void *addr)
1015{
1016	vm_offset_t va;
1017
1018	va = pmap_kextract((vm_offset_t)addr);
1019	if (va == 0)
1020		panic("kvtop: zero page frame");
1021	return((u_long)va);
1022}
1023
1024extern vm_map_t phys_map;
1025
1026/*
1027 * Map an IO request into kernel virtual address space.
1028 *
1029 * All requests are (re)mapped into kernel VA space.
1030 * Notice that we use b_bufsize for the size of the buffer
1031 * to be mapped.  b_bcount might be modified by the driver.
1032 */
1033void
1034vmapbuf(bp)
1035	register struct buf *bp;
1036{
1037	register int npf;
1038	register caddr_t addr;
1039	register long flags = bp->b_flags;
1040	struct proc *p;
1041	int off;
1042	vm_offset_t kva;
1043	register vm_offset_t pa;
1044
1045	if ((flags & B_PHYS) == 0)
1046		panic("vmapbuf");
1047	addr = bp->b_saveaddr = bp->b_un.b_addr;
1048	off = (int)addr & PGOFSET;
1049	p = bp->b_proc;
1050	npf = btoc(round_page(bp->b_bufsize + off));
1051	kva = kmem_alloc_wait(phys_map, ctob(npf));
1052	bp->b_un.b_addr = (caddr_t) (kva + off);
1053	while (npf--) {
1054		pa = pmap_extract(&p->p_vmspace->vm_pmap, (vm_offset_t)addr);
1055		if (pa == 0)
1056			panic("vmapbuf: null page frame");
1057		pmap_kenter(kva, trunc_page(pa));
1058		addr += PAGE_SIZE;
1059		kva += PAGE_SIZE;
1060	}
1061	pmap_update();
1062}
1063
1064/*
1065 * Free the io map PTEs associated with this IO operation.
1066 * We also invalidate the TLB entries and restore the original b_addr.
1067 */
1068void
1069vunmapbuf(bp)
1070	register struct buf *bp;
1071{
1072	register int npf;
1073	register caddr_t addr = bp->b_un.b_addr;
1074	vm_offset_t kva;
1075
1076	if ((bp->b_flags & B_PHYS) == 0)
1077		panic("vunmapbuf");
1078	npf = btoc(round_page(bp->b_bufsize + ((int)addr & PGOFSET)));
1079	kva = (vm_offset_t)((int)addr & ~PGOFSET);
1080	kmem_free_wakeup(phys_map, kva, ctob(npf));
1081	bp->b_un.b_addr = bp->b_saveaddr;
1082	bp->b_saveaddr = NULL;
1083}
1084
1085/*
1086 * Force reset the processor by invalidating the entire address space!
1087 */
1088void
1089cpu_reset()
{
1090
1091	/* force a shutdown by unmapping entire address space ! */
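	/*
	 * with no valid mappings left after the flush below, the next
	 * instruction fetch faults; with no reachable handlers the
	 * processor triple-faults and resets.
	 */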
1092	bzero((caddr_t) PTD, NBPG);
1093
1094	/* "good night, sweet prince .... <THUNK!>" */
1095	tlbflush();
1096	/* NOTREACHED */
1097	while(1);
1098}
1099
1100/*
1101 * Grow the user stack to allow for 'sp'. This version grows the stack in
1102 *	chunks of SGROWSIZ.
1103 */
1104int
1105grow(p, sp)
1106	struct proc *p;
1107	int sp;
1108{
1109	unsigned int nss;
1110	caddr_t v;
1111	struct vmspace *vm = p->p_vmspace;
1112
1113	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
1114	    return (1);
1115
1116	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);
1117
1118	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
1119		return (0);
1120
1121	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
1122	    SGROWSIZ) < nss) {
1123		int grow_amount;
1124		/*
1125		 * If necessary, grow the VM that the stack occupies
1126		 * to allow for the rlimit. This allows us to not have
1127		 * to allocate all of the VM up-front in execve (which
1128		 * is expensive).
1129		 * Grow the VM by the amount requested rounded up to
1130		 * the nearest SGROWSIZ to provide for some hysteresis.
1131		 */
1132		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
1133		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
1134		    SGROWSIZ) - grow_amount;
1135		/*
1136		 * If there isn't enough room to extend by SGROWSIZ, then
1137		 * just extend to the maximum size
1138		 */
1139		if (v < vm->vm_maxsaddr) {
1140			v = vm->vm_maxsaddr;
1141			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
1142		}
1143		if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
1144		    grow_amount, FALSE) != KERN_SUCCESS) {
1145			return (0);
1146		}
1147		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
1148	}
1149
1150	return (1);
1151}
1152