vm_machdep.c revision 1379
1/*-
2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * the Systems Programming Group of the University of Utah Computer
9 * Science Department, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
40 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
41 *	$Id: vm_machdep.c,v 1.19 1994/04/14 07:49:40 davidg Exp $
42 */
43
44#include "npx.h"
45#include "param.h"
46#include "systm.h"
47#include "proc.h"
48#include "malloc.h"
49#include "buf.h"
50#include "user.h"
51
52#include "../include/cpu.h"
53
54#include "vm/vm.h"
55#include "vm/vm_kern.h"
56
57#define b_cylin b_resid
58
59#define MAXCLSTATS 256
60int clstats[MAXCLSTATS];
61int rqstats[MAXCLSTATS];
62
63
64#ifndef NOBOUNCE
65
66caddr_t		bouncememory;
67vm_offset_t	bouncepa, bouncepaend;
68int		bouncepages, bpwait;
69vm_map_t	io_map;
70int		bmwait, bmfreeing;
71
72#define BITS_IN_UNSIGNED (8*sizeof(unsigned))
73int		bounceallocarraysize;
74unsigned	*bounceallocarray;
75int		bouncefree;
76
77#define SIXTEENMEG (4096*4096)
78#define MAXBKVA 1024
79
80/* special list that can be used at interrupt time for eventual kva free */
81struct kvasfree {
82	vm_offset_t addr;
83	vm_offset_t size;
84} kvaf[MAXBKVA];
85
86int		kvasfreecnt;
87
88vm_offset_t vm_bounce_kva();
89/*
90 * get bounce buffer pages (count physically contiguous)
91 * (only 1 implemented now)
92 */
93vm_offset_t
94vm_bounce_page_find(count)
95	int count;
96{
97	int bit;
98	int s,i;
99
100	if (count != 1)
101		panic("vm_bounce_page_find -- no support for > 1 page yet!!!");
102
103	s = splbio();
104retry:
105	for (i = 0; i < bounceallocarraysize; i++) {
106		if (bounceallocarray[i] != 0xffffffff) {
107			if (bit = ffs(~bounceallocarray[i])) {
108				bounceallocarray[i] |= 1 << (bit - 1) ;
109				bouncefree -= count;
110				splx(s);
111				return bouncepa + (i * BITS_IN_UNSIGNED + (bit - 1)) * NBPG;
112			}
113		}
114	}
115	bpwait = 1;
116	tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0);
117	goto retry;
118}
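/*
 * The allocator above is a simple bitmap: bit n of bounceallocarray[]
 * stands for the physical page at bouncepa + n * NBPG.  As a worked
 * example (assuming a 32-bit unsigned and NBPG == 4096 on the i386),
 * finding bit 3 clear in word 2 returns
 *
 *	bouncepa + (2 * 32 + 3) * 4096 == bouncepa + 0x43000
 *
 * which is the expression computed above.  The sketch below (not
 * compiled) restates the index <-> physical address math that
 * vm_bounce_page_find() and vm_bounce_page_free() share.
 */
#if 0
static vm_offset_t
bounce_index_to_pa(int index)
{
	/* page "index" of the bounce pool */
	return (bouncepa + index * NBPG);
}

static int
bounce_pa_to_index(vm_offset_t pa)
{
	/* inverse mapping used by vm_bounce_page_free() */
	return ((pa - bouncepa) / NBPG);
}
#endif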
119
120void
121vm_bounce_kva_free(addr, size, now)
122	vm_offset_t addr;
123	vm_offset_t size;
124	int now;
125{
126	int s = splbio();
127	kvaf[kvasfreecnt].addr = addr;
128	kvaf[kvasfreecnt++].size = size;
129	if( now) {
130		/*
131		 * this will do wakeups
132		 */
133		vm_bounce_kva(0,0);
134	} else {
135		if (bmwait) {
136		/*
137		 * if anyone is waiting on the bounce-map, then wakeup
138		 */
139			wakeup((caddr_t) io_map);
140			bmwait = 0;
141		}
142	}
143	splx(s);
144}
145
146/*
147 * free count bounce buffer pages
148 */
149void
150vm_bounce_page_free(pa, count)
151	vm_offset_t pa;
152	int count;
153{
154	int allocindex;
155	int index;
156	int bit;
157
158	if (count != 1)
159		panic("vm_bounce_page_free -- no support for > 1 page yet!!!\n");
160
161	index = (pa - bouncepa) / NBPG;
162
163	if ((index < 0) || (index >= bouncepages))
164		panic("vm_bounce_page_free -- bad index\n");
165
166	allocindex = index / BITS_IN_UNSIGNED;
167	bit = index % BITS_IN_UNSIGNED;
168
169	bounceallocarray[allocindex] &= ~(1 << bit);
170
171	bouncefree += count;
172	if (bpwait) {
173		bpwait = 0;
174		wakeup((caddr_t) &bounceallocarray);
175	}
176}
177
178/*
179 * allocate count bounce buffer kva pages
180 */
181vm_offset_t
182vm_bounce_kva(count, waitok)
183	int count;
184	int waitok;
185{
186	int tofree;
187	int i;
188	int startfree;
189	vm_offset_t kva = 0;
190	int s = splbio();
191	int size = count;
192	startfree = 0;
193more:
194	if (!bmfreeing && (tofree = kvasfreecnt)) {
195		bmfreeing = 1;
196		for (i = startfree; i < kvasfreecnt; i++) {
197			/*
198			 * if we have a kva of the right size, no sense
199			 * in freeing/reallocating...
200			 * might affect fragmentation short term, but
201			 * as long as the amount of io_map is
202			 * significantly more than the maximum transfer
203			 * size, I don't think that it is a problem.
204			 */
205			pmap_remove(kernel_pmap,
206				kvaf[i].addr, kvaf[i].addr + kvaf[i].size);
207			if( size && !kva && kvaf[i].size == size) {
208				kva = kvaf[i].addr;
209			} else {
210				kmem_free_wakeup(io_map, kvaf[i].addr,
211					kvaf[i].size);
212			}
213		}
214		if (kvasfreecnt != tofree) {
215			startfree = i;
216			bmfreeing = 0;
217			goto more;
218		}
219		kvasfreecnt = 0;
220		bmfreeing = 0;
221	}
222
223	if( size == 0) {
224		splx(s);
225		return NULL;
226	}
227
228	if (!kva && !(kva = kmem_alloc_pageable(io_map, size))) {
229		if( !waitok) {
230			splx(s);
231			return NULL;
232		}
233		bmwait = 1;
234		tsleep((caddr_t) io_map, PRIBIO, "bmwait", 0);
235		goto more;
236	}
237	splx(s);
238
239	return kva;
240}
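/*
 * Note on the deferred-free list: vm_bounce_kva_free() may be called
 * from the biodone() path at interrupt time, when it is not convenient
 * to manipulate io_map directly, so freed ranges are parked in kvaf[]
 * and are actually unmapped and returned here, under splbio().
 * Calling vm_bounce_kva(0, 0) only drains that list (the size == 0
 * early return above) and wakes anybody sleeping on io_map, which is
 * how vm_bounce_kva_free() uses it when its "now" argument is set.
 */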
241
242/*
243 * same as vm_bounce_kva -- but really allocate
244 */
245vm_offset_t
246vm_bounce_kva_alloc(count)
247int count;
248{
249	int i;
250	vm_offset_t kva;
251	vm_offset_t pa;
252	if( bouncepages == 0) {
253		kva = (vm_offset_t) malloc(count*NBPG, M_TEMP, M_WAITOK);
254		return kva;
255	}
256	kva = vm_bounce_kva(count * NBPG, 1);
257	for(i=0;i<count;i++) {
258		pa = vm_bounce_page_find(1);
259		pmap_kenter(kva + i * NBPG, pa);
260	}
261	return kva;
262}
263
264/*
265 * same as vm_bounce_kva_free -- but really free
266 */
267void
268vm_bounce_kva_alloc_free(kva, count)
269	vm_offset_t kva;
270	int count;
271{
272	int i;
273	vm_offset_t pa;
274	if( bouncepages == 0) {
275		free((caddr_t) kva, M_TEMP);
276		return;
277	}
278	for(i = 0; i < count; i++) {
279		pa = pmap_kextract(kva + i * NBPG);
280		vm_bounce_page_free(pa, 1);
281	}
282	vm_bounce_kva_free(kva, count * NBPG, 0);
283}
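/*
 * vm_bounce_kva_alloc()/vm_bounce_kva_alloc_free() give a driver a
 * long-lived mapping backed by pages from the bounce pool below 16MB.
 * When no bounce pages are configured (bouncepages == 0), the case
 * where bounce buffering is not needed at all, the pair simply
 * degenerates to malloc()/free() of kernel memory.
 */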
284
285/*
286 * do the things necessary to the struct buf to implement
287 * bounce buffers...  inserted before the disk sort
288 */
289void
290vm_bounce_alloc(bp)
291	struct buf *bp;
292{
293	int countvmpg;
294	vm_offset_t vastart, vaend;
295	vm_offset_t vapstart, vapend;
296	vm_offset_t va, kva;
297	vm_offset_t pa;
298	int dobounceflag = 0;
299	int bounceindex;
300	int i;
301	int s;
302
303	if (bouncepages == 0)
304		return;
305
306	if (bp->b_bufsize < bp->b_bcount) {
307		printf("vm_bounce_alloc: b_bufsize(%d) < b_bcount(%d) !!!!\n",
308			bp->b_bufsize, bp->b_bcount);
309		bp->b_bufsize = bp->b_bcount;
310	}
311
312	vastart = (vm_offset_t) bp->b_un.b_addr;
313	vaend = (vm_offset_t) bp->b_un.b_addr + bp->b_bufsize;
314
315	vapstart = i386_trunc_page(vastart);
316	vapend = i386_round_page(vaend);
317	countvmpg = (vapend - vapstart) / NBPG;
318
319/*
320 * if any page is above 16MB, then go into bounce-buffer mode
321 */
322	va = vapstart;
323	for (i = 0; i < countvmpg; i++) {
324		pa = pmap_kextract(va);
325		if (pa >= SIXTEENMEG)
326			++dobounceflag;
327		va += NBPG;
328	}
329	if (dobounceflag == 0)
330		return;
331
332	if (bouncepages < dobounceflag)
333		panic("Not enough bounce buffers!!!");
334
335/*
336 * allocate a replacement kva for b_addr
337 */
338	kva = vm_bounce_kva(countvmpg*NBPG, 1);
339	va = vapstart;
340	for (i = 0; i < countvmpg; i++) {
341		pa = pmap_kextract(va);
342		if (pa >= SIXTEENMEG) {
343			/*
344			 * allocate a replacement page
345			 */
346			vm_offset_t bpa = vm_bounce_page_find(1);
347			pmap_kenter(kva + (NBPG * i), bpa);
348			/*
349			 * if we are writing, then copy the data into the page
350			 */
351			if ((bp->b_flags & B_READ) == 0) {
352				pmap_update();
353				bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG);
354			}
355		} else {
356			/*
357			 * use original page
358			 */
359			pmap_kenter(kva + (NBPG * i), pa);
360		}
361		va += NBPG;
362	}
363	pmap_update();
364
365/*
366 * flag the buffer as being bounced
367 */
368	bp->b_flags |= B_BOUNCE;
369/*
370 * save the original buffer kva
371 */
372	bp->b_savekva = bp->b_un.b_addr;
373/*
374 * put our new kva into the buffer (offset by original offset)
375 */
376	bp->b_un.b_addr = (caddr_t) (((vm_offset_t) kva) |
377				((vm_offset_t) bp->b_savekva & (NBPG - 1)));
378	return;
379}
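/*
 * The SIXTEENMEG test above exists because ISA DMA controllers can only
 * generate 24-bit (16MB) physical addresses.  A worked example: for a
 * three-page transfer in which only the middle page lies above 16MB,
 * vm_bounce_alloc() builds a fresh three-page kva in which pages 0 and 2
 * map the original physical pages and page 1 maps a bounce page taken
 * from the pool below 16MB; for a write the data is copied into that
 * bounce page before the transfer is queued, and for a read it is
 * copied back in vm_bounce_free() when the transfer completes.  The
 * page offset of the original address is preserved, so b_un.b_addr
 * still describes the same bytes of the caller's buffer.
 */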
380
381/*
382 * hook into biodone to free bounce buffer
383 */
384void
385vm_bounce_free(bp)
386	struct buf *bp;
387{
388	int i;
389	vm_offset_t origkva, bouncekva;
390	vm_offset_t vastart, vaend;
391	vm_offset_t vapstart, vapend;
392	int countbounce = 0;
393	vm_offset_t firstbouncepa = 0;
394	int firstbounceindex;
395	int countvmpg;
396	vm_offset_t bcount;
397	int s;
398
399/*
400 * if this isn't a bounced buffer, then just return
401 */
402	if ((bp->b_flags & B_BOUNCE) == 0)
403		return;
404
405	origkva = (vm_offset_t) bp->b_savekva;
406	bouncekva = (vm_offset_t) bp->b_un.b_addr;
407
408	vastart = bouncekva;
409	vaend = bouncekva + bp->b_bufsize;
410	bcount = bp->b_bufsize;
411
412	vapstart = i386_trunc_page(vastart);
413	vapend = i386_round_page(vaend);
414
415	countvmpg = (vapend - vapstart) / NBPG;
416
417/*
418 * check every page in the kva space for b_addr
419 */
420	for (i = 0; i < countvmpg; i++) {
421		vm_offset_t mybouncepa;
422		vm_offset_t copycount;
423
424		copycount = i386_round_page(bouncekva + 1) - bouncekva;
425		mybouncepa = pmap_kextract(i386_trunc_page(bouncekva));
426
427/*
428 * if this is a bounced pa, then process as one
429 */
430		if ((mybouncepa >= bouncepa) && (mybouncepa < bouncepaend)) {
431			if (copycount > bcount)
432				copycount = bcount;
433/*
434 * if this is a read, then copy from bounce buffer into original buffer
435 */
436			if (bp->b_flags & B_READ)
437				bcopy((caddr_t) bouncekva, (caddr_t) origkva, copycount);
438/*
439 * free the bounce allocation
440 */
441			vm_bounce_page_free(i386_trunc_page(mybouncepa), 1);
442		}
443
444		origkva += copycount;
445		bouncekva += copycount;
446		bcount -= copycount;
447	}
448
449/*
450 * add the old kva into the "to free" list
451 */
452	bouncekva = i386_trunc_page((vm_offset_t) bp->b_un.b_addr);
453	vm_bounce_kva_free( bouncekva, countvmpg*NBPG, 0);
454	bp->b_un.b_addr = bp->b_savekva;
455	bp->b_savekva = 0;
456	bp->b_flags &= ~B_BOUNCE;
457
458	return;
459}
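/*
 * Completion side of the bounce scheme: for a read, each page that was
 * redirected to a bounce page is copied back into the caller's buffer
 * before the bounce page is returned to the bitmap.  The temporary kva
 * is put on the deferred-free list (the final argument of 0 to
 * vm_bounce_kva_free()) rather than freed outright, because this
 * routine is called from biodone() and may be running at interrupt time.
 */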
460
461#endif /* NOBOUNCE */
462
463/*
464 * init the bounce buffer system
465 */
466void
467vm_bounce_init()
468{
469	vm_offset_t minaddr, maxaddr;
470
471	io_map = kmem_suballoc(kernel_map, &minaddr, &maxaddr, MAXBKVA * NBPG, FALSE);
472	kvasfreecnt = 0;
473
474#ifndef NOBOUNCE
475	if (bouncepages == 0)
476		return;
477
478	bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED;
479	bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT);
480
481	if (!bounceallocarray)
482		panic("Cannot allocate bounce resource array\n");
483
484	bzero(bounceallocarray, bounceallocarraysize * sizeof(unsigned));
485
486
487	bouncepa = pmap_kextract((vm_offset_t) bouncememory);
488	bouncepaend = bouncepa + bouncepages * NBPG;
489	bouncefree = bouncepages;
490#endif
491
492}
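/*
 * Sizing example: with bouncepages == 48 and a 32-bit unsigned,
 * bounceallocarraysize is (48 + 31) / 32 == 2, i.e. 64 bits of bitmap
 * for 48 real pages.  XXX nothing ever marks the 16 trailing bits of
 * the last word as in use, so once all 48 real pages are allocated,
 * vm_bounce_page_find() will hand out an address past bouncepaend
 * instead of sleeping.
 */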
493
494
495static void
496cldiskvamerge( kvanew, orig1, orig1cnt, orig2, orig2cnt)
497	vm_offset_t kvanew;
498	vm_offset_t orig1, orig1cnt;
499	vm_offset_t orig2, orig2cnt;
500{
501	int i;
502	vm_offset_t pa;
503/*
504 * enter the transfer physical addresses into the new kva
505 */
506	for(i=0;i<orig1cnt;i++) {
507		vm_offset_t pa;
508		pa = pmap_kextract((caddr_t) orig1 + i * PAGE_SIZE);
509		pmap_kenter(kvanew + i * PAGE_SIZE, pa);
510	}
511
512	for(i=0;i<orig2cnt;i++) {
513		vm_offset_t pa;
514		pa = pmap_kextract((caddr_t) orig2 + i * PAGE_SIZE);
515		pmap_kenter(kvanew + (i + orig1cnt) * PAGE_SIZE, pa);
516	}
517	pmap_update();
518}
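/*
 * cldiskvamerge() moves no data; it only copies page table entries, so
 * the new kva window aliases the same physical pages as the two
 * original transfers.  E.g. merging a 2-page and a 3-page request
 * yields a 5-page window whose pages 0-1 alias the first request and
 * pages 2-4 alias the second, after which one contiguous transfer can
 * cover both.  pmap_update() flushes the TLB so the new mappings are
 * visible before the driver touches them.
 */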
519
520void
521cldisksort(struct buf *dp, struct buf *bp, vm_offset_t maxio)
522{
523	register struct buf *ap, *newbp;
524	int i, trycount=0;
525	vm_offset_t orig1pages, orig2pages;
526	vm_offset_t orig1begin, orig2begin;
527	vm_offset_t kvanew, kvaorig;
528
529	if( bp->b_bcount < MAXCLSTATS*PAGE_SIZE)
530		++rqstats[bp->b_bcount/PAGE_SIZE];
531	/*
532	 * If nothing on the activity queue, then
533	 * we become the only thing.
534	 */
535	ap = dp->b_actf;
536	if(ap == NULL) {
537		dp->b_actf = bp;
538		dp->b_actl = bp;
539		bp->av_forw = NULL;
540		return;
541	}
542
543	/*
544	 * If we lie before the first (currently active)
545	 * request, then we must locate the second request list
546	 * and add ourselves to it.
547	 */
548
549	if (bp->b_pblkno < ap->b_pblkno) {
550		while (ap->av_forw) {
551			/*
552			 * Check for an ``inversion'' in the
553			 * normally ascending block numbers,
554			 * indicating the start of the second request list.
555			 */
556			if (ap->av_forw->b_pblkno < ap->b_pblkno) {
557				/*
558				 * Search the second request list
559				 * for the first request at a larger
560				 * block number.  We go before that;
561				 * if there is no such request, we go at end.
562				 */
563				do {
564					if (bp->b_pblkno < ap->av_forw->b_pblkno)
565						goto insert;
566					ap = ap->av_forw;
567				} while (ap->av_forw);
568				goto insert;		/* after last */
569			}
570			ap = ap->av_forw;
571		}
572		/*
573		 * No inversions... we will go after the last, and
574		 * be the first request in the second request list.
575		 */
576		goto insert;
577	}
578	/*
579	 * Request is at/after the current request...
580	 * sort in the first request list.
581	 */
582	while (ap->av_forw) {
583		/*
584		 * We want to go after the current request
585		 * if there is an inversion after it (i.e. it is
586		 * the end of the first request list), or if
587		 * the next request is a larger block than our request.
588		 */
589		if (ap->av_forw->b_pblkno < ap->b_pblkno ||
590		    bp->b_pblkno < ap->av_forw->b_pblkno )
591			goto insert;
592		ap = ap->av_forw;
593	}
594
595insert:
596
597#if 0
598	/*
599	 * read clustering with new read-ahead disk drives hurts mostly, so
600	 * we don't bother...
601	 */
602	if( bp->b_flags & B_READ)
603		goto nocluster;
604#endif
605	/*
606	 * we currently only cluster I/O transfers whose kva is page aligned
607	 * and whose length is a multiple of the page size.
608	 */
609	if ((bp->b_flags & B_BAD) == 0 &&
610		((bp->b_bcount & PAGE_MASK) == 0) &&
611		(((vm_offset_t) bp->b_un.b_addr & PAGE_MASK) == 0)) {
612		if( maxio > MAXCLSTATS*PAGE_SIZE)
613			maxio = MAXCLSTATS*PAGE_SIZE;
614		/*
615		 * merge with previous?
616		 * conditions:
617		 *	1) We reside physically immediately after the previous block.
618		 *	2) The previous block is not first on the device queue
619		 *	   (the first block may already be active).
620		 *	3) The mode of the two I/Os is identical.
621		 *	4) The previous kva is page aligned and the previous transfer
622		 *	   is a multiple of a page in length.
623		 *	5) And the total I/O size would be below the maximum.
624		 */
625		if( (ap->b_pblkno + (ap->b_bcount / DEV_BSIZE) == bp->b_pblkno) &&
626			(dp->b_actf != ap) &&
627			((ap->b_flags & ~B_CLUSTER) == bp->b_flags) &&
628			((ap->b_flags & B_BAD) == 0) &&
629			((ap->b_bcount & PAGE_MASK) == 0) &&
630			(((vm_offset_t) ap->b_un.b_addr & PAGE_MASK) == 0) &&
631			(ap->b_bcount + bp->b_bcount < maxio)) {
632
633			orig1begin = (vm_offset_t) ap->b_un.b_addr;
634			orig1pages = ap->b_bcount / PAGE_SIZE;
635
636			orig2begin = (vm_offset_t) bp->b_un.b_addr;
637			orig2pages = bp->b_bcount / PAGE_SIZE;
638			/*
639			 * see if we can allocate a kva; if we cannot, then don't
640			 * cluster.
641			 */
642			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
643			if( !kvanew) {
644				goto nocluster;
645			}
646
647
648			if( (ap->b_flags & B_CLUSTER) == 0) {
649
650				/*
651				 * get a physical buf pointer
652				 */
653				newbp = (struct buf *)trypbuf();
654				if( !newbp) {
655					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
656					goto nocluster;
657				}
658
659				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
660
661				/*
662				 * build the new bp to be handed off to the device
663				 */
664
665				--clstats[ap->b_bcount/PAGE_SIZE];
666				*newbp = *ap;
667				newbp->b_flags |= B_CLUSTER;
668				newbp->b_un.b_addr = (caddr_t) kvanew;
669				newbp->b_bcount += bp->b_bcount;
670				newbp->b_bufsize = newbp->b_bcount;
671				newbp->b_clusterf = ap;
672				newbp->b_clusterl = bp;
673				++clstats[newbp->b_bcount/PAGE_SIZE];
674
675				/*
676				 * enter the new bp onto the device queue
677				 */
678				if( ap->av_forw)
679					ap->av_forw->av_back = newbp;
680				else
681					dp->b_actl = newbp;
682
683				if( dp->b_actf != ap )
684					ap->av_back->av_forw = newbp;
685				else
686					dp->b_actf = newbp;
687
688				/*
689				 * enter the previous bps onto the cluster queue
690				 */
691				ap->av_forw = bp;
692				bp->av_back = ap;
693
694				ap->av_back = NULL;
695				bp->av_forw = NULL;
696
697			} else {
698				vm_offset_t addr;
699
700				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
701				/*
702				 * free the old kva
703				 */
704				vm_bounce_kva_free( orig1begin, ap->b_bufsize, 0);
705				--clstats[ap->b_bcount/PAGE_SIZE];
706
707				ap->b_un.b_addr = (caddr_t) kvanew;
708
709				ap->b_clusterl->av_forw = bp;
710				bp->av_forw = NULL;
711				bp->av_back = ap->b_clusterl;
712				ap->b_clusterl = bp;
713
714				ap->b_bcount += bp->b_bcount;
715				ap->b_bufsize = ap->b_bcount;
716				++clstats[ap->b_bcount/PAGE_SIZE];
717			}
718			return;
719		/*
720		 * merge with next?
721		 * conditions:
722		 *	1) We reside physically immediately before the next block.
723		 *	2) The mode of the two I/Os is identical.
724		 *	3) The next kva is page aligned and the next transfer
725		 *	   is a multiple of a page in length.
726		 *	4) And the total I/O size would be below the maximum.
727		 */
728		} else if( ap->av_forw &&
729			(bp->b_pblkno + (bp->b_bcount / DEV_BSIZE) == ap->av_forw->b_pblkno) &&
730			(bp->b_flags == (ap->av_forw->b_flags & ~B_CLUSTER)) &&
731			((ap->av_forw->b_flags & B_BAD) == 0) &&
732			((ap->av_forw->b_bcount & PAGE_MASK) == 0) &&
733			(((vm_offset_t) ap->av_forw->b_un.b_addr & PAGE_MASK) == 0) &&
734			(ap->av_forw->b_bcount + bp->b_bcount < maxio)) {
735
736			orig1begin = (vm_offset_t) bp->b_un.b_addr;
737			orig1pages = bp->b_bcount / PAGE_SIZE;
738
739			orig2begin = (vm_offset_t) ap->av_forw->b_un.b_addr;
740			orig2pages = ap->av_forw->b_bcount / PAGE_SIZE;
741
742			/*
743			 * see if we can allocate a kva; if we cannot, then don't
744			 * cluster.
745			 */
746			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
747			if( !kvanew) {
748				goto nocluster;
749			}
750
751			/*
752			 * if next isn't a cluster we need to create one
753			 */
754			if( (ap->av_forw->b_flags & B_CLUSTER) == 0) {
755
756				/*
757				 * get a physical buf pointer
758				 */
759				newbp = (struct buf *)trypbuf();
760				if( !newbp) {
761					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
762					goto nocluster;
763				}
764
765				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
766				ap = ap->av_forw;
767				--clstats[ap->b_bcount/PAGE_SIZE];
768				*newbp = *ap;
769				newbp->b_flags |= B_CLUSTER;
770				newbp->b_un.b_addr = (caddr_t) kvanew;
771				newbp->b_blkno = bp->b_blkno;
772				newbp->b_pblkno = bp->b_pblkno;
773				newbp->b_bcount += bp->b_bcount;
774				newbp->b_bufsize = newbp->b_bcount;
775				newbp->b_clusterf = bp;
776				newbp->b_clusterl = ap;
777				++clstats[newbp->b_bcount/PAGE_SIZE];
778
779				if( ap->av_forw)
780					ap->av_forw->av_back = newbp;
781				else
782					dp->b_actl = newbp;
783
784				if( dp->b_actf != ap )
785					ap->av_back->av_forw = newbp;
786				else
787					dp->b_actf = newbp;
788
789				bp->av_forw = ap;
790				ap->av_back = bp;
791
792				bp->av_back = NULL;
793				ap->av_forw = NULL;
794			} else {
795				vm_offset_t addr;
796
797				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
798				ap = ap->av_forw;
799				vm_bounce_kva_free( orig2begin, ap->b_bufsize, 0);
800
801				ap->b_un.b_addr = (caddr_t) kvanew;
802				bp->av_forw = ap->b_clusterf;
803				ap->b_clusterf->av_back = bp;
804				ap->b_clusterf = bp;
805				bp->av_back = NULL;
806				--clstats[ap->b_bcount/PAGE_SIZE];
807
808				ap->b_blkno = bp->b_blkno;
809				ap->b_pblkno = bp->b_pblkno;
810				ap->b_bcount += bp->b_bcount;
811				ap->b_bufsize = ap->b_bcount;
812				++clstats[ap->b_bcount/PAGE_SIZE];
813
814			}
815			return;
816		}
817	}
818	/*
819	 * don't merge
820	 */
821nocluster:
822	++clstats[bp->b_bcount/PAGE_SIZE];
823	bp->av_forw = ap->av_forw;
824	if( bp->av_forw)
825		bp->av_forw->av_back = bp;
826	else
827		dp->b_actl = bp;
828
829	ap->av_forw = bp;
830	bp->av_back = ap;
831}
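/*
 * Shape of a cluster built above: the buf handed to the driver is a
 * synthetic one obtained from trypbuf(), marked B_CLUSTER, whose
 * b_un.b_addr points at the merged kva and whose b_bcount/b_bufsize are
 * the sum of the pieces.  The member bufs are taken off the device
 * queue and chained through av_forw/av_back between b_clusterf (first)
 * and b_clusterl (last), so that each piece can be found and completed
 * when the clustered transfer finishes.  clstats[]/rqstats[] simply
 * histogram clustered and requested transfer sizes in pages.
 */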
832
833
834/*
835 * Finish a fork operation, with process p2 nearly set up.
836 * Copy and update the kernel stack and pcb, making the child
837 * ready to run, and marking it so that it can return differently
838 * than the parent.  Returns 1 in the child process, 0 in the parent.
839 * We currently double-map the user area so that the stack is at the same
840 * address in each process; in the future we will probably relocate
841 * the frame pointers on the stack after copying.
842 */
843int
844cpu_fork(p1, p2)
845	register struct proc *p1, *p2;
846{
847	register struct user *up = p2->p_addr;
848	int foo, offset, addr, i;
849	extern char kstack[];
850	extern int mvesp();
851
852	/*
853	 * Copy pcb and stack from proc p1 to p2.
854	 * We do this as cheaply as possible, copying only the active
855	 * part of the stack.  The stack and pcb need to agree;
856	 * this is tricky, as the final pcb is constructed by savectx,
857	 * but its frame isn't yet on the stack when the stack is copied.
858	 * swtch compensates for this when the child eventually runs.
859	 * This should be done differently, with a single call
860	 * that copies and updates the pcb+stack,
861	 * replacing the bcopy and savectx.
862	 */
863	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
864	offset = mvesp() - (int)kstack;
865	bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset,
866	    (unsigned) ctob(UPAGES) - offset);
867	p2->p_regs = p1->p_regs;
868
869	/*
870	 * Wire top of address space of child to its kstack.
871	 * First, fault in a page of pte's to map it.
872	 */
873#if 0
874        addr = trunc_page((u_int)vtopte(kstack));
875	vm_map_pageable(&p2->p_vmspace->vm_map, addr, addr+NBPG, FALSE);
876	for (i=0; i < UPAGES; i++)
877		pmap_enter(&p2->p_vmspace->vm_pmap, kstack+i*NBPG,
878			   pmap_extract(kernel_pmap, ((int)p2->p_addr)+i*NBPG),
879			   /*
880			    * The user area has to be mapped writable because
881			    * it contains the kernel stack (when CR0_WP is on
882			    * on a 486 there is no user-read/kernel-write
883			    * mode).  It is protected from user mode access
884			    * by the segment limits.
885			    */
886			   VM_PROT_READ|VM_PROT_WRITE, TRUE);
887#endif
888	pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb);
889
890	/*
891	 *
892	 * Arrange for a non-local goto when the new process
893	 * is started, to resume here, returning nonzero from setjmp.
894	 */
895	if (savectx(up, 1)) {
896		/*
897		 * Return 1 in child.
898		 */
899		return (1);
900	}
901	return (0);
902}
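/*
 * savectx(up, 1) behaves like setjmp here: it returns 0 immediately in
 * the parent after saving the pcb, and returns nonzero later in the
 * child, when swtch() first resumes the child from that saved context.
 * That is what implements the "returns 1 in the child, 0 in the parent"
 * contract described in the comment above cpu_fork().
 */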
903
904#ifdef notyet
905/*
906 * cpu_exit is called as the last action during exit.
907 *
908 * We change to an inactive address space and a "safe" stack,
909 * passing thru an argument to the new stack. Now, safely isolated
910 * from the resources we're shedding, we release the address space
911 * and any remaining machine-dependent resources, including the
912 * memory for the user structure and kernel stack.
913 *
914 * Next, we assign a dummy context to be written over by swtch,
915 * calling it to send this process off to oblivion.
916 * [The nullpcb allows us to minimize cost in swtch() by not having
917 * a special case].
918 */
919struct proc *swtch_to_inactive();
920volatile void
921cpu_exit(p)
922	register struct proc *p;
923{
924	static struct pcb nullpcb;	/* pcb to overwrite on last swtch */
925
926#if NNPX > 0
927	npxexit(p);
928#endif	/* NNPX */
929
930	/* move to inactive space and stack, passing arg across */
931	p = swtch_to_inactive(p);
932
933	/* drop per-process resources */
934	vmspace_free(p->p_vmspace);
935	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
936
937	p->p_addr = (struct user *) &nullpcb;
938	splclock();
939	swtch();
940	/* NOTREACHED */
941}
942#else
943void
944cpu_exit(p)
945	register struct proc *p;
946{
947
948#if NNPX > 0
949	npxexit(p);
950#endif	/* NNPX */
951	splclock();
952	curproc = 0;
953	swtch();
954	/*
955	 * This is to shut up the compiler, and if swtch() ever returned,
956	 * panicking would be the right thing anyway.  This keeps gcc happy
957	 * because panic is a volatile void function as well.
958	 */
959	panic("cpu_exit");
960}
961
962void
963cpu_wait(p) struct proc *p; {
964/*	extern vm_map_t upages_map; */
965	extern char kstack[];
966
967	/* drop per-process resources */
968 	pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr,
969		((vm_offset_t) p->p_addr) + ctob(UPAGES));
970	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
971	vmspace_free(p->p_vmspace);
972}
973#endif
974
975/*
976 * Set a red zone in the kernel stack after the u. area.
977 */
978void
979setredzone(pte, vaddr)
980	u_short *pte;
981	caddr_t vaddr;
982{
983/* eventually do this by setting up an expand-down stack segment
984   for ss0: selector, allowing stack access down to top of u.
985   this means though that protection violations need to be handled
986   thru a double fault exception that must do an integral task
987   switch to a known good context, within which a dump can be
988   taken. a sensible scheme might be to save the initial context
989   used by sched (that has physical memory mapped 1:1 at bottom)
990   and take the dump while still in mapped mode */
991}
992
993/*
994 * Convert kernel VA to physical address
995 */
996u_long
997kvtop(void *addr)
998{
999	vm_offset_t va;
1000
1001	va = pmap_kextract((vm_offset_t)addr);
1002	if (va == 0)
1003		panic("kvtop: zero page frame");
1004	return((u_long)va);
1005}
1006
1007extern vm_map_t phys_map;
1008
1009/*
1010 * Map an IO request into kernel virtual address space.
1011 *
1012 * All requests are (re)mapped into kernel VA space.
1013 * Notice that we use b_bufsize for the size of the buffer
1014 * to be mapped.  b_bcount might be modified by the driver.
1015 */
1016void
1017vmapbuf(bp)
1018	register struct buf *bp;
1019{
1020	register int npf;
1021	register caddr_t addr;
1022	register long flags = bp->b_flags;
1023	struct proc *p;
1024	int off;
1025	vm_offset_t kva;
1026	register vm_offset_t pa;
1027
1028	if ((flags & B_PHYS) == 0)
1029		panic("vmapbuf");
1030	addr = bp->b_saveaddr = bp->b_un.b_addr;
1031	off = (int)addr & PGOFSET;
1032	p = bp->b_proc;
1033	npf = btoc(round_page(bp->b_bufsize + off));
1034	kva = kmem_alloc_wait(phys_map, ctob(npf));
1035	bp->b_un.b_addr = (caddr_t) (kva + off);
1036	while (npf--) {
1037		pa = pmap_extract(&p->p_vmspace->vm_pmap, (vm_offset_t)addr);
1038		if (pa == 0)
1039			panic("vmapbuf: null page frame");
1040		pmap_kenter(kva, trunc_page(pa));
1041		addr += PAGE_SIZE;
1042		kva += PAGE_SIZE;
1043	}
1044	pmap_update();
1045}
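/*
 * vmapbuf() double-maps a user buffer for physical (raw) I/O: each page
 * of the user buffer is looked up in the process pmap with
 * pmap_extract() and entered at a fresh kernel va inside phys_map, so
 * the driver can address the buffer through b_un.b_addr no matter which
 * process is running.  The pages must already be resident (and, by
 * implication, wired by the caller), since a missing page panics here
 * rather than faulting.
 */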
1046
1047/*
1048 * Free the io map PTEs associated with this IO operation.
1049 * We also invalidate the TLB entries and restore the original b_addr.
1050 */
1051void
1052vunmapbuf(bp)
1053	register struct buf *bp;
1054{
1055	register int npf;
1056	register caddr_t addr = bp->b_un.b_addr;
1057	vm_offset_t kva;
1058
1059	if ((bp->b_flags & B_PHYS) == 0)
1060		panic("vunmapbuf");
1061	npf = btoc(round_page(bp->b_bufsize + ((int)addr & PGOFSET)));
1062	kva = (vm_offset_t)((int)addr & ~PGOFSET);
1063	kmem_free_wakeup(phys_map, kva, ctob(npf));
1064	bp->b_un.b_addr = bp->b_saveaddr;
1065	bp->b_saveaddr = NULL;
1066}
1067
1068/*
1069 * Force reset the processor by invalidating the entire address space!
1070 */
1071void
1072cpu_reset() {
1073
1074	/* force a shutdown by unmapping entire address space ! */
1075	bzero((caddr_t) PTD, NBPG);
1076
1077	/* "good night, sweet prince .... <THUNK!>" */
1078	tlbflush();
1079	/* NOTREACHED */
1080	while(1);
1081}
1082
1083/*
1084 * Grow the user stack to allow for 'sp'. This version grows the stack in
1085 *	chunks of SGROWSIZ.
1086 */
1087int
1088grow(p, sp)
1089	struct proc *p;
1090	int sp;
1091{
1092	unsigned int nss;
1093	caddr_t v;
1094	struct vmspace *vm = p->p_vmspace;
1095
1096	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
1097	    return (1);
1098
1099	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);
1100
1101	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
1102		return (0);
1103
1104	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
1105	    SGROWSIZ) < nss) {
1106		int grow_amount;
1107		/*
1108		 * If necessary, grow the VM that the stack occupies
1109		 * to allow for the rlimit. This allows us to not have
1110		 * to allocate all of the VM up-front in execve (which
1111		 * is expensive).
1112		 * Grow the VM by the amount requested rounded up to
1113		 * the nearest SGROWSIZ to provide for some hysteresis.
1114		 */
1115		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
1116		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
1117		    SGROWSIZ) - grow_amount;
1118		/*
1119		 * If there isn't enough room to extend by SGROWSIZ, then
1120		 * just extend to the maximum size
1121		 */
1122		if (v < vm->vm_maxsaddr) {
1123			v = vm->vm_maxsaddr;
1124			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
1125		}
1126		if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
1127		    grow_amount, FALSE) != KERN_SUCCESS) {
1128			return (0);
1129		}
1130		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
1131	}
1132
1133	return (1);
1134}
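/*
 * Worked example of the SGROWSIZ hysteresis above, assuming 4K pages
 * and an SGROWSIZ of 128K: if the stack currently occupies 128K
 * (vm_ssize == 32) and a fault needs it to reach 132K, then
 * grow_amount = roundup(132K - 128K, 128K) = 128K and the new 128K is
 * allocated at USRSTACK - 256K, so the stack immediately becomes 256K
 * and the next 31 pages of growth need no further vm_allocate() calls.
 */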
1135