vm_machdep.c revision 1549
1/*-
2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * the Systems Programming Group of the University of Utah Computer
9 * Science Department, and William Jolitz.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 *    must display the following acknowledgement:
21 *	This product includes software developed by the University of
22 *	California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 *    may be used to endorse or promote products derived from this software
25 *    without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
40 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
41 *	$Id: vm_machdep.c,v 1.20 1994/04/20 07:06:20 davidg Exp $
42 */
43
44#include "npx.h"
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/proc.h>
48#include <sys/malloc.h>
49#include <sys/buf.h>
50#include <sys/vnode.h>
51#include <sys/user.h>
52
53#include <machine/cpu.h>
54
55#include <vm/vm.h>
56#include <vm/vm_kern.h>
57
58#define b_cylin b_resid
59
60caddr_t		bouncememory;
61vm_offset_t	bouncepa, bouncepaend;
62int		bouncepages, bpwait;
63vm_map_t	io_map;
64int		bmwait, bmfreeing;
65
66#define BITS_IN_UNSIGNED (8*sizeof(unsigned))
67int		bounceallocarraysize;
68unsigned	*bounceallocarray;
69int		bouncefree;
70
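/*
 * Bounce buffers exist because ISA DMA devices drive only 24 address
 * lines and so cannot reach physical memory at or above 16MB; transfers
 * that touch such pages are redirected through bounce pages that sit
 * below that boundary.
 */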
71#define SIXTEENMEG (4096*4096)
72#define MAXBKVA 512
73int		maxbkva=MAXBKVA*NBPG;
74
75/* special list that can be used at interrupt time for eventual kva free */
76struct kvasfree {
77	vm_offset_t addr;
78	vm_offset_t size;
79} kvaf[MAXBKVA];
80
81int		kvasfreecnt;
82
83vm_offset_t vm_bounce_kva();
84/*
85 * get bounce buffer pages (count physically contiguous)
86 * (only 1 implemented now)
87 */
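/*
 * bounceallocarray is a bitmap with one bit per bounce page: word i,
 * bit b corresponds to the page at physical address
 * bouncepa + (i * BITS_IN_UNSIGNED + b) * NBPG.  For example, with
 * 32-bit words, bit 3 of word 1 names the 36th bounce page,
 * bouncepa + 35 * NBPG.
 */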
88vm_offset_t
89vm_bounce_page_find(count)
90	int count;
91{
92	int bit;
93	int s,i;
94
95	if (count != 1)
96		panic("vm_bounce_page_find -- no support for > 1 page yet!!!");
97
98	s = splbio();
99retry:
100	for (i = 0; i < bounceallocarraysize; i++) {
101		if (bounceallocarray[i] != 0xffffffff) {
102			if (bit = ffs(~bounceallocarray[i])) {
103				bounceallocarray[i] |= 1 << (bit - 1) ;
104				bouncefree -= count;
105				splx(s);
106				return bouncepa + (i * BITS_IN_UNSIGNED + (bit - 1)) * NBPG;
107			}
108		}
109	}
110	bpwait = 1;
111	tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0);
112	goto retry;
113}
114
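/*
 * Queue a bounce kva range for deferred release.  This may be called at
 * interrupt time, so the range is only recorded in kvaf[]; the actual
 * teardown happens in vm_bounce_kva() (immediately, if 'now' is set).
 */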
115void
116vm_bounce_kva_free(addr, size, now)
117	vm_offset_t addr;
118	vm_offset_t size;
119	int now;
120{
121	int s = splbio();
122	kvaf[kvasfreecnt].addr = addr;
123	kvaf[kvasfreecnt++].size = size;
124	if( now) {
125		/*
126		 * this will do wakeups
127		 */
128		vm_bounce_kva(0,0);
129	} else {
130		if (bmwait) {
131			/*
132			 * if anyone is waiting on the bounce-map, then wakeup
133			 */
134			wakeup((caddr_t) io_map);
135			bmwait = 0;
136		}
137	}
138	splx(s);
139}
140
141/*
142 * free count bounce buffer pages
143 */
144void
145vm_bounce_page_free(pa, count)
146	vm_offset_t pa;
147	int count;
148{
149	int allocindex;
150	int index;
151	int bit;
152
153	if (count != 1)
154		panic("vm_bounce_page_free -- no support for > 1 page yet!!!\n");
155
156	index = (pa - bouncepa) / NBPG;
157
158	if ((index < 0) || (index >= bouncepages))
159		panic("vm_bounce_page_free -- bad index\n");
160
161	allocindex = index / BITS_IN_UNSIGNED;
162	bit = index % BITS_IN_UNSIGNED;
163
164	bounceallocarray[allocindex] &= ~(1 << bit);
165
166	bouncefree += count;
167	if (bpwait) {
168		bpwait = 0;
169		wakeup((caddr_t) &bounceallocarray);
170	}
171}
172
173/*
174 * allocate count bounce buffer kva pages
175 */
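/*
 * Before allocating, drain the deferred-free list built up by
 * vm_bounce_kva_free(); a queued range of exactly the requested size is
 * reused directly rather than freed and reallocated.  Calling this with
 * count == 0 merely drains the list and does the associated wakeups.
 */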
176vm_offset_t
177vm_bounce_kva(count, waitok)
178	int count;
179	int waitok;
180{
181	int tofree;
182	int i;
183	int startfree;
184	vm_offset_t kva = 0;
185	int s = splbio();
186	int size = count;
187	startfree = 0;
188more:
189	if (!bmfreeing && (tofree = kvasfreecnt)) {
190		bmfreeing = 1;
191		for (i = startfree; i < kvasfreecnt; i++) {
192			/*
193			 * if we have a kva of the right size, no sense
194			 * in freeing/reallocating...
195			 * might affect fragmentation short term, but
196			 * as long as the amount of io_map is
197			 * significantly more than the maximum transfer
198			 * size, I don't think that it is a problem.
199			 */
200			pmap_remove(kernel_pmap,
201				kvaf[i].addr, kvaf[i].addr + kvaf[i].size);
202			if( size && !kva && kvaf[i].size == size) {
203				kva = kvaf[i].addr;
204			} else {
205				kmem_free_wakeup(io_map, kvaf[i].addr,
206					kvaf[i].size);
207			}
208		}
209		if (kvasfreecnt != tofree) {
210			startfree = i;
211			bmfreeing = 0;
212			goto more;
213		}
214		kvasfreecnt = 0;
215		bmfreeing = 0;
216	}
217
218	if( size == 0) {
219		splx(s);
220		return NULL;
221	}
222
223	if (!kva && !(kva = kmem_alloc_pageable(io_map, size))) {
224		if( !waitok) {
225			splx(s);
226			return NULL;
227		}
228		bmwait = 1;
229		tsleep((caddr_t) io_map, PRIBIO, "bmwait", 0);
230		goto more;
231	}
232	splx(s);
233
234	return kva;
235}
236
237/*
238 * same as vm_bounce_kva -- but really allocate
239 */
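/*
 * Unlike vm_bounce_kva(), which only reserves kva, this backs the range
 * with bounce pages immediately so the mapping is usable right away.
 * When no bounce pages are configured, plain malloc'ed memory is
 * returned instead.
 */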
240vm_offset_t
241vm_bounce_kva_alloc(count)
242int count;
243{
244	int i;
245	vm_offset_t kva;
246	vm_offset_t pa;
247	if( bouncepages == 0) {
248		kva = (vm_offset_t) malloc(count*NBPG, M_TEMP, M_WAITOK);
249		return kva;
250	}
251	kva = vm_bounce_kva(count, 1);
252	for(i=0;i<count;i++) {
253		pa = vm_bounce_page_find(1);
254		pmap_kenter(kva + i * NBPG, pa);
255	}
256	pmap_update();
257	return kva;
258}
259
260/*
261 * same as vm_bounce_kva_free -- but really free
262 */
263void
264vm_bounce_kva_alloc_free(kva, count)
265	vm_offset_t kva;
266	int count;
267{
268	int i;
269	vm_offset_t pa;
270	if( bouncepages == 0) {
271		free((caddr_t) kva, M_TEMP);
272		return;
273	}
274	for(i = 0; i < count; i++) {
275		pa = pmap_kextract(kva + i * NBPG);
276		vm_bounce_page_free(pa, 1);
277	}
278	vm_bounce_kva_free(kva, count);
279}
280
281/*
282 * do the things necessary to the struct buf to implement
283 * bounce buffers...  inserted before the disk sort
284 */
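/*
 * The transfer is scanned page by page; if any page sits at or above
 * SIXTEENMEG a new kva is built in which those pages are replaced by
 * bounce pages (copying the data now if this is a write), while pages
 * already below the limit are mapped into the new kva unchanged.
 * b_data is then pointed at the new kva and the original is saved in
 * b_savekva for vm_bounce_free() to restore.
 */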
285void
286vm_bounce_alloc(bp)
287	struct buf *bp;
288{
289	int countvmpg;
290	vm_offset_t vastart, vaend;
291	vm_offset_t vapstart, vapend;
292	vm_offset_t va, kva;
293	vm_offset_t pa;
294	int dobounceflag = 0;
295	int bounceindex;
296	int i;
297	int s;
298
299	if (bouncepages == 0)
300		return;
301
302	if (bp->b_bufsize < bp->b_bcount) {
303		printf("vm_bounce_alloc: b_bufsize(%d) < b_bcount(%d) !!!!\n",
304			bp->b_bufsize, bp->b_bcount);
305		bp->b_bufsize = bp->b_bcount;
306	}
307
308	vastart = (vm_offset_t) bp->b_data;
309	vaend = (vm_offset_t) bp->b_data + bp->b_bufsize;
310
311	vapstart = i386_trunc_page(vastart);
312	vapend = i386_round_page(vaend);
313	countvmpg = (vapend - vapstart) / NBPG;
314
315/*
316 * if any page is above 16MB, then go into bounce-buffer mode
317 */
318	va = vapstart;
319	for (i = 0; i < countvmpg; i++) {
320		pa = pmap_kextract(va);
321		if (pa >= SIXTEENMEG)
322			++dobounceflag;
323		va += NBPG;
324	}
325	if (dobounceflag == 0)
326		return;
327
328	if (bouncepages < dobounceflag)
329		panic("Not enough bounce buffers!!!");
330
331/*
332 * allocate a replacement kva for b_addr
333 */
334	kva = vm_bounce_kva(countvmpg*NBPG, 1);
335	va = vapstart;
336	for (i = 0; i < countvmpg; i++) {
337		pa = pmap_kextract(va);
338		if (pa >= SIXTEENMEG) {
339			/*
340			 * allocate a replacement page
341			 */
342			vm_offset_t bpa = vm_bounce_page_find(1);
343			pmap_kenter(kva + (NBPG * i), bpa);
344			/*
345			 * if we are writing, then copy the data into the page
346			 */
347			if ((bp->b_flags & B_READ) == 0) {
348				pmap_update();
349				bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG);
350			}
351		} else {
352			/*
353			 * use original page
354			 */
355			pmap_kenter(kva + (NBPG * i), pa);
356		}
357		va += NBPG;
358	}
359	pmap_update();
360
361/*
362 * flag the buffer as being bounced
363 */
364	bp->b_flags |= B_BOUNCE;
365/*
366 * save the original buffer kva
367 */
368	bp->b_savekva = bp->b_data;
369/*
370 * put our new kva into the buffer (offset by original offset)
371 */
372	bp->b_data = (caddr_t) (((vm_offset_t) kva) |
373				((vm_offset_t) bp->b_savekva & (NBPG - 1)));
374	return;
375}
376
377/*
378 * hook into biodone to free bounce buffer
379 */
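/*
 * Walk the bounced kva page by page: pages that came from the bounce
 * pool are copied back to the original buffer (for reads) and returned
 * to the pool, the temporary kva is queued for release, and b_data is
 * restored from b_savekva.
 */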
380void
381vm_bounce_free(bp)
382	struct buf *bp;
383{
384	int i;
385	vm_offset_t origkva, bouncekva;
386	vm_offset_t vastart, vaend;
387	vm_offset_t vapstart, vapend;
388	int countbounce = 0;
389	vm_offset_t firstbouncepa = 0;
390	int firstbounceindex;
391	int countvmpg;
392	vm_offset_t bcount;
393	int s;
394
395/*
396 * if this isn't a bounced buffer, then just return
397 */
398	if ((bp->b_flags & B_BOUNCE) == 0)
399		return;
400
401	origkva = (vm_offset_t) bp->b_savekva;
402	bouncekva = (vm_offset_t) bp->b_data;
403
404	vastart = bouncekva;
405	vaend = bouncekva + bp->b_bufsize;
406	bcount = bp->b_bufsize;
407
408	vapstart = i386_trunc_page(vastart);
409	vapend = i386_round_page(vaend);
410
411	countvmpg = (vapend - vapstart) / NBPG;
412
413/*
414 * check every page in the kva space for b_addr
415 */
416	for (i = 0; i < countvmpg; i++) {
417		vm_offset_t mybouncepa;
418		vm_offset_t copycount;
419
420		copycount = i386_round_page(bouncekva + 1) - bouncekva;
421		mybouncepa = pmap_kextract(i386_trunc_page(bouncekva));
422
423/*
424 * if this is a bounced pa, then process as one
425 */
426		if ((mybouncepa >= bouncepa) && (mybouncepa < bouncepaend)) {
427			if (copycount > bcount)
428				copycount = bcount;
429/*
430 * if this is a read, then copy from bounce buffer into original buffer
431 */
432			if (bp->b_flags & B_READ)
433				bcopy((caddr_t) bouncekva, (caddr_t) origkva, copycount);
434/*
435 * free the bounce allocation
436 */
437			vm_bounce_page_free(i386_trunc_page(mybouncepa), 1);
438		}
439
440		origkva += copycount;
441		bouncekva += copycount;
442		bcount -= copycount;
443	}
444
445/*
446 * add the old kva into the "to free" list
447 */
448	bouncekva = i386_trunc_page((vm_offset_t) bp->b_data);
449	vm_bounce_kva_free( bouncekva, countvmpg*NBPG, 0);
450	bp->b_data = bp->b_savekva;
451	bp->b_savekva = 0;
452	bp->b_flags &= ~B_BOUNCE;
453
454	return;
455}
456
457/*
458 * init the bounce buffer system
459 */
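/*
 * bouncememory and bouncepages are expected to have been set up earlier
 * during startup; if bouncepages is zero the bounce machinery stays
 * disabled and vm_bounce_alloc() becomes a no-op.
 */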
460void
461vm_bounce_init()
462{
463	vm_offset_t minaddr, maxaddr;
464
465	kvasfreecnt = 0;
466
467	if (bouncepages == 0)
468		return;
469
470	bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED;
471	bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT);
472
473	if (!bounceallocarray)
474		panic("Cannot allocate bounce resource array\n");
475
476	bzero(bounceallocarray, bounceallocarraysize * sizeof(unsigned));
477
478
479	bouncepa = pmap_kextract((vm_offset_t) bouncememory);
480	bouncepaend = bouncepa + bouncepages * NBPG;
481	bouncefree = bouncepages;
482}
483
484
485#ifdef BROKEN_IN_44
486static void
487cldiskvamerge( kvanew, orig1, orig1cnt, orig2, orig2cnt)
488	vm_offset_t kvanew;
489	vm_offset_t orig1, orig1cnt;
490	vm_offset_t orig2, orig2cnt;
491{
492	int i;
493	vm_offset_t pa;
494/*
495 * enter the transfer physical addresses into the new kva
496 */
497	for(i=0;i<orig1cnt;i++) {
498		vm_offset_t pa;
499		pa = pmap_kextract((caddr_t) orig1 + i * PAGE_SIZE);
500		pmap_kenter(kvanew + i * PAGE_SIZE, pa);
501	}
502
503	for(i=0;i<orig2cnt;i++) {
504		vm_offset_t pa;
505		pa = pmap_kextract((caddr_t) orig2 + i * PAGE_SIZE);
506		pmap_kenter(kvanew + (i + orig1cnt) * PAGE_SIZE, pa);
507	}
508	pmap_update();
509}
510
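/*
 * cldisksort -- like disksort, insert bp into the device queue in
 * ascending physical block order (two ascending sublists), then, for
 * writes only, try to merge bp with a physically contiguous neighbor
 * into one clustered transfer by mapping both data areas into a single
 * new kva.  Note that this whole block is compiled out (BROKEN_IN_44).
 */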
511void
512cldisksort(struct buf *dp, struct buf *bp, vm_offset_t maxio)
513{
514	register struct buf *ap, *newbp;
515	int i, trycount=0;
516	vm_offset_t orig1pages, orig2pages;
517	vm_offset_t orig1begin, orig2begin;
518	vm_offset_t kvanew, kvaorig;
519
520	if( bp->b_bcount < MAXCLSTATS*PAGE_SIZE)
521		++rqstats[bp->b_bcount/PAGE_SIZE];
522	/*
523	 * If nothing on the activity queue, then
524	 * we become the only thing.
525	 */
526	ap = dp->b_actf;
527	if(ap == NULL) {
528		dp->b_actf = bp;
529		dp->b_actl = bp;
530		bp->av_forw = NULL;
531		return;
532	}
533
534	/*
535	 * If we lie after the first (currently active)
536	 * request, then we must locate the second request list
537	 * and add ourselves to it.
538	 */
539
540	if (bp->b_pblkno < ap->b_pblkno) {
541		while (ap->av_forw) {
542			/*
543			 * Check for an ``inversion'' in the
544			 * normally ascending block numbers,
545			 * indicating the start of the second request list.
546			 */
547			if (ap->av_forw->b_pblkno < ap->b_pblkno) {
548				/*
549				 * Search the second request list
550				 * for the first request at a larger
551				 * block number.  We go before that;
552				 * if there is no such request, we go at end.
553				 */
554				do {
555					if (bp->b_pblkno < ap->av_forw->b_pblkno)
556						goto insert;
557					ap = ap->av_forw;
558				} while (ap->av_forw);
559				goto insert;		/* after last */
560			}
561			ap = ap->av_forw;
562		}
563		/*
564		 * No inversions... we will go after the last, and
565		 * be the first request in the second request list.
566		 */
567		goto insert;
568	}
569	/*
570	 * Request is at/after the current request...
571	 * sort in the first request list.
572	 */
573	while (ap->av_forw) {
574		/*
575		 * We want to go after the current request
576		 * if there is an inversion after it (i.e. it is
577		 * the end of the first request list), or if
578		 * the next request is a larger block than our request.
579		 */
580		if (ap->av_forw->b_pblkno < ap->b_pblkno ||
581		    bp->b_pblkno < ap->av_forw->b_pblkno )
582			goto insert;
583		ap = ap->av_forw;
584	}
585
586insert:
587
588	/*
589	 * read clustering with new read-ahead disk drives hurts mostly, so
590	 * we don't bother...
591	 */
592	if( bp->b_flags & B_READ)
593		goto nocluster;
594	/*
595	 * we currently only cluster I/O transfers that are at page-aligned
596	 * kvas and transfers that are multiples of page lengths.
597	 */
598	if ((bp->b_flags & B_BAD) == 0 &&
599		((bp->b_bcount & PAGE_MASK) == 0) &&
600		(((vm_offset_t) bp->b_un.b_addr & PAGE_MASK) == 0)) {
601		if( maxio > MAXCLSTATS*PAGE_SIZE)
602			maxio = MAXCLSTATS*PAGE_SIZE;
603		/*
604		 * merge with previous?
605		 * conditions:
606		 * 	1) We reside physically immediately after the previous block.
607		 *	2) The previous block is not first on the device queue because
608		 *	   such a block might be active.
609		 *  3) The mode of the two I/Os is identical.
610		 *  4) The previous kva is page aligned and the previous transfer
611		 *	   is a multiple of a page in length.
612		 *	5) And the total I/O size would be below the maximum.
613		 */
614		if( (ap->b_pblkno + (ap->b_bcount / DEV_BSIZE) == bp->b_pblkno) &&
615			(dp->b_actf != ap) &&
616			((ap->b_flags & ~B_CLUSTER) == bp->b_flags) &&
617			((ap->b_flags & B_BAD) == 0) &&
618			((ap->b_bcount & PAGE_MASK) == 0) &&
619			(((vm_offset_t) ap->b_un.b_addr & PAGE_MASK) == 0) &&
620			(ap->b_bcount + bp->b_bcount < maxio)) {
621
622			orig1begin = (vm_offset_t) ap->b_un.b_addr;
623			orig1pages = ap->b_bcount / PAGE_SIZE;
624
625			orig2begin = (vm_offset_t) bp->b_un.b_addr;
626			orig2pages = bp->b_bcount / PAGE_SIZE;
627			/*
628			 * see if we can allocate a kva, if we cannot, then don't
629			 * cluster.
630			 */
631			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
632			if( !kvanew) {
633				goto nocluster;
634			}
635
636
637			if( (ap->b_flags & B_CLUSTER) == 0) {
638
639				/*
640				 * get a physical buf pointer
641				 */
642				newbp = (struct buf *)trypbuf();
643				if( !newbp) {
644					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
645					goto nocluster;
646				}
647
648				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
649
650				/*
651				 * build the new bp to be handed off to the device
652				 */
653
654				--clstats[ap->b_bcount/PAGE_SIZE];
655				*newbp = *ap;
656				newbp->b_flags |= B_CLUSTER;
657				newbp->b_un.b_addr = (caddr_t) kvanew;
658				newbp->b_bcount += bp->b_bcount;
659				newbp->b_bufsize = newbp->b_bcount;
660				newbp->b_clusterf = ap;
661				newbp->b_clusterl = bp;
662				++clstats[newbp->b_bcount/PAGE_SIZE];
663
664				/*
665				 * enter the new bp onto the device queue
666				 */
667				if( ap->av_forw)
668					ap->av_forw->av_back = newbp;
669				else
670					dp->b_actl = newbp;
671
672				if( dp->b_actf != ap )
673					ap->av_back->av_forw = newbp;
674				else
675					dp->b_actf = newbp;
676
677				/*
678				 * enter the previous bps onto the cluster queue
679				 */
680				ap->av_forw = bp;
681				bp->av_back = ap;
682
683				ap->av_back = NULL;
684				bp->av_forw = NULL;
685
686			} else {
687				vm_offset_t addr;
688
689				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
690				/*
691				 * free the old kva
692				 */
693				vm_bounce_kva_free( orig1begin, ap->b_bufsize, 0);
694				--clstats[ap->b_bcount/PAGE_SIZE];
695
696				ap->b_un.b_addr = (caddr_t) kvanew;
697
698				ap->b_clusterl->av_forw = bp;
699				bp->av_forw = NULL;
700				bp->av_back = ap->b_clusterl;
701				ap->b_clusterl = bp;
702
703				ap->b_bcount += bp->b_bcount;
704				ap->b_bufsize = ap->b_bcount;
705				++clstats[ap->b_bcount/PAGE_SIZE];
706			}
707			return;
708		/*
709		 * merge with next?
710		 * conditions:
711		 * 	1) We reside physically before the next block.
712		 *	2) The mode of the two I/Os is identical.
713		 *	3) The next kva is page aligned and the next transfer
714		 *	   is a multiple of a page in length.
715		 *	4) And the total I/O size would be below the maximum.
716		 */
717		} else if( ap->av_forw &&
718			(bp->b_pblkno + (bp->b_bcount / DEV_BSIZE) == ap->av_forw->b_pblkno) &&
719			(bp->b_flags == (ap->av_forw->b_flags & ~B_CLUSTER)) &&
720			((ap->av_forw->b_flags & B_BAD) == 0) &&
721			((ap->av_forw->b_bcount & PAGE_MASK) == 0) &&
722			(((vm_offset_t) ap->av_forw->b_un.b_addr & PAGE_MASK) == 0) &&
723			(ap->av_forw->b_bcount + bp->b_bcount < maxio)) {
724
725			orig1begin = (vm_offset_t) bp->b_un.b_addr;
726			orig1pages = bp->b_bcount / PAGE_SIZE;
727
728			orig2begin = (vm_offset_t) ap->av_forw->b_un.b_addr;
729			orig2pages = ap->av_forw->b_bcount / PAGE_SIZE;
730
731			/*
732			 * see if we can allocate a kva, if we cannot, then don't
733			 * cluster.
734			 */
735			kvanew = vm_bounce_kva( PAGE_SIZE * (orig1pages + orig2pages), 0);
736			if( !kvanew) {
737				goto nocluster;
738			}
739
740			/*
741			 * if next isn't a cluster we need to create one
742			 */
743			if( (ap->av_forw->b_flags & B_CLUSTER) == 0) {
744
745				/*
746				 * get a physical buf pointer
747				 */
748				newbp = (struct buf *)trypbuf();
749				if( !newbp) {
750					vm_bounce_kva_free( kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
751					goto nocluster;
752				}
753
754				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
755				ap = ap->av_forw;
756				--clstats[ap->b_bcount/PAGE_SIZE];
757				*newbp = *ap;
758				newbp->b_flags |= B_CLUSTER;
759				newbp->b_un.b_addr = (caddr_t) kvanew;
760				newbp->b_blkno = bp->b_blkno;
761				newbp->b_pblkno = bp->b_pblkno;
762				newbp->b_bcount += bp->b_bcount;
763				newbp->b_bufsize = newbp->b_bcount;
764				newbp->b_clusterf = bp;
765				newbp->b_clusterl = ap;
766				++clstats[newbp->b_bcount/PAGE_SIZE];
767
768				if( ap->av_forw)
769					ap->av_forw->av_back = newbp;
770				else
771					dp->b_actl = newbp;
772
773				if( dp->b_actf != ap )
774					ap->av_back->av_forw = newbp;
775				else
776					dp->b_actf = newbp;
777
778				bp->av_forw = ap;
779				ap->av_back = bp;
780
781				bp->av_back = NULL;
782				ap->av_forw = NULL;
783			} else {
784				vm_offset_t addr;
785
786				cldiskvamerge( kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
787				ap = ap->av_forw;
788				vm_bounce_kva_free( orig2begin, ap->b_bufsize, 0);
789
790				ap->b_un.b_addr = (caddr_t) kvanew;
791				bp->av_forw = ap->b_clusterf;
792				ap->b_clusterf->av_back = bp;
793				ap->b_clusterf = bp;
794				bp->av_back = NULL;
795				--clstats[ap->b_bcount/PAGE_SIZE];
796
797				ap->b_blkno = bp->b_blkno;
798				ap->b_pblkno = bp->b_pblkno;
799				ap->b_bcount += bp->b_bcount;
800				ap->b_bufsize = ap->b_bcount;
801				++clstats[ap->b_bcount/PAGE_SIZE];
802
803			}
804			return;
805		}
806	}
807	/*
808	 * don't merge
809	 */
810nocluster:
811	++clstats[bp->b_bcount/PAGE_SIZE];
812	bp->av_forw = ap->av_forw;
813	if( bp->av_forw)
814		bp->av_forw->av_back = bp;
815	else
816		dp->b_actl = bp;
817
818	ap->av_forw = bp;
819	bp->av_back = ap;
820}
821#endif
822
823/*
824 * quick version of vm_fault
825 */
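/*
 * Touching the address from kernel mode is enough to fault the page in
 * on a 486 or later.  A 386 has no CR0_WP, so a supervisor write to a
 * read-only (e.g. copy-on-write) page would not fault; in that case a
 * real vm_fault() is needed to obtain a writable mapping.
 */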
826
827void
828vm_fault_quick( v, prot)
829	vm_offset_t v;
830	int prot;
831{
832	if( (cpu_class == CPUCLASS_386) &&
833		(prot & VM_PROT_WRITE))
834		vm_fault(&curproc->p_vmspace->vm_map, v,
835			VM_PROT_READ|VM_PROT_WRITE, FALSE);
836	else if( prot & VM_PROT_WRITE)
837		*(volatile char *)v += 0;
838	else
839		*(volatile char *)v;
840}
841
842
843/*
844 * Finish a fork operation, with process p2 nearly set up.
845 * Copy and update the kernel stack and pcb, making the child
846 * ready to run, and marking it so that it can return differently
847 * than the parent.  Returns 1 in the child process, 0 in the parent.
848 * We currently double-map the user area so that the stack is at the same
849 * address in each process; in the future we will probably relocate
850 * the frame pointers on the stack after copying.
851 */
852int
853cpu_fork(p1, p2)
854	register struct proc *p1, *p2;
855{
856	register struct user *up = p2->p_addr;
857	int foo, offset, addr, i;
858	extern char kstack[];
859	extern int mvesp();
860
861	/*
862	 * Copy pcb and stack from proc p1 to p2.
863	 * We do this as cheaply as possible, copying only the active
864	 * part of the stack.  The stack and pcb need to agree;
865	 * this is tricky, as the final pcb is constructed by savectx,
866	 * but its frame isn't yet on the stack when the stack is copied.
867	 * swtch compensates for this when the child eventually runs.
868	 * This should be done differently, with a single call
869	 * that copies and updates the pcb+stack,
870	 * replacing the bcopy and savectx.
871	 */
872	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
873	offset = mvesp() - (int)kstack;
874	bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset,
875	    (unsigned) ctob(UPAGES) - offset);
876	p2->p_md.md_regs = p1->p_md.md_regs;
877
878	/*
879	 * Wire top of address space of child to its kstack.
880	 * First, fault in a page of pte's to map it.
881	 */
882#if 0
883        addr = trunc_page((u_int)vtopte(kstack));
884	vm_map_pageable(&p2->p_vmspace->vm_map, addr, addr+NBPG, FALSE);
885	for (i=0; i < UPAGES; i++)
886		pmap_enter(&p2->p_vmspace->vm_pmap, kstack+i*NBPG,
887			   pmap_extract(kernel_pmap, ((int)p2->p_addr)+i*NBPG),
888			   /*
889			    * The user area has to be mapped writable because
890			    * it contains the kernel stack (when CR0_WP is on
891			    * on a 486 there is no user-read/kernel-write
892			    * mode).  It is protected from user mode access
893			    * by the segment limits.
894			    */
895			   VM_PROT_READ|VM_PROT_WRITE, TRUE);
896#endif
897	pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb);
898
899	/*
901	 * Arrange for a non-local goto when the new process
902	 * is started, to resume here, returning nonzero from setjmp.
903	 */
904	if (savectx(up, 1)) {
905		/*
906		 * Return 1 in child.
907		 */
908		return (1);
909	}
910	return (0);
911}
912
913#ifdef notyet
914/*
915 * cpu_exit is called as the last action during exit.
916 *
917 * We change to an inactive address space and a "safe" stack,
918 * passing thru an argument to the new stack. Now, safely isolated
919 * from the resources we're shedding, we release the address space
920 * and any remaining machine-dependent resources, including the
921 * memory for the user structure and kernel stack.
922 *
923 * Next, we assign a dummy context to be written over by swtch,
924 * calling it to send this process off to oblivion.
925 * [The nullpcb allows us to minimize cost in mi_switch() by not having
926 * a special case].
927 */
928struct proc *swtch_to_inactive();
929volatile void
930cpu_exit(p)
931	register struct proc *p;
932{
933	static struct pcb nullpcb;	/* pcb to overwrite on last swtch */
934
935#if NNPX > 0
936	npxexit(p);
937#endif	/* NNPX */
938
939	/* move to inactive space and stack, passing arg across */
940	p = swtch_to_inactive(p);
941
942	/* drop per-process resources */
943	vmspace_free(p->p_vmspace);
944	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
945
946	p->p_addr = (struct user *) &nullpcb;
947	mi_switch();
948	/* NOTREACHED */
949}
950#else
951void
952cpu_exit(p)
953	register struct proc *p;
954{
955
956#if NNPX > 0
957	npxexit(p);
958#endif	/* NNPX */
959	curproc = p;
960	mi_switch();
961	/*
962	 * This is to shut up the compiler, and if swtch() failed I suppose
963	 * this would be a good thing.  This keeps gcc happy because panic
964	 * is a volatile void function as well.
965	 */
966	panic("cpu_exit");
967}
968
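/*
 * cpu_wait -- release what cpu_exit could not: the exited process's
 * U area (kernel stack and user structure) and its vmspace.  This has
 * to wait until here because the process is still running on that
 * stack when cpu_exit() is called.
 */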
969void
970cpu_wait(p)
	struct proc *p;
{
971/*	extern vm_map_t upages_map; */
972	extern char kstack[];
973
974	/* drop per-process resources */
975 	pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr,
976		((vm_offset_t) p->p_addr) + ctob(UPAGES));
977	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
978	vmspace_free(p->p_vmspace);
979}
980#endif
981
982/*
983 * Dump the machine specific header information at the start of a core dump.
984 */
985int
986cpu_coredump(p, vp, cred)
987	struct proc *p;
988	struct vnode *vp;
989	struct ucred *cred;
990{
991
992	return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES),
993	    (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL,
994	    p));
995}
996
997/*
998 * Set a red zone in the kernel stack after the u. area.
999 */
1000void
1001setredzone(pte, vaddr)
1002	u_short *pte;
1003	caddr_t vaddr;
1004{
1005/* eventually do this by setting up an expand-down stack segment
1006   for ss0: selector, allowing stack access down to top of u.
1007   this means though that protection violations need to be handled
1008   thru a double fault exception that must do an integral task
1009   switch to a known good context, within which a dump can be
1010   taken. a sensible scheme might be to save the initial context
1011   used by sched (that has physical memory mapped 1:1 at bottom)
1012   and take the dump while still in mapped mode */
1013}
1014
1015/*
1016 * Move pages from one kernel virtual address to another.
1017 * Both addresses are assumed to reside in the Sysmap,
1018 * and size must be a multiple of CLSIZE.
1019 */
1026
1027void
1028pagemove(from, to, size)
1029	register caddr_t from, to;
1030	int size;
1031{
1032	register vm_offset_t pa;
1033
1034	if (size & CLOFSET)
1035		panic("pagemove");
1036	while (size > 0) {
1037		pa = pmap_kextract((vm_offset_t)from);
1038		if (pa == 0)
1039			panic("pagemove 2");
1040		if (pmap_kextract((vm_offset_t)to) != 0)
1041			panic("pagemove 3");
1042		pmap_remove(kernel_pmap,
1043			    (vm_offset_t)from, (vm_offset_t)from + PAGE_SIZE);
1044		pmap_kenter( (vm_offset_t)to, pa);
1045		from += PAGE_SIZE;
1046		to += PAGE_SIZE;
1047		size -= PAGE_SIZE;
1048	}
1049	pmap_update();
1050}
1051
1052/*
1053 * Convert kernel VA to physical address
1054 */
1055u_long
1056kvtop(void *addr)
1057{
1058	vm_offset_t va;
1059
1060	va = pmap_kextract((vm_offset_t)addr);
1061	if (va == 0)
1062		panic("kvtop: zero page frame");
1063	return((u_long)va);
1064}
1065
1066extern vm_map_t phys_map;
1067
1068/*
1069 * Map an IO request into kernel virtual address space.
1070 *
1071 * All requests are (re)mapped into kernel VA space.
1072 * Notice that we use b_bufsize for the size of the buffer
1073 * to be mapped.  b_bcount might be modified by the driver.
1074 */
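/*
 * Each user page, and the page table page that maps it, is faulted in
 * and held so it cannot be paged out while raw I/O is in progress; the
 * pages are then double-mapped into phys_map kva, which is what the
 * driver sees in b_un.b_addr.  vunmapbuf() undoes all of this.
 */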
1075void
1076vmapbuf(bp)
1077	register struct buf *bp;
1078{
1079	register int npf;
1080	register caddr_t addr;
1081	int off;
1082	vm_offset_t kva;
1083	vm_offset_t pa, lastv, v;
1084
1085	if ((bp->b_flags & B_PHYS) == 0)
1086		panic("vmapbuf");
1087
1088	lastv = 0;
1089	for (addr = (caddr_t)trunc_page(bp->b_data);
1090		addr < bp->b_data + bp->b_bufsize;
1091		addr += PAGE_SIZE) {
1092
1093/*
1094 * make sure that the pde is valid and held
1095 */
1096		v = trunc_page(((vm_offset_t)vtopte(addr)));
1097		if (v != lastv) {
1098			vm_fault_quick(v, VM_PROT_READ);
1099			pa = pmap_extract(&curproc->p_vmspace->vm_pmap, v);
1100			vm_page_hold(PHYS_TO_VM_PAGE(pa));
1101			lastv = v;
1102		}
1103
1104/*
1105 * do the vm_fault if needed, do the copy-on-write thing when
1106 * reading stuff off device into memory.
1107 */
1108		vm_fault_quick(addr,
1109			(bp->b_flags&B_READ)?(VM_PROT_READ|VM_PROT_WRITE):VM_PROT_READ);
1110		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t) addr);
1111/*
1112 * hold the data page
1113 */
1114		vm_page_hold(PHYS_TO_VM_PAGE(pa));
1115	}
1116
1117	addr = bp->b_saveaddr = bp->b_un.b_addr;
1118	off = (int)addr & PGOFSET;
1119	npf = btoc(round_page(bp->b_bufsize + off));
1120	kva = kmem_alloc_wait(phys_map, ctob(npf));
1121	bp->b_un.b_addr = (caddr_t) (kva + off);
1122	while (npf--) {
1123		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t)addr);
1124		if (pa == 0)
1125			panic("vmapbuf: null page frame");
1126		pmap_kenter(kva, trunc_page(pa));
1127		addr += PAGE_SIZE;
1128		kva += PAGE_SIZE;
1129	}
1130	pmap_update();
1131}
1132
1133/*
1134 * Free the io map PTEs associated with this IO operation.
1135 * We also invalidate the TLB entries and restore the original b_addr.
1136 */
1137void
1138vunmapbuf(bp)
1139	register struct buf *bp;
1140{
1141	register int npf;
1142	register caddr_t addr = bp->b_un.b_addr;
1143	vm_offset_t kva,va,v,lastv,pa;
1144
1145	if ((bp->b_flags & B_PHYS) == 0)
1146		panic("vunmapbuf");
1147	npf = btoc(round_page(bp->b_bufsize + ((int)addr & PGOFSET)));
1148	kva = (vm_offset_t)((int)addr & ~PGOFSET);
1149	kmem_free_wakeup(phys_map, kva, ctob(npf));
1150	bp->b_un.b_addr = bp->b_saveaddr;
1151	bp->b_saveaddr = NULL;
1152
1153
1154/*
1155 * unhold the pde, and data pages
1156 */
1157	lastv = 0;
1158	for (addr = (caddr_t)trunc_page(bp->b_data);
1159		addr < bp->b_data + bp->b_bufsize;
1160		addr += NBPG) {
1161
1162	/*
1163	 * release the data page
1164	 */
1165		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t) addr);
1166		vm_page_unhold(PHYS_TO_VM_PAGE(pa));
1167
1168	/*
1169	 * and unhold the page table
1170	 */
1171		v = trunc_page(((vm_offset_t)vtopte(addr)));
1172		if (v != lastv) {
1173			pa = pmap_extract(&curproc->p_vmspace->vm_pmap, v);
1174			vm_page_unhold(PHYS_TO_VM_PAGE(pa));
1175			lastv = v;
1176		}
1177	}
1178}
1179
1180/*
1181 * Force reset the processor by invalidating the entire address space!
1182 */
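/*
 * With the page directory zeroed and the TLB flushed, the next
 * instruction fetch page-faults; the fault handlers are unreachable as
 * well, so the CPU ends up taking a triple fault, which resets the
 * machine.
 */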
1183void
1184cpu_reset() {
1185
1186	/* force a shutdown by unmapping entire address space ! */
1187	bzero((caddr_t) PTD, NBPG);
1188
1189	/* "good night, sweet prince .... <THUNK!>" */
1190	tlbflush();
1191	/* NOTREACHED */
1192	while(1);
1193}
1194
1195/*
1196 * Grow the user stack to allow for 'sp'. This version grows the stack in
1197 *	chunks of SGROWSIZ.
1198 */
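/*
 * Returns 0 only if the requested growth would exceed RLIMIT_STACK or
 * the allocation fails; addresses outside the stack range are accepted
 * (return 1) without growing anything.  Growing in SGROWSIZ chunks
 * means execve need not allocate the whole stack rlimit up front.
 */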
1199int
1200grow(p, sp)
1201	struct proc *p;
1202	u_int sp;
1203{
1204	unsigned int nss;
1205	caddr_t v;
1206	struct vmspace *vm = p->p_vmspace;
1207
1208	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
1209	    return (1);
1210
1211	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);
1212
1213	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
1214		return (0);
1215
1216	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
1217	    SGROWSIZ) < nss) {
1218		int grow_amount;
1219		/*
1220		 * If necessary, grow the VM that the stack occupies
1221		 * to allow for the rlimit. This allows us to not have
1222		 * to allocate all of the VM up-front in execve (which
1223		 * is expensive).
1224		 * Grow the VM by the amount requested rounded up to
1225		 * the nearest SGROWSIZ to provide for some hysteresis.
1226		 */
1227		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
1228		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
1229		    SGROWSIZ) - grow_amount;
1230		/*
1231		 * If there isn't enough room to extend by SGROWSIZ, then
1232		 * just extend to the maximum size
1233		 */
1234		if (v < vm->vm_maxsaddr) {
1235			v = vm->vm_maxsaddr;
1236			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
1237		}
1238		if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
1239		    grow_amount, FALSE) != KERN_SUCCESS) {
1240			return (0);
1241		}
1242		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
1243	}
1244
1245	return (1);
1246}
1247