/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)vm_machdep.c	7.3 (Berkeley) 5/13/91
 *	Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 *	$Id: vm_machdep.c,v 1.20 1994/04/20 07:06:20 davidg Exp $
 */

#include "npx.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/user.h>

#include <machine/cpu.h>

#include <vm/vm.h>
#include <vm/vm_kern.h>

#define b_cylin b_resid

caddr_t		bouncememory;
vm_offset_t	bouncepa, bouncepaend;
int		bouncepages, bpwait;
vm_map_t	io_map;
int		bmwait, bmfreeing;

#define BITS_IN_UNSIGNED (8*sizeof(unsigned))
int		bounceallocarraysize;
unsigned	*bounceallocarray;
int		bouncefree;

#define SIXTEENMEG (4096*4096)
#define MAXBKVA 512
int		maxbkva = MAXBKVA*NBPG;

/* special list that can be used at interrupt time for eventual kva free */
struct kvasfree {
	vm_offset_t addr;
	vm_offset_t size;
} kvaf[MAXBKVA];

int		kvasfreecnt;

vm_offset_t vm_bounce_kva();

/*
 * get bounce buffer pages (count physically contiguous)
 * (only 1 implemented now)
 */
vm_offset_t
vm_bounce_page_find(count)
	int count;
{
	int bit;
	int s, i;

	if (count != 1)
		panic("vm_bounce_page_find -- no support for > 1 page yet!!!");

	s = splbio();
retry:
	for (i = 0; i < bounceallocarraysize; i++) {
		if (bounceallocarray[i] != 0xffffffff) {
			if ((bit = ffs(~bounceallocarray[i]))) {
				bounceallocarray[i] |= 1 << (bit - 1);
				bouncefree -= count;
				splx(s);
				return bouncepa + (i * BITS_IN_UNSIGNED + (bit - 1)) * NBPG;
			}
		}
	}
	bpwait = 1;
	tsleep((caddr_t) &bounceallocarray, PRIBIO, "bncwai", 0);
	goto retry;
}

void
vm_bounce_kva_free(addr, size, now)
	vm_offset_t addr;
	vm_offset_t size;
	int now;
{
	int s = splbio();
	kvaf[kvasfreecnt].addr = addr;
	kvaf[kvasfreecnt++].size = size;
	if (now) {
		/*
		 * this will do wakeups
		 */
		vm_bounce_kva(0, 0);
	} else {
		if (bmwait) {
			/*
			 * if anyone is waiting on the bounce-map, then wakeup
			 */
			wakeup((caddr_t) io_map);
			bmwait = 0;
		}
	}
	splx(s);
}

/*
 * free count bounce buffer pages
 */
void
vm_bounce_page_free(pa, count)
	vm_offset_t pa;
	int count;
{
	int allocindex;
	int index;
	int bit;

	if (count != 1)
		panic("vm_bounce_page_free -- no support for > 1 page yet!!!\n");

	index = (pa - bouncepa) / NBPG;

	if ((index < 0) || (index >= bouncepages))
		panic("vm_bounce_page_free -- bad index\n");

	allocindex = index / BITS_IN_UNSIGNED;
	bit = index % BITS_IN_UNSIGNED;

	bounceallocarray[allocindex] &= ~(1 << bit);

	bouncefree += count;
	if (bpwait) {
		bpwait = 0;
		wakeup((caddr_t) &bounceallocarray);
	}
}
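
/*
 * XXX illustration of the bitmap arithmetic above (numbers hypothetical):
 * with NBPG == 4096, the page at bouncepa + 33*NBPG is tracked by bit 1 of
 * bounceallocarray[1].  vm_bounce_page_find() sees ffs() return 2 (1-based)
 * for that word and hands back bouncepa + (1 * 32 + 1) * NBPG, and
 * vm_bounce_page_free() recovers index 33, allocindex 1, bit 1 to clear it.
 */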
/*
 * allocate count bounce buffer kva pages
 */
vm_offset_t
vm_bounce_kva(count, waitok)
	int count;
	int waitok;
{
	int tofree;
	int i;
	int startfree;
	vm_offset_t kva = 0;
	int s = splbio();
	int size = count;
	startfree = 0;
more:
	if (!bmfreeing && (tofree = kvasfreecnt)) {
		bmfreeing = 1;
		for (i = startfree; i < kvasfreecnt; i++) {
			/*
			 * if we have a kva of the right size, no sense
			 * in freeing/reallocating...
			 * might affect fragmentation short term, but
			 * as long as the amount of io_map is
			 * significantly more than the maximum transfer
			 * size, I don't think that it is a problem.
			 */
			pmap_remove(kernel_pmap,
			    kvaf[i].addr, kvaf[i].addr + kvaf[i].size);
			if (size && !kva && kvaf[i].size == size) {
				kva = kvaf[i].addr;
			} else {
				kmem_free_wakeup(io_map, kvaf[i].addr,
				    kvaf[i].size);
			}
		}
		if (kvasfreecnt != tofree) {
			startfree = i;
			bmfreeing = 0;
			goto more;
		}
		kvasfreecnt = 0;
		bmfreeing = 0;
	}

	if (size == 0) {
		splx(s);
		return NULL;
	}

	if (!kva && !(kva = kmem_alloc_pageable(io_map, size))) {
		if (!waitok) {
			splx(s);
			return NULL;
		}
		bmwait = 1;
		tsleep((caddr_t) io_map, PRIBIO, "bmwait", 0);
		goto more;
	}
	splx(s);

	return kva;
}

/*
 * same as vm_bounce_kva -- but really allocate
 */
vm_offset_t
vm_bounce_kva_alloc(count)
	int count;
{
	int i;
	vm_offset_t kva;
	vm_offset_t pa;
	if (bouncepages == 0) {
		kva = (vm_offset_t) malloc(count * NBPG, M_TEMP, M_WAITOK);
		return kva;
	}
	kva = vm_bounce_kva(count, 1);
	for (i = 0; i < count; i++) {
		pa = vm_bounce_page_find(1);
		pmap_kenter(kva + i * NBPG, pa);
	}
	pmap_update();
	return kva;
}

/*
 * same as vm_bounce_kva_free -- but really free
 */
void
vm_bounce_kva_alloc_free(kva, count)
	vm_offset_t kva;
	int count;
{
	int i;
	vm_offset_t pa;
	if (bouncepages == 0) {
		free((caddr_t) kva, M_TEMP);
		return;
	}
	for (i = 0; i < count; i++) {
		pa = pmap_kextract(kva + i * NBPG);
		vm_bounce_page_free(pa, 1);
	}
	vm_bounce_kva_free(kva, count);
}
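
/*
 * XXX hypothetical usage sketch, not a call site in this file: a driver
 * that wants a long-lived DMA-safe staging area below 16MB could pair the
 * two routines above as
 *
 *	vm_offset_t kva = vm_bounce_kva_alloc(4);
 *	... hand the controller physical addresses within kva ...
 *	vm_bounce_kva_alloc_free(kva, 4);
 *
 * When no bounce pages are configured (bouncepages == 0) the pair
 * degenerates to a plain malloc/free of ordinary kernel memory.
 */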
/*
 * do the things necessary to the struct buf to implement
 * bounce buffers...  inserted before the disk sort
 */
void
vm_bounce_alloc(bp)
	struct buf *bp;
{
	int countvmpg;
	vm_offset_t vastart, vaend;
	vm_offset_t vapstart, vapend;
	vm_offset_t va, kva;
	vm_offset_t pa;
	int dobounceflag = 0;
	int bounceindex;
	int i;
	int s;

	if (bouncepages == 0)
		return;

	if (bp->b_bufsize < bp->b_bcount) {
		printf("vm_bounce_alloc: b_bufsize(%d) < b_bcount(%d) !!!!\n",
		    bp->b_bufsize, bp->b_bcount);
		bp->b_bufsize = bp->b_bcount;
	}

	vastart = (vm_offset_t) bp->b_data;
	vaend = (vm_offset_t) bp->b_data + bp->b_bufsize;

	vapstart = i386_trunc_page(vastart);
	vapend = i386_round_page(vaend);
	countvmpg = (vapend - vapstart) / NBPG;

/*
 * if any page is above 16MB, then go into bounce-buffer mode
 */
	va = vapstart;
	for (i = 0; i < countvmpg; i++) {
		pa = pmap_kextract(va);
		if (pa >= SIXTEENMEG)
			++dobounceflag;
		va += NBPG;
	}
	if (dobounceflag == 0)
		return;

	if (bouncepages < dobounceflag)
		panic("Not enough bounce buffers!!!");

/*
 * allocate a replacement kva for b_addr
 */
	kva = vm_bounce_kva(countvmpg * NBPG, 1);
	va = vapstart;
	for (i = 0; i < countvmpg; i++) {
		pa = pmap_kextract(va);
		if (pa >= SIXTEENMEG) {
			/*
			 * allocate a replacement page
			 */
			vm_offset_t bpa = vm_bounce_page_find(1);
			pmap_kenter(kva + (NBPG * i), bpa);
			/*
			 * if we are writing, then copy the data into the page
			 */
			if ((bp->b_flags & B_READ) == 0) {
				pmap_update();
				bcopy((caddr_t) va, (caddr_t) kva + (NBPG * i), NBPG);
			}
		} else {
			/*
			 * use original page
			 */
			pmap_kenter(kva + (NBPG * i), pa);
		}
		va += NBPG;
	}
	pmap_update();

/*
 * flag the buffer as being bounced
 */
	bp->b_flags |= B_BOUNCE;
/*
 * save the original buffer kva
 */
	bp->b_savekva = bp->b_data;
/*
 * put our new kva into the buffer (offset by original offset)
 */
	bp->b_data = (caddr_t) (((vm_offset_t) kva) |
	    ((vm_offset_t) bp->b_savekva & (NBPG - 1)));
	return;
}
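
/*
 * XXX worked example of the final remapping above (addresses hypothetical):
 * with NBPG == 4096, an original b_data of 0xfe001200 carries sub-page
 * offset 0x200.  Since vm_bounce_kva() returns page-aligned kva, oring the
 * offset back in gives a new b_data of, say, 0xfd800000 | 0x200 ==
 * 0xfd800200: byte offsets within the buffer are preserved while every
 * page beneath it is now DMA-reachable below 16MB.
 */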
/*
 * hook into biodone to free bounce buffer
 */
void
vm_bounce_free(bp)
	struct buf *bp;
{
	int i;
	vm_offset_t origkva, bouncekva;
	vm_offset_t vastart, vaend;
	vm_offset_t vapstart, vapend;
	int countbounce = 0;
	vm_offset_t firstbouncepa = 0;
	int firstbounceindex;
	int countvmpg;
	vm_offset_t bcount;
	int s;

/*
 * if this isn't a bounced buffer, then just return
 */
	if ((bp->b_flags & B_BOUNCE) == 0)
		return;

	origkva = (vm_offset_t) bp->b_savekva;
	bouncekva = (vm_offset_t) bp->b_data;

	vastart = bouncekva;
	vaend = bouncekva + bp->b_bufsize;
	bcount = bp->b_bufsize;

	vapstart = i386_trunc_page(vastart);
	vapend = i386_round_page(vaend);

	countvmpg = (vapend - vapstart) / NBPG;

/*
 * check every page in the kva space for b_addr
 */
	for (i = 0; i < countvmpg; i++) {
		vm_offset_t mybouncepa;
		vm_offset_t copycount;

		copycount = i386_round_page(bouncekva + 1) - bouncekva;
		mybouncepa = pmap_kextract(i386_trunc_page(bouncekva));

/*
 * if this is a bounced pa, then process as one
 */
		if ((mybouncepa >= bouncepa) && (mybouncepa < bouncepaend)) {
			if (copycount > bcount)
				copycount = bcount;
/*
 * if this is a read, then copy from bounce buffer into original buffer
 */
			if (bp->b_flags & B_READ)
				bcopy((caddr_t) bouncekva, (caddr_t) origkva, copycount);
/*
 * free the bounce allocation
 */
			vm_bounce_page_free(i386_trunc_page(mybouncepa), 1);
		}

		origkva += copycount;
		bouncekva += copycount;
		bcount -= copycount;
	}

/*
 * add the old kva into the "to free" list
 */
	bouncekva = i386_trunc_page((vm_offset_t) bp->b_data);
	vm_bounce_kva_free(bouncekva, countvmpg * NBPG, 0);
	bp->b_data = bp->b_savekva;
	bp->b_savekva = 0;
	bp->b_flags &= ~B_BOUNCE;

	return;
}

/*
 * init the bounce buffer system
 */
void
vm_bounce_init()
{
	vm_offset_t minaddr, maxaddr;

	kvasfreecnt = 0;

	if (bouncepages == 0)
		return;

	bounceallocarraysize = (bouncepages + BITS_IN_UNSIGNED - 1) / BITS_IN_UNSIGNED;
	bounceallocarray = malloc(bounceallocarraysize * sizeof(unsigned), M_TEMP, M_NOWAIT);

	if (!bounceallocarray)
		panic("Cannot allocate bounce resource array\n");

	bzero(bounceallocarray, bounceallocarraysize * sizeof(unsigned));

	bouncepa = pmap_kextract((vm_offset_t) bouncememory);
	bouncepaend = bouncepa + bouncepages * NBPG;
	bouncefree = bouncepages;
}


#ifdef BROKEN_IN_44
static void
cldiskvamerge(kvanew, orig1, orig1cnt, orig2, orig2cnt)
	vm_offset_t kvanew;
	vm_offset_t orig1, orig1cnt;
	vm_offset_t orig2, orig2cnt;
{
	int i;
	vm_offset_t pa;
/*
 * enter the transfer physical addresses into the new kva
 */
	for (i = 0; i < orig1cnt; i++) {
		vm_offset_t pa;
		pa = pmap_kextract((caddr_t) orig1 + i * PAGE_SIZE);
		pmap_kenter(kvanew + i * PAGE_SIZE, pa);
	}

	for (i = 0; i < orig2cnt; i++) {
		vm_offset_t pa;
		pa = pmap_kextract((caddr_t) orig2 + i * PAGE_SIZE);
		pmap_kenter(kvanew + (i + orig1cnt) * PAGE_SIZE, pa);
	}
	pmap_update();
}
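
/*
 * XXX sketch of what cldiskvamerge() produces (layout illustrative only):
 *
 *	orig1 kva -> P0 P1        orig2 kva -> Q0 Q1 Q2
 *	kvanew    -> P0 P1 Q0 Q1 Q2
 *
 * the two transfers become virtually contiguous for the controller
 * without copying any data; only new PTEs are written.
 */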
void
cldisksort(struct buf *dp, struct buf *bp, vm_offset_t maxio)
{
	register struct buf *ap, *newbp;
	int i, trycount = 0;
	vm_offset_t orig1pages, orig2pages;
	vm_offset_t orig1begin, orig2begin;
	vm_offset_t kvanew, kvaorig;

	if (bp->b_bcount < MAXCLSTATS * PAGE_SIZE)
		++rqstats[bp->b_bcount / PAGE_SIZE];
	/*
	 * If nothing on the activity queue, then
	 * we become the only thing.
	 */
	ap = dp->b_actf;
	if (ap == NULL) {
		dp->b_actf = bp;
		dp->b_actl = bp;
		bp->av_forw = NULL;
		return;
	}

	/*
	 * If we lie after the first (currently active)
	 * request, then we must locate the second request list
	 * and add ourselves to it.
	 */

	if (bp->b_pblkno < ap->b_pblkno) {
		while (ap->av_forw) {
			/*
			 * Check for an ``inversion'' in the
			 * normally ascending block numbers,
			 * indicating the start of the second request list.
			 */
			if (ap->av_forw->b_pblkno < ap->b_pblkno) {
				/*
				 * Search the second request list
				 * for the first request at a larger
				 * block number.  We go before that;
				 * if there is no such request, we go at end.
				 */
				do {
					if (bp->b_pblkno < ap->av_forw->b_pblkno)
						goto insert;
					ap = ap->av_forw;
				} while (ap->av_forw);
				goto insert;		/* after last */
			}
			ap = ap->av_forw;
		}
		/*
		 * No inversions... we will go after the last, and
		 * be the first request in the second request list.
		 */
		goto insert;
	}
	/*
	 * Request is at/after the current request...
	 * sort in the first request list.
	 */
	while (ap->av_forw) {
		/*
		 * We want to go after the current request
		 * if there is an inversion after it (i.e. it is
		 * the end of the first request list), or if
		 * the next request is a larger block than our request.
		 */
		if (ap->av_forw->b_pblkno < ap->b_pblkno ||
		    bp->b_pblkno < ap->av_forw->b_pblkno)
			goto insert;
		ap = ap->av_forw;
	}

insert:

	/*
	 * read clustering with new read-ahead disk drives hurts mostly, so
	 * we don't bother...
	 */
	if (bp->b_flags & B_READ)
		goto nocluster;
	/*
	 * we currently only cluster I/O transfers that are at page-aligned
	 * kvas and transfers that are multiples of page lengths.
	 */
	if ((bp->b_flags & B_BAD) == 0 &&
	    ((bp->b_bcount & PAGE_MASK) == 0) &&
	    (((vm_offset_t) bp->b_un.b_addr & PAGE_MASK) == 0)) {
		if (maxio > MAXCLSTATS * PAGE_SIZE)
			maxio = MAXCLSTATS * PAGE_SIZE;
		/*
		 * merge with previous?
		 * conditions:
		 *	1) We reside physically immediately after the previous block.
		 *	2) The previous block is not first on the device queue because
		 *	   such a block might be active.
		 *	3) The mode of the two I/Os is identical.
		 *	4) The previous kva is page aligned and the previous transfer
		 *	   is a multiple of a page in length.
		 *	5) And the total I/O size would be below the maximum.
		 */
		if ((ap->b_pblkno + (ap->b_bcount / DEV_BSIZE) == bp->b_pblkno) &&
		    (dp->b_actf != ap) &&
		    ((ap->b_flags & ~B_CLUSTER) == bp->b_flags) &&
		    ((ap->b_flags & B_BAD) == 0) &&
		    ((ap->b_bcount & PAGE_MASK) == 0) &&
		    (((vm_offset_t) ap->b_un.b_addr & PAGE_MASK) == 0) &&
		    (ap->b_bcount + bp->b_bcount < maxio)) {

			orig1begin = (vm_offset_t) ap->b_un.b_addr;
			orig1pages = ap->b_bcount / PAGE_SIZE;

			orig2begin = (vm_offset_t) bp->b_un.b_addr;
			orig2pages = bp->b_bcount / PAGE_SIZE;
			/*
			 * see if we can allocate a kva; if we cannot, then
			 * don't cluster.
			 */
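			/*
			 * XXX note: waitok is 0 below, presumably because
			 * cldisksort runs in the disk strategy path (often
			 * at splbio), where sleeping for kva would not be
			 * safe; on failure we simply fall back to not
			 * clustering.
			 */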
			kvanew = vm_bounce_kva(PAGE_SIZE * (orig1pages + orig2pages), 0);
			if (!kvanew) {
				goto nocluster;
			}

			if ((ap->b_flags & B_CLUSTER) == 0) {

				/*
				 * get a physical buf pointer
				 */
				newbp = (struct buf *)trypbuf();
				if (!newbp) {
					vm_bounce_kva_free(kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
					goto nocluster;
				}

				cldiskvamerge(kvanew, orig1begin, orig1pages, orig2begin, orig2pages);

				/*
				 * build the new bp to be handed off to the device
				 */

				--clstats[ap->b_bcount / PAGE_SIZE];
				*newbp = *ap;
				newbp->b_flags |= B_CLUSTER;
				newbp->b_un.b_addr = (caddr_t) kvanew;
				newbp->b_bcount += bp->b_bcount;
				newbp->b_bufsize = newbp->b_bcount;
				newbp->b_clusterf = ap;
				newbp->b_clusterl = bp;
				++clstats[newbp->b_bcount / PAGE_SIZE];

				/*
				 * enter the new bp onto the device queue
				 */
				if (ap->av_forw)
					ap->av_forw->av_back = newbp;
				else
					dp->b_actl = newbp;

				if (dp->b_actf != ap)
					ap->av_back->av_forw = newbp;
				else
					dp->b_actf = newbp;

				/*
				 * enter the previous bps onto the cluster queue
				 */
				ap->av_forw = bp;
				bp->av_back = ap;

				ap->av_back = NULL;
				bp->av_forw = NULL;

			} else {
				vm_offset_t addr;

				cldiskvamerge(kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
				/*
				 * free the old kva
				 */
				vm_bounce_kva_free(orig1begin, ap->b_bufsize, 0);
				--clstats[ap->b_bcount / PAGE_SIZE];

				ap->b_un.b_addr = (caddr_t) kvanew;

				ap->b_clusterl->av_forw = bp;
				bp->av_forw = NULL;
				bp->av_back = ap->b_clusterl;
				ap->b_clusterl = bp;

				ap->b_bcount += bp->b_bcount;
				ap->b_bufsize = ap->b_bcount;
				++clstats[ap->b_bcount / PAGE_SIZE];
			}
			return;
		/*
		 * merge with next?
		 * conditions:
		 *	1) We reside physically before the next block.
		 *	3) The mode of the two I/Os is identical.
		 *	4) The next kva is page aligned and the next transfer
		 *	   is a multiple of a page in length.
		 *	5) And the total I/O size would be below the maximum.
		 */
		} else if (ap->av_forw &&
		    (bp->b_pblkno + (bp->b_bcount / DEV_BSIZE) == ap->av_forw->b_pblkno) &&
		    (bp->b_flags == (ap->av_forw->b_flags & ~B_CLUSTER)) &&
		    ((ap->av_forw->b_flags & B_BAD) == 0) &&
		    ((ap->av_forw->b_bcount & PAGE_MASK) == 0) &&
		    (((vm_offset_t) ap->av_forw->b_un.b_addr & PAGE_MASK) == 0) &&
		    (ap->av_forw->b_bcount + bp->b_bcount < maxio)) {

			orig1begin = (vm_offset_t) bp->b_un.b_addr;
			orig1pages = bp->b_bcount / PAGE_SIZE;

			orig2begin = (vm_offset_t) ap->av_forw->b_un.b_addr;
			orig2pages = ap->av_forw->b_bcount / PAGE_SIZE;

			/*
			 * see if we can allocate a kva; if we cannot, then
			 * don't cluster.
			 */
			kvanew = vm_bounce_kva(PAGE_SIZE * (orig1pages + orig2pages), 0);
			if (!kvanew) {
				goto nocluster;
			}

			/*
			 * if next isn't a cluster we need to create one
			 */
			if ((ap->av_forw->b_flags & B_CLUSTER) == 0) {

				/*
				 * get a physical buf pointer
				 */
				newbp = (struct buf *)trypbuf();
				if (!newbp) {
					vm_bounce_kva_free(kvanew, PAGE_SIZE * (orig1pages + orig2pages), 1);
					goto nocluster;
				}

				cldiskvamerge(kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
				ap = ap->av_forw;
				--clstats[ap->b_bcount / PAGE_SIZE];
				*newbp = *ap;
				newbp->b_flags |= B_CLUSTER;
				newbp->b_un.b_addr = (caddr_t) kvanew;
				newbp->b_blkno = bp->b_blkno;
				newbp->b_pblkno = bp->b_pblkno;
				newbp->b_bcount += bp->b_bcount;
				newbp->b_bufsize = newbp->b_bcount;
				newbp->b_clusterf = bp;
				newbp->b_clusterl = ap;
				++clstats[newbp->b_bcount / PAGE_SIZE];

				if (ap->av_forw)
					ap->av_forw->av_back = newbp;
				else
					dp->b_actl = newbp;

				if (dp->b_actf != ap)
					ap->av_back->av_forw = newbp;
				else
					dp->b_actf = newbp;

				bp->av_forw = ap;
				ap->av_back = bp;

				bp->av_back = NULL;
				ap->av_forw = NULL;
			} else {
				vm_offset_t addr;

				cldiskvamerge(kvanew, orig1begin, orig1pages, orig2begin, orig2pages);
				ap = ap->av_forw;
				vm_bounce_kva_free(orig2begin, ap->b_bufsize, 0);

				ap->b_un.b_addr = (caddr_t) kvanew;
				bp->av_forw = ap->b_clusterf;
				ap->b_clusterf->av_back = bp;
				ap->b_clusterf = bp;
				bp->av_back = NULL;
				--clstats[ap->b_bcount / PAGE_SIZE];

				ap->b_blkno = bp->b_blkno;
				ap->b_pblkno = bp->b_pblkno;
				ap->b_bcount += bp->b_bcount;
				ap->b_bufsize = ap->b_bcount;
				++clstats[ap->b_bcount / PAGE_SIZE];
			}
			return;
		}
	}
	/*
	 * don't merge
	 */
nocluster:
	++clstats[bp->b_bcount / PAGE_SIZE];
	bp->av_forw = ap->av_forw;
	if (bp->av_forw)
		bp->av_forw->av_back = bp;
	else
		dp->b_actl = bp;

	ap->av_forw = bp;
	bp->av_back = ap;
}
#endif

/*
 * quick version of vm_fault
 */
void
vm_fault_quick(v, prot)
	vm_offset_t v;
	int prot;
{
	if ((cpu_class == CPUCLASS_386) &&
	    (prot & VM_PROT_WRITE))
		vm_fault(&curproc->p_vmspace->vm_map, v,
		    VM_PROT_READ|VM_PROT_WRITE, FALSE);
	else if (prot & VM_PROT_WRITE)
		*(volatile char *)v += 0;
	else
		*(volatile char *)v;
}
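
/*
 * XXX why the three cases above differ: in supervisor mode a 386 ignores
 * page-level write protection, so the "*(volatile char *)v += 0" write
 * touch cannot force a copy-on-write fault there and an explicit
 * vm_fault() is required.  On a 486 with CR0_WP set, the write touch
 * faults normally, and for read access a plain dereference is enough to
 * fault the page in.
 */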
/*
 * Finish a fork operation, with process p2 nearly set up.
 * Copy and update the kernel stack and pcb, making the child
 * ready to run, and marking it so that it can return differently
 * than the parent.  Returns 1 in the child process, 0 in the parent.
 * We currently double-map the user area so that the stack is at the same
 * address in each process; in the future we will probably relocate
 * the frame pointers on the stack after copying.
 */
int
cpu_fork(p1, p2)
	register struct proc *p1, *p2;
{
	register struct user *up = p2->p_addr;
	int foo, offset, addr, i;
	extern char kstack[];
	extern int mvesp();

	/*
	 * Copy pcb and stack from proc p1 to p2.
	 * We do this as cheaply as possible, copying only the active
	 * part of the stack.  The stack and pcb need to agree;
	 * this is tricky, as the final pcb is constructed by savectx,
	 * but its frame isn't yet on the stack when the stack is copied.
	 * swtch compensates for this when the child eventually runs.
	 * This should be done differently, with a single call
	 * that copies and updates the pcb+stack,
	 * replacing the bcopy and savectx.
	 */
	p2->p_addr->u_pcb = p1->p_addr->u_pcb;
	offset = mvesp() - (int)kstack;
	bcopy((caddr_t)kstack + offset, (caddr_t)p2->p_addr + offset,
	    (unsigned) ctob(UPAGES) - offset);
	p2->p_md.md_regs = p1->p_md.md_regs;

	/*
	 * Wire top of address space of child to its kstack.
	 * First, fault in a page of pte's to map it.
	 */
#if 0
	addr = trunc_page((u_int)vtopte(kstack));
	vm_map_pageable(&p2->p_vmspace->vm_map, addr, addr + NBPG, FALSE);
	for (i = 0; i < UPAGES; i++)
		pmap_enter(&p2->p_vmspace->vm_pmap, kstack + i * NBPG,
		    pmap_extract(kernel_pmap, ((int)p2->p_addr) + i * NBPG),
		    /*
		     * The user area has to be mapped writable because
		     * it contains the kernel stack (when CR0_WP is on
		     * on a 486 there is no user-read/kernel-write
		     * mode).  It is protected from user mode access
		     * by the segment limits.
		     */
		    VM_PROT_READ|VM_PROT_WRITE, TRUE);
#endif
	pmap_activate(&p2->p_vmspace->vm_pmap, &up->u_pcb);

	/*
	 * Arrange for a non-local goto when the new process
	 * is started, to resume here, returning nonzero from setjmp.
	 */
	if (savectx(up, 1)) {
		/*
		 * Return 1 in child.
		 */
		return (1);
	}
	return (0);
}

#ifdef notyet
/*
 * cpu_exit is called as the last action during exit.
 *
 * We change to an inactive address space and a "safe" stack,
 * passing thru an argument to the new stack.  Now, safely isolated
 * from the resources we're shedding, we release the address space
 * and any remaining machine-dependent resources, including the
 * memory for the user structure and kernel stack.
 *
 * Next, we assign a dummy context to be written over by swtch,
 * calling it to send this process off to oblivion.
 * [The nullpcb allows us to minimize cost in mi_switch() by not having
 * a special case].
 */
struct proc *swtch_to_inactive();
volatile void
cpu_exit(p)
	register struct proc *p;
{
	static struct pcb nullpcb;	/* pcb to overwrite on last swtch */

#if NNPX > 0
	npxexit(p);
#endif	/* NNPX */

	/* move to inactive space and stack, passing arg across */
	p = swtch_to_inactive(p);

	/* drop per-process resources */
	vmspace_free(p->p_vmspace);
	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));

	p->p_addr = (struct user *) &nullpcb;
	mi_switch();
	/* NOTREACHED */
}
#else
void
cpu_exit(p)
	register struct proc *p;
{

#if NNPX > 0
	npxexit(p);
#endif	/* NNPX */
	curproc = p;
	mi_switch();
	/*
	 * This is to shut up the compiler, and if swtch() failed I suppose
	 * this would be a good thing.  This keeps gcc happy because panic
	 * is a volatile void function as well.
	 */
	panic("cpu_exit");
}

void
cpu_wait(p)
	struct proc *p;
{
/*	extern vm_map_t upages_map; */
	extern char kstack[];

	/* drop per-process resources */
	pmap_remove(vm_map_pmap(kernel_map), (vm_offset_t) p->p_addr,
	    ((vm_offset_t) p->p_addr) + ctob(UPAGES));
	kmem_free(kernel_map, (vm_offset_t)p->p_addr, ctob(UPAGES));
	vmspace_free(p->p_vmspace);
}
#endif
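
/*
 * XXX on the split above: cpu_exit cannot free the u. area and kernel
 * stack it is still executing on, so those pages and the vmspace are
 * reclaimed afterwards in cpu_wait, which runs in the context of the
 * parent's wait(2) once the dead process can no longer be scheduled.
 */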
/*
 * Dump the machine specific header information at the start of a core dump.
 */
int
cpu_coredump(p, vp, cred)
	struct proc *p;
	struct vnode *vp;
	struct ucred *cred;
{

	return (vn_rdwr(UIO_WRITE, vp, (caddr_t) p->p_addr, ctob(UPAGES),
	    (off_t)0, UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *)NULL,
	    p));
}

/*
 * Set a red zone in the kernel stack after the u. area.
 */
void
setredzone(pte, vaddr)
	u_short *pte;
	caddr_t vaddr;
{
/* eventually do this by setting up an expand-down stack segment
   for ss0: selector, allowing stack access down to top of u.
   this means though that protection violations need to be handled
   thru a double fault exception that must do an integral task
   switch to a known good context, within which a dump can be
   taken.  a sensible scheme might be to save the initial context
   used by sched (that has physical memory mapped 1:1 at bottom)
   and take the dump while still in mapped mode */
}

/*
 * Move pages from one kernel virtual address to another.
 * Both addresses are assumed to reside in the Sysmap,
 * and size must be a multiple of CLSIZE.
 */
void
pagemove(from, to, size)
	register caddr_t from, to;
	int size;
{
	register vm_offset_t pa;

	if (size & CLOFSET)
		panic("pagemove");
	while (size > 0) {
		pa = pmap_kextract((vm_offset_t)from);
		if (pa == 0)
			panic("pagemove 2");
		if (pmap_kextract((vm_offset_t)to) != 0)
			panic("pagemove 3");
		pmap_remove(kernel_pmap,
		    (vm_offset_t)from, (vm_offset_t)from + PAGE_SIZE);
		pmap_kenter((vm_offset_t)to, pa);
		from += PAGE_SIZE;
		to += PAGE_SIZE;
		size -= PAGE_SIZE;
	}
	pmap_update();
}

/*
 * Convert kernel VA to physical address
 */
u_long
kvtop(void *addr)
{
	vm_offset_t va;

	va = pmap_kextract((vm_offset_t)addr);
	if (va == 0)
		panic("kvtop: zero page frame");
	return ((int)va);
}
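
/*
 * XXX usage note: kvtop() is intended for driver code that must hand a
 * controller a physical address for a kernel buffer, e.g. (hypothetical
 * call, not from this file):
 *
 *	u_long pa = kvtop(bp->b_un.b_addr);
 *
 * It may only be given kernel virtual addresses that are currently
 * mapped, since pmap_kextract() of an unmapped va panics here.
 */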
extern vm_map_t phys_map;

/*
 * Map an IO request into kernel virtual address space.
 *
 * All requests are (re)mapped into kernel VA space.
 * Notice that we use b_bufsize for the size of the buffer
 * to be mapped.  b_bcount might be modified by the driver.
 */
void
vmapbuf(bp)
	register struct buf *bp;
{
	register int npf;
	register caddr_t addr;
	int off;
	vm_offset_t kva;
	vm_offset_t pa, lastv, v;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vmapbuf");

	lastv = 0;
	for (addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += PAGE_SIZE) {

/*
 * make sure that the pde is valid and held
 */
		v = trunc_page(((vm_offset_t)vtopte(addr)));
		if (v != lastv) {
			vm_fault_quick(v, VM_PROT_READ);
			pa = pmap_extract(&curproc->p_vmspace->vm_pmap, v);
			vm_page_hold(PHYS_TO_VM_PAGE(pa));
			lastv = v;
		}

/*
 * do the vm_fault if needed, do the copy-on-write thing when
 * reading stuff off device into memory.
 */
		vm_fault_quick(addr,
		    (bp->b_flags & B_READ) ? (VM_PROT_READ|VM_PROT_WRITE) : VM_PROT_READ);
		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t) addr);
/*
 * hold the data page
 */
		vm_page_hold(PHYS_TO_VM_PAGE(pa));
	}

	addr = bp->b_saveaddr = bp->b_un.b_addr;
	off = (int)addr & PGOFSET;
	npf = btoc(round_page(bp->b_bufsize + off));
	kva = kmem_alloc_wait(phys_map, ctob(npf));
	bp->b_un.b_addr = (caddr_t) (kva + off);
	while (npf--) {
		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t)addr);
		if (pa == 0)
			panic("vmapbuf: null page frame");
		pmap_kenter(kva, trunc_page(pa));
		addr += PAGE_SIZE;
		kva += PAGE_SIZE;
	}
	pmap_update();
}

/*
 * Free the io map PTEs associated with this IO operation.
 * We also invalidate the TLB entries and restore the original b_addr.
 */
void
vunmapbuf(bp)
	register struct buf *bp;
{
	register int npf;
	register caddr_t addr = bp->b_un.b_addr;
	vm_offset_t kva, va, v, lastv, pa;

	if ((bp->b_flags & B_PHYS) == 0)
		panic("vunmapbuf");
	npf = btoc(round_page(bp->b_bufsize + ((int)addr & PGOFSET)));
	kva = (vm_offset_t)((int)addr & ~PGOFSET);
	kmem_free_wakeup(phys_map, kva, ctob(npf));
	bp->b_un.b_addr = bp->b_saveaddr;
	bp->b_saveaddr = NULL;

/*
 * unhold the pde, and data pages
 */
	lastv = 0;
	for (addr = (caddr_t)trunc_page(bp->b_data);
	    addr < bp->b_data + bp->b_bufsize;
	    addr += NBPG) {

		/*
		 * release the data page
		 */
		pa = pmap_extract(&curproc->p_vmspace->vm_pmap, (vm_offset_t) addr);
		vm_page_unhold(PHYS_TO_VM_PAGE(pa));

		/*
		 * and unhold the page table
		 */
		v = trunc_page(((vm_offset_t)vtopte(addr)));
		if (v != lastv) {
			pa = pmap_extract(&curproc->p_vmspace->vm_pmap, v);
			vm_page_unhold(PHYS_TO_VM_PAGE(pa));
			lastv = v;
		}
	}
}

/*
 * Force reset the processor by invalidating the entire address space!
 */
void
cpu_reset()
{

	/* force a shutdown by unmapping entire address space ! */
	bzero((caddr_t) PTD, NBPG);

	/* "good night, sweet prince .... <THUNK!>" */
	tlbflush();
	/* NOTREACHED */
	while (1);
}
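
/*
 * XXX how the above works: zeroing the page directory destroys every
 * mapping, so once the TLB is flushed the next instruction fetch page
 * faults; with the IDT and fault handler unreachable through the now
 * empty page tables, the fault escalates to a triple fault and the
 * processor resets itself.  The final loop only guards against a CPU
 * that lingers.
 */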
/*
 * Grow the user stack to allow for 'sp'. This version grows the stack in
 * chunks of SGROWSIZ.
 */
int
grow(p, sp)
	struct proc *p;
	u_int sp;
{
	unsigned int nss;
	caddr_t v;
	struct vmspace *vm = p->p_vmspace;

	if ((caddr_t)sp <= vm->vm_maxsaddr || (unsigned)sp >= (unsigned)USRSTACK)
		return (1);

	nss = roundup(USRSTACK - (unsigned)sp, PAGE_SIZE);

	if (nss > p->p_rlimit[RLIMIT_STACK].rlim_cur)
		return (0);

	if (vm->vm_ssize && roundup(vm->vm_ssize << PAGE_SHIFT,
	    SGROWSIZ) < nss) {
		int grow_amount;
		/*
		 * If necessary, grow the VM that the stack occupies
		 * to allow for the rlimit. This allows us to not have
		 * to allocate all of the VM up-front in execve (which
		 * is expensive).
		 * Grow the VM by the amount requested rounded up to
		 * the nearest SGROWSIZ to provide for some hysteresis.
		 */
		grow_amount = roundup((nss - (vm->vm_ssize << PAGE_SHIFT)), SGROWSIZ);
		v = (char *)USRSTACK - roundup(vm->vm_ssize << PAGE_SHIFT,
		    SGROWSIZ) - grow_amount;
		/*
		 * If there isn't enough room to extend by SGROWSIZ, then
		 * just extend to the maximum size.
		 */
		if (v < vm->vm_maxsaddr) {
			v = vm->vm_maxsaddr;
			grow_amount = MAXSSIZ - (vm->vm_ssize << PAGE_SHIFT);
		}
		if (vm_allocate(&vm->vm_map, (vm_offset_t *)&v,
		    grow_amount, FALSE) != KERN_SUCCESS) {
			return (0);
		}
		vm->vm_ssize += grow_amount >> PAGE_SHIFT;
	}

	return (1);
}
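
/*
 * XXX worked example (assuming SGROWSIZ of 128K and 4K pages; the numbers
 * are hypothetical): with vm_ssize at 64 pages (256K) and a reference at
 * sp == USRSTACK - 300K, nss rounds up to 300K.  roundup(256K, SGROWSIZ)
 * is 256K < 300K, so grow_amount = roundup(300K - 256K, SGROWSIZ) = 128K
 * and the stack becomes 384K, leaving slack before the next grow.  Callers
 * in the page-fault path would typically treat a 0 return as exceeding the
 * stack rlimit and deliver SIGSEGV.
 */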