1/* 2 * Copyright (c) 2009 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28#include <mach_assert.h> 29#include <sys/errno.h> 30#include <i386/param.h> 31#include <i386/misc_protos.h> 32#include <i386/cpu_data.h> 33#include <i386/machine_routines.h> 34#include <vm/pmap.h> 35#include <vm/vm_map.h> 36#include <vm/vm_kern.h> 37#include <vm/vm_fault.h> 38 39#include <sys/kdebug.h> 40 41/* 42 * the copy engine has the following characteristics 43 * - copyio handles copies to/from user or kernel space 44 * - copypv deals with physical or virtual addresses 45 * 46 * implementation details as follows 47 * - a cache of up to NCOPY_WINDOWS is maintained per thread for 48 * access of user virutal space 49 * - the window size is determined by the amount of virtual space 50 * that can be mapped by a single page table 51 * - the mapping is done by copying the page table pointer from 52 * the user's directory entry corresponding to the window's 53 * address in user space to the directory entry corresponding 54 * to the window slot in the kernel's address space 55 * - the set of mappings is preserved across context switches, 56 * so the copy can run with pre-emption enabled 57 * - there is a gdt entry set up to anchor the kernel window on 58 * each processor 59 * - the copies are done using the selector corresponding to the 60 * gdt entry 61 * - the addresses corresponding to the user virtual address are 62 * relative to the beginning of the window being used to map 63 * that region... thus the thread can be pre-empted and switched 64 * to a different processor while in the midst of a copy 65 * - the window caches must be invalidated if the pmap changes out 66 * from under the thread... this can happen during vfork/exec... 
67 * inval_copy_windows is the invalidation routine to be used 68 * - the copyio engine has 4 different states associated with it 69 * that allows for lazy tlb flushes and the ability to avoid 70 * a flush all together if we've just come from user space 71 * the 4 states are as follows... 72 * 73 * WINDOWS_OPENED - set by copyio to indicate to the context 74 * switch code that it is necessary to do a tlbflush after 75 * switching the windows since we're in the middle of a copy 76 * 77 * WINDOWS_CLOSED - set by copyio to indicate that it's done 78 * using the windows, so that the context switch code need 79 * not do the tlbflush... instead it will set the state to... 80 * 81 * WINDOWS_DIRTY - set by the context switch code to indicate 82 * to the copy engine that it is responsible for doing a 83 * tlbflush before using the windows again... it's also 84 * set by the inval_copy_windows routine to indicate the 85 * same responsibility. 86 * 87 * WINDOWS_CLEAN - set by the return to user path to indicate 88 * that a tlbflush has happened and that there is no need 89 * for copyio to do another when it is entered next... 
 *
 * - a window for mapping single physical pages is provided for copypv
 * - this window is maintained across context switches and has the
 *   same characteristics as the user space windows w/r to pre-emption
 */

/*
 * Per-direction copy primitives, implemented in assembly. The *_user
 * variants copy through the per-thread copy windows (segment-relative
 * addresses); the *_kern variants copy directly within the kernel map.
 */
extern int copyout_user(const char *, vm_offset_t, vm_size_t);
extern int copyout_kern(const char *, vm_offset_t, vm_size_t);
extern int copyin_user(const vm_offset_t, char *, vm_size_t);
extern int copyin_kern(const vm_offset_t, char *, vm_size_t);
extern int copyoutphys_user(const char *, vm_offset_t, vm_size_t);
extern int copyoutphys_kern(const char *, vm_offset_t, vm_size_t);
extern int copyinphys_user(const vm_offset_t, char *, vm_size_t);
extern int copyinphys_kern(const vm_offset_t, char *, vm_size_t);
extern int copyinstr_user(const vm_offset_t, char *, vm_size_t, vm_size_t *);
extern int copyinstr_kern(const vm_offset_t, char *, vm_size_t, vm_size_t *);

static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int);
static int copyio_phys(addr64_t, addr64_t, vm_size_t, int);


/* copy_type values dispatched on by copyio() */
#define COPYIN		0
#define COPYOUT		1
#define COPYINSTR	2
#define COPYINPHYS	3
#define COPYOUTPHYS	4

/*
 * Invalidate all of a thread's cached copy windows.
 *
 * Called when the thread's pmap changes out from under it (e.g.
 * vfork/exec — see the file header comment).  Marks every window slot
 * unused (user_base == -1 is the "empty" sentinel), resets the
 * round-robin replacement cursor, and sets WINDOWS_DIRTY so the next
 * copyio() knows it must flush the TLB before trusting the windows.
 */
void inval_copy_windows(thread_t thread)
{
	int	i;

	for (i = 0; i < NCOPY_WINDOWS; i++) {
		thread->machine.copy_window[i].user_base = -1;
	}
	thread->machine.nxt_window = 0;
	thread->machine.copyio_state = WINDOWS_DIRTY;

	KERNEL_DEBUG(0xeff70058 | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), (int)thread->map, 0, 0, 0);
}


/*
 * Core copy engine: moves nbytes between a user-space address and a
 * kernel buffer, in the direction / flavor selected by copy_type
 * (COPYIN, COPYOUT, COPYINSTR, COPYINPHYS, COPYOUTPHYS).
 *
 *   user_addr      - source/destination address in the user map
 *                    (treated as a kernel virtual address when the
 *                    current pmap is the kernel pmap or use_kernel_map
 *                    is set)
 *   kernel_addr    - kernel-side buffer
 *   lencopied      - out: bytes copied (COPYINSTR only, includes NUL)
 *   use_kernel_map - nonzero forces the direct kernel-map path
 *
 * Returns 0 on success, or EFAULT / ENAMETOOLONG (COPYINSTR).
 *
 * The user side of the copy goes through the per-thread copy windows:
 * each window maps one page-directory-entry's worth (NBPDE) of user
 * space into the kernel, so the copy proceeds in at most NBPDE-sized
 * chunks, re-aiming (or reusing) a window per chunk.
 */
static int
copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
       vm_size_t nbytes, vm_size_t *lencopied, int use_kernel_map)
{
	thread_t	thread;
	pmap_t		pmap;
	pt_entry_t	*updp;
	pt_entry_t	*kpdp;
	user_addr_t	user_base;
	vm_offset_t	user_offset;
	vm_offset_t	kern_vaddr;
	vm_size_t	cnt;
	vm_size_t	bytes_copied;
	int		error = 0;
	int		window_index;
	int		copyio_state;
	boolean_t	istate;
#if KDEBUG
	int		debug_type = 0xeff70010;
	debug_type += (copy_type << 2);
#endif
	/*
	 * NOTE(review): debug_type is only declared under KDEBUG; the
	 * KERNEL_DEBUG() uses below presumably compile to nothing when
	 * KDEBUG is off — confirm the macro discards its arguments.
	 */

	thread = current_thread();

	KERNEL_DEBUG(debug_type | DBG_FUNC_START, (int)(user_addr >> 32), (int)user_addr,
		     (int)nbytes, thread->machine.copyio_state, 0);

	/* zero-length copies trivially succeed */
	if (nbytes == 0) {
		KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr,
			     (unsigned)kernel_addr, (unsigned)nbytes, 0, 0);
		return (0);
	}
	pmap = thread->map->pmap;

	/*
	 * Fast path: "user" address is really a kernel address
	 * (kernel task, or caller explicitly asked for the kernel map).
	 * No windows needed — dispatch straight to the _kern variants.
	 */
	if (pmap == kernel_pmap || use_kernel_map) {

		kern_vaddr = (vm_offset_t)user_addr;

		switch (copy_type) {

		case COPYIN:
			error = copyin_kern(kern_vaddr, kernel_addr, nbytes);
			break;

		case COPYOUT:
			error = copyout_kern(kernel_addr, kern_vaddr, nbytes);
			break;

		case COPYINSTR:
			error = copyinstr_kern(kern_vaddr, kernel_addr, nbytes, lencopied);
			break;

		case COPYINPHYS:
			error = copyinphys_kern(kern_vaddr, kernel_addr, nbytes);
			break;

		case COPYOUTPHYS:
			error = copyoutphys_kern(kernel_addr, kern_vaddr, nbytes);
			break;
		}
		KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)kern_vaddr,
			     (unsigned)kernel_addr, (unsigned)nbytes,
			     error | 0x80000000, 0);
		return (error);
	}

#if CONFIG_DTRACE
	thread->machine.specFlags |= CopyIOActive;
#endif /* CONFIG_DTRACE */

	/*
	 * Range-check the user region: reject wrap-around
	 * (user_addr + nbytes overflowed) and anything outside the
	 * thread's map bounds.
	 */
	if ((nbytes && (user_addr + nbytes <= user_addr)) ||
	    (user_addr < vm_map_min(thread->map)) ||
	    (user_addr + nbytes > vm_map_max(thread->map))) {
		error = EFAULT;
		goto done;
	}

	/* split the address into its NBPDE-aligned base and the offset within */
	user_base = user_addr & ~((user_addr_t)(NBPDE - 1));
	user_offset = (vm_offset_t)(user_addr & (NBPDE - 1));

	KERNEL_DEBUG(debug_type | DBG_FUNC_NONE, (int)(user_base >> 32), (int)user_base,
		     (int)user_offset, 0, 0);

	/* first chunk runs to the end of this window's NBPDE region, at most */
	cnt = NBPDE - user_offset;

	if (cnt > nbytes)
		cnt = nbytes;

	/*
	 * Atomically sample the copyio state and advertise WINDOWS_OPENED
	 * so the context-switch code knows a copy is in flight (see the
	 * state machine described in the file header).
	 */
	istate = ml_set_interrupts_enabled(FALSE);

	copyio_state = thread->machine.copyio_state;
	thread->machine.copyio_state = WINDOWS_OPENED;

	(void) ml_set_interrupts_enabled(istate);


	/* one iteration per NBPDE-sized chunk of the user region */
	for (;;) {

		/* look for a window already mapping this user_base */
		for (window_index = 0; window_index < NCOPY_WINDOWS; window_index++) {
			if (thread->machine.copy_window[window_index].user_base == user_base)
				break;
		}
		if (window_index >= NCOPY_WINDOWS) {
			/* miss: evict the next window in round-robin order */
			window_index = thread->machine.nxt_window;
			thread->machine.nxt_window++;

			if (thread->machine.nxt_window >= NCOPY_WINDOWS)
				thread->machine.nxt_window = 0;

			/*
			 * it's necessary to disable pre-emption
			 * since I have to compute the kernel descriptor pointer
			 * for the new window
			 */
			istate = ml_set_interrupts_enabled(FALSE);

			thread->machine.copy_window[window_index].user_base = user_base;

			/*
			 * copy the user's page-directory entry into the
			 * kernel slot backing this window (0 if the user
			 * pmap has no page table there yet — the fault
			 * path, copy_window_fault, fills it in later)
			 */
			updp = pmap_pde(pmap, user_base);

			kpdp = current_cpu_datap()->cpu_copywindow_pdp;
			kpdp += window_index;

			pmap_store_pte(kpdp, updp ? *updp : 0);

			(void) ml_set_interrupts_enabled(istate);

			/* remapped a window: stale translations must be flushed */
			copyio_state = WINDOWS_DIRTY;

			KERNEL_DEBUG(0xeff70040 | DBG_FUNC_NONE, window_index,
				     (unsigned)user_base, (unsigned)updp,
				     (unsigned)kpdp, 0);

		}
#if JOE_DEBUG
		else {
			/* debug cross-check: cached window still matches the pmap */
			istate = ml_set_interrupts_enabled(FALSE);

			updp = pmap_pde(pmap, user_base);

			kpdp = current_cpu_datap()->cpu_copywindow_pdp;

			kpdp += window_index;

			if ((*kpdp & PG_FRAME) != (*updp & PG_FRAME)) {
				panic("copyio: user pdp mismatch - kpdp = 0x%qx, updp = 0x%qx\n", *kpdp, *updp);
			}
			(void) ml_set_interrupts_enabled(istate);
		}
#endif
		/* lazy TLB flush: only if the windows are marked dirty */
		if (copyio_state == WINDOWS_DIRTY) {
			flush_tlb();

			copyio_state = WINDOWS_CLEAN;

			KERNEL_DEBUG(0xeff70054 | DBG_FUNC_NONE, window_index, 0, 0, 0, 0);
		}
		/*
		 * turn the offset into a window-region-relative address:
		 * window N starts N * NBPDE into the copy-window segment
		 */
		user_offset += (window_index * NBPDE);

		KERNEL_DEBUG(0xeff70044 | DBG_FUNC_NONE, (unsigned)user_offset,
			     (unsigned)kernel_addr, cnt, 0, 0);

		switch (copy_type) {

		case COPYIN:
			error = copyin_user(user_offset, kernel_addr, cnt);
			break;

		case COPYOUT:
			error = copyout_user(kernel_addr, user_offset, cnt);
			break;

		case COPYINPHYS:
			error = copyinphys_user(user_offset, kernel_addr, cnt);
			break;

		case COPYOUTPHYS:
			error = copyoutphys_user(kernel_addr, user_offset, cnt);
			break;

		case COPYINSTR:
			error = copyinstr_user(user_offset, kernel_addr, cnt, &bytes_copied);

			/*
			 * lencopied should be updated on success
			 * or ENAMETOOLONG...  but not EFAULT
			 */
			if (error != EFAULT)
				*lencopied += bytes_copied;

			/*
			 * if we still have room, then the ENAMETOOLONG
			 * is just an artifact of the buffer straddling
			 * a window boundary and we should continue
			 */
			if (error == ENAMETOOLONG && nbytes > cnt)
				error = 0;

			if (error) {
#if KDEBUG
				nbytes = *lencopied;
#endif
				break;
			}
			if (*(kernel_addr + bytes_copied - 1) == 0) {
				/*
				 * we found a NULL terminator... we're done
				 */
#if KDEBUG
				nbytes = *lencopied;
#endif
				goto done;
			}
			if (cnt == nbytes) {
				/*
				 * no more room in the buffer and we haven't
				 * yet come across a NULL terminator
				 */
#if KDEBUG
				nbytes = *lencopied;
#endif
				error = ENAMETOOLONG;
				break;
			}
			/* no NUL seen yet, so the whole chunk must have copied */
			assert(cnt == bytes_copied);

			break;
		}
		if (error)
			break;
		if ((nbytes -= cnt) == 0)
			break;

		/* advance to the next NBPDE-aligned chunk */
		kernel_addr += cnt;
		user_base += NBPDE;
		user_offset = 0;

		if (nbytes > NBPDE)
			cnt = NBPDE;
		else
			cnt = nbytes;
	}
done:
	/* tell the context-switch code we're out of the windows */
	thread->machine.copyio_state = WINDOWS_CLOSED;

	KERNEL_DEBUG(debug_type | DBG_FUNC_END, (unsigned)user_addr,
		     (unsigned)kernel_addr, (unsigned)nbytes, error, 0);

#if CONFIG_DTRACE
	thread->machine.specFlags &= ~CopyIOActive;
#endif /* CONFIG_DTRACE */

	return (error);
}


/*
 * Copy between a physical page and a virtual address by mapping the
 * physical page through the per-cpu "phys window", then handing off to
 * copyio().
 *
 *   source/sink - one is a physical address, the other virtual,
 *                 selected by the cppvPsnk bit in 'which'
 *   csize       - byte count; caller (copypv) guarantees the physical
 *                 side does not cross a page boundary
 *   which       - cppv* flags (cppvPsnk picks direction, cppvKmap
 *                 selects the kernel map for the virtual side)
 *
 * Returns the copyio() result (0 or an errno value).
 */
static int
copyio_phys(addr64_t source, addr64_t sink, vm_size_t csize, int which)
{
	pmap_paddr_t paddr;
	user_addr_t vaddr;
	char        *window_offset;
	pt_entry_t  pentry;
	int         ctype;
	int	    retval;
	boolean_t   istate;


	if (which & cppvPsnk) {
		/* physical sink: read from virtual source, write phys page (writable PTE) */
		paddr  = (pmap_paddr_t)sink;
		vaddr  = (user_addr_t)source;
		ctype  = COPYINPHYS;
		pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME) | INTEL_PTE_RW);
	} else {
		/* physical source: read phys page (read-only PTE), write virtual sink */
		paddr  = (pmap_paddr_t)source;
		vaddr  = (user_addr_t)sink;
		ctype  = COPYOUTPHYS;
		pentry = (pt_entry_t)(INTEL_PTE_VALID | (paddr & PG_FRAME));
	}
	/* Fold in cache attributes for this physical page */
	pentry |= pmap_get_cache_attributes(i386_btop(paddr));
	window_offset = (char *)(uintptr_t)((uint32_t)paddr & (PAGE_SIZE - 1));

	assert(!((current_thread()->machine.specFlags & CopyIOActive) && ((which & cppvKmap) == 0)));

	if (current_thread()->machine.physwindow_busy) {
		/* nested use: save/restore the window around this copy */
		pt_entry_t	old_pentry;

		KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, -1, 0);
		/*
		 * we had better be targeting wired memory at this point
		 * we will not be able to handle a fault with interrupts
		 * disabled... we disable them because we can't tolerate
		 * being preempted during this nested use of the window
		 */
		istate = ml_set_interrupts_enabled(FALSE);

		old_pentry = *(current_cpu_datap()->cpu_physwindow_ptep);
		pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry);

		invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);

		retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap);

		pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), old_pentry);

		invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);

		(void) ml_set_interrupts_enabled(istate);
	} else {
		/*
		 * mark the window as in use... if an interrupt hits while we're
		 * busy, or we trigger another copypv from the fault path into
		 * the driver on a user address space page fault due to a copyin/out
		 * then we need to save and restore the current window state instead
		 * of caching the window preserving it across context switches
		 */
		current_thread()->machine.physwindow_busy = 1;

		if (current_thread()->machine.physwindow_pte != pentry) {
			KERNEL_DEBUG(0xeff70048 | DBG_FUNC_NONE, paddr, csize, 0, 0, 0);

			current_thread()->machine.physwindow_pte = pentry;

			/*
			 * preemption at this point would be bad since we
			 * could end up on the other processor after we grabbed the
			 * pointer to the current cpu data area, but before we finished
			 * using it to stuff the page table entry since we would
			 * be modifying a window that no longer belonged to us
			 * the invlpg can be done unprotected since it only flushes
			 * this page address from the tlb... if it flushes the wrong
			 * one, no harm is done, and the context switch that moved us
			 * to the other processor will have already taken care of
			 * flushing the tlb after it reloaded the page table from machine.physwindow_pte
			 */
			istate = ml_set_interrupts_enabled(FALSE);

			pmap_store_pte((current_cpu_datap()->cpu_physwindow_ptep), pentry);
			(void) ml_set_interrupts_enabled(istate);

			invlpg((uintptr_t)current_cpu_datap()->cpu_physwindow_base);
		}
#if JOE_DEBUG
		else {
			if (pentry !=
			    (*(current_cpu_datap()->cpu_physwindow_ptep) & (INTEL_PTE_VALID | PG_FRAME | INTEL_PTE_RW)))
				panic("copyio_phys: pentry != *physwindow_ptep");
		}
#endif
		retval = copyio(ctype, vaddr, window_offset, csize, NULL, which & cppvKmap);

		current_thread()->machine.physwindow_busy = 0;
	}
	return (retval);
}

/* copyin variant for Mach message buffers; identical to copyin() here */
int
copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes)
{
	return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0));
}

/* copy nbytes from user space into a kernel buffer; 0 or EFAULT */
int
copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
{
	return (copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0));
}

/*
 * copy a NUL-terminated string (at most nbytes, including the NUL)
 * from user space; *lencopied gets the byte count actually copied.
 * Returns 0, EFAULT, or ENAMETOOLONG.
 */
int
copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_size_t *lencopied)
{
	*lencopied = 0;

	return (copyio(COPYINSTR, user_addr, kernel_addr, nbytes, lencopied, 0));
}

/* copyout variant for Mach message buffers; identical to copyout() here */
int
copyoutmsg(const char *kernel_addr, user_addr_t user_addr, mach_msg_size_t nbytes)
{
	return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0));
}

/* copy nbytes from a kernel buffer out to user space; 0 or EFAULT */
int
copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
{
	return (copyio(COPYOUT, user_addr, (char *)(uintptr_t)kernel_addr, nbytes, NULL, 0));
}


/*
 * General physical/virtual copy: src64 and snk64 may each be physical
 * or virtual per the cppvPsrc/cppvPsnk bits in 'which' (at least one
 * must be physical).  The copy is broken into pieces that never cross
 * a page boundary on the physical side(s).
 *
 * Returns KERN_SUCCESS, or KERN_FAILURE if an underlying copy faults.
 */
kern_return_t
copypv(addr64_t src64, addr64_t snk64, unsigned int size, int which)
{
	unsigned int lop, csize;
	int bothphys = 0;

	KERNEL_DEBUG(0xeff7004c | DBG_FUNC_START, (unsigned)src64,
		     (unsigned)snk64, size, which, 0);

	if ((which & (cppvPsrc | cppvPsnk)) == 0 )				/* Make sure that only one is virtual */
		panic("copypv: no more than 1 parameter may be virtual\n");	/* Not allowed */

	if ((which & (cppvPsrc | cppvPsnk)) == (cppvPsrc | cppvPsnk))
		bothphys = 1;							/* both are physical */

	while (size) {

		if (bothphys) {
			/* chunk must fit within the current page of BOTH sides */
			lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1)));		/* Assume sink smallest */

			if (lop > (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1))))
				lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)));	/* No, source is smaller */
		} else {
			/*
			 * only need to compute the resid for the physical page
			 * address... we don't care about where we start/finish in
			 * the virtual since we just call the normal copyin/copyout
			 */
			if (which & cppvPsrc)
				lop = (unsigned int)(PAGE_SIZE - (src64 & (PAGE_SIZE - 1)));
			else
				lop = (unsigned int)(PAGE_SIZE - (snk64 & (PAGE_SIZE - 1)));
		}
		csize = size;						/* Assume we can copy it all */
		if (lop < size)
			csize = lop;					/* Nope, we can't do it all */
#if 0
		/*
		 * flush_dcache64 is currently a nop on the i386...
		 * it's used when copying to non-system memory such
		 * as video capture cards... on PPC there was a need
		 * to flush due to how we mapped this memory... not
		 * sure if it's needed on i386.
		 */
		if (which & cppvFsrc)
			flush_dcache64(src64, csize, 1);		/* If requested, flush source before move */
		if (which & cppvFsnk)
			flush_dcache64(snk64, csize, 1);		/* If requested, flush sink before move */
#endif
		if (bothphys) {
			bcopy_phys(src64, snk64, csize);		/* Do a physical copy, virtually */
		}
		else {
			/*
			 * NOTE(review): this early return skips the
			 * DBG_FUNC_END tracepoint below, leaving an
			 * unbalanced START in the trace — confirm intended.
			 */
			if (copyio_phys(src64, snk64, csize, which)) {
				return (KERN_FAILURE);
			}
		}
#if 0
		if (which & cppvFsrc)
			flush_dcache64(src64, csize, 1);	/* If requested, flush source after move */
		if (which & cppvFsnk)
			flush_dcache64(snk64, csize, 1);	/* If requested, flush sink after move */
#endif
		size -= csize;					/* Calculate what is left */
		snk64 += csize;					/* Bump sink to next physical address */
		src64 += csize;					/* Bump source to next physical address */
	}
	KERNEL_DEBUG(0xeff7004c | DBG_FUNC_END, (unsigned)src64,
		     (unsigned)snk64, size, which, 0);

	return KERN_SUCCESS;
}

/*
 * Fault-path repair for a copy window: re-copy the user's page
 * directory entry into the kernel slot backing 'window' after the
 * user pmap was expanded by the fault.  Called from the trap handler
 * with the faulting thread and its map.
 */
void
copy_window_fault(thread_t thread, vm_map_t map, int window)
{
	pt_entry_t	*updp;
	pt_entry_t	*kpdp;

	/*
	 * in case there was no page table assigned
	 * for the user base address and the pmap
	 * got 'expanded' due to this fault, we'll
	 * copy in the descriptor
	 *
	 * we're either setting the page table descriptor
	 * to the same value or it was 0... no need
	 * for a TLB flush in either case
	 */

	updp = pmap_pde(map->pmap, thread->machine.copy_window[window].user_base);
	assert(updp);
	if (0 == updp) panic("trap: updp 0"); /* XXX DEBUG */
	kpdp = current_cpu_datap()->cpu_copywindow_pdp;
	kpdp += window;

#if JOE_DEBUG
	if (*kpdp && (*kpdp & PG_FRAME) != (*updp & PG_FRAME))
		panic("kernel_fault: user pdp doesn't match - updp = 0x%qx, kpdp = 0x%qx\n", *updp, *kpdp);
#endif
	pmap_store_pte(kpdp, *updp);
}