1/*- 2 * Copyright (c) 1982, 1986 The Regents of the University of California. 3 * Copyright (c) 1989, 1990 William Jolitz 4 * Copyright (c) 1994 John Dyson 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * the Systems Programming Group of the University of Utah Computer 9 * Science Department, and William Jolitz. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the University of 22 * California, Berkeley and its contributors. 23 * 4. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91 40 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$ 41 */ 42 43#include <sys/cdefs.h> 44__FBSDID("$FreeBSD: stable/11/sys/amd64/amd64/vm_machdep.c 345126 2019-03-14 08:27:01Z ae $"); 45 46#include "opt_isa.h" 47#include "opt_cpu.h" 48#include "opt_compat.h" 49 50#include <sys/param.h> 51#include <sys/systm.h> 52#include <sys/bio.h> 53#include <sys/buf.h> 54#include <sys/kernel.h> 55#include <sys/ktr.h> 56#include <sys/lock.h> 57#include <sys/malloc.h> 58#include <sys/mbuf.h> 59#include <sys/mutex.h> 60#include <sys/pioctl.h> 61#include <sys/proc.h> 62#include <sys/smp.h> 63#include <sys/sysctl.h> 64#include <sys/sysent.h> 65#include <sys/unistd.h> 66#include <sys/vnode.h> 67#include <sys/vmmeter.h> 68 69#include <machine/cpu.h> 70#include <machine/md_var.h> 71#include <machine/pcb.h> 72#include <machine/smp.h> 73#include <machine/specialreg.h> 74#include <machine/tss.h> 75 76#include <vm/vm.h> 77#include <vm/vm_extern.h> 78#include <vm/vm_kern.h> 79#include <vm/vm_page.h> 80#include <vm/vm_map.h> 81#include <vm/vm_param.h> 82 83_Static_assert(OFFSETOF_CURTHREAD == offsetof(struct pcpu, pc_curthread), 84 "OFFSETOF_CURTHREAD does not correspond with offset of pc_curthread."); 85_Static_assert(OFFSETOF_CURPCB == offsetof(struct pcpu, pc_curpcb), 86 "OFFSETOF_CURPCB does not correspond with offset of pc_curpcb."); 87_Static_assert(OFFSETOF_MONITORBUF == offsetof(struct pcpu, pc_monitorbuf), 88 "OFFSETOF_MONITORBUF does not correspond with offset of pc_monitorbuf."); 89 90struct savefpu * 91get_pcb_user_save_td(struct thread *td) 92{ 93 vm_offset_t p; 94 95 p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 96 roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN); 97 KASSERT((p % XSAVE_AREA_ALIGN) == 0, ("Unaligned pcb_user_save area")); 98 return ((struct savefpu *)p); 99} 100 101struct savefpu * 102get_pcb_user_save_pcb(struct pcb *pcb) 103{ 104 vm_offset_t p; 105 106 p = (vm_offset_t)(pcb + 1); 107 return ((struct savefpu *)p); 108} 109 110struct pcb * 111get_pcb_td(struct thread *td) 112{ 113 vm_offset_t p; 114 115 p = td->td_kstack + td->td_kstack_pages * PAGE_SIZE - 116 roundup2(cpu_max_ext_state_size, XSAVE_AREA_ALIGN) - 117 sizeof(struct pcb); 118 return ((struct pcb *)p); 119} 120 121void * 122alloc_fpusave(int flags) 123{ 124 void *res; 125 struct savefpu_ymm *sf; 126 127 res = malloc(cpu_max_ext_state_size, M_DEVBUF, flags); 128 if (use_xsave) { 129 sf = (struct savefpu_ymm *)res; 130 bzero(&sf->sv_xstate.sx_hd, sizeof(sf->sv_xstate.sx_hd)); 131 sf->sv_xstate.sx_hd.xstate_bv = xsave_mask; 132 } 133 return (res); 134} 135 136/* 137 * Finish a fork operation, with process p2 nearly set up. 138 * Copy and update the pcb, set up the stack so that the child 139 * ready to run and return to user mode. 140 */ 141void 142cpu_fork(struct thread *td1, struct proc *p2, struct thread *td2, int flags) 143{ 144 struct proc *p1; 145 struct pcb *pcb2; 146 struct mdproc *mdp1, *mdp2; 147 struct proc_ldt *pldt; 148 149 p1 = td1->td_proc; 150 if ((flags & RFPROC) == 0) { 151 if ((flags & RFMEM) == 0) { 152 /* unshare user LDT */ 153 mdp1 = &p1->p_md; 154 mtx_lock(&dt_lock); 155 if ((pldt = mdp1->md_ldt) != NULL && 156 pldt->ldt_refcnt > 1 && 157 user_ldt_alloc(p1, 1) == NULL) 158 panic("could not copy LDT"); 159 mtx_unlock(&dt_lock); 160 } 161 return; 162 } 163 164 /* Ensure that td1's pcb is up to date. */ 165 fpuexit(td1); 166 update_pcb_bases(td1->td_pcb); 167 168 /* Point the pcb to the top of the stack */ 169 pcb2 = get_pcb_td(td2); 170 td2->td_pcb = pcb2; 171 172 /* Copy td1's pcb */ 173 bcopy(td1->td_pcb, pcb2, sizeof(*pcb2)); 174 175 /* Properly initialize pcb_save */ 176 pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); 177 bcopy(get_pcb_user_save_td(td1), get_pcb_user_save_pcb(pcb2), 178 cpu_max_ext_state_size); 179 180 /* Point mdproc and then copy over td1's contents */ 181 mdp2 = &p2->p_md; 182 bcopy(&p1->p_md, mdp2, sizeof(*mdp2)); 183 184 /* 185 * Create a new fresh stack for the new process. 186 * Copy the trap frame for the return to user mode as if from a 187 * syscall. This copies most of the user mode register values. 188 */ 189 td2->td_frame = (struct trapframe *)td2->td_pcb - 1; 190 bcopy(td1->td_frame, td2->td_frame, sizeof(struct trapframe)); 191 192 td2->td_frame->tf_rax = 0; /* Child returns zero */ 193 td2->td_frame->tf_rflags &= ~PSL_C; /* success */ 194 td2->td_frame->tf_rdx = 1; 195 196 /* 197 * If the parent process has the trap bit set (i.e. a debugger had 198 * single stepped the process to the system call), we need to clear 199 * the trap flag from the new frame unless the debugger had set PF_FORK 200 * on the parent. Otherwise, the child will receive a (likely 201 * unexpected) SIGTRAP when it executes the first instruction after 202 * returning to userland. 203 */ 204 if ((p1->p_pfsflags & PF_FORK) == 0) 205 td2->td_frame->tf_rflags &= ~PSL_T; 206 207 /* 208 * Set registers for trampoline to user mode. Leave space for the 209 * return address on stack. These are the kernel mode register values. 210 */ 211 pcb2->pcb_r12 = (register_t)fork_return; /* fork_trampoline argument */ 212 pcb2->pcb_rbp = 0; 213 pcb2->pcb_rsp = (register_t)td2->td_frame - sizeof(void *); 214 pcb2->pcb_rbx = (register_t)td2; /* fork_trampoline argument */ 215 pcb2->pcb_rip = (register_t)fork_trampoline; 216 /*- 217 * pcb2->pcb_dr*: cloned above. 218 * pcb2->pcb_savefpu: cloned above. 219 * pcb2->pcb_flags: cloned above. 220 * pcb2->pcb_onfault: cloned above (always NULL here?). 221 * pcb2->pcb_[fg]sbase: cloned above 222 */ 223 224 /* Setup to release spin count in fork_exit(). */ 225 td2->td_md.md_spinlock_count = 1; 226 td2->td_md.md_saved_flags = PSL_KERNEL | PSL_I; 227 td2->td_md.md_invl_gen.gen = 0; 228 229 /* As an i386, do not copy io permission bitmap. */ 230 pcb2->pcb_tssp = NULL; 231 232 /* New segment registers. */ 233 set_pcb_flags_raw(pcb2, PCB_FULL_IRET); 234 235 /* Copy the LDT, if necessary. */ 236 mdp1 = &td1->td_proc->p_md; 237 mdp2 = &p2->p_md; 238 mtx_lock(&dt_lock); 239 if (mdp1->md_ldt != NULL) { 240 if (flags & RFMEM) { 241 mdp1->md_ldt->ldt_refcnt++; 242 mdp2->md_ldt = mdp1->md_ldt; 243 bcopy(&mdp1->md_ldt_sd, &mdp2->md_ldt_sd, sizeof(struct 244 system_segment_descriptor)); 245 } else { 246 mdp2->md_ldt = NULL; 247 mdp2->md_ldt = user_ldt_alloc(p2, 0); 248 if (mdp2->md_ldt == NULL) 249 panic("could not copy LDT"); 250 amd64_set_ldt_data(td2, 0, max_ldt_segment, 251 (struct user_segment_descriptor *) 252 mdp1->md_ldt->ldt_base); 253 } 254 } else 255 mdp2->md_ldt = NULL; 256 mtx_unlock(&dt_lock); 257 258 /* 259 * Now, cpu_switch() can schedule the new process. 260 * pcb_rsp is loaded pointing to the cpu_switch() stack frame 261 * containing the return address when exiting cpu_switch. 262 * This will normally be to fork_trampoline(), which will have 263 * %ebx loaded with the new proc's pointer. fork_trampoline() 264 * will set up a stack to call fork_return(p, frame); to complete 265 * the return to user-mode. 266 */ 267} 268 269/* 270 * Intercept the return address from a freshly forked process that has NOT 271 * been scheduled yet. 272 * 273 * This is needed to make kernel threads stay in kernel mode. 274 */ 275void 276cpu_fork_kthread_handler(struct thread *td, void (*func)(void *), void *arg) 277{ 278 /* 279 * Note that the trap frame follows the args, so the function 280 * is really called like this: func(arg, frame); 281 */ 282 td->td_pcb->pcb_r12 = (long) func; /* function */ 283 td->td_pcb->pcb_rbx = (long) arg; /* first arg */ 284} 285 286void 287cpu_exit(struct thread *td) 288{ 289 290 /* 291 * If this process has a custom LDT, release it. 292 */ 293 mtx_lock(&dt_lock); 294 if (td->td_proc->p_md.md_ldt != 0) 295 user_ldt_free(td); 296 else 297 mtx_unlock(&dt_lock); 298} 299 300void 301cpu_thread_exit(struct thread *td) 302{ 303 struct pcb *pcb; 304 305 critical_enter(); 306 if (td == PCPU_GET(fpcurthread)) 307 fpudrop(); 308 critical_exit(); 309 310 pcb = td->td_pcb; 311 312 /* Disable any hardware breakpoints. */ 313 if (pcb->pcb_flags & PCB_DBREGS) { 314 reset_dbregs(); 315 clear_pcb_flags(pcb, PCB_DBREGS); 316 } 317} 318 319void 320cpu_thread_clean(struct thread *td) 321{ 322 struct pcb *pcb; 323 324 pcb = td->td_pcb; 325 326 /* 327 * Clean TSS/iomap 328 */ 329 if (pcb->pcb_tssp != NULL) { 330 pmap_pti_remove_kva((vm_offset_t)pcb->pcb_tssp, 331 (vm_offset_t)pcb->pcb_tssp + ctob(IOPAGES + 1)); 332 kmem_free(kernel_arena, (vm_offset_t)pcb->pcb_tssp, 333 ctob(IOPAGES + 1)); 334 pcb->pcb_tssp = NULL; 335 } 336} 337 338void 339cpu_thread_swapin(struct thread *td) 340{ 341} 342 343void 344cpu_thread_swapout(struct thread *td) 345{ 346} 347 348void 349cpu_thread_alloc(struct thread *td) 350{ 351 struct pcb *pcb; 352 struct xstate_hdr *xhdr; 353 354 td->td_pcb = pcb = get_pcb_td(td); 355 td->td_frame = (struct trapframe *)pcb - 1; 356 pcb->pcb_save = get_pcb_user_save_pcb(pcb); 357 if (use_xsave) { 358 xhdr = (struct xstate_hdr *)(pcb->pcb_save + 1); 359 bzero(xhdr, sizeof(*xhdr)); 360 xhdr->xstate_bv = xsave_mask; 361 } 362} 363 364void 365cpu_thread_free(struct thread *td) 366{ 367 368 cpu_thread_clean(td); 369} 370 371void 372cpu_set_syscall_retval(struct thread *td, int error) 373{ 374 375 switch (error) { 376 case 0: 377 td->td_frame->tf_rax = td->td_retval[0]; 378 td->td_frame->tf_rdx = td->td_retval[1]; 379 td->td_frame->tf_rflags &= ~PSL_C; 380 break; 381 382 case ERESTART: 383 /* 384 * Reconstruct pc, we know that 'syscall' is 2 bytes, 385 * lcall $X,y is 7 bytes, int 0x80 is 2 bytes. 386 * We saved this in tf_err. 387 * %r10 (which was holding the value of %rcx) is restored 388 * for the next iteration. 389 * %r10 restore is only required for freebsd/amd64 processes, 390 * but shall be innocent for any ia32 ABI. 391 * 392 * Require full context restore to get the arguments 393 * in the registers reloaded at return to usermode. 394 */ 395 td->td_frame->tf_rip -= td->td_frame->tf_err; 396 td->td_frame->tf_r10 = td->td_frame->tf_rcx; 397 set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 398 break; 399 400 case EJUSTRETURN: 401 break; 402 403 default: 404 td->td_frame->tf_rax = SV_ABI_ERRNO(td->td_proc, error); 405 td->td_frame->tf_rflags |= PSL_C; 406 break; 407 } 408} 409 410/* 411 * Initialize machine state, mostly pcb and trap frame for a new 412 * thread, about to return to userspace. Put enough state in the new 413 * thread's PCB to get it to go back to the fork_return(), which 414 * finalizes the thread state and handles peculiarities of the first 415 * return to userspace for the new thread. 416 */ 417void 418cpu_copy_thread(struct thread *td, struct thread *td0) 419{ 420 struct pcb *pcb2; 421 422 /* Point the pcb to the top of the stack. */ 423 pcb2 = td->td_pcb; 424 425 /* 426 * Copy the upcall pcb. This loads kernel regs. 427 * Those not loaded individually below get their default 428 * values here. 429 */ 430 update_pcb_bases(td0->td_pcb); 431 bcopy(td0->td_pcb, pcb2, sizeof(*pcb2)); 432 clear_pcb_flags(pcb2, PCB_FPUINITDONE | PCB_USERFPUINITDONE | 433 PCB_KERNFPU); 434 pcb2->pcb_save = get_pcb_user_save_pcb(pcb2); 435 bcopy(get_pcb_user_save_td(td0), pcb2->pcb_save, 436 cpu_max_ext_state_size); 437 set_pcb_flags_raw(pcb2, PCB_FULL_IRET); 438 439 /* 440 * Create a new fresh stack for the new thread. 441 */ 442 bcopy(td0->td_frame, td->td_frame, sizeof(struct trapframe)); 443 444 /* If the current thread has the trap bit set (i.e. a debugger had 445 * single stepped the process to the system call), we need to clear 446 * the trap flag from the new frame. Otherwise, the new thread will 447 * receive a (likely unexpected) SIGTRAP when it executes the first 448 * instruction after returning to userland. 449 */ 450 td->td_frame->tf_rflags &= ~PSL_T; 451 452 /* 453 * Set registers for trampoline to user mode. Leave space for the 454 * return address on stack. These are the kernel mode register values. 455 */ 456 pcb2->pcb_r12 = (register_t)fork_return; /* trampoline arg */ 457 pcb2->pcb_rbp = 0; 458 pcb2->pcb_rsp = (register_t)td->td_frame - sizeof(void *); /* trampoline arg */ 459 pcb2->pcb_rbx = (register_t)td; /* trampoline arg */ 460 pcb2->pcb_rip = (register_t)fork_trampoline; 461 /* 462 * If we didn't copy the pcb, we'd need to do the following registers: 463 * pcb2->pcb_dr*: cloned above. 464 * pcb2->pcb_savefpu: cloned above. 465 * pcb2->pcb_onfault: cloned above (always NULL here?). 466 * pcb2->pcb_[fg]sbase: cloned above 467 */ 468 469 /* Setup to release spin count in fork_exit(). */ 470 td->td_md.md_spinlock_count = 1; 471 td->td_md.md_saved_flags = PSL_KERNEL | PSL_I; 472} 473 474/* 475 * Set that machine state for performing an upcall that starts 476 * the entry function with the given argument. 477 */ 478void 479cpu_set_upcall(struct thread *td, void (*entry)(void *), void *arg, 480 stack_t *stack) 481{ 482 483 /* 484 * Do any extra cleaning that needs to be done. 485 * The thread may have optional components 486 * that are not present in a fresh thread. 487 * This may be a recycled thread so make it look 488 * as though it's newly allocated. 489 */ 490 cpu_thread_clean(td); 491 492#ifdef COMPAT_FREEBSD32 493 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 494 /* 495 * Set the trap frame to point at the beginning of the entry 496 * function. 497 */ 498 td->td_frame->tf_rbp = 0; 499 td->td_frame->tf_rsp = 500 (((uintptr_t)stack->ss_sp + stack->ss_size - 4) & ~0x0f) - 4; 501 td->td_frame->tf_rip = (uintptr_t)entry; 502 503 /* Return address sentinel value to stop stack unwinding. */ 504 suword32((void *)td->td_frame->tf_rsp, 0); 505 506 /* Pass the argument to the entry point. */ 507 suword32((void *)(td->td_frame->tf_rsp + sizeof(int32_t)), 508 (uint32_t)(uintptr_t)arg); 509 510 return; 511 } 512#endif 513 514 /* 515 * Set the trap frame to point at the beginning of the uts 516 * function. 517 */ 518 td->td_frame->tf_rbp = 0; 519 td->td_frame->tf_rsp = 520 ((register_t)stack->ss_sp + stack->ss_size) & ~0x0f; 521 td->td_frame->tf_rsp -= 8; 522 td->td_frame->tf_rip = (register_t)entry; 523 td->td_frame->tf_ds = _udatasel; 524 td->td_frame->tf_es = _udatasel; 525 td->td_frame->tf_fs = _ufssel; 526 td->td_frame->tf_gs = _ugssel; 527 td->td_frame->tf_flags = TF_HASSEGS; 528 529 /* Return address sentinel value to stop stack unwinding. */ 530 suword((void *)td->td_frame->tf_rsp, 0); 531 532 /* Pass the argument to the entry point. */ 533 td->td_frame->tf_rdi = (register_t)arg; 534} 535 536int 537cpu_set_user_tls(struct thread *td, void *tls_base) 538{ 539 struct pcb *pcb; 540 541 if ((u_int64_t)tls_base >= VM_MAXUSER_ADDRESS) 542 return (EINVAL); 543 544 pcb = td->td_pcb; 545 set_pcb_flags(pcb, PCB_FULL_IRET); 546#ifdef COMPAT_FREEBSD32 547 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { 548 pcb->pcb_gsbase = (register_t)tls_base; 549 return (0); 550 } 551#endif 552 pcb->pcb_fsbase = (register_t)tls_base; 553 return (0); 554} 555 556/* 557 * Software interrupt handler for queued VM system processing. 558 */ 559void 560swi_vm(void *dummy) 561{ 562 if (busdma_swi_pending != 0) 563 busdma_swi(); 564} 565 566/* 567 * Tell whether this address is in some physical memory region. 568 * Currently used by the kernel coredump code in order to avoid 569 * dumping the ``ISA memory hole'' which could cause indefinite hangs, 570 * or other unpredictable behaviour. 571 */ 572 573int 574is_physical_memory(vm_paddr_t addr) 575{ 576 577#ifdef DEV_ISA 578 /* The ISA ``memory hole''. */ 579 if (addr >= 0xa0000 && addr < 0x100000) 580 return 0; 581#endif 582 583 /* 584 * stuff other tests for known memory-mapped devices (PCI?) 585 * here 586 */ 587 588 return 1; 589} 590