mmu_phyp.c revision 330897
/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (C) 2010 Andreas Tobler
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * PAPR ("pHyp") hypervisor back-end for the 64-bit OEA pmap (moea64).
 * Under a PAPR hypervisor the guest does not own the hashed page table;
 * every PTE read/insert/remove goes through H_READ/H_ENTER/H_REMOVE/
 * H_CLEAR_* hypercalls.  This file implements just the moea64 PTE hooks
 * plus bootstrap; everything else is inherited from oea64_mmu.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/powerpc/pseries/mmu_phyp.c 330897 2018-03-14 03:19:51Z eadler $");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/rmlock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>

#include <dev/ofw/openfirm.h>
#include <machine/ofw_machdep.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/uma.h>

#include <powerpc/aim/mmu_oea64.h>

#include "mmu_if.h"
#include "moea64_if.h"

#include "phyp-hvcall.h"

extern int n_slbs;

/*
 * Read-mostly lock guarding page-table eviction: readers are ordinary
 * PTE operations (synch/clear/insert fast path); the writer is the
 * eviction path in mphyp_pte_insert(), which must exclude all of them
 * while it sacrifices and replaces a victim PTE.
 */
static struct rmlock mphyp_eviction_lock;

/*
 * Kernel MMU interface
 */

static void mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart,
		vm_offset_t kernelend);
static void mphyp_cpu_bootstrap(mmu_t mmup, int ap);
static int64_t mphyp_pte_synch(mmu_t, struct pvo_entry *pvo);
static int64_t mphyp_pte_clear(mmu_t, struct pvo_entry *pvo, uint64_t ptebit);
static int64_t mphyp_pte_unset(mmu_t, struct pvo_entry *pvo);
static int mphyp_pte_insert(mmu_t, struct pvo_entry *pvo);

static mmu_method_t mphyp_methods[] = {
        MMUMETHOD(mmu_bootstrap,        mphyp_bootstrap),
        MMUMETHOD(mmu_cpu_bootstrap,    mphyp_cpu_bootstrap),

	MMUMETHOD(moea64_pte_synch,     mphyp_pte_synch),
        MMUMETHOD(moea64_pte_clear,     mphyp_pte_clear),
        MMUMETHOD(moea64_pte_unset,     mphyp_pte_unset),
        MMUMETHOD(moea64_pte_insert,    mphyp_pte_insert),

	/* XXX: pmap_copy_page, pmap_init_page with H_PAGE_INIT */

        { 0, 0 }
};

MMU_DEF_INHERIT(pseries_mmu, "mmu_phyp", mphyp_methods, 0, oea64_mmu);

/* Set at bootstrap if the hypervisor rejects H_CLEAR_MOD (see below). */
static int brokenkvm = 0;

/*
 * Warn (twice during boot, so it is visible both early and at the end of
 * the boot sequence) when the hypervisor lacks the mandatory
 * H_CLEAR_MOD/H_CLEAR_REF hypercalls.
 */
static void
print_kvm_bug_warning(void *data)
{

	if (brokenkvm)
		printf("WARNING: Running on a broken hypervisor that does "
		    "not support mandatory H_CLEAR_MOD and H_CLEAR_REF "
		    "hypercalls. Performance will be suboptimal.\n");
}

SYSINIT(kvmbugwarn1, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
    print_kvm_bug_warning, NULL);
SYSINIT(kvmbugwarn2, SI_SUB_LAST, SI_ORDER_THIRD + 1, print_kvm_bug_warning,
    NULL);

/*
 * Bootstrap the pHyp MMU: size the hypervisor page table and SLB from
 * the OF /cpus node properties, clear stale PTEs left by firmware (except
 * the VRMA mappings the hypervisor needs), discover large-page support
 * from "ibm,segment-page-sizes", and then run the generic moea64
 * mid/late bootstrap.  Finally probe for the KVM H_CLEAR_MOD bug.
 */
static void
mphyp_bootstrap(mmu_t mmup, vm_offset_t kernelstart, vm_offset_t kernelend)
{
	uint64_t final_pteg_count = 0;
	char buf[8];
	uint32_t prop[2];
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	uint32_t lp_size, lp_encoding;
	struct lpte old;
	uint64_t vsid;
	phandle_t dev, node, root;
	int idx, len, res;

	rm_init(&mphyp_eviction_lock, "pte eviction");

	moea64_early_bootstrap(mmup, kernelstart, kernelend);

	root = OF_peer(0);

	/* Find the /cpus node ... */
	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	/* ... and the first node of device_type "cpu" below it. */
	node = OF_child(dev);

	while (node != 0) {
		res = OF_getprop(node, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		node = OF_peer(node);
	}

	/* prop[1] is log2 of the page frame table size in bytes. */
	res = OF_getencprop(node, "ibm,pft-size", prop, sizeof(prop));
	if (res <= 0)
		panic("mmu_phyp: unknown PFT size");
	final_pteg_count = 1 << prop[1];
	res = OF_getencprop(node, "ibm,slb-size", prop, sizeof(prop[0]));
	if (res > 0)
		n_slbs = prop[0];

	moea64_pteg_count = final_pteg_count / sizeof(struct lpteg);

	/* Clear any old page table entries */
	for (idx = 0; idx < moea64_pteg_count*8; idx++) {
		phyp_pft_hcall(H_READ, 0, idx, 0, 0, &old.pte_hi,
		    &old.pte_lo, &old.pte_lo);
		vsid = (old.pte_hi << (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) >> 28;
		/* Skip the hypervisor's real-mode area mappings. */
		if (vsid == VSID_VRMA || vsid == 0 /* Older VRMA */)
			continue;

		if (old.pte_hi & LPTE_VALID)
			phyp_hcall(H_REMOVE, 0, idx, 0);
	}

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */

	len = OF_getproplen(node, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(node, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;
		idx = 0;
		/*
		 * The property is a sequence of (shift, slb_encoding,
		 * nptlp, nptlp * (lp_size, lp_encoding)) tuples; look for
		 * the standard large page: SLB[L] = 1, PTE[LP] = 0.
		 */
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];
			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];
				if (slb_encoding == SLBV_L && lp_encoding == 0)
					break;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			if (nptlp && slb_encoding == SLBV_L && lp_encoding == 0)
				break;
		}

		if (len == 0)
			panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
			    "not supported by this system. Please enable huge "
			    "page backing if running under PowerKVM.");

		moea64_large_page_shift = shift;
		moea64_large_page_size = 1ULL << lp_size;
	}

	moea64_mid_bootstrap(mmup, kernelstart, kernelend);
	moea64_late_bootstrap(mmup, kernelstart, kernelend);

	/* Test for broken versions of KVM that don't conform to the spec */
	if (phyp_hcall(H_CLEAR_MOD, 0, 0) == H_FUNCTION)
		brokenkvm = 1;
}

/*
 * Per-CPU bootstrap: invalidate the SLB and reinstall this CPU's cached
 * kernel SLB entries (both at boot and for APs).
 */
static void
mphyp_cpu_bootstrap(mmu_t mmup, int ap)
{
	struct slb *slb = PCPU_GET(slb);
	register_t seg0;
	int i;

	/*
	 * Install kernel SLB entries
	 */

	__asm __volatile ("slbia");
	/* slbia leaves entry 0; flush it explicitly. */
	__asm __volatile ("slbmfee %0,%1; slbie %0;" : "=r"(seg0) : "r"(0));
	for (i = 0; i < 64; i++) {
		if (!(slb[i].slbe & SLBE_VALID))
			continue;

		__asm __volatile ("slbmte %0, %1" ::
		    "r"(slb[i].slbv), "r"(slb[i].slbe));
	}
}

/*
 * Read the hypervisor's copy of the PTE for this pvo (H_READ) and return
 * its REF|CHG bits, or -1 if the slot no longer holds a valid entry
 * matching the pvo's AVPN (i.e. it was evicted behind our back).
 */
static int64_t
mphyp_pte_synch(mmu_t mmu, struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;

	__asm __volatile("ptesync");
	phyp_pft_hcall(H_READ, 0, pvo->pvo_pte.slot, 0, 0, &pte.pte_hi,
	    &pte.pte_lo, &junk);
	if ((pte.pte_hi & LPTE_AVPN_MASK) !=
	    ((pvo->pvo_vpn >> (ADDR_API_SHFT64 - ADDR_PIDX_SHFT)) &
	    LPTE_AVPN_MASK))
		return (-1);
	if (!(pte.pte_hi & LPTE_VALID))
		return (-1);

	return (pte.pte_lo & (LPTE_CHG | LPTE_REF));
}

/*
 * Atomically read-and-clear the requested REF/CHG bits (ptebit) for the
 * pvo's PTE via H_CLEAR_MOD/H_CLEAR_REF.  Returns the pre-clear REF|CHG
 * state, or -1 if the PTE is gone.  On broken KVM (no H_CLEAR_*) we can
 * only report, not clear; see the pessimistic fallback below.
 */
static int64_t
mphyp_pte_clear(mmu_t mmu, struct pvo_entry *pvo, uint64_t ptebit)
{
	struct rm_priotracker track;
	int64_t refchg;
	uint64_t ptelo, junk;
	int err;

	/*
	 * This involves two steps (synch and clear) so we need the entry
	 * not to change in the middle. We are protected against deliberate
	 * unset by virtue of holding the pmap lock. Protection against
	 * incidental unset (page table eviction) comes from holding the
	 * shared eviction lock.
	 */
	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);
	rm_rlock(&mphyp_eviction_lock, &track);

	refchg = mphyp_pte_synch(mmu, pvo);
	if (refchg < 0) {
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg);
	}

	if (brokenkvm) {
		/*
		 * No way to clear either bit, which is total madness.
		 * Pessimistically claim that, once modified, it stays so
		 * forever and that it is never referenced.
		 */
		rm_runlock(&mphyp_eviction_lock, &track);
		return (refchg & ~LPTE_REF);
	}

	if (ptebit & LPTE_CHG) {
		err = phyp_pft_hcall(H_CLEAR_MOD, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page change bit: %d", err));
		refchg |= (ptelo & LPTE_CHG);
	}
	if (ptebit & LPTE_REF) {
		err = phyp_pft_hcall(H_CLEAR_REF, 0, pvo->pvo_pte.slot, 0, 0,
		    &ptelo, &junk, &junk);
		KASSERT(err == H_SUCCESS,
		    ("Error clearing page reference bit: %d", err));
		refchg |= (ptelo & LPTE_REF);
	}

	rm_runlock(&mphyp_eviction_lock, &track);

	return (refchg);
}

/*
 * Remove the pvo's PTE from the hypervisor page table (H_REMOVE with
 * H_AVPN so only a matching entry is removed).  Returns the removed
 * entry's REF|CHG bits, or -1 if the entry had already been evicted
 * (H_NOT_FOUND), in which case the overflow counter is adjusted back.
 */
static int64_t
mphyp_pte_unset(mmu_t mmu, struct pvo_entry *pvo)
{
	struct lpte pte;
	uint64_t junk;
	int err;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	moea64_pte_from_pvo(pvo, &pte);

	err = phyp_pft_hcall(H_REMOVE, H_AVPN, pvo->pvo_pte.slot,
	    pte.pte_hi & LPTE_AVPN_MASK, 0, &pte.pte_hi, &pte.pte_lo,
	    &junk);
	KASSERT(err == H_SUCCESS || err == H_NOT_FOUND,
	    ("Error removing page: %d", err));

	if (err == H_NOT_FOUND) {
		moea64_pte_overflow--;
		return (-1);
	}

	return (pte.pte_lo & (LPTE_REF | LPTE_CHG));
}

/*
 * Choose a victim slot within the 8-entry PTEG starting at 'ptegbase'.
 * Wired entries are never chosen; an invalid or not-recently-referenced
 * entry is preferred and returned immediately.  Returns the slot index
 * with the victim PTE copied to *to_evict, or (uintptr_t)-1 if every
 * slot is wired.  Caller must hold the eviction lock exclusively.
 */
static uintptr_t
mphyp_pte_spillable_ident(uintptr_t ptegbase, struct lpte *to_evict)
{
	uint64_t slot, junk, k;
	struct lpte pt;
	int i, j;

	/* Start at a random slot */
	i = mftb() % 8;
	k = -1;
	for (j = 0; j < 8; j++) {
		slot = ptegbase + (i + j) % 8;
		phyp_pft_hcall(H_READ, 0, slot, 0, 0, &pt.pte_hi,
		    &pt.pte_lo, &junk);

		if (pt.pte_hi & LPTE_WIRED)
			continue;

		/* This is a candidate, so remember it */
		k = slot;

		/* Try to get a page that has not been used lately */
		if (!(pt.pte_hi & LPTE_VALID) || !(pt.pte_lo & LPTE_REF)) {
			memcpy(to_evict, &pt, sizeof(struct lpte));
			return (k);
		}
	}

	if (k == -1)
		return (k);

	/* Fall back to the last unwired candidate; re-read it for the caller. */
	phyp_pft_hcall(H_READ, 0, k, 0, 0, &to_evict->pte_hi,
	    &to_evict->pte_lo, &junk);
	return (k);
}

/*
 * Insert the pvo's PTE via H_ENTER: try the primary PTEG, then the
 * secondary, and if both are full take the eviction write lock, pick a
 * spillable victim (in either PTEG), H_REMOVE it, and H_ENTER the new
 * PTE into its exact slot.  Updates pvo->pvo_pte.slot (and PVO_HID/
 * LPTE_HID when switching hash functions).  Returns 0 on success;
 * panics if no slot can be freed or the final insert fails.
 */
static int
mphyp_pte_insert(mmu_t mmu, struct pvo_entry *pvo)
{
	struct rm_priotracker track;
	int64_t result;
	struct lpte evicted, pte;
	uint64_t index, junk, lastptelo;

	PMAP_LOCK_ASSERT(pvo->pvo_pmap, MA_OWNED);

	/* Initialize PTE */
	moea64_pte_from_pvo(pvo, &pte);
	evicted.pte_hi = 0;

	/* Make sure further insertion is locked out during evictions */
	rm_rlock(&mphyp_eviction_lock, &track);

	/*
	 * First try primary hash.
	 */
	pvo->pvo_pte.slot &= ~7UL; /* Base slot address */
	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Page insertion error: %ld "
	    "(ptegidx: %#zx/%#x, PTE %#lx/%#lx", result, pvo->pvo_pte.slot,
	    moea64_pteg_count, pte.pte_hi, pte.pte_lo));

	/*
	 * Next try secondary hash.
	 */
	pvo->pvo_vaddr ^= PVO_HID;
	pte.pte_hi ^= LPTE_HID;
	pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);

	result = phyp_pft_hcall(H_ENTER, 0, pvo->pvo_pte.slot,
	    pte.pte_hi, pte.pte_lo, &index, &evicted.pte_lo, &junk);
	if (result == H_SUCCESS) {
		rm_runlock(&mphyp_eviction_lock, &track);
		pvo->pvo_pte.slot = index;
		return (0);
	}
	KASSERT(result == H_PTEG_FULL, ("Secondary page insertion error: %ld",
	    result));

	/*
	 * Out of luck. Find a PTE to sacrifice.
	 */

	/* Lock out all insertions for a bit */
	rm_runlock(&mphyp_eviction_lock, &track);
	rm_wlock(&mphyp_eviction_lock);

	index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	if (index == -1L) {
		/* Try other hash table? */
		pvo->pvo_vaddr ^= PVO_HID;
		pte.pte_hi ^= LPTE_HID;
		pvo->pvo_pte.slot ^= (moea64_pteg_mask << 3);
		index = mphyp_pte_spillable_ident(pvo->pvo_pte.slot, &evicted);
	}

	if (index == -1L) {
		/* No freeable slots in either PTEG? We're hosed. */
		rm_wunlock(&mphyp_eviction_lock);
		panic("mphyp_pte_insert: overflow");
		return (-1);
	}

	/* Victim acquired: update page before waving goodbye */
	if (evicted.pte_hi & LPTE_VALID) {
		result = phyp_pft_hcall(H_REMOVE, H_AVPN, index,
		    evicted.pte_hi & LPTE_AVPN_MASK, 0, &junk, &lastptelo,
		    &junk);
		moea64_pte_overflow++;
		KASSERT(result == H_SUCCESS,
		    ("Error evicting page: %d", (int)result));
	}

	/*
	 * Set the new PTE.
	 */
	result = phyp_pft_hcall(H_ENTER, H_EXACT, index, pte.pte_hi,
	    pte.pte_lo, &index, &evicted.pte_lo, &junk);
	rm_wunlock(&mphyp_eviction_lock); /* All clear */

	pvo->pvo_pte.slot = index;
	if (result == H_SUCCESS)
		return (0);

	panic("Page replacement error: %ld", result);
	return (result);
}