srat.c revision 322996
1/*- 2 * Copyright (c) 2010 Hudson River Trading LLC 3 * Written by: John H. Baldwin <jhb@FreeBSD.org> 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28#include <sys/cdefs.h> 29__FBSDID("$FreeBSD: stable/11/sys/x86/acpica/srat.c 322996 2017-08-29 07:01:15Z mav $"); 30 31#include "opt_vm.h" 32 33#include <sys/param.h> 34#include <sys/bus.h> 35#include <sys/kernel.h> 36#include <sys/lock.h> 37#include <sys/mutex.h> 38#include <sys/smp.h> 39#include <sys/vmmeter.h> 40#include <vm/vm.h> 41#include <vm/pmap.h> 42#include <vm/vm_param.h> 43#include <vm/vm_page.h> 44#include <vm/vm_phys.h> 45 46#include <contrib/dev/acpica/include/acpi.h> 47#include <contrib/dev/acpica/include/aclocal.h> 48#include <contrib/dev/acpica/include/actables.h> 49 50#include <machine/intr_machdep.h> 51#include <x86/apicvar.h> 52 53#include <dev/acpica/acpivar.h> 54 55#if MAXMEMDOM > 1 56struct cpu_info { 57 int enabled:1; 58 int has_memory:1; 59 int domain; 60} cpus[MAX_APIC_ID + 1]; 61 62struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; 63int num_mem; 64 65static ACPI_TABLE_SRAT *srat; 66static vm_paddr_t srat_physaddr; 67 68static int domain_pxm[MAXMEMDOM]; 69static int ndomain; 70 71static ACPI_TABLE_SLIT *slit; 72static vm_paddr_t slit_physaddr; 73static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; 74 75static void srat_walk_table(acpi_subtable_handler *handler, void *arg); 76 77/* 78 * SLIT parsing. 79 */ 80 81static void 82slit_parse_table(ACPI_TABLE_SLIT *s) 83{ 84 int i, j; 85 int i_domain, j_domain; 86 int offset = 0; 87 uint8_t e; 88 89 /* 90 * This maps the SLIT data into the VM-domain centric view. 91 * There may be sparse entries in the PXM namespace, so 92 * remap them to a VM-domain ID and if it doesn't exist, 93 * skip it. 94 * 95 * It should result in a packed 2d array of VM-domain 96 * locality information entries. 97 */ 98 99 if (bootverbose) 100 printf("SLIT.Localities: %d\n", (int) s->LocalityCount); 101 for (i = 0; i < s->LocalityCount; i++) { 102 i_domain = acpi_map_pxm_to_vm_domainid(i); 103 if (i_domain < 0) 104 continue; 105 106 if (bootverbose) 107 printf("%d: ", i); 108 for (j = 0; j < s->LocalityCount; j++) { 109 j_domain = acpi_map_pxm_to_vm_domainid(j); 110 if (j_domain < 0) 111 continue; 112 e = s->Entry[i * s->LocalityCount + j]; 113 if (bootverbose) 114 printf("%d ", (int) e); 115 /* 255 == "no locality information" */ 116 if (e == 255) 117 vm_locality_table[offset] = -1; 118 else 119 vm_locality_table[offset] = e; 120 offset++; 121 } 122 if (bootverbose) 123 printf("\n"); 124 } 125} 126 127/* 128 * Look for an ACPI System Locality Distance Information Table ("SLIT") 129 */ 130static int 131parse_slit(void) 132{ 133 134 if (resource_disabled("slit", 0)) { 135 return (-1); 136 } 137 138 slit_physaddr = acpi_find_table(ACPI_SIG_SLIT); 139 if (slit_physaddr == 0) { 140 return (-1); 141 } 142 143 /* 144 * Make a pass over the table to populate the cpus[] and 145 * mem_info[] tables. 146 */ 147 slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT); 148 slit_parse_table(slit); 149 acpi_unmap_table(slit); 150 slit = NULL; 151 152#ifdef VM_NUMA_ALLOC 153 /* Tell the VM about it! */ 154 mem_locality = vm_locality_table; 155#endif 156 return (0); 157} 158 159/* 160 * SRAT parsing. 161 */ 162 163/* 164 * Returns true if a memory range overlaps with at least one range in 165 * phys_avail[]. 166 */ 167static int 168overlaps_phys_avail(vm_paddr_t start, vm_paddr_t end) 169{ 170 int i; 171 172 for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) { 173 if (phys_avail[i + 1] <= start) 174 continue; 175 if (phys_avail[i] < end) 176 return (1); 177 break; 178 } 179 return (0); 180 181} 182 183static void 184srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg) 185{ 186 ACPI_SRAT_CPU_AFFINITY *cpu; 187 ACPI_SRAT_X2APIC_CPU_AFFINITY *x2apic; 188 ACPI_SRAT_MEM_AFFINITY *mem; 189 int domain, i, slot; 190 191 switch (entry->Type) { 192 case ACPI_SRAT_TYPE_CPU_AFFINITY: 193 cpu = (ACPI_SRAT_CPU_AFFINITY *)entry; 194 domain = cpu->ProximityDomainLo | 195 cpu->ProximityDomainHi[0] << 8 | 196 cpu->ProximityDomainHi[1] << 16 | 197 cpu->ProximityDomainHi[2] << 24; 198 if (bootverbose) 199 printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", 200 cpu->ApicId, domain, 201 (cpu->Flags & ACPI_SRAT_CPU_ENABLED) ? 202 "enabled" : "disabled"); 203 if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) 204 break; 205 if (cpu->ApicId > MAX_APIC_ID) { 206 printf("SRAT: Ignoring local APIC ID %u (too high)\n", 207 cpu->ApicId); 208 break; 209 } 210 211 if (cpus[cpu->ApicId].enabled) { 212 printf("SRAT: Duplicate local APIC ID %u\n", 213 cpu->ApicId); 214 *(int *)arg = ENXIO; 215 break; 216 } 217 cpus[cpu->ApicId].domain = domain; 218 cpus[cpu->ApicId].enabled = 1; 219 break; 220 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: 221 x2apic = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)entry; 222 if (bootverbose) 223 printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", 224 x2apic->ApicId, x2apic->ProximityDomain, 225 (x2apic->Flags & ACPI_SRAT_CPU_ENABLED) ? 226 "enabled" : "disabled"); 227 if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED)) 228 break; 229 if (x2apic->ApicId > MAX_APIC_ID) { 230 printf("SRAT: Ignoring local APIC ID %u (too high)\n", 231 x2apic->ApicId); 232 break; 233 } 234 235 KASSERT(!cpus[x2apic->ApicId].enabled, 236 ("Duplicate local APIC ID %u", x2apic->ApicId)); 237 cpus[x2apic->ApicId].domain = x2apic->ProximityDomain; 238 cpus[x2apic->ApicId].enabled = 1; 239 break; 240 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: 241 mem = (ACPI_SRAT_MEM_AFFINITY *)entry; 242 if (bootverbose) 243 printf( 244 "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n", 245 mem->ProximityDomain, (uintmax_t)mem->BaseAddress, 246 (uintmax_t)mem->Length, 247 (mem->Flags & ACPI_SRAT_MEM_ENABLED) ? 248 "enabled" : "disabled"); 249 if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) 250 break; 251 if (!overlaps_phys_avail(mem->BaseAddress, 252 mem->BaseAddress + mem->Length)) { 253 printf("SRAT: Ignoring memory at addr 0x%jx\n", 254 (uintmax_t)mem->BaseAddress); 255 break; 256 } 257 if (num_mem == VM_PHYSSEG_MAX) { 258 printf("SRAT: Too many memory regions\n"); 259 *(int *)arg = ENXIO; 260 break; 261 } 262 slot = num_mem; 263 for (i = 0; i < num_mem; i++) { 264 if (mem_info[i].end <= mem->BaseAddress) 265 continue; 266 if (mem_info[i].start < 267 (mem->BaseAddress + mem->Length)) { 268 printf("SRAT: Overlapping memory entries\n"); 269 *(int *)arg = ENXIO; 270 return; 271 } 272 slot = i; 273 } 274 for (i = num_mem; i > slot; i--) 275 mem_info[i] = mem_info[i - 1]; 276 mem_info[slot].start = mem->BaseAddress; 277 mem_info[slot].end = mem->BaseAddress + mem->Length; 278 mem_info[slot].domain = mem->ProximityDomain; 279 num_mem++; 280 break; 281 } 282} 283 284/* 285 * Ensure each memory domain has at least one CPU and that each CPU 286 * has at least one memory domain. 287 */ 288static int 289check_domains(void) 290{ 291 int found, i, j; 292 293 for (i = 0; i < num_mem; i++) { 294 found = 0; 295 for (j = 0; j <= MAX_APIC_ID; j++) 296 if (cpus[j].enabled && 297 cpus[j].domain == mem_info[i].domain) { 298 cpus[j].has_memory = 1; 299 found++; 300 } 301 if (!found) { 302 printf("SRAT: No CPU found for memory domain %d\n", 303 mem_info[i].domain); 304 return (ENXIO); 305 } 306 } 307 for (i = 0; i <= MAX_APIC_ID; i++) 308 if (cpus[i].enabled && !cpus[i].has_memory) { 309 printf("SRAT: No memory found for CPU %d\n", i); 310 return (ENXIO); 311 } 312 return (0); 313} 314 315/* 316 * Check that the SRAT memory regions cover all of the regions in 317 * phys_avail[]. 318 */ 319static int 320check_phys_avail(void) 321{ 322 vm_paddr_t address; 323 int i, j; 324 325 /* j is the current offset into phys_avail[]. */ 326 address = phys_avail[0]; 327 j = 0; 328 for (i = 0; i < num_mem; i++) { 329 /* 330 * Consume as many phys_avail[] entries as fit in this 331 * region. 332 */ 333 while (address >= mem_info[i].start && 334 address <= mem_info[i].end) { 335 /* 336 * If we cover the rest of this phys_avail[] entry, 337 * advance to the next entry. 338 */ 339 if (phys_avail[j + 1] <= mem_info[i].end) { 340 j += 2; 341 if (phys_avail[j] == 0 && 342 phys_avail[j + 1] == 0) { 343 return (0); 344 } 345 address = phys_avail[j]; 346 } else 347 address = mem_info[i].end + 1; 348 } 349 } 350 printf("SRAT: No memory region found for 0x%jx - 0x%jx\n", 351 (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]); 352 return (ENXIO); 353} 354 355/* 356 * Renumber the memory domains to be compact and zero-based if not 357 * already. Returns an error if there are too many domains. 358 */ 359static int 360renumber_domains(void) 361{ 362 int i, j, slot; 363 364 /* Enumerate all the domains. */ 365 ndomain = 0; 366 for (i = 0; i < num_mem; i++) { 367 /* See if this domain is already known. */ 368 for (j = 0; j < ndomain; j++) { 369 if (domain_pxm[j] >= mem_info[i].domain) 370 break; 371 } 372 if (j < ndomain && domain_pxm[j] == mem_info[i].domain) 373 continue; 374 375 if (ndomain >= MAXMEMDOM) { 376 ndomain = 1; 377 printf("SRAT: Too many memory domains\n"); 378 return (EFBIG); 379 } 380 381 /* Insert the new domain at slot 'j'. */ 382 slot = j; 383 for (j = ndomain; j > slot; j--) 384 domain_pxm[j] = domain_pxm[j - 1]; 385 domain_pxm[slot] = mem_info[i].domain; 386 ndomain++; 387 } 388 389 /* Renumber each domain to its index in the sorted 'domain_pxm' list. */ 390 for (i = 0; i < ndomain; i++) { 391 /* 392 * If the domain is already the right value, no need 393 * to renumber. 394 */ 395 if (domain_pxm[i] == i) 396 continue; 397 398 /* Walk the cpu[] and mem_info[] arrays to renumber. */ 399 for (j = 0; j < num_mem; j++) 400 if (mem_info[j].domain == domain_pxm[i]) 401 mem_info[j].domain = i; 402 for (j = 0; j <= MAX_APIC_ID; j++) 403 if (cpus[j].enabled && cpus[j].domain == domain_pxm[i]) 404 cpus[j].domain = i; 405 } 406 407 return (0); 408} 409 410/* 411 * Look for an ACPI System Resource Affinity Table ("SRAT") 412 */ 413static int 414parse_srat(void) 415{ 416 int error; 417 418 if (resource_disabled("srat", 0)) 419 return (-1); 420 421 srat_physaddr = acpi_find_table(ACPI_SIG_SRAT); 422 if (srat_physaddr == 0) 423 return (-1); 424 425 /* 426 * Make a pass over the table to populate the cpus[] and 427 * mem_info[] tables. 428 */ 429 srat = acpi_map_table(srat_physaddr, ACPI_SIG_SRAT); 430 error = 0; 431 srat_walk_table(srat_parse_entry, &error); 432 acpi_unmap_table(srat); 433 srat = NULL; 434 if (error || check_domains() != 0 || check_phys_avail() != 0 || 435 renumber_domains() != 0) { 436 srat_physaddr = 0; 437 return (-1); 438 } 439 440#ifdef VM_NUMA_ALLOC 441 /* Point vm_phys at our memory affinity table. */ 442 vm_ndomains = ndomain; 443 mem_affinity = mem_info; 444#endif 445 446 return (0); 447} 448 449static void 450init_mem_locality(void) 451{ 452 int i; 453 454 /* 455 * For now, assume -1 == "no locality information for 456 * this pairing. 457 */ 458 for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++) 459 vm_locality_table[i] = -1; 460} 461 462static void 463parse_acpi_tables(void *dummy) 464{ 465 466 if (parse_srat() < 0) 467 return; 468 init_mem_locality(); 469 (void) parse_slit(); 470} 471SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables, 472 NULL); 473 474static void 475srat_walk_table(acpi_subtable_handler *handler, void *arg) 476{ 477 478 acpi_walk_subtables(srat + 1, (char *)srat + srat->Header.Length, 479 handler, arg); 480} 481 482/* 483 * Setup per-CPU domain IDs. 484 */ 485static void 486srat_set_cpus(void *dummy) 487{ 488 struct cpu_info *cpu; 489 struct pcpu *pc; 490 u_int i; 491 492 if (srat_physaddr == 0) 493 return; 494 for (i = 0; i < MAXCPU; i++) { 495 if (CPU_ABSENT(i)) 496 continue; 497 pc = pcpu_find(i); 498 KASSERT(pc != NULL, ("no pcpu data for CPU %u", i)); 499 cpu = &cpus[pc->pc_apic_id]; 500 if (!cpu->enabled) 501 panic("SRAT: CPU with APIC ID %u is not known", 502 pc->pc_apic_id); 503 pc->pc_domain = cpu->domain; 504 CPU_SET(i, &cpuset_domain[cpu->domain]); 505 if (bootverbose) 506 printf("SRAT: CPU %u has memory domain %d\n", i, 507 cpu->domain); 508 } 509} 510SYSINIT(srat_set_cpus, SI_SUB_CPU, SI_ORDER_ANY, srat_set_cpus, NULL); 511 512/* 513 * Map a _PXM value to a VM domain ID. 514 * 515 * Returns the domain ID, or -1 if no domain ID was found. 516 */ 517int 518acpi_map_pxm_to_vm_domainid(int pxm) 519{ 520 int i; 521 522 for (i = 0; i < ndomain; i++) { 523 if (domain_pxm[i] == pxm) 524 return (i); 525 } 526 527 return (-1); 528} 529 530#else /* MAXMEMDOM == 1 */ 531 532int 533acpi_map_pxm_to_vm_domainid(int pxm) 534{ 535 536 return (-1); 537} 538 539#endif /* MAXMEMDOM > 1 */ 540