/*	$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $");

#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
	unsigned		pc_offset;
	size_t			pc_size;
	percpu_callback_t	pc_ctor;
	percpu_callback_t	pc_dtor;
	void			*pc_cookie;
	LIST_ENTRY(percpu)	pc_list;
};

static krwlock_t	percpu_swap_lock	__cacheline_aligned;
static vmem_t *		percpu_offset_arena	__read_mostly;
static struct {
	kmutex_t	lock;
	unsigned int	nextoff;
	LIST_HEAD(, percpu) ctor_list;
	struct lwp	*busy;
	kcondvar_t	cv;
} percpu_allocation __cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
	const unsigned int off = pc->pc_offset;

	KASSERT(off < percpu_allocation.nextoff);
	return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * swap *pcc and *newpcc unless anyone has beaten us.
	 */
	rw_enter(&percpu_swap_lock, RW_WRITER);
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		tmp = *pcc;

		/*
		 * block interrupts so that we don't lose their modifications.
		 */

		s = splhigh();

		/*
		 * copy data to new storage.
		 */

		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

		/*
		 * this assignment needs to be atomic for percpu_getptr_remote.
		 */

		pcc->pcc_data = newpcc->pcc_data;

		splx(s);

		pcc->pcc_size = newpcc->pcc_size;
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		KASSERT(pcc.pcc_size <= size);
		if (pcc.pcc_data != NULL) {
			kmem_free(pcc.pcc_data, pcc.pcc_size);
		}
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
	unsigned int offset;
	unsigned int nextoff;

	ASSERT_SLEEPABLE();
	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return ENOMEM;

	size = roundup(size, PERCPU_IMPORT_SIZE);
	mutex_enter(&percpu_allocation.lock);
	offset = percpu_allocation.nextoff;
	percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
	mutex_exit(&percpu_allocation.lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	*addrp = (vmem_addr_t)offset;
	return 0;
}
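
/*
 * Illustrative example (not exercised anywhere in this file): with
 * PERCPU_IMPORT_SIZE at 2048, the first percpu_alloc(16) makes vmem
 * import one 2048-byte span through this callback, so
 * percpu_allocation.nextoff advances by 2048 and every cpu's block is
 * enlarged to the new nextoff; subsequent small allocations are carved
 * out of that span by vmem and do not reach this callback again until
 * the span is exhausted.
 */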

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
	size_t sz = (uintptr_t)vp2;

	memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zero.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

	percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	ASSERT_SLEEPABLE();
	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
	percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
	LIST_INIT(&percpu_allocation.ctor_list);
	percpu_allocation.busy = NULL;
	cv_init(&percpu_allocation.cv, "percpu");

	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
	    IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 * => may be called for static CPUs afterward (typically just primary CPU)
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	struct percpu *pc;
	size_t size = percpu_allocation.nextoff; /* XXX racy */

	ASSERT_SLEEPABLE();

	/*
	 * For the primary CPU, prior percpu_create may have already
	 * triggered allocation, so there's nothing more for us to do
	 * here.
	 */
	if (pcc->pcc_size)
		return;
	KASSERT(pcc->pcc_data == NULL);

	/*
	 * Otherwise, allocate storage and, while the constructor list
	 * is locked, run constructors for all percpus on this CPU.
	 */
	pcc->pcc_size = size;
	if (size) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		percpu_allocation.busy = curlwp;
		LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
			KASSERT(pc->pc_ctor);
			mutex_exit(&percpu_allocation.lock);
			(*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
			    pc->pc_cookie, ci);
			mutex_enter(&percpu_allocation.lock);
		}
		KASSERT(percpu_allocation.busy == curlwp);
		percpu_allocation.busy = NULL;
		cv_broadcast(&percpu_allocation.cv);
		mutex_exit(&percpu_allocation.lock);
	}
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

	return percpu_create(size, NULL, NULL, NULL);
}

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized by ctor, or zeros if ctor is null
 * => percpu_free will call dtor first, if dtor is nonnull
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
	vmem_addr_t offset;
	percpu_t *pc;

	ASSERT_SLEEPABLE();
	(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
	    &offset);

	pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
	pc->pc_offset = offset;
	pc->pc_size = size;
	pc->pc_ctor = ctor;
	pc->pc_dtor = dtor;
	pc->pc_cookie = cookie;

	if (ctor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		/*
		 * Wait until nobody is using the list of percpus with
		 * constructors.
		 */
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		percpu_allocation.busy = curlwp;
		mutex_exit(&percpu_allocation.lock);

		/*
		 * Run the constructor for all CPUs.  We use a
		 * temporary buffer so that we need not hold the
		 * percpu_swap_lock while running the constructor.
		 */
		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			memset(buf, 0, size);
			(*ctor)(buf, cookie, ci);
			percpu_traverse_enter();
			memcpy(percpu_getptr_remote(pc, ci), buf, size);
			percpu_traverse_exit();
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);

		/*
		 * Insert the percpu into the list of percpus with
		 * constructors.  We are now done using the list, so it
		 * is safe for concurrent percpu_create or concurrent
		 * percpu_init_cpu to run.
		 */
		mutex_enter(&percpu_allocation.lock);
		KASSERT(percpu_allocation.busy == curlwp);
		percpu_allocation.busy = NULL;
		cv_broadcast(&percpu_allocation.cv);
		LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
		mutex_exit(&percpu_allocation.lock);
	} else {
		percpu_zero(pc, size);
	}

	return pc;
}
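
/*
 * Example (an illustrative sketch; the "foo" names are hypothetical and
 * not part of this interface): a subsystem that needs per-cpu state
 * with non-zero initial contents might do
 *
 *	static void
 *	foo_cpu_init(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_cpu *fc = ptr;
 *		struct foo_softc *sc = cookie;
 *
 *		fc->fc_sc = sc;
 *		fc->fc_watermark = FOO_DEFAULT_WATERMARK;
 *	}
 *
 *	static void
 *	foo_cpu_fini(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_cpu *fc = ptr;
 *
 *		KASSERT(fc->fc_inflight == 0);
 *	}
 *
 *	sc->sc_percpu = percpu_create(sizeof(struct foo_cpu),
 *	    foo_cpu_init, foo_cpu_fini, sc);
 *
 * The constructor is also run for CPUs attached after this call (see
 * percpu_init_cpu), and percpu_free runs the destructor on every CPU
 * before the storage is released.
 */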

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	ASSERT_SLEEPABLE();
	KASSERT(size == pc->pc_size);

	/*
	 * If there's a constructor, take the percpu off the list of
	 * percpus with constructors, but first wait until nobody is
	 * using the list.
	 */
	if (pc->pc_ctor) {
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		LIST_REMOVE(pc, pc_list);
		mutex_exit(&percpu_allocation.lock);
	}

	/* If there's a destructor, run it now for all CPUs. */
	if (pc->pc_dtor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			percpu_traverse_enter();
			memcpy(buf, percpu_getptr_remote(pc, ci), size);
			explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
			percpu_traverse_exit();
			(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	}

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
	kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

	kpreempt_disable();
	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after the caller is done with the
 *    per-cpu data
 */

void
percpu_putref(percpu_t *pc)
{

	kpreempt_enable();
}
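
/*
 * Example (an illustrative sketch; foo_counter_percpu is hypothetical):
 * a per-cpu event counter allocated with percpu_alloc(sizeof(uint64_t))
 * can be bumped without any interprocessor synchronization:
 *
 *	uint64_t *cnt;
 *
 *	cnt = percpu_getref(foo_counter_percpu);
 *	(*cnt)++;
 *	percpu_putref(foo_counter_percpu);
 *
 * Preemption is disabled between the two calls, so the pointer refers
 * to the current cpu's slot for the whole update and must not be used
 * after percpu_putref.
 */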

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	ASSERT_SLEEPABLE();
	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **current** CPU (or, really, arbitrary CPU,
 *    in case of preemption)
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls.
 *    e.g. it's illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	percpu_traverse_enter();
	for (CPU_INFO_FOREACH(cii, ci)) {
		(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
	}
	percpu_traverse_exit();
}

struct percpu_xcall_ctx {
	percpu_callback_t ctx_cb;
	void		*ctx_arg;
};

static void
percpu_xcfunc(void * const v1, void * const v2)
{
	percpu_t * const pc = v1;
	struct percpu_xcall_ctx * const ctx = v2;

	(*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
	percpu_putref(pc);
}

/*
 * percpu_foreach_xcall: call the specified callback function for each
 * cpu.  This version uses an xcall to run the callback on each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **remote** CPU in soft-interrupt context
 *    (at the specified soft interrupt priority).
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it may be
 *    executed in soft-interrupt context.  e.g. it's illegal for
 *    a callback function to sleep for memory allocation.
 */
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
    void *arg)
{
	struct percpu_xcall_ctx ctx = {
		.ctx_cb = cb,
		.ctx_arg = arg,
	};
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
	}
}
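
/*
 * Example (an illustrative sketch; the foo_* names are hypothetical):
 * totalling the per-cpu counters from the percpu_getref example above:
 *
 *	static void
 *	foo_count_cb(void *ptr, void *arg, struct cpu_info *ci)
 *	{
 *		const uint64_t *cnt = ptr;
 *		uint64_t *sump = arg;
 *
 *		*sump += *cnt;
 *	}
 *
 *	uint64_t sum = 0;
 *
 *	percpu_foreach(foo_counter_percpu, foo_count_cb, &sum);
 *
 * If the callback must run on the cpu that owns the data (for example
 * to fold in state that only that cpu touches), percpu_foreach_xcall
 * can be used with the same callback, at the cost of one cross call
 * per cpu, with xcflags selecting the cross-call priority:
 *
 *	percpu_foreach_xcall(foo_counter_percpu, 0, foo_count_cb, &sum);
 */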