/*	$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $");

#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
	unsigned		pc_offset;
	size_t			pc_size;
	percpu_callback_t	pc_dtor;
	void			*pc_cookie;
};

static krwlock_t	percpu_swap_lock	__cacheline_aligned;
static kmutex_t		percpu_allocation_lock	__cacheline_aligned;
static vmem_t *		percpu_offset_arena	__cacheline_aligned;
static unsigned int	percpu_nextoff		__cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
	const unsigned int off = pc->pc_offset;

	KASSERT(off < percpu_nextoff);
	return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * swap *pcc and *newpcc unless someone has beaten us to it.
	 */
	rw_enter(&percpu_swap_lock, RW_WRITER);
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		tmp = *pcc;

		/*
		 * block interrupts so that we don't lose their modifications.
		 */

		s = splhigh();

		/*
		 * copy data to the new storage.
		 */

		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

		/*
		 * this assignment needs to be atomic for percpu_getptr_remote.
		 */

		pcc->pcc_data = newpcc->pcc_data;

		splx(s);

		pcc->pcc_size = newpcc->pcc_size;
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that each cpu's percpu_cpu_t has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		KASSERT(pcc.pcc_size <= size);
		if (pcc.pcc_data != NULL) {
			kmem_free(pcc.pcc_data, pcc.pcc_size);
		}
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
	unsigned int offset;
	unsigned int nextoff;

	ASSERT_SLEEPABLE();
	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return ENOMEM;

	size = roundup(size, PERCPU_IMPORT_SIZE);
	mutex_enter(&percpu_allocation_lock);
	offset = percpu_nextoff;
	percpu_nextoff = nextoff = percpu_nextoff + size;
	mutex_exit(&percpu_allocation_lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	*addrp = (vmem_addr_t)offset;
	return 0;
}

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
	size_t sz = (uintptr_t)vp2;

	memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zeros.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

	percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	ASSERT_SLEEPABLE();
	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE);
	percpu_nextoff = PERCPU_QUANTUM_SIZE;

	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
	    IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	size_t size = percpu_nextoff; /* XXX racy */

	ASSERT_SLEEPABLE();
	pcc->pcc_size = size;
	if (size) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
	}
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

	return percpu_create(size, NULL, NULL, NULL);
}

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized by ctor, or with zeros if ctor is null
 * => percpu_free will call dtor first, if dtor is nonnull
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
	vmem_addr_t offset;
	percpu_t *pc;

	ASSERT_SLEEPABLE();
	(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
	    &offset);

	pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
	pc->pc_offset = offset;
	pc->pc_size = size;
	pc->pc_dtor = dtor;
	pc->pc_cookie = cookie;

	if (ctor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			memset(buf, 0, size);
			(*ctor)(buf, cookie, ci);
			percpu_traverse_enter();
			memcpy(percpu_getptr_remote(pc, ci), buf, size);
			percpu_traverse_exit();
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	} else {
		percpu_zero(pc, size);
	}

	return pc;
}
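
/*
 * Example sketch (hypothetical names: struct foo, foo_ctor, foo_dtor):
 * per-cpu storage whose instances hold a mutex that must be initialized
 * on each cpu and destroyed again on percpu_free:
 *
 *	static void
 *	foo_ctor(void *p, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo *f = p;
 *
 *		mutex_init(&f->f_lock, MUTEX_DEFAULT, IPL_NONE);
 *	}
 *
 *	static void
 *	foo_dtor(void *p, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo *f = p;
 *
 *		mutex_destroy(&f->f_lock);
 *	}
 *
 *	percpu_t *pc;
 *
 *	pc = percpu_create(sizeof(struct foo), foo_ctor, foo_dtor, NULL);
 *	...
 *	percpu_free(pc, sizeof(struct foo));
 */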

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	ASSERT_SLEEPABLE();
	KASSERT(size == pc->pc_size);

	if (pc->pc_dtor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			percpu_traverse_enter();
			memcpy(buf, percpu_getptr_remote(pc, ci), size);
			explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
			percpu_traverse_exit();
			(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	}

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
	kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

	kpreempt_disable();
	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after the caller is done with
 *    per-cpu data
 */

void
percpu_putref(percpu_t *pc)
{

	kpreempt_enable();
}
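
/*
 * Example sketch (hypothetical; assumes pc was obtained from
 * percpu_alloc(sizeof(uint64_t))): bump the current cpu's counter
 * without any interprocessor synchronization:
 *
 *	uint64_t *p;
 *
 *	p = percpu_getref(pc);
 *	(*p)++;
 *	percpu_putref(pc);
 */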

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access a remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	ASSERT_SLEEPABLE();
	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => called in thread context.
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls.
 *    e.g. it's illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	percpu_traverse_enter();
	for (CPU_INFO_FOREACH(cii, ci)) {
		(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
	}
	percpu_traverse_exit();
}
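
/*
 * Example sketch (hypothetical; sum_cb and the uint64_t counter layout
 * are assumptions): total the per-cpu counters from the percpu_getref
 * example above.  The callback only adds, so it obeys the no-sleep rule:
 *
 *	static void
 *	sum_cb(void *p, void *arg, struct cpu_info *ci)
 *	{
 *		uint64_t *sump = arg;
 *
 *		*sump += *(uint64_t *)p;
 *	}
 *
 *	uint64_t sum = 0;
 *
 *	percpu_foreach(pc, sum_cb, &sum);
 */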