/*	$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $");

#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
	unsigned		pc_offset;
	size_t			pc_size;
	percpu_callback_t	pc_ctor;
	percpu_callback_t	pc_dtor;
	void			*pc_cookie;
	LIST_ENTRY(percpu)	pc_list;
};

static krwlock_t	percpu_swap_lock	__cacheline_aligned;
static vmem_t *		percpu_offset_arena	__read_mostly;
static struct {
	kmutex_t	lock;
	unsigned int	nextoff;
	LIST_HEAD(, percpu) ctor_list;
	struct lwp	*busy;
	kcondvar_t	cv;
} percpu_allocation __cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
	const unsigned int off = pc->pc_offset;

	KASSERT(off < percpu_allocation.nextoff);
	return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * swap *pcc and *newpcc unless someone has beaten us to it.
	 */
	rw_enter(&percpu_swap_lock, RW_WRITER);
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		tmp = *pcc;

		/*
		 * block interrupts so that we don't lose their modifications.
		 */

		s = splhigh();

		/*
		 * copy data to new storage.
		 */

		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

		/*
		 * this assignment needs to be atomic for percpu_getptr_remote.
		 */

		pcc->pcc_data = newpcc->pcc_data;

		splx(s);

		pcc->pcc_size = newpcc->pcc_size;
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		KASSERT(pcc.pcc_size <= size);
		if (pcc.pcc_data != NULL) {
			kmem_free(pcc.pcc_data, pcc.pcc_size);
		}
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
	unsigned int offset;
	unsigned int nextoff;

	ASSERT_SLEEPABLE();
	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return ENOMEM;

	size = roundup(size, PERCPU_IMPORT_SIZE);
	mutex_enter(&percpu_allocation.lock);
	offset = percpu_allocation.nextoff;
	percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
	mutex_exit(&percpu_allocation.lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	*addrp = (vmem_addr_t)offset;
	return 0;
}

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
	size_t sz = (uintptr_t)vp2;

	memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zero.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

	percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	ASSERT_SLEEPABLE();
	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
	percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
	LIST_INIT(&percpu_allocation.ctor_list);
	percpu_allocation.busy = NULL;
	cv_init(&percpu_allocation.cv, "percpu");

	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
	    IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 * => may be called for static CPUs afterward (typically just primary CPU)
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	struct percpu *pc;
	size_t size = percpu_allocation.nextoff; /* XXX racy */

	ASSERT_SLEEPABLE();

	/*
	 * For the primary CPU, prior percpu_create may have already
	 * triggered allocation, so there's nothing more for us to do
	 * here.
	 */
	if (pcc->pcc_size)
		return;
	KASSERT(pcc->pcc_data == NULL);

	/*
	 * Otherwise, allocate storage and, while the constructor list
	 * is locked, run constructors for all percpus on this CPU.
	 */
	pcc->pcc_size = size;
	if (size) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		percpu_allocation.busy = curlwp;
		LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
			KASSERT(pc->pc_ctor);
			mutex_exit(&percpu_allocation.lock);
			(*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
			    pc->pc_cookie, ci);
			mutex_enter(&percpu_allocation.lock);
		}
		KASSERT(percpu_allocation.busy == curlwp);
		percpu_allocation.busy = NULL;
		cv_broadcast(&percpu_allocation.cv);
		mutex_exit(&percpu_allocation.lock);
	}
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered as an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

	return percpu_create(size, NULL, NULL, NULL);
}
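
/*
 * Typical lifecycle (an illustrative sketch only; "foo_percpu" is a
 * hypothetical variable, not part of this file):
 *
 *	percpu_t *foo_percpu;
 *
 *	foo_percpu = percpu_alloc(sizeof(uint64_t));
 *	...
 *	percpu_free(foo_percpu, sizeof(uint64_t));
 *
 * The storage starts out zeroed on every cpu, and percpu_free must be
 * passed the same size that was passed to percpu_alloc.
 */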

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered as an expensive and rare operation.
 * => allocated storage is initialized by ctor, or zeros if ctor is null
 * => percpu_free will call dtor first, if dtor is nonnull
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
	vmem_addr_t offset;
	percpu_t *pc;

	ASSERT_SLEEPABLE();
	(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
	    &offset);

	pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
	pc->pc_offset = offset;
	pc->pc_size = size;
	pc->pc_ctor = ctor;
	pc->pc_dtor = dtor;
	pc->pc_cookie = cookie;

	if (ctor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		/*
		 * Wait until nobody is using the list of percpus with
		 * constructors.
		 */
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		percpu_allocation.busy = curlwp;
		mutex_exit(&percpu_allocation.lock);

		/*
		 * Run the constructor for all CPUs.  We use a
		 * temporary buffer so that we need not hold the
		 * percpu_swap_lock while running the constructor.
		 */
		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			memset(buf, 0, size);
			(*ctor)(buf, cookie, ci);
			percpu_traverse_enter();
			memcpy(percpu_getptr_remote(pc, ci), buf, size);
			percpu_traverse_exit();
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);

		/*
		 * Insert the percpu into the list of percpus with
		 * constructors.  We are now done using the list, so it
		 * is safe for concurrent percpu_create or concurrent
		 * percpu_init_cpu to run.
		 */
		mutex_enter(&percpu_allocation.lock);
		KASSERT(percpu_allocation.busy == curlwp);
		percpu_allocation.busy = NULL;
		cv_broadcast(&percpu_allocation.cv);
		LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
		mutex_exit(&percpu_allocation.lock);
	} else {
		percpu_zero(pc, size);
	}

	return pc;
}
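
/*
 * Constructor/destructor sketch (illustrative only; foo_pcpu, foo_ctor,
 * foo_dtor, foo_percpu and FOO_BUFSIZE are hypothetical names):
 *
 *	struct foo_pcpu {
 *		void	*fp_buf;
 *	};
 *
 *	static void
 *	foo_ctor(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = ptr;
 *
 *		fp->fp_buf = kmem_alloc(FOO_BUFSIZE, KM_SLEEP);
 *	}
 *
 *	static void
 *	foo_dtor(void *ptr, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = ptr;
 *
 *		kmem_free(fp->fp_buf, FOO_BUFSIZE);
 *	}
 *
 *	foo_percpu = percpu_create(sizeof(struct foo_pcpu),
 *	    foo_ctor, foo_dtor, NULL);
 *
 * The constructor runs once for every cpu, including cpus brought up
 * later via percpu_init_cpu; the destructor runs for every cpu from
 * percpu_free.  Both may sleep, as noted above.
 */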

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered as an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	ASSERT_SLEEPABLE();
	KASSERT(size == pc->pc_size);

	/*
	 * If there's a constructor, take the percpu off the list of
	 * percpus with constructors, but first wait until nobody is
	 * using the list.
	 */
	if (pc->pc_ctor) {
		mutex_enter(&percpu_allocation.lock);
		while (percpu_allocation.busy)
			cv_wait(&percpu_allocation.cv,
			    &percpu_allocation.lock);
		LIST_REMOVE(pc, pc_list);
		mutex_exit(&percpu_allocation.lock);
	}

	/* If there's a destructor, run it now for all CPUs.  */
	if (pc->pc_dtor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			percpu_traverse_enter();
			memcpy(buf, percpu_getptr_remote(pc, ci), size);
			explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
			percpu_traverse_exit();
			(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	}

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
	kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

	kpreempt_disable();
	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after caller is done with per-cpu
 *    data
 */

void
percpu_putref(percpu_t *pc)
{

	kpreempt_enable();
}
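
/*
 * Access sketch for percpu_getref/percpu_putref (illustrative only;
 * foo_percpu is a hypothetical percpu_t holding a uint64_t counter):
 *
 *	uint64_t *p;
 *
 *	p = percpu_getref(foo_percpu);
 *	(*p)++;
 *	percpu_putref(foo_percpu);
 *
 * Preemption is disabled between the two calls, so the pointer refers
 * to the current cpu's instance for the whole critical section; it must
 * not be cached or dereferenced after percpu_putref.
 */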

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	ASSERT_SLEEPABLE();
	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **current** CPU (or, really, arbitrary CPU,
 *    in case of preemption)
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls.
 *    eg. it's illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	percpu_traverse_enter();
	for (CPU_INFO_FOREACH(cii, ci)) {
		(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
	}
	percpu_traverse_exit();
}
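
/*
 * Aggregation sketch using percpu_foreach (illustrative only; foo_percpu
 * and foo_sum_cb are hypothetical):
 *
 *	static void
 *	foo_sum_cb(void *ptr, void *arg, struct cpu_info *ci)
 *	{
 *		uint64_t *countp = ptr, *sump = arg;
 *
 *		*sump += *countp;
 *	}
 *
 *	uint64_t sum = 0;
 *	percpu_foreach(foo_percpu, foo_sum_cb, &sum);
 *
 * The callback runs with the traversal lock held and must not sleep,
 * per the constraints above.
 */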

struct percpu_xcall_ctx {
	percpu_callback_t  ctx_cb;
	void		  *ctx_arg;
};

static void
percpu_xcfunc(void * const v1, void * const v2)
{
	percpu_t * const pc = v1;
	struct percpu_xcall_ctx * const ctx = v2;

	(*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
	percpu_putref(pc);
}

/*
 * percpu_foreach_xcall: call the specified callback function for each
 * cpu.  This version uses an xcall to run the callback on each cpu.
 *
 * => must be called from thread context.
 * => callback executes on **remote** CPU in soft-interrupt context
 *    (at the specified soft interrupt priority).
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it may be
 *    executed in soft-interrupt context.  eg. it's illegal for
 *    a callback function to sleep for memory allocation.
 */
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
		     void *arg)
{
	struct percpu_xcall_ctx ctx = {
		.ctx_cb = cb,
		.ctx_arg = arg,
	};
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
	}
}
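
/*
 * Cross-call sketch using percpu_foreach_xcall (illustrative only;
 * foo_percpu and foo_reset_cb are hypothetical; xcflags 0 requests the
 * low-priority cross-call variant):
 *
 *	static void
 *	foo_reset_cb(void *ptr, void *arg, struct cpu_info *ci)
 *	{
 *		uint64_t *countp = ptr;
 *
 *		*countp = 0;
 *	}
 *
 *	percpu_foreach_xcall(foo_percpu, 0, foo_reset_cb, NULL);
 *
 * Each invocation runs on the cpu that owns the instance, via
 * percpu_getref in percpu_xcfunc above.
 */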