/*	$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $	*/

/*-
 * Copyright (c)2007,2008 YAMAMOTO Takashi,
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * per-cpu storage.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.21 2020/02/01 12:49:02 riastradh Exp $");

#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>

#define	PERCPU_QUANTUM_SIZE	(ALIGNBYTES + 1)
#define	PERCPU_QCACHE_MAX	0
#define	PERCPU_IMPORT_SIZE	2048

struct percpu {
	unsigned		pc_offset;
	size_t			pc_size;
	percpu_callback_t	pc_dtor;
	void			*pc_cookie;
};

static krwlock_t	percpu_swap_lock	__cacheline_aligned;
static kmutex_t		percpu_allocation_lock	__cacheline_aligned;
static vmem_t *		percpu_offset_arena	__cacheline_aligned;
static unsigned int	percpu_nextoff		__cacheline_aligned;

static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{

	return &ci->ci_data.cpu_percpu;
}

static unsigned int
percpu_offset(percpu_t *pc)
{
	const unsigned int off = pc->pc_offset;

	KASSERT(off < percpu_nextoff);
	return off;
}

/*
 * percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
 */
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
	struct cpu_info * const ci = p1;
	percpu_cpu_t * const newpcc = p2;
	percpu_cpu_t * const pcc = cpu_percpu(ci);

	KASSERT(ci == curcpu() || !mp_online);

	/*
	 * swap *pcc and *newpcc unless somebody has beaten us to it.
	 */
	rw_enter(&percpu_swap_lock, RW_WRITER);
	if (newpcc->pcc_size > pcc->pcc_size) {
		percpu_cpu_t tmp;
		int s;

		tmp = *pcc;

		/*
		 * block interrupts so that we don't lose their modifications.
		 */

		s = splhigh();

		/*
		 * copy data to new storage.
		 */

		memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);

		/*
		 * this assignment needs to be atomic for percpu_getptr_remote.
		 */

		pcc->pcc_data = newpcc->pcc_data;

		splx(s);

		pcc->pcc_size = newpcc->pcc_size;
		*newpcc = tmp;
	}
	rw_exit(&percpu_swap_lock);
}

/*
 * percpu_cpu_enlarge: ensure that each cpu's percpu_cpu_t has enough space
 */

static void
percpu_cpu_enlarge(size_t size)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	for (CPU_INFO_FOREACH(cii, ci)) {
		percpu_cpu_t pcc;

		pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
		pcc.pcc_size = size;
		if (!mp_online) {
			percpu_cpu_swap(ci, &pcc);
		} else {
			uint64_t where;

			where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
			xc_wait(where);
		}
		KASSERT(pcc.pcc_size <= size);
		if (pcc.pcc_data != NULL) {
			kmem_free(pcc.pcc_data, pcc.pcc_size);
		}
	}
}

/*
 * percpu_backend_alloc: vmem import callback for percpu_offset_arena
 */

static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
    vm_flag_t vmflags, vmem_addr_t *addrp)
{
	unsigned int offset;
	unsigned int nextoff;

	ASSERT_SLEEPABLE();
	KASSERT(dummy == NULL);

	if ((vmflags & VM_NOSLEEP) != 0)
		return ENOMEM;

	size = roundup(size, PERCPU_IMPORT_SIZE);
	mutex_enter(&percpu_allocation_lock);
	offset = percpu_nextoff;
	percpu_nextoff = nextoff = percpu_nextoff + size;
	mutex_exit(&percpu_allocation_lock);

	percpu_cpu_enlarge(nextoff);

	*resultsize = size;
	*addrp = (vmem_addr_t)offset;
	return 0;
}

static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
	size_t sz = (uintptr_t)vp2;

	memset(vp, 0, sz);
}

/*
 * percpu_zero: initialize percpu storage with zero.
 */

static void
percpu_zero(percpu_t *pc, size_t sz)
{

	percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}

/*
 * percpu_init: subsystem initialization
 */

void
percpu_init(void)
{

	ASSERT_SLEEPABLE();
	rw_init(&percpu_swap_lock);
	mutex_init(&percpu_allocation_lock, MUTEX_DEFAULT, IPL_NONE);
	percpu_nextoff = PERCPU_QUANTUM_SIZE;

	percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
	    percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
	    IPL_NONE);
}

/*
 * percpu_init_cpu: cpu initialization
 *
 * => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
 */

void
percpu_init_cpu(struct cpu_info *ci)
{
	percpu_cpu_t * const pcc = cpu_percpu(ci);
	size_t size = percpu_nextoff; /* XXX racy */

	ASSERT_SLEEPABLE();
	pcc->pcc_size = size;
	if (size) {
		pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
	}
}

/*
 * percpu_alloc: allocate percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized with zeros.
 */

percpu_t *
percpu_alloc(size_t size)
{

	return percpu_create(size, NULL, NULL, NULL);
}
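
/*
 * Illustrative sketch of the percpu_alloc/percpu_free lifecycle for a
 * per-cpu uint64_t counter; "pcpu_counter" is a hypothetical name, not
 * something defined in this file:
 *
 *	percpu_t *pcpu_counter;
 *
 *	pcpu_counter = percpu_alloc(sizeof(uint64_t));
 *	... access via percpu_getref/percpu_putref or percpu_foreach ...
 *	percpu_free(pcpu_counter, sizeof(uint64_t));
 */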

/*
 * percpu_create: allocate percpu storage and associate ctor/dtor with it
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 * => allocated storage is initialized by ctor, or zeros if ctor is null
 * => percpu_free will call dtor first, if dtor is nonnull
 * => ctor or dtor may sleep, even on allocation
 */

percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
    void *cookie)
{
	vmem_addr_t offset;
	percpu_t *pc;

	ASSERT_SLEEPABLE();
	(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
	    &offset);

	pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
	pc->pc_offset = offset;
	pc->pc_size = size;
	pc->pc_dtor = dtor;
	pc->pc_cookie = cookie;

	if (ctor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			memset(buf, 0, size);
			(*ctor)(buf, cookie, ci);
			percpu_traverse_enter();
			memcpy(percpu_getptr_remote(pc, ci), buf, size);
			percpu_traverse_exit();
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	} else {
		percpu_zero(pc, size);
	}

	return pc;
}
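
/*
 * Illustrative sketch of percpu_create with a ctor/dtor pair; the
 * "foo_pcpu" structure and its callbacks are hypothetical, not part of
 * this file.  The ctor runs once per cpu on a zeroed buffer, and the
 * dtor runs once per cpu from percpu_free:
 *
 *	struct foo_pcpu {
 *		kmutex_t	fp_lock;
 *		uint64_t	fp_count;
 *	};
 *
 *	static void
 *	foo_pcpu_ctor(void *p, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = p;
 *
 *		mutex_init(&fp->fp_lock, MUTEX_DEFAULT, IPL_NONE);
 *	}
 *
 *	static void
 *	foo_pcpu_dtor(void *p, void *cookie, struct cpu_info *ci)
 *	{
 *		struct foo_pcpu *fp = p;
 *
 *		mutex_destroy(&fp->fp_lock);
 *	}
 *
 *	pc = percpu_create(sizeof(struct foo_pcpu),
 *	    foo_pcpu_ctor, foo_pcpu_dtor, NULL);
 *	...
 *	percpu_free(pc, sizeof(struct foo_pcpu));
 */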

/*
 * percpu_free: free percpu storage
 *
 * => called in thread context.
 * => considered an expensive and rare operation.
 */

void
percpu_free(percpu_t *pc, size_t size)
{

	ASSERT_SLEEPABLE();
	KASSERT(size == pc->pc_size);

	if (pc->pc_dtor) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		void *buf;

		buf = kmem_alloc(size, KM_SLEEP);
		for (CPU_INFO_FOREACH(cii, ci)) {
			percpu_traverse_enter();
			memcpy(buf, percpu_getptr_remote(pc, ci), size);
			explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
			percpu_traverse_exit();
			(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
		}
		explicit_memset(buf, 0, size);
		kmem_free(buf, size);
	}

	vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
	kmem_free(pc, sizeof(*pc));
}

/*
 * percpu_getref:
 *
 * => safe to be used in either thread or interrupt context
 * => disables preemption; must be bracketed with a percpu_putref()
 */

void *
percpu_getref(percpu_t *pc)
{

	kpreempt_disable();
	return percpu_getptr_remote(pc, curcpu());
}

/*
 * percpu_putref:
 *
 * => drops the preemption-disabled count after caller is done with per-cpu
 *    data
 */

void
percpu_putref(percpu_t *pc)
{

	kpreempt_enable();
}
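
/*
 * Illustrative sketch of a bracketed percpu_getref/percpu_putref access;
 * it assumes "pc" is a percpu_t allocated with percpu_alloc(sizeof(uint64_t)),
 * so only the local cpu's instance is touched between the two calls:
 *
 *	uint64_t *p;
 *
 *	p = percpu_getref(pc);
 *	*p += 1;
 *	percpu_putref(pc);
 */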

/*
 * percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
 * helpers to access remote cpu's percpu data.
 *
 * => called in thread context.
 * => percpu_traverse_enter can block low-priority xcalls.
 * => typical usage would be:
 *
 *	sum = 0;
 *	percpu_traverse_enter();
 *	for (CPU_INFO_FOREACH(cii, ci)) {
 *		unsigned int *p = percpu_getptr_remote(pc, ci);
 *		sum += *p;
 *	}
 *	percpu_traverse_exit();
 */

void
percpu_traverse_enter(void)
{

	ASSERT_SLEEPABLE();
	rw_enter(&percpu_swap_lock, RW_READER);
}

void
percpu_traverse_exit(void)
{

	rw_exit(&percpu_swap_lock);
}

void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{

	return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}

/*
 * percpu_foreach: call the specified callback function for each cpu.
 *
 * => called in thread context.
 * => caller should not rely on the cpu iteration order.
 * => the callback function should be minimal because it is executed while
 *    holding a global lock, which can block low-priority xcalls;
 *    e.g. it's illegal for a callback function to sleep for memory allocation.
 */
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	percpu_traverse_enter();
	for (CPU_INFO_FOREACH(cii, ci)) {
		(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
	}
	percpu_traverse_exit();
}
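
/*
 * Illustrative percpu_foreach sketch that sums a per-cpu uint64_t counter;
 * "percpu_sum_cb" and "sum" are hypothetical names, and "pc" is assumed to
 * have been allocated with percpu_alloc(sizeof(uint64_t)):
 *
 *	static void
 *	percpu_sum_cb(void *p, void *arg, struct cpu_info *ci)
 *	{
 *		const uint64_t *c = p;
 *		uint64_t *sum = arg;
 *
 *		*sum += *c;
 *	}
 *
 *	uint64_t sum = 0;
 *	percpu_foreach(pc, percpu_sum_cb, &sum);
 */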