/*-
 * Copyright (c) 2004, 2005,
 * 	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_mbuf.c 174247 2007-12-04 07:06:08Z alc $");

#include "opt_mac.h"
#include "opt_param.h"

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <security/mac/mac_framework.h>

#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
#include <vm/uma_int.h>
#include <vm/uma_dbg.h>

/*
 * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
 * Zones.
 *
 * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
 * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
 * administrator so desires.
 *
 * Mbufs are allocated from a UMA Master Zone called the Mbuf
 * Zone.
 *
 * Additionally, FreeBSD provides a Packet Zone, which it
 * configures as a Secondary Zone to the Mbuf Master Zone,
 * thus sharing backend Slab kegs with the Mbuf Master Zone.
 *
 * Thus common-case allocations and locking are simplified:
 *
 *  m_clget()                m_getcl()
 *    |                         |
 *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
 *    |   |             [     Packet   ]            |
 *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
 *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
 *        |                       \________         |
 *  [ Cluster Keg   ]                      \       /
 *        |                              [ Mbuf Keg   ]
 *  [ Cluster Slabs ]                         |
 *        |                              [ Mbuf Slabs ]
 *         \____________(VM)_________________/
 *
 *
 * Whenever an object is allocated with uma_zalloc() out of
 * one of the Zones its _ctor_ function is executed.  Likewise,
 * whenever an object is deallocated through uma_zfree() its
 * _dtor_ function is executed.
 *
 * Caches are per-CPU and are filled from the Master Zone.
 *
 * Whenever an object is allocated from the underlying global
 * memory pool it gets pre-initialized with the _zinit_ functions.
 * When the Kegs are overfull, objects get decommissioned with
 * _zfini_ functions and freed back to the global memory pool.
 */
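
/*
 * For illustration, the common-case allocators in mbuf.h are thin
 * inline wrappers around uma_zalloc_arg() on the zones above.  This is
 * a hedged sketch from memory, not a copy of mbuf.h; see that header
 * for the authoritative definitions:
 *
 *	struct mb_args args;
 *	struct mbuf *m;
 *
 *	args.flags = flags;
 *	args.type = type;
 *	m = uma_zalloc_arg(zone_mbuf, &args, how);    m_get(), m_gethdr()
 *	m = uma_zalloc_arg(zone_pack, &args, how);    m_getcl()
 *	(void)uma_zalloc_arg(zone_clust, m, how);     m_clget()
 *
 * The ctor for the chosen zone then runs on the returned object, with
 * 'arg' pointing at the mb_args (or at the mbuf, for cluster zones).
 */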

int nmbclusters;		/* limits number of mbuf clusters */
int nmbjumbop;			/* limits number of page size jumbo clusters */
int nmbjumbo9;			/* limits number of 9k jumbo clusters */
int nmbjumbo16;			/* limits number of 16k jumbo clusters */
struct mbstat mbstat;

static void
tunable_mbinit(void *dummy)
{

	/* This has to be done before VM init. */
	nmbclusters = 1024 + maxusers * 64;
	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
}
SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);

/* XXX: These should be tunables.  Can't change UMA limits on the fly. */
static int
sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
{
	int error, newnmbclusters;

	newnmbclusters = nmbclusters;
	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req);
	if (error == 0 && req->newptr) {
		if (newnmbclusters > nmbclusters) {
			nmbclusters = newnmbclusters;
			uma_zone_set_max(zone_clust, nmbclusters);
			EVENTHANDLER_INVOKE(nmbclusters_change);
		} else
			error = EINVAL;
	}
	return (error);
}
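
/*
 * Subsystems that derive state from nmbclusters can subscribe to the
 * nmbclusters_change event invoked above.  A hypothetical consumer
 * sketch (foo_nmbclusters_change is illustrative, not a real handler):
 *
 *	static void
 *	foo_nmbclusters_change(void *arg __unused)
 *	{
 *		... recompute anything sized from nmbclusters ...
 *	}
 *	EVENTHANDLER_REGISTER(nmbclusters_change, foo_nmbclusters_change,
 *	    NULL, EVENTHANDLER_PRI_ANY);
 */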
SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
    &nmbclusters, 0, sysctl_nmbclusters, "IU",
    "Maximum number of mbuf clusters allowed");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbop, CTLFLAG_RW, &nmbjumbop, 0,
    "Maximum number of mbuf page size jumbo clusters allowed");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo9, CTLFLAG_RW, &nmbjumbo9, 0,
    "Maximum number of mbuf 9k jumbo clusters allowed");
SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo16, CTLFLAG_RW, &nmbjumbo16, 0,
    "Maximum number of mbuf 16k jumbo clusters allowed");
SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
    "Mbuf general information and statistics");

/*
 * Zones from which we allocate.
 */
uma_zone_t	zone_mbuf;
uma_zone_t	zone_clust;
uma_zone_t	zone_pack;
uma_zone_t	zone_jumbop;
uma_zone_t	zone_jumbo9;
uma_zone_t	zone_jumbo16;
uma_zone_t	zone_ext_refcnt;

/*
 * Local prototypes.
 */
static int	mb_ctor_mbuf(void *, int, void *, int);
static int	mb_ctor_clust(void *, int, void *, int);
static int	mb_ctor_pack(void *, int, void *, int);
static void	mb_dtor_mbuf(void *, int, void *);
static void	mb_dtor_clust(void *, int, void *);
static void	mb_dtor_pack(void *, int, void *);
static int	mb_zinit_pack(void *, int, int);
static void	mb_zfini_pack(void *, int);

static void	mb_reclaim(void *);
static void	mbuf_init(void *);
static void    *mbuf_jumbo_alloc(uma_zone_t, int, u_int8_t *, int);
static void	mbuf_jumbo_free(void *, int, u_int8_t);

static MALLOC_DEFINE(M_JUMBOFRAME, "jumboframes", "mbuf jumbo frame buffers");

/* Ensure that MSIZE doesn't break dtom() - it must be a power of 2 */
CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
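
/*
 * Worked example of the power-of-2 check above: for MSIZE = 256,
 * (MSIZE - 1) ^ MSIZE = 255 ^ 256 = 511, and (511 + 1) >> 1 == 256,
 * so the assertion holds.  For a non-power-of-2 value such as 96,
 * 95 ^ 96 = 63, and (63 + 1) >> 1 == 32 != 96, so the build fails.
 */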

/*
 * Initialize FreeBSD Network buffer allocation.
 */
SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
static void
mbuf_init(void *dummy)
{

	/*
	 * Configure UMA zones for Mbufs, Clusters, and Packets.
	 */
	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE,
	    mb_ctor_mbuf, mb_dtor_mbuf,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    MSIZE - 1, UMA_ZONE_MAXBUCKET);

	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbclusters > 0)
		uma_zone_set_max(zone_clust, nmbclusters);

	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);

	/* Make the jumbo frame zones too: page size, 9k, and 16k. */
	zone_jumbop = uma_zcreate(MBUF_JUMBOP_MEM_NAME, MJUMPAGESIZE,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbjumbop > 0)
		uma_zone_set_max(zone_jumbop, nmbjumbop);

	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbjumbo9 > 0)
		uma_zone_set_max(zone_jumbo9, nmbjumbo9);
	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
	uma_zone_set_freef(zone_jumbo9, mbuf_jumbo_free);

	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
	    mb_ctor_clust, mb_dtor_clust,
#ifdef INVARIANTS
	    trash_init, trash_fini,
#else
	    NULL, NULL,
#endif
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
	if (nmbjumbo16 > 0)
		uma_zone_set_max(zone_jumbo16, nmbjumbo16);
	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
	uma_zone_set_freef(zone_jumbo16, mbuf_jumbo_free);

	zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
	    NULL, NULL,
	    NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);

	/* uma_prealloc() goes here... */

	/*
	 * Hook event handler for low-memory situation, used to
	 * drain protocols and push data back to the caches (UMA
	 * later pushes it back to VM).
	 */
	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
	    EVENTHANDLER_PRI_FIRST);

	/*
	 * [Re]set counters and local statistics knobs.
	 * XXX Some of these should go and be replaced, but UMA stat
	 * gathering needs to be revised.
	 */
	mbstat.m_mbufs = 0;
	mbstat.m_mclusts = 0;
	mbstat.m_drain = 0;
	mbstat.m_msize = MSIZE;
	mbstat.m_mclbytes = MCLBYTES;
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_numtypes = MT_NTYPES;

	mbstat.m_mcfail = mbstat.m_mpfail = 0;
	mbstat.sf_iocnt = 0;
	mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
}

/*
 * UMA backend page allocator for the jumbo frame zones.
 *
 * Allocates kernel virtual memory that is backed by contiguous physical
 * pages.
 */
static void *
mbuf_jumbo_alloc(uma_zone_t zone, int bytes, u_int8_t *flags, int wait)
{

	*flags = UMA_SLAB_PRIV;
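	/*
	 * For reference: the remaining contigmalloc(9) arguments permit
	 * any physical address range (0 through ~0), require no
	 * particular alignment (1), and place no boundary-crossing
	 * restriction on the allocation (0).
	 */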
	return (contigmalloc(bytes, M_JUMBOFRAME, wait, (vm_paddr_t)0,
	    ~(vm_paddr_t)0, 1, 0));
}

/*
 * UMA backend page deallocator for the jumbo frame zones.
 */
static void
mbuf_jumbo_free(void *mem, int size, u_int8_t flags)
{

	contigfree(mem, size, M_JUMBOFRAME);
}

/*
 * Constructor for Mbuf master zone.
 *
 * The 'arg' pointer points to a mb_args structure which
 * contains call-specific information required to support the
 * mbuf allocation API.  See mbuf.h.
 */
static int
mb_ctor_mbuf(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
#ifdef MAC
	int error;
#endif
	int flags;
	short type;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;

	/*
	 * The mbuf is initialized later.  The caller has the
	 * responsibility to set up any MAC labels too.
	 */
	if (type == MT_NOINIT)
		return (0);

	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_len = 0;
	m->m_flags = flags;
	m->m_type = type;
	if (flags & M_PKTHDR) {
		m->m_data = m->m_pktdat;
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.len = 0;
		m->m_pkthdr.header = NULL;
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.csum_data = 0;
		m->m_pkthdr.tso_segsz = 0;
		m->m_pkthdr.ether_vtag = 0;
		SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
		/* If the label init fails, fail the alloc */
		error = mac_mbuf_init(m, how);
		if (error)
			return (error);
#endif
	} else
		m->m_data = m->m_dat;
	return (0);
}

/*
 * The Mbuf master zone destructor.
 */
static void
mb_dtor_mbuf(void *mem, int size, void *arg)
{
	struct mbuf *m;
	unsigned long flags;

	m = (struct mbuf *)mem;
	flags = (unsigned long)arg;

	if ((flags & MB_NOTAGS) == 0 && (m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);
	KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
#ifdef INVARIANTS
	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Mbuf Packet zone destructor.
 */
static void
mb_dtor_pack(void *mem, int size, void *arg)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
	if ((m->m_flags & M_PKTHDR) != 0)
		m_tag_delete_chain(m, NULL);

	/* Make sure we've got a clean cluster back. */
	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
	KASSERT(m->m_ext.ext_buf != NULL, ("%s: ext_buf == NULL", __func__));
	KASSERT(m->m_ext.ext_free == NULL, ("%s: ext_free != NULL", __func__));
	KASSERT(m->m_ext.ext_args == NULL, ("%s: ext_args != NULL", __func__));
	KASSERT(m->m_ext.ext_size == MCLBYTES,
	    ("%s: ext_size != MCLBYTES", __func__));
	KASSERT(m->m_ext.ext_type == EXT_PACKET,
	    ("%s: ext_type != EXT_PACKET", __func__));
	KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
#ifdef INVARIANTS
	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
#endif
	/*
	 * If there are processes blocked on zone_clust, waiting for pages
	 * to be freed up, cause them to be woken up by draining the
	 * packet zone.  We are exposed to a race here (in the check for
	 * the UMA_ZFLAG_FULL) where we might miss the flag set, but that
	 * is deliberate.  We don't want to acquire the zone lock for every
	 * mbuf free.
	 */
	if (uma_zone_exhausted_nolock(zone_clust))
		zone_drain(zone_pack);
}

/*
 * The Cluster and Jumbo[PAGESIZE|9|16] zone constructor.
 *
 * Here the 'arg' pointer points to the Mbuf for which we are
 * configuring cluster storage.  If 'arg' is NULL we allocate
 * just the cluster, without attaching it to any mbuf.
 * See mbuf.h.
 */
static int
mb_ctor_clust(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	u_int *refcnt;
	int type;
	uma_zone_t zone;

#ifdef INVARIANTS
	trash_ctor(mem, size, arg, how);
#endif
	switch (size) {
	case MCLBYTES:
		type = EXT_CLUSTER;
		zone = zone_clust;
		break;
#if MJUMPAGESIZE != MCLBYTES
	case MJUMPAGESIZE:
		type = EXT_JUMBOP;
		zone = zone_jumbop;
		break;
#endif
	case MJUM9BYTES:
		type = EXT_JUMBO9;
		zone = zone_jumbo9;
		break;
	case MJUM16BYTES:
		type = EXT_JUMBO16;
		zone = zone_jumbo16;
		break;
	default:
		panic("unknown cluster size");
		break;
	}

	m = (struct mbuf *)arg;
	refcnt = uma_find_refcnt(zone, mem);
	*refcnt = 1;
	if (m != NULL) {
		m->m_ext.ext_buf = (caddr_t)mem;
		m->m_data = m->m_ext.ext_buf;
		m->m_flags |= M_EXT;
		m->m_ext.ext_free = NULL;
		m->m_ext.ext_args = NULL;
		m->m_ext.ext_size = size;
		m->m_ext.ext_type = type;
		m->m_ext.ref_cnt = refcnt;
	}

	return (0);
}
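
/*
 * For illustration, a driver typically reaches this constructor for a
 * jumbo size via the inline helpers in mbuf.h; a hedged sketch (see
 * mbuf.h for the real definitions):
 *
 *	m = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, MJUM9BYTES);
 *
 * allocates an mbuf and attaches a 9k cluster from zone_jumbo9,
 * arriving here with 'arg' pointing at the mbuf and size == MJUM9BYTES.
 */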

/*
 * The Mbuf Cluster zone destructor.
 */
static void
mb_dtor_clust(void *mem, int size, void *arg)
{
#ifdef INVARIANTS
	uma_zone_t zone;

	zone = m_getzone(size);
	KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
	    ("%s: refcnt incorrect %u", __func__,
	    *(uma_find_refcnt(zone, mem))));

	trash_dtor(mem, size, arg);
#endif
}

/*
 * The Packet secondary zone's init routine, executed on the
 * object's transition from mbuf keg slab to zone cache.
 */
static int
mb_zinit_pack(void *mem, int size, int how)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;		/* m is virgin. */
	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
	    m->m_ext.ext_buf == NULL)
		return (ENOMEM);
	m->m_ext.ext_type = EXT_PACKET;	/* Override. */
#ifdef INVARIANTS
	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
#endif
	return (0);
}

/*
 * The Packet secondary zone's fini routine, executed on the
 * object's transition from zone cache to keg slab.
 */
static void
mb_zfini_pack(void *mem, int size)
{
	struct mbuf *m;

	m = (struct mbuf *)mem;
#ifdef INVARIANTS
	trash_fini(m->m_ext.ext_buf, MCLBYTES);
#endif
	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
#ifdef INVARIANTS
	trash_dtor(mem, size, NULL);
#endif
}

/*
 * The "packet" keg constructor.
 */
static int
mb_ctor_pack(void *mem, int size, void *arg, int how)
{
	struct mbuf *m;
	struct mb_args *args;
#ifdef MAC
	int error;
#endif
	int flags;
	short type;

	m = (struct mbuf *)mem;
	args = (struct mb_args *)arg;
	flags = args->flags;
	type = args->type;

#ifdef INVARIANTS
	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
#endif
	m->m_next = NULL;
	m->m_nextpkt = NULL;
	m->m_data = m->m_ext.ext_buf;
	m->m_len = 0;
	m->m_flags = (flags | M_EXT);
	m->m_type = type;

	if (flags & M_PKTHDR) {
		m->m_pkthdr.rcvif = NULL;
		m->m_pkthdr.len = 0;
		m->m_pkthdr.header = NULL;
		m->m_pkthdr.csum_flags = 0;
		m->m_pkthdr.csum_data = 0;
		m->m_pkthdr.tso_segsz = 0;
		m->m_pkthdr.ether_vtag = 0;
		SLIST_INIT(&m->m_pkthdr.tags);
#ifdef MAC
		/* If the label init fails, fail the alloc */
		error = mac_mbuf_init(m, how);
		if (error)
			return (error);
#endif
	}
	/* m_ext is already initialized. */

	return (0);
}

/*
 * This is the protocol drain routine.
 *
 * No locks should be held when this is called.  The drain routines
 * presently have to acquire some locks, which raises the possibility
 * of lock-order reversal.
 */
static void
mb_reclaim(void *junk)
{
	struct domain *dp;
	struct protosw *pr;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
	    "mb_reclaim()");

	for (dp = domains; dp != NULL; dp = dp->dom_next)
		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
			if (pr->pr_drain != NULL)
				(*pr->pr_drain)();
}