kern_mbuf.c revision 148095
1129906Sbmilekic/*-
2141991Sbmilekic * Copyright (c) 2004, 2005,
3141991Sbmilekic * 	Bosko Milekic <bmilekic@FreeBSD.org>.  All rights reserved.
4129906Sbmilekic *
5129906Sbmilekic * Redistribution and use in source and binary forms, with or without
6129906Sbmilekic * modification, are permitted provided that the following conditions
7129906Sbmilekic * are met:
8129906Sbmilekic * 1. Redistributions of source code must retain the above copyright
9129906Sbmilekic *    notice unmodified, this list of conditions and the following
10129906Sbmilekic *    disclaimer.
11129906Sbmilekic * 2. Redistributions in binary form must reproduce the above copyright
12129906Sbmilekic *    notice, this list of conditions and the following disclaimer in the
13129906Sbmilekic *    documentation and/or other materials provided with the distribution.
14129906Sbmilekic *
15129906Sbmilekic * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16129906Sbmilekic * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17129906Sbmilekic * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18129906Sbmilekic * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19129906Sbmilekic * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20129906Sbmilekic * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21129906Sbmilekic * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22129906Sbmilekic * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23129906Sbmilekic * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24129906Sbmilekic * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25129906Sbmilekic * SUCH DAMAGE.
26129906Sbmilekic */
27129906Sbmilekic
28129906Sbmilekic#include <sys/cdefs.h>
29129906Sbmilekic__FBSDID("$FreeBSD: head/sys/kern/kern_mbuf.c 148095 2005-07-17 14:04:03Z rwatson $");
30129906Sbmilekic
31129906Sbmilekic#include "opt_mac.h"
32129906Sbmilekic#include "opt_param.h"
33129906Sbmilekic
34129906Sbmilekic#include <sys/param.h>
35129906Sbmilekic#include <sys/mac.h>
36129906Sbmilekic#include <sys/malloc.h>
37129906Sbmilekic#include <sys/systm.h>
38129906Sbmilekic#include <sys/mbuf.h>
39129906Sbmilekic#include <sys/domain.h>
40129906Sbmilekic#include <sys/eventhandler.h>
41129906Sbmilekic#include <sys/kernel.h>
42129906Sbmilekic#include <sys/protosw.h>
43129906Sbmilekic#include <sys/smp.h>
44129906Sbmilekic#include <sys/sysctl.h>
45129906Sbmilekic
46129906Sbmilekic#include <vm/vm.h>
47129906Sbmilekic#include <vm/vm_page.h>
48129906Sbmilekic#include <vm/uma.h>
49147537Ssilby#include <vm/uma_int.h>
50147537Ssilby#include <vm/uma_dbg.h>
51129906Sbmilekic
52129906Sbmilekic/*
53129906Sbmilekic * In FreeBSD, Mbufs and Mbuf Clusters are allocated from UMA
54129906Sbmilekic * Zones.
55129906Sbmilekic *
56129906Sbmilekic * Mbuf Clusters (2K, contiguous) are allocated from the Cluster
57129906Sbmilekic * Zone.  The Zone can be capped at kern.ipc.nmbclusters, if the
58129906Sbmilekic * administrator so desires.
59129906Sbmilekic *
60129906Sbmilekic * Mbufs are allocated from a UMA Master Zone called the Mbuf
61129906Sbmilekic * Zone.
62129906Sbmilekic *
63129906Sbmilekic * Additionally, FreeBSD provides a Packet Zone, which it
64129906Sbmilekic * configures as a Secondary Zone to the Mbuf Master Zone,
65129906Sbmilekic * thus sharing backend Slab kegs with the Mbuf Master Zone.
66129906Sbmilekic *
67129906Sbmilekic * Thus common-case allocations and locking are simplified:
68129906Sbmilekic *
69129906Sbmilekic *  m_clget()                m_getcl()
70129906Sbmilekic *    |                         |
71129906Sbmilekic *    |   .------------>[(Packet Cache)]    m_get(), m_gethdr()
72129906Sbmilekic *    |   |             [     Packet   ]            |
73129906Sbmilekic *  [(Cluster Cache)]   [    Secondary ]   [ (Mbuf Cache)     ]
74129906Sbmilekic *  [ Cluster Zone  ]   [     Zone     ]   [ Mbuf Master Zone ]
75129906Sbmilekic *        |                       \________         |
76129906Sbmilekic *  [ Cluster Keg   ]                      \       /
77129906Sbmilekic *        |    	                         [ Mbuf Keg   ]
78129906Sbmilekic *  [ Cluster Slabs ]                         |
79129906Sbmilekic *        |                              [ Mbuf Slabs ]
80129906Sbmilekic *         \____________(VM)_________________/
81129906Sbmilekic */
82129906Sbmilekic
83129906Sbmilekicint nmbclusters;
84129906Sbmilekicstruct mbstat mbstat;
85129906Sbmilekic
86129906Sbmilekicstatic void
87129906Sbmilekictunable_mbinit(void *dummy)
88129906Sbmilekic{
89129906Sbmilekic
90129906Sbmilekic	/* This has to be done before VM init. */
91129906Sbmilekic	nmbclusters = 1024 + maxusers * 64;
92129906Sbmilekic	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
93129906Sbmilekic}
94129906SbmilekicSYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
95129906Sbmilekic
96129906SbmilekicSYSCTL_DECL(_kern_ipc);
97129906SbmilekicSYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
98129906Sbmilekic    "Maximum number of mbuf clusters allowed");
99129906SbmilekicSYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
100129906Sbmilekic    "Mbuf general information and statistics");
101129906Sbmilekic
102129906Sbmilekic/*
103129906Sbmilekic * Zones from which we allocate.
104129906Sbmilekic */
105129906Sbmilekicuma_zone_t	zone_mbuf;
106129906Sbmilekicuma_zone_t	zone_clust;
107129906Sbmilekicuma_zone_t	zone_pack;
108129906Sbmilekic
/*
 * Local prototypes: UMA constructor/destructor/init/fini callbacks for
 * the three zones, followed by the low-memory and boot-time hooks.
 */
static int	mb_ctor_mbuf(void *mem, int size, void *arg, int how);
static int	mb_ctor_clust(void *mem, int size, void *arg, int how);
static int	mb_ctor_pack(void *mem, int size, void *arg, int how);
static void	mb_dtor_mbuf(void *mem, int size, void *arg);
static void	mb_dtor_clust(void *mem, int size, void *arg);	/* XXX */
static void	mb_dtor_pack(void *mem, int size, void *arg);	/* XXX */
static int	mb_init_pack(void *mem, int size, int how);
static void	mb_fini_pack(void *mem, int size);

static void	mb_reclaim(void *junk);
static void	mbuf_init(void *dummy);
123129906Sbmilekic
124135510Sbrian/* Ensure that MSIZE doesn't break dtom() - it must be a power of 2 */
125135510SbrianCTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
126135510Sbrian
127129906Sbmilekic/*
128129906Sbmilekic * Initialize FreeBSD Network buffer allocation.
129129906Sbmilekic */
130129906SbmilekicSYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbuf_init, NULL)
131129906Sbmilekicstatic void
132129906Sbmilekicmbuf_init(void *dummy)
133129906Sbmilekic{
134129906Sbmilekic
135129906Sbmilekic	/*
136129906Sbmilekic	 * Configure UMA zones for Mbufs, Clusters, and Packets.
137129906Sbmilekic	 */
138148095Srwatson	zone_mbuf = uma_zcreate(MBUF_MEM_NAME, MSIZE, mb_ctor_mbuf,
139148095Srwatson	    mb_dtor_mbuf,
140147537Ssilby#ifdef INVARIANTS
141147537Ssilby	    trash_init, trash_fini, MSIZE - 1, UMA_ZONE_MAXBUCKET);
142147537Ssilby#else
143135510Sbrian	    NULL, NULL, MSIZE - 1, UMA_ZONE_MAXBUCKET);
144147537Ssilby#endif
145148095Srwatson	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
146148095Srwatson	    mb_ctor_clust,
147147537Ssilby#ifdef INVARIANTS
148147537Ssilby	    mb_dtor_clust, trash_init, trash_fini, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
149147537Ssilby#else
150129906Sbmilekic	    mb_dtor_clust, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
151147537Ssilby#endif
152129906Sbmilekic	if (nmbclusters > 0)
153129906Sbmilekic		uma_zone_set_max(zone_clust, nmbclusters);
154148095Srwatson	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
155148095Srwatson	    mb_dtor_pack, mb_init_pack, mb_fini_pack, zone_mbuf);
156129906Sbmilekic
157129906Sbmilekic	/* uma_prealloc() goes here */
158129906Sbmilekic
159129906Sbmilekic	/*
160129906Sbmilekic	 * Hook event handler for low-memory situation, used to
161129906Sbmilekic	 * drain protocols and push data back to the caches (UMA
162129906Sbmilekic	 * later pushes it back to VM).
163129906Sbmilekic	 */
164129906Sbmilekic	EVENTHANDLER_REGISTER(vm_lowmem, mb_reclaim, NULL,
165129906Sbmilekic	    EVENTHANDLER_PRI_FIRST);
166129906Sbmilekic
167129906Sbmilekic	/*
168129906Sbmilekic	 * [Re]set counters and local statistics knobs.
169129906Sbmilekic	 * XXX Some of these should go and be replaced, but UMA stat
170129906Sbmilekic	 * gathering needs to be revised.
171129906Sbmilekic	 */
172129906Sbmilekic	mbstat.m_mbufs = 0;
173129906Sbmilekic	mbstat.m_mclusts = 0;
174129906Sbmilekic	mbstat.m_drain = 0;
175129906Sbmilekic	mbstat.m_msize = MSIZE;
176129906Sbmilekic	mbstat.m_mclbytes = MCLBYTES;
177129906Sbmilekic	mbstat.m_minclsize = MINCLSIZE;
178129906Sbmilekic	mbstat.m_mlen = MLEN;
179129906Sbmilekic	mbstat.m_mhlen = MHLEN;
180129906Sbmilekic	mbstat.m_numtypes = MT_NTYPES;
181129906Sbmilekic
182129906Sbmilekic	mbstat.m_mcfail = mbstat.m_mpfail = 0;
183129906Sbmilekic	mbstat.sf_iocnt = 0;
184129906Sbmilekic	mbstat.sf_allocwait = mbstat.sf_allocfail = 0;
185129906Sbmilekic}
186129906Sbmilekic
187129906Sbmilekic/*
188129906Sbmilekic * Constructor for Mbuf master zone.
189129906Sbmilekic *
190129906Sbmilekic * The 'arg' pointer points to a mb_args structure which
191129906Sbmilekic * contains call-specific information required to support the
192129906Sbmilekic * mbuf allocation API.
193129906Sbmilekic */
194132987Sgreenstatic int
195132987Sgreenmb_ctor_mbuf(void *mem, int size, void *arg, int how)
196129906Sbmilekic{
197129906Sbmilekic	struct mbuf *m;
198129906Sbmilekic	struct mb_args *args;
199132987Sgreen#ifdef MAC
200132987Sgreen	int error;
201132987Sgreen#endif
202129906Sbmilekic	int flags;
203129906Sbmilekic	short type;
204129906Sbmilekic
205147537Ssilby#ifdef INVARIANTS
206147537Ssilby	trash_ctor(mem, size, arg, how);
207147537Ssilby#endif
208129906Sbmilekic	m = (struct mbuf *)mem;
209129906Sbmilekic	args = (struct mb_args *)arg;
210129906Sbmilekic	flags = args->flags;
211129906Sbmilekic	type = args->type;
212129906Sbmilekic
213129906Sbmilekic	m->m_type = type;
214129906Sbmilekic	m->m_next = NULL;
215129906Sbmilekic	m->m_nextpkt = NULL;
216129947Sbmilekic	m->m_flags = flags;
217129906Sbmilekic	if (flags & M_PKTHDR) {
218129906Sbmilekic		m->m_data = m->m_pktdat;
219129906Sbmilekic		m->m_pkthdr.rcvif = NULL;
220129906Sbmilekic		m->m_pkthdr.csum_flags = 0;
221129906Sbmilekic		SLIST_INIT(&m->m_pkthdr.tags);
222129906Sbmilekic#ifdef MAC
223129906Sbmilekic		/* If the label init fails, fail the alloc */
224132987Sgreen		error = mac_init_mbuf(m, how);
225132987Sgreen		if (error)
226132987Sgreen			return (error);
227129906Sbmilekic#endif
228129947Sbmilekic	} else
229129906Sbmilekic		m->m_data = m->m_dat;
230129906Sbmilekic	mbstat.m_mbufs += 1;	/* XXX */
231132987Sgreen	return (0);
232129906Sbmilekic}
233129906Sbmilekic
234129906Sbmilekic/*
235129906Sbmilekic * The Mbuf master zone and Packet secondary zone destructor.
236129906Sbmilekic */
237129906Sbmilekicstatic void
238129906Sbmilekicmb_dtor_mbuf(void *mem, int size, void *arg)
239129906Sbmilekic{
240129906Sbmilekic	struct mbuf *m;
241129906Sbmilekic
242129906Sbmilekic	m = (struct mbuf *)mem;
243129906Sbmilekic	if ((m->m_flags & M_PKTHDR) != 0)
244129906Sbmilekic		m_tag_delete_chain(m, NULL);
245147537Ssilby#ifdef INVARIANTS
246147537Ssilby	trash_dtor(mem, size, arg);
247147537Ssilby#endif
248129906Sbmilekic	mbstat.m_mbufs -= 1;	/* XXX */
249129906Sbmilekic}
250129906Sbmilekic
251129906Sbmilekic/* XXX Only because of stats */
252129906Sbmilekicstatic void
253129906Sbmilekicmb_dtor_pack(void *mem, int size, void *arg)
254129906Sbmilekic{
255129906Sbmilekic	struct mbuf *m;
256129906Sbmilekic
257129906Sbmilekic	m = (struct mbuf *)mem;
258129906Sbmilekic	if ((m->m_flags & M_PKTHDR) != 0)
259129906Sbmilekic		m_tag_delete_chain(m, NULL);
260147537Ssilby#ifdef INVARIANTS
261147537Ssilby	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
262147537Ssilby#endif
263129906Sbmilekic	mbstat.m_mbufs -= 1;	/* XXX */
264129906Sbmilekic	mbstat.m_mclusts -= 1;	/* XXX */
265129906Sbmilekic}
266129906Sbmilekic
267129906Sbmilekic/*
268129906Sbmilekic * The Cluster zone constructor.
269129906Sbmilekic *
270129906Sbmilekic * Here the 'arg' pointer points to the Mbuf which we
271129906Sbmilekic * are configuring cluster storage for.
272129906Sbmilekic */
273132987Sgreenstatic int
274132987Sgreenmb_ctor_clust(void *mem, int size, void *arg, int how)
275129906Sbmilekic{
276129906Sbmilekic	struct mbuf *m;
277129906Sbmilekic
278147537Ssilby#ifdef INVARIANTS
279147537Ssilby	trash_ctor(mem, size, arg, how);
280147537Ssilby#endif
281129906Sbmilekic	m = (struct mbuf *)arg;
282129906Sbmilekic	m->m_ext.ext_buf = (caddr_t)mem;
283129906Sbmilekic	m->m_data = m->m_ext.ext_buf;
284129906Sbmilekic	m->m_flags |= M_EXT;
285129906Sbmilekic	m->m_ext.ext_free = NULL;
286129906Sbmilekic	m->m_ext.ext_args = NULL;
287129906Sbmilekic	m->m_ext.ext_size = MCLBYTES;
288129906Sbmilekic	m->m_ext.ext_type = EXT_CLUSTER;
289141668Sbmilekic	m->m_ext.ref_cnt = NULL;	/* Lazy counter assign. */
290129906Sbmilekic	mbstat.m_mclusts += 1;	/* XXX */
291132987Sgreen	return (0);
292129906Sbmilekic}
293129906Sbmilekic
294129906Sbmilekic/* XXX */
295129906Sbmilekicstatic void
296129906Sbmilekicmb_dtor_clust(void *mem, int size, void *arg)
297129906Sbmilekic{
298147537Ssilby#ifdef INVARIANTS
299147537Ssilby	trash_dtor(mem, size, arg);
300147537Ssilby#endif
301129906Sbmilekic	mbstat.m_mclusts -= 1;	/* XXX */
302129906Sbmilekic}
303129906Sbmilekic
304129906Sbmilekic/*
305129906Sbmilekic * The Packet secondary zone's init routine, executed on the
306129906Sbmilekic * object's transition from keg slab to zone cache.
307129906Sbmilekic */
308132987Sgreenstatic int
309132987Sgreenmb_init_pack(void *mem, int size, int how)
310129906Sbmilekic{
311129906Sbmilekic	struct mbuf *m;
312129906Sbmilekic
313129906Sbmilekic	m = (struct mbuf *)mem;
314129906Sbmilekic	m->m_ext.ext_buf = NULL;
315132987Sgreen	uma_zalloc_arg(zone_clust, m, how);
316132987Sgreen	if (m->m_ext.ext_buf == NULL)
317132987Sgreen		return (ENOMEM);
318147537Ssilby#ifdef INVARIANTS
319147537Ssilby	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
320147537Ssilby#endif
321129906Sbmilekic	mbstat.m_mclusts -= 1;	/* XXX */
322132987Sgreen	return (0);
323129906Sbmilekic}
324129906Sbmilekic
325129906Sbmilekic/*
326129906Sbmilekic * The Packet secondary zone's fini routine, executed on the
327129906Sbmilekic * object's transition from zone cache to keg slab.
328129906Sbmilekic */
329129906Sbmilekicstatic void
330129906Sbmilekicmb_fini_pack(void *mem, int size)
331129906Sbmilekic{
332129906Sbmilekic	struct mbuf *m;
333129906Sbmilekic
334129906Sbmilekic	m = (struct mbuf *)mem;
335147537Ssilby#ifdef INVARIANTS
336147537Ssilby	trash_fini(m->m_ext.ext_buf, MCLBYTES);
337147537Ssilby#endif
338129906Sbmilekic	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
339129906Sbmilekic	m->m_ext.ext_buf = NULL;
340129906Sbmilekic	mbstat.m_mclusts += 1;	/* XXX */
341147652Ssilby#ifdef INVARIANTS
342147652Ssilby	trash_dtor(mem, size, NULL);
343147652Ssilby#endif
344129906Sbmilekic}
345129906Sbmilekic
346129906Sbmilekic/*
347129906Sbmilekic * The "packet" keg constructor.
348129906Sbmilekic */
349132987Sgreenstatic int
350132987Sgreenmb_ctor_pack(void *mem, int size, void *arg, int how)
351129906Sbmilekic{
352129906Sbmilekic	struct mbuf *m;
353129906Sbmilekic	struct mb_args *args;
354132987Sgreen#ifdef MAC
355132987Sgreen	int error;
356132987Sgreen#endif
357132987Sgreen	int flags;
358129906Sbmilekic	short type;
359129906Sbmilekic
360129906Sbmilekic	m = (struct mbuf *)mem;
361129906Sbmilekic	args = (struct mb_args *)arg;
362129906Sbmilekic	flags = args->flags;
363129906Sbmilekic	type = args->type;
364129906Sbmilekic
365147537Ssilby#ifdef INVARIANTS
366147537Ssilby	trash_ctor(m->m_ext.ext_buf, MCLBYTES, arg, how);
367147537Ssilby#endif
368129906Sbmilekic	m->m_type = type;
369129906Sbmilekic	m->m_next = NULL;
370129947Sbmilekic	m->m_nextpkt = NULL;
371129906Sbmilekic	m->m_data = m->m_ext.ext_buf;
372129906Sbmilekic	m->m_flags = flags|M_EXT;
373129906Sbmilekic	m->m_ext.ext_free = NULL;
374129906Sbmilekic	m->m_ext.ext_args = NULL;
375129906Sbmilekic	m->m_ext.ext_size = MCLBYTES;
376129906Sbmilekic	m->m_ext.ext_type = EXT_PACKET;
377141668Sbmilekic	m->m_ext.ref_cnt = NULL;	/* Lazy counter assign. */
378129906Sbmilekic
379129906Sbmilekic	if (flags & M_PKTHDR) {
380129906Sbmilekic		m->m_pkthdr.rcvif = NULL;
381129906Sbmilekic		m->m_pkthdr.csum_flags = 0;
382129906Sbmilekic		SLIST_INIT(&m->m_pkthdr.tags);
383129906Sbmilekic#ifdef MAC
384129906Sbmilekic		/* If the label init fails, fail the alloc */
385132987Sgreen		error = mac_init_mbuf(m, how);
386132987Sgreen		if (error)
387132987Sgreen			return (error);
388129906Sbmilekic#endif
389129906Sbmilekic	}
390129906Sbmilekic	mbstat.m_mbufs += 1;	/* XXX */
391129906Sbmilekic	mbstat.m_mclusts += 1;	/* XXX */
392132987Sgreen	return (0);
393129906Sbmilekic}
394129906Sbmilekic
395129906Sbmilekic/*
396129906Sbmilekic * This is the protocol drain routine.
397129906Sbmilekic *
398129906Sbmilekic * No locks should be held when this is called.  The drain routines have to
399129906Sbmilekic * presently acquire some locks which raises the possibility of lock order
400129906Sbmilekic * reversal.
401129906Sbmilekic */
402129906Sbmilekicstatic void
403129906Sbmilekicmb_reclaim(void *junk)
404129906Sbmilekic{
405129906Sbmilekic	struct domain *dp;
406129906Sbmilekic	struct protosw *pr;
407129906Sbmilekic
408129906Sbmilekic	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
409129906Sbmilekic	    "mb_reclaim()");
410129906Sbmilekic
411129906Sbmilekic	mbstat.m_drain++;
412129906Sbmilekic	for (dp = domains; dp != NULL; dp = dp->dom_next)
413129906Sbmilekic		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
414129906Sbmilekic			if (pr->pr_drain != NULL)
415129906Sbmilekic				(*pr->pr_drain)();
416129906Sbmilekic}
417