tcp_pcap.c revision 295482
1289276Shiren/*-
2289276Shiren * Copyright (c) 2015
3289276Shiren *	Jonathan Looney. All rights reserved.
4289276Shiren *
5289276Shiren * Redistribution and use in source and binary forms, with or without
6289276Shiren * modification, are permitted provided that the following conditions
7289276Shiren * are met:
8289276Shiren * 1. Redistributions of source code must retain the above copyright
9289276Shiren *    notice, this list of conditions and the following disclaimer.
10289276Shiren * 2. Redistributions in binary form must reproduce the above copyright
11289276Shiren *    notice, this list of conditions and the following disclaimer in the
12289276Shiren *    documentation and/or other materials provided with the distribution.
13289276Shiren *
14289276Shiren * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15289276Shiren * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16289276Shiren * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17289276Shiren * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18289276Shiren * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19289276Shiren * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20289276Shiren * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21289276Shiren * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22289276Shiren * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23289276Shiren * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24289276Shiren * SUCH DAMAGE.
25289276Shiren *
26289276Shiren * $FreeBSD: head/sys/netinet/tcp_pcap.c 295482 2016-02-10 18:54:18Z glebius $
27289276Shiren */
28289276Shiren
29289276Shiren#include <sys/queue.h>
30289276Shiren#include <sys/param.h>
31289276Shiren#include <sys/types.h>
32289276Shiren#include <sys/socket.h>
33289276Shiren#include <sys/socketvar.h>
34289276Shiren#include <sys/sysctl.h>
35289276Shiren#include <sys/systm.h>
36289276Shiren#include <sys/mbuf.h>
37289276Shiren#include <sys/eventhandler.h>
38289276Shiren#include <machine/atomic.h>
39289276Shiren#include <netinet/tcp_var.h>
40289276Shiren#include <netinet/tcp_pcap.h>
41289276Shiren
/*
 * Byte offset of an mbuf's current data pointer from the start of its
 * data area (M_START).
 */
#define	M_LEADINGSPACE_NOWRITE(m)	((m)->m_data - M_START(m))
44289276Shiren
45289276Shirenstatic int tcp_pcap_clusters_referenced_cur = 0;
46289276Shirenstatic int tcp_pcap_clusters_referenced_max = 0;
47289276Shiren
48289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
49289276Shiren	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
50289276Shiren	"Number of clusters currently referenced on TCP PCAP queues");
51289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
52289276Shiren	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
53289276Shiren	"Maximum number of clusters allowed to be referenced on TCP PCAP "
54289276Shiren	"queues");
55289276Shiren
56289276Shirenstatic int tcp_pcap_alloc_reuse_ext = 0;
57289276Shirenstatic int tcp_pcap_alloc_reuse_mbuf = 0;
58289276Shirenstatic int tcp_pcap_alloc_new_mbuf = 0;
59289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
60289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
61289276Shiren	"Number of mbufs with external storage reused for the TCP PCAP "
62289276Shiren	"functionality");
63289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
64289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
65289276Shiren	"Number of mbufs with internal storage reused for the TCP PCAP "
66289276Shiren	"functionality");
67289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
68289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
69289276Shiren	"Number of new mbufs allocated for the TCP PCAP functionality");
70289276Shiren
71289276ShirenVNET_DEFINE(int, tcp_pcap_packets) = 0;
72289276Shiren#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
73289350SbzSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
74289350Sbz	CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
75289350Sbz	"Default number of packets saved per direction per TCPCB");
76289276Shiren
77289276Shiren/* Initialize the values. */
78289276Shirenstatic void
79289326Sbztcp_pcap_max_set(void)
80289326Sbz{
81289326Sbz
82289276Shiren	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
83289276Shiren}
84289276Shiren
85289276Shirenvoid
86289326Sbztcp_pcap_init(void)
87289326Sbz{
88289326Sbz
89289276Shiren	tcp_pcap_max_set();
90289276Shiren	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
91289276Shiren		NULL, EVENTHANDLER_PRI_ANY);
92289276Shiren}
93289276Shiren
94289276Shiren/*
95289276Shiren * If we are below the maximum allowed cluster references,
96289276Shiren * increment the reference count and return TRUE. Otherwise,
97289276Shiren * leave the reference count alone and return FALSE.
98289276Shiren */
99289276Shirenstatic __inline bool
100289276Shirentcp_pcap_take_cluster_reference(void)
101289276Shiren{
102289276Shiren	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
103289276Shiren		tcp_pcap_clusters_referenced_max) {
104289276Shiren		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
105289276Shiren		return FALSE;
106289276Shiren	}
107289276Shiren	return TRUE;
108289276Shiren}
109289276Shiren
110289276Shiren/*
111289276Shiren * For all the external entries in m, apply the given adjustment.
112289276Shiren * This can be used to adjust the counter when an mbuf chain is
113289276Shiren * copied or freed.
114289276Shiren */
115289276Shirenstatic __inline void
116289276Shirentcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
117289276Shiren{
118289276Shiren	while (m) {
119289276Shiren		if (m->m_flags & M_EXT)
120289276Shiren			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);
121289276Shiren
122289276Shiren		m = m->m_next;
123289276Shiren	}
124289276Shiren}
125289276Shiren
126289276Shiren/*
127289276Shiren * Free all mbufs in a chain, decrementing the reference count as
128289276Shiren * necessary.
129289276Shiren *
130289276Shiren * Functions in this file should use this instead of m_freem() when
131289276Shiren * they are freeing mbuf chains that may contain clusters that were
132289276Shiren * already included in tcp_pcap_clusters_referenced_cur.
133289276Shiren */
134289276Shirenstatic void
135289276Shirentcp_pcap_m_freem(struct mbuf *mb)
136289276Shiren{
137289276Shiren	while (mb != NULL) {
138289276Shiren		if (mb->m_flags & M_EXT)
139289276Shiren			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
140289276Shiren			    1);
141289276Shiren		mb = m_free(mb);
142289276Shiren	}
143289276Shiren}
144289276Shiren
145289276Shiren/*
146289276Shiren * Copy data from m to n, where n cannot fit all the data we might
147289276Shiren * want from m.
148289276Shiren *
149289276Shiren * Prioritize data like this:
150289276Shiren * 1. TCP header
151289276Shiren * 2. IP header
152289276Shiren * 3. Data
153289276Shiren */
154289276Shirenstatic void
155289276Shirentcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
156289276Shiren{
157289276Shiren	struct mbuf *m_cur = m;
158289276Shiren	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;
159289276Shiren
160289276Shiren	/* Below, we assume these will be non-NULL. */
161289276Shiren	KASSERT(th, ("%s: called with th == NULL", __func__));
162289276Shiren	KASSERT(m, ("%s: called with m == NULL", __func__));
163289276Shiren	KASSERT(n, ("%s: called with n == NULL", __func__));
164289276Shiren
165289276Shiren	/* We assume this initialization occurred elsewhere. */
166289276Shiren	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
167289276Shiren		__func__, n->m_len));
168289276Shiren	KASSERT(n->m_data == M_START(n),
169289276Shiren		("%s: called with n->m_data != M_START(n)", __func__));
170289276Shiren
171289276Shiren	/*
172289276Shiren	 * Calculate the size of the TCP header. We use this often
173289276Shiren	 * enough that it is worth just calculating at the start.
174289276Shiren	 */
175289276Shiren	tcp_off = th->th_off << 2;
176289276Shiren
177289276Shiren	/* Trim off leading empty mbufs. */
178289276Shiren	while (m && m->m_len == 0)
179289276Shiren		m = m->m_next;
180289276Shiren
181289276Shiren	if (m) {
182289276Shiren		m_cur = m;
183289276Shiren	}
184289276Shiren	else {
185289276Shiren		/*
186289276Shiren		 * No data? Highly unusual. We would expect to at
187289276Shiren		 * least see a TCP header in the mbuf.
188289276Shiren		 * As we have a pointer to the TCP header, I guess
189289276Shiren		 * we should just copy that. (???)
190289276Shiren		 */
191289276Shirenfallback:
192289276Shiren		bytes_to_copy = tcp_off;
193289276Shiren		if (bytes_to_copy > M_SIZE(n))
194289276Shiren			bytes_to_copy = M_SIZE(n);
195289276Shiren		bcopy(th, n->m_data, bytes_to_copy);
196289276Shiren		n->m_len = bytes_to_copy;
197289276Shiren		return;
198289276Shiren	}
199289276Shiren
200289276Shiren	/*
201289276Shiren	 * Find TCP header. Record the total number of bytes up to,
202289276Shiren	 * and including, the TCP header.
203289276Shiren	 */
204289276Shiren	while (m_cur) {
205289276Shiren		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
206289276Shiren			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
207289276Shiren			break;
208289276Shiren		bytes_to_copy += m_cur->m_len;
209289276Shiren		m_cur = m_cur->m_next;
210289276Shiren	}
211289276Shiren	if (m_cur)
212289276Shiren		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
213289276Shiren	else
214289276Shiren		goto fallback;
215289276Shiren	bytes_to_copy += tcp_off;
216289276Shiren
217289276Shiren	/*
218289276Shiren	 * If we already want to copy more bytes than we can hold
219289276Shiren	 * in the destination mbuf, skip leading bytes and copy
220289276Shiren	 * what we can.
221289276Shiren	 *
222289276Shiren	 * Otherwise, consider trailing data.
223289276Shiren	 */
224289276Shiren	if (bytes_to_copy > M_SIZE(n)) {
225289276Shiren		skip  = bytes_to_copy - M_SIZE(n);
226289276Shiren		bytes_to_copy = M_SIZE(n);
227289276Shiren	}
228289276Shiren	else {
229289276Shiren		/*
230289276Shiren		 * Determine how much trailing data is in the chain.
231289276Shiren		 * We start with the length of this mbuf (the one
232289276Shiren		 * containing th) and subtract the size of the TCP
233289276Shiren		 * header (tcp_off) and the size of the data prior
234289276Shiren		 * to th (th - m_cur->m_data).
235289276Shiren		 *
236289276Shiren		 * This *should not* be negative, as the TCP code
237289276Shiren		 * should put the whole TCP header in a single
238289276Shiren		 * mbuf. But, it isn't a problem if it is. We will
239289276Shiren		 * simple work off our negative balance as we look
240289276Shiren		 * at subsequent mbufs.
241289276Shiren		 */
242289276Shiren		trailing_data = m_cur->m_len - tcp_off;
243289276Shiren		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
244289276Shiren		m_cur = m_cur->m_next;
245289276Shiren		while (m_cur) {
246289276Shiren			trailing_data += m_cur->m_len;
247289276Shiren			m_cur = m_cur->m_next;
248289276Shiren		}
249289276Shiren		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
250289276Shiren			bytes_to_copy = M_SIZE(n);
251289276Shiren		else
252289276Shiren			bytes_to_copy += trailing_data;
253289276Shiren	}
254289276Shiren
255289276Shiren	m_copydata(m, skip, bytes_to_copy, n->m_data);
256289276Shiren	n->m_len = bytes_to_copy;
257289276Shiren}
258289276Shiren
259289276Shirenvoid
260289276Shirentcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
261289276Shiren{
262289276Shiren	struct mbuf *n = NULL, *mhead;
263289276Shiren
264289276Shiren	KASSERT(th, ("%s: called with th == NULL", __func__));
265289276Shiren	KASSERT(m, ("%s: called with m == NULL", __func__));
266289276Shiren	KASSERT(queue, ("%s: called with queue == NULL", __func__));
267289276Shiren
268289276Shiren	/* We only care about data packets. */
269289276Shiren	while (m && m->m_type != MT_DATA)
270289276Shiren		m = m->m_next;
271289276Shiren
272289276Shiren	/* We only need to do something if we still have an mbuf. */
273289276Shiren	if (!m)
274289276Shiren		return;
275289276Shiren
276289276Shiren	/* If we are not saving mbufs, return now. */
277289276Shiren	if (queue->mq_maxlen == 0)
278289276Shiren		return;
279289276Shiren
280289276Shiren	/*
281289276Shiren	 * Check to see if we will need to recycle mbufs.
282289276Shiren	 *
283289276Shiren	 * If we need to get rid of mbufs to stay below
284289276Shiren	 * our packet count, try to reuse the mbuf. Once
285289276Shiren	 * we already have a new mbuf (n), then we can
286289276Shiren	 * simply free subsequent mbufs.
287289276Shiren	 *
288289276Shiren	 * Note that most of the logic in here is to deal
289289276Shiren	 * with the reuse. If we are fine with constant
290289276Shiren	 * mbuf allocs/deallocs, we could ditch this logic.
291289276Shiren	 * But, it only seems to make sense to reuse
292289276Shiren	 * mbufs we already have.
293289276Shiren	 */
294289276Shiren	while (mbufq_full(queue)) {
295289276Shiren		mhead = mbufq_dequeue(queue);
296289276Shiren
297289276Shiren		if (n) {
298289276Shiren			tcp_pcap_m_freem(mhead);
299289276Shiren		}
300289276Shiren		else {
301289276Shiren			/*
302289276Shiren			 * If this held an external cluster, try to
303289276Shiren			 * detach the cluster. But, if we held the
304289276Shiren			 * last reference, go through the normal
305289276Shiren			 * free-ing process.
306289276Shiren			 */
307289276Shiren			if (mhead->m_flags & M_EXT) {
308289276Shiren				switch (mhead->m_ext.ext_type) {
309289276Shiren				case EXT_SFBUF:
310289276Shiren					/* Don't mess around with these. */
311289276Shiren					tcp_pcap_m_freem(mhead);
312289276Shiren					continue;
313289276Shiren				default:
314289276Shiren					if (atomic_fetchadd_int(
315289276Shiren						mhead->m_ext.ext_cnt, -1) == 1)
316289276Shiren					{
317289276Shiren						/*
318289276Shiren						 * We held the last reference
319289276Shiren						 * on this cluster. Restore
320289276Shiren						 * the reference count and put
321289276Shiren						 * it back in the pool.
322289276Shiren				 		 */
323289276Shiren						*(mhead->m_ext.ext_cnt) = 1;
324289276Shiren						tcp_pcap_m_freem(mhead);
325289276Shiren						continue;
326289276Shiren					}
327289276Shiren					/*
328289276Shiren					 * We were able to cleanly free the
329289276Shiren					 * reference.
330289276Shiren				 	 */
331289276Shiren					atomic_subtract_int(
332289276Shiren					    &tcp_pcap_clusters_referenced_cur,
333289276Shiren					    1);
334289276Shiren					tcp_pcap_alloc_reuse_ext++;
335289276Shiren					break;
336289276Shiren				}
337289276Shiren			}
338289276Shiren			else {
339289276Shiren				tcp_pcap_alloc_reuse_mbuf++;
340289276Shiren			}
341289276Shiren
342289276Shiren			n = mhead;
343289276Shiren			tcp_pcap_m_freem(n->m_next);
344295482Sglebius			m_init(n, M_NOWAIT, MT_DATA, 0);
345289276Shiren		}
346289276Shiren	}
347289276Shiren
348289276Shiren	/* Check to see if we need to get a new mbuf. */
349289276Shiren	if (!n) {
350289276Shiren		if (!(n = m_get(M_NOWAIT, MT_DATA)))
351289276Shiren			return;
352289276Shiren		tcp_pcap_alloc_new_mbuf++;
353289276Shiren	}
354289276Shiren
355289276Shiren	/*
356289276Shiren	 * What are we dealing with? If a cluster, attach it. Otherwise,
357289276Shiren	 * try to copy the data from the beginning of the mbuf to the
358289276Shiren	 * end of data. (There may be data between the start of the data
359289276Shiren	 * area and the current data pointer. We want to get this, because
360289276Shiren	 * it may contain header information that is useful.)
361289276Shiren	 * In cases where that isn't possible, settle for what we can
362289276Shiren	 * get.
363289276Shiren	 */
364289276Shiren	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
365289276Shiren		n->m_data = m->m_data;
366289276Shiren		n->m_len = m->m_len;
367289276Shiren		mb_dupcl(n, m);
368289276Shiren	}
369289276Shiren	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
370289276Shiren		/*
371289276Shiren		 * At this point, n is guaranteed to be a normal mbuf
372289276Shiren		 * with no cluster and no packet header. Because the
373289276Shiren		 * logic in this code block requires this, the assert
374289276Shiren		 * is here to catch any instances where someone
375289276Shiren		 * changes the logic to invalidate that assumption.
376289276Shiren		 */
377289276Shiren		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
378289276Shiren			("%s: Unexpected flags (%#x) for mbuf",
379289276Shiren			__func__, n->m_flags));
380289276Shiren		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
381289276Shiren		n->m_len = m->m_len;
382289276Shiren		bcopy(M_START(m), n->m_dat,
383289276Shiren			m->m_len + M_LEADINGSPACE_NOWRITE(m));
384289276Shiren	}
385289276Shiren	else {
386289276Shiren		/*
387289276Shiren		 * This is the case where we need to "settle for what
388289276Shiren		 * we can get". The most probable way to this code
389289276Shiren		 * path is that we've already taken references to the
390289276Shiren		 * maximum number of mbuf clusters we can, and the data
391289276Shiren		 * is too long to fit in an mbuf's internal storage.
392289276Shiren		 * Try for a "best fit".
393289276Shiren		 */
394289276Shiren		tcp_pcap_copy_bestfit(th, m, n);
395289276Shiren
396289276Shiren		/* Don't try to get additional data. */
397289276Shiren		goto add_to_queue;
398289276Shiren	}
399289276Shiren
400289276Shiren	if (m->m_next) {
401289276Shiren		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
402289276Shiren		tcp_pcap_adj_cluster_reference(n->m_next, 1);
403289276Shiren	}
404289276Shiren
405289276Shirenadd_to_queue:
406289276Shiren	/* Add the new mbuf to the list. */
407289276Shiren	if (mbufq_enqueue(queue, n)) {
408289276Shiren		/* This shouldn't happen. If INVARIANTS is defined, panic. */
409289276Shiren		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
410289276Shiren		tcp_pcap_m_freem(n);
411289276Shiren	}
412289276Shiren}
413289276Shiren
414289276Shirenvoid
415289276Shirentcp_pcap_drain(struct mbufq *queue)
416289276Shiren{
417289276Shiren	struct mbuf *m;
418289276Shiren	while ((m = mbufq_dequeue(queue)))
419289276Shiren		tcp_pcap_m_freem(m);
420289276Shiren}
421289276Shiren
422289276Shirenvoid
423289276Shirentcp_pcap_tcpcb_init(struct tcpcb *tp)
424289276Shiren{
425289276Shiren	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
426289276Shiren	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
427289276Shiren}
428289276Shiren
429289276Shirenvoid
430289276Shirentcp_pcap_set_sock_max(struct mbufq *queue, int newval)
431289276Shiren{
432289276Shiren	queue->mq_maxlen = newval;
433289276Shiren	while (queue->mq_len > queue->mq_maxlen)
434289276Shiren		tcp_pcap_m_freem(mbufq_dequeue(queue));
435289276Shiren}
436289276Shiren
437289276Shirenint
438289276Shirentcp_pcap_get_sock_max(struct mbufq *queue)
439289276Shiren{
440289276Shiren	return queue->mq_maxlen;
441289276Shiren}
442