tcp_pcap.c revision 289276
1289276Shiren/*-
2289276Shiren * Copyright (c) 2015
3289276Shiren *	Jonathan Looney. All rights reserved.
4289276Shiren *
5289276Shiren * Redistribution and use in source and binary forms, with or without
6289276Shiren * modification, are permitted provided that the following conditions
7289276Shiren * are met:
8289276Shiren * 1. Redistributions of source code must retain the above copyright
9289276Shiren *    notice, this list of conditions and the following disclaimer.
10289276Shiren * 2. Redistributions in binary form must reproduce the above copyright
11289276Shiren *    notice, this list of conditions and the following disclaimer in the
12289276Shiren *    documentation and/or other materials provided with the distribution.
13289276Shiren *
14289276Shiren * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15289276Shiren * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16289276Shiren * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17289276Shiren * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18289276Shiren * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19289276Shiren * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20289276Shiren * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21289276Shiren * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22289276Shiren * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23289276Shiren * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24289276Shiren * SUCH DAMAGE.
25289276Shiren *
26289276Shiren * $FreeBSD: head/sys/netinet/tcp_pcap.c 289276 2015-10-14 00:35:37Z hiren $
27289276Shiren */
28289276Shiren
29289276Shiren#include <sys/queue.h>
30289276Shiren#include <sys/param.h>
31289276Shiren#include <sys/types.h>
32289276Shiren#include <sys/socket.h>
33289276Shiren#include <sys/socketvar.h>
34289276Shiren#include <sys/sysctl.h>
35289276Shiren#include <sys/systm.h>
36289276Shiren#include <sys/mbuf.h>
37289276Shiren#include <sys/eventhandler.h>
38289276Shiren#include <machine/atomic.h>
39289276Shiren#include <netinet/tcp_var.h>
40289276Shiren#include <netinet/tcp_pcap.h>
41289276Shiren
/*
 * Number of bytes between the start of an mbuf's storage area
 * (M_START) and its current data pointer. The value is computed
 * directly from the two pointers, with no other checks applied.
 */
#define M_LEADINGSPACE_NOWRITE(m)	((m)->m_data - M_START(m))
44289276Shiren
45289276Shirenstatic int tcp_pcap_clusters_referenced_cur = 0;
46289276Shirenstatic int tcp_pcap_clusters_referenced_max = 0;
47289276Shiren
48289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
49289276Shiren	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
50289276Shiren	"Number of clusters currently referenced on TCP PCAP queues");
51289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
52289276Shiren	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
53289276Shiren	"Maximum number of clusters allowed to be referenced on TCP PCAP "
54289276Shiren	"queues");
55289276Shiren
56289276Shirenstatic int tcp_pcap_alloc_reuse_ext = 0;
57289276Shirenstatic int tcp_pcap_alloc_reuse_mbuf = 0;
58289276Shirenstatic int tcp_pcap_alloc_new_mbuf = 0;
59289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
60289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
61289276Shiren	"Number of mbufs with external storage reused for the TCP PCAP "
62289276Shiren	"functionality");
63289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
64289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
65289276Shiren	"Number of mbufs with internal storage reused for the TCP PCAP "
66289276Shiren	"functionality");
67289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
68289276Shiren	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
69289276Shiren	"Number of new mbufs allocated for the TCP PCAP functionality");
70289276Shiren
71289276ShirenVNET_DEFINE(int, tcp_pcap_packets) = 0;
72289276Shiren#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
73289276ShirenSYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets, CTLFLAG_RW,
74289276Shiren	&V_tcp_pcap_packets, 0, "Default number of packets saved per direction "
75289276Shiren	"per TCPCB");
76289276Shiren
77289276Shiren/* Initialize the values. */
78289276Shirenstatic void
79289276Shirentcp_pcap_max_set() {
80289276Shiren	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
81289276Shiren}
82289276Shiren
83289276Shirenvoid
84289276Shirentcp_pcap_init() {
85289276Shiren	tcp_pcap_max_set();
86289276Shiren	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
87289276Shiren		NULL, EVENTHANDLER_PRI_ANY);
88289276Shiren}
89289276Shiren
90289276Shiren/*
91289276Shiren * If we are below the maximum allowed cluster references,
92289276Shiren * increment the reference count and return TRUE. Otherwise,
93289276Shiren * leave the reference count alone and return FALSE.
94289276Shiren */
95289276Shirenstatic __inline bool
96289276Shirentcp_pcap_take_cluster_reference(void)
97289276Shiren{
98289276Shiren	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
99289276Shiren		tcp_pcap_clusters_referenced_max) {
100289276Shiren		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
101289276Shiren		return FALSE;
102289276Shiren	}
103289276Shiren	return TRUE;
104289276Shiren}
105289276Shiren
106289276Shiren/*
107289276Shiren * For all the external entries in m, apply the given adjustment.
108289276Shiren * This can be used to adjust the counter when an mbuf chain is
109289276Shiren * copied or freed.
110289276Shiren */
111289276Shirenstatic __inline void
112289276Shirentcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
113289276Shiren{
114289276Shiren	while (m) {
115289276Shiren		if (m->m_flags & M_EXT)
116289276Shiren			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);
117289276Shiren
118289276Shiren		m = m->m_next;
119289276Shiren	}
120289276Shiren}
121289276Shiren
122289276Shiren/*
123289276Shiren * Free all mbufs in a chain, decrementing the reference count as
124289276Shiren * necessary.
125289276Shiren *
126289276Shiren * Functions in this file should use this instead of m_freem() when
127289276Shiren * they are freeing mbuf chains that may contain clusters that were
128289276Shiren * already included in tcp_pcap_clusters_referenced_cur.
129289276Shiren */
130289276Shirenstatic void
131289276Shirentcp_pcap_m_freem(struct mbuf *mb)
132289276Shiren{
133289276Shiren	while (mb != NULL) {
134289276Shiren		if (mb->m_flags & M_EXT)
135289276Shiren			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
136289276Shiren			    1);
137289276Shiren		mb = m_free(mb);
138289276Shiren	}
139289276Shiren}
140289276Shiren
141289276Shiren/*
142289276Shiren * Copy data from m to n, where n cannot fit all the data we might
143289276Shiren * want from m.
144289276Shiren *
145289276Shiren * Prioritize data like this:
146289276Shiren * 1. TCP header
147289276Shiren * 2. IP header
148289276Shiren * 3. Data
149289276Shiren */
150289276Shirenstatic void
151289276Shirentcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
152289276Shiren{
153289276Shiren	struct mbuf *m_cur = m;
154289276Shiren	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;
155289276Shiren
156289276Shiren	/* Below, we assume these will be non-NULL. */
157289276Shiren	KASSERT(th, ("%s: called with th == NULL", __func__));
158289276Shiren	KASSERT(m, ("%s: called with m == NULL", __func__));
159289276Shiren	KASSERT(n, ("%s: called with n == NULL", __func__));
160289276Shiren
161289276Shiren	/* We assume this initialization occurred elsewhere. */
162289276Shiren	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
163289276Shiren		__func__, n->m_len));
164289276Shiren	KASSERT(n->m_data == M_START(n),
165289276Shiren		("%s: called with n->m_data != M_START(n)", __func__));
166289276Shiren
167289276Shiren	/*
168289276Shiren	 * Calculate the size of the TCP header. We use this often
169289276Shiren	 * enough that it is worth just calculating at the start.
170289276Shiren	 */
171289276Shiren	tcp_off = th->th_off << 2;
172289276Shiren
173289276Shiren	/* Trim off leading empty mbufs. */
174289276Shiren	while (m && m->m_len == 0)
175289276Shiren		m = m->m_next;
176289276Shiren
177289276Shiren	if (m) {
178289276Shiren		m_cur = m;
179289276Shiren	}
180289276Shiren	else {
181289276Shiren		/*
182289276Shiren		 * No data? Highly unusual. We would expect to at
183289276Shiren		 * least see a TCP header in the mbuf.
184289276Shiren		 * As we have a pointer to the TCP header, I guess
185289276Shiren		 * we should just copy that. (???)
186289276Shiren		 */
187289276Shirenfallback:
188289276Shiren		bytes_to_copy = tcp_off;
189289276Shiren		if (bytes_to_copy > M_SIZE(n))
190289276Shiren			bytes_to_copy = M_SIZE(n);
191289276Shiren		bcopy(th, n->m_data, bytes_to_copy);
192289276Shiren		n->m_len = bytes_to_copy;
193289276Shiren		return;
194289276Shiren	}
195289276Shiren
196289276Shiren	/*
197289276Shiren	 * Find TCP header. Record the total number of bytes up to,
198289276Shiren	 * and including, the TCP header.
199289276Shiren	 */
200289276Shiren	while (m_cur) {
201289276Shiren		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
202289276Shiren			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
203289276Shiren			break;
204289276Shiren		bytes_to_copy += m_cur->m_len;
205289276Shiren		m_cur = m_cur->m_next;
206289276Shiren	}
207289276Shiren	if (m_cur)
208289276Shiren		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
209289276Shiren	else
210289276Shiren		goto fallback;
211289276Shiren	bytes_to_copy += tcp_off;
212289276Shiren
213289276Shiren	/*
214289276Shiren	 * If we already want to copy more bytes than we can hold
215289276Shiren	 * in the destination mbuf, skip leading bytes and copy
216289276Shiren	 * what we can.
217289276Shiren	 *
218289276Shiren	 * Otherwise, consider trailing data.
219289276Shiren	 */
220289276Shiren	if (bytes_to_copy > M_SIZE(n)) {
221289276Shiren		skip  = bytes_to_copy - M_SIZE(n);
222289276Shiren		bytes_to_copy = M_SIZE(n);
223289276Shiren	}
224289276Shiren	else {
225289276Shiren		/*
226289276Shiren		 * Determine how much trailing data is in the chain.
227289276Shiren		 * We start with the length of this mbuf (the one
228289276Shiren		 * containing th) and subtract the size of the TCP
229289276Shiren		 * header (tcp_off) and the size of the data prior
230289276Shiren		 * to th (th - m_cur->m_data).
231289276Shiren		 *
232289276Shiren		 * This *should not* be negative, as the TCP code
233289276Shiren		 * should put the whole TCP header in a single
234289276Shiren		 * mbuf. But, it isn't a problem if it is. We will
235289276Shiren		 * simple work off our negative balance as we look
236289276Shiren		 * at subsequent mbufs.
237289276Shiren		 */
238289276Shiren		trailing_data = m_cur->m_len - tcp_off;
239289276Shiren		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
240289276Shiren		m_cur = m_cur->m_next;
241289276Shiren		while (m_cur) {
242289276Shiren			trailing_data += m_cur->m_len;
243289276Shiren			m_cur = m_cur->m_next;
244289276Shiren		}
245289276Shiren		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
246289276Shiren			bytes_to_copy = M_SIZE(n);
247289276Shiren		else
248289276Shiren			bytes_to_copy += trailing_data;
249289276Shiren	}
250289276Shiren
251289276Shiren	m_copydata(m, skip, bytes_to_copy, n->m_data);
252289276Shiren	n->m_len = bytes_to_copy;
253289276Shiren}
254289276Shiren
255289276Shirenvoid
256289276Shirentcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
257289276Shiren{
258289276Shiren	struct mbuf *n = NULL, *mhead;
259289276Shiren
260289276Shiren	KASSERT(th, ("%s: called with th == NULL", __func__));
261289276Shiren	KASSERT(m, ("%s: called with m == NULL", __func__));
262289276Shiren	KASSERT(queue, ("%s: called with queue == NULL", __func__));
263289276Shiren
264289276Shiren	/* We only care about data packets. */
265289276Shiren	while (m && m->m_type != MT_DATA)
266289276Shiren		m = m->m_next;
267289276Shiren
268289276Shiren	/* We only need to do something if we still have an mbuf. */
269289276Shiren	if (!m)
270289276Shiren		return;
271289276Shiren
272289276Shiren	/* If we are not saving mbufs, return now. */
273289276Shiren	if (queue->mq_maxlen == 0)
274289276Shiren		return;
275289276Shiren
276289276Shiren	/*
277289276Shiren	 * Check to see if we will need to recycle mbufs.
278289276Shiren	 *
279289276Shiren	 * If we need to get rid of mbufs to stay below
280289276Shiren	 * our packet count, try to reuse the mbuf. Once
281289276Shiren	 * we already have a new mbuf (n), then we can
282289276Shiren	 * simply free subsequent mbufs.
283289276Shiren	 *
284289276Shiren	 * Note that most of the logic in here is to deal
285289276Shiren	 * with the reuse. If we are fine with constant
286289276Shiren	 * mbuf allocs/deallocs, we could ditch this logic.
287289276Shiren	 * But, it only seems to make sense to reuse
288289276Shiren	 * mbufs we already have.
289289276Shiren	 */
290289276Shiren	while (mbufq_full(queue)) {
291289276Shiren		mhead = mbufq_dequeue(queue);
292289276Shiren
293289276Shiren		if (n) {
294289276Shiren			tcp_pcap_m_freem(mhead);
295289276Shiren		}
296289276Shiren		else {
297289276Shiren			/*
298289276Shiren			 * If this held an external cluster, try to
299289276Shiren			 * detach the cluster. But, if we held the
300289276Shiren			 * last reference, go through the normal
301289276Shiren			 * free-ing process.
302289276Shiren			 */
303289276Shiren			if (mhead->m_flags & M_EXT) {
304289276Shiren				switch (mhead->m_ext.ext_type) {
305289276Shiren				case EXT_SFBUF:
306289276Shiren					/* Don't mess around with these. */
307289276Shiren					tcp_pcap_m_freem(mhead);
308289276Shiren					continue;
309289276Shiren				default:
310289276Shiren					if (atomic_fetchadd_int(
311289276Shiren						mhead->m_ext.ext_cnt, -1) == 1)
312289276Shiren					{
313289276Shiren						/*
314289276Shiren						 * We held the last reference
315289276Shiren						 * on this cluster. Restore
316289276Shiren						 * the reference count and put
317289276Shiren						 * it back in the pool.
318289276Shiren				 		 */
319289276Shiren						*(mhead->m_ext.ext_cnt) = 1;
320289276Shiren						tcp_pcap_m_freem(mhead);
321289276Shiren						continue;
322289276Shiren					}
323289276Shiren					/*
324289276Shiren					 * We were able to cleanly free the
325289276Shiren					 * reference.
326289276Shiren				 	 */
327289276Shiren					atomic_subtract_int(
328289276Shiren					    &tcp_pcap_clusters_referenced_cur,
329289276Shiren					    1);
330289276Shiren					tcp_pcap_alloc_reuse_ext++;
331289276Shiren					break;
332289276Shiren				}
333289276Shiren			}
334289276Shiren			else {
335289276Shiren				tcp_pcap_alloc_reuse_mbuf++;
336289276Shiren			}
337289276Shiren
338289276Shiren			n = mhead;
339289276Shiren			tcp_pcap_m_freem(n->m_next);
340289276Shiren			m_init(n, NULL, 0, M_NOWAIT, MT_DATA, 0);
341289276Shiren		}
342289276Shiren	}
343289276Shiren
344289276Shiren	/* Check to see if we need to get a new mbuf. */
345289276Shiren	if (!n) {
346289276Shiren		if (!(n = m_get(M_NOWAIT, MT_DATA)))
347289276Shiren			return;
348289276Shiren		tcp_pcap_alloc_new_mbuf++;
349289276Shiren	}
350289276Shiren
351289276Shiren	/*
352289276Shiren	 * What are we dealing with? If a cluster, attach it. Otherwise,
353289276Shiren	 * try to copy the data from the beginning of the mbuf to the
354289276Shiren	 * end of data. (There may be data between the start of the data
355289276Shiren	 * area and the current data pointer. We want to get this, because
356289276Shiren	 * it may contain header information that is useful.)
357289276Shiren	 * In cases where that isn't possible, settle for what we can
358289276Shiren	 * get.
359289276Shiren	 */
360289276Shiren	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
361289276Shiren		n->m_data = m->m_data;
362289276Shiren		n->m_len = m->m_len;
363289276Shiren		mb_dupcl(n, m);
364289276Shiren	}
365289276Shiren	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
366289276Shiren		/*
367289276Shiren		 * At this point, n is guaranteed to be a normal mbuf
368289276Shiren		 * with no cluster and no packet header. Because the
369289276Shiren		 * logic in this code block requires this, the assert
370289276Shiren		 * is here to catch any instances where someone
371289276Shiren		 * changes the logic to invalidate that assumption.
372289276Shiren		 */
373289276Shiren		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
374289276Shiren			("%s: Unexpected flags (%#x) for mbuf",
375289276Shiren			__func__, n->m_flags));
376289276Shiren		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
377289276Shiren		n->m_len = m->m_len;
378289276Shiren		bcopy(M_START(m), n->m_dat,
379289276Shiren			m->m_len + M_LEADINGSPACE_NOWRITE(m));
380289276Shiren	}
381289276Shiren	else {
382289276Shiren		/*
383289276Shiren		 * This is the case where we need to "settle for what
384289276Shiren		 * we can get". The most probable way to this code
385289276Shiren		 * path is that we've already taken references to the
386289276Shiren		 * maximum number of mbuf clusters we can, and the data
387289276Shiren		 * is too long to fit in an mbuf's internal storage.
388289276Shiren		 * Try for a "best fit".
389289276Shiren		 */
390289276Shiren		tcp_pcap_copy_bestfit(th, m, n);
391289276Shiren
392289276Shiren		/* Don't try to get additional data. */
393289276Shiren		goto add_to_queue;
394289276Shiren	}
395289276Shiren
396289276Shiren	if (m->m_next) {
397289276Shiren		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
398289276Shiren		tcp_pcap_adj_cluster_reference(n->m_next, 1);
399289276Shiren	}
400289276Shiren
401289276Shirenadd_to_queue:
402289276Shiren	/* Add the new mbuf to the list. */
403289276Shiren	if (mbufq_enqueue(queue, n)) {
404289276Shiren		/* This shouldn't happen. If INVARIANTS is defined, panic. */
405289276Shiren		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
406289276Shiren		tcp_pcap_m_freem(n);
407289276Shiren	}
408289276Shiren}
409289276Shiren
/*
 * Empty a TCP PCAP queue: dequeue every saved packet and free it
 * with tcp_pcap_m_freem() so the cluster reference count stays in
 * step.
 */
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *pkt;

	for (pkt = mbufq_dequeue(queue); pkt != NULL;
	    pkt = mbufq_dequeue(queue))
		tcp_pcap_m_freem(pkt);
}
417289276Shiren
418289276Shirenvoid
419289276Shirentcp_pcap_tcpcb_init(struct tcpcb *tp)
420289276Shiren{
421289276Shiren	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
422289276Shiren	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
423289276Shiren}
424289276Shiren
425289276Shirenvoid
426289276Shirentcp_pcap_set_sock_max(struct mbufq *queue, int newval)
427289276Shiren{
428289276Shiren	queue->mq_maxlen = newval;
429289276Shiren	while (queue->mq_len > queue->mq_maxlen)
430289276Shiren		tcp_pcap_m_freem(mbufq_dequeue(queue));
431289276Shiren}
432289276Shiren
433289276Shirenint
434289276Shirentcp_pcap_get_sock_max(struct mbufq *queue)
435289276Shiren{
436289276Shiren	return queue->mq_maxlen;
437289276Shiren}
438