/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: releng/11.0/sys/netinet/tcp_pcap.c 302374 2016-07-06 16:17:13Z jtl $
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

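/*
 * Bytes of leading space in an mbuf: the gap between the start of the
 * storage area and the current data pointer. Unlike M_LEADINGSPACE(),
 * this does not require the mbuf to be writable.
 */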
#define M_LEADINGSPACE_NOWRITE(m)					\
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
	"Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
	"Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
	"Maximum number of clusters allowed to be referenced on TCP PCAP "
	"queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
	"Number of mbufs with external storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
	"Number of mbufs with internal storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
	"Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
	CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
	"Default number of packets saved per direction per TCPCB");

/* Initialize tcp_pcap_clusters_referenced_max from nmbclusters. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

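/*
 * Set the initial cluster-reference limit and arrange to recompute it
 * whenever nmbclusters changes.
 */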
void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
		NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
		tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For each mbuf in the chain m that has external storage, apply the
 * given adjustment to the cluster reference counter. This can be used
 * to adjust the counter when an mbuf chain is copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy = 0, trailing_data, skip = 0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
		__func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
		("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	} else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf. Since we do
		 * have a pointer to the TCP header, just copy that.
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	} else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

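/*
 * Save a copy of the segment whose TCP header is at th (which must
 * point into the chain m) on the given queue, reusing a recycled mbuf
 * from the queue when it is full or allocating a new one otherwise.
 */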
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we have a recycled mbuf (n), we can simply
	 * free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		} else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
						mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			} else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	} else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
			("%s: Unexpected flags (%#x) for mbuf",
			__func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		bcopy(M_START(m), n->m_dat,
			m->m_len + M_LEADINGSPACE_NOWRITE(m));
	} else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this
		 * code path is that we've already taken references to
		 * the maximum number of mbuf clusters we can, and the
		 * data is too long to fit in an mbuf's internal
		 * storage. Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

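/*
 * Free every mbuf chain remaining on the queue, keeping the cluster
 * reference counter consistent.
 */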
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;

	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

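/*
 * Initialize the input and output packet queues in a new tcpcb with
 * the current default per-direction packet limit.
 */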
void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

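/*
 * Set a new limit on the number of packets this queue may hold and
 * free the oldest saved packets until the queue fits under it.
 */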
void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

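/* Return the current packet limit for this queue. */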
int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}