/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

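/*
 * Bytes between the start of the mbuf's data buffer and its current data
 * pointer.  Unlike M_LEADINGSPACE(), this does not insist that the mbuf be
 * writable; the copies below only read that leading region.
 */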
#define M_LEADINGSPACE_NOWRITE(m)					\
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
	"Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
	"Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
	"Maximum number of clusters allowed to be referenced on TCP PCAP "
	"queues");

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
	"Number of mbufs with external storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
	"Number of mbufs with internal storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
	"Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define V_tcp_pcap_packets	VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
	CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
	"Default number of packets saved per direction per TCPCB");

/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

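/*
 * Set the initial cluster-reference limit and arrange for it to be
 * recomputed whenever nmbclusters changes.
 */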
void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
		NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
		tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
		__func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
		("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf. Since we do
		 * have a pointer to the TCP header, copy as much of
		 * it as will fit.
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
			(caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

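/*
 * Save a copy of the given packet (described by th and m) on the given
 * queue, reusing a previously queued mbuf when the queue is already at its
 * limit and allocating a new one otherwise.  Where possible, the packet
 * data is referenced rather than copied.
 */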
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse one of them as n.
	 * Once we have a reusable mbuf (n), we can simply
	 * free any subsequent mbufs we dequeue.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
						mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			}
			else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
			("%s: Unexpected flags (%#x) for mbuf",
			__func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		bcopy(M_START(m), n->m_dat,
			m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this
		 * code path is that we've already taken references
		 * to the maximum number of mbuf clusters we can, and
		 * the data is too long to fit in an mbuf's internal
		 * storage. Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

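/* Free every packet currently saved on the queue. */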
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

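/* Set up the input and output capture queues for a new TCPCB. */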
void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

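/*
 * Change the maximum number of packets saved per queue (e.g., in response
 * to the TCP_PCAP_OUT/TCP_PCAP_IN socket options), discarding packets as
 * needed to honor the new limit.
 */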
void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

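/* Report the current maximum number of packets saved on the queue. */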
int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}
