/*-
 * Copyright (c) 2015
 *	Jonathan Looney. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: releng/11.0/sys/netinet/tcp_pcap.c 302374 2016-07-06 16:17:13Z jtl $
 */

#include <sys/queue.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <machine/atomic.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_pcap.h>

#define M_LEADINGSPACE_NOWRITE(m)					\
	((m)->m_data - M_START(m))

int tcp_pcap_aggressive_free = 1;
static int tcp_pcap_clusters_referenced_cur = 0;
static int tcp_pcap_clusters_referenced_max = 0;

SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_aggressive_free,
	CTLFLAG_RW, &tcp_pcap_aggressive_free, 0,
	"Free saved packets when the memory system comes under pressure");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_cur,
	CTLFLAG_RD, &tcp_pcap_clusters_referenced_cur, 0,
	"Number of clusters currently referenced on TCP PCAP queues");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_clusters_referenced_max,
	CTLFLAG_RW, &tcp_pcap_clusters_referenced_max, 0,
	"Maximum number of clusters allowed to be referenced on TCP PCAP "
	"queues");
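
/*
 * These counters are exported under net.inet.tcp.  For example
 * (illustrative sysctl(8) invocations, not part of this file):
 *
 *	sysctl net.inet.tcp.tcp_pcap_clusters_referenced_cur
 *	sysctl net.inet.tcp.tcp_pcap_clusters_referenced_max=65536
 *
 * The first reports how many clusters the capture queues currently hold;
 * the second caps how many they may hold (the default is recomputed from
 * nmbclusters in tcp_pcap_max_set() below).
 */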

static int tcp_pcap_alloc_reuse_ext = 0;
static int tcp_pcap_alloc_reuse_mbuf = 0;
static int tcp_pcap_alloc_new_mbuf = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_ext,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_ext, 0,
	"Number of mbufs with external storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_reuse_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_reuse_mbuf, 0,
	"Number of mbufs with internal storage reused for the TCP PCAP "
	"functionality");
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_alloc_new_mbuf,
	CTLFLAG_RD, &tcp_pcap_alloc_new_mbuf, 0,
	"Number of new mbufs allocated for the TCP PCAP functionality");

VNET_DEFINE(int, tcp_pcap_packets) = 0;
#define V_tcp_pcap_packets VNET(tcp_pcap_packets)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_pcap_packets,
	CTLFLAG_RW, &VNET_NAME(tcp_pcap_packets), 0,
	"Default number of packets saved per direction per TCPCB");

/* Initialize the values. */
static void
tcp_pcap_max_set(void)
{

	tcp_pcap_clusters_referenced_max = nmbclusters / 4;
}

void
tcp_pcap_init(void)
{

	tcp_pcap_max_set();
	EVENTHANDLER_REGISTER(nmbclusters_change, tcp_pcap_max_set,
	    NULL, EVENTHANDLER_PRI_ANY);
}

/*
 * If we are below the maximum allowed cluster references,
 * increment the reference count and return TRUE. Otherwise,
 * leave the reference count alone and return FALSE.
 */
static __inline bool
tcp_pcap_take_cluster_reference(void)
{
	if (atomic_fetchadd_int(&tcp_pcap_clusters_referenced_cur, 1) >=
	    tcp_pcap_clusters_referenced_max) {
		atomic_add_int(&tcp_pcap_clusters_referenced_cur, -1);
		return FALSE;
	}
	return TRUE;
}

/*
 * For all the external entries in m, apply the given adjustment.
 * This can be used to adjust the counter when an mbuf chain is
 * copied or freed.
 */
static __inline void
tcp_pcap_adj_cluster_reference(struct mbuf *m, int adj)
{
	while (m) {
		if (m->m_flags & M_EXT)
			atomic_add_int(&tcp_pcap_clusters_referenced_cur, adj);

		m = m->m_next;
	}
}
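
/*
 * Note: the two helpers above and tcp_pcap_m_freem() below must stay in
 * balance: every cluster counted via tcp_pcap_take_cluster_reference() or
 * tcp_pcap_adj_cluster_reference(..., 1) must eventually be released
 * through tcp_pcap_m_freem() (or a matching negative adjustment), so that
 * tcp_pcap_clusters_referenced_cur tracks the clusters actually held on
 * capture queues.
 */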

/*
 * Free all mbufs in a chain, decrementing the reference count as
 * necessary.
 *
 * Functions in this file should use this instead of m_freem() when
 * they are freeing mbuf chains that may contain clusters that were
 * already included in tcp_pcap_clusters_referenced_cur.
 */
static void
tcp_pcap_m_freem(struct mbuf *mb)
{
	while (mb != NULL) {
		if (mb->m_flags & M_EXT)
			atomic_subtract_int(&tcp_pcap_clusters_referenced_cur,
			    1);
		mb = m_free(mb);
	}
}

/*
 * Copy data from m to n, where n cannot fit all the data we might
 * want from m.
 *
 * Prioritize data like this:
 * 1. TCP header
 * 2. IP header
 * 3. Data
 */
static void
tcp_pcap_copy_bestfit(struct tcphdr *th, struct mbuf *m, struct mbuf *n)
{
	struct mbuf *m_cur = m;
	int bytes_to_copy=0, trailing_data, skip=0, tcp_off;

	/* Below, we assume these will be non-NULL. */
	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(n, ("%s: called with n == NULL", __func__));

	/* We assume this initialization occurred elsewhere. */
	KASSERT(n->m_len == 0, ("%s: called with n->m_len=%d (expected 0)",
	    __func__, n->m_len));
	KASSERT(n->m_data == M_START(n),
	    ("%s: called with n->m_data != M_START(n)", __func__));

	/*
	 * Calculate the size of the TCP header. We use this often
	 * enough that it is worth just calculating at the start.
	 */
	tcp_off = th->th_off << 2;

	/* Trim off leading empty mbufs. */
	while (m && m->m_len == 0)
		m = m->m_next;

	if (m) {
		m_cur = m;
	}
	else {
		/*
		 * No data? Highly unusual. We would expect to at
		 * least see a TCP header in the mbuf.
		 * As we have a pointer to the TCP header, I guess
		 * we should just copy that. (???)
		 */
fallback:
		bytes_to_copy = tcp_off;
		if (bytes_to_copy > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		bcopy(th, n->m_data, bytes_to_copy);
		n->m_len = bytes_to_copy;
		return;
	}

	/*
	 * Find TCP header. Record the total number of bytes up to,
	 * and including, the TCP header.
	 */
	while (m_cur) {
		if ((caddr_t) th >= (caddr_t) m_cur->m_data &&
		    (caddr_t) th < (caddr_t) (m_cur->m_data + m_cur->m_len))
			break;
		bytes_to_copy += m_cur->m_len;
		m_cur = m_cur->m_next;
	}
	if (m_cur)
		bytes_to_copy += (caddr_t) th - (caddr_t) m_cur->m_data;
	else
		goto fallback;
	bytes_to_copy += tcp_off;

	/*
	 * If we already want to copy more bytes than we can hold
	 * in the destination mbuf, skip leading bytes and copy
	 * what we can.
	 *
	 * Otherwise, consider trailing data.
	 */
	if (bytes_to_copy > M_SIZE(n)) {
		skip = bytes_to_copy - M_SIZE(n);
		bytes_to_copy = M_SIZE(n);
	}
	else {
		/*
		 * Determine how much trailing data is in the chain.
		 * We start with the length of this mbuf (the one
		 * containing th) and subtract the size of the TCP
		 * header (tcp_off) and the size of the data prior
		 * to th (th - m_cur->m_data).
		 *
		 * This *should not* be negative, as the TCP code
		 * should put the whole TCP header in a single
		 * mbuf. But, it isn't a problem if it is. We will
		 * simply work off our negative balance as we look
		 * at subsequent mbufs.
		 */
		trailing_data = m_cur->m_len - tcp_off;
		trailing_data -= (caddr_t) th - (caddr_t) m_cur->m_data;
		m_cur = m_cur->m_next;
		while (m_cur) {
			trailing_data += m_cur->m_len;
			m_cur = m_cur->m_next;
		}
		if ((bytes_to_copy + trailing_data) > M_SIZE(n))
			bytes_to_copy = M_SIZE(n);
		else
			bytes_to_copy += trailing_data;
	}

	m_copydata(m, skip, bytes_to_copy, n->m_data);
	n->m_len = bytes_to_copy;
}

/*
 * Save a copy of the given segment (the TCP header and as much of the
 * rest of the packet as we can manage) on the given capture queue,
 * reusing or freeing the oldest saved segment when the queue is full.
 */
void
tcp_pcap_add(struct tcphdr *th, struct mbuf *m, struct mbufq *queue)
{
	struct mbuf *n = NULL, *mhead;

	KASSERT(th, ("%s: called with th == NULL", __func__));
	KASSERT(m, ("%s: called with m == NULL", __func__));
	KASSERT(queue, ("%s: called with queue == NULL", __func__));

	/* We only care about data packets. */
	while (m && m->m_type != MT_DATA)
		m = m->m_next;

	/* We only need to do something if we still have an mbuf. */
	if (!m)
		return;

	/* If we are not saving mbufs, return now. */
	if (queue->mq_maxlen == 0)
		return;

	/*
	 * Check to see if we will need to recycle mbufs.
	 *
	 * If we need to get rid of mbufs to stay below
	 * our packet count, try to reuse the mbuf. Once
	 * we already have a new mbuf (n), then we can
	 * simply free subsequent mbufs.
	 *
	 * Note that most of the logic in here is to deal
	 * with the reuse. If we are fine with constant
	 * mbuf allocs/deallocs, we could ditch this logic.
	 * But, it only seems to make sense to reuse
	 * mbufs we already have.
	 */
	while (mbufq_full(queue)) {
		mhead = mbufq_dequeue(queue);

		if (n) {
			tcp_pcap_m_freem(mhead);
		}
		else {
			/*
			 * If this held an external cluster, try to
			 * detach the cluster. But, if we held the
			 * last reference, go through the normal
			 * freeing process.
			 */
			if (mhead->m_flags & M_EXT) {
				switch (mhead->m_ext.ext_type) {
				case EXT_SFBUF:
					/* Don't mess around with these. */
					tcp_pcap_m_freem(mhead);
					continue;
				default:
					if (atomic_fetchadd_int(
					    mhead->m_ext.ext_cnt, -1) == 1)
					{
						/*
						 * We held the last reference
						 * on this cluster. Restore
						 * the reference count and put
						 * it back in the pool.
						 */
						*(mhead->m_ext.ext_cnt) = 1;
						tcp_pcap_m_freem(mhead);
						continue;
					}
					/*
					 * We were able to cleanly free the
					 * reference.
					 */
					atomic_subtract_int(
					    &tcp_pcap_clusters_referenced_cur,
					    1);
					tcp_pcap_alloc_reuse_ext++;
					break;
				}
			}
			else {
				tcp_pcap_alloc_reuse_mbuf++;
			}

			n = mhead;
			tcp_pcap_m_freem(n->m_next);
			m_init(n, M_NOWAIT, MT_DATA, 0);
		}
	}

	/* Check to see if we need to get a new mbuf. */
	if (!n) {
		if (!(n = m_get(M_NOWAIT, MT_DATA)))
			return;
		tcp_pcap_alloc_new_mbuf++;
	}

	/*
	 * What are we dealing with? If a cluster, attach it. Otherwise,
	 * try to copy the data from the beginning of the mbuf to the
	 * end of data. (There may be data between the start of the data
	 * area and the current data pointer. We want to get this, because
	 * it may contain header information that is useful.)
	 * In cases where that isn't possible, settle for what we can
	 * get.
	 */
	if ((m->m_flags & M_EXT) && tcp_pcap_take_cluster_reference()) {
		n->m_data = m->m_data;
		n->m_len = m->m_len;
		mb_dupcl(n, m);
	}
	else if (((m->m_data + m->m_len) - M_START(m)) <= M_SIZE(n)) {
		/*
		 * At this point, n is guaranteed to be a normal mbuf
		 * with no cluster and no packet header. Because the
		 * logic in this code block requires this, the assert
		 * is here to catch any instances where someone
		 * changes the logic to invalidate that assumption.
		 */
		KASSERT((n->m_flags & (M_EXT | M_PKTHDR)) == 0,
		    ("%s: Unexpected flags (%#x) for mbuf",
		    __func__, n->m_flags));
		n->m_data = n->m_dat + M_LEADINGSPACE_NOWRITE(m);
		n->m_len = m->m_len;
		bcopy(M_START(m), n->m_dat,
		    m->m_len + M_LEADINGSPACE_NOWRITE(m));
	}
	else {
		/*
		 * This is the case where we need to "settle for what
		 * we can get". The most probable way to reach this code
		 * path is that we've already taken references to the
		 * maximum number of mbuf clusters we can, and the data
		 * is too long to fit in an mbuf's internal storage.
		 * Try for a "best fit".
		 */
		tcp_pcap_copy_bestfit(th, m, n);

		/* Don't try to get additional data. */
		goto add_to_queue;
	}

	if (m->m_next) {
		n->m_next = m_copym(m->m_next, 0, M_COPYALL, M_NOWAIT);
		tcp_pcap_adj_cluster_reference(n->m_next, 1);
	}

add_to_queue:
	/* Add the new mbuf to the list. */
	if (mbufq_enqueue(queue, n)) {
		/* This shouldn't happen. If INVARIANTS is defined, panic. */
		KASSERT(0, ("%s: mbufq was unexpectedly full!", __func__));
		tcp_pcap_m_freem(n);
	}
}

/* Free every saved segment on the given capture queue. */
void
tcp_pcap_drain(struct mbufq *queue)
{
	struct mbuf *m;
	while ((m = mbufq_dequeue(queue)))
		tcp_pcap_m_freem(m);
}

/* Set up the per-connection capture queues with the default limit. */
void
tcp_pcap_tcpcb_init(struct tcpcb *tp)
{
	mbufq_init(&(tp->t_inpkts), V_tcp_pcap_packets);
	mbufq_init(&(tp->t_outpkts), V_tcp_pcap_packets);
}

/*
 * Change the per-socket limit on saved segments, discarding the oldest
 * entries if the queue is already over the new limit.
 */
void
tcp_pcap_set_sock_max(struct mbufq *queue, int newval)
{
	queue->mq_maxlen = newval;
	while (queue->mq_len > queue->mq_maxlen)
		tcp_pcap_m_freem(mbufq_dequeue(queue));
}

/* Report the current per-socket limit on saved segments. */
int
tcp_pcap_get_sock_max(struct mbufq *queue)
{
	return queue->mq_maxlen;
}
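
/*
 * Illustrative usage notes (not part of this file).  In FreeBSD 11-era
 * sources this facility is built with the TCPPCAP kernel option and wired
 * up roughly as sketched below; treat the exact option and macro names as
 * assumptions to verify against the tree you are working with:
 *
 *   - tcp_pcap_init() runs once at TCP initialization.
 *   - tcp_pcap_tcpcb_init(tp) sets up the per-connection queues when a TCP
 *     control block is created; tcp_pcap_drain() empties them when it is
 *     destroyed.
 *   - The input and output paths save each segment with
 *	tcp_pcap_add(th, m, &(tp->t_inpkts));	(receive side)
 *	tcp_pcap_add(th, m, &(tp->t_outpkts));	(send side)
 *   - Capture is enabled globally by setting net.inet.tcp.tcp_pcap_packets
 *     to a nonzero value before a connection is created, or per socket via
 *     the TCP_PCAP_IN/TCP_PCAP_OUT socket options, which reach
 *     tcp_pcap_set_sock_max() above.  A userland sketch, assuming those
 *     option names:
 *
 *	int n = 32;
 *	setsockopt(s, IPPROTO_TCP, TCP_PCAP_OUT, &n, sizeof(n));
 *	setsockopt(s, IPPROTO_TCP, TCP_PCAP_IN, &n, sizeof(n));
 */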