/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"

/* For debugging we want counters and BB logging */
/* #define TCP_REASS_COUNTERS 1 */
/* #define TCP_REASS_LOGGING 1 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <vm/uma.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef TCP_REASS_LOGGING
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_hpts.h>
#endif
#include <netinet6/tcp6_var.h>
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif /* TCPDEBUG */

#define TCP_R_LOG_ADD		1
#define TCP_R_LOG_LIMIT_REACHED 2
#define TCP_R_LOG_APPEND	3
#define TCP_R_LOG_PREPEND	4
#define TCP_R_LOG_REPLACE	5
#define TCP_R_LOG_MERGE_INTO	6
#define TCP_R_LOG_NEW_ENTRY	7
#define TCP_R_LOG_READ		8
#define TCP_R_LOG_ZERO		9
#define TCP_R_LOG_DUMP		10
#define TCP_R_LOG_TRIM		11

static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Segment Reassembly Queue");

static SYSCTL_NODE(_net_inet_tcp_reass, OID_AUTO, stats,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Segment Reassembly stats");

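/*
 * Being CTLFLAG_RDTUN, the global segment cap below can only be set as
 * a boot-time tunable, e.g. in loader.conf(5):
 *
 *	net.inet.tcp.reass.maxsegments="8192"
 *
 * (example value only; the default is derived from nmbclusters at init).
 */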
static int tcp_reass_maxseg = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
    &tcp_reass_maxseg, 0,
    "Global maximum number of TCP Segments in Reassembly Queue");

static uma_zone_t tcp_reass_zone;
SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, 0,
    &tcp_reass_zone,
    "Global number of TCP Segments currently in Reassembly Queue");

static u_int tcp_reass_maxqueuelen = 100;
SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, maxqueuelen, CTLFLAG_RWTUN,
    &tcp_reass_maxqueuelen, 0,
    "Maximum number of TCP Segments per Reassembly Queue");

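/*
 * Two per-queue limit schemes exist (see the new_entry: path in
 * tcp_reass() below): the classic one caps the entry count at
 * min(sb_hiwat / t_maxseg + 1, maxqueuelen), while the "new" method
 * instead refuses sub-mbuf-sized segments once the queue is longer
 * than queueguard entries.
 */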
static int tcp_new_limits = 0;
SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, new_limit, CTLFLAG_RWTUN,
    &tcp_new_limits, 0,
    "Use the new guard-based reassembly queue limit method");

static u_int tcp_reass_queue_guard = 16;
SYSCTL_UINT(_net_inet_tcp_reass, OID_AUTO, queueguard, CTLFLAG_RWTUN,
    &tcp_reass_queue_guard, 16,
    "Number of TCP Segments in Reassembly Queue where we flip over to guard mode");

#ifdef TCP_REASS_COUNTERS

counter_u64_t reass_entry;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, entry, CTLFLAG_RD,
    &reass_entry, "A segment entered reassembly");

counter_u64_t reass_path1;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path1, CTLFLAG_RD,
    &reass_path1, "Took path 1");

counter_u64_t reass_path2;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path2, CTLFLAG_RD,
    &reass_path2, "Took path 2");

counter_u64_t reass_path3;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path3, CTLFLAG_RD,
    &reass_path3, "Took path 3");

counter_u64_t reass_path4;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path4, CTLFLAG_RD,
    &reass_path4, "Took path 4");

counter_u64_t reass_path5;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path5, CTLFLAG_RD,
    &reass_path5, "Took path 5");

counter_u64_t reass_path6;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path6, CTLFLAG_RD,
    &reass_path6, "Took path 6");

counter_u64_t reass_path7;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, path7, CTLFLAG_RD,
    &reass_path7, "Took path 7");

counter_u64_t reass_fullwalk;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, fullwalk, CTLFLAG_RD,
    &reass_fullwalk, "Took a full walk");

counter_u64_t reass_nospace;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, nospace, CTLFLAG_RD,
    &reass_nospace, "Had no mbuf capacity");

counter_u64_t merge_fwd;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, merge_fwd, CTLFLAG_RD,
    &merge_fwd, "Ran merge fwd");

counter_u64_t merge_into;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, merge_into, CTLFLAG_RD,
    &merge_into, "Ran merge into");

counter_u64_t tcp_zero_input;
SYSCTL_COUNTER_U64(_net_inet_tcp_reass_stats, OID_AUTO, zero_input, CTLFLAG_RD,
    &tcp_zero_input, "The reassembly buffer saw a zero length segment, etc.");

#endif

/* Adjust the reassembly zone limit when nmbclusters changes. */
static void
tcp_reass_zone_change(void *tag)
{

	/* Set the zone limit and read back the effective value. */
	tcp_reass_maxseg = nmbclusters / 16;
	tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
	    tcp_reass_maxseg);
}

#ifdef TCP_REASS_LOGGING

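/*
 * Emit a black-box log record for a reassembly event.  The queue
 * entries q and p (either may be NULL) are flattened into the
 * stack-specific fields: flex2/flex3/flex4 carry q's start, length and
 * mbuf count, flex5/pkts_out/epoch carry p's, flex7 is the caller's
 * instance tag and flex8 the TCP_R_LOG_* action code.
 */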
static void
tcp_log_reassm(struct tcpcb *tp, struct tseg_qent *q, struct tseg_qent *p,
    tcp_seq seq, int len, uint8_t action, int instance)
{
	uint32_t cts;
	struct timeval tv;

	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		union tcp_log_stackspecific log;

		memset(&log, 0, sizeof(log));
		cts = tcp_get_usecs(&tv);
		log.u_bbr.flex1 = seq;
		log.u_bbr.cur_del_rate = (uint64_t)q;
		log.u_bbr.delRate = (uint64_t)p;
		if (q != NULL) {
			log.u_bbr.flex2 = q->tqe_start;
			log.u_bbr.flex3 = q->tqe_len;
			log.u_bbr.flex4 = q->tqe_mbuf_cnt;
			log.u_bbr.hptsi_gain = q->tqe_flags;
		}
		if (p != NULL) {
			log.u_bbr.flex5 = p->tqe_start;
			log.u_bbr.pkts_out = p->tqe_len;
			log.u_bbr.epoch = p->tqe_mbuf_cnt;
			log.u_bbr.cwnd_gain = p->tqe_flags;
		}
		log.u_bbr.flex6 = tp->t_segqmbuflen;
		log.u_bbr.flex7 = instance;
		log.u_bbr.flex8 = action;
		log.u_bbr.timeStamp = cts;
		TCP_LOG_EVENTP(tp, NULL,
		    &tp->t_inpcb->inp_socket->so_rcv,
		    &tp->t_inpcb->inp_socket->so_snd,
		    TCP_LOG_REASS, 0,
		    len, &log, false, &tv);
	}
}

static void
tcp_reass_log_dump(struct tcpcb *tp)
{
	struct tseg_qent *q;

	if (tp->t_logstate != TCP_LOG_STATE_OFF) {
		TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
			tcp_log_reassm(tp, q, NULL, q->tqe_start, q->tqe_len, TCP_R_LOG_DUMP, 0);
		}
	}
}

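/*
 * Log an arriving segment; the total byte count of its mbuf chain is
 * passed through as the instance field of the log record.
 */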
static void
tcp_reass_log_new_in(struct tcpcb *tp, tcp_seq seq, int len, struct mbuf *m,
    int logval, struct tseg_qent *q)
{
	int cnt;
	struct mbuf *t;

	cnt = 0;
	t = m;
	while (t) {
		cnt += t->m_len;
		t = t->m_next;
	}
	tcp_log_reassm(tp, q, NULL, seq, len, logval, cnt);
}

#endif

void
tcp_reass_global_init(void)
{

	tcp_reass_maxseg = nmbclusters / 16;
	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
	    &tcp_reass_maxseg);
	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* Set the zone limit and read back the effective value. */
	tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
	    tcp_reass_maxseg);
#ifdef TCP_REASS_COUNTERS
	reass_path1 = counter_u64_alloc(M_WAITOK);
	reass_path2 = counter_u64_alloc(M_WAITOK);
	reass_path3 = counter_u64_alloc(M_WAITOK);
	reass_path4 = counter_u64_alloc(M_WAITOK);
	reass_path5 = counter_u64_alloc(M_WAITOK);
	reass_path6 = counter_u64_alloc(M_WAITOK);
	reass_path7 = counter_u64_alloc(M_WAITOK);
	reass_fullwalk = counter_u64_alloc(M_WAITOK);
	reass_nospace = counter_u64_alloc(M_WAITOK);
	reass_entry = counter_u64_alloc(M_WAITOK);
	merge_fwd = counter_u64_alloc(M_WAITOK);
	merge_into = counter_u64_alloc(M_WAITOK);
	tcp_zero_input = counter_u64_alloc(M_WAITOK);
#endif
	EVENTHANDLER_REGISTER(nmbclusters_change,
	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
}

void
tcp_reass_flush(struct tcpcb *tp)
{
	struct tseg_qent *qe;

	INP_WLOCK_ASSERT(tp->t_inpcb);

	while ((qe = TAILQ_FIRST(&tp->t_segq)) != NULL) {
		TAILQ_REMOVE(&tp->t_segq, qe, tqe_q);
		m_freem(qe->tqe_m);
		uma_zfree(tcp_reass_zone, qe);
		tp->t_segqlen--;
	}
	tp->t_segqmbuflen = 0;
	KASSERT((tp->t_segqlen == 0),
	    ("TCP reass queue %p segment count is %d instead of 0 after flush.",
	    tp, tp->t_segqlen));
}

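/*
 * Append the segment (m .. mlast, tlen data bytes) to the rear queue
 * entry 'last'.  Callers have already trimmed any overlap, so this
 * only links the chain onto the tail and extends the length and
 * mbuf-overhead accounting.
 */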
static void
tcp_reass_append(struct tcpcb *tp, struct tseg_qent *last,
    struct mbuf *m, struct tcphdr *th, int tlen,
    struct mbuf *mlast, int lenofoh)
{

#ifdef TCP_REASS_LOGGING
	tcp_log_reassm(tp, last, NULL, th->th_seq, tlen, TCP_R_LOG_APPEND, 0);
#endif
	last->tqe_len += tlen;
	last->tqe_m->m_pkthdr.len += tlen;
	/* Preserve the FIN bit if it's there */
	last->tqe_flags |= (th->th_flags & TH_FIN);
	last->tqe_last->m_next = m;
	last->tqe_last = mlast;
	last->tqe_mbuf_cnt += lenofoh;
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, tlen);
#ifdef TCP_REASS_LOGGING
	tcp_reass_log_new_in(tp, last->tqe_start, lenofoh, last->tqe_m,
			     TCP_R_LOG_APPEND,
			     last);
#endif
}

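/*
 * Prepend the segment (m .. mlast, tlen data bytes) to queue entry
 * 'first'.  Where the new data overlaps the start of the existing
 * entry, the existing entry is trimmed; the new bytes win.
 */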
static void
tcp_reass_prepend(struct tcpcb *tp, struct tseg_qent *first, struct mbuf *m, struct tcphdr *th,
		  int tlen, struct mbuf *mlast, int lenofoh)
{
	int i;

#ifdef TCP_REASS_LOGGING
	tcp_log_reassm(tp, first, NULL, th->th_seq, tlen, TCP_R_LOG_PREPEND, 0);
#endif
	if (SEQ_GT((th->th_seq + tlen), first->tqe_start)) {
		/* The new data overlaps into the old */
		i = (th->th_seq + tlen) - first->tqe_start;
#ifdef TCP_REASS_LOGGING
		tcp_log_reassm(tp, first, NULL, 0, i, TCP_R_LOG_TRIM, 1);
#endif
		m_adj(first->tqe_m, i);
		first->tqe_len -= i;
		first->tqe_start += i;
	}
	/* OK, now set up our chain to point to the old first */
	mlast->m_next = first->tqe_m;
	first->tqe_m = m;
	first->tqe_len += tlen;
	first->tqe_start = th->th_seq;
	first->tqe_m->m_pkthdr.len = first->tqe_len;
	first->tqe_mbuf_cnt += lenofoh;
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, tlen);
#ifdef TCP_REASS_LOGGING
	tcp_reass_log_new_in(tp, first->tqe_start, lenofoh, first->tqe_m,
			     TCP_R_LOG_PREPEND,
			     first);
#endif
}

static void
tcp_reass_replace(struct tcpcb *tp, struct tseg_qent *q, struct mbuf *m,
    tcp_seq seq, int len, struct mbuf *mlast, int mbufoh, uint8_t flags)
{
	/*
	 * Free the data in q, and replace
	 * it with the new segment.
	 */
	int len_dif;

#ifdef TCP_REASS_LOGGING
	tcp_log_reassm(tp, q, NULL, seq, len, TCP_R_LOG_REPLACE, 0);
#endif
	m_freem(q->tqe_m);
	KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt,
		("Tp:%p seg queue goes negative", tp));
	tp->t_segqmbuflen -= q->tqe_mbuf_cnt;
	q->tqe_mbuf_cnt = mbufoh;
	q->tqe_m = m;
	q->tqe_last = mlast;
	q->tqe_start = seq;
	if (len > q->tqe_len)
		len_dif = len - q->tqe_len;
	else
		len_dif = 0;
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, len_dif);
	q->tqe_len = len;
	q->tqe_flags = (flags & TH_FIN);
	q->tqe_m->m_pkthdr.len = q->tqe_len;
	tp->t_segqmbuflen += mbufoh;
}

static void
tcp_reass_merge_into(struct tcpcb *tp, struct tseg_qent *ent,
    struct tseg_qent *q)
{
	/*
	 * Merge q into ent and free q from the list.
	 */
#ifdef TCP_REASS_LOGGING
	tcp_log_reassm(tp, q, ent, 0, 0, TCP_R_LOG_MERGE_INTO, 0);
#endif
#ifdef TCP_REASS_COUNTERS
	counter_u64_add(merge_into, 1);
#endif
	ent->tqe_last->m_next = q->tqe_m;
	ent->tqe_last = q->tqe_last;
	ent->tqe_len += q->tqe_len;
	ent->tqe_mbuf_cnt += q->tqe_mbuf_cnt;
	ent->tqe_m->m_pkthdr.len += q->tqe_len;
	ent->tqe_flags |= (q->tqe_flags & TH_FIN);
	TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
	uma_zfree(tcp_reass_zone, q);
	tp->t_segqlen--;
}

static void
tcp_reass_merge_forward(struct tcpcb *tp, struct tseg_qent *ent)
{
	struct tseg_qent *q, *qtmp;
	int i;
	tcp_seq max;
	/*
	 * Given an entry, merge forward into any
	 * later entries that ent overlaps.
	 */

	max = ent->tqe_start + ent->tqe_len;
	q = TAILQ_NEXT(ent, tqe_q);
	if (q == NULL) {
		/* Nothing left */
		return;
	}
	TAILQ_FOREACH_FROM_SAFE(q, &tp->t_segq, tqe_q, qtmp) {
		if (SEQ_GT(q->tqe_start, max)) {
			/* q starts beyond our reach */
			break;
		}
		/* We have some or all that are overlapping */
		if (SEQ_GEQ(max, (q->tqe_start + q->tqe_len))) {
			/* It consumes it all */
			tp->t_segqmbuflen -= q->tqe_mbuf_cnt;
			m_freem(q->tqe_m);
			TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
			uma_zfree(tcp_reass_zone, q);
			tp->t_segqlen--;
			continue;
		}
		/*
		 * Trim the q entry to dovetail to this one
		 * and then merge q into ent updating max
		 * in the process.
		 */
		i = max - q->tqe_start;
#ifdef TCP_REASS_LOGGING
		tcp_log_reassm(tp, q, NULL, 0, i, TCP_R_LOG_TRIM, 2);
#endif
		m_adj(q->tqe_m, i);
		q->tqe_len -= i;
		q->tqe_start += i;
		tcp_reass_merge_into(tp, ent, q);
		max = ent->tqe_start + ent->tqe_len;
	}
#ifdef TCP_REASS_COUNTERS
	counter_u64_add(merge_fwd, 1);
#endif
}

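/*
 * Compute the storage footprint of an mbuf chain: MSIZE for every
 * mbuf, plus the external buffer size of any mbuf carrying a cluster.
 * The last mbuf of the chain is returned through *mlast so callers can
 * link onto the tail without re-walking the chain.
 */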
static int
tcp_reass_overhead_of_chain(struct mbuf *m, struct mbuf **mlast)
{
	int len = MSIZE;

	if (m->m_flags & M_EXT)
		len += m->m_ext.ext_size;
	while (m->m_next != NULL) {
		m = m->m_next;
		len += MSIZE;
		if (m->m_flags & M_EXT)
			len += m->m_ext.ext_size;
	}
	*mlast = m;
	return (len);
}

/*
 * NOTE!!! the new tcp-reassembly code *must not* use
 * m_adj() with a negative index. That alters the chain
 * of mbufs (by possibly chopping trailing mbufs). At
 * the front of tcp_reass we count the mbuf overhead
 * and set up the tail pointer. If we use m_adj(m, -5)
 * we could corrupt the tail pointer. Currently the
 * code only uses m_adj(m, positive-num). If this
 * changes, appropriate changes to update mlast would
 * be needed.
 */
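/*
 * Summary of the apparent contract below: tcp_reass() always consumes
 * the mbuf chain m (it is either queued or freed).  On return,
 * *seq_start and *tlenp describe the queue entry that now covers the
 * segment (where one exists), and the return value carries TH_FIN only
 * once completed sequence space including a FIN has been handed to the
 * socket.
 */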
int
tcp_reass(struct tcpcb *tp, struct tcphdr *th, tcp_seq *seq_start,
	  int *tlenp, struct mbuf *m)
{
	struct tseg_qent *q, *last, *first;
	struct tseg_qent *p = NULL;
	struct tseg_qent *nq = NULL;
	struct tseg_qent *te = NULL;
	struct mbuf *mlast = NULL;
	struct sockbuf *sb;
	struct socket *so = tp->t_inpcb->inp_socket;
	char *s = NULL;
	int flags, i, lenofoh;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	/*
	 * XXX: tcp_reass() is rather inefficient with its data structures
	 * and should be rewritten (see NetBSD for optimizations).
	 */

	KASSERT(th == NULL || (seq_start != NULL && tlenp != NULL),
	        ("tcp_reass called with illegal parameter combination "
	         "(tp=%p, th=%p, seq_start=%p, tlenp=%p, m=%p)",
	         tp, th, seq_start, tlenp, m));
	/*
	 * Called with th == NULL after the connection becomes
	 * established, to force any pre-ESTABLISHED data up to
	 * the user socket.
	 */
	if (th == NULL)
		goto present;
	KASSERT(SEQ_GEQ(th->th_seq, tp->rcv_nxt),
		("Attempt to add old entry to reassembly queue (th=%p, tp=%p)",
		 th, tp));
#ifdef TCP_REASS_LOGGING
	tcp_reass_log_new_in(tp, th->th_seq, *tlenp, m, TCP_R_LOG_ADD, NULL);
#endif
#ifdef TCP_REASS_COUNTERS
	counter_u64_add(reass_entry, 1);
#endif
	/*
	 * Check for zero length data.
	 */
	if ((*tlenp == 0) && ((th->th_flags & TH_FIN) == 0)) {
		/*
		 * A zero length segment does no
		 * one any good. We could check
		 * the rcv_nxt <-> rcv_wnd but that's
		 * already done for us by the caller.
		 */
strip_fin:
#ifdef TCP_REASS_COUNTERS
		counter_u64_add(tcp_zero_input, 1);
#endif
		m_freem(m);
#ifdef TCP_REASS_LOGGING
		tcp_reass_log_dump(tp);
#endif
		return (0);
	} else if ((*tlenp == 0) &&
		   (th->th_flags & TH_FIN) &&
		   !TCPS_HAVEESTABLISHED(tp->t_state)) {
		/*
		 * The connection is not yet established, and we
		 * have a FIN with no data. Treat this as if the
		 * FIN were not present: we don't want to park the
		 * FIN bit in a reassembly buffer before we are
		 * established (the peer will retransmit it).
		 */
		goto strip_fin;
	}
	/*
	 * Will it fit?
	 */
	lenofoh = tcp_reass_overhead_of_chain(m, &mlast);
	sb = &tp->t_inpcb->inp_socket->so_rcv;
	if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
	    (sb->sb_mbcnt + tp->t_segqmbuflen + lenofoh) > sb->sb_mbmax) {
		/* No room */
		TCPSTAT_INC(tcps_rcvreassfull);
#ifdef TCP_REASS_COUNTERS
		counter_u64_add(reass_nospace, 1);
#endif
#ifdef TCP_REASS_LOGGING
		tcp_log_reassm(tp, NULL, NULL, th->th_seq, lenofoh, TCP_R_LOG_LIMIT_REACHED, 0);
#endif
		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: mbuf count limit reached, "
			    "segment dropped\n", s, __func__);
			free(s, M_TCPLOG);
		}
		m_freem(m);
		*tlenp = 0;
#ifdef TCP_REASS_LOGGING
		tcp_reass_log_dump(tp);
#endif
		return (0);
	}
	/*
	 * First let's deal with two common cases: the segment
	 * appends to the back of our collected segments, or the
	 * segment is the next in line.
	 */
	last = TAILQ_LAST_FAST(&tp->t_segq, tseg_qent, tqe_q);
	if (last != NULL) {
		if ((th->th_flags & TH_FIN) &&
		    SEQ_LT((th->th_seq + *tlenp), (last->tqe_start + last->tqe_len))) {
			/*
			 * Someone is trying to game us, dump
			 * the segment.
			 */
			*tlenp = 0;
			m_freem(m);
			return (0);
		}
		if ((SEQ_GEQ(th->th_seq, last->tqe_start)) &&
		    (SEQ_GEQ((last->tqe_start + last->tqe_len), th->th_seq))) {
			/* Common case, trailing segment is added */
			/**
			 *                                 +--last
			 *                                 v
			 *  reassembly buffer |---|  |---| |---|
			 *  new segment                       |---|
			 */
#ifdef TCP_REASS_COUNTERS
			counter_u64_add(reass_path1, 1);
#endif
			if (SEQ_GT((last->tqe_start + last->tqe_len), th->th_seq)) {
				i = (last->tqe_start + last->tqe_len) - th->th_seq;
				if (i < *tlenp) {
#ifdef TCP_REASS_LOGGING
					tcp_log_reassm(tp, last, NULL, 0, i, TCP_R_LOG_TRIM, 3);
#endif
					m_adj(m, i);
					*tlenp -= i;
					th->th_seq += i;
				} else {
					/* Complete overlap */
					TCPSTAT_INC(tcps_rcvduppack);
					TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
					m_freem(m);
					*tlenp = last->tqe_len;
					*seq_start = last->tqe_start;
					return (0);
				}
			}
			if (last->tqe_flags & TH_FIN) {
				/*
				 * We have data after the FIN on the last?
				 */
				*tlenp = 0;
				m_freem(m);
				return (0);
			}
			tcp_reass_append(tp, last, m, th, *tlenp, mlast, lenofoh);
			tp->t_segqmbuflen += lenofoh;
			*seq_start = last->tqe_start;
			*tlenp = last->tqe_len;
			return (0);
		} else if (SEQ_GT(th->th_seq, (last->tqe_start + last->tqe_len))) {
			/*
			 * Second common case, we missed
			 * another one and have something more
			 * for the end.
			 */
			/**
			 *                                 +--last
			 *                                 v
			 *  reassembly buffer |---|  |---| |---|
			 *  new segment                           |---|
			 */
			if (last->tqe_flags & TH_FIN) {
				/*
				 * We have data after the FIN on the last?
				 */
				*tlenp = 0;
				m_freem(m);
				return (0);
			}
#ifdef TCP_REASS_COUNTERS
			counter_u64_add(reass_path2, 1);
#endif
			p = last;
			goto new_entry;
		}
	} else {
		/* First segment (it's NULL). */
		goto new_entry;
	}
	first = TAILQ_FIRST(&tp->t_segq);
	if (SEQ_LT(th->th_seq, first->tqe_start) &&
	    SEQ_GEQ((th->th_seq + *tlenp), first->tqe_start) &&
	    SEQ_LT((th->th_seq + *tlenp), (first->tqe_start + first->tqe_len))) {
		/*
		 * This segment prepends the head of the queue,
		 * and it may be the one we want most.
		 */
		/**
		 *       first-------+
		 *                   v
		 *  rea=             |---|  |---| |---|
		 *  new=         |---|
		 * Note the case we do not deal with here is:
		 *   rea=     |---|   |---|   |---|
		 *   new=  |----|
		 * because it could be
		 *   new=  |--------------------|
		 * and we might need to merge forward.
		 */
#ifdef INVARIANTS
		struct mbuf *firstmbuf;
#endif

#ifdef TCP_REASS_COUNTERS
		counter_u64_add(reass_path3, 1);
#endif
		if (SEQ_LT(th->th_seq, tp->rcv_nxt)) {
			/*
			 * The resend was even before
			 * what we have. We need to trim it.
			 * Note TSNH (it should be trimmed
			 * before the call to tcp_reass()).
			 */
#ifdef INVARIANTS
			panic("th->th_seq:%u rcv_nxt:%u tp:%p not pre-trimmed",
			      th->th_seq, tp->rcv_nxt, tp);
#else
			i = tp->rcv_nxt - th->th_seq;
#ifdef TCP_REASS_LOGGING
			tcp_log_reassm(tp, first, NULL, 0, i, TCP_R_LOG_TRIM, 4);
#endif
			m_adj(m, i);
			th->th_seq += i;
			*tlenp -= i;
#endif
		}
#ifdef INVARIANTS
		firstmbuf = first->tqe_m;
#endif
		tcp_reass_prepend(tp, first, m, th, *tlenp, mlast, lenofoh);
#ifdef INVARIANTS
		if (firstmbuf == first->tqe_m) {
			panic("First stayed same m:%p foobar:%p first->tqe_m:%p tp:%p first:%p",
			      m, firstmbuf, first->tqe_m, tp, first);
		} else if (first->tqe_m != m) {
			panic("First did not change to m:%p foobar:%p first->tqe_m:%p tp:%p first:%p",
			      m, firstmbuf, first->tqe_m, tp, first);
		}
#endif
		tp->t_segqmbuflen += lenofoh;
		*seq_start = first->tqe_start;
		*tlenp = first->tqe_len;
		goto present;
	} else if (SEQ_LT((th->th_seq + *tlenp), first->tqe_start)) {
		/* New segment is before our earliest segment. */
		/**
		 *           first---->+
		 *                      v
		 *  rea=                |---| ....
		 *  new=         |---|
		 *
		 */
		goto new_entry;
	}
	/*
	 * Find a segment which begins after this one does.
	 */
#ifdef TCP_REASS_COUNTERS
	counter_u64_add(reass_fullwalk, 1);
#endif
	TAILQ_FOREACH(q, &tp->t_segq, tqe_q) {
		if (SEQ_GT(q->tqe_start, th->th_seq))
			break;
	}
	p = TAILQ_PREV(q, tsegqe_head, tqe_q);
	/**
	 * Does this fit just in-between only?
	 * i.e.:
	 *      p---+        +----q
	 *          v        v
	 *     rea= |--|     |--|    |--|
	 *     new=      |-|
	 */
	if (SEQ_LT((th->th_seq + *tlenp), q->tqe_start) &&
	    ((p == NULL) || (SEQ_GT(th->th_seq, (p->tqe_start + p->tqe_len))))) {
		/* Yep no overlap */
		goto new_entry;
	}
	/**
	 * If we reach here we have some (possibly all) overlap
	 * such as:
	 *     rea=     |--|     |--|    |--|
	 *     new=  |----|
	 * or  new=  |-----------------|
	 * or  new=      |--------|
	 * or  new=            |---|
	 * or  new=            |-----------|
	 */
	if ((p != NULL) &&
	    (SEQ_LEQ(th->th_seq, (p->tqe_start + p->tqe_len)))) {
		/* conversion to int (in i) handles seq wraparound */

#ifdef TCP_REASS_COUNTERS
		counter_u64_add(reass_path4, 1);
#endif
		i = p->tqe_start + p->tqe_len - th->th_seq;
		if (i >= 0) {
			if (i >= *tlenp) {
				/**
				 *       prev seg---->+
				 *                    v
				 *  reassembly buffer |---|
				 *  new segment        |-|
				 */
				TCPSTAT_INC(tcps_rcvduppack);
				TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
				*tlenp = p->tqe_len;
				*seq_start = p->tqe_start;
				m_freem(m);
				/*
				 * Try to present any queued data
				 * at the left window edge to the user.
				 * This is needed after the 3-WHS
				 * completes. Note this probably
				 * will not work and we will return.
				 */
				return (0);
			}
			if (i > 0) {
				/**
				 *       prev seg---->+
				 *                    v
				 *  reassembly buffer |---|
				 *  new segment         |-----|
				 */
#ifdef TCP_REASS_COUNTERS
				counter_u64_add(reass_path5, 1);
#endif
#ifdef TCP_REASS_LOGGING
				tcp_log_reassm(tp, p, NULL, 0, i, TCP_R_LOG_TRIM, 5);
#endif
				m_adj(m, i);
				*tlenp -= i;
				th->th_seq += i;
			}
		}
		if (th->th_seq == (p->tqe_start + p->tqe_len)) {
			/*
			 * If it dovetails in with this one,
			 * append it.
			 */
			/**
			 *       prev seg---->+
			 *                    v
			 *  reassembly buffer |--|     |---|
			 *  new segment          |--|
			 * (note: it was trimmed above if it overlapped)
			 */
			tcp_reass_append(tp, p, m, th, *tlenp, mlast, lenofoh);
			tp->t_segqmbuflen += lenofoh;
		} else {
#ifdef INVARIANTS
			panic("Impossible cut th_seq:%u p->seq:%u(%d) p:%p tp:%p",
			      th->th_seq, p->tqe_start, p->tqe_len,
			      p, tp);
#endif
			*tlenp = 0;
			m_freem(m);
			return (0);
		}
		q = p;
	} else {
		/*
		 * The new data runs over the
		 * top of previously sack'd data (in q).
		 * It may be partially overlapping, or
		 * it may overlap the entire segment.
		 */
#ifdef TCP_REASS_COUNTERS
		counter_u64_add(reass_path6, 1);
#endif
		if (SEQ_GEQ((th->th_seq + *tlenp), (q->tqe_start + q->tqe_len))) {
			/* It consumes it all */
			/**
			 *             next seg---->+
			 *                          v
			 *  reassembly buffer |--|     |---|
			 *  new segment              |----------|
			 */
#ifdef TCP_REASS_COUNTERS
			counter_u64_add(reass_path7, 1);
#endif
			tcp_reass_replace(tp, q, m, th->th_seq, *tlenp, mlast, lenofoh, th->th_flags);
		} else {
			/*
			 * We just need to prepend the data
			 * to this. It does not overrun
			 * the end.
			 */
			/**
			 *                next seg---->+
			 *                             v
			 *  reassembly buffer |--|     |---|
			 *  new segment                   |----------|
			 */
			tcp_reass_prepend(tp, q, m, th, *tlenp, mlast, lenofoh);
			tp->t_segqmbuflen += lenofoh;
		}
	}
	/* Now does it go further than that? */
	tcp_reass_merge_forward(tp, q);
	*seq_start = q->tqe_start;
	*tlenp = q->tqe_len;
	goto present;

	/*
	 * When we reach here we can't combine it
	 * with any existing segment.
	 *
	 * Limit the number of segments that can be queued to reduce the
	 * potential for mbuf exhaustion. For best performance, we want to be
	 * able to queue a full window's worth of segments. The size of the
	 * socket receive buffer determines our advertised window and grows
	 * automatically when socket buffer autotuning is enabled. Use it as the
	 * basis for our queue limit.
	 *
	 * However, allow the user to specify a ceiling for the number of
	 * segments in each queue.
	 *
	 * Always let through the missing segment that caused this queue to form.
	 * NB: Access to the socket buffer is left intentionally unlocked as we
	 * can tolerate stale information here.
	 *
	 * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
	 * should work but causes packets to be dropped when they shouldn't.
	 * Investigate why and re-evaluate the below limit after the behaviour
	 * is understood.
	 */
new_entry:
	if (th->th_seq == tp->rcv_nxt && TCPS_HAVEESTABLISHED(tp->t_state)) {
		tp->rcv_nxt += *tlenp;
		flags = th->th_flags & TH_FIN;
		TCPSTAT_INC(tcps_rcvoopack);
		TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
		SOCKBUF_LOCK(&so->so_rcv);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			m_freem(m);
		} else {
			sbappendstream_locked(&so->so_rcv, m, 0);
		}
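		/*
		 * NB: so_rcv is intentionally left locked; the
		 * deferred TF_WAKESOR wakeup (tcp_handle_wakeup() ->
		 * sorwakeup_locked()) is expected to release it.
		 */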
		tp->t_flags |= TF_WAKESOR;
		return (flags);
	}
	if (tcp_new_limits) {
		if ((tp->t_segqlen > tcp_reass_queue_guard) &&
		    (*tlenp < MSIZE)) {
			/*
			 * This is really a lie: we are not full, but
			 * the queue length is above the guard
			 * threshold and the segment is smaller than
			 * one mbuf (MSIZE, 256 bytes by default).
			 * Since we only get here when it could not be
			 * merged into an existing entry, drop it.
			 */
			TCPSTAT_INC(tcps_rcvreassfull);
			*tlenp = 0;
			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: queue limit reached, "
				    "segment dropped\n", s, __func__);
				free(s, M_TCPLOG);
			}
			m_freem(m);
#ifdef TCP_REASS_LOGGING
			tcp_reass_log_dump(tp);
#endif
			return (0);
		}
	} else {
		if (tp->t_segqlen >= min((so->so_rcv.sb_hiwat / tp->t_maxseg) + 1,
					 tcp_reass_maxqueuelen)) {
			TCPSTAT_INC(tcps_rcvreassfull);
			*tlenp = 0;
			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
				log(LOG_DEBUG, "%s; %s: queue limit reached, "
				    "segment dropped\n", s, __func__);
				free(s, M_TCPLOG);
			}
			m_freem(m);
#ifdef TCP_REASS_LOGGING
			tcp_reass_log_dump(tp);
#endif
			return (0);
		}
	}
	/*
	 * Allocate a new queue entry. If we can't (or have hit the
	 * zone limit), just drop the packet.
	 */
	te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
	if (te == NULL) {
		TCPSTAT_INC(tcps_rcvmemdrop);
		m_freem(m);
		*tlenp = 0;
		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
				       NULL))) {
			log(LOG_DEBUG, "%s; %s: global zone limit "
			    "reached, segment dropped\n", s, __func__);
			free(s, M_TCPLOG);
		}
		return (0);
	}
	tp->t_segqlen++;
	tp->t_rcvoopack++;
	TCPSTAT_INC(tcps_rcvoopack);
	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
	/* Insert the new segment queue entry into place. */
	te->tqe_m = m;
	te->tqe_flags = th->th_flags;
	te->tqe_len = *tlenp;
	te->tqe_start = th->th_seq;
	te->tqe_last = mlast;
	te->tqe_mbuf_cnt = lenofoh;
	tp->t_segqmbuflen += te->tqe_mbuf_cnt;
	if (p == NULL) {
		TAILQ_INSERT_HEAD(&tp->t_segq, te, tqe_q);
	} else {
		TAILQ_INSERT_AFTER(&tp->t_segq, p, te, tqe_q);
	}
#ifdef TCP_REASS_LOGGING
	tcp_reass_log_new_in(tp, th->th_seq, *tlenp, m, TCP_R_LOG_NEW_ENTRY, te);
#endif
present:
	/*
	 * Present data to user, advancing rcv_nxt through
	 * completed sequence space.
	 */
	if (!TCPS_HAVEESTABLISHED(tp->t_state))
		return (0);
	q = TAILQ_FIRST(&tp->t_segq);
	KASSERT(q == NULL || SEQ_GEQ(q->tqe_start, tp->rcv_nxt),
		("Reassembly queue for %p has stale entry at head", tp));
	if (!q || q->tqe_start != tp->rcv_nxt) {
#ifdef TCP_REASS_LOGGING
		tcp_reass_log_dump(tp);
#endif
		return (0);
	}
	SOCKBUF_LOCK(&so->so_rcv);
	do {
		tp->rcv_nxt += q->tqe_len;
		flags = q->tqe_flags & TH_FIN;
		nq = TAILQ_NEXT(q, tqe_q);
		TAILQ_REMOVE(&tp->t_segq, q, tqe_q);
		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
			m_freem(q->tqe_m);
		} else {
#ifdef TCP_REASS_LOGGING
			tcp_reass_log_new_in(tp, q->tqe_start, q->tqe_len, q->tqe_m, TCP_R_LOG_READ, q);
			if (th != NULL) {
				tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 1);
			} else {
				tcp_log_reassm(tp, q, NULL, 0, 0, TCP_R_LOG_READ, 1);
			}
#endif
			sbappendstream_locked(&so->so_rcv, q->tqe_m, 0);
		}
#ifdef TCP_REASS_LOGGING
		if (th != NULL) {
			tcp_log_reassm(tp, q, NULL, th->th_seq, *tlenp, TCP_R_LOG_READ, 2);
		} else {
			tcp_log_reassm(tp, q, NULL, 0, 0, TCP_R_LOG_READ, 2);
		}
#endif
		KASSERT(tp->t_segqmbuflen >= q->tqe_mbuf_cnt,
			("tp:%p seg queue goes negative", tp));
		tp->t_segqmbuflen -= q->tqe_mbuf_cnt;
		uma_zfree(tcp_reass_zone, q);
		tp->t_segqlen--;
		q = nq;
	} while (q && q->tqe_start == tp->rcv_nxt);
	if (TAILQ_EMPTY(&tp->t_segq) &&
	    (tp->t_segqmbuflen != 0)) {
#ifdef INVARIANTS
		panic("tp:%p segq:%p len:%d queue empty",
		      tp, &tp->t_segq, tp->t_segqmbuflen);
#else
#ifdef TCP_REASS_LOGGING
		if (th != NULL) {
			tcp_log_reassm(tp, NULL, NULL, th->th_seq, *tlenp, TCP_R_LOG_ZERO, 0);
		} else {
			tcp_log_reassm(tp, NULL, NULL, 0, 0, TCP_R_LOG_ZERO, 0);
		}
#endif
		tp->t_segqmbuflen = 0;
#endif
	}
#ifdef TCP_REASS_LOGGING
	tcp_reass_log_dump(tp);
#endif
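	/* As above: so_rcv remains locked for the deferred wakeup. */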
	tp->t_flags |= TF_WAKESOR;
	return (flags);
}