Deleted Added
full compact
tcp_reass.c (185088) tcp_reass.c (185571)
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
1/*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
30 */
31
32#include <sys/cdefs.h>
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_reass.c 185088 2008-11-19 09:39:34Z zec $");
33__FBSDID("$FreeBSD: head/sys/netinet/tcp_reass.c 185571 2008-12-02 21:37:28Z bz $");
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_tcpdebug.h"
38
39#include <sys/param.h>
40#include <sys/kernel.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/socket.h>
44#include <sys/socketvar.h>
45#include <sys/sysctl.h>
46#include <sys/syslog.h>
47#include <sys/systm.h>
48#include <sys/vimage.h>
49
50#include <vm/uma.h>
51
52#include <net/if.h>
53#include <net/route.h>
54
55#include <netinet/in.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_systm.h>
58#include <netinet/in_var.h>
59#include <netinet/ip.h>
60#include <netinet/ip_var.h>
61#include <netinet/ip_options.h>
62#include <netinet/ip6.h>
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6_var.h>
65#include <netinet6/nd6.h>
66#include <netinet/tcp.h>
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_seq.h>
69#include <netinet/tcp_timer.h>
70#include <netinet/tcp_var.h>
71#include <netinet6/tcp6_var.h>
72#include <netinet/tcpip.h>
73#ifdef TCPDEBUG
74#include <netinet/tcp_debug.h>
75#endif /* TCPDEBUG */
34
35#include "opt_inet.h"
36#include "opt_inet6.h"
37#include "opt_tcpdebug.h"
38
39#include <sys/param.h>
40#include <sys/kernel.h>
41#include <sys/malloc.h>
42#include <sys/mbuf.h>
43#include <sys/socket.h>
44#include <sys/socketvar.h>
45#include <sys/sysctl.h>
46#include <sys/syslog.h>
47#include <sys/systm.h>
48#include <sys/vimage.h>
49
50#include <vm/uma.h>
51
52#include <net/if.h>
53#include <net/route.h>
54
55#include <netinet/in.h>
56#include <netinet/in_pcb.h>
57#include <netinet/in_systm.h>
58#include <netinet/in_var.h>
59#include <netinet/ip.h>
60#include <netinet/ip_var.h>
61#include <netinet/ip_options.h>
62#include <netinet/ip6.h>
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6_var.h>
65#include <netinet6/nd6.h>
66#include <netinet/tcp.h>
67#include <netinet/tcp_fsm.h>
68#include <netinet/tcp_seq.h>
69#include <netinet/tcp_timer.h>
70#include <netinet/tcp_var.h>
71#include <netinet6/tcp6_var.h>
72#include <netinet/tcpip.h>
73#ifdef TCPDEBUG
74#include <netinet/tcp_debug.h>
75#endif /* TCPDEBUG */
76#include <netinet/vinet.h>
76
/*
 * Reassembly limits and counters.  They are defined here as plain globals
 * only when VIMAGE_GLOBALS is set; the rest of this file reaches them through
 * the V_-prefixed names.
 * NOTE(review): without VIMAGE_GLOBALS they presumably live in the vnet_inet
 * structure named in the SYSCTL_V_INT declarations below -- confirm.
 */
77#ifdef VIMAGE_GLOBALS
78static int tcp_reass_maxseg;
79int tcp_reass_qsize;
80static int tcp_reass_maxqlen;
81static int tcp_reass_overflows;
82#endif
83
/* Parent sysctl node: net.inet.tcp.reass */
84SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
85 "TCP Segment Reassembly Queue");
86
/* net.inet.tcp.reass.maxsegments: global cap, exported with CTLFLAG_RDTUN. */
87SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments,
88 CTLFLAG_RDTUN, tcp_reass_maxseg, 0,
89 "Global maximum number of TCP Segments in Reassembly Queue");
90
/* net.inet.tcp.reass.cursegments: read-only count of queued entries. */
91SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments,
92 CTLFLAG_RD, tcp_reass_qsize, 0,
93 "Global number of TCP Segments currently in Reassembly Queue");
94
/* net.inet.tcp.reass.maxqlen: writable per-connection queue length cap. */
95SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen,
96 CTLFLAG_RW, tcp_reass_maxqlen, 0,
97 "Maximum number of TCP Segments per individual Reassembly Queue");
98
/* net.inet.tcp.reass.overflows: read-only count of limit-induced drops. */
99SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows,
100 CTLFLAG_RD, tcp_reass_overflows, 0,
101 "Global number of TCP Segment Reassembly Queue Overflows");
102
103/* Initialize TCP reassembly queue */
104static void
105tcp_reass_zone_change(void *tag)
106{
107 INIT_VNET_INET(curvnet);
108
109 V_tcp_reass_maxseg = nmbclusters / 16;
110 uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
111}
112
113uma_zone_t tcp_reass_zone;
114
115void
116tcp_reass_init(void)
117{
118 INIT_VNET_INET(curvnet);
119
120 V_tcp_reass_maxseg = 0;
121 V_tcp_reass_qsize = 0;
122 V_tcp_reass_maxqlen = 48;
123 V_tcp_reass_overflows = 0;
124
125 V_tcp_reass_maxseg = nmbclusters / 16;
126 TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
127 &V_tcp_reass_maxseg);
128 tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
129 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
130 uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
131 EVENTHANDLER_REGISTER(nmbclusters_change,
132 tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
133}
134
/*
 * tcp_reass: insert an incoming segment into the connection's reassembly
 * queue (tp->t_segq) and present any data that is now contiguous with
 * tp->rcv_nxt to the socket receive buffer.
 *
 * tp    - TCP control block; the write lock on its inpcb is asserted.
 * th    - TCP header of the incoming segment, or NULL to skip queueing and
 *         only attempt to present already-queued data.
 * tlenp - in/out segment length: set to 0 when the segment is dropped
 *         because of the queue limits or an allocation failure, and reduced
 *         by the number of leading bytes already covered by the preceding
 *         queued segment.
 * m     - mbuf chain with the segment data; freed here when the segment is
 *         dropped or is a complete duplicate, otherwise stored in the queue.
 *
 * Returns 0 when no queued data was presented; otherwise the TH_FIN bit
 * taken from the flags of the last segment presented.
 */
135int
136tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
137{
138 INIT_VNET_INET(curvnet);
139 struct tseg_qent *q;
140 struct tseg_qent *p = NULL;
141 struct tseg_qent *nq;
142 struct tseg_qent *te = NULL;
143 struct socket *so = tp->t_inpcb->inp_socket;
144 int flags;
145
146 INP_WLOCK_ASSERT(tp->t_inpcb);
147
148 /*
149 * XXX: tcp_reass() is rather inefficient with its data structures
150 * and should be rewritten (see NetBSD for optimizations). While
151 * doing that it should move to its own file tcp_reass.c.
152 */
 /*
  * NOTE(review): the file move mentioned above appears to be done already,
  * since this file is tcp_reass.c; the rewrite suggestion still stands.
  */
153
154 /*
155 * Call with th==NULL after become established to
156 * force pre-ESTABLISHED data up to user socket.
157 */
158 if (th == NULL)
159 goto present;
160
161 /*
162 * Limit the number of segments in the reassembly queue to prevent
163 * holding on to too many segments (and thus running out of mbufs).
164 * Make sure to let the missing segment through which caused this
165 * queue. Always keep one global queue entry spare to be able to
166 * process the missing segment.
167 */
168 if (th->th_seq != tp->rcv_nxt &&
169 (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg ||
170 tp->t_segqlen >= V_tcp_reass_maxqlen)) {
171 V_tcp_reass_overflows++;
172 V_tcpstat.tcps_rcvmemdrop++;
173 m_freem(m);
174 *tlenp = 0;
175 return (0);
176 }
177
178 /*
179 * Allocate a new queue entry. If we can't, or hit the zone limit
180 * just drop the pkt.
181 */
182 te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
183 if (te == NULL) {
184 V_tcpstat.tcps_rcvmemdrop++;
185 m_freem(m);
186 *tlenp = 0;
187 return (0);
188 }
 /*
  * Count the new entry right away; the complete-duplicate path below
  * undoes both increments when it frees te.
  */
189 tp->t_segqlen++;
190 V_tcp_reass_qsize++;
191
192 /*
193 * Find a segment which begins after this one does.
194 */
 /* On exit p is the last entry not after us (or NULL), q the first after. */
195 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
196 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
197 break;
198 p = q;
199 }
200
201 /*
202 * If there is a preceding segment, it may provide some of
203 * our data already. If so, drop the data from the incoming
204 * segment. If it provides all of our data, drop us.
205 */
206 if (p != NULL) {
207 int i;
208 /* conversion to int (in i) handles seq wraparound */
209 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
210 if (i > 0) {
211 if (i >= *tlenp) {
 /* NOTE(review): *tlenp is left unchanged on this path. */
212 V_tcpstat.tcps_rcvduppack++;
213 V_tcpstat.tcps_rcvdupbyte += *tlenp;
214 m_freem(m);
215 uma_zfree(tcp_reass_zone, te);
216 tp->t_segqlen--;
217 V_tcp_reass_qsize--;
218 /*
219 * Try to present any queued data
220 * at the left window edge to the user.
221 * This is needed after the 3-WHS
222 * completes.
223 */
224 goto present; /* ??? */
225 }
 /* Partial overlap: strip the i leading bytes p already holds. */
226 m_adj(m, i);
227 *tlenp -= i;
228 th->th_seq += i;
229 }
230 }
231 V_tcpstat.tcps_rcvoopack++;
232 V_tcpstat.tcps_rcvoobyte += *tlenp;
233
234 /*
235 * While we overlap succeeding segments trim them or,
236 * if they are completely covered, dequeue them.
237 */
238 while (q) {
 /* i is the number of bytes by which our end runs past q's start. */
239 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
240 if (i <= 0)
241 break;
 /* Partial overlap: trim the front of q and stop. */
242 if (i < q->tqe_len) {
243 q->tqe_th->th_seq += i;
244 q->tqe_len -= i;
245 m_adj(q->tqe_m, i);
246 break;
247 }
248
 /* q is entirely covered by the new segment: unlink and free it. */
249 nq = LIST_NEXT(q, tqe_q);
250 LIST_REMOVE(q, tqe_q);
251 m_freem(q->tqe_m);
252 uma_zfree(tcp_reass_zone, q);
253 tp->t_segqlen--;
254 V_tcp_reass_qsize--;
255 q = nq;
256 }
257
258 /* Insert the new segment queue entry into place. */
259 te->tqe_m = m;
260 te->tqe_th = th;
261 te->tqe_len = *tlenp;
262
263 if (p == NULL) {
264 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
265 } else {
266 LIST_INSERT_AFTER(p, te, tqe_q);
267 }
268
269present:
270 /*
271 * Present data to user, advancing rcv_nxt through
272 * completed sequence space.
273 */
274 if (!TCPS_HAVEESTABLISHED(tp->t_state))
275 return (0);
 /* Nothing to present unless the head of the queue starts at rcv_nxt. */
276 q = LIST_FIRST(&tp->t_segq);
277 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
278 return (0);
279 SOCKBUF_LOCK(&so->so_rcv);
 /* The body runs at least once, so flags is always assigned. */
280 do {
281 tp->rcv_nxt += q->tqe_len;
282 flags = q->tqe_th->th_flags & TH_FIN;
283 nq = LIST_NEXT(q, tqe_q);
284 LIST_REMOVE(q, tqe_q);
 /* If SBS_CANTRCVMORE is set, free the data instead of appending it. */
285 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
286 m_freem(q->tqe_m);
287 else
288 sbappendstream_locked(&so->so_rcv, q->tqe_m);
289 uma_zfree(tcp_reass_zone, q);
290 tp->t_segqlen--;
291 V_tcp_reass_qsize--;
292 q = nq;
293 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
294 ND6_HINT(tp);
 /* NOTE(review): no SOCKBUF_UNLOCK here; presumably sorwakeup_locked() drops the lock -- confirm. */
295 sorwakeup_locked(so);
296 return (flags);
297}
77
/*
 * Reassembly limits and counters.  They are defined here as plain globals
 * only when VIMAGE_GLOBALS is set; the rest of this file reaches them through
 * the V_-prefixed names.
 * NOTE(review): without VIMAGE_GLOBALS they presumably live in the vnet_inet
 * structure named in the SYSCTL_V_INT declarations below -- confirm.
 */
78#ifdef VIMAGE_GLOBALS
79static int tcp_reass_maxseg;
80int tcp_reass_qsize;
81static int tcp_reass_maxqlen;
82static int tcp_reass_overflows;
83#endif
84
/* Parent sysctl node: net.inet.tcp.reass */
85SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
86 "TCP Segment Reassembly Queue");
87
/* net.inet.tcp.reass.maxsegments: global cap, exported with CTLFLAG_RDTUN. */
88SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxsegments,
89 CTLFLAG_RDTUN, tcp_reass_maxseg, 0,
90 "Global maximum number of TCP Segments in Reassembly Queue");
91
/* net.inet.tcp.reass.cursegments: read-only count of queued entries. */
92SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, cursegments,
93 CTLFLAG_RD, tcp_reass_qsize, 0,
94 "Global number of TCP Segments currently in Reassembly Queue");
95
/* net.inet.tcp.reass.maxqlen: writable per-connection queue length cap. */
96SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, maxqlen,
97 CTLFLAG_RW, tcp_reass_maxqlen, 0,
98 "Maximum number of TCP Segments per individual Reassembly Queue");
99
/* net.inet.tcp.reass.overflows: read-only count of limit-induced drops. */
100SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_tcp_reass, OID_AUTO, overflows,
101 CTLFLAG_RD, tcp_reass_overflows, 0,
102 "Global number of TCP Segment Reassembly Queue Overflows");
103
104/* Initialize TCP reassembly queue */
105static void
106tcp_reass_zone_change(void *tag)
107{
108 INIT_VNET_INET(curvnet);
109
110 V_tcp_reass_maxseg = nmbclusters / 16;
111 uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
112}
113
114uma_zone_t tcp_reass_zone;
115
116void
117tcp_reass_init(void)
118{
119 INIT_VNET_INET(curvnet);
120
121 V_tcp_reass_maxseg = 0;
122 V_tcp_reass_qsize = 0;
123 V_tcp_reass_maxqlen = 48;
124 V_tcp_reass_overflows = 0;
125
126 V_tcp_reass_maxseg = nmbclusters / 16;
127 TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
128 &V_tcp_reass_maxseg);
129 tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
130 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
131 uma_zone_set_max(tcp_reass_zone, V_tcp_reass_maxseg);
132 EVENTHANDLER_REGISTER(nmbclusters_change,
133 tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
134}
135
/*
 * tcp_reass: insert an incoming segment into the connection's reassembly
 * queue (tp->t_segq) and present any data that is now contiguous with
 * tp->rcv_nxt to the socket receive buffer.
 *
 * tp    - TCP control block; the write lock on its inpcb is asserted.
 * th    - TCP header of the incoming segment, or NULL to skip queueing and
 *         only attempt to present already-queued data.
 * tlenp - in/out segment length: set to 0 when the segment is dropped
 *         because of the queue limits or an allocation failure, and reduced
 *         by the number of leading bytes already covered by the preceding
 *         queued segment.
 * m     - mbuf chain with the segment data; freed here when the segment is
 *         dropped or is a complete duplicate, otherwise stored in the queue.
 *
 * Returns 0 when no queued data was presented; otherwise the TH_FIN bit
 * taken from the flags of the last segment presented.
 */
136int
137tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
138{
139 INIT_VNET_INET(curvnet);
140 struct tseg_qent *q;
141 struct tseg_qent *p = NULL;
142 struct tseg_qent *nq;
143 struct tseg_qent *te = NULL;
144 struct socket *so = tp->t_inpcb->inp_socket;
145 int flags;
146
147 INP_WLOCK_ASSERT(tp->t_inpcb);
148
149 /*
150 * XXX: tcp_reass() is rather inefficient with its data structures
151 * and should be rewritten (see NetBSD for optimizations). While
152 * doing that it should move to its own file tcp_reass.c.
153 */
 /*
  * NOTE(review): the file move mentioned above appears to be done already,
  * since this file is tcp_reass.c; the rewrite suggestion still stands.
  */
154
155 /*
156 * Call with th==NULL after become established to
157 * force pre-ESTABLISHED data up to user socket.
158 */
159 if (th == NULL)
160 goto present;
161
162 /*
163 * Limit the number of segments in the reassembly queue to prevent
164 * holding on to too many segments (and thus running out of mbufs).
165 * Make sure to let the missing segment through which caused this
166 * queue. Always keep one global queue entry spare to be able to
167 * process the missing segment.
168 */
169 if (th->th_seq != tp->rcv_nxt &&
170 (V_tcp_reass_qsize + 1 >= V_tcp_reass_maxseg ||
171 tp->t_segqlen >= V_tcp_reass_maxqlen)) {
172 V_tcp_reass_overflows++;
173 V_tcpstat.tcps_rcvmemdrop++;
174 m_freem(m);
175 *tlenp = 0;
176 return (0);
177 }
178
179 /*
180 * Allocate a new queue entry. If we can't, or hit the zone limit
181 * just drop the pkt.
182 */
183 te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
184 if (te == NULL) {
185 V_tcpstat.tcps_rcvmemdrop++;
186 m_freem(m);
187 *tlenp = 0;
188 return (0);
189 }
 /*
  * Count the new entry right away; the complete-duplicate path below
  * undoes both increments when it frees te.
  */
190 tp->t_segqlen++;
191 V_tcp_reass_qsize++;
192
193 /*
194 * Find a segment which begins after this one does.
195 */
 /* On exit p is the last entry not after us (or NULL), q the first after. */
196 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
197 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
198 break;
199 p = q;
200 }
201
202 /*
203 * If there is a preceding segment, it may provide some of
204 * our data already. If so, drop the data from the incoming
205 * segment. If it provides all of our data, drop us.
206 */
207 if (p != NULL) {
208 int i;
209 /* conversion to int (in i) handles seq wraparound */
210 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
211 if (i > 0) {
212 if (i >= *tlenp) {
 /* NOTE(review): *tlenp is left unchanged on this path. */
213 V_tcpstat.tcps_rcvduppack++;
214 V_tcpstat.tcps_rcvdupbyte += *tlenp;
215 m_freem(m);
216 uma_zfree(tcp_reass_zone, te);
217 tp->t_segqlen--;
218 V_tcp_reass_qsize--;
219 /*
220 * Try to present any queued data
221 * at the left window edge to the user.
222 * This is needed after the 3-WHS
223 * completes.
224 */
225 goto present; /* ??? */
226 }
 /* Partial overlap: strip the i leading bytes p already holds. */
227 m_adj(m, i);
228 *tlenp -= i;
229 th->th_seq += i;
230 }
231 }
232 V_tcpstat.tcps_rcvoopack++;
233 V_tcpstat.tcps_rcvoobyte += *tlenp;
234
235 /*
236 * While we overlap succeeding segments trim them or,
237 * if they are completely covered, dequeue them.
238 */
239 while (q) {
 /* i is the number of bytes by which our end runs past q's start. */
240 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
241 if (i <= 0)
242 break;
 /* Partial overlap: trim the front of q and stop. */
243 if (i < q->tqe_len) {
244 q->tqe_th->th_seq += i;
245 q->tqe_len -= i;
246 m_adj(q->tqe_m, i);
247 break;
248 }
249
 /* q is entirely covered by the new segment: unlink and free it. */
250 nq = LIST_NEXT(q, tqe_q);
251 LIST_REMOVE(q, tqe_q);
252 m_freem(q->tqe_m);
253 uma_zfree(tcp_reass_zone, q);
254 tp->t_segqlen--;
255 V_tcp_reass_qsize--;
256 q = nq;
257 }
258
259 /* Insert the new segment queue entry into place. */
260 te->tqe_m = m;
261 te->tqe_th = th;
262 te->tqe_len = *tlenp;
263
264 if (p == NULL) {
265 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
266 } else {
267 LIST_INSERT_AFTER(p, te, tqe_q);
268 }
269
270present:
271 /*
272 * Present data to user, advancing rcv_nxt through
273 * completed sequence space.
274 */
275 if (!TCPS_HAVEESTABLISHED(tp->t_state))
276 return (0);
 /* Nothing to present unless the head of the queue starts at rcv_nxt. */
277 q = LIST_FIRST(&tp->t_segq);
278 if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
279 return (0);
280 SOCKBUF_LOCK(&so->so_rcv);
 /* The body runs at least once, so flags is always assigned. */
281 do {
282 tp->rcv_nxt += q->tqe_len;
283 flags = q->tqe_th->th_flags & TH_FIN;
284 nq = LIST_NEXT(q, tqe_q);
285 LIST_REMOVE(q, tqe_q);
 /* If SBS_CANTRCVMORE is set, free the data instead of appending it. */
286 if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
287 m_freem(q->tqe_m);
288 else
289 sbappendstream_locked(&so->so_rcv, q->tqe_m);
290 uma_zfree(tcp_reass_zone, q);
291 tp->t_segqlen--;
292 V_tcp_reass_qsize--;
293 q = nq;
294 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
295 ND6_HINT(tp);
 /* NOTE(review): no SOCKBUF_UNLOCK here; presumably sorwakeup_locked() drops the lock -- confirm. */
296 sorwakeup_locked(so);
297 return (flags);
298}