Deleted Added
full compact
tcp_input.c (122875) tcp_input.c (122922)
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright

--- 17 unchanged lines hidden (view full) ---

26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
1/*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright

--- 17 unchanged lines hidden (view full) ---

26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
34 * $FreeBSD: head/sys/netinet/tcp_input.c 122875 2003-11-18 00:39:07Z rwatson $
34 * $FreeBSD: head/sys/netinet/tcp_input.c 122922 2003-11-20 20:07:39Z andre $
35 */
36
37#include "opt_ipfw.h" /* for ipfw_fwd */
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_mac.h"
41#include "opt_tcpdebug.h"
42#include "opt_tcp_input.h"

--- 106 unchanged lines hidden (view full) ---

149static int tcp_timewait(struct tcptw *, struct tcpopt *,
150 struct tcphdr *, struct mbuf *, int);
151
152/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
153#ifdef INET6
154#define ND6_HINT(tp) \
155do { \
156 if ((tp) && (tp)->t_inpcb && \
35 */
36
37#include "opt_ipfw.h" /* for ipfw_fwd */
38#include "opt_inet6.h"
39#include "opt_ipsec.h"
40#include "opt_mac.h"
41#include "opt_tcpdebug.h"
42#include "opt_tcp_input.h"

--- 106 unchanged lines hidden (view full) ---

149static int tcp_timewait(struct tcptw *, struct tcpopt *,
150 struct tcphdr *, struct mbuf *, int);
151
152/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
153#ifdef INET6
154#define ND6_HINT(tp) \
155do { \
156 if ((tp) && (tp)->t_inpcb && \
157 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
158 (tp)->t_inpcb->in6p_route.ro_rt) \
159 nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
157 ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
158 nd6_nud_hint(NULL, NULL, 0); \
160} while (0)
161#else
162#define ND6_HINT(tp)
163#endif
164
165/*
166 * Indicate whether this ack should be delayed. We can delay the ack if
167 * - there is no delayed ack timer in progress and

--- 185 unchanged lines hidden (view full) ---

353 int len, tlen, off;
354 int drop_hdrlen;
355 register struct tcpcb *tp = 0;
356 register int thflags;
357 struct socket *so = 0;
358 int todrop, acked, ourfinisacked, needoutput = 0;
359 u_long tiwin;
360 struct tcpopt to; /* options in this segment */
159} while (0)
160#else
161#define ND6_HINT(tp)
162#endif
163
164/*
165 * Indicate whether this ack should be delayed. We can delay the ack if
166 * - there is no delayed ack timer in progress and

--- 185 unchanged lines hidden (view full) ---

352 int len, tlen, off;
353 int drop_hdrlen;
354 register struct tcpcb *tp = 0;
355 register int thflags;
356 struct socket *so = 0;
357 int todrop, acked, ourfinisacked, needoutput = 0;
358 u_long tiwin;
359 struct tcpopt to; /* options in this segment */
361 struct rmxp_tao *taop; /* pointer to our TAO cache entry */
362 struct rmxp_tao tao_noncached; /* in case there's no cached entry */
360 struct rmxp_tao tao; /* our TAO cache entry */
363 int headlocked = 0;
364 struct sockaddr_in *next_hop = NULL;
365 int rstreason; /* For badport_bandlim accounting purposes */
366
367 struct ip6_hdr *ip6 = NULL;
368#ifdef INET6
369 int isipv6;
370#else

--- 13 unchanged lines hidden (view full) ---

384 /* Grab info from MT_TAG mbufs prepended to the chain. */
385 for (;m && m->m_type == MT_TAG; m = m->m_next) {
386 if (m->_m_tag_id == PACKET_TAG_IPFORWARD)
387 next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
388 }
389#ifdef INET6
390 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
391#endif
361 int headlocked = 0;
362 struct sockaddr_in *next_hop = NULL;
363 int rstreason; /* For badport_bandlim accounting purposes */
364
365 struct ip6_hdr *ip6 = NULL;
366#ifdef INET6
367 int isipv6;
368#else

--- 13 unchanged lines hidden (view full) ---

382 /* Grab info from MT_TAG mbufs prepended to the chain. */
383 for (;m && m->m_type == MT_TAG; m = m->m_next) {
384 if (m->_m_tag_id == PACKET_TAG_IPFORWARD)
385 next_hop = (struct sockaddr_in *)m->m_hdr.mh_data;
386 }
387#ifdef INET6
388 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
389#endif
390 bzero(&tao, sizeof(tao));
392 bzero((char *)&to, sizeof(to));
393
394 tcpstat.tcps_rcvtotal++;
395
396 if (isipv6) {
397 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
398 ip6 = mtod(m, struct ip6_hdr *);
399 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;

--- 302 unchanged lines hidden (view full) ---

702 struct in_conninfo inc;
703
704#ifdef INET6
705 inc.inc_isipv6 = isipv6;
706#endif
707 if (isipv6) {
708 inc.inc6_faddr = ip6->ip6_src;
709 inc.inc6_laddr = ip6->ip6_dst;
391 bzero((char *)&to, sizeof(to));
392
393 tcpstat.tcps_rcvtotal++;
394
395 if (isipv6) {
396 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
397 ip6 = mtod(m, struct ip6_hdr *);
398 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;

--- 302 unchanged lines hidden (view full) ---

701 struct in_conninfo inc;
702
703#ifdef INET6
704 inc.inc_isipv6 = isipv6;
705#endif
706 if (isipv6) {
707 inc.inc6_faddr = ip6->ip6_src;
708 inc.inc6_laddr = ip6->ip6_dst;
710 inc.inc6_route.ro_rt = NULL; /* XXX */
711 } else {
712 inc.inc_faddr = ip->ip_src;
713 inc.inc_laddr = ip->ip_dst;
709 } else {
710 inc.inc_faddr = ip->ip_src;
711 inc.inc_laddr = ip->ip_dst;
714 inc.inc_route.ro_rt = NULL; /* XXX */
715 }
716 inc.inc_fport = th->th_sport;
717 inc.inc_lport = th->th_dport;
718
719 /*
720 * If the state is LISTEN then ignore segment if it contains
721 * a RST. If the segment contains an ACK then it is bad and
722 * send a RST. If it does not contain a SYN then it is not

--- 188 unchanged lines hidden (view full) ---

911 tcpstat.tcps_connects++;
912 soisconnected(so);
913 goto trimthenstep6;
914 }
915 goto drop;
916 }
917after_listen:
918
712 }
713 inc.inc_fport = th->th_sport;
714 inc.inc_lport = th->th_dport;
715
716 /*
717 * If the state is LISTEN then ignore segment if it contains
718 * a RST. If the segment contains an ACK then it is bad and
719 * send a RST. If it does not contain a SYN then it is not

--- 188 unchanged lines hidden (view full) ---

908 tcpstat.tcps_connects++;
909 soisconnected(so);
910 goto trimthenstep6;
911 }
912 goto drop;
913 }
914after_listen:
915
919/* XXX temp debugging */
916 /* XXX temp debugging */
920 /* should not happen - syncache should pick up these connections */
921 if (tp->t_state == TCPS_LISTEN)
922 panic("tcp_input: TCPS_LISTEN");
923
924 /*
925 * Segment received on connection.
926 * Reset idle time and keep-alive timer.
927 */
928 tp->t_rcvtime = ticks;
929 if (TCPS_HAVEESTABLISHED(tp->t_state))
930 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
931
932 /*
917 /* should not happen - syncache should pick up these connections */
918 if (tp->t_state == TCPS_LISTEN)
919 panic("tcp_input: TCPS_LISTEN");
920
921 /*
922 * Segment received on connection.
923 * Reset idle time and keep-alive timer.
924 */
925 tp->t_rcvtime = ticks;
926 if (TCPS_HAVEESTABLISHED(tp->t_state))
927 callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
928
929 /*
933 * Process options.
934 * XXX this is tradtitional behavior, may need to be cleaned up.
930 * Process options only when we get SYN/ACK back. The SYN case
931 * for incoming connections is handled in tcp_syncache.
932 * XXX this is traditional behavior, may need to be cleaned up.
935 */
936 tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
937 if (thflags & TH_SYN) {
938 if (to.to_flags & TOF_SCALE) {
939 tp->t_flags |= TF_RCVD_SCALE;
940 tp->requested_s_scale = to.to_requested_s_scale;
941 }
942 if (to.to_flags & TOF_TS) {

--- 231 unchanged lines hidden (view full) ---

1174 * Otherwise this is an acceptable SYN segment
1175 * initialize tp->rcv_nxt and tp->irs
1176 * if seg contains ack then advance tp->snd_una
1177 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1178 * arrange for segment to be acked (eventually)
1179 * continue processing rest of data/controls, beginning with URG
1180 */
1181 case TCPS_SYN_SENT:
933 */
934 tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
935 if (thflags & TH_SYN) {
936 if (to.to_flags & TOF_SCALE) {
937 tp->t_flags |= TF_RCVD_SCALE;
938 tp->requested_s_scale = to.to_requested_s_scale;
939 }
940 if (to.to_flags & TOF_TS) {

--- 231 unchanged lines hidden (view full) ---

1172 * Otherwise this is an acceptable SYN segment
1173 * initialize tp->rcv_nxt and tp->irs
1174 * if seg contains ack then advance tp->snd_una
1175 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
1176 * arrange for segment to be acked (eventually)
1177 * continue processing rest of data/controls, beginning with URG
1178 */
1179 case TCPS_SYN_SENT:
1182 if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
1183 taop = &tao_noncached;
1184 bzero(taop, sizeof(*taop));
1185 }
1180 if (tcp_do_rfc1644)
1181 tcp_hc_gettao(&inp->inp_inc, &tao);
1186
1187 if ((thflags & TH_ACK) &&
1188 (SEQ_LEQ(th->th_ack, tp->iss) ||
1189 SEQ_GT(th->th_ack, tp->snd_max))) {
1190 /*
1191 * If we have a cached CCsent for the remote host,
1192 * hence we haven't just crashed and restarted,
1193 * do not send a RST. This may be a retransmission
1194 * from the other side after our earlier ACK was lost.
1195 * Our new SYN, when it arrives, will serve as the
1196 * needed ACK.
1197 */
1182
1183 if ((thflags & TH_ACK) &&
1184 (SEQ_LEQ(th->th_ack, tp->iss) ||
1185 SEQ_GT(th->th_ack, tp->snd_max))) {
1186 /*
1187 * If we have a cached CCsent for the remote host,
1188 * hence we haven't just crashed and restarted,
1189 * do not send a RST. This may be a retransmission
1190 * from the other side after our earlier ACK was lost.
1191 * Our new SYN, when it arrives, will serve as the
1192 * needed ACK.
1193 */
1198 if (taop->tao_ccsent != 0)
1194 if (tao.tao_ccsent != 0)
1199 goto drop;
1200 else {
1201 rstreason = BANDLIM_UNLIMITED;
1202 goto dropwithreset;
1203 }
1204 }
1205 if (thflags & TH_RST) {
1206 if (thflags & TH_ACK)

--- 13 unchanged lines hidden (view full) ---

1220 * option, check it to make sure this segment really
1221 * matches our SYN. If not, just drop it as old
1222 * duplicate, but send an RST if we're still playing
1223 * by the old rules. If no CC.ECHO option, make sure
1224 * we don't get fooled into using T/TCP.
1225 */
1226 if (to.to_flags & TOF_CCECHO) {
1227 if (tp->cc_send != to.to_ccecho) {
1195 goto drop;
1196 else {
1197 rstreason = BANDLIM_UNLIMITED;
1198 goto dropwithreset;
1199 }
1200 }
1201 if (thflags & TH_RST) {
1202 if (thflags & TH_ACK)

--- 13 unchanged lines hidden (view full) ---

1216 * option, check it to make sure this segment really
1217 * matches our SYN. If not, just drop it as old
1218 * duplicate, but send an RST if we're still playing
1219 * by the old rules. If no CC.ECHO option, make sure
1220 * we don't get fooled into using T/TCP.
1221 */
1222 if (to.to_flags & TOF_CCECHO) {
1223 if (tp->cc_send != to.to_ccecho) {
1228 if (taop->tao_ccsent != 0)
1224 if (tao.tao_ccsent != 0)
1229 goto drop;
1230 else {
1231 rstreason = BANDLIM_UNLIMITED;
1232 goto dropwithreset;
1233 }
1234 }
1235 } else
1236 tp->t_flags &= ~TF_RCVD_CC;

--- 4 unchanged lines hidden (view full) ---

1241#endif
1242 /* Do window scaling on this connection? */
1243 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1244 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1245 tp->snd_scale = tp->requested_s_scale;
1246 tp->rcv_scale = tp->request_r_scale;
1247 }
1248 /* Segment is acceptable, update cache if undefined. */
1225 goto drop;
1226 else {
1227 rstreason = BANDLIM_UNLIMITED;
1228 goto dropwithreset;
1229 }
1230 }
1231 } else
1232 tp->t_flags &= ~TF_RCVD_CC;

--- 4 unchanged lines hidden (view full) ---

1237#endif
1238 /* Do window scaling on this connection? */
1239 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1240 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1241 tp->snd_scale = tp->requested_s_scale;
1242 tp->rcv_scale = tp->request_r_scale;
1243 }
1244 /* Segment is acceptable, update cache if undefined. */
1249 if (taop->tao_ccsent == 0)
1250 taop->tao_ccsent = to.to_ccecho;
1245 if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
1246 tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
1251
1252 tp->rcv_adv += tp->rcv_wnd;
1253 tp->snd_una++; /* SYN is acked */
1254 /*
1255 * If there's data, delay ACK; if there's also a FIN
1256 * ACKNOW will be turned on later.
1257 */
1258 if (DELAY_ACK(tp) && tlen != 0)

--- 26 unchanged lines hidden (view full) ---

1285 * Otherwise, do 3-way handshake:
1286 * SYN-SENT -> SYN-RECEIVED
1287 * SYN-SENT* -> SYN-RECEIVED*
1288 * If there was no CC option, clear cached CC value.
1289 */
1290 tp->t_flags |= TF_ACKNOW;
1291 callout_stop(tp->tt_rexmt);
1292 if (to.to_flags & TOF_CC) {
1247
1248 tp->rcv_adv += tp->rcv_wnd;
1249 tp->snd_una++; /* SYN is acked */
1250 /*
1251 * If there's data, delay ACK; if there's also a FIN
1252 * ACKNOW will be turned on later.
1253 */
1254 if (DELAY_ACK(tp) && tlen != 0)

--- 26 unchanged lines hidden (view full) ---

1281 * Otherwise, do 3-way handshake:
1282 * SYN-SENT -> SYN-RECEIVED
1283 * SYN-SENT* -> SYN-RECEIVED*
1284 * If there was no CC option, clear cached CC value.
1285 */
1286 tp->t_flags |= TF_ACKNOW;
1287 callout_stop(tp->tt_rexmt);
1288 if (to.to_flags & TOF_CC) {
1293 if (taop->tao_cc != 0 &&
1294 CC_GT(to.to_cc, taop->tao_cc)) {
1289 if (tao.tao_cc != 0 &&
1290 CC_GT(to.to_cc, tao.tao_cc)) {
1295 /*
1296 * update cache and make transition:
1297 * SYN-SENT -> ESTABLISHED*
1298 * SYN-SENT* -> FIN-WAIT-1*
1299 */
1291 /*
1292 * update cache and make transition:
1293 * SYN-SENT -> ESTABLISHED*
1294 * SYN-SENT* -> FIN-WAIT-1*
1295 */
1300 taop->tao_cc = to.to_cc;
1296 tao.tao_cc = to.to_cc;
1297 tcp_hc_updatetao(&inp->inp_inc,
1298 TCP_HC_TAO_CC, to.to_cc, 0);
1301 tp->t_starttime = ticks;
1302 if (tp->t_flags & TF_NEEDFIN) {
1303 tp->t_state = TCPS_FIN_WAIT_1;
1304 tp->t_flags &= ~TF_NEEDFIN;
1305 } else {
1306 tp->t_state = TCPS_ESTABLISHED;
1307 callout_reset(tp->tt_keep,
1308 tcp_keepidle,
1309 tcp_timer_keep,
1310 tp);
1311 }
1312 tp->t_flags |= TF_NEEDSYN;
1313 } else
1314 tp->t_state = TCPS_SYN_RECEIVED;
1315 } else {
1299 tp->t_starttime = ticks;
1300 if (tp->t_flags & TF_NEEDFIN) {
1301 tp->t_state = TCPS_FIN_WAIT_1;
1302 tp->t_flags &= ~TF_NEEDFIN;
1303 } else {
1304 tp->t_state = TCPS_ESTABLISHED;
1305 callout_reset(tp->tt_keep,
1306 tcp_keepidle,
1307 tcp_timer_keep,
1308 tp);
1309 }
1310 tp->t_flags |= TF_NEEDSYN;
1311 } else
1312 tp->t_state = TCPS_SYN_RECEIVED;
1313 } else {
1316 /* CC.NEW or no option => invalidate cache */
1317 taop->tao_cc = 0;
1314 if (tcp_do_rfc1644) {
1315 /* CC.NEW or no option => invalidate cache */
1316 tao.tao_cc = 0;
1317 tcp_hc_updatetao(&inp->inp_inc,
1318 TCP_HC_TAO_CC, to.to_cc, 0);
1319 }
1318 tp->t_state = TCPS_SYN_RECEIVED;
1319 }
1320 }
1321
1322trimthenstep6:
1323 /*
1324 * Advance th->th_seq to correspond to first data byte.
1325 * If data, trim to stay within window,

--- 351 unchanged lines hidden (view full) ---

1677 /* Do window scaling? */
1678 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1679 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1680 tp->snd_scale = tp->requested_s_scale;
1681 tp->rcv_scale = tp->request_r_scale;
1682 }
1683 /*
1684 * Upon successful completion of 3-way handshake,
1320 tp->t_state = TCPS_SYN_RECEIVED;
1321 }
1322 }
1323
1324trimthenstep6:
1325 /*
1326 * Advance th->th_seq to correspond to first data byte.
1327 * If data, trim to stay within window,

--- 351 unchanged lines hidden (view full) ---

1679 /* Do window scaling? */
1680 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
1681 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
1682 tp->snd_scale = tp->requested_s_scale;
1683 tp->rcv_scale = tp->request_r_scale;
1684 }
1685 /*
1686 * Upon successful completion of 3-way handshake,
1685 * update cache.CC if it was undefined, pass any queued
1686 * data to the user, and advance state appropriately.
1687 * update cache.CC, pass any queued data to the user,
1688 * and advance state appropriately.
1687 */
1689 */
1688 if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
1689 taop->tao_cc == 0)
1690 taop->tao_cc = tp->cc_recv;
1691
1690 if (tcp_do_rfc1644) {
1691 tao.tao_cc = tp->cc_recv;
1692 tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
1693 tp->cc_recv, 0);
1694 }
1692 /*
1693 * Make transitions:
1694 * SYN-RECEIVED -> ESTABLISHED
1695 * SYN-RECEIVED* -> FIN-WAIT-1
1696 */
1697 tp->t_starttime = ticks;
1698 if (tp->t_flags & TF_NEEDFIN) {
1699 tp->t_state = TCPS_FIN_WAIT_1;

--- 906 unchanged lines hidden (view full) ---

2606 * parameters from pre-set or cached values in the routing entry.
2607 *
2608 * Also take into account the space needed for options that we
2609 * send regularly. Make maxseg shorter by that amount to assure
2610 * that we can send maxseg amount of data even when the options
2611 * are present. Store the upper limit of the length of options plus
2612 * data in maxopd.
2613 *
1695 /*
1696 * Make transitions:
1697 * SYN-RECEIVED -> ESTABLISHED
1698 * SYN-RECEIVED* -> FIN-WAIT-1
1699 */
1700 tp->t_starttime = ticks;
1701 if (tp->t_flags & TF_NEEDFIN) {
1702 tp->t_state = TCPS_FIN_WAIT_1;

--- 906 unchanged lines hidden (view full) ---

2609 * parameters from pre-set or cached values in the routing entry.
2610 *
2611 * Also take into account the space needed for options that we
2612 * send regularly. Make maxseg shorter by that amount to assure
2613 * that we can send maxseg amount of data even when the options
2614 * are present. Store the upper limit of the length of options plus
2615 * data in maxopd.
2616 *
2614 * NOTE that this routine is only called when we process an incoming
2615 * segment, for outgoing segments only tcp_mssopt is called.
2616 *
2617 * In case of T/TCP, we call this routine during implicit connection
2618 * setup as well (offer = -1), to initialize maxseg from the cached
2619 * MSS of our peer.
2617 *
2618 * In case of T/TCP, we call this routine during implicit connection
2619 * setup as well (offer = -1), to initialize maxseg from the cached
2620 * MSS of our peer.
2621 *
2622 * NOTE that this routine is only called when we process an incoming
2623 * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
2620 */
2621void
2622tcp_mss(tp, offer)
2623 struct tcpcb *tp;
2624 int offer;
2625{
2624 */
2625void
2626tcp_mss(tp, offer)
2627 struct tcpcb *tp;
2628 int offer;
2629{
2626 register struct rtentry *rt;
2627 struct ifnet *ifp;
2628 register int rtt, mss;
2630 int rtt, mss;
2629 u_long bufsize;
2631 u_long bufsize;
2632 u_long maxmtu;
2630 struct inpcb *inp = tp->t_inpcb;
2631 struct socket *so;
2633 struct inpcb *inp = tp->t_inpcb;
2634 struct socket *so;
2632 struct rmxp_tao *taop;
2635 struct hc_metrics_lite metrics;
2636 struct rmxp_tao tao;
2633 int origoffer = offer;
2634#ifdef INET6
2635 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2636 size_t min_protoh = isipv6 ?
2637 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
2638 sizeof (struct tcpiphdr);
2639#else
2637 int origoffer = offer;
2638#ifdef INET6
2639 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2640 size_t min_protoh = isipv6 ?
2641 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
2642 sizeof (struct tcpiphdr);
2643#else
2640 const int isipv6 = 0;
2641 const size_t min_protoh = sizeof (struct tcpiphdr);
2644 const size_t min_protoh = sizeof(struct tcpiphdr);
2642#endif
2645#endif
2646 bzero(&tao, sizeof(tao));
2643
2647
2644 if (isipv6)
2645 rt = tcp_rtlookup6(&inp->inp_inc);
2646 else
2647 rt = tcp_rtlookup(&inp->inp_inc);
2648 if (rt == NULL) {
2649 tp->t_maxopd = tp->t_maxseg =
2650 isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
2651 return;
2648 /* initialize */
2649#ifdef INET6
2650 if (isipv6) {
2651 maxmtu = tcp_maxmtu6(&inp->inp_inc);
2652 tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
2653 } else
2654#endif
2655 {
2656 maxmtu = tcp_maxmtu(&inp->inp_inc);
2657 tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
2652 }
2658 }
2653 ifp = rt->rt_ifp;
2654 so = inp->inp_socket;
2655
2659 so = inp->inp_socket;
2660
2656 taop = rmx_taop(rt->rt_rmx);
2657 /*
2661 /*
2658 * Offer == -1 means that we didn't receive SYN yet,
2659 * use cached value in that case;
2662 * no route to sender, take default mss and return
2660 */
2663 */
2661 if (offer == -1)
2662 offer = taop->tao_mssopt;
2664 if (maxmtu == 0)
2665 return;
2666
2667 /* what have we got? */
2668 switch (offer) {
2669 case 0:
2670 /*
2671 * Offer == 0 means that there was no MSS on the SYN
2672 * segment, in this case we use tcp_mssdflt.
2673 */
2674 offer =
2675#ifdef INET6
2676 isipv6 ? tcp_v6mssdflt :
2677#endif
2678 tcp_mssdflt;
2679 break;
2680
2681 case -1:
2682 /*
2683 * Offer == -1 means that we didn't receive SYN yet,
2684 * use cached value in that case;
2685 */
2686 if (tcp_do_rfc1644)
2687 tcp_hc_gettao(&inp->inp_inc, &tao);
2688 if (tao.tao_mssopt != 0)
2689 offer = tao.tao_mssopt;
2690 /* FALLTHROUGH */
2691
2692 default:
2693 /*
2694 * Sanity check: make sure that maxopd will be large
2695 * enough to allow some data on segments even if
2696 * all the option space is used (40 bytes). Otherwise
2697 * funny things may happen in tcp_output.
2698 */
2699 offer = max(offer, 64);
2700 if (tcp_do_rfc1644)
2701 tcp_hc_updatetao(&inp->inp_inc,
2702 TCP_HC_TAO_MSSOPT, 0, offer);
2703 }
2704
2663 /*
2705 /*
2664 * Offer == 0 means that there was no MSS on the SYN segment,
2665 * in this case we use tcp_mssdflt.
2706 * rmx information is now retrieved from tcp_hostcache
2666 */
2707 */
2667 if (offer == 0)
2668 offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
2669 else
2670 /*
2671 * Sanity check: make sure that maxopd will be large
2672 * enough to allow some data on segments even is the
2673 * all the option space is used (40bytes). Otherwise
2674 * funny things may happen in tcp_output.
2675 */
2676 offer = max(offer, 64);
2677 taop->tao_mssopt = offer;
2708 tcp_hc_get(&inp->inp_inc, &metrics);
2678
2679 /*
2709
2710 /*
2680 * While we're here, check if there's an initial rtt
2681 * or rttvar. Convert from the route-table units
2682 * to scaled multiples of the slow timeout timer.
2683 */
2684 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
2685 /*
2686 * XXX the lock bit for RTT indicates that the value
2687 * is also a minimum value; this is subject to time.
2688 */
2689 if (rt->rt_rmx.rmx_locks & RTV_RTT)
2690 tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
2691 tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
2692 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
2693 tcpstat.tcps_usedrtt++;
2694 if (rt->rt_rmx.rmx_rttvar) {
2695 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
2696 (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
2697 tcpstat.tcps_usedrttvar++;
2698 } else {
2699 /* default variation is +- 1 rtt */
2700 tp->t_rttvar =
2701 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2702 }
2703 TCPT_RANGESET(tp->t_rxtcur,
2704 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2705 tp->t_rttmin, TCPTV_REXMTMAX);
2706 }
2707 /*
2708 * if there's an mtu associated with the route, use it
2711 * if there's a discovered mtu in tcp hostcache, use it
2709 * else, use the link mtu.
2710 */
2712 * else, use the link mtu.
2713 */
2711 if (rt->rt_rmx.rmx_mtu)
2712 mss = rt->rt_rmx.rmx_mtu - min_protoh;
2714 if (metrics.rmx_mtu)
2715 mss = metrics.rmx_mtu - min_protoh;
2713 else {
2714#ifdef INET6
2716 else {
2717#ifdef INET6
2715 mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
2716 - min_protoh;
2717#else
2718 mss = ifp->if_mtu - min_protoh;
2719#endif
2720#ifdef INET6
2721 if (isipv6) {
2718 if (isipv6) {
2722 if (!in6_localaddr(&inp->in6p_faddr))
2719 mss = maxmtu - min_protoh;
2720 if (!path_mtu_discovery &&
2721 !in6_localaddr(&inp->in6p_faddr))
2723 mss = min(mss, tcp_v6mssdflt);
2724 } else
2725#endif
2722 mss = min(mss, tcp_v6mssdflt);
2723 } else
2724#endif
2726 if (!in_localaddr(inp->inp_faddr))
2725 {
2726 mss = maxmtu - min_protoh;
2727 if (!path_mtu_discovery &&
2728 !in_localaddr(inp->inp_faddr))
2727 mss = min(mss, tcp_mssdflt);
2729 mss = min(mss, tcp_mssdflt);
2730 }
2728 }
2729 mss = min(mss, offer);
2731 }
2732 mss = min(mss, offer);
2733
2730 /*
2731 * maxopd stores the maximum length of data AND options
2732 * in a segment; maxseg is the amount of data in a normal
2733 * segment. We need to store this value (maxopd) apart
2734 * from maxseg, because now every segment carries options
2735 * and thus we normally have somewhat less data in segments.
2736 */
2737 tp->t_maxopd = mss;

--- 6 unchanged lines hidden (view full) ---

2744 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2745 (origoffer == -1 ||
2746 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2747 mss -= TCPOLEN_TSTAMP_APPA;
2748 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2749 (origoffer == -1 ||
2750 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2751 mss -= TCPOLEN_CC_APPA;
2734 /*
2735 * maxopd stores the maximum length of data AND options
2736 * in a segment; maxseg is the amount of data in a normal
2737 * segment. We need to store this value (maxopd) apart
2738 * from maxseg, because now every segment carries options
2739 * and thus we normally have somewhat less data in segments.
2740 */
2741 tp->t_maxopd = mss;

--- 6 unchanged lines hidden (view full) ---

2748 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2749 (origoffer == -1 ||
2750 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
2751 mss -= TCPOLEN_TSTAMP_APPA;
2752 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
2753 (origoffer == -1 ||
2754 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
2755 mss -= TCPOLEN_CC_APPA;
2756 tp->t_maxseg = mss;
2752
2753#if (MCLBYTES & (MCLBYTES - 1)) == 0
2754 if (mss > MCLBYTES)
2755 mss &= ~(MCLBYTES-1);
2756#else
2757 if (mss > MCLBYTES)
2758 mss = mss / MCLBYTES * MCLBYTES;
2759#endif
2757
2758#if (MCLBYTES & (MCLBYTES - 1)) == 0
2759 if (mss > MCLBYTES)
2760 mss &= ~(MCLBYTES-1);
2761#else
2762 if (mss > MCLBYTES)
2763 mss = mss / MCLBYTES * MCLBYTES;
2764#endif
2765 tp->t_maxseg = mss;
2766
2760 /*
2767 /*
2761 * If there's a pipesize, change the socket buffer
2762 * to that size. Make the socket buffers an integral
2763 * number of mss units; if the mss is larger than
2764 * the socket buffer, decrease the mss.
2768 * If there's a pipesize, change the socket buffer to that size,
2769 * don't change if sb_hiwat is different than default (then it
2770 * has been changed on purpose with setsockopt).
2771 * Make the socket buffers an integral number of mss units;
2772 * if the mss is larger than the socket buffer, decrease the mss.
2765 */
2773 */
2766#ifdef RTV_SPIPE
2767 if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
2768#endif
2774 if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
2775 bufsize = metrics.rmx_sendpipe;
2776 else
2769 bufsize = so->so_snd.sb_hiwat;
2770 if (bufsize < mss)
2771 mss = bufsize;
2772 else {
2773 bufsize = roundup(bufsize, mss);
2774 if (bufsize > sb_max)
2775 bufsize = sb_max;
2776 if (bufsize > so->so_snd.sb_hiwat)
2777 (void)sbreserve(&so->so_snd, bufsize, so, NULL);
2778 }
2779 tp->t_maxseg = mss;
2780
2777 bufsize = so->so_snd.sb_hiwat;
2778 if (bufsize < mss)
2779 mss = bufsize;
2780 else {
2781 bufsize = roundup(bufsize, mss);
2782 if (bufsize > sb_max)
2783 bufsize = sb_max;
2784 if (bufsize > so->so_snd.sb_hiwat)
2785 (void)sbreserve(&so->so_snd, bufsize, so, NULL);
2786 }
2787 tp->t_maxseg = mss;
2788
2781#ifdef RTV_RPIPE
2782 if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
2783#endif
2789 if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
2790 bufsize = metrics.rmx_recvpipe;
2791 else
2784 bufsize = so->so_rcv.sb_hiwat;
2785 if (bufsize > mss) {
2786 bufsize = roundup(bufsize, mss);
2787 if (bufsize > sb_max)
2788 bufsize = sb_max;
2789 if (bufsize > so->so_rcv.sb_hiwat)
2790 (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2791 }
2792 bufsize = so->so_rcv.sb_hiwat;
2793 if (bufsize > mss) {
2794 bufsize = roundup(bufsize, mss);
2795 if (bufsize > sb_max)
2796 bufsize = sb_max;
2797 if (bufsize > so->so_rcv.sb_hiwat)
2798 (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
2799 }
2800 /*
2801 * While we're here, check the others too
2802 */
2803 if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
2804 tp->t_srtt = rtt;
2805 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
2806 tcpstat.tcps_usedrtt++;
2807 if (metrics.rmx_rttvar) {
2808 tp->t_rttvar = metrics.rmx_rttvar;
2809 tcpstat.tcps_usedrttvar++;
2810 } else {
2811 /* default variation is +- 1 rtt */
2812 tp->t_rttvar =
2813 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
2814 }
2815 TCPT_RANGESET(tp->t_rxtcur,
2816 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
2817 tp->t_rttmin, TCPTV_REXMTMAX);
2818 }
2819 if (metrics.rmx_ssthresh) {
2820 /*
2821 * There's some sort of gateway or interface
2822 * buffer limit on the path. Use this to set
2823 * the slow start threshold, but set the
2824 * threshold to no less than 2*mss.
2825 */
2826 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
2827 tcpstat.tcps_usedssthresh++;
2828 }
2829 if (metrics.rmx_bandwidth)
2830 tp->snd_bandwidth = metrics.rmx_bandwidth;
2792
2793 /*
2794 * Set the slow-start flight size depending on whether this
2795 * is a local network or not.
2831
2832 /*
2833 * Set the slow-start flight size depending on whether this
2834 * is a local network or not.
2835 *
2836 * Extend this so we cache the cwnd too and retrieve it here.
2837 * Make cwnd even bigger than RFC3390 suggests but only if we
2838 * have previous experience with the remote host. Be careful
2839 * not to make cwnd bigger than remote receive window or our own
2840 * send socket buffer. Maybe put some additional upper bound
2841 * on the retrieved cwnd. Should do incremental updates to
2842 * hostcache when cwnd collapses so next connection doesn't
2843 * overload the path again.
2844 *
2845 * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
2846 * We currently check only in syncache_socket for that.
2796 */
2847 */
2848#define TCP_METRICS_CWND
2849#ifdef TCP_METRICS_CWND
2850 if (metrics.rmx_cwnd)
2851 tp->snd_cwnd = max(mss,
2852 min(metrics.rmx_cwnd / 2,
2853 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
2854 else
2855#endif
2797 if (tcp_do_rfc3390)
2798 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
2856 if (tcp_do_rfc3390)
2857 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
2858#ifdef INET6
2799 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
2800 (!isipv6 && in_localaddr(inp->inp_faddr)))
2801 tp->snd_cwnd = mss * ss_fltsz_local;
2859 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
2860 (!isipv6 && in_localaddr(inp->inp_faddr)))
2861 tp->snd_cwnd = mss * ss_fltsz_local;
2862#endif
2802 else
2803 tp->snd_cwnd = mss * ss_fltsz;
2863 else
2864 tp->snd_cwnd = mss * ss_fltsz;
2804
2805 if (rt->rt_rmx.rmx_ssthresh) {
2806 /*
2807 * There's some sort of gateway or interface
2808 * buffer limit on the path. Use this to set
2809 * the slow start threshhold, but set the
2810 * threshold to no less than 2*mss.
2811 */
2812 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
2813 tcpstat.tcps_usedssthresh++;
2814 }
2815}
2816
2817/*
2818 * Determine the MSS option to send on an outgoing SYN.
2819 */
2820int
2865}
2866
2867/*
2868 * Determine the MSS option to send on an outgoing SYN.
2869 */
2870int
2821tcp_mssopt(tp)
2822 struct tcpcb *tp;
2871tcp_mssopt(inc)
2872 struct in_conninfo *inc;
2823{
2873{
2824 struct rtentry *rt;
2874 int mss = 0;
2875 u_long maxmtu = 0;
2876 u_long thcmtu = 0;
2877 size_t min_protoh;
2825#ifdef INET6
2878#ifdef INET6
2826 int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
2827 size_t min_protoh = isipv6 ?
2828 sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
2829 sizeof (struct tcpiphdr);
2830#else
2831 const int isipv6 = 0;
2832 const size_t min_protoh = sizeof (struct tcpiphdr);
2879 int isipv6 = inc->inc_isipv6 ? 1 : 0;
2833#endif
2834
2880#endif
2881
2835 if (isipv6)
2836 rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
2837 else
2838 rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
2839 if (rt == NULL)
2840 return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
2882 KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
2841
2842#ifdef INET6
2883
2884#ifdef INET6
2843 return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
2844 rt->rt_ifp->if_mtu - min_protoh);
2845#else
2846 return (rt->rt_ifp->if_mtu - min_protoh);
2885 if (isipv6) {
2886 mss = tcp_v6mssdflt;
2887 maxmtu = tcp_maxmtu6(inc);
2888 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
2889 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2890 } else
2847#endif
2891#endif
2892 {
2893 mss = tcp_mssdflt;
2894 maxmtu = tcp_maxmtu(inc);
2895 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
2896 min_protoh = sizeof(struct tcpiphdr);
2897 }
2898 if (maxmtu && thcmtu)
2899 mss = min(maxmtu, thcmtu) - min_protoh;
2900 else if (maxmtu || thcmtu)
2901 mss = max(maxmtu, thcmtu) - min_protoh;
2902
2903 return (mss);
2848}
2849
2850
2851/*
2852 * When a partial ack arrives, force the retransmission of the
2853 * next unacknowledged segment. Do not clear tp->t_dupacks.
2854 * By setting snd_nxt to ti_ack, this forces retransmission timer to
2855 * be started again.

--- 178 unchanged lines hidden ---
2904}
2905
2906
2907/*
2908 * When a partial ack arrives, force the retransmission of the
2909 * next unacknowledged segment. Do not clear tp->t_dupacks.
2910 * By setting snd_nxt to ti_ack, this forces retransmission timer to
2911 * be started again.

--- 178 unchanged lines hidden ---