1/*
2 * Copyright (c) 2012-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/mbuf.h>
33#include <sys/mcache.h>
34#include <sys/socket.h>
35#include <sys/socketvar.h>
36#include <sys/syslog.h>
37#include <sys/protosw.h>
38
39#include <kern/zalloc.h>
40#include <kern/locks.h>
41
42#include <mach/thread_act.h>
43#include <mach/sdt.h>
44
45#include <dev/random/randomdev.h>
46
47#include <net/if.h>
48#include <netinet/in.h>
49#include <netinet/in_var.h>
50#include <netinet/tcp.h>
51#include <netinet/tcp_fsm.h>
52#include <netinet/tcp_seq.h>
53#include <netinet/tcp_var.h>
54#include <netinet/mptcp_var.h>
55#include <netinet/mptcp.h>
56#include <netinet/mptcp_seq.h>
57#include <netinet/mptcp_opt.h>
58#include <netinet/mptcp_timer.h>
59
/* Master switch (net.inet.mptcp.enable): 0 disables MPTCP negotiation */
int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_enable, 0, "Enable Multipath TCP Support");

/* Debug verbosity consumed by mptcplog()/mptcplog3() */
int mptcp_dbg = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dbg, 0, "Enable Multipath TCP Debugging");

/* Number of times to try negotiating MPTCP on SYN retransmissions */
int mptcp_mpcap_retries = MPTCP_CAPABLE_RETRIES;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
	CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non SSL Traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fail_thresh, 0, "Failover threshold");


/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60*14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_subflow_keeptime, 0, "Keepalive in seconds");

/*
 * MP_PRIO option.
 */
int mptcp_mpprio_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mpprio, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_mpprio_enable, 0, "Enable MP_PRIO option");

/*
 * REMOVE_ADDR option.
 */
int mptcp_remaddr_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, remaddr, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_remaddr_enable, 0, "Enable REMOVE_ADDR option");

/*
 * FastJoin Option
 */
int mptcp_fastjoin = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fastjoin, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_fastjoin, 0, "Enable FastJoin Option");

int mptcp_zerortt_fastjoin = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, zerortt_fastjoin, CTLFLAG_RW |
	CTLFLAG_LOCKED, &mptcp_zerortt_fastjoin, 0,
	"Enable Zero RTT Fast Join");

/*
 * R/W Notification on resume
 */
int mptcp_rwnotify = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rwnotify, CTLFLAG_RW | CTLFLAG_LOCKED,
	&mptcp_rwnotify, 0, "Enable RW notify on resume");
132
133/*
134 * MPTCP input, called when data has been read from a subflow socket.
135 */
136void
137mptcp_input(struct mptses *mpte, struct mbuf *m)
138{
139	struct socket *mp_so;
140	struct mptcb *mp_tp = NULL;
141	u_int64_t mb_dsn;
142	u_int32_t mb_datalen;
143	int count = 0;
144	struct mbuf *save = NULL, *prev = NULL;
145	struct mbuf *freelist = NULL, *tail = NULL;
146	boolean_t in_fallback = FALSE;
147
148	VERIFY(m->m_flags & M_PKTHDR);
149
150	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
151	mp_so = mpte->mpte_mppcb->mpp_socket;
152
153	DTRACE_MPTCP(input);
154
155	/*
156	 * Each mbuf contains MPTCP Data Sequence Map
157	 * Process the data for reassembly, delivery to MPTCP socket
158	 * client, etc.
159	 *
160	 */
161	count = mp_so->so_rcv.sb_cc;
162
163	VERIFY(m != NULL);
164	mp_tp = mpte->mpte_mptcb;
165	VERIFY(mp_tp != NULL);
166
167	/* Ok to check for this flag without lock as its set in this thread */
168	in_fallback = (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
169
170	/*
171	 * In the degraded fallback case, data is accepted without DSS map
172	 */
173	if (in_fallback) {
174fallback:
175		/*
176		 * assume degraded flow as this may be the first packet
177		 * without DSS, and the subflow state is not updated yet.
178		 */
179		if (sbappendstream(&mp_so->so_rcv, m))
180			sorwakeup(mp_so);
181		DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
182		    struct socket *, mp_so,
183		    struct sockbuf *, &mp_so->so_rcv,
184		    struct sockbuf *, &mp_so->so_snd,
185		    struct mptses *, mpte);
186		count = mp_so->so_rcv.sb_cc - count;
187		mptcplog3((LOG_DEBUG, "%s: fread %d bytes\n", __func__, count));
188		return;
189	}
190
191	MPT_LOCK(mp_tp);
192	do {
193		/* If fallback occurs, mbufs will not have PKTF_MPTCP set */
194		if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
195			MPT_UNLOCK(mp_tp);
196			goto fallback;
197		}
198
199		save = m->m_next;
200		/*
201		 * A single TCP packet formed of multiple mbufs
202		 * holds DSS mapping in the first mbuf of the chain.
203		 * Other mbufs in the chain may have M_PKTHDR set
204		 * even though they belong to the same TCP packet
205		 * and therefore use the DSS mapping stored in the
206		 * first mbuf of the mbuf chain. mptcp_input() can
207		 * get an mbuf chain with multiple TCP packets.
208		 */
209		while (save && (!(save->m_flags & M_PKTHDR) ||
210		    !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
211			prev = save;
212			save = save->m_next;
213		}
214		if (prev)
215			prev->m_next = NULL;
216		else
217			m->m_next = NULL;
218
219		mb_dsn = m->m_pkthdr.mp_dsn;
220		mb_datalen = m->m_pkthdr.mp_rlen;
221
222		if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvatmark)) {
223			tcpstat.tcps_mp_oodata++;
224			MPT_UNLOCK(mp_tp);
225			m_freem(m);
226			return;
227			/*
228			 * Reassembly queue support here in future. Per spec,
229			 * senders must implement retransmission timer to
230			 * retransmit unacked data. Dropping out of order
231			 * gives a slight hit on performance but allows us to
232			 * deploy MPTCP and protects us against in-window DoS
233			 * attacks that attempt to use up memory by sending
234			 * out of order data. When doing load sharing across
235			 * subflows, out of order support is a must.
236			 */
237		}
238
239		if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvatmark)) {
240			if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
241			    mp_tp->mpt_rcvatmark)) {
242				if (freelist == NULL)
243					freelist = m;
244				else
245					tail->m_next = m;
246
247				if (prev != NULL)
248					tail = prev;
249				else
250					tail = m;
251
252				m = save;
253				prev = save = NULL;
254				continue;
255			} else {
256				m_adj(m, (mp_tp->mpt_rcvatmark - mb_dsn));
257			}
258			mptcplog((LOG_INFO, "%s: %llu %d 2 \n", __func__,
259			    mp_tp->mpt_rcvatmark, m->m_pkthdr.len));
260		}
261
262		MPT_UNLOCK(mp_tp);
263		if (sbappendstream(&mp_so->so_rcv, m)) {
264			sorwakeup(mp_so);
265		}
266		DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
267		    struct sockbuf *, &mp_so->so_rcv,
268		    struct sockbuf *, &mp_so->so_snd,
269		    struct mptses *, mpte,
270		    struct mptcb *, mp_tp);
271		MPT_LOCK(mp_tp);
272		count = mp_so->so_rcv.sb_cc - count;
273		tcpstat.tcps_mp_rcvtotal++;
274		tcpstat.tcps_mp_rcvbytes += count;
275		mptcplog3((LOG_DEBUG, "%s: read %d bytes\n", __func__, count));
276		/*
277		 * The data received at the MPTCP layer will never exceed the
278		 * receive window because anything to the right of the
279		 * receive window will be trimmed at the subflow level.
280		 */
281		mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
282		mp_tp->mpt_rcvatmark += count;
283		m = save;
284		prev = save = NULL;
285		count = mp_so->so_rcv.sb_cc;
286	} while (m);
287	MPT_UNLOCK(mp_tp);
288
289	if (freelist)
290		m_freem(freelist);
291}
292
293/*
294 * MPTCP output.
295 */
296int
297mptcp_output(struct mptses *mpte)
298{
299	struct mptsub *mpts;
300	struct mptsub *mpts_tried = NULL;
301	struct socket *mp_so;
302	int error = 0;
303
304	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
305	mp_so = mpte->mpte_mppcb->mpp_socket;
306	if (mp_so->so_state & SS_CANTSENDMORE) {
307		return (EPIPE);
308	}
309
310try_again:
311	/* get the "best" subflow to be used for transmission */
312	mpts = mptcp_get_subflow(mpte, NULL);
313	if (mpts == NULL) {
314		mptcplog((LOG_ERR, "%s: mp_so 0x%llx has no usable subflow\n",
315		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
316		goto out;
317	}
318
319	mptcplog3((LOG_INFO, "%s: mp_so 0x%llx cid %d \n", __func__,
320	    (uint64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
321
322	/* In case there's just one flow, we reattempt later */
323	MPTS_LOCK(mpts);
324	if ((mpts_tried != NULL) && ((mpts == mpts_tried) ||
325	    (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
326		MPTS_UNLOCK(mpts);
327		MPTS_LOCK(mpts_tried);
328		mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
329		mpts_tried->mpts_flags |= MPTSF_ACTIVE;
330		MPTS_UNLOCK(mpts_tried);
331		MPT_LOCK(mpte->mpte_mptcb);
332		mptcp_start_timer(mpte->mpte_mptcb, MPTT_REXMT);
333		MPT_UNLOCK(mpte->mpte_mptcb);
334		mptcplog((LOG_INFO, "%s: mp_so 0x%llx retry later\n",
335		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
336		goto out;
337	}
338
339	DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
340	    struct socket *, mp_so);
341	error = mptcp_subflow_output(mpte, mpts);
342	if (error) {
343		/* can be a temporary loss of source address or other error */
344		mpts->mpts_flags |= MPTSF_FAILINGOVER;
345		mpts->mpts_flags &= ~MPTSF_ACTIVE;
346		mpts_tried = mpts;
347		MPTS_UNLOCK(mpts);
348		mptcplog((LOG_INFO, "%s: error = %d \n", __func__, error));
349		goto try_again;
350	}
351	/* The model is to have only one active flow at a time */
352	mpts->mpts_flags |= MPTSF_ACTIVE;
353	MPTS_UNLOCK(mpts);
354	if (mpte->mpte_active_sub == NULL) {
355		mpte->mpte_active_sub = mpts;
356	} else if (mpte->mpte_active_sub != mpts) {
357		MPTS_LOCK(mpte->mpte_active_sub);
358		mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
359		MPTS_UNLOCK(mpte->mpte_active_sub);
360		mpte->mpte_active_sub = mpts;
361	}
362out:
363	/* subflow errors should not be percolated back up */
364	return (0);
365}
366
367/*
368 * Return the most eligible subflow to be used for sending data.
369 * This function also serves to check if any alternate subflow is available
370 * or not.
371 */
372struct mptsub *
373mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore)
374{
375	struct mptsub *mpts;
376	struct mptsub *fallback = NULL;
377	struct socket *so = NULL;
378
379	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
380
381	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
382		MPTS_LOCK(mpts);
383
384		if ((ignore) && (mpts == ignore)) {
385			MPTS_UNLOCK(mpts);
386			continue;
387		}
388
389		/* There can only be one subflow in degraded state */
390		if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
391			MPTS_UNLOCK(mpts);
392			break;
393		}
394
395		/*
396		 * Subflows with Fastjoin allow data to be written before
397		 * the subflow is mp capable.
398		 */
399		if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
400		    !(mpts->mpts_flags & MPTSF_FASTJ_REQD)) {
401			MPTS_UNLOCK(mpts);
402			continue;
403		}
404
405		if (mpts->mpts_flags & MPTSF_SUSPENDED) {
406			MPTS_UNLOCK(mpts);
407			continue;
408		}
409
410		if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
411		    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
412			MPTS_UNLOCK(mpts);
413			continue;
414		}
415
416		if (mpts->mpts_flags & MPTSF_FAILINGOVER) {
417			so = mpts->mpts_socket;
418			if ((so) && (!(so->so_flags & SOF_PCBCLEARING))) {
419				socket_lock(so, 1);
420				if ((so->so_snd.sb_cc == 0) &&
421				    (mptcp_no_rto_spike(so))) {
422					mpts->mpts_flags &= ~MPTSF_FAILINGOVER;
423					so->so_flags &= ~SOF_MP_TRYFAILOVER;
424					fallback = mpts;
425					socket_unlock(so, 1);
426				} else {
427					fallback = mpts;
428					socket_unlock(so, 1);
429					MPTS_UNLOCK(mpts);
430					continue;
431				}
432			} else {
433				MPTS_UNLOCK(mpts);
434				continue;
435			}
436		}
437
438		if (mpts->mpts_flags & MPTSF_PREFERRED) {
439			MPTS_UNLOCK(mpts);
440			break;
441		}
442
443		/* When there are no preferred flows, use first one in list */
444		fallback = mpts;
445
446		MPTS_UNLOCK(mpts);
447	}
448	/*
449	 * If there is no preferred or backup subflow, and there is no active
450	 * subflow use the last usable subflow.
451	 */
452	if (mpts == NULL) {
453		return (fallback);
454	}
455
456	return (mpts);
457}
458
459struct mptsub *
460mptcp_get_pending_subflow(struct mptses *mpte, struct mptsub *ignore)
461{
462	struct mptsub *mpts = NULL;
463
464	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
465
466	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
467		MPTS_LOCK(mpts);
468
469		if ((ignore) && (mpts == ignore)) {
470			MPTS_UNLOCK(mpts);
471			continue;
472		}
473
474		if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
475			MPTS_UNLOCK(mpts);
476			break;
477		}
478
479		MPTS_UNLOCK(mpts);
480	}
481	return (mpts);
482}
483
484void
485mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
486{
487	MPT_LOCK_ASSERT_HELD(mp_tp);
488
489	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
490	    uint32_t, event);
491
492	switch (mp_tp->mpt_state) {
493	case MPTCPS_CLOSED:
494	case MPTCPS_LISTEN:
495		mp_tp->mpt_state = MPTCPS_CLOSED;
496		break;
497
498	case MPTCPS_ESTABLISHED:
499		if (event == MPCE_CLOSE) {
500			mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
501			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
502		}
503		else if (event == MPCE_RECV_DATA_FIN) {
504			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
505			mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
506		}
507		break;
508
509	case MPTCPS_CLOSE_WAIT:
510		if (event == MPCE_CLOSE) {
511			mp_tp->mpt_state = MPTCPS_LAST_ACK;
512			mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
513		}
514		break;
515
516	case MPTCPS_FIN_WAIT_1:
517		if (event == MPCE_RECV_DATA_ACK)
518			mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
519		else if (event == MPCE_RECV_DATA_FIN) {
520			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
521			mp_tp->mpt_state = MPTCPS_CLOSING;
522		}
523		break;
524
525	case MPTCPS_CLOSING:
526		if (event == MPCE_RECV_DATA_ACK)
527			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
528		break;
529
530	case MPTCPS_LAST_ACK:
531		if (event == MPCE_RECV_DATA_ACK)
532			mp_tp->mpt_state = MPTCPS_TERMINATE;
533		break;
534
535	case MPTCPS_FIN_WAIT_2:
536		if (event == MPCE_RECV_DATA_FIN) {
537			mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
538			mp_tp->mpt_state = MPTCPS_TIME_WAIT;
539		}
540		break;
541
542	case MPTCPS_TIME_WAIT:
543		break;
544
545	case MPTCPS_FASTCLOSE_WAIT:
546		if (event == MPCE_CLOSE) {
547			/* no need to adjust for data FIN */
548			mp_tp->mpt_state = MPTCPS_TERMINATE;
549		}
550		break;
551	case MPTCPS_TERMINATE:
552		break;
553	default:
554		VERIFY(0);
555		/* NOTREACHED */
556	}
557	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
558	    uint32_t, event);
559	mptcplog((LOG_INFO, "%s: state = %d\n",
560	    __func__, mp_tp->mpt_state));
561}
562
563/*
564 * Update the mptcb send state variables, but the actual sbdrop occurs
565 * in MPTCP layer
566 */
567void
568mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
569{
570	u_int64_t acked = 0;
571
572	acked = full_dack - mp_tp->mpt_snduna;
573
574	if (acked) {
575		mp_tp->mpt_snduna += acked;
576		/* In degraded mode, we may get some Data ACKs */
577		if ((tp->t_mpflags & TMPF_TCP_FALLBACK) &&
578			!(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
579			MPTCP_SEQ_GT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
580			/* bring back sndnxt to retransmit MPTCP data */
581			mp_tp->mpt_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
582			mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
583			tp->t_inpcb->inp_socket->so_flags1 |=
584			    SOF1_POST_FALLBACK_SYNC;
585		}
586	}
587	if ((full_dack == mp_tp->mpt_sndmax) &&
588	    (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1)) {
589		mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_ACK);
590		tp->t_mpflags &= ~TMPF_SEND_DFIN;
591	}
592}
593
594/* If you change this function, match up mptcp_update_rcv_state_f */
595void
596mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
597    uint16_t csum)
598{
599	struct mptcb *mp_tp = tptomptp(tp);
600	u_int64_t full_dsn = 0;
601
602	NTOHL(dss_info->mdss_dsn);
603	NTOHL(dss_info->mdss_subflow_seqn);
604	NTOHS(dss_info->mdss_data_len);
605
606	/* XXX for autosndbuf grow sb here */
607	MPT_LOCK(mp_tp);
608	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
609	MPT_UNLOCK(mp_tp);
610	mptcp_update_rcv_state_meat(mp_tp, tp,
611	    full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
612	    csum);
613
614}
615
/*
 * Common DSS-mapping processing for all DSS option variants.
 *
 * A zero-length mapping is an "infinite mapping" and triggers fallback
 * notification on the subflow socket.  A mapping with seqn == 0 and
 * length 1 is a Data FIN (handled in mptcp_do_fin_opt) and is ignored
 * here.  Otherwise the mapping is stashed in tp->t_rcv_map and
 * TMPF_EMBED_DSN is set so the input path can consume it.
 */
void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
	if (mdss_data_len == 0) {
		/* Infinite mapping: peer is falling back to plain TCP */
		mptcplog((LOG_INFO, "%s: Received infinite mapping.",
		    __func__));
		if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
			mptcplog((LOG_ERR, "%s: Bad checksum value %x \n",
			    __func__, csum));
		}
		mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
		return;
	}
	MPT_LOCK(mp_tp);
	if (mptcp_dbg >= MP_VERBOSE_DEBUG_1)
		printf("%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n",
		    __func__, seqn, mdss_data_len, full_dsn,
		    mp_tp->mpt_rcvnxt);

	/* Process a Data FIN packet , handled in mptcp_do_fin_opt */
	if ((seqn == 0) && (mdss_data_len == 1)) {
		mptcplog((LOG_INFO, "%s: Data FIN DSS opt state = %d \n",
		    __func__, mp_tp->mpt_state));
		MPT_UNLOCK(mp_tp);
		return;
	}
	MPT_UNLOCK(mp_tp);
	/* Record the mapping for the subflow input path to consume */
	mptcp_notify_mpready(tp->t_inpcb->inp_socket);
	tp->t_rcv_map.mpt_dsn = full_dsn;
	tp->t_rcv_map.mpt_sseq = seqn;
	tp->t_rcv_map.mpt_len = mdss_data_len;
	tp->t_rcv_map.mpt_csum = csum;
	tp->t_mpflags |= TMPF_EMBED_DSN;
}
652
653
654void
655mptcp_update_rcv_state_f(struct mptcp_dss_ack_opt *dss_info, struct tcpcb *tp,
656    uint16_t csum)
657{
658	u_int64_t full_dsn = 0;
659	struct mptcb *mp_tp = tptomptp(tp);
660
661	NTOHL(dss_info->mdss_dsn);
662	NTOHL(dss_info->mdss_subflow_seqn);
663	NTOHS(dss_info->mdss_data_len);
664	MPT_LOCK(mp_tp);
665	MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
666	MPT_UNLOCK(mp_tp);
667	mptcp_update_rcv_state_meat(mp_tp, tp,
668	    full_dsn,
669	    dss_info->mdss_subflow_seqn,
670	    dss_info->mdss_data_len,
671	    csum);
672}
673
674void
675mptcp_update_rcv_state_g(struct mptcp_dss64_ack32_opt *dss_info,
676    struct tcpcb *tp, uint16_t csum)
677{
678	u_int64_t dsn = mptcp_ntoh64(dss_info->mdss_dsn);
679	struct mptcb *mp_tp = tptomptp(tp);
680
681	NTOHL(dss_info->mdss_subflow_seqn);
682	NTOHS(dss_info->mdss_data_len);
683	mptcp_update_rcv_state_meat(mp_tp, tp,
684	    dsn,
685	    dss_info->mdss_subflow_seqn,
686	    dss_info->mdss_data_len,
687	    csum);
688}
689
690/*
691 * MPTCP Checksum support
692 * The checksum is calculated whenever the MPTCP DSS option is included
693 * in the TCP packet. The checksum includes the sum of the MPTCP psuedo
694 * header and the actual data indicated by the length specified in the
695 * DSS option.
696 */
697
698uint16_t
699mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, int off)
700{
701	struct mptcb *mp_tp = tptomptp(tp);
702	uint32_t sum = 0;
703	uint64_t dsn;
704	uint32_t sseq;
705	uint16_t len;
706	uint16_t csum;
707
708	if (mp_tp == NULL)
709		return (0);
710
711	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
712		return (0);
713
714	if (!(tp->t_mpflags & TMPF_EMBED_DSN))
715		return (0);
716
717	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
718		return (0);
719
720	/*
721	 * The remote side may send a packet with fewer bytes than the
722	 * claimed DSS checksum length.
723	 */
724	if ((int)m_length2(m, NULL) < (off + tp->t_rcv_map.mpt_len))
725		return (0xffff);
726
727	if (tp->t_rcv_map.mpt_len != 0)
728		sum = m_sum16(m, off, tp->t_rcv_map.mpt_len);
729
730	dsn = mptcp_hton64(tp->t_rcv_map.mpt_dsn);
731	sseq = htonl(tp->t_rcv_map.mpt_sseq);
732	len = htons(tp->t_rcv_map.mpt_len);
733	csum = tp->t_rcv_map.mpt_csum;
734	sum += in_pseudo64(dsn, sseq, (len + csum));
735	ADDCARRY(sum);
736	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
737	    uint32_t, sum);
738	mptcplog((LOG_INFO, "%s: sum = %x \n", __func__, sum));
739	return (~sum & 0xffff);
740}
741
742void
743mptcp_output_csum(struct tcpcb *tp, struct mbuf *m, int32_t len,
744    unsigned hdrlen, u_int64_t dss_val, u_int32_t *sseqp)
745{
746	struct mptcb *mp_tp = tptomptp(tp);
747	u_int32_t sum = 0;
748	uint32_t sseq;
749	uint16_t dss_len;
750	uint16_t csum = 0;
751	uint16_t *csump = NULL;
752
753	if (mp_tp == NULL)
754		return;
755
756	if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM))
757		return;
758
759	if (sseqp == NULL)
760		return;
761
762	if (len)
763		sum = m_sum16(m, hdrlen, len);
764
765	dss_val = mptcp_hton64(dss_val);
766	sseq = *sseqp;
767	dss_len = *(uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t));
768	sum += in_pseudo64(dss_val, sseq, (dss_len + csum));
769
770	ADDCARRY(sum);
771	sum = ~sum & 0xffff;
772	csump = (uint16_t *)(void *)((u_char*)sseqp + sizeof (u_int32_t) +
773	    sizeof (uint16_t));
774	DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
775	    uint32_t, sum);
776	*csump = sum;
777	mptcplog3((LOG_INFO, "%s: sum = %x \n", __func__, sum));
778}
779