1/*
2 * Copyright (c) 2012-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
50#include <netinet/in.h>
51#include <netinet/in_pcb.h>
52#include <netinet/in_var.h>
53#include <netinet/tcp.h>
54#include <netinet/tcp_fsm.h>
55#include <netinet/tcp_seq.h>
56#include <netinet/tcp_var.h>
57#include <netinet/mptcp_var.h>
58#include <netinet/mptcp.h>
59#include <netinet/mptcp_seq.h>
60#include <netinet/mptcp_timer.h>
61#include <libkern/crypto/sha1.h>
62#if INET6
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6protosw.h>
65#endif /* INET6 */
66#include <dev/random/randomdev.h>
67
68/*
69 * Notes on MPTCP implementation.
70 *
71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
72 * communication domain.  The structure mtcbinfo describes the MPTCP instance
73 * of a Multipath protocol in that domain.  It is used to keep track of all
74 * MPTCP PCB instances in the system, and is protected by the global lock
75 * mppi_lock.
76 *
77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
78 * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
79 * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
80 * allocated from the same memory block, and each structure has a pointer
81 * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
83 * PCB (mppcb) as well as the MPTCP Session (mptses).
84 *
85 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
86 * in particular, the list of subflows as well as the MPTCP thread.
87 *
88 * A functioning MPTCP Session consists of one or more subflow sockets.  Each
89 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
90 * represented by the mptsub structure.  Because each subflow requires access
91 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
92 * subflow.  This gets decremented prior to the subflow's destruction.  The
93 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
94 *
95 * To handle events (read, write, control) from the subflows, an MPTCP thread
96 * is created; currently, there is one thread per MPTCP Session.  In order to
97 * prevent the MPTCP socket from being destroyed while being accessed by the
98 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
99 * which will be decremented prior to the thread's termination.  The thread
100 * lock (mpte_thread_lock) is used to synchronize its signalling.
101 *
102 * Lock ordering is defined as follows:
103 *
104 *	mtcbinfo (mppi_lock)
105 *		mp_so (mpp_lock)
106 *			mpts (mpts_lock)
107 *				so (inpcb_mtx)
108 *					mptcb (mpt_lock)
109 *
110 * It is not a requirement that all of the above locks need to be acquired
111 * in succession, but the correct lock ordering must be followed when there
112 * are more than one locks that need to be held.  The MPTCP thread lock is
113 * is not constrained by this arrangement, because none of the other locks
114 * is ever acquired while holding mpte_thread_lock; therefore it may be called
115 * at any moment to signal the thread.
116 *
117 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
118 * work is done by the MPTCP garbage collector which is invoked on demand by
119 * the PF_MULTIPATH garbage collector.  This process will take place once all
120 * of the subflows have been destroyed, and the MPTCP thread be instructed to
121 * self-terminate.
122 */
123
124static void mptcp_sesdestroy(struct mptses *);
125static void mptcp_thread_signal_locked(struct mptses *);
126static void mptcp_thread_terminate_signal(struct mptses *);
127static void mptcp_thread_dowork(struct mptses *);
128static void mptcp_thread_func(void *, wait_result_t);
129static void mptcp_thread_destroy(struct mptses *);
130static void mptcp_key_pool_init(void);
131static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
132static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
133static void mptcp_conn_properties(struct mptcb *);
134static void mptcp_init_statevars(struct mptcb *);
135
136static uint32_t mptcp_gc(struct mppcbinfo *);
137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138    int, struct proc *, struct socket **);
139static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142    struct uio *, struct mbuf **, struct mbuf **, int *);
143static void mptcp_subflow_rupcall(struct socket *, void *, int);
144static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145static void mptcp_subflow_wupcall(struct socket *, void *, int);
146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147static void mptcp_update_last_owner(struct mptsub *, struct socket *);
148static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
149
150/*
151 * Possible return values for subflow event handlers.  Note that success
152 * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
153 * indicate errors or actions which require immediate attention; they will
154 * prevent the rest of the handlers from processing their respective events
155 * until the next round of events processing.
156 */
157typedef enum {
158	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
159	MPTS_EVRET_OK			= 2,	/* OK */
160	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
161	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
162	MPTS_EVRET_OK_UPDATE		= 5,	/* OK with conninfo update */
163} ev_ret_t;
164
165static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
166static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
167static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
168static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
169static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
170static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
171static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
172static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
173static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
174static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
175static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
176static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
177static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
178static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
179static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *);
180static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *);
181static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *);
182
183static const char *mptcp_evret2str(ev_ret_t);
184
185static mptcp_key_t *mptcp_reserve_key(void);
186static int mptcp_do_sha1(mptcp_key_t *, char *, int);
187static int mptcp_init_authparms(struct mptcb *);
188
189static unsigned int mptsub_zone_size;		/* size of mptsub */
190static struct zone *mptsub_zone;		/* zone for mptsub */
191
192static unsigned int mptopt_zone_size;		/* size of mptopt */
193static struct zone *mptopt_zone;		/* zone for mptopt */
194
195static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
196static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */
197
198struct mppcbinfo mtcbinfo;
199
200static struct mptcp_keys_pool_head mptcp_keys_pool;
201
202#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
203#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */
204
205SYSCTL_DECL(_net_inet);
206
207SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
208
209uint32_t mptcp_verbose = 0;		/* more noise if greater than 1 */
210SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
211	&mptcp_verbose, 0, "MPTCP verbosity level");
212
213SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
214	&mtcbinfo.mppi_count, 0, "Number of active PCBs");
215
216/*
217 * Since there is one kernel thread per mptcp socket, imposing an artificial
218 * limit on number of allowed mptcp sockets.
219 */
220uint32_t mptcp_socket_limit = MPPCB_LIMIT;
221SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
222	&mptcp_socket_limit, 0, "MPTCP socket limit");
223
224/*
225 * SYSCTL to turn on delayed cellular subflow start.
226 */
227uint32_t mptcp_delayed_subf_start = 0;
228SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
229	&mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
230
231/*
232 * SYSCTL for RTT spike measurement threshold in msecs.
233 */
234int32_t mptcp_rto_spike_thresh = 3000;
235SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, rto_spikethresh,
236	CTLFLAG_RW|CTLFLAG_LOCKED, &mptcp_rto_spike_thresh, 0,
237	"MPTCP RTT spike thresh");
238
239static struct protosw mptcp_subflow_protosw;
240static struct pr_usrreqs mptcp_subflow_usrreqs;
241#if INET6
242static struct ip6protosw mptcp_subflow_protosw6;
243static struct pr_usrreqs mptcp_subflow_usrreqs6;
244#endif /* INET6 */
245
246/*
247 * Protocol pr_init callback.
248 */
249void
250mptcp_init(struct protosw *pp, struct domain *dp)
251{
252#pragma unused(dp)
253	static int mptcp_initialized = 0;
254	struct protosw *prp;
255#if INET6
256	struct ip6protosw *prp6;
257#endif /* INET6 */
258
259	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
260
261	/* do this only once */
262	if (mptcp_initialized)
263		return;
264	mptcp_initialized = 1;
265
266	/*
267	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
268	 * we must be able to find IPPROTO_TCP entries for both.
269	 */
270	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
271	VERIFY(prp != NULL);
272	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
273	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
274	    sizeof (mptcp_subflow_usrreqs));
275	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
276	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
277	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
278	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
279	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
280	/*
281	 * Socket filters shouldn't attach/detach to/from this protosw
282	 * since pr_protosw is to be used instead, which points to the
283	 * real protocol; if they do, it is a bug and we should panic.
284	 */
285	mptcp_subflow_protosw.pr_filter_head.tqh_first =
286	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
287	mptcp_subflow_protosw.pr_filter_head.tqh_last =
288	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
289
290#if INET6
291	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
292	    IPPROTO_TCP, SOCK_STREAM);
293	VERIFY(prp6 != NULL);
294	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
295	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
296	    sizeof (mptcp_subflow_usrreqs6));
297	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
298	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
299	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
300	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
301	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
302	/*
303	 * Socket filters shouldn't attach/detach to/from this protosw
304	 * since pr_protosw is to be used instead, which points to the
305	 * real protocol; if they do, it is a bug and we should panic.
306	 */
307	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
308	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
309	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
310	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
311#endif /* INET6 */
312
313	bzero(&mtcbinfo, sizeof (mtcbinfo));
314	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
315	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
316	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
317	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
318		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
319		/* NOTREACHED */
320	}
321	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
322	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
323
324	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
325	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
326	    mtcbinfo.mppi_lock_grp_attr);
327	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
328	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
329	    mtcbinfo.mppi_lock_attr);
330	mtcbinfo.mppi_gc = mptcp_gc;
331
332	mtcbinfo.mppi_timer = mptcp_timer;
333
334	/* attach to MP domain for garbage collection to take place */
335	mp_pcbinfo_attach(&mtcbinfo);
336
337	mptsub_zone_size = sizeof (struct mptsub);
338	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
339	    8192, "mptsub")) == NULL) {
340		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
341		/* NOTREACHED */
342	}
343	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
344	zone_change(mptsub_zone, Z_EXPAND, TRUE);
345
346	mptopt_zone_size = sizeof (struct mptopt);
347	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
348	    1024, "mptopt")) == NULL) {
349		panic("%s: unable to allocate MPTCP option zone\n", __func__);
350		/* NOTREACHED */
351	}
352	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
353	zone_change(mptopt_zone, Z_EXPAND, TRUE);
354
355	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
356	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
357	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
358		panic("%s: unable to allocate MPTCP address auth zone \n",
359		    __func__);
360		/* NOTREACHED */
361	}
362	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
363	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
364
365	/* Set up a list of unique keys */
366	mptcp_key_pool_init();
367
368}
369
370/*
371 * Create an MPTCP session, called as a result of opening a MPTCP socket.
372 */
373struct mptses *
374mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
375{
376	struct mppcbinfo *mppi;
377	struct mptses *mpte;
378	struct mptcb *mp_tp;
379	int error = 0;
380
381	VERIFY(mpp != NULL);
382	mppi = mpp->mpp_pcbinfo;
383	VERIFY(mppi != NULL);
384
385	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
386	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
387
388	/* MPTCP Multipath PCB Extension */
389	bzero(mpte, sizeof (*mpte));
390	VERIFY(mpp->mpp_pcbe == NULL);
391	mpp->mpp_pcbe = mpte;
392	mpte->mpte_mppcb = mpp;
393	mpte->mpte_mptcb = mp_tp;
394
395	TAILQ_INIT(&mpte->mpte_sopts);
396	TAILQ_INIT(&mpte->mpte_subflows);
397	mpte->mpte_associd = ASSOCID_ANY;
398	mpte->mpte_connid_last = CONNID_ANY;
399
400	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
401	    mppi->mppi_lock_attr);
402
403	/*
404	 * XXX: adi@apple.com
405	 *
406	 * This can be rather expensive if we have lots of MPTCP sockets,
407	 * but we need a kernel thread for this model to work.  Perhaps we
408	 * could amortize the costs by having one worker thread per a group
409	 * of MPTCP sockets.
410	 */
411	if (kernel_thread_start(mptcp_thread_func, mpte,
412	    &mpte->mpte_thread) != KERN_SUCCESS) {
413		error = ENOBUFS;
414		goto out;
415	}
416	mp_so->so_usecount++;		/* for thread */
417
418	/* MPTCP Protocol Control Block */
419	bzero(mp_tp, sizeof (*mp_tp));
420	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
421	    mppi->mppi_lock_attr);
422	mp_tp->mpt_mpte = mpte;
423
424out:
425	if (error != 0)
426		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
427	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
428	    struct sockbuf *, &mp_so->so_rcv,
429	    struct sockbuf *, &mp_so->so_snd,
430	    struct mppcb *, mpp, int, error);
431
432	return ((error != 0) ? NULL : mpte);
433}
434
435/*
436 * Destroy an MPTCP session.
437 */
438static void
439mptcp_sesdestroy(struct mptses *mpte)
440{
441	struct mptcb *mp_tp;
442
443	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
444
445	mp_tp = mpte->mpte_mptcb;
446	VERIFY(mp_tp != NULL);
447
448	/*
449	 * MPTCP Multipath PCB Extension section
450	 */
451	mptcp_flush_sopts(mpte);
452	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
453
454	lck_mtx_destroy(&mpte->mpte_thread_lock,
455	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
456
457	/*
458	 * MPTCP Protocol Control Block section
459	 */
460	lck_mtx_destroy(&mp_tp->mpt_lock,
461	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
462
463	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
464	    struct mptcb *, mp_tp);
465}
466
467/*
468 * Allocate an MPTCP socket option structure.
469 */
470struct mptopt *
471mptcp_sopt_alloc(int how)
472{
473	struct mptopt *mpo;
474
475	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
476	    zalloc_noblock(mptopt_zone);
477	if (mpo != NULL) {
478		bzero(mpo, mptopt_zone_size);
479	}
480
481	return (mpo);
482}
483
484/*
485 * Free an MPTCP socket option structure.
486 */
487void
488mptcp_sopt_free(struct mptopt *mpo)
489{
490	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
491
492	zfree(mptopt_zone, mpo);
493}
494
495/*
496 * Add a socket option to the MPTCP socket option list.
497 */
498void
499mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
500{
501	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
502	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
503	mpo->mpo_flags |= MPOF_ATTACHED;
504	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
505}
506
507/*
508 * Remove a socket option from the MPTCP socket option list.
509 */
510void
511mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
512{
513	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
514	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
515	mpo->mpo_flags &= ~MPOF_ATTACHED;
516	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
517}
518
519/*
520 * Search for an existing <sopt_level,sopt_name> socket option.
521 */
522struct mptopt *
523mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
524{
525	struct mptopt *mpo;
526
527	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
528
529	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
530		if (mpo->mpo_level == sopt->sopt_level &&
531		    mpo->mpo_name == sopt->sopt_name)
532			break;
533	}
534	VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
535
536	return (mpo);
537}
538
539/*
540 * Flushes all recorded socket options from an MP socket.
541 */
542void
543mptcp_flush_sopts(struct mptses *mpte)
544{
545	struct mptopt *mpo, *tmpo;
546
547	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
548
549	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
550		mptcp_sopt_remove(mpte, mpo);
551		mptcp_sopt_free(mpo);
552	}
553	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
554}
555
556/*
557 * Allocate a MPTCP subflow structure.
558 */
559struct mptsub *
560mptcp_subflow_alloc(int how)
561{
562	struct mptsub *mpts;
563
564	mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
565	    zalloc_noblock(mptsub_zone);
566	if (mpts != NULL) {
567		bzero(mpts, mptsub_zone_size);
568		lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
569		    mtcbinfo.mppi_lock_attr);
570	}
571
572	return (mpts);
573}
574
575/*
576 * Deallocate a subflow structure, called when all of the references held
577 * on it have been released.  This implies that the subflow has been deleted.
578 */
579void
580mptcp_subflow_free(struct mptsub *mpts)
581{
582	MPTS_LOCK_ASSERT_HELD(mpts);
583
584	VERIFY(mpts->mpts_refcnt == 0);
585	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
586	VERIFY(mpts->mpts_mpte == NULL);
587	VERIFY(mpts->mpts_socket == NULL);
588
589	if (mpts->mpts_src_sl != NULL) {
590		sockaddrlist_free(mpts->mpts_src_sl);
591		mpts->mpts_src_sl = NULL;
592	}
593	if (mpts->mpts_dst_sl != NULL) {
594		sockaddrlist_free(mpts->mpts_dst_sl);
595		mpts->mpts_dst_sl = NULL;
596	}
597	MPTS_UNLOCK(mpts);
598	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
599
600	zfree(mptsub_zone, mpts);
601}
602
603/*
604 * Create an MPTCP subflow socket.
605 */
606static int
607mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
608    struct proc *p, struct socket **so)
609{
610	struct mptopt smpo, *mpo, *tmpo;
611	struct socket *mp_so;
612	int error;
613
614	*so = NULL;
615	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
616	mp_so = mpte->mpte_mppcb->mpp_socket;
617
618	/*
619	 * Create the subflow socket (multipath subflow, non-blocking.)
620	 *
621	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
622	 * socket; it will be cleared when the socket is peeled off or closed.
623	 * It also indicates to the underlying TCP to handle MPTCP options.
624	 * A multipath subflow socket implies SS_NOFDREF state.
625	 */
626	if ((error = socreate_internal(dom, so, SOCK_STREAM,
627	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
628		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
629		    "create subflow socket error %d\n", __func__,
630		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
631		return (error);
632	}
633
634	socket_lock(*so, 0);
635	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
636	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
637	    (SS_NBIO|SS_NOFDREF));
638
639	/* prevent the socket buffers from being compressed */
640	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
641	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
642
643	bzero(&smpo, sizeof (smpo));
644	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
645	smpo.mpo_level = SOL_SOCKET;
646	smpo.mpo_intval = 1;
647
648	/* disable SIGPIPE */
649	smpo.mpo_name = SO_NOSIGPIPE;
650	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
651		goto out;
652
653	/* find out if the subflow's source address goes away */
654	smpo.mpo_name = SO_NOADDRERR;
655	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
656		goto out;
657
658	/* enable keepalive */
659	smpo.mpo_name = SO_KEEPALIVE;
660	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
661		goto out;
662
663	/*
664	 * Limit the receive socket buffer size to 64k.
665	 *
666	 * We need to take into consideration the window scale option
667	 * which could be negotiated in one subflow but disabled in
668	 * another subflow.
669	 * XXX This can be improved in the future.
670	 */
671	smpo.mpo_name = SO_RCVBUF;
672	smpo.mpo_intval = MPTCP_RWIN_MAX;
673	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
674		goto out;
675
676	/* N.B.: set by sosetopt */
677	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
678	/* Prevent automatic socket buffer sizing. */
679	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
680
681	smpo.mpo_level = IPPROTO_TCP;
682	smpo.mpo_intval = mptcp_subflow_keeptime;
683	smpo.mpo_name = TCP_KEEPALIVE;
684	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
685		goto out;
686
687	/* replay setsockopt(2) on the subflow sockets for eligible options */
688	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
689		int interim;
690
691		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
692			continue;
693
694		/*
695		 * Skip those that are handled internally; these options
696		 * should not have been recorded and marked with the
697		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
698		 */
699		if (mpo->mpo_level == SOL_SOCKET &&
700		    (mpo->mpo_name == SO_NOSIGPIPE ||
701		    mpo->mpo_name == SO_NOADDRERR ||
702		    mpo->mpo_name == SO_KEEPALIVE))
703			continue;
704
705		interim = (mpo->mpo_flags & MPOF_INTERIM);
706		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
707			char buf[32];
708			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
709			    "interim record removed\n", __func__,
710			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
711			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
712			    buf, sizeof (buf)), mpo->mpo_intval));
713			mptcp_sopt_remove(mpte, mpo);
714			mptcp_sopt_free(mpo);
715			continue;
716		}
717	}
718
719	/*
720	 * We need to receive everything that the subflow socket has,
721	 * so use a customized socket receive function.  We will undo
722	 * this when the socket is peeled off or closed.
723	 */
724	mpts->mpts_oprotosw = (*so)->so_proto;
725	switch (dom) {
726	case PF_INET:
727		(*so)->so_proto = &mptcp_subflow_protosw;
728		break;
729#if INET6
730	case PF_INET6:
731		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
732		break;
733#endif /* INET6 */
734	default:
735		VERIFY(0);
736		/* NOTREACHED */
737	}
738
739out:
740	socket_unlock(*so, 0);
741
742	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
743	    struct mptsub *, mpts, int, dom, int, error);
744
745	return (error);
746}
747
748/*
749 * Close an MPTCP subflow socket.
750 *
751 * Note that this may be called on an embryonic subflow, and the only
752 * thing that is guaranteed valid is the protocol-user request.
753 */
754static int
755mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
756{
757	MPTS_LOCK_ASSERT_HELD(mpts);
758
759	socket_lock(so, 0);
760	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
761	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
762
763	/* restore protocol-user requests */
764	VERIFY(mpts->mpts_oprotosw != NULL);
765	so->so_proto = mpts->mpts_oprotosw;
766	socket_unlock(so, 0);
767
768	mpts->mpts_socket = NULL;	/* may already be NULL */
769
770	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
771	    struct socket *, so,
772	    struct sockbuf *, &so->so_rcv,
773	    struct sockbuf *, &so->so_snd,
774	    struct mptses *, mpts->mpts_mpte);
775
776	return (soclose(so));
777}
778
779/*
780 * Connect an MPTCP subflow socket.
781 *
782 * This may be called inline as part of adding a subflow, or asynchronously
783 * by the thread (upon progressing to MPTCPF_JOIN_READY).  Note that in the
784 * pending connect case, the subflow socket may have been bound to an interface
785 * and/or a source IP address which may no longer be around by the time this
786 * routine is called; in that case the connect attempt will most likely fail.
787 */
788static int
789mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
790{
791	struct socket *so;
792	int af, error;
793
794	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
795	MPTS_LOCK_ASSERT_HELD(mpts);
796
797	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
798	    MPTSF_CONNECTING);
799	VERIFY(mpts->mpts_socket != NULL);
800	so = mpts->mpts_socket;
801	af = mpts->mpts_family;
802
803	if (af == AF_INET || af == AF_INET6) {
804		struct sockaddr_entry *dst_se;
805		char dbuf[MAX_IPv6_STR_LEN];
806
807		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
808		VERIFY(dst_se != NULL);
809
810		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
811		    "[pended %s]\n", __func__,
812		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
813		    inet_ntop(af, ((af == AF_INET) ?
814		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
815		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
816		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
817		    ntohs(SIN(dst_se->se_addr)->sin_port) :
818		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
819		    mpts->mpts_connid,
820		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
821		    "YES" : "NO")));
822	}
823
824	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
825
826	socket_lock(so, 0);
827	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
828
829	/* connect the subflow socket */
830	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
831	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
832	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
833	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
834	socket_unlock(so, 0);
835
836	/* Allocate a unique address id per subflow */
837	mpte->mpte_addrid_last++;
838	if (mpte->mpte_addrid_last == 0)
839		mpte->mpte_addrid_last++;
840
841	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
842	    struct mptsub *, mpts, int, error);
843
844	return (error);
845}
846
847/*
848 * MPTCP subflow socket receive routine, derived from soreceive().
849 */
850static int
851mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
852    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
853{
854#pragma unused(uio)
855	int flags, error = 0;
856	struct proc *p = current_proc();
857	struct mbuf *m, **mp = mp0;
858	struct mbuf *nextrecord;
859
860	socket_lock(so, 1);
861	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
862
863#ifdef MORE_LOCKING_DEBUG
864	if (so->so_usecount == 1) {
865		panic("%s: so=%x no other reference on socket\n", __func__, so);
866		/* NOTREACHED */
867	}
868#endif
869	/*
870	 * We return all that is there in the subflow's socket receive buffer
871	 * to the MPTCP layer, so we require that the caller passes in the
872	 * expected parameters.
873	 */
874	if (mp == NULL || controlp != NULL) {
875		socket_unlock(so, 1);
876		return (EINVAL);
877	}
878	*mp = NULL;
879	if (psa != NULL)
880		*psa = NULL;
881	if (flagsp != NULL)
882		flags = *flagsp &~ MSG_EOR;
883	else
884		flags = 0;
885
886	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
887		socket_unlock(so, 1);
888		return (EOPNOTSUPP);
889	}
890	flags |= (MSG_DONTWAIT|MSG_NBIO);
891
892	/*
893	 * If a recv attempt is made on a previously-accepted socket
894	 * that has been marked as inactive (disconnected), reject
895	 * the request.
896	 */
897	if (so->so_flags & SOF_DEFUNCT) {
898		struct sockbuf *sb = &so->so_rcv;
899
900		error = ENOTCONN;
901		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
902		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
903		    SOCK_DOM(so), SOCK_TYPE(so), error));
904		/*
905		 * This socket should have been disconnected and flushed
906		 * prior to being returned from sodefunct(); there should
907		 * be no data on its receive list, so panic otherwise.
908		 */
909		if (so->so_state & SS_DEFUNCT)
910			sb_empty_assert(sb, __func__);
911		socket_unlock(so, 1);
912		return (error);
913	}
914
915	/*
916	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
917	 * and if so just return to the caller.  This could happen when
918	 * soreceive() is called by a socket upcall function during the
919	 * time the socket is freed.  The socket buffer would have been
920	 * locked across the upcall, therefore we cannot put this thread
921	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
922	 * we may livelock), because the lock on the socket buffer will
923	 * only be released when the upcall routine returns to its caller.
924	 * Because the socket has been officially closed, there can be
925	 * no further read on it.
926	 *
927	 * A multipath subflow socket would have its SS_NOFDREF set by
928	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
929	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
930	 */
931	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
932	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
933		socket_unlock(so, 1);
934		return (0);
935	}
936
937	/*
938	 * For consistency with soreceive() semantics, we need to obey
939	 * SB_LOCK in case some other code path has locked the buffer.
940	 */
941	error = sblock(&so->so_rcv, 0);
942	if (error != 0) {
943		socket_unlock(so, 1);
944		return (error);
945	}
946
947	m = so->so_rcv.sb_mb;
948	if (m == NULL) {
949		/*
950		 * Panic if we notice inconsistencies in the socket's
951		 * receive list; both sb_mb and sb_cc should correctly
952		 * reflect the contents of the list, otherwise we may
953		 * end up with false positives during select() or poll()
954		 * which could put the application in a bad state.
955		 */
956		SB_MB_CHECK(&so->so_rcv);
957
958		if (so->so_error != 0) {
959			error = so->so_error;
960			so->so_error = 0;
961			goto release;
962		}
963
964		if (so->so_state & SS_CANTRCVMORE) {
965			goto release;
966		}
967
968		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
969			error = ENOTCONN;
970			goto release;
971		}
972
973		/*
974		 * MSG_DONTWAIT is implicitly defined and this routine will
975		 * never block, so return EWOULDBLOCK when there is nothing.
976		 */
977		error = EWOULDBLOCK;
978		goto release;
979	}
980
981	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
982	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
983	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
984
985	while (m != NULL) {
986		nextrecord = m->m_nextpkt;
987		sbfree(&so->so_rcv, m);
988
989		if (mp != NULL) {
990			*mp = m;
991			mp = &m->m_next;
992			so->so_rcv.sb_mb = m = m->m_next;
993			*mp = NULL;
994		}
995
996		if (m != NULL) {
997			m->m_nextpkt = nextrecord;
998			if (nextrecord == NULL)
999				so->so_rcv.sb_lastrecord = m;
1000		} else {
1001			m = so->so_rcv.sb_mb = nextrecord;
1002			SB_EMPTY_FIXUP(&so->so_rcv);
1003		}
1004		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1005		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1006	}
1007
1008	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1009	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1010	/* notify protocol that we drained all the data */
1011	if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1012		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1013
1014	if (flagsp != NULL)
1015		*flagsp |= flags;
1016
1017release:
1018	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
1019	return (error);
1020
1021}
1022
1023
1024/*
1025 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1026 * the work done earlier when the subflow socket was created.
1027 */
1028void
1029mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1030    struct socket *so)
1031{
1032	struct mptopt smpo;
1033	struct socket *mp_so;
1034	int p, c;
1035
1036	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1037	mp_so = mpte->mpte_mppcb->mpp_socket;
1038	MPTS_LOCK_ASSERT_HELD(mpts);
1039
1040	socket_lock(so, 0);
1041	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1042	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1043
1044	/* inherit MPTCP socket states */
1045	if (!(mp_so->so_state & SS_NBIO))
1046		so->so_state &= ~SS_NBIO;
1047
1048	/*
1049	 * At this point, the socket is not yet closed, as there is at least
1050	 * one outstanding usecount previously held by mpts_socket from
1051	 * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1052	 */
1053	so->so_flags &= ~SOF_MP_SUBFLOW;
1054	so->so_state &= ~SS_NOFDREF;
1055	so->so_flags &= ~SOF_MPTCP_TRUE;
1056
1057	/* allow socket buffers to be compressed */
1058	so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1059	so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1060
1061	/*
1062	 * Allow socket buffer auto sizing.
1063	 *
1064	 * This will increase the current 64k buffer size to whatever is best.
1065	 */
1066	if (!(so->so_rcv.sb_flags & SB_USRSIZE))
1067		so->so_rcv.sb_flags |= SB_AUTOSIZE;
1068	if (!(so->so_snd.sb_flags & SB_USRSIZE))
1069		so->so_snd.sb_flags |= SB_AUTOSIZE;
1070
1071	/* restore protocol-user requests */
1072	VERIFY(mpts->mpts_oprotosw != NULL);
1073	so->so_proto = mpts->mpts_oprotosw;
1074
1075	bzero(&smpo, sizeof (smpo));
1076	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1077	smpo.mpo_level = SOL_SOCKET;
1078
1079	/* inherit SOF_NOSIGPIPE from parent MP socket */
1080	p = (mp_so->so_flags & SOF_NOSIGPIPE);
1081	c = (so->so_flags & SOF_NOSIGPIPE);
1082	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1083	smpo.mpo_name = SO_NOSIGPIPE;
1084	if ((p - c) != 0)
1085		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1086
1087	/* inherit SOF_NOADDRAVAIL from parent MP socket */
1088	p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1089	c = (so->so_flags & SOF_NOADDRAVAIL);
1090	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1091	smpo.mpo_name = SO_NOADDRERR;
1092	if ((p - c) != 0)
1093		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1094
1095	/* inherit SO_KEEPALIVE from parent MP socket */
1096	p = (mp_so->so_options & SO_KEEPALIVE);
1097	c = (so->so_options & SO_KEEPALIVE);
1098	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1099	smpo.mpo_name = SO_KEEPALIVE;
1100	if ((p - c) != 0)
1101		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1102
1103	/* unset TCP level default keepalive option */
1104	p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1105	c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1106	smpo.mpo_level = IPPROTO_TCP;
1107	smpo.mpo_intval = 0;
1108	smpo.mpo_name = TCP_KEEPALIVE;
1109	if ((p - c) != 0)
1110		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1111	socket_unlock(so, 0);
1112
1113	DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1114	    struct mptsub *, mpts, struct socket *, so,
1115	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1116}
1117
1118/*
1119 * Establish an initial MPTCP connection (if first subflow and not yet
1120 * connected), or add a subflow to an existing MPTCP connection.
1121 */
1122int
1123mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1124    struct proc *p, uint32_t ifscope)
1125{
1126	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1127	struct socket *mp_so, *so = NULL;
1128	struct mptsub_connreq mpcr;
1129	struct mptcb *mp_tp;
1130	int af, error = 0;
1131
1132	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1133	mp_so = mpte->mpte_mppcb->mpp_socket;
1134	mp_tp = mpte->mpte_mptcb;
1135
1136	MPT_LOCK(mp_tp);
1137	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1138		/* If the remote end sends Data FIN, refuse subflow adds */
1139		error = ENOTCONN;
1140		MPT_UNLOCK(mp_tp);
1141		return (error);
1142	}
1143	MPT_UNLOCK(mp_tp);
1144
1145	MPTS_LOCK(mpts);
1146	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1147	VERIFY(mpts->mpts_mpte == NULL);
1148	VERIFY(mpts->mpts_socket == NULL);
1149	VERIFY(mpts->mpts_dst_sl != NULL);
1150	VERIFY(mpts->mpts_connid == CONNID_ANY);
1151
1152	/* select source (if specified) and destination addresses */
1153	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1154	    &mpts->mpts_dst_sl, &dst_se)) != 0)
1155		goto out;
1156
1157	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1158	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1159	af = mpts->mpts_family = dst_se->se_addr->sa_family;
1160	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1161	VERIFY(af == AF_INET || af == AF_INET6);
1162
1163	/*
1164	 * If the source address is not specified, allocate a storage for
1165	 * it, so that later on we can fill it in with the actual source
1166	 * IP address chosen by the underlying layer for the subflow after
1167	 * it is connected.
1168	 */
1169	if (mpts->mpts_src_sl == NULL) {
1170		mpts->mpts_src_sl =
1171		    sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1172		if (mpts->mpts_src_sl == NULL) {
1173			error = ENOBUFS;
1174			goto out;
1175		}
1176		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1177		VERIFY(se != NULL && se->se_addr != NULL &&
1178		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
1179		bzero(se->se_addr, se->se_addr->sa_len);
1180		se->se_addr->sa_len = dst_se->se_addr->sa_len;
1181		se->se_addr->sa_family = dst_se->se_addr->sa_family;
1182	}
1183
1184	/* create the subflow socket */
1185	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1186		goto out;
1187
1188	/* If fastjoin is requested, set state in mpts */
1189	if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
1190	    (mp_tp->mpt_state == MPTCPS_ESTABLISHED) &&
1191	    (mpte->mpte_nummpcapflows == 0)) {
1192		mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1193		mpts->mpts_rel_seq = 1;
1194		MPT_LOCK(mp_tp);
1195		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1196		MPT_UNLOCK(mp_tp);
1197	}
1198
1199	/*
1200	 * Increment the counter, while avoiding 0 (CONNID_ANY) and
1201	 * -1 (CONNID_ALL).
1202	 */
1203	mpte->mpte_connid_last++;
1204	if (mpte->mpte_connid_last == CONNID_ALL ||
1205	    mpte->mpte_connid_last == CONNID_ANY)
1206		mpte->mpte_connid_last++;
1207
1208	mpts->mpts_connid = mpte->mpte_connid_last;
1209	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1210	    mpts->mpts_connid != CONNID_ALL);
1211
1212	/* Allocate a unique address id per subflow */
1213	mpte->mpte_addrid_last++;
1214	if (mpte->mpte_addrid_last == 0)
1215		mpte->mpte_addrid_last++;
1216
1217	/* bind subflow socket to the specified interface */
1218	if (ifscope != IFSCOPE_NONE) {
1219		socket_lock(so, 0);
1220		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1221		if (error != 0) {
1222			socket_unlock(so, 0);
1223			(void) mptcp_subflow_soclose(mpts, so);
1224			goto out;
1225		}
1226		VERIFY(mpts->mpts_outif != NULL);
1227		mpts->mpts_flags |= MPTSF_BOUND_IF;
1228
1229		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
1230		    "cid %d\n", __func__,
1231		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1232		    mpts->mpts_outif->if_xname,
1233		    ifscope, mpts->mpts_connid));
1234		socket_unlock(so, 0);
1235	}
1236
1237	/* if source address and/or port is specified, bind to it */
1238	if (src_se != NULL) {
1239		struct sockaddr *sa = src_se->se_addr;
1240		uint32_t mpts_flags = 0;
1241		in_port_t lport;
1242
1243		switch (af) {
1244		case AF_INET:
1245			if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1246				mpts_flags |= MPTSF_BOUND_IP;
1247			if ((lport = SIN(sa)->sin_port) != 0)
1248				mpts_flags |= MPTSF_BOUND_PORT;
1249			break;
1250#if INET6
1251		case AF_INET6:
1252			VERIFY(af == AF_INET6);
1253			if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1254				mpts_flags |= MPTSF_BOUND_IP;
1255			if ((lport = SIN6(sa)->sin6_port) != 0)
1256				mpts_flags |= MPTSF_BOUND_PORT;
1257			break;
1258#endif /* INET6 */
1259		}
1260
1261		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
1262		if (error != 0) {
1263			(void) mptcp_subflow_soclose(mpts, so);
1264			goto out;
1265		}
1266		mpts->mpts_flags |= mpts_flags;
1267
1268		if (af == AF_INET || af == AF_INET6) {
1269			char sbuf[MAX_IPv6_STR_LEN];
1270
1271			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
1272			    "cid %d\n", __func__,
1273			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1274			    inet_ntop(af, ((af == AF_INET) ?
1275			    (void *)&SIN(sa)->sin_addr.s_addr :
1276			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1277			    ntohs(lport), mpts->mpts_connid));
1278		}
1279	}
1280
1281	/*
1282	 * Insert the subflow into the list, and associate the MPTCP PCB
1283	 * as well as the the subflow socket.  From this point on, removing
1284	 * the subflow needs to be done via mptcp_subflow_del().
1285	 */
1286	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1287	mpte->mpte_numflows++;
1288
1289	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1290	mpts->mpts_mpte = mpte;
1291	mpts->mpts_socket = so;
1292	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
1293	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
1294	mp_so->so_usecount++;		/* for subflow socket */
1295
1296	/* register for subflow socket read/write events */
1297	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1298	    mptcp_subflow_wupcall, mpts);
1299
1300	/*
1301	 * Register for subflow socket control events; ignore
1302	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1303	 * will generate it here.
1304	 */
1305	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1306	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1307	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1308	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1309	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1310	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1311	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1312	    SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1313	    SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
1314
1315	/* sanity check */
1316	VERIFY(!(mpts->mpts_flags &
1317	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1318
1319	bzero(&mpcr, sizeof (mpcr));
1320	mpcr.mpcr_proc = p;
1321	mpcr.mpcr_ifscope = ifscope;
1322	/*
1323	 * Indicate to the TCP subflow whether or not it should establish
1324	 * the initial MPTCP connection, or join an existing one.  Fill
1325	 * in the connection request structure with additional info needed
1326	 * by the underlying TCP (to be used in the TCP options, etc.)
1327	 */
1328	MPT_LOCK(mp_tp);
1329	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1330		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1331			mp_tp->mpt_localkey = mptcp_reserve_key();
1332			mptcp_conn_properties(mp_tp);
1333		}
1334		MPT_UNLOCK(mp_tp);
1335		soisconnecting(mp_so);
1336		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1337	} else {
1338		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1339			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1340
1341		/* avoid starting up cellular subflow unless required */
1342		if ((mptcp_delayed_subf_start) &&
1343		    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1344		    	mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1345		}
1346		MPT_UNLOCK(mp_tp);
1347		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1348	}
1349
1350	mpts->mpts_mpcr = mpcr;
1351	mpts->mpts_flags |= MPTSF_CONNECTING;
1352
1353	if (af == AF_INET || af == AF_INET6) {
1354		char dbuf[MAX_IPv6_STR_LEN];
1355
1356		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
1357		    "[pending %s]\n", __func__,
1358		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1359		    inet_ntop(af, ((af == AF_INET) ?
1360		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1361		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1362		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1363		    ntohs(SIN(dst_se->se_addr)->sin_port) :
1364		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1365		    mpts->mpts_connid,
1366		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1367		    "YES" : "NO")));
1368	}
1369
1370	/* connect right away if first attempt, or if join can be done now */
1371	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1372		error = mptcp_subflow_soconnectx(mpte, mpts);
1373
1374out:
1375	MPTS_UNLOCK(mpts);
1376	if (error == 0) {
1377		soevent(mp_so, SO_FILT_HINT_LOCKED |
1378		    SO_FILT_HINT_CONNINFO_UPDATED);
1379	}
1380	return (error);
1381}
1382
1383/*
1384 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
1385 * will no longer be accessible after a subflow is deleted, thus this
1386 * should occur only after the subflow socket has been disconnected.
1387 * If peeloff(2) is called, leave the socket open.
1388 */
1389void
1390mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1391{
1392	struct socket *mp_so, *so;
1393
1394	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1395	mp_so = mpte->mpte_mppcb->mpp_socket;
1396
1397	MPTS_LOCK(mpts);
1398	so = mpts->mpts_socket;
1399	VERIFY(so != NULL);
1400
1401	if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1402	    (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1403		MPTS_UNLOCK(mpts);
1404		mptcplog((LOG_DEBUG, "%s: %d %x\n", __func__,
1405		    mpts->mpts_soerror, mpts->mpts_flags));
1406		return;
1407	}
1408
1409	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
1410	    "[close %s] %d %x\n", __func__,
1411	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1412	    mp_so->so_usecount,
1413	    mp_so->so_retaincnt, mpts->mpts_connid,
1414	    (close ? "YES" : "NO"), mpts->mpts_soerror,
1415	    mpts->mpts_flags));
1416
1417	VERIFY(mpts->mpts_mpte == mpte);
1418	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1419	    mpts->mpts_connid != CONNID_ALL);
1420
1421	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1422	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1423	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1424	VERIFY(mpte->mpte_numflows != 0);
1425	mpte->mpte_numflows--;
1426	if (mpte->mpte_active_sub == mpts)
1427		mpte->mpte_active_sub = NULL;
1428
1429	/*
1430	 * Drop references held by this subflow socket; there
1431	 * will be no further upcalls made from this point.
1432	 */
1433	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1434	(void) sock_catchevents(so, NULL, NULL, 0);
1435
1436	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
1437
1438	if (close)
1439		(void) mptcp_subflow_soclose(mpts, so);
1440
1441	VERIFY(mp_so->so_usecount != 0);
1442	mp_so->so_usecount--;		/* for subflow socket */
1443	mpts->mpts_mpte = NULL;
1444	mpts->mpts_socket = NULL;
1445	MPTS_UNLOCK(mpts);
1446
1447	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
1448	MPTS_REMREF(mpts);		/* for subflow socket */
1449
1450	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1451}
1452
1453/*
1454 * Disconnect a subflow socket.
1455 */
1456void
1457mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1458    boolean_t deleteok)
1459{
1460	struct socket *so;
1461	struct mptcb *mp_tp;
1462	int send_dfin = 0;
1463
1464	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1465	MPTS_LOCK_ASSERT_HELD(mpts);
1466
1467	VERIFY(mpts->mpts_mpte == mpte);
1468	VERIFY(mpts->mpts_socket != NULL);
1469	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1470	    mpts->mpts_connid != CONNID_ALL);
1471
1472	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1473		return;
1474
1475	mpts->mpts_flags |= MPTSF_DISCONNECTING;
1476
1477	/*
1478	 * If this is coming from disconnectx(2) or issued as part of
1479	 * closing the MPTCP socket, the subflow shouldn't stick around.
1480	 * Otherwise let it linger around in case the upper layers need
1481	 * to retrieve its conninfo.
1482	 */
1483	if (deleteok)
1484		mpts->mpts_flags |= MPTSF_DELETEOK;
1485
1486	so = mpts->mpts_socket;
1487	mp_tp = mpte->mpte_mptcb;
1488	MPT_LOCK(mp_tp);
1489	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1490		send_dfin = 1;
1491	MPT_UNLOCK(mp_tp);
1492
1493	socket_lock(so, 0);
1494	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1495	    (so->so_state & SS_ISCONNECTED)) {
1496		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
1497		    __func__, mpts->mpts_connid, send_dfin,
1498		    (deleteok ? "NO" : "YES")));
1499
1500		if (send_dfin)
1501			mptcp_send_dfin(so);
1502		(void) soshutdownlock(so, SHUT_RD);
1503		(void) soshutdownlock(so, SHUT_WR);
1504		(void) sodisconnectlocked(so);
1505	}
1506	socket_unlock(so, 0);
1507	/*
1508	 * Generate a disconnect event for this subflow socket, in case
1509	 * the lower layer doesn't do it; this is needed because the
1510	 * subflow socket deletion relies on it.  This will also end up
1511	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1512	 * we cannot do that here because subflow lock is currently held.
1513	 */
1514	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1515}
1516
1517/*
1518 * Subflow socket read upcall.
1519 *
1520 * Called when the associated subflow socket posted a read event.  The subflow
1521 * socket lock has been released prior to invoking the callback.  Note that the
1522 * upcall may occur synchronously as a result of MPTCP performing an action on
1523 * it, or asynchronously as a result of an event happening at the subflow layer.
1524 * Therefore, to maintain lock ordering, the only lock that can be acquired
1525 * here is the thread lock, for signalling purposes.
1526 */
1527static void
1528mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1529{
1530#pragma unused(so, waitf)
1531	struct mptsub *mpts = arg;
1532	struct mptses *mpte = mpts->mpts_mpte;
1533
1534	/*
1535	 * mpte should never be NULL, except in a race with
1536	 * mptcp_subflow_del
1537	 */
1538	if (mpte == NULL)
1539		return;
1540
1541	lck_mtx_lock(&mpte->mpte_thread_lock);
1542	mptcp_thread_signal_locked(mpte);
1543	lck_mtx_unlock(&mpte->mpte_thread_lock);
1544}
1545
1546/*
1547 * Subflow socket input.
1548 *
1549 * Called in the context of the MPTCP thread, for reading data from the
1550 * underlying subflow socket and delivering it to MPTCP.
1551 */
1552static void
1553mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1554{
1555	struct mbuf *m = NULL;
1556	struct socket *so;
1557	int error;
1558	struct mptsub *mpts_alt = NULL;
1559
1560	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1561	MPTS_LOCK_ASSERT_HELD(mpts);
1562
1563	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1564	    struct mptsub *, mpts);
1565
1566	if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1567		return;
1568
1569	so = mpts->mpts_socket;
1570
1571	error = sock_receive_internal(so, NULL, &m, 0, NULL);
1572	if (error != 0 && error != EWOULDBLOCK) {
1573		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
1574		    __func__, mpts->mpts_connid, error));
1575		MPTS_UNLOCK(mpts);
1576		mpts_alt = mptcp_get_subflow(mpte, mpts);
1577		if (mpts_alt == NULL) {
1578			if (mptcp_delayed_subf_start) {
1579				mpts_alt = mptcp_get_pending_subflow(mpte,
1580				    mpts);
1581				if (mpts_alt) {
1582					mptcplog((LOG_INFO,"%s: pending %d\n",
1583					    __func__, mpts_alt->mpts_connid));
1584				} else {
1585					mptcplog((LOG_ERR, "%s: no pending",
1586					    "%d\n", __func__,
1587					    mpts->mpts_connid));
1588					mpte->mpte_mppcb->mpp_socket->so_error =
1589					    error;
1590				}
1591			} else {
1592				mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
1593				    __func__, mpts->mpts_connid));
1594				mpte->mpte_mppcb->mpp_socket->so_error = error;
1595			}
1596		}
1597		MPTS_LOCK(mpts);
1598	} else if (error == 0) {
1599		mptcplog3((LOG_DEBUG, "%s: cid %d \n",
1600		    __func__, mpts->mpts_connid));
1601	}
1602
1603	/* In fallback, make sure to accept data on all but one subflow */
1604	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1605	    (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1606		m_freem(m);
1607		return;
1608	}
1609
1610	if (m != NULL) {
1611		/*
1612		 * Release subflow lock since this may trigger MPTCP to send,
1613		 * possibly on a different subflow.  An extra reference has
1614		 * been held on the subflow by the MPTCP thread before coming
1615		 * here, so we can be sure that it won't go away, in the event
1616		 * the MP socket lock gets released.
1617		 */
1618		MPTS_UNLOCK(mpts);
1619		mptcp_input(mpte, m);
1620		MPTS_LOCK(mpts);
1621	}
1622}
1623
1624/*
1625 * Subflow socket write upcall.
1626 *
1627 * Called when the associated subflow socket posted a read event.  The subflow
1628 * socket lock has been released prior to invoking the callback.  Note that the
1629 * upcall may occur synchronously as a result of MPTCP performing an action on
1630 * it, or asynchronously as a result of an event happening at the subflow layer.
1631 * Therefore, to maintain lock ordering, the only lock that can be acquired
1632 * here is the thread lock, for signalling purposes.
1633 */
1634static void
1635mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1636{
1637#pragma unused(so, waitf)
1638	struct mptsub *mpts = arg;
1639	struct mptses *mpte = mpts->mpts_mpte;
1640
1641	/*
1642	 * mpte should never be NULL except in a race with
1643	 * mptcp_subflow_del which doesn't hold socket lock across critical
1644	 * section. This upcall is made after releasing the socket lock.
1645	 * Interleaving of socket operations becomes possible therefore.
1646	 */
1647	if (mpte == NULL)
1648		return;
1649
1650	lck_mtx_lock(&mpte->mpte_thread_lock);
1651	mptcp_thread_signal_locked(mpte);
1652	lck_mtx_unlock(&mpte->mpte_thread_lock);
1653}
1654
1655/*
1656 * Subflow socket output.
1657 *
1658 * Called for sending data from MPTCP to the underlying subflow socket.
1659 */
1660int
1661mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1662{
1663	struct socket *mp_so, *so;
1664	size_t sb_cc = 0, tot_sent = 0;
1665	struct mbuf *sb_mb;
1666	int error = 0;
1667	u_int64_t mpt_dsn = 0;
1668	struct mptcb *mp_tp = mpte->mpte_mptcb;
1669	struct mbuf *mpt_mbuf = NULL;
1670	u_int64_t off = 0;
1671	struct mbuf *head, *tail;
1672
1673	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1674	MPTS_LOCK_ASSERT_HELD(mpts);
1675	mp_so = mpte->mpte_mppcb->mpp_socket;
1676	so = mpts->mpts_socket;
1677
1678	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1679	    struct mptsub *, mpts);
1680
1681	/* subflow socket is suspended? */
1682	if (mpts->mpts_flags & MPTSF_SUSPENDED) {
1683		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
1684		    "controlled\n", __func__,
1685		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1686		goto out;
1687	}
1688
1689	/* subflow socket is not MPTCP capable? */
1690	if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
1691	    !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1692	    !(mpts->mpts_flags & MPTSF_FASTJ_SEND)) {
1693		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
1694		    "MPTCP capable\n", __func__,
1695		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1696		goto out;
1697	}
1698
1699	/* Remove Addr Option is not sent reliably as per I-D */
1700	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1701		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1702		tp->t_rem_aid = mpte->mpte_lost_aid;
1703		if (mptcp_remaddr_enable)
1704			tp->t_mpflags |= TMPF_SND_REM_ADDR;
1705		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1706	}
1707
1708	/*
1709	 * The mbuf chains containing the metadata (as well as pointing to
1710	 * the user data sitting at the MPTCP output queue) would then be
1711	 * sent down to the subflow socket.
1712	 *
1713	 * Some notes on data sequencing:
1714	 *
1715	 *   a. Each mbuf must be a M_PKTHDR.
1716	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
1717	 *	in the mbuf pkthdr structure.
1718	 *   c. Each mbuf containing the MPTCP metadata must have its
1719	 *	pkt_flags marked with the PKTF_MPTCP flag.
1720	 */
1721
1722	/* First, drop acknowledged data */
1723	sb_mb = mp_so->so_snd.sb_mb;
1724	if (sb_mb == NULL) {
1725		goto out;
1726	}
1727
1728	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1729
1730	mpt_mbuf = sb_mb;
1731	while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1732		mpt_mbuf = mpt_mbuf->m_next;
1733	}
1734	if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1735		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1736	} else {
1737		goto out;
1738	}
1739
1740	MPT_LOCK(mp_tp);
1741	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
1742		u_int64_t len = 0;
1743		len = mp_tp->mpt_snduna - mpt_dsn;
1744		sbdrop(&mp_so->so_snd, (int)len);
1745
1746	}
1747
1748	/*
1749	 * In degraded mode, we don't receive data acks, so force free
1750	 * mbufs less than snd_nxt
1751	 */
1752	if (mp_so->so_snd.sb_mb == NULL) {
1753		MPT_UNLOCK(mp_tp);
1754		goto out;
1755	}
1756
1757	mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1758	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1759	    (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
1760	    MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
1761		u_int64_t len = 0;
1762		len = mp_tp->mpt_sndnxt - mpt_dsn;
1763		sbdrop(&mp_so->so_snd, (int)len);
1764		mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1765	}
1766
1767	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1768	    !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1769		mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1770		so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1771		if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1772			mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1773	}
1774
1775	/*
1776	 * Adjust the subflow's notion of next byte to send based on
1777	 * the last unacknowledged byte
1778	 */
1779	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1780		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1781		/*
1782		 * With FastJoin, a write before the fastjoin event will use
1783		 * an uninitialized relative sequence number.
1784		 */
1785		if (mpts->mpts_rel_seq == 0)
1786			mpts->mpts_rel_seq = 1;
1787	}
1788
1789	/*
1790	 * Adjust the top level notion of next byte used for retransmissions
1791	 * and sending FINs.
1792	 */
1793	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1794		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1795	}
1796
1797
1798	/* Now determine the offset from which to start transmitting data */
1799	sb_mb = mp_so->so_snd.sb_mb;
1800	sb_cc = mp_so->so_snd.sb_cc;
1801	if (sb_mb == NULL) {
1802		MPT_UNLOCK(mp_tp);
1803		goto out;
1804	}
1805	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1806		off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
1807		sb_cc -= (size_t)off;
1808	} else {
1809		MPT_UNLOCK(mp_tp);
1810		goto out;
1811	}
1812	MPT_UNLOCK(mp_tp);
1813
1814	mpt_mbuf = sb_mb;
1815	mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1816
1817	while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
1818	    (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
1819		off -= mpt_mbuf->m_pkthdr.mp_rlen;
1820		mpt_mbuf = mpt_mbuf->m_next;
1821		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1822	}
1823	if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1824		mptcplog2((LOG_INFO, "%s: snduna = %llu off = %lld id = %d"
1825		    " %llu \n",
1826		    __func__,
1827		    mp_tp->mpt_snduna, off, mpts->mpts_connid,
1828		    mpts->mpts_sndnxt));
1829
1830	VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1831
1832	head = tail = NULL;
1833
1834	while (tot_sent < sb_cc) {
1835		struct mbuf *m;
1836		size_t mlen;
1837
1838		mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1839		mlen -= off;
1840		if (mlen == 0)
1841			goto out;
1842
1843		if (mlen > sb_cc) {
1844			panic("%s: unexpected %lu %lu \n", __func__,
1845			    mlen, sb_cc);
1846		}
1847
1848		m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
1849		    M_COPYM_MUST_COPY_HDR);
1850		if (m == NULL) {
1851			error = ENOBUFS;
1852			break;
1853		}
1854
1855		/* Create a DSN mapping for the data (m_copym does it) */
1856		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1857		VERIFY(m->m_flags & M_PKTHDR);
1858		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1859		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
1860		m->m_pkthdr.mp_dsn = mpt_dsn + off;
1861		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
1862		m->m_pkthdr.mp_rlen = mlen;
1863		mpts->mpts_rel_seq += mlen;
1864		m->m_pkthdr.len = mlen;
1865
1866		if (head == NULL) {
1867			 head = tail = m;
1868		} else {
1869			tail->m_next = m;
1870			tail = m;
1871		}
1872
1873		/* last contiguous mapping is stored for error cases */
1874		if (mpts->mpts_lastmap.mptsl_dsn +
1875		    mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
1876			mpts->mpts_lastmap.mptsl_len += tot_sent;
1877		} else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
1878		    mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
1879			if (m->m_pkthdr.mp_dsn == 0)
1880				panic("%s %llu", __func__, mpt_dsn);
1881			mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
1882			mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
1883			mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;
1884		}
1885
1886		tot_sent += mlen;
1887		off = 0;
1888		mpt_mbuf = mpt_mbuf->m_next;
1889	}
1890
1891	if (head != NULL) {
1892
1893		if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
1894			struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1895			tp->t_mpflags |= TMPF_FASTJOIN_SEND;
1896		}
1897
1898		error = sock_sendmbuf(so, NULL, head, 0, NULL);
1899
1900		DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
1901		    struct sockbuf *, &so->so_rcv,
1902		    struct sockbuf *, &so->so_snd,
1903		    struct mptses *, mpte, struct mptsub *, mpts,
1904		    size_t, tot_sent);
1905	}
1906
1907	if (error == 0) {
1908		mpts->mpts_sndnxt += tot_sent;
1909		MPT_LOCK(mp_tp);
1910		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
1911			if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
1912			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
1913				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
1914			mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
1915		}
1916		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
1917		MPT_UNLOCK(mp_tp);
1918
1919		/* Send once in SYN_SENT state to avoid sending SYN spam */
1920		if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
1921			so->so_flags &= ~SOF_MPTCP_FASTJOIN;
1922			mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
1923		}
1924
1925		if ((mpts->mpts_connid >= 2) ||
1926		    (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1927			mptcplog2((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
1928			    __func__, mpts->mpts_connid, (int)tot_sent,
1929			    (int) sb_cc));
1930	} else {
1931		mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d len %zd\n",
1932		    __func__, mpts->mpts_connid, error, tot_sent));
1933	}
1934out:
1935	return (error);
1936}
1937
1938/*
1939 * Subflow socket control event upcall.
1940 *
1941 * Called when the associated subflow socket posted one or more control events.
1942 * The subflow socket lock has been released prior to invoking the callback.
1943 * Note that the upcall may occur synchronously as a result of MPTCP performing
1944 * an action on it, or asynchronously as a result of an event happening at the
1945 * subflow layer.  Therefore, to maintain lock ordering, the only lock that can
1946 * be acquired here is the thread lock, for signalling purposes.
1947 */
1948static void
1949mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
1950{
1951#pragma unused(so)
1952	struct mptsub *mpts = arg;
1953	struct mptses *mpte = mpts->mpts_mpte;
1954
1955	VERIFY(mpte != NULL);
1956
1957	lck_mtx_lock(&mpte->mpte_thread_lock);
1958	atomic_bitset_32(&mpts->mpts_evctl, events);
1959	mptcp_thread_signal_locked(mpte);
1960	lck_mtx_unlock(&mpte->mpte_thread_lock);
1961}
1962
1963/*
1964 * Subflow socket control events.
1965 *
1966 * Called for handling events related to the underlying subflow socket.
1967 */
1968static ev_ret_t
1969mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
1970{
1971	uint32_t events, save_events;
1972	ev_ret_t ret = MPTS_EVRET_OK;
1973
1974	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1975	MPTS_LOCK_ASSERT_HELD(mpts);
1976
1977	/* bail if there's nothing to process */
1978	if ((events = mpts->mpts_evctl) == 0)
1979		return (ret);
1980
1981	if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
1982	    SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
1983	    SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
1984	    SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
1985	    SO_FILT_HINT_DISCONNECTED)) {
1986		events |= SO_FILT_HINT_MPFAILOVER;
1987	}
1988
1989	save_events = events;
1990
1991	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
1992	    struct mptsub *, mpts, uint32_t, events);
1993
1994	mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
1995	    mpts->mpts_connid, events, SO_FILT_HINT_BITS));
1996
1997	if ((events & SO_FILT_HINT_MPCANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
1998		ev_ret_t error = mptcp_subflow_mpcantrcvmore_ev(mpte, mpts);
1999		events &= ~SO_FILT_HINT_MPCANTRCVMORE;
2000		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2001	}
2002	if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
2003		ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
2004		events &= ~SO_FILT_HINT_MPFAILOVER;
2005		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2006	}
2007	if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
2008		ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
2009		events &= ~SO_FILT_HINT_CONNRESET;
2010		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2011	}
2012	if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
2013		ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
2014		events &= ~SO_FILT_HINT_MUSTRST;
2015		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2016	}
2017	if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
2018		ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
2019		events &= ~SO_FILT_HINT_CANTRCVMORE;
2020		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2021	}
2022	if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
2023		ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
2024		events &= ~SO_FILT_HINT_CANTSENDMORE;
2025		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2026	}
2027	if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
2028		ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
2029		events &= ~SO_FILT_HINT_TIMEOUT;
2030		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2031	}
2032	if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
2033		ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
2034		events &= ~SO_FILT_HINT_NOSRCADDR;
2035		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2036	}
2037	if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
2038		ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
2039		events &= ~SO_FILT_HINT_IFDENIED;
2040		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2041	}
2042	if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
2043		ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
2044		events &= ~SO_FILT_HINT_SUSPEND;
2045		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2046	}
2047	if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
2048		ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
2049		events &= ~SO_FILT_HINT_RESUME;
2050		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2051	}
2052	if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
2053		ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
2054		events &= ~SO_FILT_HINT_CONNECTED;
2055		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2056	}
2057	if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
2058		ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
2059		events &= ~SO_FILT_HINT_MPSTATUS;
2060		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2061	}
2062	if ((events & SO_FILT_HINT_DELETEOK) && (ret >= MPTS_EVRET_OK)) {
2063		ev_ret_t error = mptcp_deleteok_ev(mpte, mpts);
2064		events &= ~SO_FILT_HINT_DELETEOK;
2065		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2066	}
2067	if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
2068		ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
2069		events &= ~SO_FILT_HINT_DISCONNECTED;
2070		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2071	}
2072	if ((events & SO_FILT_HINT_MPFASTJ) && (ret >= MPTS_EVRET_OK)) {
2073		ev_ret_t error = mptcp_fastjoin_ev(mpte, mpts);
2074		events &= ~SO_FILT_HINT_MPFASTJ;
2075		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2076	}
2077
2078	/*
2079	 * We should be getting only events specified via sock_catchevents(),
2080	 * so loudly complain if we have any unprocessed one(s).
2081	 */
2082	if (events != 0 || ret < MPTS_EVRET_OK) {
2083		mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
2084		    " unhandled events=%b\n",
2085		    (events != 0) ? "MPTCP_ERROR " : "",
2086		    __func__, mpts->mpts_connid,
2087		    mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));
2088	}
2089
2090	/* clear the ones we've processed */
2091	atomic_bitclear_32(&mpts->mpts_evctl, save_events);
2092
2093	return (ret);
2094}
2095
2096/*
2097 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2098 */
2099static ev_ret_t
2100mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
2101{
2102	struct socket *mp_so, *so;
2103	struct mptcb *mp_tp;
2104	boolean_t linger;
2105
2106	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2107	MPTS_LOCK_ASSERT_HELD(mpts);
2108	VERIFY(mpte->mpte_mppcb != NULL);
2109	mp_so = mpte->mpte_mppcb->mpp_socket;
2110	mp_tp = mpte->mpte_mptcb;
2111	so = mpts->mpts_socket;
2112
2113	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2114	    !(mp_so->so_flags & SOF_PCBCLEARING));
2115
2116	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2117	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2118
2119	/*
2120	 * We got a TCP RST for this subflow connection.
2121	 *
2122	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
2123	 * client if the MPTCP connection has not been established or
2124	 * if the connection has only one subflow and is a connection being
2125	 * resumed. Otherwise we close the socket.
2126	 */
2127	mptcp_subflow_disconnect(mpte, mpts, !linger);
2128
2129	MPT_LOCK(mp_tp);
2130	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2131		mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
2132	} else if (mpte->mpte_nummpcapflows < 1) {
2133		mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
2134		MPT_UNLOCK(mp_tp);
2135		MPTS_UNLOCK(mpts);
2136		soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET);
2137		MPTS_LOCK(mpts);
2138		MPT_LOCK(mp_tp);
2139	}
2140	MPT_UNLOCK(mp_tp);
2141
2142	/*
2143	 * Keep the subflow socket around, unless the MPTCP socket has
2144	 * been detached or the subflow has been disconnected explicitly,
2145	 * in which case it should be deleted right away.
2146	 */
2147	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2148}
2149
2150/*
2151 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2152 */
2153static ev_ret_t
2154mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2155{
2156	struct socket *so;
2157
2158	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2159	MPTS_LOCK_ASSERT_HELD(mpts);
2160
2161	so = mpts->mpts_socket;
2162
2163	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2164
2165	/*
2166	 * We got a FIN for this subflow connection.  This subflow socket
2167	 * is no longer available for receiving data;
2168	 * The FIN may arrive with data. The data is handed up to the
2169	 * mptcp socket and the subflow is disconnected.
2170	 */
2171
2172	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2173}
2174
2175/*
2176 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2177 */
2178static ev_ret_t
2179mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
2180{
2181	struct socket *so;
2182
2183	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2184	MPTS_LOCK_ASSERT_HELD(mpts);
2185
2186	so = mpts->mpts_socket;
2187
2188	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2189	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2190}
2191
2192/*
2193 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2194 */
2195static ev_ret_t
2196mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
2197{
2198	struct socket *mp_so, *so;
2199	struct mptcb *mp_tp;
2200	boolean_t linger;
2201
2202	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2203	MPTS_LOCK_ASSERT_HELD(mpts);
2204	VERIFY(mpte->mpte_mppcb != NULL);
2205	mp_so = mpte->mpte_mppcb->mpp_socket;
2206	mp_tp = mpte->mpte_mptcb;
2207	so = mpts->mpts_socket;
2208
2209	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2210	    !(mp_so->so_flags & SOF_PCBCLEARING));
2211
2212	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
2213	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2214
2215	if (mpts->mpts_soerror == 0)
2216		mpts->mpts_soerror = ETIMEDOUT;
2217
2218	/*
2219	 * The subflow connection has timed out.
2220	 *
2221	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2222	 * client if the MPTCP connection has not been established. Otherwise
2223	 * drop it.
2224	 */
2225	mptcp_subflow_disconnect(mpte, mpts, !linger);
2226
2227	MPT_LOCK(mp_tp);
2228	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2229		mp_so->so_error = ETIMEDOUT;
2230	}
2231	MPT_UNLOCK(mp_tp);
2232
2233	/*
2234	 * Keep the subflow socket around, unless the MPTCP socket has
2235	 * been detached or the subflow has been disconnected explicitly,
2236	 * in which case it should be deleted right away.
2237	 */
2238	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2239}
2240
2241/*
2242 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2243 */
2244static ev_ret_t
2245mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
2246{
2247	struct socket *mp_so, *so;
2248	struct mptcb *mp_tp;
2249	boolean_t linger;
2250	struct tcpcb *tp = NULL;
2251
2252	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2253	MPTS_LOCK_ASSERT_HELD(mpts);
2254
2255	VERIFY(mpte->mpte_mppcb != NULL);
2256	mp_so = mpte->mpte_mppcb->mpp_socket;
2257	mp_tp = mpte->mpte_mptcb;
2258	so = mpts->mpts_socket;
2259
2260	/* Not grabbing socket lock as t_local_aid is write once only */
2261	tp = intotcpcb(sotoinpcb(so));
2262	/*
2263	 * This overwrites any previous mpte_lost_aid to avoid storing
2264	 * too much state when the typical case has only two subflows.
2265	 */
2266	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2267	mpte->mpte_lost_aid = tp->t_local_aid;
2268
2269	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2270	    !(mp_so->so_flags & SOF_PCBCLEARING));
2271
2272	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2273	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2274
2275	if (mpts->mpts_soerror == 0)
2276		mpts->mpts_soerror = EADDRNOTAVAIL;
2277
2278	/*
2279	 * The subflow connection has lost its source address.
2280	 *
2281	 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2282	 * client if the MPTCP connection has not been established.  If it
2283	 * has been established with one subflow , we keep the MPTCP
2284	 * connection valid without any subflows till closed by application.
2285	 * This lets tcp connection manager decide whether to close this or
2286	 * not as it reacts to reachability changes too.
2287	 */
2288	mptcp_subflow_disconnect(mpte, mpts, !linger);
2289
2290	MPT_LOCK(mp_tp);
2291	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2292	    (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2293		mp_so->so_error = EADDRNOTAVAIL;
2294	}
2295	MPT_UNLOCK(mp_tp);
2296
2297	/*
2298	 * Keep the subflow socket around, unless the MPTCP socket has
2299	 * been detached or the subflow has been disconnected explicitly,
2300	 * in which case it should be deleted right away.
2301	 */
2302	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2303}
2304
2305/*
2306 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2307 * indicates that the remote side sent a Data FIN
2308 */
2309static ev_ret_t
2310mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2311{
2312	struct socket *so, *mp_so;
2313	struct mptcb *mp_tp;
2314
2315	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
2316	MPTS_LOCK_ASSERT_HELD(mpts);
2317	mp_so = mpte->mpte_mppcb->mpp_socket;
2318	so = mpts->mpts_socket;
2319	mp_tp = mpte->mpte_mptcb;
2320
2321	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2322
2323	/*
2324	* We got a Data FIN for the MPTCP connection.
2325	* The FIN may arrive with data. The data is handed up to the
2326	* mptcp socket and the user is notified so that it may close
2327	* the socket if needed.
2328	*/
2329	MPT_LOCK(mp_tp);
2330	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
2331		MPT_UNLOCK(mp_tp);
2332		MPTS_UNLOCK(mpts);
2333		soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
2334		MPTS_LOCK(mpts);
2335		MPT_LOCK(mp_tp);
2336	}
2337	MPT_UNLOCK(mp_tp);
2338	return (MPTS_EVRET_OK); /* keep the subflow socket around */
2339}
2340
2341/*
2342 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2343 */
2344static ev_ret_t
2345mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
2346{
2347	struct mptsub *mpts_alt = NULL;
2348	struct socket *so = NULL;
2349	struct socket *mp_so;
2350	int altpath_exists = 0;
2351
2352	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
2353	MPTS_LOCK_ASSERT_HELD(mpts);
2354	mp_so = mpte->mpte_mppcb->mpp_socket;
2355	mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
2356	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
2357
2358	MPTS_UNLOCK(mpts);
2359	mpts_alt = mptcp_get_subflow(mpte, mpts);
2360
2361	/*
2362	 * If there is no alternate eligible subflow, ignore the
2363	 * failover hint.
2364	 */
2365	if (mpts_alt == NULL) {
2366		mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
2367		if (mptcp_delayed_subf_start) {
2368			mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2369			if (mpts_alt != NULL) {
2370				MPTS_LOCK(mpts_alt);
2371				(void) mptcp_subflow_soconnectx(mpte,
2372				    mpts_alt);
2373				MPTS_UNLOCK(mpts_alt);
2374			}
2375		}
2376		MPTS_LOCK(mpts);
2377		goto done;
2378	}
2379	MPTS_LOCK(mpts_alt);
2380	altpath_exists = 1;
2381	so = mpts_alt->mpts_socket;
2382	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2383		socket_lock(so, 1);
2384		/* All data acknowledged and no RTT spike */
2385		if ((so->so_snd.sb_cc == 0) &&
2386		    (mptcp_no_rto_spike(so))) {
2387			so->so_flags &= ~SOF_MP_TRYFAILOVER;
2388			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2389		} else {
2390			/* no alternate path available */
2391			altpath_exists = 0;
2392		}
2393		socket_unlock(so, 1);
2394	}
2395	if (altpath_exists) {
2396		mptcplog2((LOG_INFO, "%s: cid = %d\n",
2397		    __func__, mpts_alt->mpts_connid));
2398		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
2399		struct mptcb *mp_tp = mpte->mpte_mptcb;
2400		/* Bring the subflow's notion of snd_nxt into the send window */
2401		MPT_LOCK(mp_tp);
2402		mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2403		MPT_UNLOCK(mp_tp);
2404		mpte->mpte_active_sub = mpts_alt;
2405		socket_lock(so, 1);
2406		sowwakeup(so);
2407		socket_unlock(so, 1);
2408	}
2409	MPTS_UNLOCK(mpts_alt);
2410
2411	if (altpath_exists) {
2412		soevent(mp_so,
2413		    SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2414		mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
2415		    "%d to %d\n", __func__,
2416		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2417		    mpts->mpts_connid, mpts_alt->mpts_connid));
2418		tcpstat.tcps_mp_switches++;
2419	}
2420
2421	MPTS_LOCK(mpts);
2422	if (altpath_exists) {
2423		mpts->mpts_flags |= MPTSF_FAILINGOVER;
2424		mpts->mpts_flags &= ~MPTSF_ACTIVE;
2425	} else {
2426		mptcplog2((LOG_INFO, "%s: no alt cid = %d\n",
2427		    __func__, mpts->mpts_connid));
2428done:
2429		so = mpts->mpts_socket;
2430		socket_lock(so, 1);
2431		so->so_flags &= ~SOF_MP_TRYFAILOVER;
2432		socket_unlock(so, 1);
2433	}
2434	MPTS_LOCK_ASSERT_HELD(mpts);
2435	return (MPTS_EVRET_OK);
2436}
2437
2438/*
2439 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2440 */
2441static ev_ret_t
2442mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
2443{
2444	struct socket *mp_so, *so;
2445	struct mptcb *mp_tp;
2446	boolean_t linger;
2447
2448	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2449	MPTS_LOCK_ASSERT_HELD(mpts);
2450	VERIFY(mpte->mpte_mppcb != NULL);
2451	mp_so = mpte->mpte_mppcb->mpp_socket;
2452	mp_tp = mpte->mpte_mptcb;
2453	so = mpts->mpts_socket;
2454
2455	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2456	    !(mp_so->so_flags & SOF_PCBCLEARING));
2457
2458	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2459	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2460
2461	if (mpts->mpts_soerror == 0)
2462		mpts->mpts_soerror = EHOSTUNREACH;
2463
2464	/*
2465	 * The subflow connection cannot use the outgoing interface.
2466	 *
2467	 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2468	 * client if the MPTCP connection has not been established.  If it
2469	 * has been established, let the upper layer call disconnectx.
2470	 */
2471	mptcp_subflow_disconnect(mpte, mpts, !linger);
2472	MPTS_UNLOCK(mpts);
2473
2474	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);
2475
2476	MPT_LOCK(mp_tp);
2477	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2478		mp_so->so_error = EHOSTUNREACH;
2479	}
2480	MPT_UNLOCK(mp_tp);
2481
2482	MPTS_LOCK(mpts);
2483	/*
2484	 * Keep the subflow socket around, unless the MPTCP socket has
2485	 * been detached or the subflow has been disconnected explicitly,
2486	 * in which case it should be deleted right away.
2487	 */
2488	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2489}
2490
2491/*
2492 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2493 */
2494static ev_ret_t
2495mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts)
2496{
2497	struct socket *so;
2498
2499	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2500	MPTS_LOCK_ASSERT_HELD(mpts);
2501
2502	so = mpts->mpts_socket;
2503
2504	/* the subflow connection is being flow controlled */
2505	mpts->mpts_flags |= MPTSF_SUSPENDED;
2506
2507	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
2508	    mpts->mpts_connid));
2509
2510	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2511}
2512
2513/*
2514 * Handle SO_FILT_HINT_RESUME subflow socket event.
2515 */
2516static ev_ret_t
2517mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts)
2518{
2519	struct socket *so;
2520
2521	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2522	MPTS_LOCK_ASSERT_HELD(mpts);
2523
2524	so = mpts->mpts_socket;
2525
2526	/* the subflow connection is no longer flow controlled */
2527	mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2528
2529	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2530
2531	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2532}
2533
2534/*
2535 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2536 */
2537static ev_ret_t
2538mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts)
2539{
2540	char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2541	struct sockaddr_entry *src_se, *dst_se;
2542	struct sockaddr_storage src;
2543	struct socket *mp_so, *so;
2544	struct mptcb *mp_tp;
2545	struct ifnet *outifp;
2546	int af, error = 0;
2547	boolean_t mpok = FALSE;
2548
2549	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2550	VERIFY(mpte->mpte_mppcb != NULL);
2551	mp_so = mpte->mpte_mppcb->mpp_socket;
2552	mp_tp = mpte->mpte_mptcb;
2553
2554	MPTS_LOCK_ASSERT_HELD(mpts);
2555	so = mpts->mpts_socket;
2556	af = mpts->mpts_family;
2557
2558	if (mpts->mpts_flags & MPTSF_CONNECTED)
2559		return (MPTS_EVRET_OK);
2560
2561	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2562	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
2563        	socket_lock(so, 0);
2564		if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2565		    (so->so_state & SS_ISCONNECTED)) {
2566		    mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
2567		        __func__, mpts->mpts_connid));
2568			(void) soshutdownlock(so, SHUT_RD);
2569			(void) soshutdownlock(so, SHUT_WR);
2570			(void) sodisconnectlocked(so);
2571		}
2572		socket_unlock(so, 0);
2573		return (MPTS_EVRET_OK);
2574	}
2575
2576	/*
2577	 * The subflow connection has been connected.  Find out whether it
2578	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
2579	 *
2580	 *   a. If MPTCP connection is not yet established, then this must be
2581	 *	the first subflow connection.  If MPTCP failed to negotiate,
2582	 *	indicate to the MPTCP socket client via EPROTO, that the
2583	 *	underlying TCP connection may be peeled off via peeloff(2).
2584	 *	Otherwise, mark the MPTCP socket as connected.
2585	 *
2586	 *   b. If MPTCP connection has been established, then this must be
2587	 *	one of the subsequent subflow connections. If MPTCP failed
2588	 *	to negotiate, disconnect the connection since peeloff(2)
2589	 *	is no longer possible.
2590	 *
2591	 * Right now, we simply unblock any waiters at the MPTCP socket layer
2592	 * if the MPTCP connection has not been established.
2593	 */
2594	socket_lock(so, 0);
2595
2596	if (so->so_state & SS_ISDISCONNECTED) {
2597		/*
2598		 * With MPTCP joins, a connection is connected at the subflow
2599		 * level, but the 4th ACK from the server elevates the MPTCP
2600		 * subflow to connected state. So there is a small window
2601		 * where the subflow could get disconnected before the
2602		 * connected event is processed.
2603		 */
2604		socket_unlock(so, 0);
2605		return (MPTS_EVRET_OK);
2606	}
2607
2608	mpts->mpts_soerror = 0;
2609	mpts->mpts_flags &= ~MPTSF_CONNECTING;
2610	mpts->mpts_flags |= MPTSF_CONNECTED;
2611	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2612		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2613
2614	VERIFY(mpts->mpts_dst_sl != NULL);
2615	dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2616	VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2617	    dst_se->se_addr->sa_family == af);
2618
2619	VERIFY(mpts->mpts_src_sl != NULL);
2620	src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2621	VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2622	    src_se->se_addr->sa_family == af);
2623
2624	/* get/check source IP address */
2625	switch (af) {
2626	case AF_INET: {
2627		error = in_getsockaddr_s(so, &src);
2628		if (error == 0) {
2629			struct sockaddr_in *ms = SIN(src_se->se_addr);
2630			struct sockaddr_in *s = SIN(&src);
2631
2632			VERIFY(s->sin_len == ms->sin_len);
2633			VERIFY(ms->sin_family == AF_INET);
2634
2635			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2636			    bcmp(&ms->sin_addr, &s->sin_addr,
2637			    sizeof (ms->sin_addr)) != 0) {
2638				mptcplog((LOG_ERR, "%s: cid %d local "
2639				    "address %s (expected %s)\n", __func__,
2640				    mpts->mpts_connid, inet_ntop(AF_INET,
2641				    (void *)&s->sin_addr.s_addr, buf0,
2642				    sizeof (buf0)), inet_ntop(AF_INET,
2643				    (void *)&ms->sin_addr.s_addr, buf1,
2644				    sizeof (buf1))));
2645			}
2646			bcopy(s, ms, sizeof (*s));
2647		}
2648		break;
2649	}
2650#if INET6
2651	case AF_INET6: {
2652		error = in6_getsockaddr_s(so, &src);
2653		if (error == 0) {
2654			struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2655			struct sockaddr_in6 *s = SIN6(&src);
2656
2657			VERIFY(s->sin6_len == ms->sin6_len);
2658			VERIFY(ms->sin6_family == AF_INET6);
2659
2660			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2661			    bcmp(&ms->sin6_addr, &s->sin6_addr,
2662			    sizeof (ms->sin6_addr)) != 0) {
2663				mptcplog((LOG_ERR, "%s: cid %d local "
2664				    "address %s (expected %s)\n", __func__,
2665				    mpts->mpts_connid, inet_ntop(AF_INET6,
2666				    (void *)&s->sin6_addr, buf0,
2667				    sizeof (buf0)), inet_ntop(AF_INET6,
2668				    (void *)&ms->sin6_addr, buf1,
2669				    sizeof (buf1))));
2670			}
2671			bcopy(s, ms, sizeof (*s));
2672		}
2673		break;
2674	}
2675#endif /* INET6 */
2676	default:
2677		VERIFY(0);
2678		/* NOTREACHED */
2679	}
2680
2681	if (error != 0) {
2682		mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n",
2683		    __func__, mpts->mpts_connid, error));
2684	}
2685
2686	/* get/verify the outbound interface */
2687	outifp = sotoinpcb(so)->inp_last_outifp;	/* could be NULL */
2688	if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2689		VERIFY(mpts->mpts_outif != NULL);
2690		if (mpts->mpts_outif != outifp) {
2691			mptcplog((LOG_ERR, "%s: cid %d outif %s "
2692			    "(expected %s)\n", __func__, mpts->mpts_connid,
2693			    ((outifp != NULL) ? outifp->if_xname : "NULL"),
2694			    mpts->mpts_outif->if_xname));
2695			if (outifp == NULL)
2696				outifp = mpts->mpts_outif;
2697		}
2698	} else {
2699		mpts->mpts_outif = outifp;
2700	}
2701
2702	socket_unlock(so, 0);
2703
2704	mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] "
2705	    "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2706	    outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2707	    (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2708	    (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2709	    ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2710	    ntohs(SIN6(src_se->se_addr)->sin6_port)),
2711	    inet_ntop(af, ((af == AF_INET) ?
2712	    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2713	    (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2714	    ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2715	    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2716	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
2717	    "MPTCP capable" : "a regular TCP")));
2718
2719	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2720	MPTS_UNLOCK(mpts);
2721
2722	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2723
2724	MPT_LOCK(mp_tp);
2725	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2726		/* case (a) above */
2727		if (!mpok) {
2728			mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2729			(void) mptcp_drop(mpte, mp_tp, EPROTO);
2730			MPT_UNLOCK(mp_tp);
2731		} else {
2732			if (mptcp_init_authparms(mp_tp) != 0) {
2733				mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2734				(void) mptcp_drop(mpte, mp_tp, EPROTO);
2735				MPT_UNLOCK(mp_tp);
2736				mpok = FALSE;
2737			} else {
2738				mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2739				mpte->mpte_associd = mpts->mpts_connid;
2740				DTRACE_MPTCP2(state__change,
2741				    struct mptcb *, mp_tp,
2742				    uint32_t, 0 /* event */);
2743				mptcp_init_statevars(mp_tp);
2744				MPT_UNLOCK(mp_tp);
2745
2746				(void) mptcp_setconnorder(mpte,
2747				    mpts->mpts_connid, 1);
2748				soisconnected(mp_so);
2749			}
2750		}
2751		MPTS_LOCK(mpts);
2752		if (mpok) {
2753			/* Initialize the relative sequence number */
2754			mpts->mpts_rel_seq = 1;
2755			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2756			mpte->mpte_nummpcapflows++;
2757			MPT_LOCK_SPIN(mp_tp);
2758			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2759			MPT_UNLOCK(mp_tp);
2760		}
2761	} else if (mpok) {
2762		MPT_UNLOCK(mp_tp);
2763		if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2764			/* Experimental code, disabled by default. */
2765			sorwakeup(mp_so);
2766			sowwakeup(mp_so);
2767		}
2768		/*
2769		 * case (b) above
2770		 * In case of additional flows, the MPTCP socket is not
2771		 * MPTSF_MP_CAPABLE until an ACK is received from server
2772		 * for 3-way handshake.  TCP would have guaranteed that this
2773		 * is an MPTCP subflow.
2774		 */
2775		MPTS_LOCK(mpts);
2776		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2777		mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
2778		mpte->mpte_nummpcapflows++;
2779		/* With Fastjoin, rel sequence will be nonzero */
2780		if (mpts->mpts_rel_seq == 0)
2781			mpts->mpts_rel_seq = 1;
2782		MPT_LOCK_SPIN(mp_tp);
2783		/* With Fastjoin, sndnxt is updated before connected_ev */
2784		if (mpts->mpts_sndnxt == 0) {
2785			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2786		}
2787		MPT_UNLOCK(mp_tp);
2788		mptcp_output_needed(mpte, mpts);
2789	} else {
2790		MPT_UNLOCK(mp_tp);
2791		MPTS_LOCK(mpts);
2792	}
2793
2794	MPTS_LOCK_ASSERT_HELD(mpts);
2795
2796	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2797}
2798
2799/*
2800 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2801 */
2802static ev_ret_t
2803mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts)
2804{
2805	struct socket *mp_so, *so;
2806	struct mptcb *mp_tp;
2807	boolean_t linger;
2808
2809	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2810	MPTS_LOCK_ASSERT_HELD(mpts);
2811	VERIFY(mpte->mpte_mppcb != NULL);
2812	mp_so = mpte->mpte_mppcb->mpp_socket;
2813	mp_tp = mpte->mpte_mptcb;
2814	so = mpts->mpts_socket;
2815
2816	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2817	    !(mp_so->so_flags & SOF_PCBCLEARING));
2818
2819	mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2820	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2821
2822	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2823		return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2824
2825	/*
2826	 * Clear flags that are used by getconninfo to return state.
2827	 * Retain like MPTSF_DELETEOK for internal purposes.
2828	 */
2829	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
2830	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
2831	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
2832	    MPTSF_SUSPENDED|MPTSF_ACTIVE);
2833	mpts->mpts_flags |= MPTSF_DISCONNECTED;
2834
2835	/*
2836	 * The subflow connection has been disconnected.
2837	 *
2838	 * Right now, we simply unblock any waiters at the MPTCP socket layer
2839	 * if the MPTCP connection has not been established.
2840	 */
2841	MPTS_UNLOCK(mpts);
2842
2843	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2844
2845	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
2846		mpte->mpte_nummpcapflows--;
2847		if (mpte->mpte_active_sub == mpts) {
2848			mpte->mpte_active_sub = NULL;
2849			mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
2850			    __func__));
2851		}
2852		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
2853	}
2854
2855	MPT_LOCK(mp_tp);
2856	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2857		MPT_UNLOCK(mp_tp);
2858		soisdisconnected(mp_so);
2859	} else {
2860		MPT_UNLOCK(mp_tp);
2861	}
2862
2863	MPTS_LOCK(mpts);
2864	/*
2865	 * The underlying subflow socket has been disconnected;
2866	 * it is no longer useful to us.  Keep the subflow socket
2867	 * around, unless the MPTCP socket has been detached or
2868	 * the subflow has been disconnected explicitly, in which
2869	 * case it should be deleted right away.
2870	 */
2871	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2872}
2873
2874/*
2875 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
2876 */
2877static ev_ret_t
2878mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts)
2879{
2880	struct socket *mp_so, *so;
2881	struct mptcb *mp_tp;
2882	ev_ret_t ret = MPTS_EVRET_OK_UPDATE;
2883
2884	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2885	VERIFY(mpte->mpte_mppcb != NULL);
2886	mp_so = mpte->mpte_mppcb->mpp_socket;
2887	mp_tp = mpte->mpte_mptcb;
2888
2889	MPTS_LOCK_ASSERT_HELD(mpts);
2890	so = mpts->mpts_socket;
2891
2892	socket_lock(so, 0);
2893	MPT_LOCK(mp_tp);
2894
2895	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2896		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2897	else
2898		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
2899
2900	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
2901		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
2902			goto done;
2903		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
2904	}
2905	else
2906		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
2907
2908	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
2909		mpts->mpts_flags |= MPTSF_MP_READY;
2910	else
2911		mpts->mpts_flags &= ~MPTSF_MP_READY;
2912
2913	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
2914		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
2915		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
2916	}
2917
2918	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2919		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
2920		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
2921	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
2922		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
2923		ret = MPTS_EVRET_CONNECT_PENDING;
2924	}
2925
2926	mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
2927	    "mptsf=%b\n", __func__,
2928	    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
2929	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
2930	    mpts->mpts_flags, MPTSF_BITS));
2931done:
2932	MPT_UNLOCK(mp_tp);
2933	socket_unlock(so, 0);
2934	return (ret);
2935}
2936
2937/*
2938 * Handle SO_FILT_HINT_MUSTRST subflow socket event
2939 */
2940static ev_ret_t
2941mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts)
2942{
2943	struct socket *mp_so, *so;
2944	struct mptcb *mp_tp;
2945	boolean_t linger;
2946
2947
2948	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
2949	MPTS_LOCK_ASSERT_HELD(mpts);
2950	VERIFY(mpte->mpte_mppcb != NULL);
2951	mp_so = mpte->mpte_mppcb->mpp_socket;
2952	mp_tp = mpte->mpte_mptcb;
2953	so = mpts->mpts_socket;
2954
2955	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2956	    !(mp_so->so_flags & SOF_PCBCLEARING));
2957
2958	if (mpts->mpts_soerror == 0)
2959		mpts->mpts_soerror = ECONNABORTED;
2960
2961	/* We got an invalid option or a fast close */
2962	socket_lock(so, 0);
2963	struct tcptemp *t_template;
2964	struct inpcb *inp = sotoinpcb(so);
2965	struct tcpcb *tp = NULL;
2966
2967	tp = intotcpcb(inp);
2968	so->so_error = ECONNABORTED;
2969
2970	t_template = tcp_maketemplate(tp);
2971	if (t_template) {
2972		struct tcp_respond_args tra;
2973
2974		bzero(&tra, sizeof(tra));
2975		if (inp->inp_flags & INP_BOUND_IF)
2976			tra.ifscope = inp->inp_boundifp->if_index;
2977		else
2978			tra.ifscope = IFSCOPE_NONE;
2979		tra.awdl_unrestricted = 1;
2980
2981		tcp_respond(tp, t_template->tt_ipgen,
2982		    &t_template->tt_t, (struct mbuf *)NULL,
2983		    tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
2984		(void) m_free(dtom(t_template));
2985		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n",
2986		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2987		    so, mpts->mpts_connid));
2988	}
2989	socket_unlock(so, 0);
2990	mptcp_subflow_disconnect(mpte, mpts, !linger);
2991	MPTS_UNLOCK(mpts);
2992
2993	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED |
2994	    SO_FILT_HINT_CONNRESET);
2995
2996	MPT_LOCK(mp_tp);
2997	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) ||
2998	    (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) {
2999		mp_so->so_error = ECONNABORTED;
3000	}
3001	MPT_UNLOCK(mp_tp);
3002
3003	MPTS_LOCK(mpts);
3004	/*
3005	 * Keep the subflow socket around unless the subflow has been
3006	 * disconnected explicitly.
3007	 */
3008	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3009}
3010
3011static ev_ret_t
3012mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts)
3013{
3014	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
3015	MPTS_LOCK_ASSERT_HELD(mpts);
3016	VERIFY(mpte->mpte_mppcb != NULL);
3017
3018	if (mpte->mpte_nummpcapflows == 0) {
3019		struct mptcb *mp_tp = mpte->mpte_mptcb;
3020		mptcplog((LOG_DEBUG,"%s %llx %llx \n",
3021		    __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt));
3022		mpte->mpte_active_sub = mpts;
3023		mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3024		MPT_LOCK(mp_tp);
3025		/*
3026		 * If mptcp_subflow_output is called before fastjoin_ev
3027		 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3028		 * and further mpts->mpts_sndnxt is incremented by len copied.
3029		 */
3030		if (mpts->mpts_sndnxt == 0) {
3031			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
3032			mpts->mpts_rel_seq = 1;
3033		}
3034		MPT_UNLOCK(mp_tp);
3035	}
3036
3037	return (MPTS_EVRET_OK);
3038}
3039
3040static ev_ret_t
3041mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts)
3042{
3043	MPTE_LOCK_ASSERT_HELD(mpte);
3044	MPTS_LOCK_ASSERT_HELD(mpts);
3045	VERIFY(mpte->mpte_mppcb != NULL);
3046	mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid));
3047
3048	mpts->mpts_flags |= MPTSF_DELETEOK;
3049	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3050		return (MPTS_EVRET_DELETE);
3051	else
3052		return (MPTS_EVRET_OK);
3053}
3054
3055static const char *
3056mptcp_evret2str(ev_ret_t ret)
3057{
3058	const char *c = "UNKNOWN";
3059
3060	switch (ret) {
3061	case MPTS_EVRET_DELETE:
3062		c = "MPTS_EVRET_DELETE";
3063		break;
3064	case MPTS_EVRET_CONNECT_PENDING:
3065		c = "MPTS_EVRET_CONNECT_PENDING";
3066		break;
3067	case MPTS_EVRET_DISCONNECT_FALLBACK:
3068		c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3069		break;
3070	case MPTS_EVRET_OK:
3071		c = "MPTS_EVRET_OK";
3072		break;
3073	case MPTS_EVRET_OK_UPDATE:
3074		c = "MPTS_EVRET_OK_UPDATE";
3075		break;
3076	}
3077	return (c);
3078}
3079
3080/*
3081 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3082 */
3083void
3084mptcp_subflow_addref(struct mptsub *mpts, int locked)
3085{
3086	if (!locked)
3087		MPTS_LOCK(mpts);
3088	else
3089		MPTS_LOCK_ASSERT_HELD(mpts);
3090
3091	if (++mpts->mpts_refcnt == 0) {
3092		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3093		/* NOTREACHED */
3094	}
3095	if (!locked)
3096		MPTS_UNLOCK(mpts);
3097}
3098
3099/*
3100 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
3101 */
3102void
3103mptcp_subflow_remref(struct mptsub *mpts)
3104{
3105	MPTS_LOCK(mpts);
3106	if (mpts->mpts_refcnt == 0) {
3107		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3108		/* NOTREACHED */
3109	}
3110	if (--mpts->mpts_refcnt > 0) {
3111		MPTS_UNLOCK(mpts);
3112		return;
3113	}
3114	/* callee will unlock and destroy lock */
3115	mptcp_subflow_free(mpts);
3116}
3117
3118/*
3119 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3120 * caller must ensure that the option can be issued on subflow sockets, via
3121 * MPOF_SUBFLOW_OK flag.
3122 */
3123int
3124mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3125    struct mptopt *mpo)
3126{
3127	struct socket *mp_so;
3128	struct sockopt sopt;
3129	char buf[32];
3130	int error;
3131
3132	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3133	mpo->mpo_flags &= ~MPOF_INTERIM;
3134
3135	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3136	mp_so = mpte->mpte_mppcb->mpp_socket;
3137
3138	bzero(&sopt, sizeof (sopt));
3139	sopt.sopt_dir = SOPT_SET;
3140	sopt.sopt_level = mpo->mpo_level;
3141	sopt.sopt_name = mpo->mpo_name;
3142	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3143	sopt.sopt_valsize = sizeof (int);
3144	sopt.sopt_p = kernproc;
3145
3146	error = sosetoptlock(so, &sopt, 0);	/* already locked */
3147	if (error == 0) {
3148		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
3149		    "val %d set successful\n", __func__,
3150		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3151		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3152		    buf, sizeof (buf)), mpo->mpo_intval));
3153	} else {
3154		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
3155		    "val %d set error %d\n", __func__,
3156		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3157		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3158		    buf, sizeof (buf)), mpo->mpo_intval, error));
3159	}
3160	return (error);
3161}
3162
3163/*
3164 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3165 * caller must ensure that the option can be issued on subflow sockets, via
3166 * MPOF_SUBFLOW_OK flag.
3167 */
3168int
3169mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3170    struct mptopt *mpo)
3171{
3172	struct socket *mp_so;
3173	struct sockopt sopt;
3174	char buf[32];
3175	int error;
3176
3177	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3178	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3179	mp_so = mpte->mpte_mppcb->mpp_socket;
3180
3181	bzero(&sopt, sizeof (sopt));
3182	sopt.sopt_dir = SOPT_GET;
3183	sopt.sopt_level = mpo->mpo_level;
3184	sopt.sopt_name = mpo->mpo_name;
3185	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3186	sopt.sopt_valsize = sizeof (int);
3187	sopt.sopt_p = kernproc;
3188
3189	error = sogetoptlock(so, &sopt, 0);	/* already locked */
3190	if (error == 0) {
3191		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
3192		    "val %d get successful\n", __func__,
3193		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3194		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3195		    buf, sizeof (buf)), mpo->mpo_intval));
3196	} else {
3197		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n",
3198		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3199		    mptcp_sopt2str(mpo->mpo_level,
3200		    mpo->mpo_name, buf, sizeof (buf)), error));
3201	}
3202	return (error);
3203}
3204
3205
3206/*
3207 * MPTCP garbage collector.
3208 *
3209 * This routine is called by the MP domain on-demand, periodic callout,
3210 * which is triggered when a MPTCP socket is closed.  The callout will
3211 * repeat as long as this routine returns a non-zero value.
3212 */
3213static uint32_t
3214mptcp_gc(struct mppcbinfo *mppi)
3215{
3216	struct mppcb *mpp, *tmpp;
3217	uint32_t active = 0;
3218
3219	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3220
3221	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));
3222
3223	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3224		struct socket *mp_so;
3225		struct mptses *mpte;
3226		struct mptcb *mp_tp;
3227
3228		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3229		mp_so = mpp->mpp_socket;
3230		VERIFY(mp_so != NULL);
3231		mpte = mptompte(mpp);
3232		VERIFY(mpte != NULL);
3233		mp_tp = mpte->mpte_mptcb;
3234		VERIFY(mp_tp != NULL);
3235
3236		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
3237		    "(u=%d,r=%d,s=%d)\n", __func__,
3238		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3239		    mp_so->so_retaincnt, mpp->mpp_state));
3240
3241		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3242			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
3243			    "(u=%d,r=%d)\n", __func__,
3244			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3245			    mp_so->so_usecount, mp_so->so_retaincnt));
3246			active++;
3247			continue;
3248		}
3249
3250		/* check again under the lock */
3251		if (mp_so->so_usecount > 1) {
3252			boolean_t wakeup = FALSE;
3253			struct mptsub *mpts, *tmpts;
3254
3255			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
3256			    "[u=%d,r=%d] %d %d\n", __func__,
3257			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3258			    mp_so->so_usecount, mp_so->so_retaincnt,
3259			    mp_tp->mpt_gc_ticks,
3260			    mp_tp->mpt_state));
3261			MPT_LOCK(mp_tp);
3262			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3263				if (mp_tp->mpt_gc_ticks > 0)
3264					mp_tp->mpt_gc_ticks--;
3265				if (mp_tp->mpt_gc_ticks == 0) {
3266					wakeup = TRUE;
3267					if (mp_tp->mpt_localkey != NULL) {
3268						mptcp_free_key(
3269						    mp_tp->mpt_localkey);
3270						mp_tp->mpt_localkey = NULL;
3271					}
3272				}
3273			}
3274			MPT_UNLOCK(mp_tp);
3275			if (wakeup) {
3276				TAILQ_FOREACH_SAFE(mpts,
3277				    &mpte->mpte_subflows, mpts_entry, tmpts) {
3278					MPTS_LOCK(mpts);
3279					mpts->mpts_flags |= MPTSF_DELETEOK;
3280					if (mpts->mpts_soerror == 0)
3281						mpts->mpts_soerror = ETIMEDOUT;
3282					mptcp_subflow_eupcall(mpts->mpts_socket,
3283					    mpts, SO_FILT_HINT_DISCONNECTED);
3284					MPTS_UNLOCK(mpts);
3285				}
3286			}
3287			lck_mtx_unlock(&mpp->mpp_lock);
3288			active++;
3289			continue;
3290		}
3291
3292		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3293			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
3294			    "[u=%d,r=%d,s=%d]\n", __func__,
3295			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3296			    mp_so->so_usecount, mp_so->so_retaincnt,
3297			    mpp->mpp_state));
3298			lck_mtx_unlock(&mpp->mpp_lock);
3299			active++;
3300			continue;
3301		}
3302
3303		/*
3304		 * The PCB has been detached, and there is exactly 1 refnct
3305		 * held by the MPTCP thread.  Signal that thread to terminate,
3306		 * after which the last refcnt will be released.  That will
3307		 * allow it to be destroyed below during the next round.
3308		 */
3309		if (mp_so->so_usecount == 1) {
3310			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
3311			    "termination [u=%d,r=%d]\n", __func__,
3312			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3313			    mp_so->so_usecount, mp_so->so_retaincnt));
3314			/* signal MPTCP thread to terminate */
3315			mptcp_thread_terminate_signal(mpte);
3316			lck_mtx_unlock(&mpp->mpp_lock);
3317			active++;
3318			continue;
3319		}
3320
3321		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
3322		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3323		    mp_so->so_usecount, mp_so->so_retaincnt));
3324		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
3325		    struct sockbuf *, &mp_so->so_rcv,
3326		    struct sockbuf *, &mp_so->so_snd,
3327		    struct mppcb *, mpp);
3328
3329		mp_pcbdispose(mpp);
3330	}
3331
3332	return (active);
3333}
3334
3335/*
3336 * Drop a MPTCP connection, reporting the specified error.
3337 */
3338struct mptses *
3339mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3340{
3341	struct socket *mp_so;
3342
3343	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3344	MPT_LOCK_ASSERT_HELD(mp_tp);
3345	VERIFY(mpte->mpte_mptcb == mp_tp);
3346	mp_so = mpte->mpte_mppcb->mpp_socket;
3347
3348	mp_tp->mpt_state = MPTCPS_TERMINATE;
3349	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3350	    uint32_t, 0 /* event */);
3351
3352	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3353		errno = mp_tp->mpt_softerror;
3354	mp_so->so_error = errno;
3355
3356	return (mptcp_close(mpte, mp_tp));
3357}
3358
3359/*
3360 * Close a MPTCP control block.
3361 */
3362struct mptses *
3363mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3364{
3365	struct socket *mp_so;
3366	struct mptsub *mpts, *tmpts;
3367
3368	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3369	MPT_LOCK_ASSERT_HELD(mp_tp);
3370	VERIFY(mpte->mpte_mptcb == mp_tp);
3371	mp_so = mpte->mpte_mppcb->mpp_socket;
3372	if (mp_tp->mpt_localkey != NULL) {
3373		mptcp_free_key(mp_tp->mpt_localkey);
3374		mp_tp->mpt_localkey = NULL;
3375	}
3376
3377	MPT_UNLOCK(mp_tp);
3378	soisdisconnected(mp_so);
3379
3380	MPT_LOCK(mp_tp);
3381	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3382		return (NULL);
3383	}
3384	MPT_UNLOCK(mp_tp);
3385
3386	/* Clean up all subflows */
3387	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3388		MPTS_LOCK(mpts);
3389		mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
3390		mptcp_subflow_disconnect(mpte, mpts, TRUE);
3391		MPTS_UNLOCK(mpts);
3392		mptcp_subflow_del(mpte, mpts, TRUE);
3393	}
3394	MPT_LOCK(mp_tp);
3395
3396	return (NULL);
3397}
3398
3399void
3400mptcp_notify_close(struct socket *so)
3401{
3402	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3403}
3404
3405/*
3406 * Signal MPTCP thread to wake up.
3407 */
3408void
3409mptcp_thread_signal(struct mptses *mpte)
3410{
3411	lck_mtx_lock(&mpte->mpte_thread_lock);
3412	mptcp_thread_signal_locked(mpte);
3413	lck_mtx_unlock(&mpte->mpte_thread_lock);
3414}
3415
3416/*
3417 * Signal MPTCP thread to wake up (locked version)
3418 */
3419static void
3420mptcp_thread_signal_locked(struct mptses *mpte)
3421{
3422	lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3423
3424	mpte->mpte_thread_reqs++;
3425	if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3426		wakeup_one((caddr_t)&mpte->mpte_thread);
3427}
3428
3429/*
3430 * Signal MPTCP thread to terminate.
3431 */
3432static void
3433mptcp_thread_terminate_signal(struct mptses *mpte)
3434{
3435	lck_mtx_lock(&mpte->mpte_thread_lock);
3436	if (mpte->mpte_thread != THREAD_NULL) {
3437		mpte->mpte_thread = THREAD_NULL;
3438		mpte->mpte_thread_reqs++;
3439		if (!mpte->mpte_thread_active)
3440			wakeup_one((caddr_t)&mpte->mpte_thread);
3441	}
3442	lck_mtx_unlock(&mpte->mpte_thread_lock);
3443}
3444
3445/*
3446 * MPTCP thread workloop.
3447 */
3448static void
3449mptcp_thread_dowork(struct mptses *mpte)
3450{
3451	struct socket *mp_so;
3452	struct mptsub *mpts, *tmpts;
3453	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3454	boolean_t conninfo_update = FALSE;
3455
3456	MPTE_LOCK(mpte);		/* same as MP socket lock */
3457	VERIFY(mpte->mpte_mppcb != NULL);
3458	mp_so = mpte->mpte_mppcb->mpp_socket;
3459	VERIFY(mp_so != NULL);
3460
3461	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3462		ev_ret_t ret;
3463
3464		MPTS_LOCK(mpts);
3465		MPTS_ADDREF_LOCKED(mpts);	/* for us */
3466
3467		/* Update process ownership based on parent mptcp socket */
3468		mptcp_update_last_owner(mpts, mp_so);
3469
3470		mptcp_subflow_input(mpte, mpts);
3471		ret = mptcp_subflow_events(mpte, mpts);
3472
3473		if (mpts->mpts_flags & MPTSF_ACTIVE) {
3474			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
3475			    mpts->mpts_connid));
3476			(void) mptcp_subflow_output(mpte, mpts);
3477		}
3478
3479		/*
3480		 * If MPTCP socket is closed, disconnect all subflows.
3481		 * This will generate a disconnect event which will
3482		 * be handled during the next iteration, causing a
3483		 * non-zero error to be returned above.
3484		 */
3485		if (mp_so->so_flags & SOF_PCBCLEARING)
3486			mptcp_subflow_disconnect(mpte, mpts, FALSE);
3487		MPTS_UNLOCK(mpts);
3488
3489		switch (ret) {
3490		case MPTS_EVRET_OK_UPDATE:
3491			conninfo_update = TRUE;
3492			break;
3493		case MPTS_EVRET_OK:
3494			/* nothing to do */
3495			break;
3496		case MPTS_EVRET_DELETE:
3497			mptcp_subflow_del(mpte, mpts, TRUE);
3498			break;
3499		case MPTS_EVRET_CONNECT_PENDING:
3500			connect_pending = TRUE;
3501			break;
3502		case MPTS_EVRET_DISCONNECT_FALLBACK:
3503			disconnect_fallback = TRUE;
3504			break;
3505		}
3506		MPTS_REMREF(mpts);		/* ours */
3507	}
3508
3509	if (conninfo_update) {
3510		soevent(mp_so, SO_FILT_HINT_LOCKED |
3511		    SO_FILT_HINT_CONNINFO_UPDATED);
3512	}
3513
3514	if (!connect_pending && !disconnect_fallback) {
3515		MPTE_UNLOCK(mpte);
3516		return;
3517	}
3518
3519	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3520		MPTS_LOCK(mpts);
3521		if (disconnect_fallback) {
3522			struct socket *so = NULL;
3523			struct inpcb *inp = NULL;
3524			struct tcpcb *tp = NULL;
3525
3526			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3527				MPTS_UNLOCK(mpts);
3528				continue;
3529			}
3530
3531			mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3532
3533			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3534			    MPTSF_DISCONNECTED)) {
3535				MPTS_UNLOCK(mpts);
3536				continue;
3537			}
3538			so = mpts->mpts_socket;
3539
3540			/*
3541			 * The MPTCP connection has degraded to a fallback
3542			 * mode, so there is no point in keeping this subflow
3543			 * regardless of its MPTCP-readiness state, unless it
3544			 * is the primary one which we use for fallback.  This
3545			 * assumes that the subflow used for fallback is the
3546			 * ACTIVE one.
3547			 */
3548
3549			socket_lock(so, 1);
3550			inp = sotoinpcb(so);
3551			tp = intotcpcb(inp);
3552			tp->t_mpflags &=
3553			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3554			tp->t_mpflags |= TMPF_TCP_FALLBACK;
3555			if (mpts->mpts_flags & MPTSF_ACTIVE) {
3556				socket_unlock(so, 1);
3557				MPTS_UNLOCK(mpts);
3558				continue;
3559			}
3560			tp->t_mpflags |= TMPF_RESET;
3561			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3562			socket_unlock(so, 1);
3563
3564		} else if (connect_pending) {
3565			/*
3566			 * If delayed subflow start is set and cellular,
3567			 * delay the connect till a retransmission timeout
3568			 */
3569
3570			if ((mptcp_delayed_subf_start) &&
3571			    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3572				MPTS_UNLOCK(mpts);
3573				continue;
3574			}
3575
3576			/*
3577			 * The MPTCP connection has progressed to a state
3578			 * where it supports full multipath semantics; allow
3579			 * additional joins to be attempted for all subflows
3580			 * that are in the PENDING state.
3581			 */
3582			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3583				(void) mptcp_subflow_soconnectx(mpte, mpts);
3584			}
3585		}
3586		MPTS_UNLOCK(mpts);
3587	}
3588
3589	MPTE_UNLOCK(mpte);
3590}
3591
3592/*
3593 * MPTCP thread.
3594 */
3595static void
3596mptcp_thread_func(void *v, wait_result_t w)
3597{
3598#pragma unused(w)
3599	struct mptses *mpte = v;
3600	struct timespec *ts = NULL;
3601
3602	VERIFY(mpte != NULL);
3603
3604	lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3605
3606	for (;;) {
3607		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3608
3609		if (mpte->mpte_thread != THREAD_NULL) {
3610			(void) msleep(&mpte->mpte_thread,
3611			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3612			    __func__, ts);
3613		}
3614
3615		/* MPTCP socket is closed? */
3616		if (mpte->mpte_thread == THREAD_NULL) {
3617			lck_mtx_unlock(&mpte->mpte_thread_lock);
3618			/* callee will destroy thread lock */
3619			mptcp_thread_destroy(mpte);
3620			/* NOTREACHED */
3621			return;
3622		}
3623
3624		mpte->mpte_thread_active = 1;
3625		for (;;) {
3626			uint32_t reqs = mpte->mpte_thread_reqs;
3627
3628			lck_mtx_unlock(&mpte->mpte_thread_lock);
3629			mptcp_thread_dowork(mpte);
3630			lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3631
3632			/* if there's no pending request, we're done */
3633			if (reqs == mpte->mpte_thread_reqs ||
3634			    mpte->mpte_thread == THREAD_NULL)
3635				break;
3636		}
3637		mpte->mpte_thread_reqs = 0;
3638		mpte->mpte_thread_active = 0;
3639	}
3640}
3641
3642/*
3643 * Destroy a MTCP thread, to be called in the MPTCP thread context
3644 * upon receiving an indication to self-terminate.  This routine
3645 * will not return, as the current thread is terminated at the end.
3646 */
3647static void
3648mptcp_thread_destroy(struct mptses *mpte)
3649{
3650	struct socket *mp_so;
3651
3652	MPTE_LOCK(mpte);		/* same as MP socket lock */
3653	VERIFY(mpte->mpte_thread == THREAD_NULL);
3654	VERIFY(mpte->mpte_mppcb != NULL);
3655
3656	mptcp_sesdestroy(mpte);
3657
3658	mp_so = mpte->mpte_mppcb->mpp_socket;
3659	VERIFY(mp_so != NULL);
3660	VERIFY(mp_so->so_usecount != 0);
3661	mp_so->so_usecount--;		/* for thread */
3662	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3663	MPTE_UNLOCK(mpte);
3664
3665	/* for the extra refcnt from kernel_thread_start() */
3666	thread_deallocate(current_thread());
3667	/* this is the end */
3668	thread_terminate(current_thread());
3669	/* NOTREACHED */
3670}
3671
3672/*
3673 * Protocol pr_lock callback.
3674 */
3675int
3676mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3677{
3678	struct mppcb *mpp = sotomppcb(mp_so);
3679	void *lr_saved;
3680
3681	if (lr == NULL)
3682		lr_saved = __builtin_return_address(0);
3683	else
3684		lr_saved = lr;
3685
3686	if (mpp == NULL) {
3687		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3688		    mp_so, lr_saved, solockhistory_nr(mp_so));
3689		/* NOTREACHED */
3690	}
3691	lck_mtx_lock(&mpp->mpp_lock);
3692
3693	if (mp_so->so_usecount < 0) {
3694		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3695		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3696		    solockhistory_nr(mp_so));
3697		/* NOTREACHED */
3698	}
3699	if (refcount != 0)
3700		mp_so->so_usecount++;
3701	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3702	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3703
3704	return (0);
3705}
3706
3707/*
3708 * Protocol pr_unlock callback.
3709 */
3710int
3711mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3712{
3713	struct mppcb *mpp = sotomppcb(mp_so);
3714	void *lr_saved;
3715
3716	if (lr == NULL)
3717		lr_saved = __builtin_return_address(0);
3718	else
3719		lr_saved = lr;
3720
3721	if (mpp == NULL) {
3722		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3723		    mp_so, mp_so->so_usecount, lr_saved,
3724		    solockhistory_nr(mp_so));
3725		/* NOTREACHED */
3726	}
3727	lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3728
3729	if (refcount != 0)
3730		mp_so->so_usecount--;
3731
3732	if (mp_so->so_usecount < 0) {
3733		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3734		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3735		/* NOTREACHED */
3736	}
3737	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3738	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3739	lck_mtx_unlock(&mpp->mpp_lock);
3740
3741	return (0);
3742}
3743
3744/*
3745 * Protocol pr_getlock callback.
3746 */
3747lck_mtx_t *
3748mptcp_getlock(struct socket *mp_so, int locktype)
3749{
3750#pragma unused(locktype)
3751	struct mppcb *mpp = sotomppcb(mp_so);
3752
3753	if (mpp == NULL) {
3754		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3755		    solockhistory_nr(mp_so));
3756		/* NOTREACHED */
3757	}
3758	if (mp_so->so_usecount < 0) {
3759		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3760		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3761		/* NOTREACHED */
3762	}
3763	return (&mpp->mpp_lock);
3764}
3765
3766/*
3767 * Key generation functions
3768 */
3769static void
3770mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
3771{
3772	struct mptcp_key_entry *key_elm;
3773try_again:
3774	read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
3775	if (key_entry->mkey_value == 0)
3776		goto try_again;
3777	mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
3778	    sizeof (key_entry->mkey_digest));
3779
3780	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3781		if (key_elm->mkey_value == key_entry->mkey_value) {
3782			goto try_again;
3783		}
3784		if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
3785		    0) {
3786			goto try_again;
3787		}
3788	}
3789}
3790
3791static mptcp_key_t *
3792mptcp_reserve_key(void)
3793{
3794	struct mptcp_key_entry *key_elm;
3795	struct mptcp_key_entry *found_elm = NULL;
3796
3797	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3798	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3799		if (key_elm->mkey_flags == MKEYF_FREE) {
3800			key_elm->mkey_flags = MKEYF_INUSE;
3801			found_elm = key_elm;
3802			break;
3803		}
3804	}
3805	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3806
3807	if (found_elm) {
3808		return (&found_elm->mkey_value);
3809	}
3810
3811	key_elm = (struct mptcp_key_entry *)
3812	    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3813	key_elm->mkey_flags = MKEYF_INUSE;
3814
3815	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3816	mptcp_generate_unique_key(key_elm);
3817	LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
3818	mptcp_keys_pool.mkph_count += 1;
3819	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3820	return (&key_elm->mkey_value);
3821}
3822
3823static caddr_t
3824mptcp_get_stored_digest(mptcp_key_t *key)
3825{
3826	struct mptcp_key_entry *key_holder;
3827	caddr_t digest = NULL;
3828
3829	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3830	key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
3831	    offsetof(struct mptcp_key_entry, mkey_value));
3832	if (key_holder->mkey_flags != MKEYF_INUSE)
3833		panic_plain("%s", __func__);
3834	digest = &key_holder->mkey_digest[0];
3835	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3836	return (digest);
3837}
3838
3839void
3840mptcp_free_key(mptcp_key_t *key)
3841{
3842	struct mptcp_key_entry *key_holder;
3843	struct mptcp_key_entry *key_elm;
3844	int pt = RandomULong();
3845
3846	mptcplog((LOG_INFO, "%s\n", __func__));
3847
3848	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3849	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
3850	    offsetof(struct mptcp_key_entry, mkey_value));
3851	key_holder->mkey_flags = MKEYF_FREE;
3852
3853	LIST_REMOVE(key_holder, mkey_next);
3854	mptcp_keys_pool.mkph_count -= 1;
3855
3856	/* Free half the time */
3857	if (pt & 0x01) {
3858		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
3859	} else {
3860		/* Insert it at random point to avoid early reuse */
3861		int i = 0;
3862		if (mptcp_keys_pool.mkph_count > 1) {
3863			pt = pt % (mptcp_keys_pool.mkph_count - 1);
3864			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3865				if (++i >= pt) {
3866					LIST_INSERT_AFTER(key_elm, key_holder,
3867					    mkey_next);
3868					break;
3869				}
3870			}
3871			if (i < pt)
3872				panic("missed insertion");
3873		} else {
3874			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
3875			    mkey_next);
3876		}
3877		mptcp_keys_pool.mkph_count += 1;
3878	}
3879	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3880}
3881
3882static void
3883mptcp_key_pool_init(void)
3884{
3885	int i;
3886	struct mptcp_key_entry *key_entry;
3887
3888	LIST_INIT(&mptcp_keys_pool);
3889	mptcp_keys_pool.mkph_count = 0;
3890
3891	mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
3892	    (sizeof (struct mptcp_key_entry));
3893	mptcp_keys_pool.mkph_key_entry_zone = zinit(
3894	    mptcp_keys_pool.mkph_key_elm_sz,
3895	    MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
3896	    MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
3897	if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
3898		panic("%s: unable to allocate MPTCP keys zone \n", __func__);
3899		/* NOTREACHED */
3900	}
3901	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
3902	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
3903
3904	for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
3905		key_entry = (struct mptcp_key_entry *)
3906		    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3907		key_entry->mkey_flags = MKEYF_FREE;
3908		mptcp_generate_unique_key(key_entry);
3909		LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
3910		mptcp_keys_pool.mkph_count += 1;
3911	}
3912	lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
3913	    mtcbinfo.mppi_lock_attr);
3914}
3915
3916/*
3917 * MPTCP Join support
3918 */
3919
3920static void
3921mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
3922    uint8_t addr_id)
3923{
3924	struct tcpcb *tp = sototcpcb(so);
3925	struct mptcp_subf_auth_entry *sauth_entry;
3926	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3927
3928	MPT_LOCK_SPIN(mp_tp);
3929	tp->t_mptcb = mp_tp;
3930	/*
3931	 * The address ID of the first flow is implicitly 0.
3932	 */
3933	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
3934		tp->t_local_aid = 0;
3935	} else {
3936		tp->t_local_aid = addr_id;
3937		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
3938		so->so_flags |= SOF_MP_SEC_SUBFLOW;
3939	}
3940	MPT_UNLOCK(mp_tp);
3941	sauth_entry = zalloc(mpt_subauth_zone);
3942	sauth_entry->msae_laddr_id = tp->t_local_aid;
3943	sauth_entry->msae_raddr_id = 0;
3944	sauth_entry->msae_raddr_rand = 0;
3945try_again:
3946	sauth_entry->msae_laddr_rand = RandomULong();
3947	if (sauth_entry->msae_laddr_rand == 0)
3948		goto try_again;
3949	MPT_LOCK_SPIN(mp_tp);
3950	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
3951	MPT_UNLOCK(mp_tp);
3952}
3953
3954static void
3955mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
3956{
3957	struct mptcp_subf_auth_entry *sauth_entry;
3958	struct tcpcb *tp = NULL;
3959	int found = 0;
3960
3961	socket_lock(so, 0);
3962	tp = sototcpcb(so);
3963	if (tp == NULL) {
3964		socket_unlock(so, 0);
3965		return;
3966	}
3967
3968	MPT_LOCK(mp_tp);
3969	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3970		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
3971			found = 1;
3972			break;
3973		}
3974	}
3975	if (found) {
3976		LIST_REMOVE(sauth_entry, msae_next);
3977		zfree(mpt_subauth_zone, sauth_entry);
3978	}
3979	MPT_UNLOCK(mp_tp);
3980
3981	tp->t_mptcb = NULL;
3982	socket_unlock(so, 0);
3983}
3984
3985void
3986mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
3987    u_int32_t *rrand)
3988{
3989	struct mptcp_subf_auth_entry *sauth_entry;
3990	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3991
3992	MPT_LOCK(mp_tp);
3993	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3994		if (sauth_entry->msae_laddr_id == addr_id) {
3995			if (lrand)
3996				*lrand = sauth_entry->msae_laddr_rand;
3997			if (rrand)
3998				*rrand = sauth_entry->msae_raddr_rand;
3999			break;
4000		}
4001	}
4002	MPT_UNLOCK(mp_tp);
4003}
4004
4005void
4006mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4007    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4008{
4009	struct mptcp_subf_auth_entry *sauth_entry;
4010	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4011
4012	MPT_LOCK(mp_tp);
4013	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4014		if (sauth_entry->msae_laddr_id == laddr_id) {
4015			if ((sauth_entry->msae_raddr_id != 0) &&
4016			    (sauth_entry->msae_raddr_id != raddr_id)) {
4017				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
4018				    " address ids %d %d \n", __func__, raddr_id,
4019				    sauth_entry->msae_raddr_id));
4020				MPT_UNLOCK(mp_tp);
4021				return;
4022			}
4023			sauth_entry->msae_raddr_id = raddr_id;
4024			if ((sauth_entry->msae_raddr_rand != 0) &&
4025			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
4026				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
4027				    __func__, raddr_rand,
4028				    sauth_entry->msae_raddr_rand));
4029				MPT_UNLOCK(mp_tp);
4030				return;
4031			}
4032			sauth_entry->msae_raddr_rand = raddr_rand;
4033			MPT_UNLOCK(mp_tp);
4034			return;
4035		}
4036	}
4037	MPT_UNLOCK(mp_tp);
4038}
4039
4040/*
4041 * SHA1 support for MPTCP
4042 */
4043static int
4044mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4045{
4046	SHA1_CTX sha1ctxt;
4047	const unsigned char *sha1_base;
4048	int sha1_size;
4049
4050	if (digest_len != SHA1_RESULTLEN) {
4051		return (FALSE);
4052	}
4053
4054	sha1_base = (const unsigned char *) key;
4055	sha1_size = sizeof (mptcp_key_t);
4056	SHA1Init(&sha1ctxt);
4057	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4058	SHA1Final(sha_digest, &sha1ctxt);
4059	return (TRUE);
4060}
4061
4062void
4063mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4064	u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4065{
4066	SHA1_CTX  sha1ctxt;
4067	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4068	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4069	u_int32_t data[2];
4070	int i;
4071
4072	bzero(digest, digest_len);
4073
4074	/* Set up the Key for HMAC */
4075	key_ipad[0] = key1;
4076	key_ipad[1] = key2;
4077
4078	key_opad[0] = key1;
4079	key_opad[1] = key2;
4080
4081	/* Set up the message for HMAC */
4082	data[0] = rand1;
4083	data[1] = rand2;
4084
4085	/* Key is 512 block length, so no need to compute hash */
4086
4087	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4088
4089	for (i = 0; i < 8; i++) {
4090		key_ipad[i] ^= 0x3636363636363636;
4091		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4092	}
4093
4094	/* Perform inner SHA1 */
4095	SHA1Init(&sha1ctxt);
4096	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4097	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4098	SHA1Final(digest, &sha1ctxt);
4099
4100	/* Perform outer SHA1 */
4101	SHA1Init(&sha1ctxt);
4102	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4103	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4104	SHA1Final(digest, &sha1ctxt);
4105}
4106
4107/*
4108 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4109 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4110 */
4111void
4112mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4113    int digest_len)
4114{
4115	uint32_t lrand, rrand;
4116	mptcp_key_t localkey, remotekey;
4117	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4118
4119	if (digest_len != SHA1_RESULTLEN)
4120		return;
4121
4122	lrand = rrand = 0;
4123	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4124	MPT_LOCK_SPIN(mp_tp);
4125	localkey = *mp_tp->mpt_localkey;
4126	remotekey = mp_tp->mpt_remotekey;
4127	MPT_UNLOCK(mp_tp);
4128	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4129	    digest_len);
4130}
4131
4132u_int64_t
4133mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4134{
4135	u_char digest[SHA1_RESULTLEN];
4136	u_int64_t trunced_digest;
4137
4138	mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4139	bcopy(digest, &trunced_digest, 8);
4140	return (trunced_digest);
4141}
4142
4143/*
4144 * Authentication data generation
4145 */
4146int
4147mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4148    int token_len)
4149{
4150	VERIFY(token_len == sizeof (u_int32_t));
4151	VERIFY(sha_digest_len == SHA1_RESULTLEN);
4152
4153	/* Most significant 32 bits of the SHA1 hash */
4154	bcopy(sha_digest, token, sizeof (u_int32_t));
4155	return (TRUE);
4156}
4157
4158int
4159mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4160    int idsn_len)
4161{
4162	VERIFY(idsn_len == sizeof (u_int64_t));
4163	VERIFY(sha_digest_len == SHA1_RESULTLEN);
4164
4165	/*
4166	 * Least significant 64 bits of the SHA1 hash
4167	 */
4168
4169	idsn[7] = sha_digest[12];
4170	idsn[6] = sha_digest[13];
4171	idsn[5] = sha_digest[14];
4172	idsn[4] = sha_digest[15];
4173	idsn[3] = sha_digest[16];
4174	idsn[2] = sha_digest[17];
4175	idsn[1] = sha_digest[18];
4176	idsn[0] = sha_digest[19];
4177	return (TRUE);
4178}
4179
4180static int
4181mptcp_init_authparms(struct mptcb *mp_tp)
4182{
4183	caddr_t local_digest = NULL;
4184	char remote_digest[MPTCP_SHA1_RESULTLEN];
4185	MPT_LOCK_ASSERT_HELD(mp_tp);
4186
4187	/* Only Version 0 is supported for auth purposes */
4188	if (mp_tp->mpt_version != MP_DRAFT_VERSION_12)
4189		return (-1);
4190
4191	/* Setup local and remote tokens and Initial DSNs */
4192	local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4193	mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4194	    (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4195	mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4196	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4197
4198	if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4199	    SHA1_RESULTLEN)) {
4200		mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure",
4201		    __func__));
4202		return (-1);
4203	}
4204	mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4205	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken));
4206	mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4207	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4208	return (0);
4209}
4210
4211static void
4212mptcp_init_statevars(struct mptcb *mp_tp)
4213{
4214	MPT_LOCK_ASSERT_HELD(mp_tp);
4215
4216	/* The subflow SYN is also first MPTCP byte */
4217	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4218	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4219
4220	mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
4221}
4222
4223static void
4224mptcp_conn_properties(struct mptcb *mp_tp)
4225{
4226	/* There is only Version 0 at this time */
4227	mp_tp->mpt_version = MP_DRAFT_VERSION_12;
4228
4229	/* Set DSS checksum flag */
4230	if (mptcp_dss_csum)
4231		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4232
4233	/* Set up receive window */
4234	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4235
4236	/* Set up gc ticks */
4237	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4238}
4239
4240/*
4241 * Helper Functions
4242 */
4243mptcp_token_t
4244mptcp_get_localtoken(void* mptcb_arg)
4245{
4246	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4247	return (mp_tp->mpt_localtoken);
4248}
4249
4250mptcp_token_t
4251mptcp_get_remotetoken(void* mptcb_arg)
4252{
4253	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4254	return (mp_tp->mpt_remotetoken);
4255}
4256
4257u_int64_t
4258mptcp_get_localkey(void* mptcb_arg)
4259{
4260	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4261	if (mp_tp->mpt_localkey != NULL)
4262		return (*mp_tp->mpt_localkey);
4263	else
4264		return (0);
4265}
4266
4267u_int64_t
4268mptcp_get_remotekey(void* mptcb_arg)
4269{
4270	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4271	return (mp_tp->mpt_remotekey);
4272}
4273
4274void
4275mptcp_send_dfin(struct socket *so)
4276{
4277	struct tcpcb *tp = NULL;
4278	struct inpcb *inp = NULL;
4279
4280	inp = sotoinpcb(so);
4281	if (!inp)
4282		return;
4283
4284	tp = intotcpcb(inp);
4285	if (!tp)
4286		return;
4287
4288	if (!(tp->t_mpflags & TMPF_RESET))
4289		tp->t_mpflags |= TMPF_SEND_DFIN;
4290}
4291
4292/*
4293 * Data Sequence Mapping routines
4294 */
4295void
4296mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4297{
4298	struct mptcb *mp_tp;
4299
4300	if (m == NULL)
4301		return;
4302
4303	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
4304	MPT_LOCK(mp_tp);
4305	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4306		MPT_UNLOCK(mp_tp);
4307		panic("%s: data write before establishment.",
4308		    __func__);
4309		return;
4310	}
4311
4312	while (m) {
4313		VERIFY(m->m_flags & M_PKTHDR);
4314		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4315		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4316		m->m_pkthdr.mp_rlen = m_pktlen(m);
4317		mp_tp->mpt_sndmax += m_pktlen(m);
4318		m = m->m_next;
4319	}
4320	MPT_UNLOCK(mp_tp);
4321}
4322
4323void
4324mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
4325{
4326	u_int32_t sub_len = 0;
4327
4328	while (m) {
4329		VERIFY(m->m_flags & M_PKTHDR);
4330
4331		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4332			sub_len = m->m_pkthdr.mp_rlen;
4333
4334			if (sub_len < len) {
4335				m->m_pkthdr.mp_dsn += sub_len;
4336				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4337					m->m_pkthdr.mp_rseq += sub_len;
4338				}
4339				m->m_pkthdr.mp_rlen = 0;
4340				len -= sub_len;
4341			} else {
4342				/* sub_len >= len */
4343				m->m_pkthdr.mp_dsn += len;
4344				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4345					m->m_pkthdr.mp_rseq += len;
4346				}
4347				mptcplog3((LOG_INFO,
4348				    "%s: %llu %u %d %d\n", __func__,
4349				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
4350				    m->m_pkthdr.mp_rlen, len));
4351				m->m_pkthdr.mp_rlen -= len;
4352				return;
4353			}
4354		} else {
4355			panic("%s: MPTCP tag not set", __func__);
4356			/* NOTREACHED */
4357		}
4358		m = m->m_next;
4359	}
4360}
4361
4362/* Obtain the DSN mapping stored in the mbuf */
4363void
4364mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4365    u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4366{
4367	u_int64_t dsn64;
4368
4369	mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4370	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4371	*dsn64p = dsn64;
4372}
4373
4374void
4375mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4376    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4377{
4378	struct mbuf *m = so->so_snd.sb_mb;
4379	struct mbuf *mnext = NULL;
4380	uint32_t runlen = 0;
4381	u_int64_t dsn64;
4382	uint32_t contig_len = 0;
4383
4384	if (m == NULL)
4385		return;
4386
4387	if (off < 0)
4388		return;
4389	/*
4390	 * In the subflow socket, the DSN sequencing can be discontiguous,
4391	 * but the subflow sequence mapping is contiguous. Use the subflow
4392	 * sequence property to find the right mbuf and corresponding dsn
4393	 * mapping.
4394	 */
4395
4396	while (m) {
4397		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4398		VERIFY(m->m_flags & M_PKTHDR);
4399
4400		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4401			off -= m->m_pkthdr.mp_rlen;
4402			m = m->m_next;
4403		} else {
4404			break;
4405		}
4406	}
4407
4408	if (m == NULL) {
4409		panic("%s: bad offset", __func__);
4410		/* NOTREACHED */
4411	}
4412
4413	dsn64 = m->m_pkthdr.mp_dsn + off;
4414	*dsn = dsn64;
4415	*relseq = m->m_pkthdr.mp_rseq + off;
4416
4417	/*
4418	 * Now find the last contiguous byte and its length from
4419	 * start.
4420	 */
4421	runlen = m->m_pkthdr.mp_rlen - off;
4422	contig_len = runlen;
4423
4424	/* If datalen does not span multiple mbufs, return */
4425	if (datalen <= runlen) {
4426		*data_len = min(datalen, UINT16_MAX);
4427		return;
4428	}
4429
4430	mnext = m->m_next;
4431	while (datalen > runlen) {
4432		if (mnext == NULL) {
4433			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4434			    runlen, off);
4435			/* NOTREACHED */
4436		}
4437		VERIFY(mnext->m_flags & M_PKTHDR);
4438		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4439
4440		/*
4441		 * case A. contiguous DSN stream
4442		 * case B. discontiguous DSN stream
4443		 */
4444		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4445			/* case A */
4446			runlen += mnext->m_pkthdr.mp_rlen;
4447			contig_len += mnext->m_pkthdr.mp_rlen;
4448			mptcplog3((LOG_INFO, "%s: contig \n",
4449			    __func__));
4450		} else {
4451			/* case B */
4452			mptcplog((LOG_INFO,
4453			    "%s: discontig datalen %d contig_len %d cc %d \n",
4454			    __func__, datalen, contig_len, so->so_snd.sb_cc));
4455			break;
4456		}
4457		mnext = mnext->m_next;
4458	}
4459	datalen = min(datalen, UINT16_MAX);
4460	*data_len = min(datalen, contig_len);
4461	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
4462	    *dsn, *relseq, *data_len, off));
4463}
4464
4465/*
4466 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4467 * here. It must be called from mptcp_adj_rmap() which is called only after
4468 * reassembly of out of order data. The rcvnxt variable must
4469 * be updated only when atleast some insequence new data is received.
4470 */
4471static void
4472mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4473{
4474	struct mptcb *mp_tp = tptomptp(tp);
4475
4476	if (mp_tp == NULL)
4477		return;
4478	MPT_LOCK(mp_tp);
4479	if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4480	    (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4481	    m->m_pkthdr.mp_rlen)))) {
4482		mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4483	}
4484	MPT_UNLOCK(mp_tp);
4485}
4486
4487/*
4488 * Note that this is called only from tcp_input() which may trim data
4489 * after the dsn mapping is inserted into the mbuf. When it trims data
4490 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
4491 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
4492 * cannot be delayed after trim, because data can be in the reassembly
4493 * queue for a while and the DSN option info in tp will be overwritten for
4494 * every new packet received.
4495 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4496 * with mptcp_adj_rmap()
4497 */
4498void
4499mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4500{
4501	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4502
4503	if (tp->t_mpflags & TMPF_EMBED_DSN) {
4504		VERIFY(m->m_flags & M_PKTHDR);
4505		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4506		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4507		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4508		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4509		tp->t_mpflags &= ~TMPF_EMBED_DSN;
4510		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4511	}
4512}
4513
4514int
4515mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4516{
4517	u_int64_t dsn;
4518	u_int32_t sseq, datalen;
4519	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4520	u_int32_t old_rcvnxt = 0;
4521
4522	if (m_pktlen(m) == 0)
4523		return 0;
4524
4525	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4526		VERIFY(m->m_flags & M_PKTHDR);
4527
4528		dsn = m->m_pkthdr.mp_dsn;
4529		sseq = m->m_pkthdr.mp_rseq + tp->irs;
4530		datalen = m->m_pkthdr.mp_rlen;
4531	} else {
4532		/* data arrived without an DSS option mapping */
4533
4534		/* initial subflow can fallback right after SYN handshake */
4535		mptcp_notify_mpfail(so);
4536		return 0;
4537	}
4538
4539	/* In the common case, data is in window and in sequence */
4540	if (m->m_pkthdr.len == (int)datalen) {
4541		mptcp_adj_rcvnxt(tp, m);
4542		return 0;
4543	}
4544
4545	if (m->m_pkthdr.len > (int)datalen) {
4546		panic("%s: mbuf len = %d expected = %d", __func__,
4547		    m->m_pkthdr.len, datalen);
4548	}
4549
4550	old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4551	if (SEQ_GT(old_rcvnxt, sseq)) {
4552		/* data trimmed from the left */
4553		int off = old_rcvnxt - sseq;
4554		m->m_pkthdr.mp_dsn += off;
4555		m->m_pkthdr.mp_rseq += off;
4556		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4557	} else if (old_rcvnxt == sseq) {
4558		/*
4559		 * Data was trimmed from the right
4560		 */
4561		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4562	} else {
4563		/* handle gracefully with reass or fallback */
4564		mptcp_notify_mpfail(so);
4565		m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP;
4566		m_freem(m);
4567		return -1;
4568	}
4569	mptcp_adj_rcvnxt(tp, m);
4570	return 0;
4571}
4572
4573/*
4574 * Following routines help with failure detection and failover of data
4575 * transfer from one subflow to another.
4576 */
4577void
4578mptcp_act_on_txfail(struct socket *so)
4579{
4580	struct tcpcb *tp = NULL;
4581	struct inpcb *inp = sotoinpcb(so);
4582
4583	if (inp == NULL)
4584		return;
4585
4586	tp = intotcpcb(inp);
4587	if (tp == NULL)
4588		return;
4589
4590	if (tp->t_state != TCPS_ESTABLISHED)
4591		mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
4592		    tp->t_state));
4593
4594	mptcplog((LOG_INFO, "%s: Failover = %d \n", __func__,
4595	    (so->so_flags & SOF_MP_TRYFAILOVER) ? 1 : 0));
4596
4597	if (so->so_flags & SOF_MP_TRYFAILOVER) {
4598		return;
4599	}
4600
4601	so->so_flags |= SOF_MP_TRYFAILOVER;
4602	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4603}
4604
4605/*
4606 * Support for MP_FAIL option
4607 */
4608int
4609mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4610{
4611	struct mbuf *m = so->so_snd.sb_mb;
4612	u_int64_t dsn;
4613	int off = 0;
4614	u_int32_t datalen;
4615
4616	if (m == NULL)
4617		return (-1);
4618
4619	while (m != NULL) {
4620		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4621		VERIFY(m->m_flags & M_PKTHDR);
4622		dsn = m->m_pkthdr.mp_dsn;
4623		datalen = m->m_pkthdr.mp_rlen;
4624		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4625		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4626			off = dsn_fail - dsn;
4627			*tcp_seq = m->m_pkthdr.mp_rseq + off;
4628			mptcplog((LOG_INFO, "%s: %llu %llu \n",
4629			    __func__, dsn, dsn_fail));
4630			return (0);
4631		}
4632
4633		m = m->m_next;
4634	}
4635
4636	/*
4637	 * If there was no mbuf data and a fallback to TCP occurred, there's
4638	 * not much else to do.
4639	 */
4640
4641	mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
4642	return (-1);
4643}
4644
4645/*
4646 * Support for sending contiguous MPTCP bytes in subflow
4647 * Also for preventing sending data with ACK in 3-way handshake
4648 */
4649int32_t
4650mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4651{
4652	u_int64_t	mdss_dsn = 0;
4653	u_int32_t	mdss_subflow_seq = 0;
4654	u_int16_t	mdss_data_len = 0;
4655
4656	if (len == 0)
4657		return (len);
4658
4659	mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4660	    &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4661
4662	/*
4663	 * Special case handling for Fast Join. We want to send data right
4664	 * after ACK of the 3-way handshake, but not piggyback the data
4665	 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4666	 * mdss_data_len control this.
4667	 */
4668	struct tcpcb *tp = NULL;
4669	tp = intotcpcb(sotoinpcb(so));
4670	if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4671            (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4672	    (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4673	    (tp->t_mpflags & TMPF_SENT_JOIN) &&
4674	    (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4675	    (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
4676	    mdss_data_len = 0;
4677	    tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4678	}
4679	return (mdss_data_len);
4680}
4681
4682int32_t
4683mptcp_sbspace(struct mptcb *mpt)
4684{
4685	struct sockbuf *sb;
4686	uint32_t rcvbuf;
4687	int32_t space;
4688
4689	MPT_LOCK_ASSERT_HELD(mpt);
4690	MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4691
4692	sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4693	rcvbuf = sb->sb_hiwat;
4694	space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4695	    (sb->sb_mbmax - sb->sb_mbcnt)));
4696	if (space < 0)
4697		space = 0;
4698	/* XXX check if it's too small? */
4699
4700	return (space);
4701}
4702
4703/*
4704 * Support Fallback to Regular TCP
4705 */
4706void
4707mptcp_notify_mpready(struct socket *so)
4708{
4709	struct tcpcb *tp = NULL;
4710
4711	if (so == NULL)
4712		return;
4713
4714	tp = intotcpcb(sotoinpcb(so));
4715
4716	if (tp == NULL)
4717		return;
4718
4719	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4720	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4721	    struct tcpcb *, tp);
4722
4723	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4724		return;
4725
4726	if (tp->t_mpflags & TMPF_MPTCP_READY)
4727		return;
4728
4729	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4730	tp->t_mpflags |= TMPF_MPTCP_READY;
4731
4732	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4733}
4734
4735void
4736mptcp_notify_mpfail(struct socket *so)
4737{
4738	struct tcpcb *tp = NULL;
4739
4740	if (so == NULL)
4741		return;
4742
4743	tp = intotcpcb(sotoinpcb(so));
4744
4745	if (tp == NULL)
4746		return;
4747
4748	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4749	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4750	    struct tcpcb *, tp);
4751
4752	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
4753		return;
4754
4755	tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4756	tp->t_mpflags |= TMPF_TCP_FALLBACK;
4757
4758	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4759}
4760
4761/*
4762 * Keepalive helper function
4763 */
4764boolean_t
4765mptcp_ok_to_keepalive(struct mptcb *mp_tp)
4766{
4767	boolean_t ret = 1;
4768	VERIFY(mp_tp != NULL);
4769	MPT_LOCK(mp_tp);
4770	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
4771		ret = 0;
4772	}
4773	MPT_UNLOCK(mp_tp);
4774	return (ret);
4775}
4776
4777/*
4778 * MPTCP t_maxseg adjustment function
4779 */
4780int
4781mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
4782{
4783	int mss_lower = 0;
4784	struct mptcb *mp_tp = tptomptp(tp);
4785
4786#define	MPTCP_COMPUTE_LEN {				\
4787	mss_lower = sizeof (struct mptcp_dss_ack_opt);	\
4788	MPT_LOCK(mp_tp);				\
4789	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)		\
4790		mss_lower += 2;				\
4791	else						\
4792		/* adjust to 32-bit boundary + EOL */	\
4793		mss_lower += 2;				\
4794	MPT_UNLOCK(mp_tp);				\
4795}
4796	if (mp_tp == NULL)
4797		return (0);
4798
4799	/*
4800	 * For the first subflow and subsequent subflows, adjust mss for
4801	 * most common MPTCP option size, for case where tcp_mss is called
4802	 * during option processing and MTU discovery.
4803	 */
4804	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4805	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
4806		MPTCP_COMPUTE_LEN;
4807	}
4808
4809	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4810	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
4811		MPTCP_COMPUTE_LEN;
4812	}
4813
4814	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
4815		MPTCP_COMPUTE_LEN;
4816	}
4817
4818	return (mss_lower);
4819}
4820
4821/*
4822 * Update the pid, upid, uuid of the subflow so, based on parent so
4823 */
4824void
4825mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
4826{
4827	struct socket *subflow_so = mpts->mpts_socket;
4828
4829	MPTS_LOCK_ASSERT_HELD(mpts);
4830
4831	socket_lock(subflow_so, 0);
4832	if ((subflow_so->last_pid != parent_mpso->last_pid) ||
4833		(subflow_so->last_upid != parent_mpso->last_upid)) {
4834		subflow_so->last_upid = parent_mpso->last_upid;
4835		subflow_so->last_pid = parent_mpso->last_pid;
4836		uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
4837	}
4838	so_update_policy(subflow_so);
4839	socket_unlock(subflow_so, 0);
4840}
4841
4842static void
4843fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
4844{
4845	struct inpcb *inp;
4846
4847	tcp_getconninfo(so, &flow->flow_ci);
4848	inp = sotoinpcb(so);
4849#if INET6
4850	if ((inp->inp_vflag & INP_IPV6) != 0) {
4851		flow->flow_src.ss_family = AF_INET6;
4852		flow->flow_dst.ss_family = AF_INET6;
4853		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
4854		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
4855		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
4856		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
4857		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
4858		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
4859	} else
4860#endif
4861	{
4862		flow->flow_src.ss_family = AF_INET;
4863		flow->flow_dst.ss_family = AF_INET;
4864		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
4865		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
4866		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
4867		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
4868		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
4869		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
4870	}
4871	flow->flow_flags = mpts->mpts_flags;
4872	flow->flow_cid = mpts->mpts_connid;
4873}
4874
4875static int
4876mptcp_pcblist SYSCTL_HANDLER_ARGS
4877{
4878#pragma unused(oidp, arg1, arg2)
4879	int error = 0, f;
4880	size_t n, len;
4881	struct mppcb *mpp;
4882	struct mptses *mpte;
4883	struct mptcb *mp_tp;
4884	struct mptsub *mpts;
4885	struct socket *so;
4886	conninfo_mptcp_t mptcpci;
4887	mptcp_flow_t *flows = NULL;
4888
4889	if (req->newptr != USER_ADDR_NULL)
4890		return (EPERM);
4891
4892	lck_mtx_lock(&mtcbinfo.mppi_lock);
4893	n = mtcbinfo.mppi_count;
4894	if (req->oldptr == USER_ADDR_NULL) {
4895		lck_mtx_unlock(&mtcbinfo.mppi_lock);
4896		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
4897		    4 * (n + n/8)  * sizeof(mptcp_flow_t);
4898		return (0);
4899	}
4900	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
4901		flows = NULL;
4902		bzero(&mptcpci, sizeof(mptcpci));
4903		lck_mtx_lock(&mpp->mpp_lock);
4904		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4905		mpte = mptompte(mpp);
4906		VERIFY(mpte != NULL);
4907		mp_tp = mpte->mpte_mptcb;
4908		VERIFY(mp_tp != NULL);
4909		/* N.B. we don't take the mpt_lock just for the state. */
4910		mptcpci.mptcpci_state = mp_tp->mpt_state;
4911		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
4912		len = sizeof(*flows) * mpte->mpte_numflows;
4913		if (mpte->mpte_numflows != 0) {
4914			flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
4915			if (flows == NULL) {
4916				lck_mtx_unlock(&mpp->mpp_lock);
4917				break;
4918			}
4919			mptcpci.mptcpci_len = sizeof(mptcpci) +
4920			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
4921			error = SYSCTL_OUT(req, &mptcpci,
4922			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
4923		} else {
4924			mptcpci.mptcpci_len = sizeof(mptcpci);
4925			error = SYSCTL_OUT(req, &mptcpci,
4926		    	    sizeof(mptcpci));
4927		}
4928		if (error) {
4929			lck_mtx_unlock(&mpp->mpp_lock);
4930			FREE(flows, M_TEMP);
4931			break;
4932		}
4933		f = 0;
4934		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4935			MPTS_LOCK(mpts);
4936			so = mpts->mpts_socket;
4937			socket_lock(so, 0);
4938			fill_mptcp_subflow(so, &flows[f], mpts);
4939			socket_unlock(so, 0);
4940			MPTS_UNLOCK(mpts);
4941			f++;
4942		}
4943		lck_mtx_unlock(&mpp->mpp_lock);
4944		if (flows) {
4945			error = SYSCTL_OUT(req, flows, len);
4946			FREE(flows, M_TEMP);
4947			if (error)
4948				break;
4949		}
4950	}
4951	lck_mtx_unlock(&mtcbinfo.mppi_lock);
4952
4953	return (error);
4954}
4955
4956SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
4957    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
4958    "List of active MPTCP connections");
4959
4960/*
4961 * Check the health of the other subflows and do an mptcp_output if
4962 * there is no other active or functional subflow at the time of
4963 * call of this function.
4964 */
4965static void
4966mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
4967{
4968	struct mptsub *from_mpts = NULL;
4969
4970	MPTE_LOCK_ASSERT_HELD(mpte);
4971
4972	MPTS_UNLOCK(to_mpts);
4973
4974	from_mpts = mpte->mpte_active_sub;
4975
4976	if (from_mpts == NULL)
4977		goto output_needed;
4978
4979	MPTS_LOCK(from_mpts);
4980
4981	if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4982	    (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4983		MPTS_UNLOCK(from_mpts);
4984		goto output_needed;
4985	}
4986
4987	MPTS_UNLOCK(from_mpts);
4988	MPTS_LOCK(to_mpts);
4989	return;
4990
4991output_needed:
4992	mptcp_output(mpte);
4993	MPTS_LOCK(to_mpts);
4994}
4995
4996
4997/*
4998 * When WiFi signal starts fading, there's more loss and RTT spikes.
4999 * Check if there has been a large spike by comparing against
5000 * a tolerable RTT spike threshold.
5001 */
5002boolean_t
5003mptcp_no_rto_spike(struct socket *so)
5004{
5005	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5006	int32_t spike = 0;
5007
5008	if (tp->t_rxtcur > mptcp_rto_spike_thresh) {
5009		spike = tp->t_rxtcur - mptcp_rto_spike_thresh;
5010
5011		mptcplog2((LOG_INFO, "%s: spike = %d rto = %d",
5012		    "best = %d cur = %d\n", __func__, spike,
5013		    tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT,
5014		    tp->t_rttcur));
5015
5016	}
5017
5018	if (spike > 0 ) {
5019		return (FALSE);
5020	} else {
5021		return (TRUE);
5022	}
5023}
5024
5025/*
5026 * Set notsent lowat mark on the MPTCB
5027 */
5028int
5029mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5030{
5031	struct mptcb *mp_tp = NULL;
5032	int error = 0;
5033
5034	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5035		mp_tp = mpte->mpte_mptcb;
5036
5037	if (mp_tp)
5038		mp_tp->mpt_notsent_lowat = optval;
5039	else
5040		error = EINVAL;
5041
5042	return error;
5043}
5044
5045u_int32_t
5046mptcp_get_notsent_lowat(struct mptses *mpte)
5047{
5048	struct mptcb *mp_tp = NULL;
5049
5050	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5051		mp_tp = mpte->mpte_mptcb;
5052
5053	if (mp_tp)
5054		return mp_tp->mpt_notsent_lowat;
5055	else
5056		return 0;
5057}
5058
5059int
5060mptcp_notsent_lowat_check(struct socket *so) {
5061	struct mptses *mpte;
5062	struct mppcb *mpp;
5063	struct mptcb *mp_tp;
5064	struct mptsub *mpts;
5065
5066	int notsent = 0;
5067
5068	mpp = sotomppcb(so);
5069	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5070		return (0);
5071	}
5072
5073	mpte = mptompte(mpp);
5074	mp_tp = mpte->mpte_mptcb;
5075
5076	MPT_LOCK(mp_tp);
5077	notsent = so->so_snd.sb_cc;
5078
5079	if ((notsent == 0) ||
5080	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5081	    mp_tp->mpt_notsent_lowat)) {
5082		mptcplog3((LOG_INFO, "%s: lowat %d notsent %d actual %d \n",
5083		    __func__, mp_tp->mpt_notsent_lowat, notsent,
5084		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)));
5085		MPT_UNLOCK(mp_tp);
5086		return (1);
5087	}
5088	MPT_UNLOCK(mp_tp);
5089
5090	/* When Nagle's algorithm is not disabled, it is better
5091	 * to wakeup the client even before there is atleast one
5092	 * maxseg of data to write.
5093	 */
5094	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5095		int retval = 0;
5096		MPTS_LOCK(mpts);
5097		if (mpts->mpts_flags & MPTSF_ACTIVE) {
5098			struct socket *subf_so = mpts->mpts_socket;
5099			socket_lock(subf_so, 0);
5100			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5101
5102			notsent = so->so_snd.sb_cc -
5103			   (tp->snd_nxt - tp->snd_una);
5104
5105			if ((tp->t_flags & TF_NODELAY) == 0 &&
5106			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5107				retval = 1;
5108			}
5109			mptcplog3((LOG_INFO, "%s: lowat %d notsent %d"
5110			    " nodelay false \n",
5111			    __func__, mp_tp->mpt_notsent_lowat, notsent));
5112			socket_unlock(subf_so, 0);
5113			MPTS_UNLOCK(mpts);
5114			return (retval);
5115		}
5116		MPTS_UNLOCK(mpts);
5117	}
5118	return (0);
5119}
5120
5121