1/*
2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/proc.h>
31#include <sys/systm.h>
32#include <sys/kernel.h>
33#include <sys/mbuf.h>
34#include <sys/mcache.h>
35#include <sys/resourcevar.h>
36#include <sys/socket.h>
37#include <sys/socketvar.h>
38#include <sys/syslog.h>
39#include <sys/domain.h>
40#include <sys/protosw.h>
41#include <sys/sysctl.h>
42
43#include <kern/zalloc.h>
44#include <kern/locks.h>
45
46#include <mach/thread_act.h>
47#include <mach/sdt.h>
48
49#include <net/if.h>
50#include <netinet/in.h>
51#include <netinet/in_pcb.h>
52#include <netinet/in_var.h>
53#include <netinet/tcp.h>
54#include <netinet/tcp_fsm.h>
55#include <netinet/tcp_seq.h>
56#include <netinet/tcp_var.h>
57#include <netinet/mptcp_var.h>
58#include <netinet/mptcp.h>
59#include <netinet/mptcp_seq.h>
60#include <netinet/mptcp_timer.h>
61#include <libkern/crypto/sha1.h>
62#if INET6
63#include <netinet6/in6_pcb.h>
64#include <netinet6/ip6protosw.h>
65#endif /* INET6 */
66#include <dev/random/randomdev.h>
67
68/*
69 * Notes on MPTCP implementation.
70 *
71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
72 * communication domain.  The structure mtcbinfo describes the MPTCP instance
73 * of a Multipath protocol in that domain.  It is used to keep track of all
74 * MPTCP PCB instances in the system, and is protected by the global lock
75 * mppi_lock.
76 *
77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
78 * IPPROTO_TCP).  Upon success, a Multipath PCB gets allocated and along with
79 * it comes an MPTCP Session and an MPTCP PCB.  All three structures are
80 * allocated from the same memory block, and each structure has a pointer
81 * to the adjacent ones.  The layout is defined by the mpp_mtp structure.
82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
83 * PCB (mppcb) as well as the MPTCP Session (mptses).
84 *
85 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
86 * in particular, the list of subflows as well as the MPTCP thread.
87 *
88 * A functioning MPTCP Session consists of one or more subflow sockets.  Each
89 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
90 * represented by the mptsub structure.  Because each subflow requires access
91 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
92 * subflow.  This gets decremented prior to the subflow's destruction.  The
93 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
94 *
95 * To handle events (read, write, control) from the subflows, an MPTCP thread
96 * is created; currently, there is one thread per MPTCP Session.  In order to
97 * prevent the MPTCP socket from being destroyed while being accessed by the
98 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
99 * which will be decremented prior to the thread's termination.  The thread
100 * lock (mpte_thread_lock) is used to synchronize its signalling.
101 *
102 * Lock ordering is defined as follows:
103 *
104 *	mtcbinfo (mppi_lock)
105 *		mp_so (mpp_lock)
106 *			mpts (mpts_lock)
107 *				so (inpcb_mtx)
108 *					mptcb (mpt_lock)
109 *
110 * It is not a requirement that all of the above locks need to be acquired
111 * in succession, but the correct lock ordering must be followed when there
112 * are more than one locks that need to be held.  The MPTCP thread lock is
113 * is not constrained by this arrangement, because none of the other locks
114 * is ever acquired while holding mpte_thread_lock; therefore it may be called
115 * at any moment to signal the thread.
116 *
117 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
118 * work is done by the MPTCP garbage collector which is invoked on demand by
119 * the PF_MULTIPATH garbage collector.  This process will take place once all
120 * of the subflows have been destroyed, and the MPTCP thread be instructed to
121 * self-terminate.
122 */
123
124static void mptcp_sesdestroy(struct mptses *);
125static void mptcp_thread_signal_locked(struct mptses *);
126static void mptcp_thread_terminate_signal(struct mptses *);
127static void mptcp_thread_dowork(struct mptses *);
128static void mptcp_thread_func(void *, wait_result_t);
129static void mptcp_thread_destroy(struct mptses *);
130static void mptcp_key_pool_init(void);
131static void mptcp_attach_to_subf(struct socket *, struct mptcb *, connid_t);
132static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
133static void mptcp_conn_properties(struct mptcb *);
134static void mptcp_init_statevars(struct mptcb *);
135
136static uint32_t mptcp_gc(struct mppcbinfo *);
137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138    int, struct proc *, struct socket **);
139static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142    struct uio *, struct mbuf **, struct mbuf **, int *);
143static void mptcp_subflow_rupcall(struct socket *, void *, int);
144static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145static void mptcp_subflow_wupcall(struct socket *, void *, int);
146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147static void mptcp_update_last_owner(struct mptsub *, struct socket *);
148
149/*
150 * Possible return values for subflow event handlers.  Note that success
151 * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
152 * indicate errors or actions which require immediate attention; they will
153 * prevent the rest of the handlers from processing their respective events
154 * until the next round of events processing.
155 */
156typedef enum {
157	MPTS_EVRET_DELETE		= 1,	/* delete this subflow */
158	MPTS_EVRET_OK			= 2,	/* OK */
159	MPTS_EVRET_CONNECT_PENDING	= 3,	/* resume pended connects */
160	MPTS_EVRET_DISCONNECT_FALLBACK	= 4,	/* abort all but preferred */
161	MPTS_EVRET_OK_UPDATE		= 5,	/* OK with conninfo update */
162} ev_ret_t;
163
164static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
165static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
166static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
167static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
168static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
169static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
170static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
171static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
172static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
173static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
174static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
175static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
176static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
177static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
178static const char *mptcp_evret2str(ev_ret_t);
179
180static mptcp_key_t *mptcp_reserve_key(void);
181static int mptcp_do_sha1(mptcp_key_t *, char *, int);
182static int mptcp_init_authparms(struct mptcb *);
183static int mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts);
184
185static unsigned int mptsub_zone_size;		/* size of mptsub */
186static struct zone *mptsub_zone;		/* zone for mptsub */
187
188static unsigned int mptopt_zone_size;		/* size of mptopt */
189static struct zone *mptopt_zone;		/* zone for mptopt */
190
191static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
192static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */
193
194struct mppcbinfo mtcbinfo;
195
196static struct mptcp_keys_pool_head mptcp_keys_pool;
197
198#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
199#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */
200
201SYSCTL_DECL(_net_inet);
202
203SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
204
205uint32_t mptcp_verbose = 0;		/* more noise if greater than 1 */
206SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
207	&mptcp_verbose, 0, "MPTCP verbosity level");
208
209SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
210	&mtcbinfo.mppi_count, 0, "Number of active PCBs");
211
212/*
213 * Since there is one kernel thread per mptcp socket, imposing an artificial
214 * limit on number of allowed mptcp sockets.
215 */
216uint32_t mptcp_socket_limit = MPPCB_LIMIT;
217SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
218	&mptcp_socket_limit, 0, "MPTCP socket limit");
219
220static struct protosw mptcp_subflow_protosw;
221static struct pr_usrreqs mptcp_subflow_usrreqs;
222#if INET6
223static struct ip6protosw mptcp_subflow_protosw6;
224static struct pr_usrreqs mptcp_subflow_usrreqs6;
225#endif /* INET6 */
226
227/*
228 * Protocol pr_init callback.
229 */
230void
231mptcp_init(struct protosw *pp, struct domain *dp)
232{
233#pragma unused(dp)
234	static int mptcp_initialized = 0;
235	struct protosw *prp;
236#if INET6
237	struct ip6protosw *prp6;
238#endif /* INET6 */
239
240	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
241
242	/* do this only once */
243	if (mptcp_initialized)
244		return;
245	mptcp_initialized = 1;
246
247	/*
248	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
249	 * we must be able to find IPPROTO_TCP entries for both.
250	 */
251	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
252	VERIFY(prp != NULL);
253	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
254	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
255	    sizeof (mptcp_subflow_usrreqs));
256	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
257	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
258	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
259	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
260	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
261	/*
262	 * Socket filters shouldn't attach/detach to/from this protosw
263	 * since pr_protosw is to be used instead, which points to the
264	 * real protocol; if they do, it is a bug and we should panic.
265	 */
266	mptcp_subflow_protosw.pr_filter_head.tqh_first =
267	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
268	mptcp_subflow_protosw.pr_filter_head.tqh_last =
269	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
270
271#if INET6
272	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
273	    IPPROTO_TCP, SOCK_STREAM);
274	VERIFY(prp6 != NULL);
275	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
276	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
277	    sizeof (mptcp_subflow_usrreqs6));
278	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
279	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
280	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
281	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
282	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
283	/*
284	 * Socket filters shouldn't attach/detach to/from this protosw
285	 * since pr_protosw is to be used instead, which points to the
286	 * real protocol; if they do, it is a bug and we should panic.
287	 */
288	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
289	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
290	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
291	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
292#endif /* INET6 */
293
294	bzero(&mtcbinfo, sizeof (mtcbinfo));
295	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
296	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
297	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
298	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
299		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
300		/* NOTREACHED */
301	}
302	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
303	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
304
305	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
306	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
307	    mtcbinfo.mppi_lock_grp_attr);
308	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
309	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
310	    mtcbinfo.mppi_lock_attr);
311	mtcbinfo.mppi_gc = mptcp_gc;
312
313	mtcbinfo.mppi_timer = mptcp_timer;
314
315	/* attach to MP domain for garbage collection to take place */
316	mp_pcbinfo_attach(&mtcbinfo);
317
318	mptsub_zone_size = sizeof (struct mptsub);
319	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
320	    8192, "mptsub")) == NULL) {
321		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
322		/* NOTREACHED */
323	}
324	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
325	zone_change(mptsub_zone, Z_EXPAND, TRUE);
326
327	mptopt_zone_size = sizeof (struct mptopt);
328	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
329	    1024, "mptopt")) == NULL) {
330		panic("%s: unable to allocate MPTCP option zone\n", __func__);
331		/* NOTREACHED */
332	}
333	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
334	zone_change(mptopt_zone, Z_EXPAND, TRUE);
335
336	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
337	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
338	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
339		panic("%s: unable to allocate MPTCP address auth zone \n",
340		    __func__);
341		/* NOTREACHED */
342	}
343	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
344	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
345
346	/* Set up a list of unique keys */
347	mptcp_key_pool_init();
348
349}
350
351/*
352 * Create an MPTCP session, called as a result of opening a MPTCP socket.
353 */
354struct mptses *
355mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
356{
357	struct mppcbinfo *mppi;
358	struct mptses *mpte;
359	struct mptcb *mp_tp;
360	int error = 0;
361
362	VERIFY(mpp != NULL);
363	mppi = mpp->mpp_pcbinfo;
364	VERIFY(mppi != NULL);
365
366	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
367	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
368
369	/* MPTCP Multipath PCB Extension */
370	bzero(mpte, sizeof (*mpte));
371	VERIFY(mpp->mpp_pcbe == NULL);
372	mpp->mpp_pcbe = mpte;
373	mpte->mpte_mppcb = mpp;
374	mpte->mpte_mptcb = mp_tp;
375
376	TAILQ_INIT(&mpte->mpte_sopts);
377	TAILQ_INIT(&mpte->mpte_subflows);
378	mpte->mpte_associd = ASSOCID_ANY;
379	mpte->mpte_connid_last = CONNID_ANY;
380
381	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
382	    mppi->mppi_lock_attr);
383
384	/*
385	 * XXX: adi@apple.com
386	 *
387	 * This can be rather expensive if we have lots of MPTCP sockets,
388	 * but we need a kernel thread for this model to work.  Perhaps we
389	 * could amortize the costs by having one worker thread per a group
390	 * of MPTCP sockets.
391	 */
392	if (kernel_thread_start(mptcp_thread_func, mpte,
393	    &mpte->mpte_thread) != KERN_SUCCESS) {
394		error = ENOBUFS;
395		goto out;
396	}
397	mp_so->so_usecount++;		/* for thread */
398
399	/* MPTCP Protocol Control Block */
400	bzero(mp_tp, sizeof (*mp_tp));
401	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
402	    mppi->mppi_lock_attr);
403	mp_tp->mpt_mpte = mpte;
404
405out:
406	if (error != 0)
407		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
408	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
409	    struct sockbuf *, &mp_so->so_rcv,
410	    struct sockbuf *, &mp_so->so_snd,
411	    struct mppcb *, mpp, int, error);
412
413	return ((error != 0) ? NULL : mpte);
414}
415
416/*
417 * Destroy an MPTCP session.
418 */
419static void
420mptcp_sesdestroy(struct mptses *mpte)
421{
422	struct mptcb *mp_tp;
423
424	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
425
426	mp_tp = mpte->mpte_mptcb;
427	VERIFY(mp_tp != NULL);
428
429	/*
430	 * MPTCP Multipath PCB Extension section
431	 */
432	mptcp_flush_sopts(mpte);
433	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
434
435	lck_mtx_destroy(&mpte->mpte_thread_lock,
436	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
437
438	/*
439	 * MPTCP Protocol Control Block section
440	 */
441	lck_mtx_destroy(&mp_tp->mpt_lock,
442	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
443
444	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
445	    struct mptcb *, mp_tp);
446}
447
448/*
449 * Allocate an MPTCP socket option structure.
450 */
451struct mptopt *
452mptcp_sopt_alloc(int how)
453{
454	struct mptopt *mpo;
455
456	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
457	    zalloc_noblock(mptopt_zone);
458	if (mpo != NULL) {
459		bzero(mpo, mptopt_zone_size);
460	}
461
462	return (mpo);
463}
464
465/*
466 * Free an MPTCP socket option structure.
467 */
468void
469mptcp_sopt_free(struct mptopt *mpo)
470{
471	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
472
473	zfree(mptopt_zone, mpo);
474}
475
476/*
477 * Add a socket option to the MPTCP socket option list.
478 */
479void
480mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
481{
482	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
483	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
484	mpo->mpo_flags |= MPOF_ATTACHED;
485	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
486}
487
488/*
489 * Remove a socket option from the MPTCP socket option list.
490 */
491void
492mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
493{
494	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
495	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
496	mpo->mpo_flags &= ~MPOF_ATTACHED;
497	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
498}
499
500/*
501 * Search for an existing <sopt_level,sopt_name> socket option.
502 */
503struct mptopt *
504mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
505{
506	struct mptopt *mpo;
507
508	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
509
510	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
511		if (mpo->mpo_level == sopt->sopt_level &&
512		    mpo->mpo_name == sopt->sopt_name)
513			break;
514	}
515	VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
516
517	return (mpo);
518}
519
520/*
521 * Flushes all recorded socket options from an MP socket.
522 */
523void
524mptcp_flush_sopts(struct mptses *mpte)
525{
526	struct mptopt *mpo, *tmpo;
527
528	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
529
530	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
531		mptcp_sopt_remove(mpte, mpo);
532		mptcp_sopt_free(mpo);
533	}
534	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
535}
536
537/*
538 * Allocate a MPTCP subflow structure.
539 */
540struct mptsub *
541mptcp_subflow_alloc(int how)
542{
543	struct mptsub *mpts;
544
545	mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
546	    zalloc_noblock(mptsub_zone);
547	if (mpts != NULL) {
548		bzero(mpts, mptsub_zone_size);
549		lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
550		    mtcbinfo.mppi_lock_attr);
551	}
552
553	return (mpts);
554}
555
556/*
557 * Deallocate a subflow structure, called when all of the references held
558 * on it have been released.  This implies that the subflow has been deleted.
559 */
560void
561mptcp_subflow_free(struct mptsub *mpts)
562{
563	MPTS_LOCK_ASSERT_HELD(mpts);
564
565	VERIFY(mpts->mpts_refcnt == 0);
566	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
567	VERIFY(mpts->mpts_mpte == NULL);
568	VERIFY(mpts->mpts_socket == NULL);
569
570	if (mpts->mpts_src_sl != NULL) {
571		sockaddrlist_free(mpts->mpts_src_sl);
572		mpts->mpts_src_sl = NULL;
573	}
574	if (mpts->mpts_dst_sl != NULL) {
575		sockaddrlist_free(mpts->mpts_dst_sl);
576		mpts->mpts_dst_sl = NULL;
577	}
578	MPTS_UNLOCK(mpts);
579	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
580
581	zfree(mptsub_zone, mpts);
582}
583
584/*
585 * Create an MPTCP subflow socket.
586 */
587static int
588mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
589    struct proc *p, struct socket **so)
590{
591	struct mptopt smpo, *mpo, *tmpo;
592	struct socket *mp_so;
593	int error;
594
595	*so = NULL;
596	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
597	mp_so = mpte->mpte_mppcb->mpp_socket;
598
599	/*
600	 * Create the subflow socket (multipath subflow, non-blocking.)
601	 *
602	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
603	 * socket; it will be cleared when the socket is peeled off or closed.
604	 * It also indicates to the underlying TCP to handle MPTCP options.
605	 * A multipath subflow socket implies SS_NOFDREF state.
606	 */
607	if ((error = socreate_internal(dom, so, SOCK_STREAM,
608	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
609		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
610		    "create subflow socket error %d\n", __func__,
611		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
612		return (error);
613	}
614
615	socket_lock(*so, 0);
616	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
617	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
618	    (SS_NBIO|SS_NOFDREF));
619
620	/* prevent the socket buffers from being compressed */
621	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
622	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
623
624	bzero(&smpo, sizeof (smpo));
625	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
626	smpo.mpo_level = SOL_SOCKET;
627	smpo.mpo_intval = 1;
628
629	/* disable SIGPIPE */
630	smpo.mpo_name = SO_NOSIGPIPE;
631	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
632		goto out;
633
634	/* find out if the subflow's source address goes away */
635	smpo.mpo_name = SO_NOADDRERR;
636	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
637		goto out;
638
639	/* enable keepalive */
640	smpo.mpo_name = SO_KEEPALIVE;
641	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
642		goto out;
643
644	/*
645	 * Limit the receive socket buffer size to 64k.
646	 *
647	 * We need to take into consideration the window scale option
648	 * which could be negotiated in one subflow but disabled in
649	 * another subflow.
650	 * XXX This can be improved in the future.
651	 */
652	smpo.mpo_name = SO_RCVBUF;
653	smpo.mpo_intval = MPTCP_RWIN_MAX;
654	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
655		goto out;
656
657	/* N.B.: set by sosetopt */
658	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
659	/* Prevent automatic socket buffer sizing. */
660	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
661
662	smpo.mpo_level = IPPROTO_TCP;
663	smpo.mpo_intval = mptcp_subflow_keeptime;
664	smpo.mpo_name = TCP_KEEPALIVE;
665	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
666		goto out;
667
668	/* replay setsockopt(2) on the subflow sockets for eligible options */
669	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
670		int interim;
671
672		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
673			continue;
674
675		/*
676		 * Skip those that are handled internally; these options
677		 * should not have been recorded and marked with the
678		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
679		 */
680		if (mpo->mpo_level == SOL_SOCKET &&
681		    (mpo->mpo_name == SO_NOSIGPIPE ||
682		    mpo->mpo_name == SO_NOADDRERR ||
683		    mpo->mpo_name == SO_KEEPALIVE))
684			continue;
685
686		interim = (mpo->mpo_flags & MPOF_INTERIM);
687		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
688			char buf[32];
689			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
690			    "interim record removed\n", __func__,
691			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
692			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
693			    buf, sizeof (buf)), mpo->mpo_intval));
694			mptcp_sopt_remove(mpte, mpo);
695			mptcp_sopt_free(mpo);
696			continue;
697		}
698	}
699
700	/*
701	 * We need to receive everything that the subflow socket has,
702	 * so use a customized socket receive function.  We will undo
703	 * this when the socket is peeled off or closed.
704	 */
705	mpts->mpts_oprotosw = (*so)->so_proto;
706	switch (dom) {
707	case PF_INET:
708		(*so)->so_proto = &mptcp_subflow_protosw;
709		break;
710#if INET6
711	case PF_INET6:
712		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
713		break;
714#endif /* INET6 */
715	default:
716		VERIFY(0);
717		/* NOTREACHED */
718	}
719
720out:
721	socket_unlock(*so, 0);
722
723	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
724	    struct mptsub *, mpts, int, dom, int, error);
725
726	return (error);
727}
728
729/*
730 * Close an MPTCP subflow socket.
731 *
732 * Note that this may be called on an embryonic subflow, and the only
733 * thing that is guaranteed valid is the protocol-user request.
734 */
735static int
736mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
737{
738	MPTS_LOCK_ASSERT_HELD(mpts);
739
740	socket_lock(so, 0);
741	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
742	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
743
744	/* restore protocol-user requests */
745	VERIFY(mpts->mpts_oprotosw != NULL);
746	so->so_proto = mpts->mpts_oprotosw;
747	socket_unlock(so, 0);
748
749	mpts->mpts_socket = NULL;	/* may already be NULL */
750
751	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
752	    struct socket *, so,
753	    struct sockbuf *, &so->so_rcv,
754	    struct sockbuf *, &so->so_snd,
755	    struct mptses *, mpts->mpts_mpte);
756
757	return (soclose(so));
758}
759
760/*
761 * Connect an MPTCP subflow socket.
762 *
763 * This may be called inline as part of adding a subflow, or asynchronously
764 * by the thread (upon progressing to MPTCPF_JOIN_READY).  Note that in the
765 * pending connect case, the subflow socket may have been bound to an interface
766 * and/or a source IP address which may no longer be around by the time this
767 * routine is called; in that case the connect attempt will most likely fail.
768 */
769static int
770mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
771{
772	struct socket *so;
773	int af, error;
774
775	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
776	MPTS_LOCK_ASSERT_HELD(mpts);
777
778	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
779	    MPTSF_CONNECTING);
780	VERIFY(mpts->mpts_socket != NULL);
781	so = mpts->mpts_socket;
782	af = mpts->mpts_family;
783
784	if (af == AF_INET || af == AF_INET6) {
785		struct sockaddr_entry *dst_se;
786		char dbuf[MAX_IPv6_STR_LEN];
787
788		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
789		VERIFY(dst_se != NULL);
790
791		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
792		    "[pended %s]\n", __func__,
793		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
794		    inet_ntop(af, ((af == AF_INET) ?
795		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
796		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
797		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
798		    ntohs(SIN(dst_se->se_addr)->sin_port) :
799		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
800		    mpts->mpts_connid,
801		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
802		    "YES" : "NO")));
803	}
804
805	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
806
807	socket_lock(so, 0);
808	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpts->mpts_connid);
809	/* connect the subflow socket */
810	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
811	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
812	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
813	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
814	socket_unlock(so, 0);
815
816	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
817	    struct mptsub *, mpts, int, error);
818
819	return (error);
820}
821
822/*
823 * MPTCP subflow socket receive routine, derived from soreceive().
824 */
825static int
826mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
827    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
828{
829#pragma unused(uio)
830	int flags, error = 0;
831	struct proc *p = current_proc();
832	struct mbuf *m, **mp = mp0;
833	struct mbuf *nextrecord;
834
835	socket_lock(so, 1);
836	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
837
838#ifdef MORE_LOCKING_DEBUG
839	if (so->so_usecount == 1) {
840		panic("%s: so=%x no other reference on socket\n", __func__, so);
841		/* NOTREACHED */
842	}
843#endif
844	/*
845	 * We return all that is there in the subflow's socket receive buffer
846	 * to the MPTCP layer, so we require that the caller passes in the
847	 * expected parameters.
848	 */
849	if (mp == NULL || controlp != NULL) {
850		socket_unlock(so, 1);
851		return (EINVAL);
852	}
853	*mp = NULL;
854	if (psa != NULL)
855		*psa = NULL;
856	if (flagsp != NULL)
857		flags = *flagsp &~ MSG_EOR;
858	else
859		flags = 0;
860
861	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
862		socket_unlock(so, 1);
863		return (EOPNOTSUPP);
864	}
865	flags |= (MSG_DONTWAIT|MSG_NBIO);
866
867	/*
868	 * If a recv attempt is made on a previously-accepted socket
869	 * that has been marked as inactive (disconnected), reject
870	 * the request.
871	 */
872	if (so->so_flags & SOF_DEFUNCT) {
873		struct sockbuf *sb = &so->so_rcv;
874
875		error = ENOTCONN;
876		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
877		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
878		    SOCK_DOM(so), SOCK_TYPE(so), error));
879		/*
880		 * This socket should have been disconnected and flushed
881		 * prior to being returned from sodefunct(); there should
882		 * be no data on its receive list, so panic otherwise.
883		 */
884		if (so->so_state & SS_DEFUNCT)
885			sb_empty_assert(sb, __func__);
886		socket_unlock(so, 1);
887		return (error);
888	}
889
890	/*
891	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
892	 * and if so just return to the caller.  This could happen when
893	 * soreceive() is called by a socket upcall function during the
894	 * time the socket is freed.  The socket buffer would have been
895	 * locked across the upcall, therefore we cannot put this thread
896	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
897	 * we may livelock), because the lock on the socket buffer will
898	 * only be released when the upcall routine returns to its caller.
899	 * Because the socket has been officially closed, there can be
900	 * no further read on it.
901	 *
902	 * A multipath subflow socket would have its SS_NOFDREF set by
903	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
904	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
905	 */
906	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
907	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
908		socket_unlock(so, 1);
909		return (0);
910	}
911
912	/*
913	 * For consistency with soreceive() semantics, we need to obey
914	 * SB_LOCK in case some other code path has locked the buffer.
915	 */
916	error = sblock(&so->so_rcv, 0);
917	if (error != 0) {
918		socket_unlock(so, 1);
919		return (error);
920	}
921
922	m = so->so_rcv.sb_mb;
923	if (m == NULL) {
924		/*
925		 * Panic if we notice inconsistencies in the socket's
926		 * receive list; both sb_mb and sb_cc should correctly
927		 * reflect the contents of the list, otherwise we may
928		 * end up with false positives during select() or poll()
929		 * which could put the application in a bad state.
930		 */
931		SB_MB_CHECK(&so->so_rcv);
932
933		if (so->so_error != 0) {
934			error = so->so_error;
935			so->so_error = 0;
936			goto release;
937		}
938
939		if (so->so_state & SS_CANTRCVMORE) {
940			goto release;
941		}
942
943		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
944			error = ENOTCONN;
945			goto release;
946		}
947
948		/*
949		 * MSG_DONTWAIT is implicitly defined and this routine will
950		 * never block, so return EWOULDBLOCK when there is nothing.
951		 */
952		error = EWOULDBLOCK;
953		goto release;
954	}
955
956	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
957	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
958	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
959
960	while (m != NULL) {
961		nextrecord = m->m_nextpkt;
962		sbfree(&so->so_rcv, m);
963
964		if (mp != NULL) {
965			*mp = m;
966			mp = &m->m_next;
967			so->so_rcv.sb_mb = m = m->m_next;
968			*mp = NULL;
969		}
970
971		if (m != NULL) {
972			m->m_nextpkt = nextrecord;
973			if (nextrecord == NULL)
974				so->so_rcv.sb_lastrecord = m;
975		} else {
976			m = so->so_rcv.sb_mb = nextrecord;
977			SB_EMPTY_FIXUP(&so->so_rcv);
978		}
979		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
980		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
981	}
982
983	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
984	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
985	/* notify protocol that we drained all the data */
986	if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
987		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
988
989	if (flagsp != NULL)
990		*flagsp |= flags;
991
992release:
993	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
994	return (error);
995
996}
997
998
999/*
1000 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1001 * the work done earlier when the subflow socket was created.
1002 */
1003void
1004mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1005    struct socket *so)
1006{
1007	struct mptopt smpo;
1008	struct socket *mp_so;
1009	int p, c;
1010
1011	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1012	mp_so = mpte->mpte_mppcb->mpp_socket;
1013	MPTS_LOCK_ASSERT_HELD(mpts);
1014
1015	socket_lock(so, 0);
1016	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1017	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1018
1019	/* inherit MPTCP socket states */
1020	if (!(mp_so->so_state & SS_NBIO))
1021		so->so_state &= ~SS_NBIO;
1022
1023	/*
1024	 * At this point, the socket is not yet closed, as there is at least
1025	 * one outstanding usecount previously held by mpts_socket from
1026	 * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1027	 */
1028	so->so_flags &= ~SOF_MP_SUBFLOW;
1029	so->so_state &= ~SS_NOFDREF;
1030	so->so_state &= ~SOF_MPTCP_TRUE;
1031
1032	/* allow socket buffers to be compressed */
1033	so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1034	so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1035
1036	/*
1037	 * Allow socket buffer auto sizing.
1038	 *
1039	 * This will increase the current 64k buffer size to whatever is best.
1040	 */
1041	so->so_rcv.sb_flags |= SB_AUTOSIZE;
1042	so->so_snd.sb_flags |= SB_AUTOSIZE;
1043
1044	/* restore protocol-user requests */
1045	VERIFY(mpts->mpts_oprotosw != NULL);
1046	so->so_proto = mpts->mpts_oprotosw;
1047
1048	bzero(&smpo, sizeof (smpo));
1049	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1050	smpo.mpo_level = SOL_SOCKET;
1051
1052	/* inherit SOF_NOSIGPIPE from parent MP socket */
1053	p = (mp_so->so_flags & SOF_NOSIGPIPE);
1054	c = (so->so_flags & SOF_NOSIGPIPE);
1055	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1056	smpo.mpo_name = SO_NOSIGPIPE;
1057	if ((p - c) != 0)
1058		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1059
1060	/* inherit SOF_NOADDRAVAIL from parent MP socket */
1061	p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1062	c = (so->so_flags & SOF_NOADDRAVAIL);
1063	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1064	smpo.mpo_name = SO_NOADDRERR;
1065	if ((p - c) != 0)
1066		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1067
1068	/* inherit SO_KEEPALIVE from parent MP socket */
1069	p = (mp_so->so_options & SO_KEEPALIVE);
1070	c = (so->so_options & SO_KEEPALIVE);
1071	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1072	smpo.mpo_name = SO_KEEPALIVE;
1073	if ((p - c) != 0)
1074		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1075
1076	/* unset TCP level default keepalive option */
1077	p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1078	c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1079	smpo.mpo_level = IPPROTO_TCP;
1080	smpo.mpo_intval = 0;
1081	smpo.mpo_name = TCP_KEEPALIVE;
1082	if ((p - c) != 0)
1083		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1084	socket_unlock(so, 0);
1085
1086	DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1087	    struct mptsub *, mpts, struct socket *, so,
1088	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1089}
1090
1091/*
1092 * Establish an initial MPTCP connection (if first subflow and not yet
1093 * connected), or add a subflow to an existing MPTCP connection.
1094 */
1095int
1096mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1097    struct proc *p, uint32_t ifscope)
1098{
1099	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1100	struct socket *mp_so, *so = NULL;
1101	struct mptsub_connreq mpcr;
1102	struct mptcb *mp_tp;
1103	int af, error = 0;
1104
1105	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1106	mp_so = mpte->mpte_mppcb->mpp_socket;
1107	mp_tp = mpte->mpte_mptcb;
1108
1109	MPTS_LOCK(mpts);
1110	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1111	VERIFY(mpts->mpts_mpte == NULL);
1112	VERIFY(mpts->mpts_socket == NULL);
1113	VERIFY(mpts->mpts_dst_sl != NULL);
1114	VERIFY(mpts->mpts_connid == CONNID_ANY);
1115
1116	/* select source (if specified) and destination addresses */
1117	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1118	    &mpts->mpts_dst_sl, &dst_se)) != 0)
1119		goto out;
1120
1121	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1122	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1123	af = mpts->mpts_family = dst_se->se_addr->sa_family;
1124	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1125	VERIFY(af == AF_INET || af == AF_INET6);
1126
1127	/*
1128	 * If the source address is not specified, allocate a storage for
1129	 * it, so that later on we can fill it in with the actual source
1130	 * IP address chosen by the underlying layer for the subflow after
1131	 * it is connected.
1132	 */
1133	if (mpts->mpts_src_sl == NULL) {
1134		mpts->mpts_src_sl =
1135		    sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1136		if (mpts->mpts_src_sl == NULL) {
1137			error = ENOBUFS;
1138			goto out;
1139		}
1140		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1141		VERIFY(se != NULL && se->se_addr != NULL &&
1142		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
1143		bzero(se->se_addr, se->se_addr->sa_len);
1144		se->se_addr->sa_len = dst_se->se_addr->sa_len;
1145		se->se_addr->sa_family = dst_se->se_addr->sa_family;
1146	}
1147
1148	/* create the subflow socket */
1149	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1150		goto out;
1151
1152	/*
1153	 * XXX: adi@apple.com
1154	 *
1155	 * This probably needs to be made smarter, but for now simply
1156	 * increment the counter, while avoiding 0 (CONNID_ANY) and
1157	 * -1 (CONNID_ALL).  Assume that an MPTCP connection will not
1158	 * live too long with (2^32)-2 subflow connection attempts.
1159	 */
1160	mpte->mpte_connid_last++;
1161	if (mpte->mpte_connid_last == CONNID_ALL ||
1162	    mpte->mpte_connid_last == CONNID_ANY)
1163		mpte->mpte_connid_last++;
1164
1165	mpts->mpts_connid = mpte->mpte_connid_last;
1166	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1167	    mpts->mpts_connid != CONNID_ALL);
1168
1169	/* bind subflow socket to the specified interface */
1170	if (ifscope != IFSCOPE_NONE) {
1171		socket_lock(so, 0);
1172		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1173		if (error != 0) {
1174			socket_unlock(so, 0);
1175			(void) mptcp_subflow_soclose(mpts, so);
1176			goto out;
1177		}
1178		VERIFY(mpts->mpts_outif != NULL);
1179		mpts->mpts_flags |= MPTSF_BOUND_IF;
1180
1181		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
1182		    "cid %d\n", __func__,
1183		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1184		    mpts->mpts_outif->if_xname,
1185		    ifscope, mpts->mpts_connid));
1186		socket_unlock(so, 0);
1187	}
1188
1189	/* if source address and/or port is specified, bind to it */
1190	if (src_se != NULL) {
1191		struct sockaddr *sa = src_se->se_addr;
1192		uint32_t mpts_flags = 0;
1193		in_port_t lport;
1194
1195		switch (af) {
1196		case AF_INET:
1197			if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1198				mpts_flags |= MPTSF_BOUND_IP;
1199			if ((lport = SIN(sa)->sin_port) != 0)
1200				mpts_flags |= MPTSF_BOUND_PORT;
1201			break;
1202#if INET6
1203		case AF_INET6:
1204			VERIFY(af == AF_INET6);
1205			if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1206				mpts_flags |= MPTSF_BOUND_IP;
1207			if ((lport = SIN6(sa)->sin6_port) != 0)
1208				mpts_flags |= MPTSF_BOUND_PORT;
1209			break;
1210#endif /* INET6 */
1211		}
1212
1213		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
1214		if (error != 0) {
1215			(void) mptcp_subflow_soclose(mpts, so);
1216			goto out;
1217		}
1218		mpts->mpts_flags |= mpts_flags;
1219
1220		if (af == AF_INET || af == AF_INET6) {
1221			char sbuf[MAX_IPv6_STR_LEN];
1222
1223			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
1224			    "cid %d\n", __func__,
1225			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1226			    inet_ntop(af, ((af == AF_INET) ?
1227			    (void *)&SIN(sa)->sin_addr.s_addr :
1228			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1229			    ntohs(lport), mpts->mpts_connid));
1230		}
1231	}
1232
1233	/*
1234	 * Insert the subflow into the list, and associate the MPTCP PCB
1235	 * as well as the the subflow socket.  From this point on, removing
1236	 * the subflow needs to be done via mptcp_subflow_del().
1237	 */
1238	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1239	mpte->mpte_numflows++;
1240
1241	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1242	mpts->mpts_mpte = mpte;
1243	mpts->mpts_socket = so;
1244	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
1245	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
1246	mp_so->so_usecount++;		/* for subflow socket */
1247
1248	/* register for subflow socket read/write events */
1249	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1250	    mptcp_subflow_wupcall, mpts);
1251
1252	/*
1253	 * Register for subflow socket control events; ignore
1254	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1255	 * will generate it here.
1256	 */
1257	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1258	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1259	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1260	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1261	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1262	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1263	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1264	    SO_FILT_HINT_MUSTRST);
1265
1266	/* sanity check */
1267	VERIFY(!(mpts->mpts_flags &
1268	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1269
1270	bzero(&mpcr, sizeof (mpcr));
1271	mpcr.mpcr_proc = p;
1272	mpcr.mpcr_ifscope = ifscope;
1273	/*
1274	 * Indicate to the TCP subflow whether or not it should establish
1275	 * the initial MPTCP connection, or join an existing one.  Fill
1276	 * in the connection request structure with additional info needed
1277	 * by the underlying TCP (to be used in the TCP options, etc.)
1278	 */
1279	MPT_LOCK(mp_tp);
1280	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1281		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1282			mp_tp->mpt_localkey = mptcp_reserve_key();
1283			mptcp_conn_properties(mp_tp);
1284		}
1285		MPT_UNLOCK(mp_tp);
1286		soisconnecting(mp_so);
1287		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1288	} else {
1289		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1290			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1291		MPT_UNLOCK(mp_tp);
1292		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1293	}
1294
1295	mpts->mpts_mpcr = mpcr;
1296	mpts->mpts_flags |= MPTSF_CONNECTING;
1297
1298	if (af == AF_INET || af == AF_INET6) {
1299		char dbuf[MAX_IPv6_STR_LEN];
1300
1301		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
1302		    "[pending %s]\n", __func__,
1303		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1304		    inet_ntop(af, ((af == AF_INET) ?
1305		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1306		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1307		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1308		    ntohs(SIN(dst_se->se_addr)->sin_port) :
1309		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1310		    mpts->mpts_connid,
1311		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1312		    "YES" : "NO")));
1313	}
1314
1315	/* connect right away if first attempt, or if join can be done now */
1316	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1317		error = mptcp_subflow_soconnectx(mpte, mpts);
1318
1319out:
1320	MPTS_UNLOCK(mpts);
1321	if (error == 0) {
1322		soevent(mp_so, SO_FILT_HINT_LOCKED |
1323		    SO_FILT_HINT_CONNINFO_UPDATED);
1324	}
1325	return (error);
1326}
1327
1328static int
1329mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts)
1330{
1331	int ret = 1;
1332	struct mptcb *mp_tp = NULL;
1333
1334	MPTE_LOCK_ASSERT_HELD(mpte);
1335	mp_tp = mpte->mpte_mptcb;
1336	VERIFY(mp_tp != NULL);
1337	MPTS_LOCK(mpts);
1338	MPT_LOCK(mp_tp);
1339	if ((mpts->mpts_soerror == 0) &&
1340	    (mpts->mpts_flags & MPTSF_ACTIVE) &&
1341	    (mp_tp->mpt_state != MPTCPS_CLOSED) &&
1342	    (mp_tp->mpt_state <= MPTCPS_TIME_WAIT))
1343		ret = 0;
1344	MPT_UNLOCK(mp_tp);
1345	MPTS_UNLOCK(mpts);
1346	return (ret);
1347}
1348
1349/*
1350 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
1351 * will no longer be accessible after a subflow is deleted, thus this
1352 * should occur only after the subflow socket has been disconnected.
1353 * If peeloff(2) is called, leave the socket open.
1354 */
1355void
1356mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1357{
1358	struct socket *mp_so, *so;
1359
1360	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1361	mp_so = mpte->mpte_mppcb->mpp_socket;
1362
1363	MPTS_LOCK(mpts);
1364	so = mpts->mpts_socket;
1365	VERIFY(so != NULL);
1366
1367	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
1368	    "[close %s] %d %x\n", __func__,
1369	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1370	    mp_so->so_usecount,
1371	    mp_so->so_retaincnt, mpts->mpts_connid,
1372	    (close ? "YES" : "NO"), mpts->mpts_soerror,
1373	    mpts->mpts_flags));
1374
1375	VERIFY(mpts->mpts_mpte == mpte);
1376	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1377	    mpts->mpts_connid != CONNID_ALL);
1378
1379	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1380	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1381	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1382	VERIFY(mpte->mpte_numflows != 0);
1383	mpte->mpte_numflows--;
1384
1385	/*
1386	 * Drop references held by this subflow socket; there
1387	 * will be no further upcalls made from this point.
1388	 */
1389	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1390	(void) sock_catchevents(so, NULL, NULL, 0);
1391	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
1392	if (close)
1393		(void) mptcp_subflow_soclose(mpts, so);
1394
1395	VERIFY(mp_so->so_usecount != 0);
1396	mp_so->so_usecount--;		/* for subflow socket */
1397	mpts->mpts_mpte = NULL;
1398	mpts->mpts_socket = NULL;
1399	MPTS_UNLOCK(mpts);
1400
1401	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
1402	MPTS_REMREF(mpts);		/* for subflow socket */
1403
1404	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1405}
1406
1407/*
1408 * Disconnect a subflow socket.
1409 */
1410void
1411mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1412    boolean_t deleteok)
1413{
1414	struct socket *so;
1415	struct mptcb *mp_tp;
1416	int send_dfin = 0;
1417
1418	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1419	MPTS_LOCK_ASSERT_HELD(mpts);
1420
1421	VERIFY(mpts->mpts_mpte == mpte);
1422	VERIFY(mpts->mpts_socket != NULL);
1423	VERIFY(mpts->mpts_connid != CONNID_ANY &&
1424	    mpts->mpts_connid != CONNID_ALL);
1425
1426	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1427		return;
1428
1429	mpts->mpts_flags |= MPTSF_DISCONNECTING;
1430
1431	/*
1432	 * If this is coming from disconnectx(2) or issued as part of
1433	 * closing the MPTCP socket, the subflow shouldn't stick around.
1434	 * Otherwise let it linger around in case the upper layers need
1435	 * to retrieve its conninfo.
1436	 */
1437	if (deleteok)
1438		mpts->mpts_flags |= MPTSF_DELETEOK;
1439
1440	so = mpts->mpts_socket;
1441	mp_tp = mpte->mpte_mptcb;
1442	MPT_LOCK(mp_tp);
1443	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1444		send_dfin = 1;
1445	MPT_UNLOCK(mp_tp);
1446
1447	socket_lock(so, 0);
1448	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1449	    (so->so_state & SS_ISCONNECTED)) {
1450		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
1451		    __func__, mpts->mpts_connid, send_dfin,
1452		    (deleteok ? "NO" : "YES")));
1453
1454		if (send_dfin)
1455			mptcp_send_dfin(so);
1456		(void) soshutdownlock(so, SHUT_RD);
1457		(void) soshutdownlock(so, SHUT_WR);
1458		(void) sodisconnectlocked(so);
1459	}
1460	socket_unlock(so, 0);
1461	/*
1462	 * Generate a disconnect event for this subflow socket, in case
1463	 * the lower layer doesn't do it; this is needed because the
1464	 * subflow socket deletion relies on it.  This will also end up
1465	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1466	 * we cannot do that here because subflow lock is currently held.
1467	 */
1468	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1469}
1470
1471/*
1472 * Subflow socket read upcall.
1473 *
1474 * Called when the associated subflow socket posted a read event.  The subflow
1475 * socket lock has been released prior to invoking the callback.  Note that the
1476 * upcall may occur synchronously as a result of MPTCP performing an action on
1477 * it, or asynchronously as a result of an event happening at the subflow layer.
1478 * Therefore, to maintain lock ordering, the only lock that can be acquired
1479 * here is the thread lock, for signalling purposes.
1480 */
1481static void
1482mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1483{
1484#pragma unused(so, waitf)
1485	struct mptsub *mpts = arg;
1486	struct mptses *mpte = mpts->mpts_mpte;
1487
1488	VERIFY(mpte != NULL);
1489
1490	lck_mtx_lock(&mpte->mpte_thread_lock);
1491	mptcp_thread_signal_locked(mpte);
1492	lck_mtx_unlock(&mpte->mpte_thread_lock);
1493}
1494
1495/*
1496 * Subflow socket input.
1497 *
1498 * Called in the context of the MPTCP thread, for reading data from the
1499 * underlying subflow socket and delivering it to MPTCP.
1500 */
1501static void
1502mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1503{
1504	struct mbuf *m = NULL;
1505	struct socket *so;
1506	int error;
1507	struct mptsub *mpts_alt = NULL;
1508
1509	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1510	MPTS_LOCK_ASSERT_HELD(mpts);
1511
1512	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1513	    struct mptsub *, mpts);
1514
1515	if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1516		return;
1517
1518	so = mpts->mpts_socket;
1519
1520	error = sock_receive_internal(so, NULL, &m, 0, NULL);
1521	if (error != 0 && error != EWOULDBLOCK) {
1522		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
1523		    __func__, mpts->mpts_connid, error));
1524		MPTS_UNLOCK(mpts);
1525		mpts_alt = mptcp_get_subflow(mpte, mpts);
1526		if (mpts_alt == NULL) {
1527			mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
1528			    __func__, mpts->mpts_connid));
1529			mpte->mpte_mppcb->mpp_socket->so_error = error;
1530		}
1531		MPTS_LOCK(mpts);
1532	} else if (error == 0) {
1533		mptcplog3((LOG_DEBUG, "%s: cid %d \n",
1534		    __func__, mpts->mpts_connid));
1535	}
1536
1537	/* In fallback, make sure to accept data on all but one subflow */
1538	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1539	    (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1540		m_freem(m);
1541		return;
1542	}
1543
1544	if (m != NULL) {
1545		/*
1546		 * Release subflow lock since this may trigger MPTCP to send,
1547		 * possibly on a different subflow.  An extra reference has
1548		 * been held on the subflow by the MPTCP thread before coming
1549		 * here, so we can be sure that it won't go away, in the event
1550		 * the MP socket lock gets released.
1551		 */
1552		MPTS_UNLOCK(mpts);
1553		mptcp_input(mpte, m);
1554		MPTS_LOCK(mpts);
1555	}
1556}
1557
1558/*
1559 * Subflow socket write upcall.
1560 *
1561 * Called when the associated subflow socket posted a read event.  The subflow
1562 * socket lock has been released prior to invoking the callback.  Note that the
1563 * upcall may occur synchronously as a result of MPTCP performing an action on
1564 * it, or asynchronously as a result of an event happening at the subflow layer.
1565 * Therefore, to maintain lock ordering, the only lock that can be acquired
1566 * here is the thread lock, for signalling purposes.
1567 */
1568static void
1569mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1570{
1571#pragma unused(so, waitf)
1572	struct mptsub *mpts = arg;
1573	struct mptses *mpte = mpts->mpts_mpte;
1574
1575	VERIFY(mpte != NULL);
1576
1577	lck_mtx_lock(&mpte->mpte_thread_lock);
1578	mptcp_thread_signal_locked(mpte);
1579	lck_mtx_unlock(&mpte->mpte_thread_lock);
1580}
1581
1582/*
1583 * Subflow socket output.
1584 *
1585 * Called for sending data from MPTCP to the underlying subflow socket.
1586 */
1587int
1588mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1589{
1590	struct socket *mp_so, *so;
1591	size_t sb_cc = 0, tot_sent = 0;
1592	struct mbuf *sb_mb;
1593	int error = 0;
1594	u_int64_t mpt_dsn = 0;
1595	struct mptcb *mp_tp = mpte->mpte_mptcb;
1596	struct mbuf *mpt_mbuf = NULL;
1597	unsigned int off = 0;
1598
1599	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1600	MPTS_LOCK_ASSERT_HELD(mpts);
1601	mp_so = mpte->mpte_mppcb->mpp_socket;
1602	so = mpts->mpts_socket;
1603
1604	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1605	    struct mptsub *, mpts);
1606
1607	/* subflow socket is suspended? */
1608	if (mpts->mpts_flags & MPTSF_SUSPENDED) {
1609		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
1610		    "controlled\n", __func__,
1611		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1612		goto out;
1613	}
1614
1615	/* subflow socket is not MPTCP capable? */
1616	if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
1617	    !(mpts->mpts_flags & MPTSF_MP_DEGRADED)) {
1618		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
1619		    "MPTCP capable\n", __func__,
1620		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
1621		goto out;
1622	}
1623
1624	/* Remove Addr Option is not sent reliably as per I-D */
1625	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1626		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1627		tp->t_rem_aid = mpte->mpte_lost_aid;
1628		if (mptcp_remaddr_enable)
1629			tp->t_mpflags |= TMPF_SND_REM_ADDR;
1630		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1631	}
1632
1633	/*
1634	 * The mbuf chains containing the metadata (as well as pointing to
1635	 * the user data sitting at the MPTCP output queue) would then be
1636	 * sent down to the subflow socket.
1637	 *
1638	 * Some notes on data sequencing:
1639	 *
1640	 *   a. Each mbuf must be a M_PKTHDR.
1641	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
1642	 *	in the mbuf pkthdr structure.
1643	 *   c. Each mbuf containing the MPTCP metadata must have its
1644	 *	pkt_flags marked with the PKTF_MPTCP flag.
1645	 */
1646
1647	/* First, drop acknowledged data */
1648	sb_mb = mp_so->so_snd.sb_mb;
1649	if (sb_mb == NULL) {
1650		goto out;
1651	}
1652
1653	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1654
1655	mpt_mbuf = sb_mb;
1656	while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1657		mpt_mbuf = mpt_mbuf->m_next;
1658	}
1659	if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1660		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1661	} else {
1662		goto out;
1663	}
1664
1665	MPT_LOCK(mp_tp);
1666	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
1667		int len = 0;
1668		len = mp_tp->mpt_snduna - mpt_dsn;
1669		sbdrop(&mp_so->so_snd, len);
1670
1671	}
1672
1673	/*
1674	 * In degraded mode, we don't receive data acks, so force free
1675	 * mbufs less than snd_nxt
1676	 */
1677	mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1678	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1679	    MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
1680		int len = 0;
1681		len = mp_tp->mpt_sndnxt - mpt_dsn;
1682		sbdrop(&mp_so->so_snd, len);
1683		mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1684	}
1685
1686	/*
1687	 * Adjust the subflow's notion of next byte to send based on
1688	 * the last unacknowledged byte
1689	 */
1690	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1691		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1692	}
1693
1694	/*
1695	 * Adjust the top level notion of next byte used for retransmissions
1696	 * and sending FINs.
1697	 */
1698	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1699		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1700	}
1701
1702
1703	/* Now determine the offset from which to start transmitting data */
1704	sb_mb = mp_so->so_snd.sb_mb;
1705	sb_cc = mp_so->so_snd.sb_cc;
1706	if (sb_mb == NULL) {
1707		MPT_UNLOCK(mp_tp);
1708		goto out;
1709	}
1710	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1711		off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
1712		sb_cc -= off;
1713	} else {
1714		MPT_UNLOCK(mp_tp);
1715		goto out;
1716	}
1717	MPT_UNLOCK(mp_tp);
1718
1719	mpt_mbuf = sb_mb;
1720	mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1721
1722	while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
1723	    (mpt_mbuf->m_pkthdr.mp_rlen <= off))) {
1724		off -= mpt_mbuf->m_pkthdr.mp_rlen;
1725		mpt_mbuf = mpt_mbuf->m_next;
1726		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1727	}
1728	if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1729		mptcplog((LOG_INFO, "%s: snduna = %llu off = %d id = %d"
1730		    " %llu \n",
1731		    __func__,
1732		    mp_tp->mpt_snduna, off, mpts->mpts_connid,
1733		    mpts->mpts_sndnxt));
1734
1735	VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1736
1737	while (tot_sent < sb_cc) {
1738		struct mbuf *m;
1739		size_t mlen, len = 0;
1740
1741		mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1742		mlen -= off;
1743		if (mlen == 0)
1744			goto out;
1745
1746		if (mlen > sb_cc) {
1747			panic("%s: unexpected %lu %lu \n", __func__,
1748			    mlen, sb_cc);
1749		}
1750
1751		m = m_copym_mode(mpt_mbuf, off, mlen, M_DONTWAIT,
1752		    M_COPYM_COPY_HDR);
1753		if (m == NULL) {
1754			error = ENOBUFS;
1755			break;
1756		}
1757
1758		/* Create a DSN mapping for the data (m_copym does it) */
1759		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1760		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1761		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
1762		m->m_pkthdr.mp_dsn = mpt_dsn + off;
1763		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
1764		m->m_pkthdr.mp_rlen = mlen;
1765		mpts->mpts_rel_seq += mlen;
1766		m->m_pkthdr.len = mlen;
1767
1768		/* last contiguous mapping is stored for error cases */
1769		if (mpts->mpts_lastmap.mptsl_dsn +
1770		    mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
1771			mpts->mpts_lastmap.mptsl_len += tot_sent;
1772		} else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
1773		    mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
1774			if (m->m_pkthdr.mp_dsn == 0)
1775				panic("%s %llu", __func__, mpt_dsn);
1776			mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
1777			mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
1778			mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;
1779		}
1780
1781		error = sock_sendmbuf(so, NULL, m, 0, &len);
1782		DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
1783		    struct sockbuf *, &so->so_rcv,
1784		    struct sockbuf *, &so->so_snd,
1785		    struct mptses *, mpte, struct mptsub *, mpts,
1786		    size_t, mlen);
1787		if (error != 0) {
1788			mptcplog((LOG_ERR, "%s: len = %zd error = %d \n",
1789			    __func__, len, error));
1790			break;
1791		}
1792		mpts->mpts_sndnxt += mlen;
1793		MPT_LOCK(mp_tp);
1794		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
1795			if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
1796			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
1797				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
1798			mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
1799		}
1800		MPT_UNLOCK(mp_tp);
1801		if (len != mlen) {
1802			mptcplog((LOG_ERR, "%s: cid %d wrote %d "
1803			    "(expected %d)\n", __func__,
1804			    mpts->mpts_connid, len, mlen));
1805		}
1806		tot_sent += mlen;
1807		off = 0;
1808		mpt_mbuf = mpt_mbuf->m_next;
1809	}
1810
1811	if (error != 0 && error != EWOULDBLOCK) {
1812		mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d\n",
1813		    __func__, mpts->mpts_connid, error));
1814	} if (error == 0) {
1815		if ((mpts->mpts_connid == 2) ||
1816		    (mpts->mpts_flags & MPTSF_MP_DEGRADED))
1817			mptcplog((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
1818			    __func__, mpts->mpts_connid, tot_sent,
1819			    sb_cc));
1820		MPT_LOCK(mp_tp);
1821		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
1822		MPT_UNLOCK(mp_tp);
1823	}
1824out:
1825	return (error);
1826}
1827
1828/*
1829 * Subflow socket control event upcall.
1830 *
1831 * Called when the associated subflow socket posted one or more control events.
1832 * The subflow socket lock has been released prior to invoking the callback.
1833 * Note that the upcall may occur synchronously as a result of MPTCP performing
1834 * an action on it, or asynchronously as a result of an event happening at the
1835 * subflow layer.  Therefore, to maintain lock ordering, the only lock that can
1836 * be acquired here is the thread lock, for signalling purposes.
1837 */
1838static void
1839mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
1840{
1841#pragma unused(so)
1842	struct mptsub *mpts = arg;
1843	struct mptses *mpte = mpts->mpts_mpte;
1844
1845	VERIFY(mpte != NULL);
1846
1847	lck_mtx_lock(&mpte->mpte_thread_lock);
1848	atomic_bitset_32(&mpts->mpts_evctl, events);
1849	mptcp_thread_signal_locked(mpte);
1850	lck_mtx_unlock(&mpte->mpte_thread_lock);
1851}
1852
1853/*
1854 * Subflow socket control events.
1855 *
1856 * Called for handling events related to the underlying subflow socket.
1857 */
1858static ev_ret_t
1859mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
1860{
1861	uint32_t events;
1862	ev_ret_t ret = MPTS_EVRET_OK;
1863
1864	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1865	MPTS_LOCK_ASSERT_HELD(mpts);
1866
1867	/* bail if there's nothing to process */
1868	if ((events = mpts->mpts_evctl) == 0)
1869		return (ret);
1870
1871	if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
1872	    SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
1873	    SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
1874	    SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
1875	    SO_FILT_HINT_DISCONNECTED)) {
1876		events |= SO_FILT_HINT_MPFAILOVER;
1877	}
1878
1879	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
1880	    struct mptsub *, mpts, uint32_t, events);
1881
1882	mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
1883	    mpts->mpts_connid, events, SO_FILT_HINT_BITS));
1884
1885	if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
1886		ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
1887		events &= ~SO_FILT_HINT_MPFAILOVER;
1888		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1889	}
1890	if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
1891		ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
1892		events &= ~SO_FILT_HINT_CONNRESET;
1893		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1894	}
1895	if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
1896		ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
1897		events &= ~SO_FILT_HINT_MUSTRST;
1898		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1899	}
1900	if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
1901		ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
1902		events &= ~SO_FILT_HINT_CANTRCVMORE;
1903		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1904	}
1905	if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
1906		ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
1907		events &= ~SO_FILT_HINT_CANTSENDMORE;
1908		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1909	}
1910	if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
1911		ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
1912		events &= ~SO_FILT_HINT_TIMEOUT;
1913		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1914	}
1915	if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
1916		ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
1917		events &= ~SO_FILT_HINT_NOSRCADDR;
1918		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1919	}
1920	if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
1921		ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
1922		events &= ~SO_FILT_HINT_IFDENIED;
1923		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1924	}
1925	if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
1926		ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
1927		events &= ~SO_FILT_HINT_SUSPEND;
1928		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1929	}
1930	if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
1931		ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
1932		events &= ~SO_FILT_HINT_RESUME;
1933		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1934	}
1935	if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
1936		ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
1937		events &= ~SO_FILT_HINT_CONNECTED;
1938		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1939	}
1940	if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
1941		ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
1942		events &= ~SO_FILT_HINT_MPSTATUS;
1943		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1944	}
1945	if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
1946		ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
1947		events &= ~SO_FILT_HINT_DISCONNECTED;
1948		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
1949	}
1950	/*
1951	 * We should be getting only events specified via sock_catchevents(),
1952	 * so loudly complain if we have any unprocessed one(s).
1953	 */
1954	if (events != 0 || ret < MPTS_EVRET_OK) {
1955		mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
1956		    " unhandled events=%b\n",
1957		    (events != 0) ? "MPTCP_ERROR " : "",
1958		    __func__, mpts->mpts_connid,
1959		    mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));
1960	}
1961
1962	/* clear the ones we've processed */
1963	atomic_bitclear_32(&mpts->mpts_evctl, ~events);
1964
1965	return (ret);
1966}
1967
1968/*
1969 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
1970 */
1971static ev_ret_t
1972mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
1973{
1974	struct socket *mp_so, *so;
1975	struct mptcb *mp_tp;
1976	boolean_t linger;
1977
1978	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1979	MPTS_LOCK_ASSERT_HELD(mpts);
1980	VERIFY(mpte->mpte_mppcb != NULL);
1981	mp_so = mpte->mpte_mppcb->mpp_socket;
1982	mp_tp = mpte->mpte_mptcb;
1983	so = mpts->mpts_socket;
1984
1985	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
1986	    !(mp_so->so_flags & SOF_PCBCLEARING));
1987
1988	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
1989	    mpts->mpts_connid, (linger ? "YES" : "NO")));
1990
1991	if (mpts->mpts_soerror == 0)
1992		mpts->mpts_soerror = ECONNREFUSED;
1993
1994	/*
1995	 * We got a TCP RST for this subflow connection.
1996	 *
1997	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
1998	 * client if the MPTCP connection has not been established. Otherwise
1999	 * we close the socket.
2000	 */
2001	mptcp_subflow_disconnect(mpte, mpts, !linger);
2002
2003	MPT_LOCK(mp_tp);
2004	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2005		mp_so->so_error = ECONNREFUSED;
2006	}
2007	MPT_UNLOCK(mp_tp);
2008
2009	/*
2010	 * Keep the subflow socket around, unless the MPTCP socket has
2011	 * been detached or the subflow has been disconnected explicitly,
2012	 * in which case it should be deleted right away.
2013	 */
2014	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2015}
2016
2017/*
2018 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2019 */
2020static ev_ret_t
2021mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
2022{
2023	struct socket *so;
2024
2025	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2026	MPTS_LOCK_ASSERT_HELD(mpts);
2027
2028	so = mpts->mpts_socket;
2029
2030	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2031
2032	/*
2033	 * We got a FIN for this subflow connection.  This subflow socket
2034	 * is no longer available for receiving data;
2035	 * The FIN may arrive with data. The data is handed up to the
2036	 * mptcp socket and the subflow is disconnected.
2037	 */
2038
2039	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2040}
2041
2042/*
2043 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2044 */
2045static ev_ret_t
2046mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
2047{
2048	struct socket *so;
2049
2050	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2051	MPTS_LOCK_ASSERT_HELD(mpts);
2052
2053	so = mpts->mpts_socket;
2054
2055	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2056	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2057}
2058
2059/*
2060 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2061 */
2062static ev_ret_t
2063mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
2064{
2065	struct socket *mp_so, *so;
2066	struct mptcb *mp_tp;
2067	boolean_t linger;
2068
2069	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2070	MPTS_LOCK_ASSERT_HELD(mpts);
2071	VERIFY(mpte->mpte_mppcb != NULL);
2072	mp_so = mpte->mpte_mppcb->mpp_socket;
2073	mp_tp = mpte->mpte_mptcb;
2074	so = mpts->mpts_socket;
2075
2076	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2077	    !(mp_so->so_flags & SOF_PCBCLEARING));
2078
2079	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
2080	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2081
2082	if (mpts->mpts_soerror == 0)
2083		mpts->mpts_soerror = ETIMEDOUT;
2084
2085	/*
2086	 * The subflow connection has timed out.
2087	 *
2088	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2089	 * client if the MPTCP connection has not been established. Otherwise
2090	 * drop it.
2091	 */
2092	mptcp_subflow_disconnect(mpte, mpts, !linger);
2093
2094	MPT_LOCK(mp_tp);
2095	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2096		mp_so->so_error = ETIMEDOUT;
2097	}
2098	MPT_UNLOCK(mp_tp);
2099
2100	/*
2101	 * Keep the subflow socket around, unless the MPTCP socket has
2102	 * been detached or the subflow has been disconnected explicitly,
2103	 * in which case it should be deleted right away.
2104	 */
2105	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2106}
2107
2108/*
2109 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2110 */
2111static ev_ret_t
2112mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
2113{
2114	struct socket *mp_so, *so;
2115	struct mptcb *mp_tp;
2116	boolean_t linger;
2117	struct tcpcb *tp = NULL;
2118
2119	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2120	MPTS_LOCK_ASSERT_HELD(mpts);
2121
2122	VERIFY(mpte->mpte_mppcb != NULL);
2123	mp_so = mpte->mpte_mppcb->mpp_socket;
2124	mp_tp = mpte->mpte_mptcb;
2125	so = mpts->mpts_socket;
2126
2127	/* Not grabbing socket lock as t_local_aid is write once only */
2128	tp = intotcpcb(sotoinpcb(so));
2129	/*
2130	 * This overwrites any previous mpte_lost_aid to avoid storing
2131	 * too much state when the typical case has only two subflows.
2132	 */
2133	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2134	mpte->mpte_lost_aid = tp->t_local_aid;
2135
2136	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2137	    !(mp_so->so_flags & SOF_PCBCLEARING));
2138
2139	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2140	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2141
2142	if (mpts->mpts_soerror == 0)
2143		mpts->mpts_soerror = EADDRNOTAVAIL;
2144
2145	/*
2146	 * The subflow connection has lost its source address.
2147	 *
2148	 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2149	 * client if the MPTCP connection has not been established.  If it
2150	 * has been established with one subflow , we keep the MPTCP
2151	 * connection valid without any subflows till closed by application.
2152	 * This lets tcp connection manager decide whether to close this or
2153	 * not as it reacts to reachability changes too.
2154	 */
2155	mptcp_subflow_disconnect(mpte, mpts, !linger);
2156
2157	MPT_LOCK(mp_tp);
2158	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2159	    (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2160		mp_so->so_error = EADDRNOTAVAIL;
2161	}
2162	MPT_UNLOCK(mp_tp);
2163
2164	/*
2165	 * Keep the subflow socket around, unless the MPTCP socket has
2166	 * been detached or the subflow has been disconnected explicitly,
2167	 * in which case it should be deleted right away.
2168	 */
2169	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2170}
2171
2172/*
2173 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2174 */
2175static ev_ret_t
2176mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
2177{
2178	struct mptsub *mpts_alt = NULL;
2179	struct socket *so = NULL;
2180	struct socket *mp_so;
2181	int altpath_exists = 0;
2182
2183	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
2184	MPTS_LOCK_ASSERT_HELD(mpts);
2185	mp_so = mpte->mpte_mppcb->mpp_socket;
2186	mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
2187	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
2188
2189	MPTS_UNLOCK(mpts);
2190	mpts_alt = mptcp_get_subflow(mpte, mpts);
2191
2192	/*
2193	 * If there is no alternate eligible subflow, ignore the
2194	 * failover hint.
2195	 */
2196	if (mpts_alt == NULL) {
2197		mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
2198		MPTS_LOCK(mpts);
2199		goto done;
2200	}
2201	MPTS_LOCK(mpts_alt);
2202	altpath_exists = 1;
2203	so = mpts_alt->mpts_socket;
2204	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2205		socket_lock(so, 1);
2206		/* All data acknowledged */
2207		if (so->so_snd.sb_cc == 0) {
2208			so->so_flags &= ~SOF_MP_TRYFAILOVER;
2209			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2210		} else {
2211			/* no alternate path available */
2212			altpath_exists = 0;
2213		}
2214		socket_unlock(so, 1);
2215	}
2216	if (altpath_exists) {
2217		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
2218		struct mptcb *mp_tp = mpte->mpte_mptcb;
2219		/* Bring the subflow's notion of snd_nxt into the send window */
2220		MPT_LOCK(mp_tp);
2221		mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2222		MPT_UNLOCK(mp_tp);
2223		mpte->mpte_active_sub = mpts_alt;
2224		socket_lock(so, 1);
2225		sowwakeup(so);
2226		socket_unlock(so, 1);
2227	}
2228	MPTS_UNLOCK(mpts_alt);
2229
2230	if (altpath_exists) {
2231		soevent(mp_so,
2232		    SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2233		mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
2234		    "%d to %d\n", __func__,
2235		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2236		    mpts->mpts_connid, mpts_alt->mpts_connid));
2237		tcpstat.tcps_mp_switches++;
2238	}
2239
2240	MPTS_LOCK(mpts);
2241	if (altpath_exists) {
2242		mpts->mpts_flags |= MPTSF_FAILINGOVER;
2243		mpts->mpts_flags &= ~MPTSF_ACTIVE;
2244	} else {
2245		so = mpts->mpts_socket;
2246		socket_lock(so, 1);
2247		so->so_flags &= ~SOF_MP_TRYFAILOVER;
2248		socket_unlock(so, 1);
2249	}
2250done:
2251	MPTS_LOCK_ASSERT_HELD(mpts);
2252	return (MPTS_EVRET_OK);
2253}
2254
2255/*
2256 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2257 */
2258static ev_ret_t
2259mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
2260{
2261	struct socket *mp_so, *so;
2262	struct mptcb *mp_tp;
2263	boolean_t linger;
2264
2265	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2266	MPTS_LOCK_ASSERT_HELD(mpts);
2267	VERIFY(mpte->mpte_mppcb != NULL);
2268	mp_so = mpte->mpte_mppcb->mpp_socket;
2269	mp_tp = mpte->mpte_mptcb;
2270	so = mpts->mpts_socket;
2271
2272	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2273	    !(mp_so->so_flags & SOF_PCBCLEARING));
2274
2275	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2276	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2277
2278	if (mpts->mpts_soerror == 0)
2279		mpts->mpts_soerror = EHOSTUNREACH;
2280
2281	/*
2282	 * The subflow connection cannot use the outgoing interface.
2283	 *
2284	 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2285	 * client if the MPTCP connection has not been established.  If it
2286	 * has been established, let the upper layer call disconnectx.
2287	 */
2288	mptcp_subflow_disconnect(mpte, mpts, !linger);
2289	MPTS_UNLOCK(mpts);
2290
2291	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);
2292
2293	MPT_LOCK(mp_tp);
2294	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2295		mp_so->so_error = EHOSTUNREACH;
2296	}
2297	MPT_UNLOCK(mp_tp);
2298
2299	MPTS_LOCK(mpts);
2300	/*
2301	 * Keep the subflow socket around, unless the MPTCP socket has
2302	 * been detached or the subflow has been disconnected explicitly,
2303	 * in which case it should be deleted right away.
2304	 */
2305	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2306}
2307
2308/*
2309 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2310 */
2311static ev_ret_t
2312mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts)
2313{
2314	struct socket *so;
2315
2316	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2317	MPTS_LOCK_ASSERT_HELD(mpts);
2318
2319	so = mpts->mpts_socket;
2320
2321	/* the subflow connection is being flow controlled */
2322	mpts->mpts_flags |= MPTSF_SUSPENDED;
2323
2324	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
2325	    mpts->mpts_connid));
2326
2327	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2328}
2329
2330/*
2331 * Handle SO_FILT_HINT_RESUME subflow socket event.
2332 */
2333static ev_ret_t
2334mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts)
2335{
2336	struct socket *so;
2337
2338	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2339	MPTS_LOCK_ASSERT_HELD(mpts);
2340
2341	so = mpts->mpts_socket;
2342
2343	/* the subflow connection is no longer flow controlled */
2344	mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2345
2346	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
2347
2348	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2349}
2350
2351/*
2352 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2353 */
2354static ev_ret_t
2355mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts)
2356{
2357	char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2358	struct sockaddr_entry *src_se, *dst_se;
2359	struct sockaddr_storage src;
2360	struct socket *mp_so, *so;
2361	struct mptcb *mp_tp;
2362	struct ifnet *outifp;
2363	int af, error = 0;
2364	boolean_t mpok = FALSE;
2365
2366	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2367	VERIFY(mpte->mpte_mppcb != NULL);
2368	mp_so = mpte->mpte_mppcb->mpp_socket;
2369	mp_tp = mpte->mpte_mptcb;
2370
2371	MPTS_LOCK_ASSERT_HELD(mpts);
2372	so = mpts->mpts_socket;
2373	af = mpts->mpts_family;
2374
2375	if (mpts->mpts_flags & MPTSF_CONNECTED)
2376		return (MPTS_EVRET_OK);
2377
2378	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2379	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
2380		return (MPTS_EVRET_OK);
2381	}
2382
2383	/*
2384	 * The subflow connection has been connected.  Find out whether it
2385	 * is connected as a regular TCP or as a MPTCP subflow.  The idea is:
2386	 *
2387	 *   a. If MPTCP connection is not yet established, then this must be
2388	 *	the first subflow connection.  If MPTCP failed to negotiate,
2389	 *	indicate to the MPTCP socket client via EPROTO, that the
2390	 *	underlying TCP connection may be peeled off via peeloff(2).
2391	 *	Otherwise, mark the MPTCP socket as connected.
2392	 *
2393	 *   b. If MPTCP connection has been established, then this must be
2394	 *	one of the subsequent subflow connections. If MPTCP failed
2395	 *	to negotiate, disconnect the connection since peeloff(2)
2396	 *	is no longer possible.
2397	 *
2398	 * Right now, we simply unblock any waiters at the MPTCP socket layer
2399	 * if the MPTCP connection has not been established.
2400	 */
2401	socket_lock(so, 0);
2402
2403	if (so->so_state & SS_ISDISCONNECTED) {
2404		/*
2405		 * With MPTCP joins, a connection is connected at the subflow
2406		 * level, but the 4th ACK from the server elevates the MPTCP
2407		 * subflow to connected state. So there is a small window
2408		 * where the subflow could get disconnected before the
2409		 * connected event is processed.
2410		 */
2411		socket_unlock(so, 0);
2412		return (MPTS_EVRET_OK);
2413	}
2414
2415	mpts->mpts_soerror = 0;
2416	mpts->mpts_flags &= ~MPTSF_CONNECTING;
2417	mpts->mpts_flags |= MPTSF_CONNECTED;
2418	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2419		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2420
2421	VERIFY(mpts->mpts_dst_sl != NULL);
2422	dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2423	VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2424	    dst_se->se_addr->sa_family == af);
2425
2426	VERIFY(mpts->mpts_src_sl != NULL);
2427	src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2428	VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2429	    src_se->se_addr->sa_family == af);
2430
2431	/* get/check source IP address */
2432	switch (af) {
2433	case AF_INET: {
2434		error = in_getsockaddr_s(so, &src);
2435		if (error == 0) {
2436			struct sockaddr_in *ms = SIN(src_se->se_addr);
2437			struct sockaddr_in *s = SIN(&src);
2438
2439			VERIFY(s->sin_len == ms->sin_len);
2440			VERIFY(ms->sin_family == AF_INET);
2441
2442			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2443			    bcmp(&ms->sin_addr, &s->sin_addr,
2444			    sizeof (ms->sin_addr)) != 0) {
2445				mptcplog((LOG_ERR, "%s: cid %d local "
2446				    "address %s (expected %s)\n", __func__,
2447				    mpts->mpts_connid, inet_ntop(AF_INET,
2448				    (void *)&s->sin_addr.s_addr, buf0,
2449				    sizeof (buf0)), inet_ntop(AF_INET,
2450				    (void *)&ms->sin_addr.s_addr, buf1,
2451				    sizeof (buf1))));
2452			}
2453			bcopy(s, ms, sizeof (*s));
2454		}
2455		break;
2456	}
2457#if INET6
2458	case AF_INET6: {
2459		error = in6_getsockaddr_s(so, &src);
2460		if (error == 0) {
2461			struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2462			struct sockaddr_in6 *s = SIN6(&src);
2463
2464			VERIFY(s->sin6_len == ms->sin6_len);
2465			VERIFY(ms->sin6_family == AF_INET6);
2466
2467			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2468			    bcmp(&ms->sin6_addr, &s->sin6_addr,
2469			    sizeof (ms->sin6_addr)) != 0) {
2470				mptcplog((LOG_ERR, "%s: cid %d local "
2471				    "address %s (expected %s)\n", __func__,
2472				    mpts->mpts_connid, inet_ntop(AF_INET6,
2473				    (void *)&s->sin6_addr, buf0,
2474				    sizeof (buf0)), inet_ntop(AF_INET6,
2475				    (void *)&ms->sin6_addr, buf1,
2476				    sizeof (buf1))));
2477			}
2478			bcopy(s, ms, sizeof (*s));
2479		}
2480		break;
2481	}
2482#endif /* INET6 */
2483	default:
2484		VERIFY(0);
2485		/* NOTREACHED */
2486	}
2487
2488	if (error != 0) {
2489		mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n",
2490		    __func__, mpts->mpts_connid, error));
2491	}
2492
2493	/* get/verify the outbound interface */
2494	outifp = sotoinpcb(so)->inp_last_outifp;	/* could be NULL */
2495	if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2496		VERIFY(mpts->mpts_outif != NULL);
2497		if (mpts->mpts_outif != outifp) {
2498			mptcplog((LOG_ERR, "%s: cid %d outif %s "
2499			    "(expected %s)\n", __func__, mpts->mpts_connid,
2500			    ((outifp != NULL) ? outifp->if_xname : "NULL"),
2501			    mpts->mpts_outif->if_xname));
2502			if (outifp == NULL)
2503				outifp = mpts->mpts_outif;
2504		}
2505	} else {
2506		mpts->mpts_outif = outifp;
2507	}
2508
2509	socket_unlock(so, 0);
2510
2511	mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] "
2512	    "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2513	    outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2514	    (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2515	    (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2516	    ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2517	    ntohs(SIN6(src_se->se_addr)->sin6_port)),
2518	    inet_ntop(af, ((af == AF_INET) ?
2519	    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2520	    (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2521	    ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2522	    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2523	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
2524	    "MPTCP capable" : "a regular TCP")));
2525
2526	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2527	MPTS_UNLOCK(mpts);
2528
2529	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2530
2531	MPT_LOCK(mp_tp);
2532	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2533		/* case (a) above */
2534		if (!mpok) {
2535			mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2536			(void) mptcp_drop(mpte, mp_tp, EPROTO);
2537			MPT_UNLOCK(mp_tp);
2538		} else {
2539			if (mptcp_init_authparms(mp_tp) != 0) {
2540				mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2541				(void) mptcp_drop(mpte, mp_tp, EPROTO);
2542				MPT_UNLOCK(mp_tp);
2543				mpok = FALSE;
2544			} else {
2545				mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2546				mpte->mpte_associd = mpts->mpts_connid;
2547				DTRACE_MPTCP2(state__change,
2548				    struct mptcb *, mp_tp,
2549				    uint32_t, 0 /* event */);
2550				mptcp_init_statevars(mp_tp);
2551				MPT_UNLOCK(mp_tp);
2552
2553				(void) mptcp_setconnorder(mpte,
2554				    mpts->mpts_connid, 1);
2555				soisconnected(mp_so);
2556			}
2557		}
2558		MPTS_LOCK(mpts);
2559		if (mpok) {
2560			/* Initialize the relative sequence number */
2561			mpts->mpts_rel_seq = 1;
2562			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2563			mpte->mpte_nummpcapflows++;
2564			MPT_LOCK_SPIN(mp_tp);
2565			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2566			MPT_UNLOCK(mp_tp);
2567		}
2568	} else if (mpok) {
2569		MPT_UNLOCK(mp_tp);
2570		/*
2571		 * case (b) above
2572		 * In case of additional flows, the MPTCP socket is not
2573		 * MPTSF_MP_CAPABLE until an ACK is received from server
2574		 * for 3-way handshake.  TCP would have guaranteed that this
2575		 * is an MPTCP subflow.
2576		 */
2577		MPTS_LOCK(mpts);
2578		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2579		mpte->mpte_nummpcapflows++;
2580		mpts->mpts_rel_seq = 1;
2581		MPT_LOCK_SPIN(mp_tp);
2582		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2583		MPT_UNLOCK(mp_tp);
2584	}
2585	MPTS_LOCK_ASSERT_HELD(mpts);
2586
2587	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
2588}
2589
2590/*
2591 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2592 */
2593static ev_ret_t
2594mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts)
2595{
2596	struct socket *mp_so, *so;
2597	struct mptcb *mp_tp;
2598	boolean_t linger;
2599
2600	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2601	MPTS_LOCK_ASSERT_HELD(mpts);
2602	VERIFY(mpte->mpte_mppcb != NULL);
2603	mp_so = mpte->mpte_mppcb->mpp_socket;
2604	mp_tp = mpte->mpte_mptcb;
2605	so = mpts->mpts_socket;
2606
2607	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2608	    !(mp_so->so_flags & SOF_PCBCLEARING));
2609
2610	mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
2611	    mpts->mpts_connid, (linger ? "YES" : "NO")));
2612
2613	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2614		return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2615
2616	/*
2617	 * Clear flags that are used by getconninfo to return state.
2618	 * Retain like MPTSF_DELETEOK, MPTSF_ACTIVE for internal purposes.
2619	 */
2620	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
2621	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
2622	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
2623	    MPTSF_SUSPENDED|MPTSF_ACTIVE);
2624	mpts->mpts_flags |= MPTSF_DISCONNECTED;
2625
2626	/*
2627	 * The subflow connection has been disconnected.
2628	 *
2629	 * Right now, we simply unblock any waiters at the MPTCP socket layer
2630	 * if the MPTCP connection has not been established.
2631	 */
2632	MPTS_UNLOCK(mpts);
2633
2634	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2635
2636	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
2637		mpte->mpte_nummpcapflows--;
2638		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
2639	}
2640
2641	MPT_LOCK(mp_tp);
2642	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2643		MPT_UNLOCK(mp_tp);
2644		soisdisconnected(mp_so);
2645	} else {
2646		MPT_UNLOCK(mp_tp);
2647	}
2648
2649	MPTS_LOCK(mpts);
2650	/*
2651	 * The underlying subflow socket has been disconnected;
2652	 * it is no longer useful to us.  Keep the subflow socket
2653	 * around, unless the MPTCP socket has been detached or
2654	 * the subflow has been disconnected explicitly, in which
2655	 * case it should be deleted right away.
2656	 */
2657	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2658}
2659
2660/*
2661 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
2662 */
2663static ev_ret_t
2664mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts)
2665{
2666	struct socket *mp_so, *so;
2667	struct mptcb *mp_tp;
2668	ev_ret_t ret = MPTS_EVRET_OK_UPDATE;
2669
2670	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2671	VERIFY(mpte->mpte_mppcb != NULL);
2672	mp_so = mpte->mpte_mppcb->mpp_socket;
2673	mp_tp = mpte->mpte_mptcb;
2674
2675	MPTS_LOCK_ASSERT_HELD(mpts);
2676	so = mpts->mpts_socket;
2677
2678	socket_lock(so, 0);
2679	MPT_LOCK(mp_tp);
2680
2681	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
2682		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2683	else
2684		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
2685
2686	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
2687		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
2688			goto done;
2689		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
2690	}
2691	else
2692		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
2693
2694	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
2695		mpts->mpts_flags |= MPTSF_MP_READY;
2696	else
2697		mpts->mpts_flags &= ~MPTSF_MP_READY;
2698
2699	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
2700		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
2701		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
2702	}
2703
2704	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2705		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
2706		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
2707	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
2708		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
2709		ret = MPTS_EVRET_CONNECT_PENDING;
2710	}
2711
2712	mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
2713	    "mptsf=%b\n", __func__,
2714	    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
2715	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
2716	    mpts->mpts_flags, MPTSF_BITS));
2717done:
2718	MPT_UNLOCK(mp_tp);
2719	socket_unlock(so, 0);
2720
2721	return (ret);
2722}
2723
2724/*
2725 * Handle SO_FILT_HINT_MUSTRST subflow socket event
2726 */
2727static ev_ret_t
2728mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts)
2729{
2730	struct socket *mp_so, *so;
2731	struct mptcb *mp_tp;
2732	boolean_t linger;
2733
2734
2735	MPTE_LOCK_ASSERT_HELD(mpte);    /* same as MP socket lock */
2736	MPTS_LOCK_ASSERT_HELD(mpts);
2737	VERIFY(mpte->mpte_mppcb != NULL);
2738	mp_so = mpte->mpte_mppcb->mpp_socket;
2739	mp_tp = mpte->mpte_mptcb;
2740	so = mpts->mpts_socket;
2741
2742	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2743	    !(mp_so->so_flags & SOF_PCBCLEARING));
2744
2745	if (mpts->mpts_soerror == 0)
2746		mpts->mpts_soerror = ECONNABORTED;
2747
2748	so->so_error = ECONNABORTED;
2749
2750	/* We got an invalid option or a fast close */
2751	socket_lock(so, 0);
2752	struct tcptemp *t_template;
2753	struct inpcb *inp = sotoinpcb(so);
2754	struct tcpcb *tp = NULL;
2755
2756	tp = intotcpcb(inp);
2757
2758	t_template = tcp_maketemplate(tp);
2759	if (t_template) {
2760		unsigned int ifscope, nocell = 0;
2761
2762		if (inp->inp_flags & INP_BOUND_IF)
2763			ifscope = inp->inp_boundifp->if_index;
2764		else
2765			ifscope = IFSCOPE_NONE;
2766
2767		if (inp->inp_flags & INP_NO_IFT_CELLULAR)
2768			nocell = 1;
2769
2770		tcp_respond(tp, t_template->tt_ipgen,
2771		    &t_template->tt_t, (struct mbuf *)NULL,
2772		    tp->rcv_nxt, tp->snd_una, TH_RST, ifscope, nocell);
2773		(void) m_free(dtom(t_template));
2774		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n",
2775		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2776		    so, mpts->mpts_connid));
2777	}
2778	socket_unlock(so, 0);
2779	mptcp_subflow_disconnect(mpte, mpts, !linger);
2780	MPTS_UNLOCK(mpts);
2781
2782	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
2783
2784	MPT_LOCK(mp_tp);
2785	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2786		mp_so->so_error = ECONNABORTED;
2787	}
2788	MPT_UNLOCK(mp_tp);
2789
2790	MPTS_LOCK(mpts);
2791	/*
2792	 * Keep the subflow socket around unless the subflow has been
2793	 * disconnected explicitly.
2794	 */
2795	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2796}
2797
2798static const char *
2799mptcp_evret2str(ev_ret_t ret)
2800{
2801	const char *c = "UNKNOWN";
2802
2803	switch (ret) {
2804	case MPTS_EVRET_DELETE:
2805		c = "MPTS_EVRET_DELETE";
2806		break;
2807	case MPTS_EVRET_CONNECT_PENDING:
2808		c = "MPTS_EVRET_CONNECT_PENDING";
2809		break;
2810	case MPTS_EVRET_DISCONNECT_FALLBACK:
2811		c = "MPTS_EVRET_DISCONNECT_FALLBACK";
2812		break;
2813	case MPTS_EVRET_OK:
2814		c = "MPTS_EVRET_OK";
2815		break;
2816	case MPTS_EVRET_OK_UPDATE:
2817		c = "MPTS_EVRET_OK_UPDATE";
2818		break;
2819	}
2820	return (c);
2821}
2822
2823/*
2824 * Add a reference to a subflow structure; used by MPTS_ADDREF().
2825 */
2826void
2827mptcp_subflow_addref(struct mptsub *mpts, int locked)
2828{
2829	if (!locked)
2830		MPTS_LOCK(mpts);
2831	else
2832		MPTS_LOCK_ASSERT_HELD(mpts);
2833
2834	if (++mpts->mpts_refcnt == 0) {
2835		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
2836		/* NOTREACHED */
2837	}
2838	if (!locked)
2839		MPTS_UNLOCK(mpts);
2840}
2841
2842/*
2843 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
2844 */
2845void
2846mptcp_subflow_remref(struct mptsub *mpts)
2847{
2848	MPTS_LOCK(mpts);
2849	if (mpts->mpts_refcnt == 0) {
2850		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
2851		/* NOTREACHED */
2852	}
2853	if (--mpts->mpts_refcnt > 0) {
2854		MPTS_UNLOCK(mpts);
2855		return;
2856	}
2857	/* callee will unlock and destroy lock */
2858	mptcp_subflow_free(mpts);
2859}
2860
2861/*
2862 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
2863 * caller must ensure that the option can be issued on subflow sockets, via
2864 * MPOF_SUBFLOW_OK flag.
2865 */
2866int
2867mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
2868    struct mptopt *mpo)
2869{
2870	struct socket *mp_so;
2871	struct sockopt sopt;
2872	char buf[32];
2873	int error;
2874
2875	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
2876	mpo->mpo_flags &= ~MPOF_INTERIM;
2877
2878	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2879	mp_so = mpte->mpte_mppcb->mpp_socket;
2880
2881	bzero(&sopt, sizeof (sopt));
2882	sopt.sopt_dir = SOPT_SET;
2883	sopt.sopt_level = mpo->mpo_level;
2884	sopt.sopt_name = mpo->mpo_name;
2885	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
2886	sopt.sopt_valsize = sizeof (int);
2887	sopt.sopt_p = kernproc;
2888
2889	error = sosetoptlock(so, &sopt, 0);	/* already locked */
2890	if (error == 0) {
2891		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
2892		    "val %d set successful\n", __func__,
2893		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2894		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2895		    buf, sizeof (buf)), mpo->mpo_intval));
2896	} else {
2897		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
2898		    "val %d set error %d\n", __func__,
2899		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2900		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2901		    buf, sizeof (buf)), mpo->mpo_intval, error));
2902	}
2903	return (error);
2904}
2905
2906/*
2907 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
2908 * caller must ensure that the option can be issued on subflow sockets, via
2909 * MPOF_SUBFLOW_OK flag.
2910 */
2911int
2912mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
2913    struct mptopt *mpo)
2914{
2915	struct socket *mp_so;
2916	struct sockopt sopt;
2917	char buf[32];
2918	int error;
2919
2920	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
2921	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
2922	mp_so = mpte->mpte_mppcb->mpp_socket;
2923
2924	bzero(&sopt, sizeof (sopt));
2925	sopt.sopt_dir = SOPT_GET;
2926	sopt.sopt_level = mpo->mpo_level;
2927	sopt.sopt_name = mpo->mpo_name;
2928	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
2929	sopt.sopt_valsize = sizeof (int);
2930	sopt.sopt_p = kernproc;
2931
2932	error = sogetoptlock(so, &sopt, 0);	/* already locked */
2933	if (error == 0) {
2934		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
2935		    "val %d get successful\n", __func__,
2936		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2937		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
2938		    buf, sizeof (buf)), mpo->mpo_intval));
2939	} else {
2940		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n",
2941		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2942		    mptcp_sopt2str(mpo->mpo_level,
2943		    mpo->mpo_name, buf, sizeof (buf)), error));
2944	}
2945	return (error);
2946}
2947
2948
2949/*
2950 * MPTCP garbage collector.
2951 *
2952 * This routine is called by the MP domain on-demand, periodic callout,
2953 * which is triggered when a MPTCP socket is closed.  The callout will
2954 * repeat as long as this routine returns a non-zero value.
2955 */
2956static uint32_t
2957mptcp_gc(struct mppcbinfo *mppi)
2958{
2959	struct mppcb *mpp, *tmpp;
2960	uint32_t active = 0;
2961
2962	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
2963
2964	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));
2965
2966	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
2967		struct socket *mp_so;
2968		struct mptses *mpte;
2969		struct mptcb *mp_tp;
2970
2971		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
2972		mp_so = mpp->mpp_socket;
2973		VERIFY(mp_so != NULL);
2974		mpte = mptompte(mpp);
2975		VERIFY(mpte != NULL);
2976		mp_tp = mpte->mpte_mptcb;
2977		VERIFY(mp_tp != NULL);
2978
2979		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
2980		    "(u=%d,r=%d,s=%d)\n", __func__,
2981		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
2982		    mp_so->so_retaincnt, mpp->mpp_state));
2983
2984		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
2985			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
2986			    "(u=%d,r=%d)\n", __func__,
2987			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2988			    mp_so->so_usecount, mp_so->so_retaincnt));
2989			active++;
2990			continue;
2991		}
2992
2993		/* check again under the lock */
2994		if (mp_so->so_usecount > 1) {
2995			boolean_t wakeup = FALSE;
2996			struct mptsub *mpts, *tmpts;
2997
2998			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
2999			    "[u=%d,r=%d] %d %d\n", __func__,
3000			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3001			    mp_so->so_usecount, mp_so->so_retaincnt,
3002			    mp_tp->mpt_gc_ticks,
3003			    mp_tp->mpt_state));
3004			MPT_LOCK(mp_tp);
3005			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3006				if (mp_tp->mpt_gc_ticks > 0)
3007					mp_tp->mpt_gc_ticks--;
3008				if (mp_tp->mpt_gc_ticks == 0) {
3009					wakeup = TRUE;
3010					if (mp_tp->mpt_localkey != NULL) {
3011						mptcp_free_key(
3012						    mp_tp->mpt_localkey);
3013						mp_tp->mpt_localkey = NULL;
3014					}
3015				}
3016			}
3017			MPT_UNLOCK(mp_tp);
3018			if (wakeup) {
3019				TAILQ_FOREACH_SAFE(mpts,
3020				    &mpte->mpte_subflows, mpts_entry, tmpts) {
3021					MPTS_LOCK(mpts);
3022					mpts->mpts_flags |= MPTSF_DELETEOK;
3023					if (mpts->mpts_soerror == 0)
3024						mpts->mpts_soerror = ETIMEDOUT;
3025					mptcp_subflow_eupcall(mpts->mpts_socket,
3026					    mpts, SO_FILT_HINT_DISCONNECTED);
3027					MPTS_UNLOCK(mpts);
3028				}
3029			}
3030			lck_mtx_unlock(&mpp->mpp_lock);
3031			active++;
3032			continue;
3033		}
3034
3035		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3036			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
3037			    "[u=%d,r=%d,s=%d]\n", __func__,
3038			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3039			    mp_so->so_usecount, mp_so->so_retaincnt,
3040			    mpp->mpp_state));
3041			lck_mtx_unlock(&mpp->mpp_lock);
3042			active++;
3043			continue;
3044		}
3045
3046		/*
3047		 * The PCB has been detached, and there is exactly 1 refnct
3048		 * held by the MPTCP thread.  Signal that thread to terminate,
3049		 * after which the last refcnt will be released.  That will
3050		 * allow it to be destroyed below during the next round.
3051		 */
3052		if (mp_so->so_usecount == 1) {
3053			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
3054			    "termination [u=%d,r=%d]\n", __func__,
3055			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3056			    mp_so->so_usecount, mp_so->so_retaincnt));
3057			/* signal MPTCP thread to terminate */
3058			mptcp_thread_terminate_signal(mpte);
3059			lck_mtx_unlock(&mpp->mpp_lock);
3060			active++;
3061			continue;
3062		}
3063
3064		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
3065		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3066		    mp_so->so_usecount, mp_so->so_retaincnt));
3067		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
3068		    struct sockbuf *, &mp_so->so_rcv,
3069		    struct sockbuf *, &mp_so->so_snd,
3070		    struct mppcb *, mpp);
3071
3072		mp_pcbdispose(mpp);
3073	}
3074
3075	return (active);
3076}
3077
3078/*
3079 * Drop a MPTCP connection, reporting the specified error.
3080 */
3081struct mptses *
3082mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3083{
3084	struct socket *mp_so;
3085
3086	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3087	MPT_LOCK_ASSERT_HELD(mp_tp);
3088	VERIFY(mpte->mpte_mptcb == mp_tp);
3089	mp_so = mpte->mpte_mppcb->mpp_socket;
3090
3091	mp_tp->mpt_state = MPTCPS_CLOSED;
3092	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3093	    uint32_t, 0 /* event */);
3094
3095	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3096		errno = mp_tp->mpt_softerror;
3097	mp_so->so_error = errno;
3098
3099	return (mptcp_close(mpte, mp_tp));
3100}
3101
3102/*
3103 * Close a MPTCP control block.
3104 */
3105struct mptses *
3106mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3107{
3108	struct socket *mp_so;
3109	struct mptsub *mpts, *tmpts;
3110
3111	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
3112	MPT_LOCK_ASSERT_HELD(mp_tp);
3113	VERIFY(mpte->mpte_mptcb == mp_tp);
3114	mp_so = mpte->mpte_mppcb->mpp_socket;
3115	if (mp_tp->mpt_localkey != NULL) {
3116		mptcp_free_key(mp_tp->mpt_localkey);
3117		mp_tp->mpt_localkey = NULL;
3118	}
3119
3120	MPT_UNLOCK(mp_tp);
3121	soisdisconnected(mp_so);
3122
3123	MPT_LOCK(mp_tp);
3124	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3125		return (NULL);
3126	}
3127	MPT_UNLOCK(mp_tp);
3128
3129	/* Clean up all subflows */
3130	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3131		MPTS_LOCK(mpts);
3132		mptcp_subflow_disconnect(mpte, mpts, TRUE);
3133		MPTS_UNLOCK(mpts);
3134		mptcp_subflow_del(mpte, mpts, TRUE);
3135	}
3136	MPT_LOCK(mp_tp);
3137
3138	return (NULL);
3139}
3140
3141void
3142mptcp_notify_close(struct socket *so)
3143{
3144	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3145}
3146
3147/*
3148 * Signal MPTCP thread to wake up.
3149 */
3150void
3151mptcp_thread_signal(struct mptses *mpte)
3152{
3153	lck_mtx_lock(&mpte->mpte_thread_lock);
3154	mptcp_thread_signal_locked(mpte);
3155	lck_mtx_unlock(&mpte->mpte_thread_lock);
3156}
3157
3158/*
3159 * Signal MPTCP thread to wake up (locked version)
3160 */
3161static void
3162mptcp_thread_signal_locked(struct mptses *mpte)
3163{
3164	lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3165
3166	mpte->mpte_thread_reqs++;
3167	if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3168		wakeup_one((caddr_t)&mpte->mpte_thread);
3169}
3170
3171/*
3172 * Signal MPTCP thread to terminate.
3173 */
3174static void
3175mptcp_thread_terminate_signal(struct mptses *mpte)
3176{
3177	lck_mtx_lock(&mpte->mpte_thread_lock);
3178	if (mpte->mpte_thread != THREAD_NULL) {
3179		mpte->mpte_thread = THREAD_NULL;
3180		mpte->mpte_thread_reqs++;
3181		if (!mpte->mpte_thread_active)
3182			wakeup_one((caddr_t)&mpte->mpte_thread);
3183	}
3184	lck_mtx_unlock(&mpte->mpte_thread_lock);
3185}
3186
3187/*
3188 * MPTCP thread workloop.
3189 */
3190static void
3191mptcp_thread_dowork(struct mptses *mpte)
3192{
3193	struct socket *mp_so;
3194	struct mptsub *mpts, *tmpts;
3195	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3196	boolean_t conninfo_update = FALSE;
3197
3198	MPTE_LOCK(mpte);		/* same as MP socket lock */
3199	VERIFY(mpte->mpte_mppcb != NULL);
3200	mp_so = mpte->mpte_mppcb->mpp_socket;
3201	VERIFY(mp_so != NULL);
3202
3203	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3204		ev_ret_t ret;
3205
3206		MPTS_LOCK(mpts);
3207		MPTS_ADDREF_LOCKED(mpts);	/* for us */
3208
3209		/* Update process ownership based on parent mptcp socket */
3210		mptcp_update_last_owner(mpts, mp_so);
3211
3212		mptcp_subflow_input(mpte, mpts);
3213		ret = mptcp_subflow_events(mpte, mpts);
3214
3215		if (mpts->mpts_flags & MPTSF_ACTIVE) {
3216			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
3217			    mpts->mpts_connid));
3218			(void) mptcp_subflow_output(mpte, mpts);
3219		}
3220
3221		/*
3222		 * If MPTCP socket is closed, disconnect all subflows.
3223		 * This will generate a disconnect event which will
3224		 * be handled during the next iteration, causing a
3225		 * non-zero error to be returned above.
3226		 */
3227		if (mp_so->so_flags & SOF_PCBCLEARING)
3228			mptcp_subflow_disconnect(mpte, mpts, FALSE);
3229		MPTS_UNLOCK(mpts);
3230
3231		switch (ret) {
3232		case MPTS_EVRET_OK_UPDATE:
3233			conninfo_update = TRUE;
3234			break;
3235		case MPTS_EVRET_OK:
3236			/* nothing to do */
3237			break;
3238		case MPTS_EVRET_DELETE:
3239			if (mptcp_delete_ok(mpte, mpts)) {
3240				mptcp_subflow_del(mpte, mpts, TRUE);
3241			}
3242			break;
3243		case MPTS_EVRET_CONNECT_PENDING:
3244			connect_pending = TRUE;
3245			break;
3246		case MPTS_EVRET_DISCONNECT_FALLBACK:
3247			disconnect_fallback = TRUE;
3248			break;
3249		}
3250		MPTS_REMREF(mpts);		/* ours */
3251	}
3252
3253	if (conninfo_update) {
3254		soevent(mp_so, SO_FILT_HINT_LOCKED |
3255		    SO_FILT_HINT_CONNINFO_UPDATED);
3256	}
3257
3258	if (!connect_pending && !disconnect_fallback) {
3259		MPTE_UNLOCK(mpte);
3260		return;
3261	}
3262
3263	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3264		MPTS_LOCK(mpts);
3265		if (disconnect_fallback) {
3266			struct socket *so = NULL;
3267			struct inpcb *inp = NULL;
3268			struct tcpcb *tp = NULL;
3269
3270			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3271				MPTS_UNLOCK(mpts);
3272				continue;
3273			}
3274
3275			mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3276
3277			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3278			    MPTSF_DISCONNECTED)) {
3279				MPTS_UNLOCK(mpts);
3280				continue;
3281			}
3282			so = mpts->mpts_socket;
3283
3284			/*
3285			 * The MPTCP connection has degraded to a fallback
3286			 * mode, so there is no point in keeping this subflow
3287			 * regardless of its MPTCP-readiness state, unless it
3288			 * is the primary one which we use for fallback.  This
3289			 * assumes that the subflow used for fallback is the
3290			 * ACTIVE one.
3291			 */
3292
3293			socket_lock(so, 1);
3294			inp = sotoinpcb(so);
3295			tp = intotcpcb(inp);
3296			tp->t_mpflags &=
3297			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3298			tp->t_mpflags |= TMPF_TCP_FALLBACK;
3299			if (mpts->mpts_flags & MPTSF_ACTIVE) {
3300				socket_unlock(so, 1);
3301				MPTS_UNLOCK(mpts);
3302				continue;
3303			}
3304			tp->t_mpflags |= TMPF_RESET;
3305			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3306			socket_unlock(so, 1);
3307
3308		} else if (connect_pending) {
3309			/*
3310			 * The MPTCP connection has progressed to a state
3311			 * where it supports full multipath semantics; allow
3312			 * additional joins to be attempted for all subflows
3313			 * that are in the PENDING state.
3314			 */
3315			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3316				(void) mptcp_subflow_soconnectx(mpte, mpts);
3317			}
3318		}
3319		MPTS_UNLOCK(mpts);
3320	}
3321
3322	MPTE_UNLOCK(mpte);
3323}
3324
3325/*
3326 * MPTCP thread.
3327 */
3328static void
3329mptcp_thread_func(void *v, wait_result_t w)
3330{
3331#pragma unused(w)
3332	struct mptses *mpte = v;
3333	struct timespec *ts = NULL;
3334
3335	VERIFY(mpte != NULL);
3336
3337	lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3338
3339	for (;;) {
3340		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3341
3342		if (mpte->mpte_thread != THREAD_NULL) {
3343			(void) msleep(&mpte->mpte_thread,
3344			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3345			    __func__, ts);
3346		}
3347
3348		/* MPTCP socket is closed? */
3349		if (mpte->mpte_thread == THREAD_NULL) {
3350			lck_mtx_unlock(&mpte->mpte_thread_lock);
3351			/* callee will destroy thread lock */
3352			mptcp_thread_destroy(mpte);
3353			/* NOTREACHED */
3354			return;
3355		}
3356
3357		mpte->mpte_thread_active = 1;
3358		for (;;) {
3359			uint32_t reqs = mpte->mpte_thread_reqs;
3360
3361			lck_mtx_unlock(&mpte->mpte_thread_lock);
3362			mptcp_thread_dowork(mpte);
3363			lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3364
3365			/* if there's no pending request, we're done */
3366			if (reqs == mpte->mpte_thread_reqs ||
3367			    mpte->mpte_thread == THREAD_NULL)
3368				break;
3369		}
3370		mpte->mpte_thread_reqs = 0;
3371		mpte->mpte_thread_active = 0;
3372	}
3373}
3374
3375/*
3376 * Destroy a MTCP thread, to be called in the MPTCP thread context
3377 * upon receiving an indication to self-terminate.  This routine
3378 * will not return, as the current thread is terminated at the end.
3379 */
3380static void
3381mptcp_thread_destroy(struct mptses *mpte)
3382{
3383	struct socket *mp_so;
3384
3385	MPTE_LOCK(mpte);		/* same as MP socket lock */
3386	VERIFY(mpte->mpte_thread == THREAD_NULL);
3387	VERIFY(mpte->mpte_mppcb != NULL);
3388
3389	mptcp_sesdestroy(mpte);
3390
3391	mp_so = mpte->mpte_mppcb->mpp_socket;
3392	VERIFY(mp_so != NULL);
3393	VERIFY(mp_so->so_usecount != 0);
3394	mp_so->so_usecount--;		/* for thread */
3395	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3396	MPTE_UNLOCK(mpte);
3397
3398	/* for the extra refcnt from kernel_thread_start() */
3399	thread_deallocate(current_thread());
3400	/* this is the end */
3401	thread_terminate(current_thread());
3402	/* NOTREACHED */
3403}
3404
3405/*
3406 * Protocol pr_lock callback.
3407 */
3408int
3409mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3410{
3411	struct mppcb *mpp = sotomppcb(mp_so);
3412	void *lr_saved;
3413
3414	if (lr == NULL)
3415		lr_saved = __builtin_return_address(0);
3416	else
3417		lr_saved = lr;
3418
3419	if (mpp == NULL) {
3420		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3421		    mp_so, lr_saved, solockhistory_nr(mp_so));
3422		/* NOTREACHED */
3423	}
3424	lck_mtx_lock(&mpp->mpp_lock);
3425
3426	if (mp_so->so_usecount < 0) {
3427		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3428		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3429		    solockhistory_nr(mp_so));
3430		/* NOTREACHED */
3431	}
3432	if (refcount != 0)
3433		mp_so->so_usecount++;
3434	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3435	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3436
3437	return (0);
3438}
3439
3440/*
3441 * Protocol pr_unlock callback.
3442 */
3443int
3444mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3445{
3446	struct mppcb *mpp = sotomppcb(mp_so);
3447	void *lr_saved;
3448
3449	if (lr == NULL)
3450		lr_saved = __builtin_return_address(0);
3451	else
3452		lr_saved = lr;
3453
3454	if (mpp == NULL) {
3455		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3456		    mp_so, mp_so->so_usecount, lr_saved,
3457		    solockhistory_nr(mp_so));
3458		/* NOTREACHED */
3459	}
3460	lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3461
3462	if (refcount != 0)
3463		mp_so->so_usecount--;
3464
3465	if (mp_so->so_usecount < 0) {
3466		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3467		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3468		/* NOTREACHED */
3469	}
3470	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3471	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3472	lck_mtx_unlock(&mpp->mpp_lock);
3473
3474	return (0);
3475}
3476
3477/*
3478 * Protocol pr_getlock callback.
3479 */
3480lck_mtx_t *
3481mptcp_getlock(struct socket *mp_so, int locktype)
3482{
3483#pragma unused(locktype)
3484	struct mppcb *mpp = sotomppcb(mp_so);
3485
3486	if (mpp == NULL) {
3487		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3488		    solockhistory_nr(mp_so));
3489		/* NOTREACHED */
3490	}
3491	if (mp_so->so_usecount < 0) {
3492		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3493		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3494		/* NOTREACHED */
3495	}
3496	return (&mpp->mpp_lock);
3497}
3498
3499/*
3500 * Key generation functions
3501 */
3502static void
3503mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
3504{
3505	struct mptcp_key_entry *key_elm;
3506try_again:
3507	read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
3508	if (key_entry->mkey_value == 0)
3509		goto try_again;
3510	mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
3511	    sizeof (key_entry->mkey_digest));
3512
3513	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3514		if (key_elm->mkey_value == key_entry->mkey_value) {
3515			goto try_again;
3516		}
3517		if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
3518		    0) {
3519			goto try_again;
3520		}
3521	}
3522}
3523
3524static mptcp_key_t *
3525mptcp_reserve_key(void)
3526{
3527	struct mptcp_key_entry *key_elm;
3528	struct mptcp_key_entry *found_elm = NULL;
3529
3530	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3531	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3532		if (key_elm->mkey_flags == MKEYF_FREE) {
3533			key_elm->mkey_flags = MKEYF_INUSE;
3534			found_elm = key_elm;
3535			break;
3536		}
3537	}
3538	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3539
3540	if (found_elm) {
3541		return (&found_elm->mkey_value);
3542	}
3543
3544	key_elm = (struct mptcp_key_entry *)
3545	    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3546	key_elm->mkey_flags = MKEYF_INUSE;
3547
3548	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3549	mptcp_generate_unique_key(key_elm);
3550	LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
3551	mptcp_keys_pool.mkph_count += 1;
3552	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3553	return (&key_elm->mkey_value);
3554}
3555
3556static caddr_t
3557mptcp_get_stored_digest(mptcp_key_t *key)
3558{
3559	struct mptcp_key_entry *key_holder;
3560	caddr_t digest = NULL;
3561
3562	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3563	key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
3564	    offsetof(struct mptcp_key_entry, mkey_value));
3565	if (key_holder->mkey_flags != MKEYF_INUSE)
3566		panic_plain("%s", __func__);
3567	digest = &key_holder->mkey_digest[0];
3568	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3569	return (digest);
3570}
3571
3572void
3573mptcp_free_key(mptcp_key_t *key)
3574{
3575	struct mptcp_key_entry *key_holder;
3576	struct mptcp_key_entry *key_elm;
3577	int pt = RandomULong();
3578
3579	mptcplog((LOG_INFO, "%s\n", __func__));
3580
3581	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
3582	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
3583	    offsetof(struct mptcp_key_entry, mkey_value));
3584	key_holder->mkey_flags = MKEYF_FREE;
3585
3586	LIST_REMOVE(key_holder, mkey_next);
3587	mptcp_keys_pool.mkph_count -= 1;
3588
3589	/* Free half the time */
3590	if (pt & 0x01) {
3591		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
3592	} else {
3593		/* Insert it at random point to avoid early reuse */
3594		int i = 0;
3595		if (mptcp_keys_pool.mkph_count > 1) {
3596			pt = pt % (mptcp_keys_pool.mkph_count - 1);
3597			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
3598				if (++i >= pt) {
3599					LIST_INSERT_AFTER(key_elm, key_holder,
3600					    mkey_next);
3601					break;
3602				}
3603			}
3604			if (i < pt)
3605				panic("missed insertion");
3606		} else {
3607			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
3608			    mkey_next);
3609		}
3610		mptcp_keys_pool.mkph_count += 1;
3611	}
3612	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
3613}
3614
3615static void
3616mptcp_key_pool_init(void)
3617{
3618	int i;
3619	struct mptcp_key_entry *key_entry;
3620
3621	LIST_INIT(&mptcp_keys_pool);
3622	mptcp_keys_pool.mkph_count = 0;
3623
3624	mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
3625	    (sizeof (struct mptcp_key_entry));
3626	mptcp_keys_pool.mkph_key_entry_zone = zinit(
3627	    mptcp_keys_pool.mkph_key_elm_sz,
3628	    MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
3629	    MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
3630	if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
3631		panic("%s: unable to allocate MPTCP keys zone \n", __func__);
3632		/* NOTREACHED */
3633	}
3634	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
3635	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
3636
3637	for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
3638		key_entry = (struct mptcp_key_entry *)
3639		    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
3640		key_entry->mkey_flags = MKEYF_FREE;
3641		mptcp_generate_unique_key(key_entry);
3642		LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
3643		mptcp_keys_pool.mkph_count += 1;
3644	}
3645	lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
3646	    mtcbinfo.mppi_lock_attr);
3647}
3648
3649/*
3650 * MPTCP Join support
3651 */
3652
3653static void
3654mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
3655    connid_t conn_id)
3656{
3657	struct tcpcb *tp = sototcpcb(so);
3658	struct mptcp_subf_auth_entry *sauth_entry;
3659	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3660
3661	MPT_LOCK_SPIN(mp_tp);
3662	tp->t_mptcb = mp_tp;
3663	MPT_UNLOCK(mp_tp);
3664	/*
3665	 * As long as the mpts_connid is unique it can be used as the
3666	 * address ID for additional subflows.
3667	 * The address ID of the first flow is implicitly 0.
3668	 */
3669	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
3670		tp->t_local_aid = 0;
3671	} else {
3672		tp->t_local_aid = conn_id;
3673		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
3674		so->so_flags |= SOF_MP_SEC_SUBFLOW;
3675	}
3676	sauth_entry = zalloc(mpt_subauth_zone);
3677	sauth_entry->msae_laddr_id = tp->t_local_aid;
3678	sauth_entry->msae_raddr_id = 0;
3679	sauth_entry->msae_raddr_rand = 0;
3680try_again:
3681	sauth_entry->msae_laddr_rand = RandomULong();
3682	if (sauth_entry->msae_laddr_rand == 0)
3683		goto try_again;
3684	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
3685}
3686
3687static void
3688mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
3689{
3690	struct mptcp_subf_auth_entry *sauth_entry;
3691	struct tcpcb *tp = sototcpcb(so);
3692	int found = 0;
3693
3694	if (tp == NULL)
3695		return;
3696
3697	MPT_LOCK(mp_tp);
3698	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3699		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
3700			found = 1;
3701			break;
3702		}
3703	}
3704	if (found) {
3705		LIST_REMOVE(sauth_entry, msae_next);
3706		zfree(mpt_subauth_zone, sauth_entry);
3707	}
3708	tp->t_mptcb = NULL;
3709	MPT_UNLOCK(mp_tp);
3710}
3711
3712void
3713mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
3714    u_int32_t *rrand)
3715{
3716	struct mptcp_subf_auth_entry *sauth_entry;
3717	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3718
3719	MPT_LOCK(mp_tp);
3720	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3721		if (sauth_entry->msae_laddr_id == addr_id) {
3722			if (lrand)
3723				*lrand = sauth_entry->msae_laddr_rand;
3724			if (rrand)
3725				*rrand = sauth_entry->msae_raddr_rand;
3726			break;
3727		}
3728	}
3729	MPT_UNLOCK(mp_tp);
3730}
3731
3732void
3733mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
3734    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
3735{
3736	struct mptcp_subf_auth_entry *sauth_entry;
3737	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3738
3739	MPT_LOCK(mp_tp);
3740	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
3741		if (sauth_entry->msae_laddr_id == laddr_id) {
3742			if ((sauth_entry->msae_raddr_id != 0) &&
3743			    (sauth_entry->msae_raddr_id != raddr_id)) {
3744				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
3745				    " address ids %d %d \n", __func__, raddr_id,
3746				    sauth_entry->msae_raddr_id));
3747				MPT_UNLOCK(mp_tp);
3748				return;
3749			}
3750			sauth_entry->msae_raddr_id = raddr_id;
3751			if ((sauth_entry->msae_raddr_rand != 0) &&
3752			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
3753				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
3754				    __func__, raddr_rand,
3755				    sauth_entry->msae_raddr_rand));
3756				MPT_UNLOCK(mp_tp);
3757				return;
3758			}
3759			sauth_entry->msae_raddr_rand = raddr_rand;
3760			MPT_UNLOCK(mp_tp);
3761			return;
3762		}
3763	}
3764	MPT_UNLOCK(mp_tp);
3765}
3766
3767/*
3768 * SHA1 support for MPTCP
3769 */
3770static int
3771mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
3772{
3773	SHA1_CTX sha1ctxt;
3774	const unsigned char *sha1_base;
3775	int sha1_size;
3776
3777	if (digest_len != SHA1_RESULTLEN) {
3778		return (FALSE);
3779	}
3780
3781	sha1_base = (const unsigned char *) key;
3782	sha1_size = sizeof (mptcp_key_t);
3783	SHA1Init(&sha1ctxt);
3784	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
3785	SHA1Final(sha_digest, &sha1ctxt);
3786	return (TRUE);
3787}
3788
3789void
3790mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
3791	u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
3792{
3793	SHA1_CTX  sha1ctxt;
3794	mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
3795	mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
3796	u_int32_t data[2];
3797	int i;
3798
3799	bzero(digest, digest_len);
3800
3801	/* Set up the Key for HMAC */
3802	key_ipad[0] = key1;
3803	key_ipad[1] = key2;
3804
3805	key_opad[0] = key1;
3806	key_opad[1] = key2;
3807
3808	/* Set up the message for HMAC */
3809	data[0] = rand1;
3810	data[1] = rand2;
3811
3812	/* Key is 512 block length, so no need to compute hash */
3813
3814	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
3815
3816	for (i = 0; i < 8; i++) {
3817		key_ipad[i] ^= 0x3636363636363636;
3818		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
3819	}
3820
3821	/* Perform inner SHA1 */
3822	SHA1Init(&sha1ctxt);
3823	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
3824	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
3825	SHA1Final(digest, &sha1ctxt);
3826
3827	/* Perform outer SHA1 */
3828	SHA1Init(&sha1ctxt);
3829	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
3830	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
3831	SHA1Final(digest, &sha1ctxt);
3832}
3833
3834/*
3835 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
3836 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
3837 */
3838void
3839mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
3840    int digest_len)
3841{
3842	uint32_t lrand, rrand;
3843	mptcp_key_t localkey, remotekey;
3844	MPT_LOCK_ASSERT_NOTHELD(mp_tp);
3845
3846	if (digest_len != SHA1_RESULTLEN)
3847		return;
3848
3849	lrand = rrand = 0;
3850	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
3851	MPT_LOCK_SPIN(mp_tp);
3852	localkey = *mp_tp->mpt_localkey;
3853	remotekey = mp_tp->mpt_remotekey;
3854	MPT_UNLOCK(mp_tp);
3855	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
3856	    digest_len);
3857}
3858
3859u_int64_t
3860mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
3861{
3862	u_char digest[SHA1_RESULTLEN];
3863	u_int64_t trunced_digest;
3864
3865	mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
3866	bcopy(digest, &trunced_digest, 8);
3867	return (trunced_digest);
3868}
3869
3870/*
3871 * Authentication data generation
3872 */
3873int
3874mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
3875    int token_len)
3876{
3877	VERIFY(token_len == sizeof (u_int32_t));
3878	VERIFY(sha_digest_len == SHA1_RESULTLEN);
3879
3880	/* Most significant 32 bits of the SHA1 hash */
3881	bcopy(sha_digest, token, sizeof (u_int32_t));
3882	return (TRUE);
3883}
3884
3885int
3886mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
3887    int idsn_len)
3888{
3889	VERIFY(idsn_len == sizeof (u_int64_t));
3890	VERIFY(sha_digest_len == SHA1_RESULTLEN);
3891
3892	/*
3893	 * Least significant 64 bits of the SHA1 hash
3894	 */
3895
3896	idsn[7] = sha_digest[12];
3897	idsn[6] = sha_digest[13];
3898	idsn[5] = sha_digest[14];
3899	idsn[4] = sha_digest[15];
3900	idsn[3] = sha_digest[16];
3901	idsn[2] = sha_digest[17];
3902	idsn[1] = sha_digest[18];
3903	idsn[0] = sha_digest[19];
3904	return (TRUE);
3905}
3906
3907static int
3908mptcp_init_authparms(struct mptcb *mp_tp)
3909{
3910	caddr_t local_digest = NULL;
3911	char remote_digest[MPTCP_SHA1_RESULTLEN];
3912	MPT_LOCK_ASSERT_HELD(mp_tp);
3913
3914	/* Only Version 0 is supported for auth purposes */
3915	if (mp_tp->mpt_version != MP_DRAFT_VERSION_12)
3916		return (-1);
3917
3918	/* Setup local and remote tokens and Initial DSNs */
3919	local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
3920	mptcp_generate_token(local_digest, SHA1_RESULTLEN,
3921	    (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
3922	mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
3923	    (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
3924
3925	if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
3926	    SHA1_RESULTLEN)) {
3927		mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure",
3928		    __func__));
3929		return (-1);
3930	}
3931	mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
3932	    (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken));
3933	mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
3934	    (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
3935	return (0);
3936}
3937
3938static void
3939mptcp_init_statevars(struct mptcb *mp_tp)
3940{
3941	MPT_LOCK_ASSERT_HELD(mp_tp);
3942
3943	/* The subflow SYN is also first MPTCP byte */
3944	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
3945	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3946
3947	mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
3948}
3949
3950static void
3951mptcp_conn_properties(struct mptcb *mp_tp)
3952{
3953	/* There is only Version 0 at this time */
3954	mp_tp->mpt_version = MP_DRAFT_VERSION_12;
3955
3956	/* Set DSS checksum flag */
3957	if (mptcp_dss_csum)
3958		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
3959
3960	/* Set up receive window */
3961	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
3962
3963	/* Set up gc ticks */
3964	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
3965}
3966
3967/*
3968 * Helper Functions
3969 */
3970mptcp_token_t
3971mptcp_get_localtoken(void* mptcb_arg)
3972{
3973	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3974	return (mp_tp->mpt_localtoken);
3975}
3976
3977mptcp_token_t
3978mptcp_get_remotetoken(void* mptcb_arg)
3979{
3980	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3981	return (mp_tp->mpt_remotetoken);
3982}
3983
3984u_int64_t
3985mptcp_get_localkey(void* mptcb_arg)
3986{
3987	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3988	if (mp_tp->mpt_localkey != NULL)
3989		return (*mp_tp->mpt_localkey);
3990	else
3991		return (0);
3992}
3993
3994u_int64_t
3995mptcp_get_remotekey(void* mptcb_arg)
3996{
3997	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
3998	return (mp_tp->mpt_remotekey);
3999}
4000
4001void
4002mptcp_send_dfin(struct socket *so)
4003{
4004	struct tcpcb *tp = NULL;
4005	struct inpcb *inp = NULL;
4006
4007	inp = sotoinpcb(so);
4008	if (!inp)
4009		return;
4010
4011	tp = intotcpcb(inp);
4012	if (!tp)
4013		return;
4014
4015	if (!(tp->t_mpflags & TMPF_RESET))
4016		tp->t_mpflags |= TMPF_SEND_DFIN;
4017}
4018
4019/*
4020 * Data Sequence Mapping routines
4021 */
4022void
4023mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4024{
4025	struct mptcb *mp_tp;
4026
4027	if (m == NULL)
4028		return;
4029
4030	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
4031	MPT_LOCK(mp_tp);
4032	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4033		MPT_UNLOCK(mp_tp);
4034		panic("%s: data write before establishment.",
4035		    __func__);
4036		return;
4037	}
4038
4039	while (m) {
4040		VERIFY(m->m_flags & M_PKTHDR);
4041		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4042		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4043		m->m_pkthdr.mp_rlen = m_pktlen(m);
4044		mp_tp->mpt_sndmax += m_pktlen(m);
4045		m = m->m_next;
4046	}
4047	MPT_UNLOCK(mp_tp);
4048}
4049
4050void
4051mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
4052{
4053	u_int32_t sub_len = 0;
4054
4055	while (m) {
4056		VERIFY(m->m_flags & M_PKTHDR);
4057
4058		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4059			sub_len = m->m_pkthdr.mp_rlen;
4060
4061			if (sub_len < len) {
4062				m->m_pkthdr.mp_dsn += sub_len;
4063				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4064					m->m_pkthdr.mp_rseq += sub_len;
4065				}
4066				m->m_pkthdr.mp_rlen = 0;
4067				len -= sub_len;
4068			} else {
4069				/* sub_len >= len */
4070				m->m_pkthdr.mp_dsn += len;
4071				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4072					m->m_pkthdr.mp_rseq += len;
4073				}
4074				mptcplog3((LOG_INFO,
4075				    "%s: %llu %u %d %d\n", __func__,
4076				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
4077				    m->m_pkthdr.mp_rlen, len));
4078				m->m_pkthdr.mp_rlen -= len;
4079				return;
4080			}
4081		} else {
4082			panic("%s: MPTCP tag not set", __func__);
4083			/* NOTREACHED */
4084		}
4085		m = m->m_next;
4086	}
4087}
4088
4089/* Obtain the DSN mapping stored in the mbuf */
4090void
4091mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4092    u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4093{
4094	u_int64_t dsn64;
4095
4096	mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4097	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4098	*dsn64p = dsn64;
4099}
4100
4101void
4102mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4103    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4104{
4105	struct mbuf *m = so->so_snd.sb_mb;
4106	struct mbuf *mnext = NULL;
4107	uint32_t runlen = 0;
4108	u_int64_t dsn64;
4109	uint32_t contig_len = 0;
4110
4111	if (m == NULL)
4112		return;
4113
4114	if (off < 0)
4115		return;
4116	/*
4117	 * In the subflow socket, the DSN sequencing can be discontiguous,
4118	 * but the subflow sequence mapping is contiguous. Use the subflow
4119	 * sequence property to find the right mbuf and corresponding dsn
4120	 * mapping.
4121	 */
4122
4123	while (m) {
4124		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4125		VERIFY(m->m_flags & M_PKTHDR);
4126
4127		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4128			off -= m->m_pkthdr.mp_rlen;
4129			m = m->m_next;
4130		} else {
4131			break;
4132		}
4133	}
4134
4135	if (m == NULL) {
4136		panic("%s: bad offset", __func__);
4137		/* NOTREACHED */
4138	}
4139
4140	dsn64 = m->m_pkthdr.mp_dsn + off;
4141	*dsn = dsn64;
4142	*relseq = m->m_pkthdr.mp_rseq + off;
4143
4144	/*
4145	 * Now find the last contiguous byte and its length from
4146	 * start.
4147	 */
4148	runlen = m->m_pkthdr.mp_rlen - off;
4149	contig_len = runlen;
4150
4151	/* If datalen does not span multiple mbufs, return */
4152	if (datalen <= runlen) {
4153		*data_len = min(datalen, UINT16_MAX);
4154		return;
4155	}
4156
4157	mnext = m->m_next;
4158	while (datalen > runlen) {
4159		if (mnext == NULL) {
4160			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4161			    runlen, off);
4162			/* NOTREACHED */
4163		}
4164		VERIFY(mnext->m_flags & M_PKTHDR);
4165		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4166
4167		/*
4168		 * case A. contiguous DSN stream
4169		 * case B. discontiguous DSN stream
4170		 */
4171		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4172			/* case A */
4173			runlen += mnext->m_pkthdr.mp_rlen;
4174			contig_len += mnext->m_pkthdr.mp_rlen;
4175			mptcplog3((LOG_INFO, "%s: contig \n",
4176			    __func__));
4177		} else {
4178			/* case B */
4179			mptcplog((LOG_INFO, "%s: discontig %d %d \n",
4180			    __func__, datalen, contig_len));
4181			break;
4182		}
4183		mnext = mnext->m_next;
4184	}
4185	datalen = min(datalen, UINT16_MAX);
4186	*data_len = min(datalen, contig_len);
4187	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
4188	    *dsn, *relseq, *data_len, off));
4189}
4190
4191/*
4192 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4193 * here. It must be called from mptcp_adj_rmap() which is called only after
4194 * reassembly of out of order data. The rcvnxt variable must
4195 * be updated only when atleast some insequence new data is received.
4196 */
4197static void
4198mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4199{
4200	struct mptcb *mp_tp = tptomptp(tp);
4201
4202	if (mp_tp == NULL)
4203		return;
4204	MPT_LOCK(mp_tp);
4205	if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4206	    (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4207	    m->m_pkthdr.mp_rlen)))) {
4208		mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4209	}
4210	MPT_UNLOCK(mp_tp);
4211}
4212
4213/*
4214 * Note that this is called only from tcp_input() which may trim data
4215 * after the dsn mapping is inserted into the mbuf. When it trims data
4216 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
4217 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
4218 * cannot be delayed after trim, because data can be in the reassembly
4219 * queue for a while and the DSN option info in tp will be overwritten for
4220 * every new packet received.
4221 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4222 * with mptcp_adj_rmap()
4223 */
4224void
4225mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4226{
4227	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4228
4229	if (tp->t_mpflags & TMPF_EMBED_DSN) {
4230		VERIFY(m->m_flags & M_PKTHDR);
4231		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4232		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4233		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4234		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4235		tp->t_mpflags &= ~TMPF_EMBED_DSN;
4236		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4237	}
4238}
4239
4240void
4241mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4242{
4243	u_int64_t dsn;
4244	u_int32_t sseq, datalen;
4245	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4246	u_int32_t old_rcvnxt = 0;
4247
4248	if (m_pktlen(m) == 0)
4249		return;
4250
4251	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4252		VERIFY(m->m_flags & M_PKTHDR);
4253
4254		dsn = m->m_pkthdr.mp_dsn;
4255		sseq = m->m_pkthdr.mp_rseq + tp->irs;
4256		datalen = m->m_pkthdr.mp_rlen;
4257	} else {
4258		/* data arrived without an DSS option mapping */
4259		mptcp_notify_mpfail(so);
4260		return;
4261	}
4262
4263	/* In the common case, data is in window and in sequence */
4264	if (m->m_pkthdr.len == (int)datalen) {
4265		mptcp_adj_rcvnxt(tp, m);
4266		return;
4267	}
4268
4269	if (m->m_pkthdr.len > (int)datalen) {
4270		panic("%s: mbuf len = %d expected = %d", __func__,
4271		    m->m_pkthdr.len, datalen);
4272	}
4273
4274	old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4275	if (SEQ_GT(old_rcvnxt, sseq)) {
4276		/* data trimmed from the left */
4277		int off = old_rcvnxt - sseq;
4278		m->m_pkthdr.mp_dsn += off;
4279		m->m_pkthdr.mp_rseq += off;
4280		m->m_pkthdr.mp_rlen -= off;
4281	} else if (old_rcvnxt == sseq) {
4282		/*
4283		 * Data was trimmed from the right
4284		 */
4285		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4286	} else {
4287		/* XXX handle gracefully with reass or fallback in January */
4288		panic("%s: partial map %u %u", __func__, old_rcvnxt, sseq);
4289		/* NOTREACHED */
4290	}
4291	mptcp_adj_rcvnxt(tp, m);
4292
4293}
4294
4295/*
4296 * Following routines help with failure detection and failover of data
4297 * transfer from one subflow to another.
4298 */
4299void
4300mptcp_act_on_txfail(struct socket *so)
4301{
4302	struct tcpcb *tp = NULL;
4303	struct inpcb *inp = sotoinpcb(so);
4304
4305	if (inp == NULL)
4306		return;
4307
4308	tp = intotcpcb(inp);
4309	if (tp == NULL)
4310		return;
4311
4312	if (tp->t_state != TCPS_ESTABLISHED)
4313		mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
4314		    tp->t_state));
4315
4316	if (so->so_flags & SOF_MP_TRYFAILOVER) {
4317		return;
4318	}
4319
4320	so->so_flags |= SOF_MP_TRYFAILOVER;
4321	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4322}
4323
4324/*
4325 * Support for MP_FAIL option
4326 */
4327int
4328mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4329{
4330	struct mbuf *m = so->so_snd.sb_mb;
4331	u_int64_t dsn;
4332	int off = 0;
4333	u_int32_t datalen;
4334
4335	if (m == NULL)
4336		return (-1);
4337
4338	while (m != NULL) {
4339		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4340		VERIFY(m->m_flags & M_PKTHDR);
4341		dsn = m->m_pkthdr.mp_dsn;
4342		datalen = m->m_pkthdr.mp_rlen;
4343		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4344		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4345			off = dsn_fail - dsn;
4346			*tcp_seq = m->m_pkthdr.mp_rseq + off;
4347			return (0);
4348		}
4349
4350		m = m->m_next;
4351	}
4352
4353	/*
4354	 * If there was no mbuf data and a fallback to TCP occurred, there's
4355	 * not much else to do.
4356	 */
4357
4358	mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
4359	return (-1);
4360}
4361
4362/*
4363 * Support for sending contiguous MPTCP bytes in subflow
4364 */
4365int32_t
4366mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4367{
4368	u_int64_t	mdss_dsn = 0;
4369	u_int32_t	mdss_subflow_seq = 0;
4370	u_int16_t	mdss_data_len = 0;
4371
4372	if (len == 0)
4373		return (len);
4374
4375	mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4376	    &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4377
4378	return (mdss_data_len);
4379}
4380
4381int32_t
4382mptcp_sbspace(struct mptcb *mpt)
4383{
4384	struct sockbuf *sb;
4385	uint32_t rcvbuf;
4386	int32_t space;
4387
4388	MPT_LOCK_ASSERT_HELD(mpt);
4389	MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4390
4391	sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4392	rcvbuf = sb->sb_hiwat;
4393	space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4394	    (sb->sb_mbmax - sb->sb_mbcnt)));
4395	if (space < 0)
4396		space = 0;
4397	/* XXX check if it's too small? */
4398
4399	return (space);
4400}
4401
4402/*
4403 * Support Fallback to Regular TCP
4404 */
4405void
4406mptcp_notify_mpready(struct socket *so)
4407{
4408	struct tcpcb *tp = NULL;
4409
4410	if (so == NULL)
4411		return;
4412
4413	tp = intotcpcb(sotoinpcb(so));
4414
4415	if (tp == NULL)
4416		return;
4417
4418	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4419	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4420	    struct tcpcb *, tp);
4421
4422	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4423		return;
4424
4425	if (tp->t_mpflags & TMPF_MPTCP_READY)
4426		return;
4427
4428	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4429	tp->t_mpflags |= TMPF_MPTCP_READY;
4430
4431	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4432}
4433
4434void
4435mptcp_notify_mpfail(struct socket *so)
4436{
4437	struct tcpcb *tp = NULL;
4438
4439	if (so == NULL)
4440		return;
4441
4442	tp = intotcpcb(sotoinpcb(so));
4443
4444	if (tp == NULL)
4445		return;
4446
4447	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
4448	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4449	    struct tcpcb *, tp);
4450
4451	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
4452		return;
4453
4454	tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4455	tp->t_mpflags |= TMPF_TCP_FALLBACK;
4456
4457	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4458}
4459
4460/*
4461 * Keepalive helper function
4462 */
4463boolean_t
4464mptcp_ok_to_keepalive(struct mptcb *mp_tp)
4465{
4466	boolean_t ret = 1;
4467	VERIFY(mp_tp != NULL);
4468	MPT_LOCK(mp_tp);
4469	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
4470		ret = 0;
4471	}
4472	MPT_UNLOCK(mp_tp);
4473	return (ret);
4474}
4475
4476/*
4477 * MPTCP t_maxseg adjustment function
4478 */
4479int
4480mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
4481{
4482	int mss_lower = 0;
4483	struct mptcb *mp_tp = tptomptp(tp);
4484
4485#define	MPTCP_COMPUTE_LEN {				\
4486	mss_lower = sizeof (struct mptcp_dss_ack_opt);	\
4487	MPT_LOCK(mp_tp);				\
4488	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)		\
4489		mss_lower += 2;				\
4490	else						\
4491		/* adjust to 32-bit boundary + EOL */	\
4492		mss_lower += 2;				\
4493	MPT_UNLOCK(mp_tp);				\
4494}
4495	if (mp_tp == NULL)
4496		return (0);
4497
4498	/*
4499	 * For the first subflow and subsequent subflows, adjust mss for
4500	 * most common MPTCP option size, for case where tcp_mss is called
4501	 * during option processing and MTU discovery.
4502	 */
4503	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4504	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
4505		MPTCP_COMPUTE_LEN;
4506	}
4507
4508	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
4509	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
4510		MPTCP_COMPUTE_LEN;
4511	}
4512
4513	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
4514		MPTCP_COMPUTE_LEN;
4515	}
4516
4517	return (mss_lower);
4518}
4519
4520/*
4521 * Update the pid, upid, uuid of the subflow so, based on parent so
4522 */
4523void
4524mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
4525{
4526	struct socket *subflow_so = mpts->mpts_socket;
4527
4528	MPTS_LOCK_ASSERT_HELD(mpts);
4529
4530	socket_lock(subflow_so, 0);
4531	if ((subflow_so->last_pid != parent_mpso->last_pid) ||
4532		(subflow_so->last_upid != parent_mpso->last_upid)) {
4533		subflow_so->last_upid = parent_mpso->last_upid;
4534		subflow_so->last_pid = parent_mpso->last_pid;
4535		uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
4536	}
4537	so_update_policy(subflow_so);
4538	socket_unlock(subflow_so, 0);
4539}
4540
4541static void
4542fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
4543{
4544	struct inpcb *inp;
4545
4546	tcp_getconninfo(so, &flow->flow_ci);
4547	inp = sotoinpcb(so);
4548#if INET6
4549	if ((inp->inp_vflag & INP_IPV6) != 0) {
4550		flow->flow_src.ss_family = AF_INET6;
4551		flow->flow_dst.ss_family = AF_INET6;
4552		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
4553		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
4554		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
4555		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
4556		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
4557		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
4558	} else
4559#endif
4560	{
4561		flow->flow_src.ss_family = AF_INET;
4562		flow->flow_dst.ss_family = AF_INET;
4563		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
4564		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
4565		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
4566		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
4567		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
4568		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
4569	}
4570	flow->flow_flags = mpts->mpts_flags;
4571	flow->flow_cid = mpts->mpts_connid;
4572}
4573
4574static int
4575mptcp_pcblist SYSCTL_HANDLER_ARGS
4576{
4577#pragma unused(oidp, arg1, arg2)
4578	int error = 0, f;
4579	size_t n, len;
4580	struct mppcb *mpp;
4581	struct mptses *mpte;
4582	struct mptcb *mp_tp;
4583	struct mptsub *mpts;
4584	struct socket *so;
4585	conninfo_mptcp_t mptcpci;
4586	mptcp_flow_t *flows;
4587
4588	if (req->newptr != USER_ADDR_NULL)
4589		return (EPERM);
4590
4591	lck_mtx_lock(&mtcbinfo.mppi_lock);
4592	n = mtcbinfo.mppi_count;
4593	if (req->oldptr == USER_ADDR_NULL) {
4594		lck_mtx_unlock(&mtcbinfo.mppi_lock);
4595		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
4596		    4 * (n + n/8)  * sizeof(mptcp_flow_t);
4597		return (0);
4598	}
4599	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
4600		bzero(&mptcpci, sizeof(mptcpci));
4601		lck_mtx_lock(&mpp->mpp_lock);
4602		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4603		mpte = mptompte(mpp);
4604		VERIFY(mpte != NULL);
4605		mp_tp = mpte->mpte_mptcb;
4606		VERIFY(mp_tp != NULL);
4607		len = sizeof(*flows) * mpte->mpte_numflows;
4608		flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
4609		if (flows == NULL) {
4610			lck_mtx_unlock(&mpp->mpp_lock);
4611			break;
4612		}
4613		/* N.B. we don't take the mpt_lock just for the state. */
4614		mptcpci.mptcpci_state = mp_tp->mpt_state;
4615		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
4616		mptcpci.mptcpci_len = sizeof(mptcpci) +
4617		    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
4618		error = SYSCTL_OUT(req, &mptcpci,
4619		    sizeof(mptcpci) - sizeof(*flows));
4620		if (error) {
4621			lck_mtx_unlock(&mpp->mpp_lock);
4622			FREE(flows, M_TEMP);
4623			break;
4624		}
4625		f = 0;
4626		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4627			MPTS_LOCK(mpts);
4628			so = mpts->mpts_socket;
4629			socket_lock(so, 0);
4630			fill_mptcp_subflow(so, &flows[f], mpts);
4631			socket_unlock(so, 0);
4632			MPTS_UNLOCK(mpts);
4633			f++;
4634		}
4635		lck_mtx_unlock(&mpp->mpp_lock);
4636		error = SYSCTL_OUT(req, flows, len);
4637		FREE(flows, M_TEMP);
4638		if (error)
4639			break;
4640	}
4641	lck_mtx_unlock(&mtcbinfo.mppi_lock);
4642
4643	return (error);
4644}
4645
4646SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
4647    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
4648    "List of active MPTCP connections");
4649