1/*
2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/socket.h>
33#include <sys/socketvar.h>
34#include <sys/protosw.h>
35#include <sys/mcache.h>
36#include <sys/syslog.h>
37#include <sys/proc.h>
38#include <sys/proc_internal.h>
39#include <sys/resourcevar.h>
40
41#include <net/if.h>
42#include <netinet/in.h>
43#include <netinet/in_var.h>
44#include <netinet/tcp.h>
45#include <netinet/tcp_fsm.h>
46#include <netinet/tcp_seq.h>
47#include <netinet/tcp_var.h>
48#include <netinet/tcp_timer.h>
49#include <netinet/mptcp_var.h>
50#include <netinet/mptcp_timer.h>
51
52#include <mach/sdt.h>
53
/*
 * Forward declarations for the file-local (static) routines.  The
 * pr_usrreqs table below refers to several of them before they are
 * defined.
 */
static int mptcp_usr_attach(struct socket *, int, struct proc *);
static int mptcp_usr_detach(struct socket *);
static int mptcp_attach(struct socket *, struct proc *);
static int mptcp_detach(struct socket *, struct mppcb *);
static int mptcp_connectx(struct mptses *, struct sockaddr_list **,
    struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *,
    uint32_t, void *, uint32_t);
static int mptcp_usr_connectx(struct socket *, struct sockaddr_list **,
    struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *,
    uint32_t, void *, uint32_t);
static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t);
static int mptcp_getconnids(struct mptses *, associd_t, uint32_t *,
    user_addr_t);
static int mptcp_getconninfo(struct mptses *, connid_t *, uint32_t *,
    uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
    uint32_t *, user_addr_t, uint32_t *);
static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *,
    struct proc *);
static int mptcp_disconnectx(struct mptses *, associd_t, connid_t);
static int mptcp_usr_disconnectx(struct socket *, associd_t, connid_t);
static struct mptses *mptcp_usrclosed(struct mptses *);
static int mptcp_usr_peeloff(struct socket *, associd_t, struct socket **);
static int mptcp_peeloff(struct mptses *, associd_t, struct socket **);
static int mptcp_usr_rcvd(struct socket *, int);
static int mptcp_usr_send(struct socket *, int, struct mbuf *,
    struct sockaddr *, struct mbuf *, struct proc *);
static int mptcp_usr_shutdown(struct socket *);
static int mptcp_uiotombuf(struct uio *, int, int, uint32_t, struct mbuf **);
static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
    struct mbuf *, struct mbuf *, int);
static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
static int mptcp_setopt_apply(struct mptses *, struct mptopt *);
static int mptcp_setopt(struct mptses *, struct sockopt *);
static int mptcp_getopt(struct mptses *, struct sockopt *);
static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
static void mptcp_connorder_helper(struct mptsub *mpts);
90
/*
 * User-request dispatch table for MPTCP sockets.  Every entry except
 * pru_soreceive points at a static routine declared above; the receive
 * path uses soreceive().
 */
struct pr_usrreqs mptcp_usrreqs = {
	.pru_attach =		mptcp_usr_attach,
	.pru_connectx =		mptcp_usr_connectx,
	.pru_control =		mptcp_usr_control,
	.pru_detach =		mptcp_usr_detach,
	.pru_disconnectx =	mptcp_usr_disconnectx,
	.pru_peeloff =		mptcp_usr_peeloff,
	.pru_rcvd =		mptcp_usr_rcvd,
	.pru_send =		mptcp_usr_send,
	.pru_shutdown =		mptcp_usr_shutdown,
	.pru_sosend =		mptcp_usr_sosend,
	.pru_soreceive =	soreceive,
	.pru_socheckopt =	mptcp_usr_socheckopt,
};
105
106/*
107 * Attaches an MPTCP control block to a socket.
108 */
109static int
110mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p)
111{
112#pragma unused(proto)
113	int error;
114
115	VERIFY(sotomppcb(mp_so) == NULL);
116
117	error = mptcp_attach(mp_so, p);
118	if (error != 0)
119		goto out;
120	/*
121	 * XXX: adi@apple.com
122	 *
123	 * Might want to use a different SO_LINGER timeout than TCP's?
124	 */
125	if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0)
126		mp_so->so_linger = TCP_LINGERTIME * hz;
127out:
128	return (error);
129}
130
131/*
132 * Detaches an MPTCP control block from a socket.
133 */
134static int
135mptcp_usr_detach(struct socket *mp_so)
136{
137	struct mppcb *mpp = sotomppcb(mp_so);
138	int error = 0;
139
140	VERIFY(mpp != NULL);
141	VERIFY(mpp->mpp_socket != NULL);
142
143	error = mptcp_detach(mp_so, mpp);
144	return (error);
145}
146
147/*
148 * Attach MPTCP protocol to socket, allocating MP control block,
149 * MPTCP session, control block, buffer space, etc.
150 */
151static int
152mptcp_attach(struct socket *mp_so, struct proc *p)
153{
154#pragma unused(p)
155	struct mptses *mpte;
156	struct mptcb *mp_tp;
157	struct mppcb *mpp;
158	int error = 0;
159
160	if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) {
161		error = soreserve(mp_so, tcp_sendspace, MPTCP_RWIN_MAX);
162		if (error != 0)
163			goto out;
164	}
165
166	/*
167	 * MPTCP socket buffers cannot be compressed, due to the
168	 * fact that each mbuf chained via m_next is a M_PKTHDR
169	 * which carries some MPTCP metadata.
170	 */
171	mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
172	mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;
173
174	/* Disable socket buffer auto-tuning. */
175	mp_so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
176	mp_so->so_snd.sb_flags &= ~SB_AUTOSIZE;
177
178	if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0)
179		goto out;
180
181	mpp = sotomppcb(mp_so);
182	VERIFY(mpp != NULL);
183
184	mpte = mptcp_sescreate(mp_so, mpp);
185	if (mpte == NULL) {
186		mp_pcbdetach(mpp);
187		error = ENOBUFS;
188		goto out;
189	}
190	mp_tp = mpte->mpte_mptcb;
191	VERIFY(mp_tp != NULL);
192
193	MPT_LOCK(mp_tp);
194	mp_tp->mpt_state = MPTCPS_CLOSED;
195	MPT_UNLOCK(mp_tp);
196
197out:
198	return (error);
199}
200
201/*
202 * Called when the socket layer loses its final reference to the socket;
203 * at this point, there is only one case in which we will keep things
204 * around: time wait.
205 */
206static int
207mptcp_detach(struct socket *mp_so, struct mppcb *mpp)
208{
209	struct mptses *mpte;
210	struct mppcbinfo *mppi;
211
212	VERIFY(mp_so->so_pcb == mpp);
213	VERIFY(mpp->mpp_socket == mp_so);
214
215	mppi = mpp->mpp_pcbinfo;
216	VERIFY(mppi != NULL);
217
218	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
219	VERIFY(mpte->mpte_mppcb == mpp);
220
221	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
222
223	/*
224	 * We are done with this MPTCP socket (it has been closed);
225	 * trigger all subflows to be disconnected, if not already,
226	 * by initiating the PCB detach sequence (SOF_PCBCLEARING
227	 * will be set.)
228	 */
229	mp_pcbdetach(mpp);
230
231	(void) mptcp_disconnectx(mpte, ASSOCID_ALL, CONNID_ALL);
232
233	/*
234	 * XXX: adi@apple.com
235	 *
236	 * Here, we would want to handle time wait state.
237	 */
238
239	return (0);
240}
241
242/*
243 * Common subroutine to open a MPTCP connection to one of the remote hosts
244 * specified by dst_sl.  This includes allocating and establishing a
245 * subflow TCP connection, either initially to establish MPTCP connection,
246 * or to join an existing one.  Returns a connection handle upon success.
247 */
248static int
249mptcp_connectx(struct mptses *mpte, struct sockaddr_list **src_sl,
250    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
251    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
252    uint32_t arglen)
253{
254#pragma unused(p, aid, flags, arg, arglen)
255	struct mptsub *mpts;
256	struct socket *mp_so;
257	int error = 0;
258
259	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
260	mp_so = mpte->mpte_mppcb->mpp_socket;
261
262	VERIFY(dst_sl != NULL && *dst_sl != NULL);
263	VERIFY(pcid != NULL);
264
265	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
266	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
267	DTRACE_MPTCP3(connectx, struct mptses *, mpte, associd_t, aid,
268	    struct socket *, mp_so);
269
270	mpts = mptcp_subflow_alloc(M_WAITOK);
271	if (mpts == NULL) {
272		error = ENOBUFS;
273		goto out;
274	}
275	MPTS_ADDREF(mpts);		/* for this routine */
276
277	if (src_sl != NULL) {
278		mpts->mpts_src_sl = *src_sl;
279		*src_sl = NULL;
280	}
281	mpts->mpts_dst_sl = *dst_sl;
282	*dst_sl = NULL;
283
284	error = mptcp_subflow_add(mpte, mpts, p, ifscope);
285	if (error == 0 && pcid != NULL)
286		*pcid = mpts->mpts_connid;
287
288out:
289	if (mpts != NULL) {
290		if ((error != 0) && (error != EWOULDBLOCK)) {
291			MPTS_LOCK(mpts);
292			if (mpts->mpts_flags & MPTSF_ATTACHED) {
293				MPTS_UNLOCK(mpts);
294				MPTS_REMREF(mpts);
295				mptcp_subflow_del(mpte, mpts, TRUE);
296				return (error);
297			}
298			MPTS_UNLOCK(mpts);
299		}
300		MPTS_REMREF(mpts);
301	}
302
303	return (error);
304}
305
306/*
307 * User-protocol pru_connectx callback.
308 */
309static int
310mptcp_usr_connectx(struct socket *mp_so, struct sockaddr_list **src_sl,
311    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
312    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
313    uint32_t arglen)
314{
315#pragma unused(arg, arglen)
316	struct mppcb *mpp = sotomppcb(mp_so);
317	struct mptses *mpte;
318	int error = 0;
319
320	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
321		error = EINVAL;
322		goto out;
323	}
324	mpte = mptompte(mpp);
325	VERIFY(mpte != NULL);
326
327	error = mptcp_connectx(mpte, src_sl, dst_sl, p, ifscope,
328	    aid, pcid, flags, arg, arglen);
329out:
330	return (error);
331}
332
333/*
334 * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain.
335 */
336static int
337mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
338{
339	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
340
341	/* MPTCP has at most 1 association */
342	*cnt = (mpte->mpte_associd != ASSOCID_ANY) ? 1 : 0;
343
344	/* just asking how many there are? */
345	if (aidp == USER_ADDR_NULL)
346		return (0);
347
348	return (copyout(&mpte->mpte_associd, aidp,
349	    sizeof (mpte->mpte_associd)));
350}
351
352/*
353 * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain.
354 */
355static int
356mptcp_getconnids(struct mptses *mpte, associd_t aid, uint32_t *cnt,
357    user_addr_t cidp)
358{
359	struct mptsub *mpts;
360	int error = 0;
361
362	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
363
364	if (aid != ASSOCID_ANY && aid != ASSOCID_ALL &&
365	    aid != mpte->mpte_associd)
366		return (EINVAL);
367
368	*cnt = mpte->mpte_numflows;
369
370	/* just asking how many there are? */
371	if (cidp == USER_ADDR_NULL)
372		return (0);
373
374	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
375		if ((error = copyout(&mpts->mpts_connid, cidp,
376		    sizeof (mpts->mpts_connid))) != 0)
377			break;
378
379		cidp += sizeof (mpts->mpts_connid);
380	}
381
382	return (error);
383}
384
/*
 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain.
 *
 * Finds the subflow whose connection ID equals *cid; CONNID_ANY picks
 * the first subflow on the list and CONNID_ALL is rejected.  For the
 * subflow found, the routine reports:
 *   *cid      - the subflow's connection ID
 *   *ifindex  - index of its outgoing interface, or 0 if it has none
 *   *soerror  - its recorded socket error
 *   *flags    - CIF_* bits translated from its MPTSF_* flags
 *   *src_len, *dst_len - sa_len of the first source/destination address
 *   *aux_type, *aux_len - CIAUX_TCP and sizeof (struct conninfo_tcp)
 *               when the subflow has a socket, otherwise 0
 * The addresses and the TCP connection info are copied out only when
 * src, dst and aux_data respectively are not USER_ADDR_NULL.
 *
 * Returns 0 on success; EINVAL for CONNID_ALL or an unknown ID; ENXIO
 * when CONNID_ANY was requested and there are no subflows; otherwise
 * the error from copyout().
 */
static int
mptcp_getconninfo(struct mptses *mpte, connid_t *cid, uint32_t *flags,
    uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len,
    user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
    user_addr_t aux_data, uint32_t *aux_len)
{
/* NOTE(review): aux_data is listed as unused but is referenced below. */
#pragma unused(aux_data)
	struct sockaddr_entry *se;
	struct ifnet *ifp = NULL;
	struct mptsub *mpts;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	if (*cid == CONNID_ALL)
		return (EINVAL);

	/* Stop at the matching subflow, or at the first one for CONNID_ANY. */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		if (mpts->mpts_connid == *cid || *cid == CONNID_ANY)
			break;
	}
	if (mpts == NULL)
		return ((*cid == CONNID_ANY) ? ENXIO : EINVAL);

	/* The subflow lock is held until the common exit at "out". */
	MPTS_LOCK(mpts);
	ifp = mpts->mpts_outif;
	*cid = mpts->mpts_connid;
	*ifindex = ((ifp != NULL) ? ifp->if_index : 0);
	*soerror = mpts->mpts_soerror;
	/* Translate the internal MPTSF_* flags into the exported CIF_* ones. */
	*flags = 0;
	if (mpts->mpts_flags & MPTSF_CONNECTING)
		*flags |= CIF_CONNECTING;
	if (mpts->mpts_flags & MPTSF_CONNECTED)
		*flags |= CIF_CONNECTED;
	if (mpts->mpts_flags & MPTSF_DISCONNECTING)
		*flags |= CIF_DISCONNECTING;
	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		*flags |= CIF_DISCONNECTED;
	if (mpts->mpts_flags & MPTSF_BOUND_IF)
		*flags |= CIF_BOUND_IF;
	if (mpts->mpts_flags & MPTSF_BOUND_IP)
		*flags |= CIF_BOUND_IP;
	if (mpts->mpts_flags & MPTSF_BOUND_PORT)
		*flags |= CIF_BOUND_PORT;
	if (mpts->mpts_flags & MPTSF_PREFERRED)
		*flags |= CIF_PREFERRED;
	if (mpts->mpts_flags & MPTSF_MP_CAPABLE)
		*flags |= CIF_MP_CAPABLE;
	if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
		*flags |= CIF_MP_DEGRADED;
	if (mpts->mpts_flags & MPTSF_MP_READY)
		*flags |= CIF_MP_READY;
	if (mpts->mpts_flags & MPTSF_ACTIVE)
		*flags |= CIF_MP_ACTIVE;

	/* Source address: only the first entry of the list is reported. */
	VERIFY(mpts->mpts_src_sl != NULL);
	se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
	VERIFY(se != NULL && se->se_addr != NULL);
	*src_len = se->se_addr->sa_len;
	if (src != USER_ADDR_NULL) {
		error = copyout(se->se_addr, src, se->se_addr->sa_len);
		if (error != 0)
			goto out;
	}

	/* Destination address: likewise, first entry only. */
	VERIFY(mpts->mpts_dst_sl != NULL);
	se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
	VERIFY(se != NULL && se->se_addr != NULL);
	*dst_len = se->se_addr->sa_len;
	if (dst != USER_ADDR_NULL) {
		error = copyout(se->se_addr, dst, se->se_addr->sa_len);
		if (error != 0)
			goto out;
	}

	/* Auxiliary data is available only if the subflow has a socket. */
	*aux_type = 0;
	*aux_len = 0;
	if (mpts->mpts_socket != NULL) {
		struct conninfo_tcp tcp_ci;

		*aux_type = CIAUX_TCP;
		*aux_len = sizeof (tcp_ci);

		if (aux_data != USER_ADDR_NULL) {
			struct socket *so = mpts->mpts_socket;

			VERIFY(SOCK_PROTO(so) == IPPROTO_TCP);
			bzero(&tcp_ci, sizeof (tcp_ci));
			/* Gather the TCP info under the subflow socket lock. */
			socket_lock(so, 0);
			tcp_getconninfo(so, &tcp_ci);
			socket_unlock(so, 0);
			error = copyout(&tcp_ci, aux_data, sizeof (tcp_ci));
			if (error != 0)
				goto out;
		}
	}
out:
	MPTS_UNLOCK(mpts);
	return (error);
}
488
/*
 * Handle SIOCSCONNORDER
 *
 * Assigns a rank to the subflow identified by cid:
 *   rank == 1  the subflow gets rank 1; if it is MP-capable it is also
 *              marked preferred, and MPTSF_PREFERRED is cleared on
 *              every other subflow that had it.
 *   rank > 1   the subflow loses MPTSF_PREFERRED and takes the rank.
 *   rank == 0  the choice is made here: if a different subflow is
 *              currently preferred, this one is ranked one below it;
 *              otherwise this one is promoted as if rank were 1.
 *
 * Returns EINVAL for CONNID_ANY/CONNID_ALL, ENXIO if no subflow has
 * the given ID, and 0 otherwise.
 */
int
mptcp_setconnorder(struct mptses *mpte, connid_t cid, uint32_t rank)
{
	struct mptsub *mpts, *mpts1;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mptcplog((LOG_DEBUG, "%s: cid %d rank %d \n", __func__, cid, rank));

	if (cid == CONNID_ANY || cid == CONNID_ALL) {
		error = EINVAL;
		goto out;
	}

	/* Locate the subflow the request refers to. */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		if (mpts->mpts_connid == cid)
			break;
	}
	if (mpts == NULL) {
		error = ENXIO;
		goto out;
	}

	if (rank == 0 || rank > 1) {
		/*
		 * If rank is 0, determine whether this should be the
		 * primary or backup subflow, depending on what we have.
		 *
		 * Otherwise, if greater than 0, make it a backup flow.
		 */
		/* mpts1 ends up as the preferred subflow, or NULL if none. */
		TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts1);
			if (mpts1->mpts_flags & MPTSF_PREFERRED) {
				MPTS_UNLOCK(mpts1);
				break;
			}
			MPTS_UNLOCK(mpts1);
		}

		MPTS_LOCK(mpts);
		mpts->mpts_flags &= ~MPTSF_PREFERRED;
		mpts->mpts_rank = rank;
		if (mpts1 != NULL && mpts != mpts1) {
			/* preferred subflow found; set rank as necessary */
			if (rank == 0)
				mpts->mpts_rank = (mpts1->mpts_rank + 1);
		} else if (rank == 0) {
			/* no preferred one found; promote this */
			/* falls into the rank == 1 handling below */
			rank = 1;
		}
		MPTS_UNLOCK(mpts);
	}

	if (rank == 1) {
		/*
		 * If rank is 1, promote this subflow to be preferred.
		 */
		TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts1);
			if (mpts1 != mpts &&
			    (mpts1->mpts_flags & MPTSF_PREFERRED)) {
				/* demote the previously preferred subflow */
				mpts1->mpts_flags &= ~MPTSF_PREFERRED;
				if (mpte->mpte_nummpcapflows > 1)
					mptcp_connorder_helper(mpts1);
			} else if (mpts1 == mpts) {
				mpts1->mpts_rank = 1;
				/* only an MP-capable subflow becomes preferred */
				if (mpts1->mpts_flags & MPTSF_MP_CAPABLE) {
					mpts1->mpts_flags |= MPTSF_PREFERRED;
					if (mpte->mpte_nummpcapflows > 1)
						mptcp_connorder_helper(mpts1);
				}
			}
			MPTS_UNLOCK(mpts1);
		}
	}

out:
	return (error);
}
571
572static void
573mptcp_connorder_helper(struct mptsub *mpts)
574{
575	struct socket *so = mpts->mpts_socket;
576	struct tcpcb *tp = NULL;
577
578	socket_lock(so, 0);
579
580	tp = intotcpcb(sotoinpcb(so));
581	tp->t_mpflags |= TMPF_SND_MPPRIO;
582	if (mpts->mpts_flags & MPTSF_PREFERRED)
583		tp->t_mpflags &= ~TMPF_BACKUP_PATH;
584	else
585		tp->t_mpflags |= TMPF_BACKUP_PATH;
586	mptcplog((LOG_DEBUG, "%s cid %d flags %x", __func__,
587	    mpts->mpts_connid, mpts->mpts_flags));
588	socket_unlock(so, 0);
589
590}
591
592/*
593 * Handle SIOCSGONNORDER
594 */
595int
596mptcp_getconnorder(struct mptses *mpte, connid_t cid, uint32_t *rank)
597{
598	struct mptsub *mpts;
599	int error = 0;
600
601	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
602	VERIFY(rank != NULL);
603	*rank = 0;
604
605	if (cid == CONNID_ANY || cid == CONNID_ALL) {
606		error = EINVAL;
607		goto out;
608	}
609
610	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
611		if (mpts->mpts_connid == cid)
612			break;
613	}
614	if (mpts == NULL) {
615		error = ENXIO;
616		goto out;
617	}
618
619	MPTS_LOCK(mpts);
620	*rank = mpts->mpts_rank;
621	MPTS_UNLOCK(mpts);
622out:
623	return (error);
624}
625
626/*
627 * User-protocol pru_control callback.
628 */
629static int
630mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
631    struct ifnet *ifp, struct proc *p)
632{
633#pragma unused(ifp, p)
634	struct mppcb *mpp = sotomppcb(mp_so);
635	struct mptses *mpte;
636	int error = 0;
637
638	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
639		error = EINVAL;
640		goto out;
641	}
642	mpte = mptompte(mpp);
643	VERIFY(mpte != NULL);
644
645	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
646
647	switch (cmd) {
648	case SIOCGASSOCIDS32: {		/* struct so_aidreq32 */
649		struct so_aidreq32 aidr;
650		bcopy(data, &aidr, sizeof (aidr));
651		error = mptcp_getassocids(mpte, &aidr.sar_cnt,
652		    aidr.sar_aidp);
653		if (error == 0)
654			bcopy(&aidr, data, sizeof (aidr));
655		break;
656	}
657
658	case SIOCGASSOCIDS64: {		/* struct so_aidreq64 */
659		struct so_aidreq64 aidr;
660		bcopy(data, &aidr, sizeof (aidr));
661		error = mptcp_getassocids(mpte, &aidr.sar_cnt,
662		    aidr.sar_aidp);
663		if (error == 0)
664			bcopy(&aidr, data, sizeof (aidr));
665		break;
666	}
667
668	case SIOCGCONNIDS32: {		/* struct so_cidreq32 */
669		struct so_cidreq32 cidr;
670		bcopy(data, &cidr, sizeof (cidr));
671		error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
672		    cidr.scr_cidp);
673		if (error == 0)
674			bcopy(&cidr, data, sizeof (cidr));
675		break;
676	}
677
678	case SIOCGCONNIDS64: {		/* struct so_cidreq64 */
679		struct so_cidreq64 cidr;
680		bcopy(data, &cidr, sizeof (cidr));
681		error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
682		    cidr.scr_cidp);
683		if (error == 0)
684			bcopy(&cidr, data, sizeof (cidr));
685		break;
686	}
687
688	case SIOCGCONNINFO32: {		/* struct so_cinforeq32 */
689		struct so_cinforeq32 cifr;
690		bcopy(data, &cifr, sizeof (cifr));
691		error = mptcp_getconninfo(mpte, &cifr.scir_cid,
692		    &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
693		    cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
694		    &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
695		    &cifr.scir_aux_len);
696		if (error == 0)
697			bcopy(&cifr, data, sizeof (cifr));
698		break;
699	}
700
701	case SIOCGCONNINFO64: {		/* struct so_cinforeq64 */
702		struct so_cinforeq64 cifr;
703		bcopy(data, &cifr, sizeof (cifr));
704		error = mptcp_getconninfo(mpte, &cifr.scir_cid,
705		    &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
706		    cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
707		    &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
708		    &cifr.scir_aux_len);
709		if (error == 0)
710			bcopy(&cifr, data, sizeof (cifr));
711		break;
712	}
713
714	case SIOCSCONNORDER: {		/* struct so_cordreq */
715		struct so_cordreq cor;
716		bcopy(data, &cor, sizeof (cor));
717		error = mptcp_setconnorder(mpte, cor.sco_cid, cor.sco_rank);
718		if (error == 0)
719			bcopy(&cor, data, sizeof (cor));
720		break;
721	}
722
723	case SIOCGCONNORDER: {		/* struct so_cordreq */
724		struct so_cordreq cor;
725		bcopy(data, &cor, sizeof (cor));
726		error = mptcp_getconnorder(mpte, cor.sco_cid, &cor.sco_rank);
727		if (error == 0)
728			bcopy(&cor, data, sizeof (cor));
729		break;
730	}
731
732	default:
733		error = EOPNOTSUPP;
734		break;
735	}
736out:
737	return (error);
738}
739
/*
 * Initiate a disconnect.  MPTCP-level disconnection is specified by
 * CONNID_{ANY,ALL}.  Otherwise, selectively disconnect a subflow
 * connection while keeping the MPTCP-level connection (association).
 *
 * aid must be ASSOCID_ANY, ASSOCID_ALL or the session's association ID
 * (asserted, not checked).
 *
 * Returns 0 on success.  For an MPTCP-level disconnect on a socket
 * that is not being detached: ENOTCONN if it is neither connected nor
 * connecting, EALREADY if it is already disconnecting.  For a subflow
 * disconnect: EINVAL if no subflow has the given connection ID.
 */
static int
mptcp_disconnectx(struct mptses *mpte, associd_t aid, connid_t cid)
{
	struct mptsub *mpts;
	struct socket *mp_so;
	struct mptcb *mp_tp;
	int error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx aid %d cid %d\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), aid, cid));
	DTRACE_MPTCP5(disconnectx, struct mptses *, mpte, associd_t, aid,
	    connid_t, cid, struct socket *, mp_so, struct mptcb *, mp_tp);

	VERIFY(aid == ASSOCID_ANY || aid == ASSOCID_ALL ||
	    aid == mpte->mpte_associd);

	/* terminate the association? */
	if (cid == CONNID_ANY || cid == CONNID_ALL) {
		/* if we're not detached, go thru socket state checks */
		if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
			if (!(mp_so->so_state & (SS_ISCONNECTED|
			    SS_ISCONNECTING))) {
				error = ENOTCONN;
				goto out;
			}
			if (mp_so->so_state & SS_ISDISCONNECTING) {
				error = EALREADY;
				goto out;
			}
		}
		MPT_LOCK(mp_tp);
		mptcp_cancel_all_timers(mp_tp);
		if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
			/* never established: close right away */
			(void) mptcp_close(mpte, mp_tp);
			MPT_UNLOCK(mp_tp);
		} else if ((mp_so->so_options & SO_LINGER) &&
		    mp_so->so_linger == 0) {
			/* SO_LINGER with a zero timeout: drop the connection */
			(void) mptcp_drop(mpte, mp_tp, 0);
			MPT_UNLOCK(mp_tp);
		} else {
			/*
			 * Otherwise mark the socket disconnecting, discard
			 * unread receive data and run the close sequence;
			 * output is attempted only if the session survives.
			 */
			MPT_UNLOCK(mp_tp);
			soisdisconnecting(mp_so);
			sbflush(&mp_so->so_rcv);
			if (mptcp_usrclosed(mpte) != NULL)
				(void) mptcp_output(mpte);
		}
	} else {
		/* disconnect only the subflow with a matching ID */
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			if (mpts->mpts_connid != cid)
				continue;
			MPTS_LOCK(mpts);
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
			MPTS_UNLOCK(mpts);
			break;
		}

		/* the loop ran off the end: no such subflow */
		if (mpts == NULL) {
			error = EINVAL;
			goto out;
		}
	}

	/* every failure above jumps to "out", so error is 0 here */
	if (error == 0)
		mptcp_thread_signal(mpte);

	if ((mp_so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
		/* the socket has been shutdown, no more sockopt's */
		mptcp_flush_sopts(mpte);
	}

out:
	return (error);
}
824
825/*
826 * User-protocol pru_disconnectx callback.
827 */
828static int
829mptcp_usr_disconnectx(struct socket *mp_so, associd_t aid, connid_t cid)
830{
831	struct mppcb *mpp = sotomppcb(mp_so);
832	struct mptses *mpte;
833	int error = 0;
834
835	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
836		error = EINVAL;
837		goto out;
838	}
839	mpte = mptompte(mpp);
840	VERIFY(mpte != NULL);
841	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
842
843	if (aid != ASSOCID_ANY && aid != ASSOCID_ALL &&
844	    aid != mpte->mpte_associd) {
845		error = EINVAL;
846		goto out;
847	}
848
849	error = mptcp_disconnectx(mpte, aid, cid);
850out:
851	return (error);
852}
853
854/*
855 * User issued close, and wish to trail thru shutdown states.
856 */
857static struct mptses *
858mptcp_usrclosed(struct mptses *mpte)
859{
860	struct socket *mp_so;
861	struct mptcb *mp_tp;
862	struct mptsub *mpts;
863
864	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
865	mp_so = mpte->mpte_mppcb->mpp_socket;
866	mp_tp = mpte->mpte_mptcb;
867
868	MPT_LOCK(mp_tp);
869	mptcp_close_fsm(mp_tp, MPCE_CLOSE);
870
871	if (mp_tp->mpt_state == TCPS_CLOSED) {
872		mpte = mptcp_close(mpte, mp_tp);
873		MPT_UNLOCK(mp_tp);
874	} else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
875		MPT_UNLOCK(mp_tp);
876		soisdisconnected(mp_so);
877	} else {
878		mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
879		MPT_UNLOCK(mp_tp);
880
881		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
882			MPTS_LOCK(mpts);
883			mptcp_subflow_disconnect(mpte, mpts, FALSE);
884			MPTS_UNLOCK(mpts);
885		}
886	}
887	/*
888	 * XXX: adi@apple.com
889	 *
890	 * Do we need to handle time wait specially here?  We need to handle
891	 * the case where MPTCP has been established, but we have not usable
892	 * subflow to use.  Do we want to wait a while before forcibly
893	 * tearing this MPTCP down, in case we have one or more subflows
894	 * that are flow controlled?
895	 */
896
897	return (mpte);
898}
899
900/*
901 * User-protocol pru_peeloff callback.
902 */
903static int
904mptcp_usr_peeloff(struct socket *mp_so, associd_t aid, struct socket **psop)
905{
906	struct mppcb *mpp = sotomppcb(mp_so);
907	struct mptses *mpte;
908	int error = 0;
909
910	VERIFY(psop != NULL);
911
912	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
913		error = EINVAL;
914		goto out;
915	}
916	mpte = mptompte(mpp);
917	VERIFY(mpte != NULL);
918
919	error = mptcp_peeloff(mpte, aid, psop);
920out:
921	return (error);
922}
923
924/*
925 * Transform a previously connected TCP subflow connection which has
926 * failed to negotiate MPTCP to its own socket which can be externalized
927 * with a file descriptor.  Valid only when the MPTCP socket is not
928 * yet associated (MPTCP-level connection has not been established.)
929 */
930static int
931mptcp_peeloff(struct mptses *mpte, associd_t aid, struct socket **psop)
932{
933	struct socket *so = NULL, *mp_so;
934	struct mptsub *mpts;
935	int error = 0;
936
937	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
938	mp_so = mpte->mpte_mppcb->mpp_socket;
939
940	VERIFY(psop != NULL);
941	*psop = NULL;
942
943	DTRACE_MPTCP3(peeloff, struct mptses *, mpte, associd_t, aid,
944	    struct socket *, mp_so);
945
946	/* peeloff cannot happen after an association is established */
947	if (mpte->mpte_associd != ASSOCID_ANY) {
948		error = EINVAL;
949		goto out;
950	}
951
952	if (aid != ASSOCID_ANY && aid != ASSOCID_ALL) {
953		error = EINVAL;
954		goto out;
955	}
956
957	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
958		MPTS_LOCK(mpts);
959		if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
960			panic("%s: so %p is MPTCP capable but mp_so %p "
961			    "aid is %d\n", __func__, so, mp_so,
962			    mpte->mpte_associd);
963			/* NOTREACHED */
964		}
965		MPTS_ADDREF_LOCKED(mpts);	/* for us */
966		so = mpts->mpts_socket;
967		VERIFY(so != NULL);
968		/*
969		 * This subflow socket is about to be externalized; make it
970		 * appear as if it has the same properties as the MPTCP socket,
971		 * undo what's done earlier in mptcp_subflow_add().
972		 */
973		mptcp_subflow_sopeeloff(mpte, mpts, so);
974		MPTS_UNLOCK(mpts);
975
976		mptcp_subflow_del(mpte, mpts, FALSE);
977		MPTS_REMREF(mpts);		/* ours */
978		/*
979		 * XXX adi@apple.com
980		 *
981		 * Here we need to make sure the subflow socket is not
982		 * flow controlled; need to clear both INP_FLOW_CONTROLLED
983		 * and INP_FLOW_SUSPENDED on the subflow socket, since
984		 * we will no longer be monitoring its events.
985		 */
986		break;
987	}
988
989	if (so == NULL) {
990		error = EINVAL;
991		goto out;
992	}
993	*psop = so;
994
995	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
996	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
997out:
998	return (error);
999}
1000
1001/*
1002 * After a receive, possible send some update to peer.
1003 */
1004static int
1005mptcp_usr_rcvd(struct socket *mp_so, int flags)
1006{
1007#pragma unused(flags)
1008	struct mppcb *mpp = sotomppcb(mp_so);
1009	struct mptses *mpte;
1010	int error = 0;
1011
1012	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1013		error = EINVAL;
1014		goto out;
1015	}
1016	mpte = mptompte(mpp);
1017	VERIFY(mpte != NULL);
1018
1019	error = mptcp_output(mpte);
1020out:
1021	return (error);
1022}
1023
1024/*
1025 * Do a send by putting data in the output queue.
1026 */
1027static int
1028mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m,
1029    struct sockaddr *nam, struct mbuf *control, struct proc *p)
1030{
1031#pragma unused(nam, p)
1032	struct mppcb *mpp = sotomppcb(mp_so);
1033	struct mptses *mpte;
1034	int error = 0;
1035
1036	if (prus_flags & (PRUS_OOB|PRUS_EOF)) {
1037		error = EOPNOTSUPP;
1038		goto out;
1039	}
1040
1041	if (nam != NULL) {
1042		error = EOPNOTSUPP;
1043		goto out;
1044	}
1045
1046	if (control != NULL && control->m_len != 0) {
1047		error = EOPNOTSUPP;
1048		goto out;
1049	}
1050
1051	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1052		error = ECONNRESET;
1053		goto out;
1054	}
1055	mpte = mptompte(mpp);
1056	VERIFY(mpte != NULL);
1057
1058	if (!(mp_so->so_state & SS_ISCONNECTED)) {
1059		error = ENOTCONN;
1060		goto out;
1061	}
1062
1063	mptcp_insert_dsn(mpp, m);
1064	VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS);
1065	(void) sbappendstream(&mp_so->so_snd, m);
1066	m = NULL;
1067
1068	if (mpte != NULL) {
1069		/*
1070		 * XXX: adi@apple.com
1071		 *
1072		 * PRUS_MORETOCOME could be set, but we don't check it now.
1073		 */
1074		error = mptcp_output(mpte);
1075	}
1076
1077out:
1078	if (error) {
1079		if (m != NULL)
1080			m_freem(m);
1081		if (control != NULL)
1082			m_freem(control);
1083	}
1084	return (error);
1085}
1086
1087/*
1088 * Mark the MPTCP connection as being incapable of further output.
1089 */
1090static int
1091mptcp_usr_shutdown(struct socket *mp_so)
1092{
1093	struct mppcb *mpp = sotomppcb(mp_so);
1094	struct mptses *mpte;
1095	int error = 0;
1096
1097	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1098		error = EINVAL;
1099		goto out;
1100	}
1101	mpte = mptompte(mpp);
1102	VERIFY(mpte != NULL);
1103
1104	socantsendmore(mp_so);
1105
1106	mpte = mptcp_usrclosed(mpte);
1107	if (mpte != NULL)
1108		error = mptcp_output(mpte);
1109out:
1110	return (error);
1111}
1112
1113/*
1114 * Copy the contents of uio into a properly sized mbuf chain.
1115 */
1116static int
1117mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align,
1118    struct mbuf **top)
1119{
1120	struct mbuf *m, *mb, *nm = NULL, *mtail = NULL;
1121	user_ssize_t resid, tot, len, progress;	/* must be user_ssize_t */
1122	int error;
1123
1124	VERIFY(top != NULL && *top == NULL);
1125
1126	/*
1127	 * space can be zero or an arbitrary large value bound by
1128	 * the total data supplied by the uio.
1129	 */
1130	resid = uio_resid(uio);
1131	if (space > 0)
1132		tot = imin(resid, space);
1133	else
1134		tot = resid;
1135
1136	/*
1137	 * The smallest unit is a single mbuf with pkthdr.
1138	 * We can't align past it.
1139	 */
1140	if (align >= MHLEN)
1141		return (EINVAL);
1142
1143	/*
1144	 * Give us the full allocation or nothing.
1145	 * If space is zero return the smallest empty mbuf.
1146	 */
1147	if ((len = tot + align) == 0)
1148		len = 1;
1149
1150	/* Loop and append maximum sized mbufs to the chain tail. */
1151	while (len > 0) {
1152		uint32_t m_needed = 1;
1153
1154		if (njcl > 0 && len > MBIGCLBYTES)
1155			mb = m_getpackets_internal(&m_needed, 1,
1156			    how, 1, M16KCLBYTES);
1157		else if (len > MCLBYTES)
1158			mb = m_getpackets_internal(&m_needed, 1,
1159			    how, 1, MBIGCLBYTES);
1160		else if (len >= (signed)MINCLSIZE)
1161			mb = m_getpackets_internal(&m_needed, 1,
1162			    how, 1, MCLBYTES);
1163		else
1164			mb = m_gethdr(how, MT_DATA);
1165
1166		/* Fail the whole operation if one mbuf can't be allocated. */
1167		if (mb == NULL) {
1168			if (nm != NULL)
1169				m_freem(nm);
1170			return (ENOBUFS);
1171		}
1172
1173		/* Book keeping. */
1174		VERIFY(mb->m_flags & M_PKTHDR);
1175		len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN);
1176		if (mtail != NULL)
1177			mtail->m_next = mb;
1178		else
1179			nm = mb;
1180		mtail = mb;
1181	}
1182
1183	m = nm;
1184	m->m_data += align;
1185
1186	progress = 0;
1187	/* Fill all mbufs with uio data and update header information. */
1188	for (mb = m; mb != NULL; mb = mb->m_next) {
1189		len = imin(M_TRAILINGSPACE(mb), tot - progress);
1190
1191		error = uiomove(mtod(mb, char *), len, uio);
1192		if (error != 0) {
1193			m_freem(m);
1194			return (error);
1195		}
1196
1197		/* each mbuf is M_PKTHDR chained via m_next */
1198		mb->m_len = len;
1199		mb->m_pkthdr.len = len;
1200
1201		progress += len;
1202	}
1203	VERIFY(progress == tot);
1204	*top = m;
1205	return (0);
1206}
1207
1208/*
1209 * MPTCP socket protocol-user socket send routine, derived from sosend().
1210 */
1211static int
1212mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
1213    struct mbuf *top, struct mbuf *control, int flags)
1214{
1215#pragma unused(addr)
1216	int32_t space;
1217	user_ssize_t resid;
1218	int error, sendflags;
1219	struct proc *p = current_proc();
1220	int sblocked = 0;
1221
1222	/* UIO is required for now, due to per-mbuf M_PKTHDR constrains */
1223	if (uio == NULL || top != NULL) {
1224		error = EINVAL;
1225		goto out;
1226	}
1227	resid = uio_resid(uio);
1228
1229	socket_lock(mp_so, 1);
1230	so_update_last_owner_locked(mp_so, p);
1231	so_update_policy(mp_so);
1232
1233	VERIFY(mp_so->so_type == SOCK_STREAM);
1234	VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));
1235
1236	if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) ||
1237	    (mp_so->so_flags & SOF_ENABLE_MSGS)) {
1238		error = EOPNOTSUPP;
1239		socket_unlock(mp_so, 1);
1240		goto out;
1241	}
1242
1243	/*
1244	 * In theory resid should be unsigned.  However, space must be
1245	 * signed, as it might be less than 0 if we over-committed, and we
1246	 * must use a signed comparison of space and resid.  On the other
1247	 * hand, a negative resid causes us to loop sending 0-length
1248	 * segments to the protocol.
1249	 */
1250	if (resid < 0 || (flags & MSG_EOR) || control != NULL) {
1251		error = EINVAL;
1252		socket_unlock(mp_so, 1);
1253		goto out;
1254	}
1255
1256	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1257
1258	do {
1259		error = sosendcheck(mp_so, NULL, resid, 0, 0, flags,
1260		    &sblocked, NULL);
1261		if (error != 0)
1262			goto release;
1263
1264		space = sbspace(&mp_so->so_snd);
1265		do {
1266			socket_unlock(mp_so, 0);
1267			/*
1268			 * Copy the data from userland into an mbuf chain.
1269			 */
1270			error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top);
1271			if (error != 0) {
1272				socket_lock(mp_so, 0);
1273				goto release;
1274			}
1275			VERIFY(top != NULL);
1276			space -= resid - uio_resid(uio);
1277			resid = uio_resid(uio);
1278			socket_lock(mp_so, 0);
1279
1280			/*
1281			 * Compute flags here, for pru_send and NKEs.
1282			 */
1283			sendflags = (resid > 0 && space > 0) ?
1284			    PRUS_MORETOCOME : 0;
1285
1286			/*
1287			 * Socket filter processing
1288			 */
1289			VERIFY(control == NULL);
1290			error = sflt_data_out(mp_so, NULL, &top, &control, 0);
1291			if (error != 0) {
1292				if (error == EJUSTRETURN) {
1293					error = 0;
1294					top = NULL;
1295					/* always free control if any */
1296				}
1297				goto release;
1298			}
1299			if (control != NULL) {
1300				m_freem(control);
1301				control = NULL;
1302			}
1303
1304			/*
1305			 * Pass data to protocol.
1306			 */
1307			error = (*mp_so->so_proto->pr_usrreqs->pru_send)
1308			    (mp_so, sendflags, top, NULL, NULL, p);
1309
1310			top = NULL;
1311			if (error != 0)
1312				goto release;
1313		} while (resid != 0 && space > 0);
1314	} while (resid != 0);
1315
1316release:
1317	if (sblocked)
1318		sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */
1319	else
1320		socket_unlock(mp_so, 1);
1321out:
1322	if (top != NULL)
1323		m_freem(top);
1324	if (control != NULL)
1325		m_freem(control);
1326
1327	return (error);
1328}
1329
1330/*
1331 * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options.
1332 * This routine simply indicates to the caller whether or not to proceed
1333 * further with the given socket option.  This is invoked by sosetoptlock()
1334 * and sogetoptlock().
1335 */
1336static int
1337mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
1338{
1339#pragma unused(mp_so)
1340	int error = 0;
1341
1342	VERIFY(sopt->sopt_level == SOL_SOCKET);
1343
1344	/*
1345	 * We could check for sopt_dir (set/get) here, but we'll just
1346	 * let the caller deal with it as appropriate; therefore the
1347	 * following is a superset of the socket options which we
1348	 * allow for set/get.
1349	 *
1350	 * XXX: adi@apple.com
1351	 *
1352	 * Need to consider the following cases:
1353	 *
1354	 *   a. In the event peeloff(2) occurs on the subflow socket,
1355	 *	we may want to issue those options which are now
1356	 *	handled at the MP socket.  In that case, we will need
1357	 *	to record them in mptcp_setopt() so that they can
1358	 *	be replayed during peeloff.
1359	 *
1360	 *   b.	Certain socket options don't have a clear definition
1361	 *	on the expected behavior post connect(2).  At the time
1362	 *	those options are issued on the MP socket, there may
1363	 *	be existing subflow sockets that are already connected.
1364	 */
1365	switch (sopt->sopt_name) {
1366	case SO_LINGER:				/* MP */
1367	case SO_LINGER_SEC:			/* MP */
1368	case SO_TYPE:				/* MP */
1369	case SO_NREAD:				/* MP */
1370	case SO_NWRITE:				/* MP */
1371	case SO_ERROR:				/* MP */
1372	case SO_SNDBUF:				/* MP */
1373	case SO_RCVBUF:				/* MP */
1374	case SO_SNDLOWAT:			/* MP */
1375	case SO_RCVLOWAT:			/* MP */
1376	case SO_SNDTIMEO:			/* MP */
1377	case SO_RCVTIMEO:			/* MP */
1378	case SO_NKE:				/* MP */
1379	case SO_NOSIGPIPE:			/* MP */
1380	case SO_NOADDRERR:			/* MP */
1381	case SO_LABEL:				/* MP */
1382	case SO_PEERLABEL:			/* MP */
1383	case SO_DEFUNCTOK:			/* MP */
1384	case SO_ISDEFUNCT:			/* MP */
1385	case SO_TRAFFIC_CLASS_DBG:		/* MP */
1386		/*
1387		 * Tell the caller that these options are to be processed.
1388		 */
1389		break;
1390
1391	case SO_DEBUG:				/* MP + subflow */
1392	case SO_KEEPALIVE:			/* MP + subflow */
1393	case SO_USELOOPBACK:			/* MP + subflow */
1394	case SO_RANDOMPORT:			/* MP + subflow */
1395	case SO_TRAFFIC_CLASS:			/* MP + subflow */
1396	case SO_RECV_TRAFFIC_CLASS:		/* MP + subflow */
1397	case SO_PRIVILEGED_TRAFFIC_CLASS:	/* MP + subflow */
1398	case SO_RECV_ANYIF:			/* MP + subflow */
1399	case SO_RESTRICTIONS:			/* MP + subflow */
1400	case SO_FLUSH:				/* MP + subflow */
1401		/*
1402		 * Tell the caller that these options are to be processed;
1403		 * these will also be recorded later by mptcp_setopt().
1404		 *
1405		 * NOTE: Only support integer option value for now.
1406		 */
1407		if (sopt->sopt_valsize != sizeof (int))
1408			error = EINVAL;
1409		break;
1410
1411	default:
1412		/*
1413		 * Tell the caller to stop immediately and return an error.
1414		 */
1415		error = ENOPROTOOPT;
1416		break;
1417	}
1418
1419	return (error);
1420}
1421
1422/*
1423 * Issue SOPT_SET for all MPTCP subflows (for integer option values.)
1424 */
1425static int
1426mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
1427{
1428	struct socket *mp_so;
1429	struct mptsub *mpts;
1430	struct mptopt smpo;
1431	int error = 0;
1432
1433	/* just bail now if this isn't applicable to subflow sockets */
1434	if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1435		error = ENOPROTOOPT;
1436		goto out;
1437	}
1438
1439	/*
1440	 * Skip those that are handled internally; these options
1441	 * should not have been recorded and marked with the
1442	 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1443	 */
1444	if (mpo->mpo_level == SOL_SOCKET &&
1445	    (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) {
1446		error = ENOPROTOOPT;
1447		goto out;
1448	}
1449
1450	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1451	mp_so = mpte->mpte_mppcb->mpp_socket;
1452
1453	/*
1454	 * Don't bother going further if there's no subflow; mark the option
1455	 * with MPOF_INTERIM so that we know whether or not to remove this
1456	 * option upon encountering an error while issuing it during subflow
1457	 * socket creation.
1458	 */
1459	if (mpte->mpte_numflows == 0) {
1460		VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows));
1461		mpo->mpo_flags |= MPOF_INTERIM;
1462		/* return success */
1463		goto out;
1464	}
1465
1466	bzero(&smpo, sizeof (smpo));
1467	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1468	smpo.mpo_level = mpo->mpo_level;
1469	smpo.mpo_name = mpo->mpo_name;
1470
1471	/* grab exisiting values in case we need to rollback */
1472	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1473		struct socket *so;
1474
1475		MPTS_LOCK(mpts);
1476		mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1477		mpts->mpts_oldintval = 0;
1478		smpo.mpo_intval = 0;
1479		VERIFY(mpts->mpts_socket != NULL);
1480		so = mpts->mpts_socket;
1481		socket_lock(so, 0);
1482		if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) {
1483			mpts->mpts_flags |= MPTSF_SOPT_OLDVAL;
1484			mpts->mpts_oldintval = smpo.mpo_intval;
1485		}
1486		socket_unlock(so, 0);
1487		MPTS_UNLOCK(mpts);
1488	}
1489
1490	/* apply socket option */
1491	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1492		struct socket *so;
1493
1494		MPTS_LOCK(mpts);
1495		mpts->mpts_flags |= MPTSF_SOPT_INPROG;
1496		VERIFY(mpts->mpts_socket != NULL);
1497		so = mpts->mpts_socket;
1498		socket_lock(so, 0);
1499		error = mptcp_subflow_sosetopt(mpte, so, mpo);
1500		socket_unlock(so, 0);
1501		MPTS_UNLOCK(mpts);
1502		if (error != 0)
1503			break;
1504	}
1505
1506	/* cleanup, and rollback if needed */
1507	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1508		struct socket *so;
1509
1510		MPTS_LOCK(mpts);
1511		if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) {
1512			/* clear in case it's set */
1513			mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL;
1514			mpts->mpts_oldintval = 0;
1515			MPTS_UNLOCK(mpts);
1516			continue;
1517		}
1518		if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) {
1519			mpts->mpts_flags &= ~MPTSF_SOPT_INPROG;
1520			VERIFY(mpts->mpts_oldintval == 0);
1521			MPTS_UNLOCK(mpts);
1522			continue;
1523		}
1524		/* error during sosetopt, so roll it back */
1525		if (error != 0) {
1526			VERIFY(mpts->mpts_socket != NULL);
1527			so = mpts->mpts_socket;
1528			socket_lock(so, 0);
1529			smpo.mpo_intval = mpts->mpts_oldintval;
1530			(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1531			socket_unlock(so, 0);
1532		}
1533		mpts->mpts_oldintval = 0;
1534		mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1535		MPTS_UNLOCK(mpts);
1536	}
1537
1538out:
1539	return (error);
1540}
1541
1542/*
1543 * Handle SOPT_SET for socket options issued on MP socket.
1544 */
1545static int
1546mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
1547{
1548	int error = 0, optval, level, optname, rec = 1;
1549	struct mptopt smpo, *mpo = NULL;
1550	struct socket *mp_so;
1551	char buf[32];
1552
1553	level = sopt->sopt_level;
1554	optname = sopt->sopt_name;
1555
1556	VERIFY(sopt->sopt_dir == SOPT_SET);
1557	VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP);
1558	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1559	mp_so = mpte->mpte_mppcb->mpp_socket;
1560
1561	/*
1562	 * Record socket options which are applicable to subflow sockets so
1563	 * that we can replay them for new ones; see mptcp_usr_socheckopt()
1564	 * for the list of eligible socket-level options.
1565	 */
1566	if (level == SOL_SOCKET) {
1567		switch (optname) {
1568		case SO_DEBUG:
1569		case SO_KEEPALIVE:
1570		case SO_USELOOPBACK:
1571		case SO_RANDOMPORT:
1572		case SO_TRAFFIC_CLASS:
1573		case SO_RECV_TRAFFIC_CLASS:
1574		case SO_PRIVILEGED_TRAFFIC_CLASS:
1575		case SO_RECV_ANYIF:
1576		case SO_RESTRICTIONS:
1577			/* record it */
1578			break;
1579		case SO_FLUSH:
1580			/* don't record it */
1581			rec = 0;
1582			break;
1583		default:
1584			/* nothing to do; just return success */
1585			goto out;
1586		}
1587	} else {
1588		switch (optname) {
1589		case TCP_NODELAY:
1590		case TCP_RXT_FINDROP:
1591		case TCP_KEEPALIVE:
1592		case TCP_KEEPINTVL:
1593		case TCP_KEEPCNT:
1594		case TCP_CONNECTIONTIMEOUT:
1595		case TCP_RXT_CONNDROPTIME:
1596		case PERSIST_TIMEOUT:
1597			/* eligible; record it */
1598			break;
1599		default:
1600			/* not eligible */
1601			error = ENOPROTOOPT;
1602			goto out;
1603		}
1604	}
1605
1606	if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1607	    sizeof (optval))) != 0)
1608		goto out;
1609
1610	if (rec) {
1611		/* search for an existing one; if not found, allocate */
1612		if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL)
1613			mpo = mptcp_sopt_alloc(M_WAITOK);
1614
1615		if (mpo == NULL) {
1616			error = ENOBUFS;
1617		} else {
1618			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
1619			    "val %d %s\n", __func__,
1620			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1621			    mptcp_sopt2str(level, optname, buf,
1622			    sizeof (buf)), optval,
1623			    (mpo->mpo_flags & MPOF_ATTACHED) ?
1624			    "updated" : "recorded"));
1625
1626			/* initialize or update, as needed */
1627			mpo->mpo_intval = optval;
1628			if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
1629				mpo->mpo_level = level;
1630				mpo->mpo_name = optname;
1631				mptcp_sopt_insert(mpte, mpo);
1632			}
1633			VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1634			/* this can be issued on the subflow socket */
1635			mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1636		}
1637	} else {
1638		bzero(&smpo, sizeof (smpo));
1639		mpo = &smpo;
1640		mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1641		mpo->mpo_level = level;
1642		mpo->mpo_name = optname;
1643		mpo->mpo_intval = optval;
1644	}
1645	VERIFY(mpo == NULL || error == 0);
1646
1647	/* issue this socket option on existing subflows */
1648	if (error == 0) {
1649		error = mptcp_setopt_apply(mpte, mpo);
1650		if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
1651			VERIFY(mpo != &smpo);
1652			mptcp_sopt_remove(mpte, mpo);
1653			mptcp_sopt_free(mpo);
1654		}
1655		if (mpo == &smpo)
1656			mpo->mpo_flags &= ~MPOF_INTERIM;
1657	}
1658out:
1659	if (error == 0 && mpo != NULL) {
1660		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d set %s\n",
1661		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1662		    mptcp_sopt2str(level, optname, buf,
1663		    sizeof (buf)), optval, (mpo->mpo_flags & MPOF_INTERIM) ?
1664		    "pending" : "successful"));
1665	} else if (error != 0) {
1666		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s can't be issued "
1667		    "error %d\n", __func__,
1668		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mptcp_sopt2str(level,
1669		    optname, buf, sizeof (buf)), error));
1670	}
1671	return (error);
1672}
1673
1674/*
1675 * Handle SOPT_GET for socket options issued on MP socket.
1676 */
1677static int
1678mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
1679{
1680	int error = 0, optval;
1681
1682	VERIFY(sopt->sopt_dir == SOPT_GET);
1683	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1684
1685	/*
1686	 * We only handle SOPT_GET for TCP level socket options; we should
1687	 * not get here for socket level options since they are already
1688	 * handled at the socket layer.
1689	 */
1690	if (sopt->sopt_level != IPPROTO_TCP) {
1691		error = ENOPROTOOPT;
1692		goto out;
1693	}
1694
1695	switch (sopt->sopt_name) {
1696	case TCP_NODELAY:
1697	case TCP_RXT_FINDROP:
1698	case TCP_KEEPALIVE:
1699	case TCP_KEEPINTVL:
1700	case TCP_KEEPCNT:
1701	case TCP_CONNECTIONTIMEOUT:
1702	case TCP_RXT_CONNDROPTIME:
1703	case PERSIST_TIMEOUT:
1704		/* eligible; get the default value just in case */
1705		error = mptcp_default_tcp_optval(mpte, sopt, &optval);
1706		break;
1707	default:
1708		/* not eligible */
1709		error = ENOPROTOOPT;
1710		break;
1711	}
1712
1713	/*
1714	 * Search for a previously-issued TCP level socket option and
1715	 * return the recorded option value.  This assumes that the
1716	 * value did not get modified by the lower layer after it was
1717	 * issued at setsockopt(2) time.  If not found, we'll return
1718	 * the default value obtained ealier.
1719	 */
1720	if (error == 0) {
1721		struct mptopt *mpo;
1722
1723		if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL)
1724			optval = mpo->mpo_intval;
1725
1726		error = sooptcopyout(sopt, &optval, sizeof (int));
1727	}
1728out:
1729	return (error);
1730}
1731
1732/*
1733 * Return default values for TCP socket options.  Ideally we would query the
1734 * subflow TCP socket, but that requires creating a subflow socket before
1735 * connectx(2) time.  To simplify things, just return the default values
1736 * that we know of.
1737 */
1738static int
1739mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
1740{
1741	int error = 0;
1742
1743	VERIFY(sopt->sopt_level == IPPROTO_TCP);
1744	VERIFY(sopt->sopt_dir == SOPT_GET);
1745	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1746
1747	/* try to do what tcp_newtcpcb() does */
1748	switch (sopt->sopt_name) {
1749	case TCP_NODELAY:
1750	case TCP_RXT_FINDROP:
1751	case TCP_KEEPINTVL:
1752	case TCP_KEEPCNT:
1753	case TCP_CONNECTIONTIMEOUT:
1754	case TCP_RXT_CONNDROPTIME:
1755		*optval = 0;
1756		break;
1757
1758	case TCP_KEEPALIVE:
1759		*optval = mptcp_subflow_keeptime;
1760		break;
1761
1762	case PERSIST_TIMEOUT:
1763		*optval = tcp_max_persist_timeout;
1764		break;
1765
1766	default:
1767		error = ENOPROTOOPT;
1768		break;
1769	}
1770	return (error);
1771}
1772
1773/*
1774 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP
1775 * socket, at SOL_SOCKET and IPPROTO_TCP levels.  The former is restricted
1776 * to those that are allowed by mptcp_usr_socheckopt().
1777 */
1778int
1779mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
1780{
1781	struct mppcb *mpp = sotomppcb(mp_so);
1782	struct mptses *mpte;
1783	int error = 0;
1784
1785	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1786		error = EINVAL;
1787		goto out;
1788	}
1789	mpte = mptompte(mpp);
1790	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
1791
1792	/* we only handle socket and TCP-level socket options for MPTCP */
1793	if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) {
1794		char buf[32];
1795		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s level not "
1796		    "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1797		    mptcp_sopt2str(sopt->sopt_level,
1798		    sopt->sopt_name, buf, sizeof (buf))));
1799		error = EINVAL;
1800		goto out;
1801	}
1802
1803	switch (sopt->sopt_dir) {
1804	case SOPT_SET:
1805		error = mptcp_setopt(mpte, sopt);
1806		break;
1807
1808	case SOPT_GET:
1809		error = mptcp_getopt(mpte, sopt);
1810		break;
1811	}
1812out:
1813	return (error);
1814}
1815
1816/*
1817 * Return a string representation of <sopt_level,sopt_name>
1818 */
1819const char *
1820mptcp_sopt2str(int level, int optname, char *dst, int size)
1821{
1822	char lbuf[32], obuf[32];
1823	const char *l = lbuf, *o = obuf;
1824
1825	(void) snprintf(lbuf, sizeof (lbuf), "0x%x", level);
1826	(void) snprintf(obuf, sizeof (obuf), "0x%x", optname);
1827
1828	switch (level) {
1829	case SOL_SOCKET:
1830		l = "SOL_SOCKET";
1831		switch (optname) {
1832		case SO_LINGER:
1833			o = "SO_LINGER";
1834			break;
1835		case SO_LINGER_SEC:
1836			o = "SO_LINGER_SEC";
1837			break;
1838		case SO_DEBUG:
1839			o = "SO_DEBUG";
1840			break;
1841		case SO_KEEPALIVE:
1842			o = "SO_KEEPALIVE";
1843			break;
1844		case SO_USELOOPBACK:
1845			o = "SO_USELOOPBACK";
1846			break;
1847		case SO_TYPE:
1848			o = "SO_TYPE";
1849			break;
1850		case SO_NREAD:
1851			o = "SO_NREAD";
1852			break;
1853		case SO_NWRITE:
1854			o = "SO_NWRITE";
1855			break;
1856		case SO_ERROR:
1857			o = "SO_ERROR";
1858			break;
1859		case SO_SNDBUF:
1860			o = "SO_SNDBUF";
1861			break;
1862		case SO_RCVBUF:
1863			o = "SO_RCVBUF";
1864			break;
1865		case SO_SNDLOWAT:
1866			o = "SO_SNDLOWAT";
1867			break;
1868		case SO_RCVLOWAT:
1869			o = "SO_RCVLOWAT";
1870			break;
1871		case SO_SNDTIMEO:
1872			o = "SO_SNDTIMEO";
1873			break;
1874		case SO_RCVTIMEO:
1875			o = "SO_RCVTIMEO";
1876			break;
1877		case SO_NKE:
1878			o = "SO_NKE";
1879			break;
1880		case SO_NOSIGPIPE:
1881			o = "SO_NOSIGPIPE";
1882			break;
1883		case SO_NOADDRERR:
1884			o = "SO_NOADDRERR";
1885			break;
1886		case SO_RESTRICTIONS:
1887			o = "SO_RESTRICTIONS";
1888			break;
1889		case SO_LABEL:
1890			o = "SO_LABEL";
1891			break;
1892		case SO_PEERLABEL:
1893			o = "SO_PEERLABEL";
1894			break;
1895		case SO_RANDOMPORT:
1896			o = "SO_RANDOMPORT";
1897			break;
1898		case SO_TRAFFIC_CLASS:
1899			o = "SO_TRAFFIC_CLASS";
1900			break;
1901		case SO_RECV_TRAFFIC_CLASS:
1902			o = "SO_RECV_TRAFFIC_CLASS";
1903			break;
1904		case SO_TRAFFIC_CLASS_DBG:
1905			o = "SO_TRAFFIC_CLASS_DBG";
1906			break;
1907		case SO_PRIVILEGED_TRAFFIC_CLASS:
1908			o = "SO_PRIVILEGED_TRAFFIC_CLASS";
1909			break;
1910		case SO_DEFUNCTOK:
1911			o = "SO_DEFUNCTOK";
1912			break;
1913		case SO_ISDEFUNCT:
1914			o = "SO_ISDEFUNCT";
1915			break;
1916		case SO_OPPORTUNISTIC:
1917			o = "SO_OPPORTUNISTIC";
1918			break;
1919		case SO_FLUSH:
1920			o = "SO_FLUSH";
1921			break;
1922		case SO_RECV_ANYIF:
1923			o = "SO_RECV_ANYIF";
1924			break;
1925		}
1926		break;
1927	case IPPROTO_TCP:
1928		l = "IPPROTO_TCP";
1929		switch (optname) {
1930		case TCP_KEEPALIVE:
1931			o = "TCP_KEEPALIVE";
1932			break;
1933		case TCP_KEEPINTVL:
1934			o = "TCP_KEEPINTVL";
1935			break;
1936		case TCP_KEEPCNT:
1937			o = "TCP_KEEPCNT";
1938			break;
1939		case TCP_CONNECTIONTIMEOUT:
1940			o = "TCP_CONNECTIONTIMEOUT";
1941			break;
1942		case TCP_RXT_CONNDROPTIME:
1943			o = "TCP_RXT_CONNDROPTIME";
1944			break;
1945		case PERSIST_TIMEOUT:
1946			o = "PERSIST_TIMEOUT";
1947			break;
1948		}
1949		break;
1950	}
1951
1952	(void) snprintf(dst, size, "<%s,%s>", l, o);
1953	return (dst);
1954}
1955