1/* 2 * Copyright (c) 2012-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/param.h> 30#include <sys/proc.h> 31#include <sys/systm.h> 32#include <sys/kernel.h> 33#include <sys/mbuf.h> 34#include <sys/mcache.h> 35#include <sys/resourcevar.h> 36#include <sys/socket.h> 37#include <sys/socketvar.h> 38#include <sys/syslog.h> 39#include <sys/domain.h> 40#include <sys/protosw.h> 41#include <sys/sysctl.h> 42 43#include <kern/zalloc.h> 44#include <kern/locks.h> 45 46#include <mach/thread_act.h> 47#include <mach/sdt.h> 48 49#include <net/if.h> 50#include <netinet/in.h> 51#include <netinet/in_pcb.h> 52#include <netinet/in_var.h> 53#include <netinet/tcp.h> 54#include <netinet/tcp_fsm.h> 55#include <netinet/tcp_seq.h> 56#include <netinet/tcp_var.h> 57#include <netinet/mptcp_var.h> 58#include <netinet/mptcp.h> 59#include <netinet/mptcp_seq.h> 60#include <netinet/mptcp_timer.h> 61#include <libkern/crypto/sha1.h> 62#if INET6 63#include <netinet6/in6_pcb.h> 64#include <netinet6/ip6protosw.h> 65#endif /* INET6 */ 66#include <dev/random/randomdev.h> 67 68/* 69 * Notes on MPTCP implementation. 70 * 71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH 72 * communication domain. The structure mtcbinfo describes the MPTCP instance 73 * of a Multipath protocol in that domain. It is used to keep track of all 74 * MPTCP PCB instances in the system, and is protected by the global lock 75 * mppi_lock. 76 * 77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM, 78 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with 79 * it comes an MPTCP Session and an MPTCP PCB. All three structures are 80 * allocated from the same memory block, and each structure has a pointer 81 * to the adjacent ones. The layout is defined by the mpp_mtp structure. 82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath 83 * PCB (mppcb) as well as the MPTCP Session (mptses). 
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
 * in particular, the list of subflows as well as the MPTCP thread.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction. The
 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
 *
 * To handle events (read, write, control) from the subflows, an MPTCP thread
 * is created; currently, there is one thread per MPTCP Session. In order to
 * prevent the MPTCP socket from being destroyed while being accessed by the
 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
 * which will be decremented prior to the thread's termination. The thread
 * lock (mpte_thread_lock) is used to synchronize its signalling.
 *
 * Lock ordering is defined as follows:
 *
 *	mtcbinfo (mppi_lock)
 *		mp_so (mpp_lock)
 *			mpts (mpts_lock)
 *				so (inpcb_mtx)
 *					mptcb (mpt_lock)
 *
 * It is not a requirement that all of the above locks need to be acquired
 * in succession, but the correct lock ordering must be followed when there
 * are more than one locks that need to be held. The MPTCP thread lock is
 * not constrained by this arrangement, because none of the other locks
 * is ever acquired while holding mpte_thread_lock; therefore it may be called
 * at any moment to signal the thread.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector.
This process will take place once all 120 * of the subflows have been destroyed, and the MPTCP thread be instructed to 121 * self-terminate. 122 */ 123 124static void mptcp_sesdestroy(struct mptses *); 125static void mptcp_thread_signal_locked(struct mptses *); 126static void mptcp_thread_terminate_signal(struct mptses *); 127static void mptcp_thread_dowork(struct mptses *); 128static void mptcp_thread_func(void *, wait_result_t); 129static void mptcp_thread_destroy(struct mptses *); 130static void mptcp_key_pool_init(void); 131static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t); 132static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *); 133static void mptcp_conn_properties(struct mptcb *); 134static void mptcp_init_statevars(struct mptcb *); 135 136static uint32_t mptcp_gc(struct mppcbinfo *); 137static int mptcp_subflow_socreate(struct mptses *, struct mptsub *, 138 int, struct proc *, struct socket **); 139static int mptcp_subflow_soclose(struct mptsub *, struct socket *); 140static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *); 141static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **, 142 struct uio *, struct mbuf **, struct mbuf **, int *); 143static void mptcp_subflow_rupcall(struct socket *, void *, int); 144static void mptcp_subflow_input(struct mptses *, struct mptsub *); 145static void mptcp_subflow_wupcall(struct socket *, void *, int); 146static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t); 147static void mptcp_update_last_owner(struct mptsub *, struct socket *); 148static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts); 149 150/* 151 * Possible return values for subflow event handlers. Note that success 152 * values must be greater or equal than MPTS_EVRET_OK. 
Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,		/* delete this subflow */
	MPTS_EVRET_OK = 2,		/* OK */
	MPTS_EVRET_CONNECT_PENDING = 3,	/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
	MPTS_EVRET_OK_UPDATE = 5,	/* OK with conninfo update */
} ev_ret_t;

/*
 * Per-event subflow handlers; each takes the MPTCP session and the subflow
 * the event fired on, and returns an ev_ret_t telling the event dispatcher
 * (mptcp_subflow_events) what to do next.
 */
static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *);

static const char *mptcp_evret2str(ev_ret_t);

/* MPTCP key/token authentication helpers */
static mptcp_key_t *mptcp_reserve_key(void);
static int mptcp_do_sha1(mptcp_key_t *, char *, int);
static int mptcp_init_authparms(struct mptcb *);

static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */

/* Global MPTCP PCB info; tracks all MPTCP PCBs, protected by mppi_lock. */
struct mppcbinfo mtcbinfo;

/* Pool of pre-generated unique local keys; see mptcp_key_pool_init(). */
static struct mptcp_keys_pool_head mptcp_keys_pool;

#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_verbose = 0;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_verbose, 0, "MPTCP verbosity level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
	&mtcbinfo.mppi_count, 0, "Number of active PCBs");

/*
 * Since there is one kernel thread per mptcp socket, imposing an artificial
 * limit on number of allowed mptcp sockets.
 */
uint32_t mptcp_socket_limit = MPPCB_LIMIT;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_socket_limit, 0, "MPTCP socket limit");

/*
 * SYSCTL to turn on delayed cellular subflow start.
 */
uint32_t mptcp_delayed_subf_start = 0;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");

/*
 * SYSCTL for RTT spike measurement threshold in msecs.
 */
/*
 * NOTE(review): the variable is signed (int32_t) but exported through
 * SYSCTL_UINT; a SYSCTL_INT would match the declared type — confirm intent.
 */
int32_t mptcp_rto_spike_thresh = 3000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, rto_spikethresh,
	CTLFLAG_RW|CTLFLAG_LOCKED, &mptcp_rto_spike_thresh, 0,
	"MPTCP RTT spike thresh");

/*
 * Cloned TCP protosw/usrreqs used for subflow sockets; filled in once by
 * mptcp_init() from the real IPPROTO_TCP entries, with pru_soreceive and
 * pru_rcvoob overridden (see below).
 */
static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

/*
 * Protocol pr_init callback.
 *
 * One-time initialization of the MPTCP subsystem: clones the TCP protosw
 * entries for subflow use, sets up the global PCB info (mtcbinfo) and its
 * zones/locks, attaches to the MP domain for GC, and seeds the key pool.
 * Subsequent invocations return early via the mptcp_initialized latch.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	/* detach the clone from the protosw list it was copied out of */
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* subflow receive is redirected to the MPTCP-specific routine */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	/* global PCB info: zone for mpp_mtp (PCB + session + MPTCP PCB) */
	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
	mtcbinfo.mppi_gc = mptcp_gc;

	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	/* zone for subflow structures */
	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	/* zone for recorded socket options replayed onto subflows */
	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	/* zone for per-subflow authentication entries */
	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* Set up a list of unique keys */
	mptcp_key_pool_init();

}

/*
 * Create an MPTCP session, called as a result of opening a MPTCP socket.
 *
 * The session (mptses) and MPTCP PCB (mptcb) live inside the same mpp_mtp
 * allocation as the Multipath PCB (mpp); this routine wires the three
 * together, initializes the per-session lists/locks, and starts the
 * per-session kernel thread. Returns the session, or NULL if the thread
 * could not be started (the only failure path, error == ENOBUFS).
 */
struct mptses *
mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int error = 0;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* session and MPTCP PCB are co-allocated with the Multipath PCB */
	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = ASSOCID_ANY;
	mpte->mpte_connid_last = CONNID_ANY;

	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);

	/*
	 * XXX: adi@apple.com
	 *
	 * This can be rather expensive if we have lots of MPTCP sockets,
	 * but we need a kernel thread for this model to work. Perhaps we
	 * could amortize the costs by having one worker thread per a group
	 * of MPTCP sockets.
	 */
	if (kernel_thread_start(mptcp_thread_func, mpte,
	    &mpte->mpte_thread) != KERN_SUCCESS) {
		error = ENOBUFS;
		goto out;
	}
	mp_so->so_usecount++;		/* for thread */

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);
	mp_tp->mpt_mpte = mpte;

out:
	/* on failure, undo the thread lock init before returning NULL */
	if (error != 0)
		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
	    struct sockbuf *, &mp_so->so_rcv,
	    struct sockbuf *, &mp_so->so_snd,
	    struct mppcb *, mpp, int, error);

	return ((error != 0) ? NULL : mpte);
}

/*
 * Destroy an MPTCP session.
 *
 * Caller must hold the MP socket lock.  All subflows must already be gone
 * (asserted below); this flushes recorded socket options and tears down the
 * session/PCB mutexes.  The memory itself is freed with the mpp_mtp block.
 */
static void
mptcp_sesdestroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	lck_mtx_destroy(&mpte->mpte_thread_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	/*
	 * MPTCP Protocol Control Block section
	 */
	lck_mtx_destroy(&mp_tp->mpt_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}

/*
 * Allocate an MPTCP socket option structure.
 *
 * how: M_WAITOK allows blocking in zalloc; anything else uses the
 * non-blocking variant and may return NULL.  The returned record is zeroed.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return (mpo);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	/* must have been removed from the session's list first */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 *
 * Caller must hold the MP socket lock; the record must not already be
 * attached.  Marks it MPOF_ATTACHED and links it at the tail.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 *
 * Caller must hold the MP socket lock; the record must currently be
 * attached.  Clears MPOF_ATTACHED and unlinks it (does not free it).
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 *
 * Caller must hold the MP socket lock.  Returns the matching record or
 * NULL.  Only int-sized option values are recorded (asserted below).
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name)
			break;
	}
	VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));

	return (mpo);
}

/*
 * Flushes all recorded socket options from an MP socket.
 *
 * Caller must hold the MP socket lock.  Each record is unlinked and freed.
 */
void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Allocate a MPTCP subflow structure.
 *
 * how: M_WAITOK allows blocking in zalloc; anything else may return NULL.
 * The structure is zeroed and its per-subflow mutex initialized.
 */
struct mptsub *
mptcp_subflow_alloc(int how)
{
	struct mptsub *mpts;

	mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
	    zalloc_noblock(mptsub_zone);
	if (mpts != NULL) {
		bzero(mpts, mptsub_zone_size);
		lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
		    mtcbinfo.mppi_lock_attr);
	}

	return (mpts);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 *
 * Called with mpts_lock held; the lock is dropped and destroyed here before
 * the structure is returned to its zone.
 */
void
mptcp_subflow_free(struct mptsub *mpts)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* must be fully detached and unreferenced at this point */
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	if (mpts->mpts_src_sl != NULL) {
		sockaddrlist_free(mpts->mpts_src_sl);
		mpts->mpts_src_sl = NULL;
	}
	if (mpts->mpts_dst_sl != NULL) {
		sockaddrlist_free(mpts->mpts_dst_sl);
		mpts->mpts_dst_sl = NULL;
	}
	MPTS_UNLOCK(mpts);
	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);

	zfree(mptsub_zone, mpts);
}

/*
 * Create an MPTCP subflow socket.
 *
 * Caller must hold the MP socket lock.  On success, *so holds the new
 * subflow socket with the MPTCP-specific protosw installed; on failure a
 * nonzero errno is returned.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct proc *p, struct socket **so)
{
	struct mptopt smpo, *mpo, *tmpo;
	struct socket *mp_so;
	int error;

	*so = NULL;
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
 */
	if ((error = socreate_internal(dom, so, SOCK_STREAM,
	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
		    "create subflow socket error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
		return (error);
	}

	socket_lock(*so, 0);
	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
	    (SS_NBIO|SS_NOFDREF));

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* template option record reused for each internal setsockopt below */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/*
	 * Limit the receive socket buffer size to 64k.
	 *
	 * We need to take into consideration the window scale option
	 * which could be negotiated in one subflow but disabled in
	 * another subflow.
	 * XXX This can be improved in the future.
	 */
	smpo.mpo_name = SO_RCVBUF;
	smpo.mpo_intval = MPTCP_RWIN_MAX;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* N.B.: set by sosetopt */
	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
	/* Prevent automatic socket buffer sizing. */
	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;

	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		/* interim records that fail to replay are dropped, not fatal */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
			char buf[32];
			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
			    "interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
			    buf, sizeof (buf)), mpo->mpo_intval));
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	mpts->mpts_oprotosw = (*so)->so_proto;	/* saved for restore */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

out:
	socket_unlock(*so, 0);

	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
	    struct mptsub *, mpts, int, dom, int, error);

	return (error);
}

/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 *
 * Caller must hold mpts_lock.  Restores the original protosw saved by
 * mptcp_subflow_socreate() before handing the socket to soclose(); returns
 * soclose()'s result.
 */
static int
mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;
	socket_unlock(so, 0);

	mpts->mpts_socket = NULL;	/* may already be NULL */

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	return (soclose(so));
}

/*
 * Connect an MPTCP subflow socket.
 *
 * This may be called inline as part of adding a subflow, or asynchronously
 * by the thread (upon progressing to MPTCPF_JOIN_READY).  Note that in the
 * pending connect case, the subflow socket may have been bound to an interface
 * and/or a source IP address which may no longer be around by the time this
 * routine is called; in that case the connect attempt will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	int af, error;

	/* both the MP socket lock and the subflow lock must be held */
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* must be mid-connect: CONNECTING set, CONNECTED not yet */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
	    MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	if (af == AF_INET || af == AF_INET6) {
		struct sockaddr_entry *dst_se;
		char dbuf[MAX_IPv6_STR_LEN];

		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
		VERIFY(dst_se != NULL);

		/* debug-log the destination address/port being connected to */
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pended %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	socket_lock(so, 0);
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
	socket_unlock(so, 0);

	/* Allocate a unique address id per subflow; skip 0 on wraparound */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);

	return (error);
}

/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	int flags, error = 0;
	struct proc *p = current_proc();
	struct mbuf *m, **mp = mp0;
	struct mbuf *nextrecord;

	socket_lock(so, 1);
	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		socket_unlock(so, 1);
		return (EINVAL);
	}
	*mp = NULL;
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/* none of the soreceive() modes below make sense for a subflow */
	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
		socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}
	flags |= (MSG_DONTWAIT|MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return (0);
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		socket_unlock(so, 1);
		return (error);
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/* drain the entire receive buffer into the caller's mbuf chain */
	while (m != NULL) {
		nextrecord = m->m_nextpkt;
		sbfree(&so->so_rcv, m);

		if (mp != NULL) {
			*mp = m;
			mp = &m->m_next;
			so->so_rcv.sb_mb = m = m->m_next;
			*mp = NULL;
		}

		if (m != NULL) {
			m->m_nextpkt = nextrecord;
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		} else {
			m = so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
	/* notify protocol that we drained all the data */
	if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);

	if (flagsp != NULL)
		*flagsp |= flags;

release:
	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	return (error);

}


/*
 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
 * the work done earlier when the subflow socket was created.
 */
void
mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
    struct socket *so)
{
	struct mptopt smpo;
	struct socket *mp_so;
	int p, c;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* inherit MPTCP socket states */
	if (!(mp_so->so_state & SS_NBIO))
		so->so_state &= ~SS_NBIO;

	/*
	 * At this point, the socket is not yet closed, as there is at least
	 * one outstanding usecount previously held by mpts_socket from
	 * socreate().  Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
	 */
	so->so_flags &= ~SOF_MP_SUBFLOW;
	so->so_state &= ~SS_NOFDREF;
	so->so_flags &= ~SOF_MPTCP_TRUE;

	/* allow socket buffers to be compressed */
	so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
	so->so_snd.sb_flags &= ~SB_NOCOMPRESS;

	/*
	 * Allow socket buffer auto sizing.
	 *
	 * This will increase the current 64k buffer size to whatever is best.
	 */
	if (!(so->so_rcv.sb_flags & SB_USRSIZE))
		so->so_rcv.sb_flags |= SB_AUTOSIZE;
	if (!(so->so_snd.sb_flags & SB_USRSIZE))
		so->so_snd.sb_flags |= SB_AUTOSIZE;

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;

	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;

	/*
	 * For each option below, 'p' and 'c' hold the same flag bit masked
	 * out of the parent (MP) and child (subflow) sockets respectively,
	 * so (p - c) != 0 means "they differ", and (p - c) > 0 means "set
	 * on parent but not on child" — i.e. the subflow should turn the
	 * option on; otherwise off.
	 */

	/* inherit SOF_NOSIGPIPE from parent MP socket */
	p = (mp_so->so_flags & SOF_NOSIGPIPE);
	c = (so->so_flags & SOF_NOSIGPIPE);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SOF_NOADDRAVAIL from parent MP socket */
	p = (mp_so->so_flags & SOF_NOADDRAVAIL);
	c = (so->so_flags & SOF_NOADDRAVAIL);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_NOADDRERR;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* inherit SO_KEEPALIVE from parent MP socket */
	p = (mp_so->so_options & SO_KEEPALIVE);
	c = (so->so_options & SO_KEEPALIVE);
	smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
	smpo.mpo_name = SO_KEEPALIVE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);

	/* unset TCP level default keepalive option */
	p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
	c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = 0;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((p - c) != 0)
		(void) mptcp_subflow_sosetopt(mpte, so, &smpo);
	socket_unlock(so, 0);

	DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
	    struct mptsub *, mpts, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
}

/*
 * Establish an initial MPTCP connection (if first subflow and not yet
 * connected), or add a subflow to an existing MPTCP connection.
 */
int
mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
    struct proc *p, uint32_t ifscope)
{
	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
	struct socket *mp_so, *so = NULL;
	struct mptsub_connreq mpcr;
	struct mptcb *mp_tp;
	int af, error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		/* If the remote end sends Data FIN, refuse subflow adds */
		error = ENOTCONN;
		MPT_UNLOCK(mp_tp);
		return (error);
	}
	MPT_UNLOCK(mp_tp);

	MPTS_LOCK(mpts);
	/* the subflow must be pristine: not yet connecting nor attached */
	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);
	VERIFY(mpts->mpts_dst_sl != NULL);
	VERIFY(mpts->mpts_connid == CONNID_ANY);

	/* select source (if specified) and destination addresses */
	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
	    &mpts->mpts_dst_sl, &dst_se)) != 0)
		goto out;

	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
	af = mpts->mpts_family = dst_se->se_addr->sa_family;
	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
	VERIFY(af == AF_INET || af == AF_INET6);

	/*
	 * If the source address is not specified, allocate a storage for
	 * it, so that later on we can fill it in with the actual source
	 * IP address chosen by the underlying layer for the subflow after
	 * it is connected.
	 */
	if (mpts->mpts_src_sl == NULL) {
		mpts->mpts_src_sl =
		    sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
		if (mpts->mpts_src_sl == NULL) {
			error = ENOBUFS;
			goto out;
		}
		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
		VERIFY(se != NULL && se->se_addr != NULL &&
		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
		/* placeholder: zeroed address of matching family/length */
		bzero(se->se_addr, se->se_addr->sa_len);
		se->se_addr->sa_len = dst_se->se_addr->sa_len;
		se->se_addr->sa_family = dst_se->se_addr->sa_family;
	}

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
		goto out;

	/*
	 * If fastjoin is requested, set state in mpts.
	 * NOTE(review): mpt_state is read here without MPT_LOCK (only the
	 * mpt_snduna read below is under the lock) — confirm this racy
	 * read is acceptable.
	 */
	if ((so->so_flags & SOF_MPTCP_FASTJOIN) &&
	    (mp_tp->mpt_state == MPTCPS_ESTABLISHED) &&
	    (mpte->mpte_nummpcapflows == 0)) {
		mpts->mpts_flags |= MPTSF_FASTJ_REQD;
		mpts->mpts_rel_seq = 1;
		MPT_LOCK(mp_tp);
		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
	}

	/*
	 * Increment the counter, while avoiding 0 (CONNID_ANY) and
	 * -1 (CONNID_ALL).
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == CONNID_ALL ||
	    mpte->mpte_connid_last == CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	/* bind subflow socket to the specified interface */
	if (ifscope != IFSCOPE_NONE) {
		socket_lock(so, 0);
		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
		if (error != 0) {
			socket_unlock(so, 0);
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		VERIFY(mpts->mpts_outif != NULL);
		mpts->mpts_flags |= MPTSF_BOUND_IF;

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
		    "cid %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_outif->if_xname,
		    ifscope, mpts->mpts_connid));
		socket_unlock(so, 0);
	}

	/* if source address and/or port is specified, bind to it */
	if (src_se != NULL) {
		struct sockaddr *sa = src_se->se_addr;
		uint32_t mpts_flags = 0;
		/* lport is only assigned for AF_INET/AF_INET6; the VERIFY
		 * on af above guarantees one of the cases is taken */
		in_port_t lport;

		switch (af) {
		case AF_INET:
			if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN(sa)->sin_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#if INET6
		case AF_INET6:
			VERIFY(af == AF_INET6);
			if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN6(sa)->sin6_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#endif /* INET6 */
		}

		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
		if (error != 0) {
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		mpts->mpts_flags |= mpts_flags;

		if (af == AF_INET || af == AF_INET6) {
			char sbuf[MAX_IPv6_STR_LEN];

			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
			    "cid %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    inet_ntop(af, ((af == AF_INET) ?
			    (void *)&SIN(sa)->sin_addr.s_addr :
			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
			    ntohs(lport), mpts->mpts_connid));
		}
	}

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
	mp_so->so_usecount++;		/* for subflow socket */

	/* register for subflow socket read/write events */
	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
	    mptcp_subflow_wupcall, mpts);

	/*
	 * Register for subflow socket control events; ignore
	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
	 * will generate it here.
	 */
	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
	    SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
	    SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	bzero(&mpcr, sizeof (mpcr));
	mpcr.mpcr_proc = p;
	mpcr.mpcr_ifscope = ifscope;
	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		/* first subflow on a not-yet-established connection */
		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mp_tp->mpt_localkey = mptcp_reserve_key();
			mptcp_conn_properties(mp_tp);
		}
		MPT_UNLOCK(mp_tp);
		soisconnecting(mp_so);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
	} else {
		/* joining an existing connection (MP_JOIN path) */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;

		/* avoid starting up cellular subflow unless required */
		if ((mptcp_delayed_subf_start) &&
		    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		}
		MPT_UNLOCK(mp_tp);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
	}

	mpts->mpts_mpcr = mpcr;
	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

out:
	MPTS_UNLOCK(mpts);
	if (error == 0) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}
	return (error);
}

/*
 * Delete/remove a subflow from an MPTCP.
The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 * If peeloff(2) is called, leave the socket open.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
{
	struct socket *mp_so, *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	MPTS_LOCK(mpts);
	so = mpts->mpts_socket;
	VERIFY(so != NULL);

	/*
	 * A close is only honored once both DELETEOK and USER_DISCONNECT
	 * have been set on the subflow; otherwise bail out and keep it.
	 */
	if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
	    (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
		MPTS_UNLOCK(mpts);
		/* NOTE(review): debug-only read of mpts fields after
		 * MPTS_UNLOCK — presumably safe since the caller still
		 * holds a reference; confirm */
		mptcplog((LOG_DEBUG, "%s: %d %x\n", __func__,
		    mpts->mpts_soerror, mpts->mpts_flags));
		return;
	}

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
	    "[close %s] %d %x\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_usecount,
	    mp_so->so_retaincnt, mpts->mpts_connid,
	    (close ? "YES" : "NO"), mpts->mpts_soerror,
	    mpts->mpts_flags));

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* unlink the subflow from the MPTCP session */
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	VERIFY(mpte->mpte_numflows != 0);
	mpte->mpte_numflows--;
	if (mpte->mpte_active_sub == mpts)
		mpte->mpte_active_sub = NULL;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
	(void) sock_catchevents(so, NULL, NULL, 0);

	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);

	if (close)
		(void) mptcp_subflow_soclose(mpts, so);

	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;
	MPTS_UNLOCK(mpts);

	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
	MPTS_REMREF(mpts);		/* for subflow socket */

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
}

/*
 * Disconnect a subflow socket.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
    boolean_t deleteok)
{
	struct socket *so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* idempotent: a disconnect already in progress wins */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
		return;

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/*
	 * If this is coming from disconnectx(2) or issued as part of
	 * closing the MPTCP socket, the subflow shouldn't stick around.
	 * Otherwise let it linger around in case the upper layers need
	 * to retrieve its conninfo.
	 */
	if (deleteok)
		mpts->mpts_flags |= MPTSF_DELETEOK;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	MPT_LOCK(mp_tp);
	/* past ESTABLISHED means the connection-level close has begun */
	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
		send_dfin = 1;
	MPT_UNLOCK(mp_tp);

	socket_lock(so, 0);
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
		    __func__, mpts->mpts_connid, send_dfin,
		    (deleteok ? "NO" : "YES")));

		if (send_dfin)
			mptcp_send_dfin(so);
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}
	socket_unlock(so, 0);
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.  This will also end up
	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
	 * we cannot do that here because subflow lock is currently held.
	 */
	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
}

/*
 * Subflow socket read upcall.
 *
 * Called when the associated subflow socket posted a read event.  The subflow
 * socket lock has been released prior to invoking the callback.  Note that the
 * upcall may occur synchronously as a result of MPTCP performing an action on
 * it, or asynchronously as a result of an event happening at the subflow layer.
 * Therefore, to maintain lock ordering, the only lock that can be acquired
 * here is the thread lock, for signalling purposes.
1526 */ 1527static void 1528mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf) 1529{ 1530#pragma unused(so, waitf) 1531 struct mptsub *mpts = arg; 1532 struct mptses *mpte = mpts->mpts_mpte; 1533 1534 /* 1535 * mpte should never be NULL, except in a race with 1536 * mptcp_subflow_del 1537 */ 1538 if (mpte == NULL) 1539 return; 1540 1541 lck_mtx_lock(&mpte->mpte_thread_lock); 1542 mptcp_thread_signal_locked(mpte); 1543 lck_mtx_unlock(&mpte->mpte_thread_lock); 1544} 1545 1546/* 1547 * Subflow socket input. 1548 * 1549 * Called in the context of the MPTCP thread, for reading data from the 1550 * underlying subflow socket and delivering it to MPTCP. 1551 */ 1552static void 1553mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts) 1554{ 1555 struct mbuf *m = NULL; 1556 struct socket *so; 1557 int error; 1558 struct mptsub *mpts_alt = NULL; 1559 1560 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1561 MPTS_LOCK_ASSERT_HELD(mpts); 1562 1563 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte, 1564 struct mptsub *, mpts); 1565 1566 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) 1567 return; 1568 1569 so = mpts->mpts_socket; 1570 1571 error = sock_receive_internal(so, NULL, &m, 0, NULL); 1572 if (error != 0 && error != EWOULDBLOCK) { 1573 mptcplog((LOG_ERR, "%s: cid %d error %d\n", 1574 __func__, mpts->mpts_connid, error)); 1575 MPTS_UNLOCK(mpts); 1576 mpts_alt = mptcp_get_subflow(mpte, mpts); 1577 if (mpts_alt == NULL) { 1578 if (mptcp_delayed_subf_start) { 1579 mpts_alt = mptcp_get_pending_subflow(mpte, 1580 mpts); 1581 if (mpts_alt) { 1582 mptcplog((LOG_INFO,"%s: pending %d\n", 1583 __func__, mpts_alt->mpts_connid)); 1584 } else { 1585 mptcplog((LOG_ERR, "%s: no pending", 1586 "%d\n", __func__, 1587 mpts->mpts_connid)); 1588 mpte->mpte_mppcb->mpp_socket->so_error = 1589 error; 1590 } 1591 } else { 1592 mptcplog((LOG_ERR, "%s: no alt path cid %d\n", 1593 __func__, mpts->mpts_connid)); 1594 mpte->mpte_mppcb->mpp_socket->so_error = error; 1595 } 
1596 } 1597 MPTS_LOCK(mpts); 1598 } else if (error == 0) { 1599 mptcplog3((LOG_DEBUG, "%s: cid %d \n", 1600 __func__, mpts->mpts_connid)); 1601 } 1602 1603 /* In fallback, make sure to accept data on all but one subflow */ 1604 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) && 1605 (!(mpts->mpts_flags & MPTSF_ACTIVE))) { 1606 m_freem(m); 1607 return; 1608 } 1609 1610 if (m != NULL) { 1611 /* 1612 * Release subflow lock since this may trigger MPTCP to send, 1613 * possibly on a different subflow. An extra reference has 1614 * been held on the subflow by the MPTCP thread before coming 1615 * here, so we can be sure that it won't go away, in the event 1616 * the MP socket lock gets released. 1617 */ 1618 MPTS_UNLOCK(mpts); 1619 mptcp_input(mpte, m); 1620 MPTS_LOCK(mpts); 1621 } 1622} 1623 1624/* 1625 * Subflow socket write upcall. 1626 * 1627 * Called when the associated subflow socket posted a read event. The subflow 1628 * socket lock has been released prior to invoking the callback. Note that the 1629 * upcall may occur synchronously as a result of MPTCP performing an action on 1630 * it, or asynchronously as a result of an event happening at the subflow layer. 1631 * Therefore, to maintain lock ordering, the only lock that can be acquired 1632 * here is the thread lock, for signalling purposes. 1633 */ 1634static void 1635mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf) 1636{ 1637#pragma unused(so, waitf) 1638 struct mptsub *mpts = arg; 1639 struct mptses *mpte = mpts->mpts_mpte; 1640 1641 /* 1642 * mpte should never be NULL except in a race with 1643 * mptcp_subflow_del which doesn't hold socket lock across critical 1644 * section. This upcall is made after releasing the socket lock. 1645 * Interleaving of socket operations becomes possible therefore. 
1646 */ 1647 if (mpte == NULL) 1648 return; 1649 1650 lck_mtx_lock(&mpte->mpte_thread_lock); 1651 mptcp_thread_signal_locked(mpte); 1652 lck_mtx_unlock(&mpte->mpte_thread_lock); 1653} 1654 1655/* 1656 * Subflow socket output. 1657 * 1658 * Called for sending data from MPTCP to the underlying subflow socket. 1659 */ 1660int 1661mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts) 1662{ 1663 struct socket *mp_so, *so; 1664 size_t sb_cc = 0, tot_sent = 0; 1665 struct mbuf *sb_mb; 1666 int error = 0; 1667 u_int64_t mpt_dsn = 0; 1668 struct mptcb *mp_tp = mpte->mpte_mptcb; 1669 struct mbuf *mpt_mbuf = NULL; 1670 u_int64_t off = 0; 1671 struct mbuf *head, *tail; 1672 1673 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1674 MPTS_LOCK_ASSERT_HELD(mpts); 1675 mp_so = mpte->mpte_mppcb->mpp_socket; 1676 so = mpts->mpts_socket; 1677 1678 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte, 1679 struct mptsub *, mpts); 1680 1681 /* subflow socket is suspended? */ 1682 if (mpts->mpts_flags & MPTSF_SUSPENDED) { 1683 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow " 1684 "controlled\n", __func__, 1685 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid)); 1686 goto out; 1687 } 1688 1689 /* subflow socket is not MPTCP capable? 
*/ 1690 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) && 1691 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) && 1692 !(mpts->mpts_flags & MPTSF_FASTJ_SEND)) { 1693 mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not " 1694 "MPTCP capable\n", __func__, 1695 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid)); 1696 goto out; 1697 } 1698 1699 /* Remove Addr Option is not sent reliably as per I-D */ 1700 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) { 1701 struct tcpcb *tp = intotcpcb(sotoinpcb(so)); 1702 tp->t_rem_aid = mpte->mpte_lost_aid; 1703 if (mptcp_remaddr_enable) 1704 tp->t_mpflags |= TMPF_SND_REM_ADDR; 1705 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR; 1706 } 1707 1708 /* 1709 * The mbuf chains containing the metadata (as well as pointing to 1710 * the user data sitting at the MPTCP output queue) would then be 1711 * sent down to the subflow socket. 1712 * 1713 * Some notes on data sequencing: 1714 * 1715 * a. Each mbuf must be a M_PKTHDR. 1716 * b. MPTCP metadata is stored in the mptcp_pktinfo structure 1717 * in the mbuf pkthdr structure. 1718 * c. Each mbuf containing the MPTCP metadata must have its 1719 * pkt_flags marked with the PKTF_MPTCP flag. 
1720 */ 1721 1722 /* First, drop acknowledged data */ 1723 sb_mb = mp_so->so_snd.sb_mb; 1724 if (sb_mb == NULL) { 1725 goto out; 1726 } 1727 1728 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP); 1729 1730 mpt_mbuf = sb_mb; 1731 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) { 1732 mpt_mbuf = mpt_mbuf->m_next; 1733 } 1734 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) { 1735 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn; 1736 } else { 1737 goto out; 1738 } 1739 1740 MPT_LOCK(mp_tp); 1741 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) { 1742 u_int64_t len = 0; 1743 len = mp_tp->mpt_snduna - mpt_dsn; 1744 sbdrop(&mp_so->so_snd, (int)len); 1745 1746 } 1747 1748 /* 1749 * In degraded mode, we don't receive data acks, so force free 1750 * mbufs less than snd_nxt 1751 */ 1752 if (mp_so->so_snd.sb_mb == NULL) { 1753 MPT_UNLOCK(mp_tp); 1754 goto out; 1755 } 1756 1757 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn; 1758 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) && 1759 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) && 1760 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) { 1761 u_int64_t len = 0; 1762 len = mp_tp->mpt_sndnxt - mpt_dsn; 1763 sbdrop(&mp_so->so_snd, (int)len); 1764 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt; 1765 } 1766 1767 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) && 1768 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) { 1769 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC; 1770 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC; 1771 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL) 1772 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail; 1773 } 1774 1775 /* 1776 * Adjust the subflow's notion of next byte to send based on 1777 * the last unacknowledged byte 1778 */ 1779 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) { 1780 mpts->mpts_sndnxt = mp_tp->mpt_snduna; 1781 /* 1782 * With FastJoin, a write before the fastjoin event will use 1783 * an uninitialized relative sequence number. 
1784 */ 1785 if (mpts->mpts_rel_seq == 0) 1786 mpts->mpts_rel_seq = 1; 1787 } 1788 1789 /* 1790 * Adjust the top level notion of next byte used for retransmissions 1791 * and sending FINs. 1792 */ 1793 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) { 1794 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; 1795 } 1796 1797 1798 /* Now determine the offset from which to start transmitting data */ 1799 sb_mb = mp_so->so_snd.sb_mb; 1800 sb_cc = mp_so->so_snd.sb_cc; 1801 if (sb_mb == NULL) { 1802 MPT_UNLOCK(mp_tp); 1803 goto out; 1804 } 1805 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) { 1806 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna; 1807 sb_cc -= (size_t)off; 1808 } else { 1809 MPT_UNLOCK(mp_tp); 1810 goto out; 1811 } 1812 MPT_UNLOCK(mp_tp); 1813 1814 mpt_mbuf = sb_mb; 1815 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn; 1816 1817 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) || 1818 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) { 1819 off -= mpt_mbuf->m_pkthdr.mp_rlen; 1820 mpt_mbuf = mpt_mbuf->m_next; 1821 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn; 1822 } 1823 if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED)) 1824 mptcplog2((LOG_INFO, "%s: snduna = %llu off = %lld id = %d" 1825 " %llu \n", 1826 __func__, 1827 mp_tp->mpt_snduna, off, mpts->mpts_connid, 1828 mpts->mpts_sndnxt)); 1829 1830 VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)); 1831 1832 head = tail = NULL; 1833 1834 while (tot_sent < sb_cc) { 1835 struct mbuf *m; 1836 size_t mlen; 1837 1838 mlen = mpt_mbuf->m_pkthdr.mp_rlen; 1839 mlen -= off; 1840 if (mlen == 0) 1841 goto out; 1842 1843 if (mlen > sb_cc) { 1844 panic("%s: unexpected %lu %lu \n", __func__, 1845 mlen, sb_cc); 1846 } 1847 1848 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, 1849 M_COPYM_MUST_COPY_HDR); 1850 if (m == NULL) { 1851 error = ENOBUFS; 1852 break; 1853 } 1854 1855 /* Create a DSN mapping for the data (m_copym does it) */ 1856 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn; 1857 VERIFY(m->m_flags & 
M_PKTHDR); 1858 m->m_pkthdr.pkt_flags |= PKTF_MPTCP; 1859 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO; 1860 m->m_pkthdr.mp_dsn = mpt_dsn + off; 1861 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq; 1862 m->m_pkthdr.mp_rlen = mlen; 1863 mpts->mpts_rel_seq += mlen; 1864 m->m_pkthdr.len = mlen; 1865 1866 if (head == NULL) { 1867 head = tail = m; 1868 } else { 1869 tail->m_next = m; 1870 tail = m; 1871 } 1872 1873 /* last contiguous mapping is stored for error cases */ 1874 if (mpts->mpts_lastmap.mptsl_dsn + 1875 mpts->mpts_lastmap.mptsl_len == mpt_dsn) { 1876 mpts->mpts_lastmap.mptsl_len += tot_sent; 1877 } else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn + 1878 mpts->mpts_lastmap.mptsl_len), mpt_dsn)) { 1879 if (m->m_pkthdr.mp_dsn == 0) 1880 panic("%s %llu", __func__, mpt_dsn); 1881 mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn; 1882 mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq; 1883 mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen; 1884 } 1885 1886 tot_sent += mlen; 1887 off = 0; 1888 mpt_mbuf = mpt_mbuf->m_next; 1889 } 1890 1891 if (head != NULL) { 1892 1893 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) { 1894 struct tcpcb *tp = intotcpcb(sotoinpcb(so)); 1895 tp->t_mpflags |= TMPF_FASTJOIN_SEND; 1896 } 1897 1898 error = sock_sendmbuf(so, NULL, head, 0, NULL); 1899 1900 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so, 1901 struct sockbuf *, &so->so_rcv, 1902 struct sockbuf *, &so->so_snd, 1903 struct mptses *, mpte, struct mptsub *, mpts, 1904 size_t, tot_sent); 1905 } 1906 1907 if (error == 0) { 1908 mpts->mpts_sndnxt += tot_sent; 1909 MPT_LOCK(mp_tp); 1910 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) { 1911 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) > 1912 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) 1913 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN; 1914 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt; 1915 } 1916 mptcp_cancel_timer(mp_tp, MPTT_REXMT); 1917 MPT_UNLOCK(mp_tp); 1918 1919 /* Send once in SYN_SENT state to avoid sending SYN spam */ 1920 if 
(mpts->mpts_flags & MPTSF_FASTJ_SEND) { 1921 so->so_flags &= ~SOF_MPTCP_FASTJOIN; 1922 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND; 1923 } 1924 1925 if ((mpts->mpts_connid >= 2) || 1926 (mpts->mpts_flags & MPTSF_MP_DEGRADED)) 1927 mptcplog2((LOG_DEBUG, "%s: cid %d wrote %d %d\n", 1928 __func__, mpts->mpts_connid, (int)tot_sent, 1929 (int) sb_cc)); 1930 } else { 1931 mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d len %zd\n", 1932 __func__, mpts->mpts_connid, error, tot_sent)); 1933 } 1934out: 1935 return (error); 1936} 1937 1938/* 1939 * Subflow socket control event upcall. 1940 * 1941 * Called when the associated subflow socket posted one or more control events. 1942 * The subflow socket lock has been released prior to invoking the callback. 1943 * Note that the upcall may occur synchronously as a result of MPTCP performing 1944 * an action on it, or asynchronously as a result of an event happening at the 1945 * subflow layer. Therefore, to maintain lock ordering, the only lock that can 1946 * be acquired here is the thread lock, for signalling purposes. 1947 */ 1948static void 1949mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events) 1950{ 1951#pragma unused(so) 1952 struct mptsub *mpts = arg; 1953 struct mptses *mpte = mpts->mpts_mpte; 1954 1955 VERIFY(mpte != NULL); 1956 1957 lck_mtx_lock(&mpte->mpte_thread_lock); 1958 atomic_bitset_32(&mpts->mpts_evctl, events); 1959 mptcp_thread_signal_locked(mpte); 1960 lck_mtx_unlock(&mpte->mpte_thread_lock); 1961} 1962 1963/* 1964 * Subflow socket control events. 1965 * 1966 * Called for handling events related to the underlying subflow socket. 
1967 */ 1968static ev_ret_t 1969mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts) 1970{ 1971 uint32_t events, save_events; 1972 ev_ret_t ret = MPTS_EVRET_OK; 1973 1974 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1975 MPTS_LOCK_ASSERT_HELD(mpts); 1976 1977 /* bail if there's nothing to process */ 1978 if ((events = mpts->mpts_evctl) == 0) 1979 return (ret); 1980 1981 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST| 1982 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE| 1983 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR| 1984 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND| 1985 SO_FILT_HINT_DISCONNECTED)) { 1986 events |= SO_FILT_HINT_MPFAILOVER; 1987 } 1988 1989 save_events = events; 1990 1991 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte, 1992 struct mptsub *, mpts, uint32_t, events); 1993 1994 mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__, 1995 mpts->mpts_connid, events, SO_FILT_HINT_BITS)); 1996 1997 if ((events & SO_FILT_HINT_MPCANTRCVMORE) && (ret >= MPTS_EVRET_OK)) { 1998 ev_ret_t error = mptcp_subflow_mpcantrcvmore_ev(mpte, mpts); 1999 events &= ~SO_FILT_HINT_MPCANTRCVMORE; 2000 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2001 } 2002 if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) { 2003 ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts); 2004 events &= ~SO_FILT_HINT_MPFAILOVER; 2005 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2006 } 2007 if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) { 2008 ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts); 2009 events &= ~SO_FILT_HINT_CONNRESET; 2010 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2011 } 2012 if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) { 2013 ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts); 2014 events &= ~SO_FILT_HINT_MUSTRST; 2015 ret = ((error >= MPTS_EVRET_OK) ? 
MAX(error, ret) : error); 2016 } 2017 if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) { 2018 ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts); 2019 events &= ~SO_FILT_HINT_CANTRCVMORE; 2020 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2021 } 2022 if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) { 2023 ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts); 2024 events &= ~SO_FILT_HINT_CANTSENDMORE; 2025 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2026 } 2027 if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) { 2028 ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts); 2029 events &= ~SO_FILT_HINT_TIMEOUT; 2030 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2031 } 2032 if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) { 2033 ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts); 2034 events &= ~SO_FILT_HINT_NOSRCADDR; 2035 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2036 } 2037 if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) { 2038 ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts); 2039 events &= ~SO_FILT_HINT_IFDENIED; 2040 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2041 } 2042 if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) { 2043 ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts); 2044 events &= ~SO_FILT_HINT_SUSPEND; 2045 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2046 } 2047 if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) { 2048 ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts); 2049 events &= ~SO_FILT_HINT_RESUME; 2050 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2051 } 2052 if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) { 2053 ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts); 2054 events &= ~SO_FILT_HINT_CONNECTED; 2055 ret = ((error >= MPTS_EVRET_OK) ? 
MAX(error, ret) : error); 2056 } 2057 if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) { 2058 ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts); 2059 events &= ~SO_FILT_HINT_MPSTATUS; 2060 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2061 } 2062 if ((events & SO_FILT_HINT_DELETEOK) && (ret >= MPTS_EVRET_OK)) { 2063 ev_ret_t error = mptcp_deleteok_ev(mpte, mpts); 2064 events &= ~SO_FILT_HINT_DELETEOK; 2065 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2066 } 2067 if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) { 2068 ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts); 2069 events &= ~SO_FILT_HINT_DISCONNECTED; 2070 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2071 } 2072 if ((events & SO_FILT_HINT_MPFASTJ) && (ret >= MPTS_EVRET_OK)) { 2073 ev_ret_t error = mptcp_fastjoin_ev(mpte, mpts); 2074 events &= ~SO_FILT_HINT_MPFASTJ; 2075 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error); 2076 } 2077 2078 /* 2079 * We should be getting only events specified via sock_catchevents(), 2080 * so loudly complain if we have any unprocessed one(s). 2081 */ 2082 if (events != 0 || ret < MPTS_EVRET_OK) { 2083 mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)" 2084 " unhandled events=%b\n", 2085 (events != 0) ? "MPTCP_ERROR " : "", 2086 __func__, mpts->mpts_connid, 2087 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS)); 2088 } 2089 2090 /* clear the ones we've processed */ 2091 atomic_bitclear_32(&mpts->mpts_evctl, save_events); 2092 2093 return (ret); 2094} 2095 2096/* 2097 * Handle SO_FILT_HINT_CONNRESET subflow socket event. 
 */
static ev_ret_t
mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	/* NOTE(review): 'so' is fetched but not otherwise used here */
	so = mpts->mpts_socket;

	/* linger iff neither side has started tearing the subflow down */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	/*
	 * We got a TCP RST for this subflow connection.
	 *
	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
	 * client if the MPTCP connection has not been established or
	 * if the connection has only one subflow and is a connection being
	 * resumed. Otherwise we close the socket.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
	} else if (mpte->mpte_nummpcapflows < 1) {
		mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
		/*
		 * Drop both PCB locks before posting the event on the MP
		 * socket, then reacquire them in the original order.
		 */
		MPT_UNLOCK(mp_tp);
		MPTS_UNLOCK(mpts);
		soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET);
		MPTS_LOCK(mpts);
		MPT_LOCK(mp_tp);
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
 */
static ev_ret_t
mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
{
	/* NOTE(review): 'so' is fetched but not otherwise used here */
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));

	/*
	 * We got a FIN for this subflow connection.  This subflow socket
	 * is no longer available for receiving data;
	 * The FIN may arrive with data.  The data is handed up to the
	 * mptcp socket and the subflow is disconnected.
	 */

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
 */
static ev_ret_t
mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
{
	/* NOTE(review): 'so' is fetched but not otherwise used here */
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
 */
static ev_ret_t
mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger iff neither side has started tearing the subflow down */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ETIMEDOUT;

	/*
	 * The subflow connection has timed out.
	 *
	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
	 * client if the MPTCP connection has not been established. Otherwise
	 * drop it.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ETIMEDOUT;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
 */
static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;
	struct tcpcb *tp = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Not grabbing socket lock as t_local_aid is write once only */
	tp = intotcpcb(sotoinpcb(so));
	/*
	 * This overwrites any previous mpte_lost_aid to avoid storing
	 * too much state when the typical case has only two subflows.
	 */
	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
	mpte->mpte_lost_aid = tp->t_local_aid;

	/* linger iff neither side has started tearing the subflow down */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EADDRNOTAVAIL;

	/*
	 * The subflow connection has lost its source address.
	 *
	 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
	 * client if the MPTCP connection has not been established. If it
	 * has been established with one subflow , we keep the MPTCP
	 * connection valid without any subflows till closed by application.
	 * This lets tcp connection manager decide whether to close this or
	 * not as it reacts to reachability changes too.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
	    (mp_so->so_flags & SOF_NOADDRAVAIL)) {
		mp_so->so_error = EADDRNOTAVAIL;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
 * indicates that the remote side sent a Data FIN
 */
static ev_ret_t
mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so, *mp_so;
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));

	/*
	 * We got a Data FIN for the MPTCP connection.
	 * The FIN may arrive with data. The data is handed up to the
	 * mptcp socket and the user is notified so that it may close
	 * the socket if needed.
	 */
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
		/* drop both locks before notifying the MP socket reader */
		MPT_UNLOCK(mp_tp);
		MPTS_UNLOCK(mpts);
		soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
		MPTS_LOCK(mpts);
		MPT_LOCK(mp_tp);
	}
	MPT_UNLOCK(mp_tp);
	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct mptsub *mpts_alt = NULL;
	struct socket *so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));

	/* drop this subflow's lock while searching for an alternate */
	MPTS_UNLOCK(mpts);
	mpts_alt = mptcp_get_subflow(mpte, mpts);

	/*
	 * If there is no alternate eligible subflow, ignore the
	 * failover hint.
	 */
	if (mpts_alt == NULL) {
		mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
		if (mptcp_delayed_subf_start) {
			/* try kicking off a not-yet-started subflow */
			mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
			if (mpts_alt != NULL) {
				MPTS_LOCK(mpts_alt);
				(void) mptcp_subflow_soconnectx(mpte,
				    mpts_alt);
				MPTS_UNLOCK(mpts_alt);
			}
		}
		MPTS_LOCK(mpts);
		goto done;
	}
	MPTS_LOCK(mpts_alt);
	altpath_exists = 1;
	so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		socket_lock(so, 1);
		/* All data acknowledged and no RTT spike */
		if ((so->so_snd.sb_cc == 0) &&
		    (mptcp_no_rto_spike(so))) {
			so->so_flags &= ~SOF_MP_TRYFAILOVER;
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
		socket_unlock(so, 1);
	}
	if (altpath_exists) {
		mptcplog2((LOG_INFO, "%s: cid = %d\n",
		    __func__, mpts_alt->mpts_connid));
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
		struct mptcb *mp_tp = mpte->mpte_mptcb;
		/* Bring the subflow's notion of snd_nxt into the send window */
		MPT_LOCK(mp_tp);
		mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
		mpte->mpte_active_sub = mpts_alt;
		socket_lock(so, 1);
		sowwakeup(so);
		socket_unlock(so, 1);
	}
	MPTS_UNLOCK(mpts_alt);

	if (altpath_exists) {
		soevent(mp_so,
		    SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
		mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
		    "%d to %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, mpts_alt->mpts_connid));
		tcpstat.tcps_mp_switches++;
	}

	MPTS_LOCK(mpts);
	if (altpath_exists) {
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;
	} else {
		mptcplog2((LOG_INFO, "%s: no alt cid = %d\n",
		    __func__, mpts->mpts_connid));
		/* 'done' lands inside this else: clear the try-failover flag */
done:
		so = mpts->mpts_socket;
		socket_lock(so, 1);
		so->so_flags &= ~SOF_MP_TRYFAILOVER;
		socket_unlock(so, 1);
	}
	MPTS_LOCK_ASSERT_HELD(mpts);
	return (MPTS_EVRET_OK);
}

/*
 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger iff neither side has started tearing the subflow down */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EHOSTUNREACH;

	/*
	 * The subflow connection cannot use the outgoing interface.
	 *
	 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
	 * client if the MPTCP connection has not been established. If it
	 * has been established, let the upper layer call disconnectx.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = EHOSTUNREACH;
	}
	MPT_UNLOCK(mp_tp);

	MPTS_LOCK(mpts);
	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2493 */ 2494static ev_ret_t 2495mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts) 2496{ 2497 struct socket *so; 2498 2499 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2500 MPTS_LOCK_ASSERT_HELD(mpts); 2501 2502 so = mpts->mpts_socket; 2503 2504 /* the subflow connection is being flow controlled */ 2505 mpts->mpts_flags |= MPTSF_SUSPENDED; 2506 2507 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, 2508 mpts->mpts_connid)); 2509 2510 return (MPTS_EVRET_OK); /* keep the subflow socket around */ 2511} 2512 2513/* 2514 * Handle SO_FILT_HINT_RESUME subflow socket event. 2515 */ 2516static ev_ret_t 2517mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts) 2518{ 2519 struct socket *so; 2520 2521 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2522 MPTS_LOCK_ASSERT_HELD(mpts); 2523 2524 so = mpts->mpts_socket; 2525 2526 /* the subflow connection is no longer flow controlled */ 2527 mpts->mpts_flags &= ~MPTSF_SUSPENDED; 2528 2529 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid)); 2530 2531 return (MPTS_EVRET_OK); /* keep the subflow socket around */ 2532} 2533 2534/* 2535 * Handle SO_FILT_HINT_CONNECTED subflow socket event. 
 */
static ev_ret_t
mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts)
{
	char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
	struct sockaddr_entry *src_se, *dst_se;
	struct sockaddr_storage src;
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct ifnet *outifp;
	int af, error = 0;
	boolean_t mpok = FALSE;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK_ASSERT_HELD(mpts);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	/* already processed a connected event for this subflow */
	if (mpts->mpts_flags & MPTSF_CONNECTED)
		return (MPTS_EVRET_OK);

	if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
	    (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
		socket_lock(so, 0);
		if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
		    (so->so_state & SS_ISCONNECTED)) {
			mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
			    __func__, mpts->mpts_connid));
			(void) soshutdownlock(so, SHUT_RD);
			(void) soshutdownlock(so, SHUT_WR);
			(void) sodisconnectlocked(so);
		}
		socket_unlock(so, 0);
		return (MPTS_EVRET_OK);
	}

	/*
	 * The subflow connection has been connected. Find out whether it
	 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
	 *
	 * a. If MPTCP connection is not yet established, then this must be
	 * the first subflow connection. If MPTCP failed to negotiate,
	 * indicate to the MPTCP socket client via EPROTO, that the
	 * underlying TCP connection may be peeled off via peeloff(2).
	 * Otherwise, mark the MPTCP socket as connected.
	 *
	 * b. If MPTCP connection has been established, then this must be
	 * one of the subsequent subflow connections. If MPTCP failed
	 * to negotiate, disconnect the connection since peeloff(2)
	 * is no longer possible.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */
	socket_lock(so, 0);

	if (so->so_state & SS_ISDISCONNECTED) {
		/*
		 * With MPTCP joins, a connection is connected at the subflow
		 * level, but the 4th ACK from the server elevates the MPTCP
		 * subflow to connected state. So there is a small window
		 * where the subflow could get disconnected before the
		 * connected event is processed.
		 */
		socket_unlock(so, 0);
		return (MPTS_EVRET_OK);
	}

	mpts->mpts_soerror = 0;
	mpts->mpts_flags &= ~MPTSF_CONNECTING;
	mpts->mpts_flags |= MPTSF_CONNECTED;
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;

	VERIFY(mpts->mpts_dst_sl != NULL);
	dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
	VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
	    dst_se->se_addr->sa_family == af);

	VERIFY(mpts->mpts_src_sl != NULL);
	src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
	VERIFY(src_se != NULL && src_se->se_addr != NULL &&
	    src_se->se_addr->sa_family == af);

	/* get/check source IP address */
	switch (af) {
	case AF_INET: {
		error = in_getsockaddr_s(so, &src);
		if (error == 0) {
			struct sockaddr_in *ms = SIN(src_se->se_addr);
			struct sockaddr_in *s = SIN(&src);

			VERIFY(s->sin_len == ms->sin_len);
			VERIFY(ms->sin_family == AF_INET);

			/* warn if a bound address differs from the actual one */
			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
			    bcmp(&ms->sin_addr, &s->sin_addr,
			    sizeof (ms->sin_addr)) != 0) {
				mptcplog((LOG_ERR, "%s: cid %d local "
				    "address %s (expected %s)\n", __func__,
				    mpts->mpts_connid, inet_ntop(AF_INET,
				    (void *)&s->sin_addr.s_addr, buf0,
				    sizeof (buf0)), inet_ntop(AF_INET,
				    (void *)&ms->sin_addr.s_addr, buf1,
				    sizeof (buf1))));
			}
			bcopy(s, ms, sizeof (*s));
		}
		break;
	}
#if INET6
	case AF_INET6: {
		error = in6_getsockaddr_s(so, &src);
		if (error == 0) {
			struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
			struct sockaddr_in6 *s = SIN6(&src);

			VERIFY(s->sin6_len == ms->sin6_len);
			VERIFY(ms->sin6_family == AF_INET6);

			/* warn if a bound address differs from the actual one */
			if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
			    bcmp(&ms->sin6_addr, &s->sin6_addr,
			    sizeof (ms->sin6_addr)) != 0) {
				mptcplog((LOG_ERR, "%s: cid %d local "
				    "address %s (expected %s)\n", __func__,
				    mpts->mpts_connid, inet_ntop(AF_INET6,
				    (void *)&s->sin6_addr, buf0,
				    sizeof (buf0)), inet_ntop(AF_INET6,
				    (void *)&ms->sin6_addr, buf1,
				    sizeof (buf1))));
			}
			bcopy(s, ms, sizeof (*s));
		}
		break;
	}
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	if (error != 0) {
		mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n",
		    __func__, mpts->mpts_connid, error));
	}

	/* get/verify the outbound interface */
	outifp = sotoinpcb(so)->inp_last_outifp;	/* could be NULL */
	if (mpts->mpts_flags & MPTSF_BOUND_IF) {
		VERIFY(mpts->mpts_outif != NULL);
		if (mpts->mpts_outif != outifp) {
			mptcplog((LOG_ERR, "%s: cid %d outif %s "
			    "(expected %s)\n", __func__, mpts->mpts_connid,
			    ((outifp != NULL) ? outifp->if_xname : "NULL"),
			    mpts->mpts_outif->if_xname));
			if (outifp == NULL)
				outifp = mpts->mpts_outif;
		}
	} else {
		mpts->mpts_outif = outifp;
	}

	socket_unlock(so, 0);

	mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] "
	    "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
	    outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
	    (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
	    (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
	    ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
	    ntohs(SIN6(src_se->se_addr)->sin6_port)),
	    inet_ntop(af, ((af == AF_INET) ?
	    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
	    (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
	    ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
	    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
	    ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
	    "MPTCP capable" : "a regular TCP")));

	mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		/* case (a) above */
		if (!mpok) {
			mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
			(void) mptcp_drop(mpte, mp_tp, EPROTO);
			MPT_UNLOCK(mp_tp);
		} else {
			if (mptcp_init_authparms(mp_tp) != 0) {
				mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
				(void) mptcp_drop(mpte, mp_tp, EPROTO);
				MPT_UNLOCK(mp_tp);
				mpok = FALSE;
			} else {
				mp_tp->mpt_state = MPTCPS_ESTABLISHED;
				mpte->mpte_associd = mpts->mpts_connid;
				DTRACE_MPTCP2(state__change,
				    struct mptcb *, mp_tp,
				    uint32_t, 0 /* event */);
				mptcp_init_statevars(mp_tp);
				MPT_UNLOCK(mp_tp);

				(void) mptcp_setconnorder(mpte,
				    mpts->mpts_connid, 1);
				soisconnected(mp_so);
			}
		}
		MPTS_LOCK(mpts);
		if (mpok) {
			/* Initialize the relative sequence number */
			mpts->mpts_rel_seq = 1;
			mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
			mpte->mpte_nummpcapflows++;
			MPT_LOCK_SPIN(mp_tp);
			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
			MPT_UNLOCK(mp_tp);
		}
	} else if (mpok) {
		MPT_UNLOCK(mp_tp);
		if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
			/* Experimental code, disabled by default.
			 */
			sorwakeup(mp_so);
			sowwakeup(mp_so);
		}
		/*
		 * case (b) above
		 * In case of additional flows, the MPTCP socket is not
		 * MPTSF_MP_CAPABLE until an ACK is received from server
		 * for 3-way handshake.  TCP would have guaranteed that this
		 * is an MPTCP subflow.
		 */
		MPTS_LOCK(mpts);
		mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
		mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
		mpte->mpte_nummpcapflows++;
		/* With Fastjoin, rel sequence will be nonzero */
		if (mpts->mpts_rel_seq == 0)
			mpts->mpts_rel_seq = 1;
		MPT_LOCK_SPIN(mp_tp);
		/* With Fastjoin, sndnxt is updated before connected_ev */
		if (mpts->mpts_sndnxt == 0) {
			mpts->mpts_sndnxt = mp_tp->mpt_snduna;
		}
		MPT_UNLOCK(mp_tp);
		mptcp_output_needed(mpte, mpts);
	} else {
		MPT_UNLOCK(mp_tp);
		MPTS_LOCK(mpts);
	}

	MPTS_LOCK_ASSERT_HELD(mpts);

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* linger iff neither side has started tearing the subflow down */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_flags & MPTSF_DISCONNECTED)
		return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);

	/*
	 * Clear flags that are used by getconninfo to return state.
	 * Retain like MPTSF_DELETEOK for internal purposes.
	 */
	mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
	    MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
	    MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
	    MPTSF_SUSPENDED|MPTSF_ACTIVE);
	mpts->mpts_flags |= MPTSF_DISCONNECTED;

	/*
	 * The subflow connection has been disconnected.
	 *
	 * Right now, we simply unblock any waiters at the MPTCP socket layer
	 * if the MPTCP connection has not been established.
	 */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);

	/*
	 * NOTE(review): mpts_flags is read here after MPTS_UNLOCK above;
	 * confirm that the MP socket (mpte) lock held throughout is what
	 * makes this access safe.
	 */
	if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
		mpte->mpte_nummpcapflows--;
		if (mpte->mpte_active_sub == mpts) {
			mpte->mpte_active_sub = NULL;
			mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
			    __func__));
		}
		mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
	}

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		soisdisconnected(mp_so);
	} else {
		MPT_UNLOCK(mp_tp);
	}

	MPTS_LOCK(mpts);
	/*
	 * The underlying subflow socket has been disconnected;
	 * it is no longer useful to us. Keep the subflow socket
	 * around, unless the MPTCP socket has been detached or
	 * the subflow has been disconnected explicitly, in which
	 * case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
 */
static ev_ret_t
mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	ev_ret_t ret = MPTS_EVRET_OK_UPDATE;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK_ASSERT_HELD(mpts);
	so = mpts->mpts_socket;

	socket_lock(so, 0);
	MPT_LOCK(mp_tp);

	/* mirror the subflow tcpcb's MPTCP negotiation state into mpts_flags */
	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
		mpts->mpts_flags |= MPTSF_MP_CAPABLE;
	else
		mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;

	if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
		/* already degraded: nothing more to record */
		if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
			goto done;
		mpts->mpts_flags |= MPTSF_MP_DEGRADED;
	}
	else
		mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;

	if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
		mpts->mpts_flags |= MPTSF_MP_READY;
	else
		mpts->mpts_flags &= ~MPTSF_MP_READY;

	if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
		mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
		mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
	}

	if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
		VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
		ret = MPTS_EVRET_DISCONNECT_FALLBACK;
	} else if (mpts->mpts_flags & MPTSF_MP_READY) {
		mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
		ret = MPTS_EVRET_CONNECT_PENDING;
	}

	mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d "
	    "mptsf=%b\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
	    mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
	    mpts->mpts_flags, MPTSF_BITS));
done:
	MPT_UNLOCK(mp_tp);
	socket_unlock(so, 0);
	return (ret);
}

/*
 * Handle
SO_FILT_HINT_MUSTRST subflow socket event 2939 */ 2940static ev_ret_t 2941mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts) 2942{ 2943 struct socket *mp_so, *so; 2944 struct mptcb *mp_tp; 2945 boolean_t linger; 2946 2947 2948 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2949 MPTS_LOCK_ASSERT_HELD(mpts); 2950 VERIFY(mpte->mpte_mppcb != NULL); 2951 mp_so = mpte->mpte_mppcb->mpp_socket; 2952 mp_tp = mpte->mpte_mptcb; 2953 so = mpts->mpts_socket; 2954 2955 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) && 2956 !(mp_so->so_flags & SOF_PCBCLEARING)); 2957 2958 if (mpts->mpts_soerror == 0) 2959 mpts->mpts_soerror = ECONNABORTED; 2960 2961 /* We got an invalid option or a fast close */ 2962 socket_lock(so, 0); 2963 struct tcptemp *t_template; 2964 struct inpcb *inp = sotoinpcb(so); 2965 struct tcpcb *tp = NULL; 2966 2967 tp = intotcpcb(inp); 2968 so->so_error = ECONNABORTED; 2969 2970 t_template = tcp_maketemplate(tp); 2971 if (t_template) { 2972 struct tcp_respond_args tra; 2973 2974 bzero(&tra, sizeof(tra)); 2975 if (inp->inp_flags & INP_BOUND_IF) 2976 tra.ifscope = inp->inp_boundifp->if_index; 2977 else 2978 tra.ifscope = IFSCOPE_NONE; 2979 tra.awdl_unrestricted = 1; 2980 2981 tcp_respond(tp, t_template->tt_ipgen, 2982 &t_template->tt_t, (struct mbuf *)NULL, 2983 tp->rcv_nxt, tp->snd_una, TH_RST, &tra); 2984 (void) m_free(dtom(t_template)); 2985 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n", 2986 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 2987 so, mpts->mpts_connid)); 2988 } 2989 socket_unlock(so, 0); 2990 mptcp_subflow_disconnect(mpte, mpts, !linger); 2991 MPTS_UNLOCK(mpts); 2992 2993 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED | 2994 SO_FILT_HINT_CONNRESET); 2995 2996 MPT_LOCK(mp_tp); 2997 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) || 2998 (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) { 2999 mp_so->so_error = ECONNABORTED; 3000 } 3001 MPT_UNLOCK(mp_tp); 3002 3003 MPTS_LOCK(mpts); 3004 /* 3005 * Keep the 
subflow socket around unless the subflow has been 3006 * disconnected explicitly. 3007 */ 3008 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE); 3009} 3010 3011static ev_ret_t 3012mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts) 3013{ 3014 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 3015 MPTS_LOCK_ASSERT_HELD(mpts); 3016 VERIFY(mpte->mpte_mppcb != NULL); 3017 3018 if (mpte->mpte_nummpcapflows == 0) { 3019 struct mptcb *mp_tp = mpte->mpte_mptcb; 3020 mptcplog((LOG_DEBUG,"%s %llx %llx \n", 3021 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt)); 3022 mpte->mpte_active_sub = mpts; 3023 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE); 3024 MPT_LOCK(mp_tp); 3025 /* 3026 * If mptcp_subflow_output is called before fastjoin_ev 3027 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna 3028 * and further mpts->mpts_sndnxt is incremented by len copied. 3029 */ 3030 if (mpts->mpts_sndnxt == 0) { 3031 mpts->mpts_sndnxt = mp_tp->mpt_snduna; 3032 mpts->mpts_rel_seq = 1; 3033 } 3034 MPT_UNLOCK(mp_tp); 3035 } 3036 3037 return (MPTS_EVRET_OK); 3038} 3039 3040static ev_ret_t 3041mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts) 3042{ 3043 MPTE_LOCK_ASSERT_HELD(mpte); 3044 MPTS_LOCK_ASSERT_HELD(mpts); 3045 VERIFY(mpte->mpte_mppcb != NULL); 3046 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid)); 3047 3048 mpts->mpts_flags |= MPTSF_DELETEOK; 3049 if (mpts->mpts_flags & MPTSF_DISCONNECTED) 3050 return (MPTS_EVRET_DELETE); 3051 else 3052 return (MPTS_EVRET_OK); 3053} 3054 3055static const char * 3056mptcp_evret2str(ev_ret_t ret) 3057{ 3058 const char *c = "UNKNOWN"; 3059 3060 switch (ret) { 3061 case MPTS_EVRET_DELETE: 3062 c = "MPTS_EVRET_DELETE"; 3063 break; 3064 case MPTS_EVRET_CONNECT_PENDING: 3065 c = "MPTS_EVRET_CONNECT_PENDING"; 3066 break; 3067 case MPTS_EVRET_DISCONNECT_FALLBACK: 3068 c = "MPTS_EVRET_DISCONNECT_FALLBACK"; 3069 break; 3070 case MPTS_EVRET_OK: 3071 c = "MPTS_EVRET_OK"; 3072 break; 3073 case 
MPTS_EVRET_OK_UPDATE: 3074 c = "MPTS_EVRET_OK_UPDATE"; 3075 break; 3076 } 3077 return (c); 3078} 3079 3080/* 3081 * Add a reference to a subflow structure; used by MPTS_ADDREF(). 3082 */ 3083void 3084mptcp_subflow_addref(struct mptsub *mpts, int locked) 3085{ 3086 if (!locked) 3087 MPTS_LOCK(mpts); 3088 else 3089 MPTS_LOCK_ASSERT_HELD(mpts); 3090 3091 if (++mpts->mpts_refcnt == 0) { 3092 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts); 3093 /* NOTREACHED */ 3094 } 3095 if (!locked) 3096 MPTS_UNLOCK(mpts); 3097} 3098 3099/* 3100 * Remove a reference held on a subflow structure; used by MPTS_REMREF(); 3101 */ 3102void 3103mptcp_subflow_remref(struct mptsub *mpts) 3104{ 3105 MPTS_LOCK(mpts); 3106 if (mpts->mpts_refcnt == 0) { 3107 panic("%s: mpts %p negative refcnt\n", __func__, mpts); 3108 /* NOTREACHED */ 3109 } 3110 if (--mpts->mpts_refcnt > 0) { 3111 MPTS_UNLOCK(mpts); 3112 return; 3113 } 3114 /* callee will unlock and destroy lock */ 3115 mptcp_subflow_free(mpts); 3116} 3117 3118/* 3119 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked, 3120 * caller must ensure that the option can be issued on subflow sockets, via 3121 * MPOF_SUBFLOW_OK flag. 
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	char buf[32];
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	/* option is being applied now; it is no longer pending */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/* build a kernel-sourced SOPT_SET request for a single int value */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
		    "val %d set successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval));
	} else {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
		    "val %d set error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval, error));
	}
	return (error);
}

/*
 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
3167 */ 3168int 3169mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so, 3170 struct mptopt *mpo) 3171{ 3172 struct socket *mp_so; 3173 struct sockopt sopt; 3174 char buf[32]; 3175 int error; 3176 3177 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK); 3178 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 3179 mp_so = mpte->mpte_mppcb->mpp_socket; 3180 3181 bzero(&sopt, sizeof (sopt)); 3182 sopt.sopt_dir = SOPT_GET; 3183 sopt.sopt_level = mpo->mpo_level; 3184 sopt.sopt_name = mpo->mpo_name; 3185 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval); 3186 sopt.sopt_valsize = sizeof (int); 3187 sopt.sopt_p = kernproc; 3188 3189 error = sogetoptlock(so, &sopt, 0); /* already locked */ 3190 if (error == 0) { 3191 mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s " 3192 "val %d get successful\n", __func__, 3193 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 3194 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name, 3195 buf, sizeof (buf)), mpo->mpo_intval)); 3196 } else { 3197 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n", 3198 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 3199 mptcp_sopt2str(mpo->mpo_level, 3200 mpo->mpo_name, buf, sizeof (buf)), error)); 3201 } 3202 return (error); 3203} 3204 3205 3206/* 3207 * MPTCP garbage collector. 3208 * 3209 * This routine is called by the MP domain on-demand, periodic callout, 3210 * which is triggered when a MPTCP socket is closed. The callout will 3211 * repeat as long as this routine returns a non-zero value. 
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;	/* PCBs still alive; non-zero re-arms callout */

	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state));

		/*
		 * try-lock only: mppi_lock is already held, so blocking on
		 * mpp_lock here could deadlock against the normal lock order.
		 */
		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 1) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state));
			MPT_LOCK(mp_tp);
			/*
			 * In closing states, count down mpt_gc_ticks; when
			 * it hits zero, release the local key and force all
			 * subflows to give up their references below.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
					if (mp_tp->mpt_localkey != NULL) {
						mptcp_free_key(
						    mp_tp->mpt_localkey);
						mp_tp->mpt_localkey = NULL;
					}
				}
			}
			MPT_UNLOCK(mp_tp);
			if (wakeup) {
				/* mark every subflow deletable and fake a
				 * DISCONNECTED event so its upcall runs */
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					MPTS_LOCK(mpts);
					mpts->mpts_flags |= MPTSF_DELETEOK;
					if (mpts->mpts_soerror == 0)
						mpts->mpts_soerror = ETIMEDOUT;
					mptcp_subflow_eupcall(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
					MPTS_UNLOCK(mpts);
				}
			}
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state));
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/*
		 * The PCB has been detached, and there is exactly 1 refnct
		 * held by the MPTCP thread.  Signal that thread to terminate,
		 * after which the last refcnt will be released.  That will
		 * allow it to be destroyed below during the next round.
		 */
		if (mp_so->so_usecount == 1) {
			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
			    "termination [u=%d,r=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			/* signal MPTCP thread to terminate */
			mptcp_thread_terminate_signal(mpte);
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt));
		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		/* usecount == 0: safe to tear the PCB down for real */
		mp_pcbdispose(mpp);
	}

	return (active);
}

/*
 * Drop a MPTCP connection, reporting the specified error.
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;

	mp_tp->mpt_state = MPTCPS_TERMINATE;
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* prefer a previously recorded soft error over a plain timeout */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	return (mptcp_close(mpte, mp_tp));
}

/*
 * Close a MPTCP control block.
 *
 * Called (and returns) with the MPTE and MPT locks held; the MPT lock is
 * dropped and re-taken internally around soisdisconnected() and the
 * subflow teardown.
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	if (mp_tp->mpt_localkey != NULL) {
		mptcp_free_key(mp_tp->mpt_localkey);
		mp_tp->mpt_localkey = NULL;
	}

	MPT_UNLOCK(mp_tp);
	soisdisconnected(mp_so);

	MPT_LOCK(mp_tp);
	/* peeled-off sessions keep their subflows; return with lock held */
	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
		return (NULL);
	}
	MPT_UNLOCK(mp_tp);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
		mptcp_subflow_disconnect(mpte, mpts, TRUE);
		MPTS_UNLOCK(mpts);
		mptcp_subflow_del(mpte, mpts, TRUE);
	}
	MPT_LOCK(mp_tp);

	return (NULL);
}

/*
 * Post a DISCONNECTED hint on the (MP) socket's event filter.
 */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}

/*
 * Signal MPTCP thread to wake up.
 */
void
mptcp_thread_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * Signal MPTCP thread to wake up (locked version)
 *
 * Bumps the request counter so the workloop knows more work arrived, and
 * wakes the thread only if it is currently idle and not already torn down.
 */
static void
mptcp_thread_signal_locked(struct mptses *mpte)
{
	lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

	mpte->mpte_thread_reqs++;
	if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
		wakeup_one((caddr_t)&mpte->mpte_thread);
}

/*
 * Signal MPTCP thread to terminate.
 *
 * Clearing mpte_thread to THREAD_NULL is the termination indicator that
 * mptcp_thread_func() checks after each wakeup.
 */
static void
mptcp_thread_terminate_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	if (mpte->mpte_thread != THREAD_NULL) {
		mpte->mpte_thread = THREAD_NULL;
		mpte->mpte_thread_reqs++;
		if (!mpte->mpte_thread_active)
			wakeup_one((caddr_t)&mpte->mpte_thread);
	}
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * MPTCP thread workloop.
 */
static void
mptcp_thread_dowork(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	boolean_t conninfo_update = FALSE;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);

	/* first pass: service input/output and collect per-subflow events */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		MPTS_LOCK(mpts);
		MPTS_ADDREF_LOCKED(mpts);	/* for us */

		/* Update process ownership based on parent mptcp socket */
		mptcp_update_last_owner(mpts, mp_so);

		mptcp_subflow_input(mpte, mpts);
		ret = mptcp_subflow_events(mpte, mpts);

		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
			    mpts->mpts_connid));
			(void) mptcp_subflow_output(mpte, mpts);
		}

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
		MPTS_UNLOCK(mpts);

		switch (ret) {
		case MPTS_EVRET_OK_UPDATE:
			conninfo_update = TRUE;
			break;
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			mptcp_subflow_del(mpte, mpts, TRUE);
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		}
		MPTS_REMREF(mpts);		/* ours */
	}

	if (conninfo_update) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}

	if (!connect_pending && !disconnect_fallback) {
		MPTE_UNLOCK(mpte);
		return;
	}

	/* second pass: act on fallback/join decisions made above */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED)) {
				MPTS_UNLOCK(mpts);
				continue;
			}
			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback.  This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			socket_lock(so, 1);
			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;
			/* the ACTIVE subflow carries the fallback traffic */
			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				socket_unlock(so, 1);
				MPTS_UNLOCK(mpts);
				continue;
			}
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			socket_unlock(so, 1);

		} else if (connect_pending) {
			/*
			 * If delayed subflow start is set and cellular,
			 * delay the connect till a retransmission timeout
			 */

			if ((mptcp_delayed_subf_start) &&
			    (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				(void) mptcp_subflow_soconnectx(mpte, mpts);
			}
		}
		MPTS_UNLOCK(mpts);
	}

	MPTE_UNLOCK(mpte);
}

/*
 * MPTCP thread.
 *
 * Sleeps until signalled, then runs mptcp_thread_dowork() repeatedly
 * until no new requests arrived during the pass; a THREAD_NULL
 * mpte_thread indicates the session is closing and the thread must
 * self-destruct.
 */
static void
mptcp_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct mptses *mpte = v;
	struct timespec *ts = NULL;

	VERIFY(mpte != NULL);

	lck_mtx_lock_spin(&mpte->mpte_thread_lock);

	for (;;) {
		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

		if (mpte->mpte_thread != THREAD_NULL) {
			(void) msleep(&mpte->mpte_thread,
			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
			    __func__, ts);
		}

		/* MPTCP socket is closed? */
		if (mpte->mpte_thread == THREAD_NULL) {
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			/* callee will destroy thread lock */
			mptcp_thread_destroy(mpte);
			/* NOTREACHED */
			return;
		}

		mpte->mpte_thread_active = 1;
		for (;;) {
			/* snapshot the request count before working */
			uint32_t reqs = mpte->mpte_thread_reqs;

			lck_mtx_unlock(&mpte->mpte_thread_lock);
			mptcp_thread_dowork(mpte);
			lck_mtx_lock_spin(&mpte->mpte_thread_lock);

			/* if there's no pending request, we're done */
			if (reqs == mpte->mpte_thread_reqs ||
			    mpte->mpte_thread == THREAD_NULL)
				break;
		}
		mpte->mpte_thread_reqs = 0;
		mpte->mpte_thread_active = 0;
	}
}

/*
 * Destroy a MTCP thread, to be called in the MPTCP thread context
 * upon receiving an indication to self-terminate.  This routine
 * will not return, as the current thread is terminated at the end.
 */
static void
mptcp_thread_destroy(struct mptses *mpte)
{
	struct socket *mp_so;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_thread == THREAD_NULL);
	VERIFY(mpte->mpte_mppcb != NULL);

	mptcp_sesdestroy(mpte);

	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);
	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for thread */
	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
	MPTE_UNLOCK(mpte);

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());
	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}

/*
 * Protocol pr_lock callback.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	/* record the caller's return address for lock debugging */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_lock(&mpp->mpp_lock);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}

/*
 * Protocol pr_unlock callback.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);

	if (refcount != 0)
		mp_so->so_usecount--;

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	lck_mtx_unlock(&mpp->mpp_lock);

	return (0);
}

/*
 * Protocol pr_getlock callback.
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int locktype)
{
#pragma unused(locktype)
	struct mppcb *mpp = sotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (&mpp->mpp_lock);
}

/*
 * Key generation functions
 *
 * Fill key_entry with a non-zero random key that collides with no key
 * already in the pool, neither in value nor in the first 4 bytes of its
 * SHA1 digest (the digest prefix is exchanged on the wire).
 * NOTE(review): caller is expected to hold mkph_lock while the pool is
 * traversed — confirm at call sites.
 */
static void
mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
{
	struct mptcp_key_entry *key_elm;
try_again:
	read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
	if (key_entry->mkey_value == 0)
		goto try_again;
	mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
	    sizeof (key_entry->mkey_digest));

	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_value == key_entry->mkey_value) {
			goto try_again;
		}
		if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
		    0) {
			goto try_again;
		}
	}
}

/*
 * Reserve a key from the pool: reuse a free entry if one exists,
 * otherwise allocate and generate a fresh one.  Returns a pointer to the
 * key value embedded in its pool entry (see mptcp_free_key()).
 */
static mptcp_key_t *
mptcp_reserve_key(void)
{
	struct mptcp_key_entry *key_elm;
	struct mptcp_key_entry *found_elm = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_flags == MKEYF_FREE) {
			key_elm->mkey_flags = MKEYF_INUSE;
			found_elm = key_elm;
			break;
		}
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);

	if (found_elm) {
		return (&found_elm->mkey_value);
	}

	key_elm = (struct mptcp_key_entry *)
	    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
	key_elm->mkey_flags = MKEYF_INUSE;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	mptcp_generate_unique_key(key_elm);
	LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
	mptcp_keys_pool.mkph_count += 1;
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (&key_elm->mkey_value);
}

/*
 * Recover the SHA1 digest cached alongside a reserved key.  The pool
 * entry is found by subtracting the mkey_value field offset from the key
 * pointer, so the key MUST have come from mptcp_reserve_key().
 */
static caddr_t
mptcp_get_stored_digest(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	caddr_t digest = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	if (key_holder->mkey_flags != MKEYF_INUSE)
		panic_plain("%s", __func__);
	digest = &key_holder->mkey_digest[0];
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (digest);
}

/*
 * Return a reserved key to the pool.  Half the time the entry is freed
 * outright; otherwise it is re-inserted at a random position so that a
 * just-released key is not immediately handed out again.
 */
void
mptcp_free_key(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	struct mptcp_key_entry *key_elm;
	/*
	 * NOTE(review): RandomULong() assigned to signed int — pt may be
	 * negative, which makes the modulo below negative and causes an
	 * immediate head-adjacent insertion rather than a uniform one;
	 * confirm whether an unsigned pt was intended.
	 */
	int pt = RandomULong();

	mptcplog((LOG_INFO, "%s\n", __func__));

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	key_holder->mkey_flags = MKEYF_FREE;

	LIST_REMOVE(key_holder, mkey_next);
	mptcp_keys_pool.mkph_count -= 1;

	/* Free half the time */
	if (pt & 0x01) {
		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
	} else {
		/* Insert it at random point to avoid early reuse */
		int i = 0;
		if (mptcp_keys_pool.mkph_count > 1) {
			pt = pt % (mptcp_keys_pool.mkph_count - 1);
			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
				if (++i >= pt) {
					LIST_INSERT_AFTER(key_elm, key_holder,
					    mkey_next);
					break;
				}
			}
			if (i < pt)
				panic("missed insertion");
		} else {
			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
			    mkey_next);
		}
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
}

/*
 * One-time setup of the key pool: create the backing zone and pre-seed
 * the pool with MPTCP_KEY_PREALLOCS_MX free entries.
 */
static void
mptcp_key_pool_init(void)
{
	int i;
	struct mptcp_key_entry *key_entry;

	LIST_INIT(&mptcp_keys_pool);
	mptcp_keys_pool.mkph_count = 0;

	mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
	    (sizeof (struct mptcp_key_entry));
	mptcp_keys_pool.mkph_key_entry_zone = zinit(
	    mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
	if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
		panic("%s: unable to allocate MPTCP keys zone \n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);

	for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
		key_entry = (struct mptcp_key_entry *)
		    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
		key_entry->mkey_flags = MKEYF_FREE;
		mptcp_generate_unique_key(key_entry);
		LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
}

/*
 * MPTCP Join support
 */

/*
 * Bind a subflow TCP PCB to the MPTCP control block and create its
 * per-subflow authentication entry with a fresh non-zero local random.
 */
static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    uint8_t addr_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK_SPIN(mp_tp);
	tp->t_mptcb = mp_tp;
	/*
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		tp->t_local_aid = addr_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	MPT_UNLOCK(mp_tp);
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	/* zero is reserved; retry until a non-zero random is drawn */
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	MPT_LOCK_SPIN(mp_tp);
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
	MPT_UNLOCK(mp_tp);
}

/*
 * Undo mptcp_attach_to_subf(): remove and free the subflow's auth entry
 * (if present) and detach the MPTCP control block from the TCP PCB.
 */
static void
mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	struct tcpcb *tp = NULL;
	int found = 0;

	socket_lock(so, 0);
	tp = sototcpcb(so);
	if (tp == NULL) {
		socket_unlock(so, 0);
		return;
	}

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
			found = 1;
			break;
		}
	}
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
	MPT_UNLOCK(mp_tp);

	tp->t_mptcb = NULL;
	socket_unlock(so, 0);
}

/*
 * Look up the local/remote randoms recorded for the subflow with the
 * given address ID.  Outputs are left untouched when no entry matches.
 */
void
mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
    u_int32_t *rrand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == addr_id) {
			if (lrand)
				*lrand = sauth_entry->msae_laddr_rand;
			if (rrand)
				*rrand = sauth_entry->msae_raddr_rand;
			break;
		}
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Record the remote address ID and random for the subflow identified by
 * laddr_id; mismatching duplicates (e.g. from a retransmitted SYN/ACK)
 * are logged and ignored.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			MPT_UNLOCK(mp_tp);
			return;
		}
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * SHA1 support for MPTCP
 *
 * Computes SHA1 over the 64-bit key into sha_digest; digest_len must be
 * exactly SHA1_RESULTLEN.  Returns TRUE on success, FALSE otherwise.
 */
static int
mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
{
	SHA1_CTX sha1ctxt;
	const unsigned char *sha1_base;
	int sha1_size;

	if (digest_len != SHA1_RESULTLEN) {
		return (FALSE);
	}

	sha1_base = (const unsigned char *) key;
	sha1_size = sizeof (mptcp_key_t);
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
	SHA1Final(sha_digest, &sha1ctxt);
	return (TRUE);
}

/*
 * HMAC-SHA1 with a 128-bit key (key1||key2) and a 64-bit message
 * (rand1||rand2), per RFC 2104: the key is padded to the 512-bit block
 * and XOR'd with the inner/outer pads.
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0};	/* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0};	/* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, digest_len);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}

/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
    int digest_len)
{
	uint32_t lrand, rrand;
	mptcp_key_t localkey, remotekey;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	if (digest_len != SHA1_RESULTLEN)
		return;

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	MPT_LOCK_SPIN(mp_tp);
	/* snapshot keys under the lock; HMAC is computed unlocked */
	localkey = *mp_tp->mpt_localkey;
	remotekey = mp_tp->mpt_remotekey;
	MPT_UNLOCK(mp_tp);
	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
	    digest_len);
}

/*
 * First 64 bits of the HMAC, as carried in the MP_JOIN option.
 */
u_int64_t
mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
{
	u_char digest[SHA1_RESULTLEN];
	u_int64_t trunced_digest;

	mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
	bcopy(digest, &trunced_digest, 8);
	return (trunced_digest);
}

/*
 * Authentication data generation
 */
4146int 4147mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token, 4148 int token_len) 4149{ 4150 VERIFY(token_len == sizeof (u_int32_t)); 4151 VERIFY(sha_digest_len == SHA1_RESULTLEN); 4152 4153 /* Most significant 32 bits of the SHA1 hash */ 4154 bcopy(sha_digest, token, sizeof (u_int32_t)); 4155 return (TRUE); 4156} 4157 4158int 4159mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn, 4160 int idsn_len) 4161{ 4162 VERIFY(idsn_len == sizeof (u_int64_t)); 4163 VERIFY(sha_digest_len == SHA1_RESULTLEN); 4164 4165 /* 4166 * Least significant 64 bits of the SHA1 hash 4167 */ 4168 4169 idsn[7] = sha_digest[12]; 4170 idsn[6] = sha_digest[13]; 4171 idsn[5] = sha_digest[14]; 4172 idsn[4] = sha_digest[15]; 4173 idsn[3] = sha_digest[16]; 4174 idsn[2] = sha_digest[17]; 4175 idsn[1] = sha_digest[18]; 4176 idsn[0] = sha_digest[19]; 4177 return (TRUE); 4178} 4179 4180static int 4181mptcp_init_authparms(struct mptcb *mp_tp) 4182{ 4183 caddr_t local_digest = NULL; 4184 char remote_digest[MPTCP_SHA1_RESULTLEN]; 4185 MPT_LOCK_ASSERT_HELD(mp_tp); 4186 4187 /* Only Version 0 is supported for auth purposes */ 4188 if (mp_tp->mpt_version != MP_DRAFT_VERSION_12) 4189 return (-1); 4190 4191 /* Setup local and remote tokens and Initial DSNs */ 4192 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey); 4193 mptcp_generate_token(local_digest, SHA1_RESULTLEN, 4194 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken)); 4195 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN, 4196 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t)); 4197 4198 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest, 4199 SHA1_RESULTLEN)) { 4200 mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure", 4201 __func__)); 4202 return (-1); 4203 } 4204 mptcp_generate_token(remote_digest, SHA1_RESULTLEN, 4205 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken)); 4206 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN, 4207 
(caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t)); 4208 return (0); 4209} 4210 4211static void 4212mptcp_init_statevars(struct mptcb *mp_tp) 4213{ 4214 MPT_LOCK_ASSERT_HELD(mp_tp); 4215 4216 /* The subflow SYN is also first MPTCP byte */ 4217 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1; 4218 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna; 4219 4220 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1; 4221} 4222 4223static void 4224mptcp_conn_properties(struct mptcb *mp_tp) 4225{ 4226 /* There is only Version 0 at this time */ 4227 mp_tp->mpt_version = MP_DRAFT_VERSION_12; 4228 4229 /* Set DSS checksum flag */ 4230 if (mptcp_dss_csum) 4231 mp_tp->mpt_flags |= MPTCPF_CHECKSUM; 4232 4233 /* Set up receive window */ 4234 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp); 4235 4236 /* Set up gc ticks */ 4237 mp_tp->mpt_gc_ticks = MPT_GC_TICKS; 4238} 4239 4240/* 4241 * Helper Functions 4242 */ 4243mptcp_token_t 4244mptcp_get_localtoken(void* mptcb_arg) 4245{ 4246 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg; 4247 return (mp_tp->mpt_localtoken); 4248} 4249 4250mptcp_token_t 4251mptcp_get_remotetoken(void* mptcb_arg) 4252{ 4253 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg; 4254 return (mp_tp->mpt_remotetoken); 4255} 4256 4257u_int64_t 4258mptcp_get_localkey(void* mptcb_arg) 4259{ 4260 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg; 4261 if (mp_tp->mpt_localkey != NULL) 4262 return (*mp_tp->mpt_localkey); 4263 else 4264 return (0); 4265} 4266 4267u_int64_t 4268mptcp_get_remotekey(void* mptcb_arg) 4269{ 4270 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg; 4271 return (mp_tp->mpt_remotekey); 4272} 4273 4274void 4275mptcp_send_dfin(struct socket *so) 4276{ 4277 struct tcpcb *tp = NULL; 4278 struct inpcb *inp = NULL; 4279 4280 inp = sotoinpcb(so); 4281 if (!inp) 4282 return; 4283 4284 tp = intotcpcb(inp); 4285 if (!tp) 4286 return; 4287 4288 if (!(tp->t_mpflags & TMPF_RESET)) 4289 tp->t_mpflags |= TMPF_SEND_DFIN; 4290} 4291 4292/* 4293 * Data 
 * Sequence Mapping routines
 */

/*
 * Stamp each mbuf in the chain with a DSN mapping at the current MPTCP
 * send-max, advancing mpt_sndmax by each packet's length.  Data must not
 * be written before the MPTCP connection is established.
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	/* mppcb/mptses/mptcb live in one mpp_mtp allocation (see header) */
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		panic("%s: data write before establishment.",
		    __func__);
		return;
	}

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Adjust the DSN mappings in an mbuf chain before `len' bytes are dropped
 * from the socket buffer: fully-consumed mappings are advanced and zeroed,
 * the partially-consumed mapping is shrunk in place.
 */
void
mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
{
	u_int32_t sub_len = 0;

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);

		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
			sub_len = m->m_pkthdr.mp_rlen;

			if (sub_len < len) {
				/* this mapping is consumed entirely */
				m->m_pkthdr.mp_dsn += sub_len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += sub_len;
				}
				m->m_pkthdr.mp_rlen = 0;
				len -= sub_len;
			} else {
				/* sub_len >= len */
				m->m_pkthdr.mp_dsn += len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += len;
				}
				mptcplog3((LOG_INFO,
				    "%s: %llu %u %d %d\n", __func__,
				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
				    m->m_pkthdr.mp_rlen, len));
				m->m_pkthdr.mp_rlen -= len;
				return;
			}
		} else {
			panic("%s: MPTCP tag not set", __func__);
			/* NOTREACHED */
		}
		m = m->m_next;
	}
}

/* Obtain the DSN mapping stored in the mbuf */
void
mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
    u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
{
	u_int64_t dsn64;

	mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
	*dsn64p = dsn64;
}

/*
 * Walk the send socket buffer to the mbuf covering byte `off', return its
 * DSN and subflow sequence at that offset, and clamp *data_len to the
 * longest run of DSN-contiguous bytes starting there (and to UINT16_MAX,
 * the DSS mapping limit).
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
{
	struct mbuf *m = so->so_snd.sb_mb;
	struct mbuf *mnext = NULL;
	uint32_t runlen = 0;
	u_int64_t dsn64;
	uint32_t contig_len = 0;

	if (m == NULL)
		return;

	if (off < 0)
		return;
	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);

		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
			off -= m->m_pkthdr.mp_rlen;
			m = m->m_next;
		} else {
			break;
		}
	}

	if (m == NULL) {
		panic("%s: bad offset", __func__);
		/* NOTREACHED */
	}

	dsn64 = m->m_pkthdr.mp_dsn + off;
	*dsn = dsn64;
	*relseq = m->m_pkthdr.mp_rseq + off;

	/*
	 * Now find the last contiguous byte and its length from
	 * start.
	 */
	runlen = m->m_pkthdr.mp_rlen - off;
	contig_len = runlen;

	/* If datalen does not span multiple mbufs, return */
	if (datalen <= runlen) {
		*data_len = min(datalen, UINT16_MAX);
		return;
	}

	mnext = m->m_next;
	while (datalen > runlen) {
		if (mnext == NULL) {
			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
			    runlen, off);
			/* NOTREACHED */
		}
		VERIFY(mnext->m_flags & M_PKTHDR);
		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/*
		 * case A. contiguous DSN stream
		 * case B. discontiguous DSN stream
		 */
		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
			/* case A */
			runlen += mnext->m_pkthdr.mp_rlen;
			contig_len += mnext->m_pkthdr.mp_rlen;
			mptcplog3((LOG_INFO, "%s: contig \n",
			    __func__));
		} else {
			/* case B */
			mptcplog((LOG_INFO,
			    "%s: discontig datalen %d contig_len %d cc %d \n",
			    __func__, datalen, contig_len, so->so_snd.sb_cc));
			break;
		}
		mnext = mnext->m_next;
	}
	datalen = min(datalen, UINT16_MAX);
	*data_len = min(datalen, contig_len);
	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
	    *dsn, *relseq, *data_len, off));
}

/*
 * MPTCP's notion of the next insequence Data Sequence number is adjusted
 * here. It must be called from mptcp_adj_rmap() which is called only after
 * reassembly of out of order data. The rcvnxt variable must
 * be updated only when atleast some insequence new data is received.
 */
static void
mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
{
	struct mptcb *mp_tp = tptomptp(tp);

	if (mp_tp == NULL)
		return;
	MPT_LOCK(mp_tp);
	/* advance rcvnxt only if this mapping overlaps it */
	if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
	    (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
	    m->m_pkthdr.mp_rlen)))) {
		mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Note that this is called only from tcp_input() which may trim data
 * after the dsn mapping is inserted into the mbuf. When it trims data
 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
 * cannot be delayed after trim, because data can be in the reassembly
 * queue for a while and the DSN option info in tp will be overwritten for
 * every new packet received.
 * The dsn map will be adjusted just prior to appending to subflow sockbuf
 * with mptcp_adj_rmap()
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
{
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	/* copy the DSS option info saved in the tcpcb into the mbuf */
	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	}
}

/*
 * Reconcile the mbuf's DSN mapping with any trimming tcp_input() did.
 * Returns 0 on success; returns -1 (and frees the mbuf) when the trim
 * cannot be reconciled and an MP_FAIL fallback is signalled.
 */
int
mptcp_adj_rmap(struct socket *so, struct mbuf *m)
{
	u_int64_t dsn;
	u_int32_t sseq, datalen;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	u_int32_t old_rcvnxt = 0;

	if (m_pktlen(m) == 0)
		return 0;

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		VERIFY(m->m_flags & M_PKTHDR);

		/* subflow-relative seq is stored relative to irs */
		dsn = m->m_pkthdr.mp_dsn;
		sseq = m->m_pkthdr.mp_rseq + tp->irs;
		datalen = m->m_pkthdr.mp_rlen;
	} else {
		/* data arrived without an DSS option mapping */

		/* initial subflow can fallback right after SYN handshake */
		mptcp_notify_mpfail(so);
		return 0;
	}

	/* In the common case, data is in window and in sequence */
	if (m->m_pkthdr.len == (int)datalen) {
		mptcp_adj_rcvnxt(tp, m);
		return 0;
	}

	if (m->m_pkthdr.len > (int)datalen) {
		panic("%s: mbuf len = %d expected = %d", __func__,
		    m->m_pkthdr.len, datalen);
	}

	old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
	if (SEQ_GT(old_rcvnxt, sseq)) {
		/* data trimmed from the left */
		int off = old_rcvnxt - sseq;
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else if (old_rcvnxt == sseq) {
		/*
		 * Data was trimmed from the right
		 */
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		/* handle gracefully with reass or fallback */
		mptcp_notify_mpfail(so);
		m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP;
		m_freem(m);
		return -1;
	}
	mptcp_adj_rcvnxt(tp, m);
	return 0;
}

/*
 * Following routines help with failure detection and failover of data
 * transfer from one subflow to another.
 */
void
mptcp_act_on_txfail(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = sotoinpcb(so);

	if (inp == NULL)
		return;

	tp = intotcpcb(inp);
	if (tp == NULL)
		return;

	if (tp->t_state != TCPS_ESTABLISHED)
		mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
		    tp->t_state));

	mptcplog((LOG_INFO, "%s: Failover = %d \n", __func__,
	    (so->so_flags & SOF_MP_TRYFAILOVER) ? 1 : 0));

	/* failover already in progress; don't re-signal */
	if (so->so_flags & SOF_MP_TRYFAILOVER) {
		return;
	}

	so->so_flags |= SOF_MP_TRYFAILOVER;
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
}

/*
 * Support for MP_FAIL option
 *
 * Find the subflow TCP sequence number corresponding to dsn_fail in the
 * send socket buffer.  Returns 0 and sets *tcp_seq on success, -1 if no
 * mapping covers dsn_fail.
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			mptcplog((LOG_INFO, "%s: %llu %llu \n",
			    __func__, dsn, dsn_fail));
			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
	return (-1);
}

/*
 * Support for sending contiguous MPTCP bytes in subflow
 * Also for preventing sending data with ACK in 3-way handshake
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
{
	u_int64_t mdss_dsn = 0;
	u_int32_t mdss_subflow_seq = 0;
	u_int16_t mdss_data_len = 0;

	if (len == 0)
		return (len);

	mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
	    &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);

	/*
	 * Special case handling for Fast Join. We want to send data right
	 * after ACK of the 3-way handshake, but not piggyback the data
	 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
	 * mdss_data_len control this.
	 */
	struct tcpcb *tp = NULL;
	tp = intotcpcb(sotoinpcb(so));
	if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
	    (tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN) &&
	    (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
	    (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
		mdss_data_len = 0;
		tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
	}
	return (mdss_data_len);
}

/*
 * Available space in the MPTCP socket's receive buffer, clamped at zero.
 * Requires both the MPTCP and MPTCP-session locks.
 */
int32_t
mptcp_sbspace(struct mptcb *mpt)
{
	struct sockbuf *sb;
	uint32_t rcvbuf;
	int32_t space;

	MPT_LOCK_ASSERT_HELD(mpt);
	MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);

	sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	rcvbuf = sb->sb_hiwat;
	space = ((int32_t)imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0)
		space = 0;
	/* XXX check if it's too small? */

	return (space);
}

/*
 * Support Fallback to Regular TCP
 */
void
mptcp_notify_mpready(struct socket *so)
{
	struct tcpcb *tp = NULL;

	if (so == NULL)
		return;

	tp = intotcpcb(sotoinpcb(so));

	if (tp == NULL)
		return;

	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
	    struct tcpcb *, tp);

	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
		return;

	if (tp->t_mpflags & TMPF_MPTCP_READY)
		return;

	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
	tp->t_mpflags |= TMPF_MPTCP_READY;

	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
}

void
mptcp_notify_mpfail(struct socket *so)
{
	struct tcpcb *tp = NULL;

	if (so == NULL)
		return;

	tp = intotcpcb(sotoinpcb(so));

	if (tp == NULL)
		return;

	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
	    struct tcpcb *, tp);

	/* already fallen back; nothing to do */
	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return;

	tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
	tp->t_mpflags |= TMPF_TCP_FALLBACK;

	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
}

/*
 * Keepalive helper function
 *
 * Keepalives are allowed only while the connection is not closing.
 */
boolean_t
mptcp_ok_to_keepalive(struct mptcb *mp_tp)
{
	boolean_t ret = 1;
	VERIFY(mp_tp != NULL);
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		ret = 0;
	}
	MPT_UNLOCK(mp_tp);
	return (ret);
}

/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes to subtract from t_maxseg to leave room
 * for the most common MPTCP option.  Note both branches of the macro add
 * 2: with checksums for the csum field, without for 32-bit alignment
 * padding plus EOL — the resulting option size is the same.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

#define	MPTCP_COMPUTE_LEN {				\
	mss_lower = sizeof (struct mptcp_dss_ack_opt);	\
	MPT_LOCK(mp_tp);				\
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)		\
		mss_lower += 2;				\
	else						\
		/* adjust to 32-bit boundary + EOL */	\
		mss_lower += 2;				\
	MPT_UNLOCK(mp_tp);				\
}
	if (mp_tp == NULL)
		return (0);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
		MPTCP_COMPUTE_LEN;
	}

	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
		MPTCP_COMPUTE_LEN;
	}

	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
		MPTCP_COMPUTE_LEN;
	}

	return (mss_lower);
}

/*
 * Update the pid, upid, uuid of the subflow so, based on parent so
 */
void
mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
{
	struct socket *subflow_so = mpts->mpts_socket;

	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(subflow_so, 0);
	if ((subflow_so->last_pid != parent_mpso->last_pid) ||
	    (subflow_so->last_upid != parent_mpso->last_upid)) {
		subflow_so->last_upid = parent_mpso->last_upid;
		subflow_so->last_pid = parent_mpso->last_pid;
		uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
	}
	so_update_policy(subflow_so);
	socket_unlock(subflow_so, 0);
}

/*
 * Fill one mptcp_flow_t record (connection info plus local/remote
 * addresses) for the given subflow, for export via sysctl.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	{
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
}

/*
 * sysctl handler: export every MPTCP connection (one conninfo_mptcp_t
 * each, followed by its mptcp_flow_t records).  Read-only.
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t n, len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows = NULL;

	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	n = mtcbinfo.mppi_count;
	if (req->oldptr == USER_ADDR_NULL) {
		/* size probe: estimate with ~12% headroom for growth */
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		flows = NULL;
		bzero(&mptcpci, sizeof(mptcpci));
		lck_mtx_lock(&mpp->mpp_lock);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);
		/* N.B. we don't take the mpt_lock just for the state. */
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		len = sizeof(*flows) * mpte->mpte_numflows;
		if (mpte->mpte_numflows != 0) {
			flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
			if (flows == NULL) {
				lck_mtx_unlock(&mpp->mpp_lock);
				break;
			}
			/* conninfo_mptcp_t embeds one flow; count the rest */
			mptcpci.mptcpci_len = sizeof(mptcpci) +
			    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci) - sizeof(mptcp_flow_t));
		} else {
			mptcpci.mptcpci_len = sizeof(mptcpci);
			error = SYSCTL_OUT(req, &mptcpci,
			    sizeof(mptcpci));
		}
		if (error) {
			lck_mtx_unlock(&mpp->mpp_lock);
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts);
			so = mpts->mpts_socket;
			socket_lock(so, 0);
			fill_mptcp_subflow(so, &flows[f], mpts);
			socket_unlock(so, 0);
			MPTS_UNLOCK(mpts);
			f++;
		}
		lck_mtx_unlock(&mpp->mpp_lock);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			FREE(flows, M_TEMP);
			if (error)
				break;
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}

SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");

/*
 * Check the health of the other subflows and do an mptcp_output if
 * there is no other active or functional subflow at the time of
 * call of this function.
4964 */ 4965static void 4966mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts) 4967{ 4968 struct mptsub *from_mpts = NULL; 4969 4970 MPTE_LOCK_ASSERT_HELD(mpte); 4971 4972 MPTS_UNLOCK(to_mpts); 4973 4974 from_mpts = mpte->mpte_active_sub; 4975 4976 if (from_mpts == NULL) 4977 goto output_needed; 4978 4979 MPTS_LOCK(from_mpts); 4980 4981 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) || 4982 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) { 4983 MPTS_UNLOCK(from_mpts); 4984 goto output_needed; 4985 } 4986 4987 MPTS_UNLOCK(from_mpts); 4988 MPTS_LOCK(to_mpts); 4989 return; 4990 4991output_needed: 4992 mptcp_output(mpte); 4993 MPTS_LOCK(to_mpts); 4994} 4995 4996 4997/* 4998 * When WiFi signal starts fading, there's more loss and RTT spikes. 4999 * Check if there has been a large spike by comparing against 5000 * a tolerable RTT spike threshold. 5001 */ 5002boolean_t 5003mptcp_no_rto_spike(struct socket *so) 5004{ 5005 struct tcpcb *tp = intotcpcb(sotoinpcb(so)); 5006 int32_t spike = 0; 5007 5008 if (tp->t_rxtcur > mptcp_rto_spike_thresh) { 5009 spike = tp->t_rxtcur - mptcp_rto_spike_thresh; 5010 5011 mptcplog2((LOG_INFO, "%s: spike = %d rto = %d", 5012 "best = %d cur = %d\n", __func__, spike, 5013 tp->t_rxtcur, tp->t_rttbest >> TCP_RTT_SHIFT, 5014 tp->t_rttcur)); 5015 5016 } 5017 5018 if (spike > 0 ) { 5019 return (FALSE); 5020 } else { 5021 return (TRUE); 5022 } 5023} 5024 5025/* 5026 * Set notsent lowat mark on the MPTCB 5027 */ 5028int 5029mptcp_set_notsent_lowat(struct mptses *mpte, int optval) 5030{ 5031 struct mptcb *mp_tp = NULL; 5032 int error = 0; 5033 5034 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) 5035 mp_tp = mpte->mpte_mptcb; 5036 5037 if (mp_tp) 5038 mp_tp->mpt_notsent_lowat = optval; 5039 else 5040 error = EINVAL; 5041 5042 return error; 5043} 5044 5045u_int32_t 5046mptcp_get_notsent_lowat(struct mptses *mpte) 5047{ 5048 struct mptcb *mp_tp = NULL; 5049 5050 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) 5051 mp_tp = 
mpte->mpte_mptcb; 5052 5053 if (mp_tp) 5054 return mp_tp->mpt_notsent_lowat; 5055 else 5056 return 0; 5057} 5058 5059int 5060mptcp_notsent_lowat_check(struct socket *so) { 5061 struct mptses *mpte; 5062 struct mppcb *mpp; 5063 struct mptcb *mp_tp; 5064 struct mptsub *mpts; 5065 5066 int notsent = 0; 5067 5068 mpp = sotomppcb(so); 5069 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 5070 return (0); 5071 } 5072 5073 mpte = mptompte(mpp); 5074 mp_tp = mpte->mpte_mptcb; 5075 5076 MPT_LOCK(mp_tp); 5077 notsent = so->so_snd.sb_cc; 5078 5079 if ((notsent == 0) || 5080 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <= 5081 mp_tp->mpt_notsent_lowat)) { 5082 mptcplog3((LOG_INFO, "%s: lowat %d notsent %d actual %d \n", 5083 __func__, mp_tp->mpt_notsent_lowat, notsent, 5084 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna))); 5085 MPT_UNLOCK(mp_tp); 5086 return (1); 5087 } 5088 MPT_UNLOCK(mp_tp); 5089 5090 /* When Nagle's algorithm is not disabled, it is better 5091 * to wakeup the client even before there is atleast one 5092 * maxseg of data to write. 5093 */ 5094 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 5095 int retval = 0; 5096 MPTS_LOCK(mpts); 5097 if (mpts->mpts_flags & MPTSF_ACTIVE) { 5098 struct socket *subf_so = mpts->mpts_socket; 5099 socket_lock(subf_so, 0); 5100 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so)); 5101 5102 notsent = so->so_snd.sb_cc - 5103 (tp->snd_nxt - tp->snd_una); 5104 5105 if ((tp->t_flags & TF_NODELAY) == 0 && 5106 notsent > 0 && (notsent <= (int)tp->t_maxseg)) { 5107 retval = 1; 5108 } 5109 mptcplog3((LOG_INFO, "%s: lowat %d notsent %d" 5110 " nodelay false \n", 5111 __func__, mp_tp->mpt_notsent_lowat, notsent)); 5112 socket_unlock(subf_so, 0); 5113 MPTS_UNLOCK(mpts); 5114 return (retval); 5115 } 5116 MPTS_UNLOCK(mpts); 5117 } 5118 return (0); 5119} 5120 5121