1/* 2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/param.h> 30#include <sys/proc.h> 31#include <sys/systm.h> 32#include <sys/kernel.h> 33#include <sys/mbuf.h> 34#include <sys/mcache.h> 35#include <sys/resourcevar.h> 36#include <sys/socket.h> 37#include <sys/socketvar.h> 38#include <sys/syslog.h> 39#include <sys/domain.h> 40#include <sys/protosw.h> 41#include <sys/sysctl.h> 42 43#include <kern/zalloc.h> 44#include <kern/locks.h> 45 46#include <mach/thread_act.h> 47#include <mach/sdt.h> 48 49#include <net/if.h> 50#include <netinet/in.h> 51#include <netinet/in_pcb.h> 52#include <netinet/in_var.h> 53#include <netinet/tcp.h> 54#include <netinet/tcp_fsm.h> 55#include <netinet/tcp_seq.h> 56#include <netinet/tcp_var.h> 57#include <netinet/mptcp_var.h> 58#include <netinet/mptcp.h> 59#include <netinet/mptcp_seq.h> 60#include <netinet/mptcp_timer.h> 61#include <libkern/crypto/sha1.h> 62#if INET6 63#include <netinet6/in6_pcb.h> 64#include <netinet6/ip6protosw.h> 65#endif /* INET6 */ 66#include <dev/random/randomdev.h> 67 68/* 69 * Notes on MPTCP implementation. 70 * 71 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH 72 * communication domain. The structure mtcbinfo describes the MPTCP instance 73 * of a Multipath protocol in that domain. It is used to keep track of all 74 * MPTCP PCB instances in the system, and is protected by the global lock 75 * mppi_lock. 76 * 77 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM, 78 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with 79 * it comes an MPTCP Session and an MPTCP PCB. All three structures are 80 * allocated from the same memory block, and each structure has a pointer 81 * to the adjacent ones. The layout is defined by the mpp_mtp structure. 82 * The socket lock (mpp_lock) is used to protect accesses to the Multipath 83 * PCB (mppcb) as well as the MPTCP Session (mptses). 
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
 * in particular, the list of subflows as well as the MPTCP thread.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction. The
 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
 *
 * To handle events (read, write, control) from the subflows, an MPTCP thread
 * is created; currently, there is one thread per MPTCP Session. In order to
 * prevent the MPTCP socket from being destroyed while being accessed by the
 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
 * which will be decremented prior to the thread's termination. The thread
 * lock (mpte_thread_lock) is used to synchronize its signalling.
 *
 * Lock ordering is defined as follows:
 *
 *	mtcbinfo (mppi_lock)
 *		mp_so (mpp_lock)
 *			mpts (mpts_lock)
 *				so (inpcb_mtx)
 *					mptcb (mpt_lock)
 *
 * It is not a requirement that all of the above locks need to be acquired
 * in succession, but the correct lock ordering must be followed when there
 * are more than one locks that need to be held. The MPTCP thread lock is
 * not constrained by this arrangement, because none of the other locks
 * is ever acquired while holding mpte_thread_lock; therefore it may be called
 * at any moment to signal the thread.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector.
This process will take place once all
 * of the subflows have been destroyed, and the MPTCP thread be instructed to
 * self-terminate.
 */

/* Session, thread and key-pool lifecycle helpers (defined later in file). */
static void mptcp_sesdestroy(struct mptses *);
static void mptcp_thread_signal_locked(struct mptses *);
static void mptcp_thread_terminate_signal(struct mptses *);
static void mptcp_thread_dowork(struct mptses *);
static void mptcp_thread_func(void *, wait_result_t);
static void mptcp_thread_destroy(struct mptses *);
static void mptcp_key_pool_init(void);
static void mptcp_attach_to_subf(struct socket *, struct mptcb *, connid_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
static void mptcp_conn_properties(struct mptcb *);
static void mptcp_init_statevars(struct mptcb *);

/* Garbage collection and per-subflow socket operations. */
static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
    int, struct proc *, struct socket **);
static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
    struct uio *, struct mbuf **, struct mbuf **, int *);
static void mptcp_subflow_rupcall(struct socket *, void *, int);
static void mptcp_subflow_input(struct mptses *, struct mptsub *);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
static void mptcp_update_last_owner(struct mptsub *, struct socket *);

/*
 * Possible return values for subflow event handlers.  Note that success
 * values must be greater or equal than MPTS_EVRET_OK.  Values less than that
 * indicate errors or actions which require immediate attention; they will
 * prevent the rest of the handlers from processing their respective events
 * until the next round of events processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,			/* delete this subflow */
	MPTS_EVRET_OK = 2,			/* OK */
	MPTS_EVRET_CONNECT_PENDING = 3,		/* resume pended connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4,	/* abort all but preferred */
	MPTS_EVRET_OK_UPDATE = 5,		/* OK with conninfo update */
} ev_ret_t;

/* Per-event subflow handlers; dispatched by mptcp_subflow_events(). */
static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *);
static const char *mptcp_evret2str(ev_ret_t);

/* MPTCP authentication (key reservation and HMAC computation). */
static mptcp_key_t *mptcp_reserve_key(void);
static int mptcp_do_sha1(mptcp_key_t *, char *, int);
static int mptcp_init_authparms(struct mptcb *);
static int mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts);

static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static
unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;	/* zone of subf auth entry */

/* Global MPTCP PCB info; registered with the PF_MULTIPATH domain below. */
struct mppcbinfo mtcbinfo;

static struct mptcp_keys_pool_head mptcp_keys_pool;

#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_verbose = 0;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, verbose, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_verbose, 0, "MPTCP verbosity level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
	&mtcbinfo.mppi_count, 0, "Number of active PCBs");

/*
 * Since there is one kernel thread per mptcp socket, imposing an artificial
 * limit on number of allowed mptcp sockets.
 */
uint32_t mptcp_socket_limit = MPPCB_LIMIT;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
	&mptcp_socket_limit, 0, "MPTCP socket limit");

/*
 * Cloned TCP protosw/usrreqs used by subflow sockets, so that the receive
 * path can be intercepted (pru_soreceive) without affecting plain TCP.
 */
static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

/*
 * Protocol pr_init callback.
 *
 * One-time global initialization: clone the TCP protosw entries for
 * subflow interception, create the PCB/subflow/option/auth zones, set
 * up mtcbinfo (including its GC and timer hooks) and the key pool.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	/* intercept receive; out-of-band data is not supported on subflows */
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	/* one mpp_mtp block holds the mppcb, mptses and mptcb together */
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
	mtcbinfo.mppi_gc = mptcp_gc;

	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone \n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	/* Set up a list of unique keys */
	mptcp_key_pool_init();

}

/*
 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 *
 * The mptses and mptcb live inside the same mpp_mtp block as the mppcb;
 * this routine wires the three structures together, initializes the
 * session lists/locks and spawns the per-session worker thread.  Returns
 * the session on success, NULL if the worker thread could not be started.
 */
struct mptses *
mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	int error = 0;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	/* all three structures share one mpp_mtp allocation */
	mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = ASSOCID_ANY;
	mpte->mpte_connid_last = CONNID_ANY;

	lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);

	/*
	 * XXX: adi@apple.com
	 *
	 * This can be rather expensive if we have lots of MPTCP sockets,
	 * but we need a kernel thread for this model to work.  Perhaps we
	 * could amortize the costs by having one worker thread per a group
	 * of MPTCP sockets.
	 */
	if (kernel_thread_start(mptcp_thread_func, mpte,
	    &mpte->mpte_thread) != KERN_SUCCESS) {
		error = ENOBUFS;
		goto out;
	}
	/* hold a use count on behalf of the worker thread */
	mp_so->so_usecount++;		/* for thread */

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
	    mppi->mppi_lock_attr);
	mp_tp->mpt_mpte = mpte;

out:
	/* on failure, undo the thread lock init done above */
	if (error != 0)
		lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
	DTRACE_MPTCP5(session__create, struct socket *, mp_so,
	    struct sockbuf *, &mp_so->so_rcv,
	    struct sockbuf *, &mp_so->so_snd,
	    struct mppcb *, mpp, int, error);

	return ((error != 0) ? NULL : mpte);
}

/*
 * Destroy an MPTCP session.
 *
 * Expects all subflows to be gone already; flushes any remaining
 * recorded socket options and tears down the session/PCB locks.
 */
static void
mptcp_sesdestroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	lck_mtx_destroy(&mpte->mpte_thread_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	/*
	 * MPTCP Protocol Control Block section
	 */
	lck_mtx_destroy(&mp_tp->mpt_lock,
	    mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);

	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}

/*
 * Allocate an MPTCP socket option structure.
 *
 * Returns a zeroed mptopt, or NULL if M_NOWAIT and the zone is exhausted.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return (mpo);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	/* must have been removed from the session's list first */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 *
 * Returns the matching record or NULL.  Only int-sized option values
 * are recorded, hence the VERIFY below.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name)
			break;
	}
	VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));

	return (mpo);
}

/*
 * Flushes all recorded socket options from an MP socket.
 */
void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Allocate an MPTCP subflow structure.
 *
 * Returns a zeroed mptsub with its lock initialized, or NULL if
 * M_NOWAIT and the zone is exhausted.
 */
struct mptsub *
mptcp_subflow_alloc(int how)
{
	struct mptsub *mpts;

	mpts = (how == M_WAITOK) ?
zalloc(mptsub_zone) :
	    zalloc_noblock(mptsub_zone);
	if (mpts != NULL) {
		bzero(mpts, mptsub_zone_size);
		lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
		    mtcbinfo.mppi_lock_attr);
	}

	return (mpts);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released.  This implies that the subflow has been deleted.
 */
void
mptcp_subflow_free(struct mptsub *mpts)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	/* release any source/destination address lists still attached */
	if (mpts->mpts_src_sl != NULL) {
		sockaddrlist_free(mpts->mpts_src_sl);
		mpts->mpts_src_sl = NULL;
	}
	if (mpts->mpts_dst_sl != NULL) {
		sockaddrlist_free(mpts->mpts_dst_sl);
		mpts->mpts_dst_sl = NULL;
	}
	MPTS_UNLOCK(mpts);
	lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);

	zfree(mptsub_zone, mpts);
}

/*
 * Create an MPTCP subflow socket.
 *
 * Creates the in-kernel TCP socket for a subflow, applies the options
 * MPTCP requires (no SIGPIPE, source-address-error notification,
 * keepalive, capped receive buffer), replays eligible user-set options,
 * and swaps in the cloned protosw so receives are intercepted.
 *
 * On error the subflow socket is left for the caller to dispose of;
 * NOTE(review): presumably the caller closes it via
 * mptcp_subflow_soclose() — confirm against the (unseen) call sites.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct proc *p, struct socket **so)
{
	struct mptopt smpo, *mpo, *tmpo;
	struct socket *mp_so;
	int error;

	*so = NULL;
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/*
	 * Create the subflow socket (multipath subflow, non-blocking.)
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */
	if ((error = socreate_internal(dom, so, SOCK_STREAM,
	    IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: mp_so 0x%llx unable to "
		    "create subflow socket error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error));
		return (error);
	}

	socket_lock(*so, 0);
	VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
	VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
	    (SS_NBIO|SS_NOFDREF));

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* template option record reused for each setsockopt below */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/*
	 * Limit the receive socket buffer size to 64k.
	 *
	 * We need to take into consideration the window scale option
	 * which could be negotiated in one subflow but disabled in
	 * another subflow.
	 * XXX This can be improved in the future.
	 */
	smpo.mpo_name = SO_RCVBUF;
	smpo.mpo_intval = MPTCP_RWIN_MAX;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* N.B.: set by sosetopt */
	VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
	/* Prevent automatic socket buffer sizing. */
	(*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;

	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
		goto out;

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		/* interim records are dropped (not fatal) if replay fails */
		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
			char buf[32];
			mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d "
			    "interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
			    buf, sizeof (buf)), mpo->mpo_intval));
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function.  We will undo
	 * this when the socket is peeled off or closed.
	 */
	mpts->mpts_oprotosw = (*so)->so_proto;	/* saved for restore */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

out:
	socket_unlock(*so, 0);

	DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
	    struct mptsub *, mpts, int, dom, int, error);

	return (error);
}

/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 */
static int
mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
{
	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(so, 0);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	/* restore protocol-user requests */
	VERIFY(mpts->mpts_oprotosw != NULL);
	so->so_proto = mpts->mpts_oprotosw;
	socket_unlock(so, 0);

	mpts->mpts_socket = NULL;	/* may already be NULL */

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	return (soclose(so));
}

/*
 * Connect an MPTCP subflow socket.
 *
 * This may be called inline as part of adding a subflow, or asynchronously
 * by the thread (upon progressing to MPTCPF_JOIN_READY).  Note that in the
 * pending connect case, the subflow socket may have been bound to an interface
 * and/or a source IP address which may no longer be around by the time this
 * routine is called; in that case the connect attempt will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;
	int af, error;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* must be mid-connect: CONNECTING set, CONNECTED not yet */
	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
	    MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	so = mpts->mpts_socket;
	af = mpts->mpts_family;

	if (af == AF_INET || af == AF_INET6) {
		struct sockaddr_entry *dst_se;
		char dbuf[MAX_IPv6_STR_LEN];

		dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
		VERIFY(dst_se != NULL);

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pended %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	socket_lock(so, 0);
	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpts->mpts_connid);
	/* connect the subflow socket */
	error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
	    mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
	    mpte->mpte_associd, NULL, TCP_CONNREQF_MPTCP,
	    &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr));
	socket_unlock(so, 0);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);

	return (error);
}

/*
 * MPTCP subflow socket receive routine, derived from soreceive().
 *
 * Non-blocking by design: drains the entire receive buffer into *mp0
 * as an mbuf chain for the MPTCP layer; never copies to userland.
 * Returns EWOULDBLOCK when the buffer is empty, EINVAL unless called
 * with mp0 set and controlp unset, EOPNOTSUPP for unsupported flags.
 */
static int
mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
#pragma unused(uio)
	int flags, error = 0;
	struct proc *p = current_proc();
	struct mbuf *m, **mp = mp0;
	struct mbuf *nextrecord;

	socket_lock(so, 1);
	VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	/*
	 * We return all that is there in the subflow's socket receive buffer
	 * to the MPTCP layer, so we require that the caller passes in the
	 * expected parameters.
	 */
	if (mp == NULL || controlp != NULL) {
		socket_unlock(so, 1);
		return (EINVAL);
	}
	*mp = NULL;
	if (psa != NULL)
		*psa = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
		socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}
	flags |= (MSG_DONTWAIT|MSG_NBIO);

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return (0);
	}

	/*
	 * For consistency with soreceive() semantics, we need to obey
	 * SB_LOCK in case some other code path has locked the buffer.
	 */
	error = sblock(&so->so_rcv, 0);
	if (error != 0) {
		socket_unlock(so, 1);
		return (error);
	}

	m = so->so_rcv.sb_mb;
	if (m == NULL) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error != 0) {
			error = so->so_error;
			so->so_error = 0;
			goto release;
		}

		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}

		if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
			error = ENOTCONN;
			goto release;
		}

		/*
		 * MSG_DONTWAIT is implicitly defined and this routine will
		 * never block, so return EWOULDBLOCK when there is nothing.
		 */
		error = EWOULDBLOCK;
		goto release;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");

	/*
	 * Unlink every mbuf from the receive buffer and append it to the
	 * caller's chain, keeping the buffer's record bookkeeping
	 * (sb_lastrecord / SB_EMPTY_FIXUP) consistent as we drain.
	 */
	while (m != NULL) {
		nextrecord = m->m_nextpkt;
		sbfree(&so->so_rcv, m);

		if (mp != NULL) {
			*mp = m;
			mp = &m->m_next;
			so->so_rcv.sb_mb = m = m->m_next;
			*mp = NULL;
		}

		if (m != NULL) {
			m->m_nextpkt = nextrecord;
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		} else {
			/* record fully consumed; advance to the next one */
			m = so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
		SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
	}

	DTRACE_MPTCP3(subflow__receive, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
	/* notify protocol that we drained all the data */
	if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
		(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);

	if (flagsp != NULL)
		*flagsp |= flags;

release:
	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	return (error);

}


/*
 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
 * the work done earlier when the subflow socket was created.
1002 */ 1003void 1004mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts, 1005 struct socket *so) 1006{ 1007 struct mptopt smpo; 1008 struct socket *mp_so; 1009 int p, c; 1010 1011 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1012 mp_so = mpte->mpte_mppcb->mpp_socket; 1013 MPTS_LOCK_ASSERT_HELD(mpts); 1014 1015 socket_lock(so, 0); 1016 VERIFY(so->so_flags & SOF_MP_SUBFLOW); 1017 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF)); 1018 1019 /* inherit MPTCP socket states */ 1020 if (!(mp_so->so_state & SS_NBIO)) 1021 so->so_state &= ~SS_NBIO; 1022 1023 /* 1024 * At this point, the socket is not yet closed, as there is at least 1025 * one outstanding usecount previously held by mpts_socket from 1026 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here. 1027 */ 1028 so->so_flags &= ~SOF_MP_SUBFLOW; 1029 so->so_state &= ~SS_NOFDREF; 1030 so->so_state &= ~SOF_MPTCP_TRUE; 1031 1032 /* allow socket buffers to be compressed */ 1033 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS; 1034 so->so_snd.sb_flags &= ~SB_NOCOMPRESS; 1035 1036 /* 1037 * Allow socket buffer auto sizing. 1038 * 1039 * This will increase the current 64k buffer size to whatever is best. 1040 */ 1041 so->so_rcv.sb_flags |= SB_AUTOSIZE; 1042 so->so_snd.sb_flags |= SB_AUTOSIZE; 1043 1044 /* restore protocol-user requests */ 1045 VERIFY(mpts->mpts_oprotosw != NULL); 1046 so->so_proto = mpts->mpts_oprotosw; 1047 1048 bzero(&smpo, sizeof (smpo)); 1049 smpo.mpo_flags |= MPOF_SUBFLOW_OK; 1050 smpo.mpo_level = SOL_SOCKET; 1051 1052 /* inherit SOF_NOSIGPIPE from parent MP socket */ 1053 p = (mp_so->so_flags & SOF_NOSIGPIPE); 1054 c = (so->so_flags & SOF_NOSIGPIPE); 1055 smpo.mpo_intval = ((p - c) > 0) ? 
1 : 0; 1056 smpo.mpo_name = SO_NOSIGPIPE; 1057 if ((p - c) != 0) 1058 (void) mptcp_subflow_sosetopt(mpte, so, &smpo); 1059 1060 /* inherit SOF_NOADDRAVAIL from parent MP socket */ 1061 p = (mp_so->so_flags & SOF_NOADDRAVAIL); 1062 c = (so->so_flags & SOF_NOADDRAVAIL); 1063 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0; 1064 smpo.mpo_name = SO_NOADDRERR; 1065 if ((p - c) != 0) 1066 (void) mptcp_subflow_sosetopt(mpte, so, &smpo); 1067 1068 /* inherit SO_KEEPALIVE from parent MP socket */ 1069 p = (mp_so->so_options & SO_KEEPALIVE); 1070 c = (so->so_options & SO_KEEPALIVE); 1071 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0; 1072 smpo.mpo_name = SO_KEEPALIVE; 1073 if ((p - c) != 0) 1074 (void) mptcp_subflow_sosetopt(mpte, so, &smpo); 1075 1076 /* unset TCP level default keepalive option */ 1077 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle; 1078 c = (intotcpcb(sotoinpcb(so)))->t_keepidle; 1079 smpo.mpo_level = IPPROTO_TCP; 1080 smpo.mpo_intval = 0; 1081 smpo.mpo_name = TCP_KEEPALIVE; 1082 if ((p - c) != 0) 1083 (void) mptcp_subflow_sosetopt(mpte, so, &smpo); 1084 socket_unlock(so, 0); 1085 1086 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte, 1087 struct mptsub *, mpts, struct socket *, so, 1088 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd); 1089} 1090 1091/* 1092 * Establish an initial MPTCP connection (if first subflow and not yet 1093 * connected), or add a subflow to an existing MPTCP connection. 
 */
int
mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
    struct proc *p, uint32_t ifscope)
{
	struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
	struct socket *mp_so, *so = NULL;
	struct mptsub_connreq mpcr;
	struct mptcb *mp_tp;
	int af, error = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;

	MPTS_LOCK(mpts);
	/* the subflow must be fresh: never connected, not yet attached */
	VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);
	VERIFY(mpts->mpts_dst_sl != NULL);
	VERIFY(mpts->mpts_connid == CONNID_ANY);

	/* select source (if specified) and destination addresses */
	if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
	    &mpts->mpts_dst_sl, &dst_se)) != 0)
		goto out;

	VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
	VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
	af = mpts->mpts_family = dst_se->se_addr->sa_family;
	VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
	VERIFY(af == AF_INET || af == AF_INET6);

	/*
	 * If the source address is not specified, allocate a storage for
	 * it, so that later on we can fill it in with the actual source
	 * IP address chosen by the underlying layer for the subflow after
	 * it is connected.
	 */
	if (mpts->mpts_src_sl == NULL) {
		mpts->mpts_src_sl =
		    sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
		if (mpts->mpts_src_sl == NULL) {
			error = ENOBUFS;
			goto out;
		}
		se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
		VERIFY(se != NULL && se->se_addr != NULL &&
		    se->se_addr->sa_len == dst_se->se_addr->sa_len);
		/* zero the duplicated address; keep only len/family */
		bzero(se->se_addr, se->se_addr->sa_len);
		se->se_addr->sa_len = dst_se->se_addr->sa_len;
		se->se_addr->sa_family = dst_se->se_addr->sa_family;
	}

	/* create the subflow socket */
	if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
		goto out;

	/*
	 * XXX: adi@apple.com
	 *
	 * This probably needs to be made smarter, but for now simply
	 * increment the counter, while avoiding 0 (CONNID_ANY) and
	 * -1 (CONNID_ALL).  Assume that an MPTCP connection will not
	 * live too long with (2^32)-2 subflow connection attempts.
	 */
	mpte->mpte_connid_last++;
	if (mpte->mpte_connid_last == CONNID_ALL ||
	    mpte->mpte_connid_last == CONNID_ANY)
		mpte->mpte_connid_last++;

	mpts->mpts_connid = mpte->mpte_connid_last;
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* bind subflow socket to the specified interface */
	if (ifscope != IFSCOPE_NONE) {
		socket_lock(so, 0);
		error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
		if (error != 0) {
			socket_unlock(so, 0);
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		VERIFY(mpts->mpts_outif != NULL);
		mpts->mpts_flags |= MPTSF_BOUND_IF;

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindif %s[%d] "
		    "cid %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_outif->if_xname,
		    ifscope, mpts->mpts_connid));
		socket_unlock(so, 0);
	}

	/* if source address and/or port is specified, bind to it */
	if (src_se != NULL) {
		struct sockaddr *sa = src_se->se_addr;
		uint32_t mpts_flags = 0;
		in_port_t lport;

		switch (af) {
		case AF_INET:
			if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN(sa)->sin_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#if INET6
		case AF_INET6:
			VERIFY(af == AF_INET6);
			if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
				mpts_flags |= MPTSF_BOUND_IP;
			if ((lport = SIN6(sa)->sin6_port) != 0)
				mpts_flags |= MPTSF_BOUND_PORT;
			break;
#endif /* INET6 */
		}

		error = sobindlock(so, sa, 1);	/* will lock/unlock socket */
		if (error != 0) {
			(void) mptcp_subflow_soclose(mpts, so);
			goto out;
		}
		mpts->mpts_flags |= mpts_flags;

		if (af == AF_INET || af == AF_INET6) {
			char sbuf[MAX_IPv6_STR_LEN];

			mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx bindip %s[%d] "
			    "cid %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    inet_ntop(af, ((af == AF_INET) ?
			    (void *)&SIN(sa)->sin_addr.s_addr :
			    (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
			    ntohs(lport), mpts->mpts_connid));
		}
	}

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the the subflow socket.  From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	MPTS_ADDREF_LOCKED(mpts);	/* for being in MPTCP subflow list */
	MPTS_ADDREF_LOCKED(mpts);	/* for subflow socket */
	mp_so->so_usecount++;		/* for subflow socket */

	/* register for subflow socket read/write events */
	(void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
	    mptcp_subflow_wupcall, mpts);

	/*
	 * Register for subflow socket control events; ignore
	 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
	 * will generate it here.
	 */
	(void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
	    SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
	    SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
	    SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
	    SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
	    SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
	    SO_FILT_HINT_MUSTRST);

	/* sanity check */
	VERIFY(!(mpts->mpts_flags &
	    (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));

	bzero(&mpcr, sizeof (mpcr));
	mpcr.mpcr_proc = p;
	mpcr.mpcr_ifscope = ifscope;
	/*
	 * Indicate to the TCP subflow whether or not it should establish
	 * the initial MPTCP connection, or join an existing one.  Fill
	 * in the connection request structure with additional info needed
	 * by the underlying TCP (to be used in the TCP options, etc.)
	 */
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
		/* first subflow: this one carries MP_CAPABLE */
		if (mp_tp->mpt_state == MPTCPS_CLOSED) {
			mp_tp->mpt_localkey = mptcp_reserve_key();
			mptcp_conn_properties(mp_tp);
		}
		MPT_UNLOCK(mp_tp);
		soisconnecting(mp_so);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
	} else {
		/* additional subflow: MP_JOIN, possibly deferred */
		if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
			mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
		MPT_UNLOCK(mp_tp);
		mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
	}

	mpts->mpts_mpcr = mpcr;
	mpts->mpts_flags |= MPTSF_CONNECTING;

	if (af == AF_INET || af == AF_INET6) {
		char dbuf[MAX_IPv6_STR_LEN];

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx dst %s[%d] cid %d "
		    "[pending %s]\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    inet_ntop(af, ((af == AF_INET) ?
		    (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
		    (void *)&SIN6(dst_se->se_addr)->sin6_addr),
		    dbuf, sizeof (dbuf)), ((af == AF_INET) ?
		    ntohs(SIN(dst_se->se_addr)->sin_port) :
		    ntohs(SIN6(dst_se->se_addr)->sin6_port)),
		    mpts->mpts_connid,
		    ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
		    "YES" : "NO")));
	}

	/* connect right away if first attempt, or if join can be done now */
	if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
		error = mptcp_subflow_soconnectx(mpte, mpts);

out:
	MPTS_UNLOCK(mpts);
	if (error == 0) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}
	return (error);
}

/*
 * Returns non-zero if it is OK to delete the given subflow: either the
 * subflow saw an error, is no longer the active one, or the MPTCP-level
 * connection is closed/being torn down.
 */
static int
mptcp_delete_ok(struct mptses *mpte, struct mptsub *mpts)
{
	int ret = 1;
	struct mptcb *mp_tp = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);
	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);
	MPTS_LOCK(mpts);
	MPT_LOCK(mp_tp);
	if ((mpts->mpts_soerror == 0) &&
	    (mpts->mpts_flags & MPTSF_ACTIVE) &&
	    (mp_tp->mpt_state != MPTCPS_CLOSED) &&
	    (mp_tp->mpt_state <= MPTCPS_TIME_WAIT))
		ret = 0;
	MPT_UNLOCK(mp_tp);
	MPTS_UNLOCK(mpts);
	return (ret);
}

/*
 * Delete/remove a subflow from an MPTCP.  The underlying subflow socket
 * will no longer be accessible after a subflow is deleted, thus this
 * should occur only after the subflow socket has been disconnected.
 * If peeloff(2) is called, leave the socket open.
 */
void
mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
{
	struct socket *mp_so, *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	MPTS_LOCK(mpts);
	so = mpts->mpts_socket;
	VERIFY(so != NULL);

	mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d "
	    "[close %s] %d %x\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
	    mp_so->so_usecount,
	    mp_so->so_retaincnt, mpts->mpts_connid,
	    (close ? "YES" : "NO"), mpts->mpts_soerror,
	    mpts->mpts_flags));

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* unlink from the session's subflow list */
	VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
	atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
	VERIFY(mpte->mpte_numflows != 0);
	mpte->mpte_numflows--;

	/*
	 * Drop references held by this subflow socket; there
	 * will be no further upcalls made from this point.
	 */
	(void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
	(void) sock_catchevents(so, NULL, NULL, 0);
	mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
	if (close)
		(void) mptcp_subflow_soclose(mpts, so);

	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for subflow socket */
	mpts->mpts_mpte = NULL;
	mpts->mpts_socket = NULL;
	MPTS_UNLOCK(mpts);

	/* release the two refs taken in mptcp_subflow_add() */
	MPTS_REMREF(mpts);		/* for MPTCP subflow list */
	MPTS_REMREF(mpts);		/* for subflow socket */

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
}

/*
 * Disconnect a subflow socket.
 */
void
mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
    boolean_t deleteok)
{
	struct socket *so;
	struct mptcb *mp_tp;
	int send_dfin = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpts->mpts_mpte == mpte);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(mpts->mpts_connid != CONNID_ANY &&
	    mpts->mpts_connid != CONNID_ALL);

	/* already on its way down; nothing to do */
	if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
		return;

	mpts->mpts_flags |= MPTSF_DISCONNECTING;

	/*
	 * If this is coming from disconnectx(2) or issued as part of
	 * closing the MPTCP socket, the subflow shouldn't stick around.
	 * Otherwise let it linger around in case the upper layers need
	 * to retrieve its conninfo.
	 */
	if (deleteok)
		mpts->mpts_flags |= MPTSF_DELETEOK;

	so = mpts->mpts_socket;
	mp_tp = mpte->mpte_mptcb;
	MPT_LOCK(mp_tp);
	/* past ESTABLISHED: MPTCP-level close in progress, send DATA_FIN */
	if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
		send_dfin = 1;
	MPT_UNLOCK(mp_tp);

	socket_lock(so, 0);
	if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
	    (so->so_state & SS_ISCONNECTED)) {
		mptcplog((LOG_DEBUG, "%s: cid %d fin %d [linger %s]\n",
		    __func__, mpts->mpts_connid, send_dfin,
		    (deleteok ? "NO" : "YES")));

		if (send_dfin)
			mptcp_send_dfin(so);
		(void) soshutdownlock(so, SHUT_RD);
		(void) soshutdownlock(so, SHUT_WR);
		(void) sodisconnectlocked(so);
	}
	socket_unlock(so, 0);
	/*
	 * Generate a disconnect event for this subflow socket, in case
	 * the lower layer doesn't do it; this is needed because the
	 * subflow socket deletion relies on it.  This will also end up
	 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
	 * we cannot do that here because subflow lock is currently held.
	 */
	mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
}

/*
 * Subflow socket read upcall.
 *
 * Called when the associated subflow socket posted a read event.  The subflow
 * socket lock has been released prior to invoking the callback.  Note that the
 * upcall may occur synchronously as a result of MPTCP performing an action on
 * it, or asynchronously as a result of an event happening at the subflow layer.
 * Therefore, to maintain lock ordering, the only lock that can be acquired
 * here is the thread lock, for signalling purposes.
 */
static void
mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* defer all real work to the MPTCP thread */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * Subflow socket input.
 *
 * Called in the context of the MPTCP thread, for reading data from the
 * underlying subflow socket and delivering it to MPTCP.
 */
static void
mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
{
	struct mbuf *m = NULL;
	struct socket *so;
	int error;
	struct mptsub *mpts_alt = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
	    struct mptsub *, mpts);

	if (!(mpts->mpts_flags & MPTSF_CONNECTED))
		return;

	so = mpts->mpts_socket;

	/* drains the subflow's receive buffer via mptcp_subflow_soreceive() */
	error = sock_receive_internal(so, NULL, &m, 0, NULL);
	if (error != 0 && error != EWOULDBLOCK) {
		mptcplog((LOG_ERR, "%s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error));
		/* drop subflow lock while probing for an alternate path */
		MPTS_UNLOCK(mpts);
		mpts_alt = mptcp_get_subflow(mpte, mpts);
		if (mpts_alt == NULL) {
			mptcplog((LOG_ERR, "%s: no alt path cid %d\n",
			    __func__, mpts->mpts_connid));
			mpte->mpte_mppcb->mpp_socket->so_error = error;
		}
		MPTS_LOCK(mpts);
	} else if (error == 0) {
		mptcplog3((LOG_DEBUG, "%s: cid %d \n",
		    __func__, mpts->mpts_connid));
	}

	/* In fallback, make sure to accept data on all but one subflow */
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
		m_freem(m);
		return;
	}

	if (m != NULL) {
		/*
		 * Release subflow lock since this may trigger MPTCP to send,
		 * possibly on a different subflow.  An extra reference has
		 * been held on the subflow by the MPTCP thread before coming
		 * here, so we can be sure that it won't go away, in the event
		 * the MP socket lock gets released.
		 */
		MPTS_UNLOCK(mpts);
		mptcp_input(mpte, m);
		MPTS_LOCK(mpts);
	}
}

/*
 * Subflow socket write upcall.
 *
 * Called when the associated subflow socket posted a write event.  The subflow
 * socket lock has been released prior to invoking the callback.  Note that the
 * upcall may occur synchronously as a result of MPTCP performing an action on
 * it, or asynchronously as a result of an event happening at the subflow layer.
 * Therefore, to maintain lock ordering, the only lock that can be acquired
 * here is the thread lock, for signalling purposes.
 */
static void
mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
{
#pragma unused(so, waitf)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* defer all real work to the MPTCP thread */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * Subflow socket output.
 *
 * Called for sending data from MPTCP to the underlying subflow socket.
 */
int
mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	size_t sb_cc = 0, tot_sent = 0;
	struct mbuf *sb_mb;
	int error = 0;
	u_int64_t mpt_dsn = 0;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct mbuf *mpt_mbuf = NULL;
	unsigned int off = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	so = mpts->mpts_socket;

	DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
	    struct mptsub *, mpts);

	/* subflow socket is suspended? */
	if (mpts->mpts_flags & MPTSF_SUSPENDED) {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d is flow "
		    "controlled\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
		goto out;
	}

	/* subflow socket is not MPTCP capable? */
	if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
	    !(mpts->mpts_flags & MPTSF_MP_DEGRADED)) {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx cid %d not "
		    "MPTCP capable\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid));
		goto out;
	}

	/* Remove Addr Option is not sent reliably as per I-D */
	if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
		struct tcpcb *tp = intotcpcb(sotoinpcb(so));
		tp->t_rem_aid = mpte->mpte_lost_aid;
		if (mptcp_remaddr_enable)
			tp->t_mpflags |= TMPF_SND_REM_ADDR;
		mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
	}

	/*
	 * The mbuf chains containing the metadata (as well as pointing to
	 * the user data sitting at the MPTCP output queue) would then be
	 * sent down to the subflow socket.
	 *
	 * Some notes on data sequencing:
	 *
	 *   a. Each mbuf must be a M_PKTHDR.
	 *   b. MPTCP metadata is stored in the mptcp_pktinfo structure
	 *	in the mbuf pkthdr structure.
	 *   c. Each mbuf containing the MPTCP metadata must have its
	 *	pkt_flags marked with the PKTF_MPTCP flag.
	 */

	/* First, drop acknowledged data */
	sb_mb = mp_so->so_snd.sb_mb;
	if (sb_mb == NULL) {
		goto out;
	}

	VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);

	/* skip over zero-length mappings to find the first real DSN */
	mpt_mbuf = sb_mb;
	while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
		mpt_mbuf = mpt_mbuf->m_next;
	}
	if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
	} else {
		goto out;
	}

	MPT_LOCK(mp_tp);
	if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
		int len = 0;
		len = mp_tp->mpt_snduna - mpt_dsn;
		sbdrop(&mp_so->so_snd, len);

	}

	/*
	 * In degraded mode, we don't receive data acks, so force free
	 * mbufs less than snd_nxt
	 */
	/*
	 * NOTE(review): if the sbdrop() above emptied the send buffer,
	 * sb_mb may be NULL here and this dereference would fault —
	 * confirm sbdrop() cannot consume the entire buffer on this path.
	 */
	mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
	if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
	    MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
		int len = 0;
		len = mp_tp->mpt_sndnxt - mpt_dsn;
		sbdrop(&mp_so->so_snd, len);
		mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
	}

	/*
	 * Adjust the subflow's notion of next byte to send based on
	 * the last unacknowledged byte
	 */
	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
		mpts->mpts_sndnxt = mp_tp->mpt_snduna;
	}

	/*
	 * Adjust the top level notion of next byte used for retransmissions
	 * and sending FINs.
	 */
	if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
		mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
	}


	/* Now determine the offset from which to start transmitting data */
	sb_mb = mp_so->so_snd.sb_mb;
	sb_cc = mp_so->so_snd.sb_cc;
	if (sb_mb == NULL) {
		MPT_UNLOCK(mp_tp);
		goto out;
	}
	if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
		off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
		sb_cc -= off;
	} else {
		/* nothing new to send on this subflow */
		MPT_UNLOCK(mp_tp);
		goto out;
	}
	MPT_UNLOCK(mp_tp);

	mpt_mbuf = sb_mb;
	mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;

	/* walk past mappings fully covered by the send offset */
	while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
	    (mpt_mbuf->m_pkthdr.mp_rlen <= off))) {
		off -= mpt_mbuf->m_pkthdr.mp_rlen;
		mpt_mbuf = mpt_mbuf->m_next;
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
	}
	/* NOTE(review): mpt_snduna/mpts_sndnxt read without MPT_LOCK here */
	if ((mpts->mpts_connid == 2) || (mpts->mpts_flags & MPTSF_MP_DEGRADED))
		mptcplog((LOG_INFO, "%s: snduna = %llu off = %d id = %d"
		    " %llu \n",
		    __func__,
		    mp_tp->mpt_snduna, off, mpts->mpts_connid,
		    mpts->mpts_sndnxt));

	VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));

	while (tot_sent < sb_cc) {
		struct mbuf *m;
		size_t mlen, len = 0;

		mlen = mpt_mbuf->m_pkthdr.mp_rlen;
		mlen -= off;
		if (mlen == 0)
			goto out;

		if (mlen > sb_cc) {
			panic("%s: unexpected %lu %lu \n", __func__,
			    mlen, sb_cc);
		}

		m = m_copym_mode(mpt_mbuf, off, mlen, M_DONTWAIT,
		    M_COPYM_COPY_HDR);
		if (m == NULL) {
			error = ENOBUFS;
			break;
		}

		/* Create a DSN mapping for the data (m_copym does it) */
		mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
		m->m_pkthdr.mp_dsn = mpt_dsn + off;
		m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
		m->m_pkthdr.mp_rlen = mlen;
		mpts->mpts_rel_seq += mlen;
		m->m_pkthdr.len = mlen;

		/* last contiguous mapping is stored for error cases */
		if (mpts->mpts_lastmap.mptsl_dsn +
		    mpts->mpts_lastmap.mptsl_len == mpt_dsn) {
			mpts->mpts_lastmap.mptsl_len += tot_sent;
		} else if (MPTCP_SEQ_LT((mpts->mpts_lastmap.mptsl_dsn +
		    mpts->mpts_lastmap.mptsl_len), mpt_dsn)) {
			if (m->m_pkthdr.mp_dsn == 0)
				panic("%s %llu", __func__, mpt_dsn);
			mpts->mpts_lastmap.mptsl_dsn = m->m_pkthdr.mp_dsn;
			mpts->mpts_lastmap.mptsl_sseq = m->m_pkthdr.mp_rseq;
			mpts->mpts_lastmap.mptsl_len = m->m_pkthdr.mp_rlen;
		}

		error = sock_sendmbuf(so, NULL, m, 0, &len);
		DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
		    struct sockbuf *, &so->so_rcv,
		    struct sockbuf *, &so->so_snd,
		    struct mptses *, mpte, struct mptsub *, mpts,
		    size_t, mlen);
		if (error != 0) {
			mptcplog((LOG_ERR, "%s: len = %zd error = %d \n",
			    __func__, len, error));
			break;
		}
		mpts->mpts_sndnxt += mlen;
		MPT_LOCK(mp_tp);
		if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
			/* detect a 64-bit DSN high-word rollover */
			if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
			    MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
				mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
			mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
		}
		MPT_UNLOCK(mp_tp);
		if (len != mlen) {
			mptcplog((LOG_ERR, "%s: cid %d wrote %d "
			    "(expected %d)\n", __func__,
			    mpts->mpts_connid, len, mlen));
		}
		tot_sent += mlen;
		off = 0;
		mpt_mbuf = mpt_mbuf->m_next;
	}

	if (error != 0 && error != EWOULDBLOCK) {
		mptcplog((LOG_ERR, "MPTCP ERROR %s: cid %d error %d\n",
		    __func__, mpts->mpts_connid, error));
	} if (error == 0) {
		if ((mpts->mpts_connid == 2) ||
		    (mpts->mpts_flags & MPTSF_MP_DEGRADED))
			mptcplog((LOG_DEBUG, "%s: cid %d wrote %d %d\n",
			    __func__, mpts->mpts_connid, tot_sent,
			    sb_cc));
		/* data went out; stop the MPTCP retransmit timer */
		MPT_LOCK(mp_tp);
		mptcp_cancel_timer(mp_tp, MPTT_REXMT);
		MPT_UNLOCK(mp_tp);
	}
out:
	return (error);
}

/*
 * Subflow socket control event upcall.
 *
 * Called when the associated subflow socket posted one or more control events.
 * The subflow socket lock has been released prior to invoking the callback.
 * Note that the upcall may occur synchronously as a result of MPTCP performing
 * an action on it, or asynchronously as a result of an event happening at the
 * subflow layer.  Therefore, to maintain lock ordering, the only lock that can
 * be acquired here is the thread lock, for signalling purposes.
 */
static void
mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
{
#pragma unused(so)
	struct mptsub *mpts = arg;
	struct mptses *mpte = mpts->mpts_mpte;

	VERIFY(mpte != NULL);

	/* record the pending events, then wake the MPTCP thread */
	lck_mtx_lock(&mpte->mpte_thread_lock);
	atomic_bitset_32(&mpts->mpts_evctl, events);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * Subflow socket control events.
 *
 * Called for handling events related to the underlying subflow socket.
 */
static ev_ret_t
mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts)
{
	uint32_t events;
	ev_ret_t ret = MPTS_EVRET_OK;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	/* bail if there's nothing to process */
	if ((events = mpts->mpts_evctl) == 0)
		return (ret);

	/*
	 * Any event that signals trouble on the subflow also triggers a
	 * failover check, so that traffic can move to an alternate subflow
	 * before (or while) this one is torn down.
	 */
	if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
	    SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
	    SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
	    SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
	    SO_FILT_HINT_DISCONNECTED)) {
		events |= SO_FILT_HINT_MPFAILOVER;
	}

	DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
	    struct mptsub *, mpts, uint32_t, events);

	mptcplog2((LOG_DEBUG, "%s: cid %d events=%b\n", __func__,
	    mpts->mpts_connid, events, SO_FILT_HINT_BITS));

	/*
	 * Dispatch each pending event, in the fixed order below.  Every
	 * handled event bit is cleared from `events'; dispatching stops as
	 * soon as a handler lowers `ret' below MPTS_EVRET_OK, and any bits
	 * still set afterwards are reported as unhandled.
	 */
	if ((events & SO_FILT_HINT_MPFAILOVER) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_failover_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MPFAILOVER;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_CONNRESET) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_connreset_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CONNRESET;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_MUSTRST) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_mustrst_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MUSTRST;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_CANTRCVMORE) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_cantrcvmore_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CANTRCVMORE;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_CANTSENDMORE) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_cantsendmore_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CANTSENDMORE;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_TIMEOUT) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_timeout_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_TIMEOUT;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_NOSRCADDR) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_nosrcaddr_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_NOSRCADDR;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_IFDENIED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_ifdenied_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_IFDENIED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_SUSPEND) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_suspend_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_SUSPEND;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_RESUME) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_resume_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_RESUME;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_CONNECTED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_connected_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_CONNECTED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_MPSTATUS) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_mpstatus_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_MPSTATUS;
		ret = ((error >= MPTS_EVRET_OK) ?
		    MAX(error, ret) : error);
	}
	if ((events & SO_FILT_HINT_DISCONNECTED) && (ret >= MPTS_EVRET_OK)) {
		ev_ret_t error = mptcp_subflow_disconnected_ev(mpte, mpts);
		events &= ~SO_FILT_HINT_DISCONNECTED;
		ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
	}
	/*
	 * We should be getting only events specified via sock_catchevents(),
	 * so loudly complain if we have any unprocessed one(s).
	 */
	if (events != 0 || ret < MPTS_EVRET_OK) {
		mptcplog((LOG_ERR, "%s%s: cid %d evret %s (%d)"
		    " unhandled events=%b\n",
		    (events != 0) ? "MPTCP_ERROR " : "",
		    __func__, mpts->mpts_connid,
		    mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS));
	}

	/* clear the ones we've processed */
	atomic_bitclear_32(&mpts->mpts_evctl, ~events);

	return (ret);
}

/*
 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
 */
static ev_ret_t
mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/*
	 * linger == TRUE means the subflow structure must outlive this
	 * event: the MP socket is still attached and nobody asked for
	 * explicit deletion of the subflow.
	 */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ECONNREFUSED;

	/*
	 * We got a TCP RST for this subflow connection.
	 *
	 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
	 * client if the MPTCP connection has not been established.  Otherwise
	 * we close the socket.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ECONNREFUSED;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
 */
static ev_ret_t
mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));

	/*
	 * We got a FIN for this subflow connection.  This subflow socket
	 * is no longer available for receiving data;
	 * The FIN may arrive with data.  The data is handed up to the
	 * mptcp socket and the subflow is disconnected.
	 */

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
 */
static ev_ret_t
mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	/* send side is shut down; nothing to do beyond logging */
	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));
	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
 */
static ev_ret_t
mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* keep the subflow around unless it was explicitly marked for deletion */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_NOTICE, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = ETIMEDOUT;

	/*
	 * The subflow connection has timed out.
	 *
	 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
	 * client if the MPTCP connection has not been established.  Otherwise
	 * drop it.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = ETIMEDOUT;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
 */
static ev_ret_t
mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;
	struct tcpcb *tp = NULL;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* Not grabbing socket lock as t_local_aid is write once only */
	tp = intotcpcb(sotoinpcb(so));
	/*
	 * This overwrites any previous mpte_lost_aid to avoid storing
	 * too much state when the typical case has only two subflows.
	 */
	/*
	 * NOTE(review): tp is dereferenced here without a NULL check —
	 * presumes the subflow's TCP PCB is still attached at this point;
	 * verify against the detach path.
	 */
	mpte->mpte_flags |= MPTE_SND_REM_ADDR;
	mpte->mpte_lost_aid = tp->t_local_aid;

	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EADDRNOTAVAIL;

	/*
	 * The subflow connection has lost its source address.
	 *
	 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
	 * client if the MPTCP connection has not been established.  If it
	 * has been established with one subflow , we keep the MPTCP
	 * connection valid without any subflows till closed by application.
	 * This lets tcp connection manager decide whether to close this or
	 * not as it reacts to reachability changes too.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);

	MPT_LOCK(mp_tp);
	if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
	    (mp_so->so_flags & SOF_NOADDRAVAIL)) {
		mp_so->so_error = EADDRNOTAVAIL;
	}
	MPT_UNLOCK(mp_tp);

	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
 */
static ev_ret_t
mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct mptsub *mpts_alt = NULL;
	struct socket *so = NULL;
	struct socket *mp_so;
	int altpath_exists = 0;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mptcplog2((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
	    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));

	/*
	 * Drop the subflow lock while searching for an alternate subflow;
	 * mptcp_get_subflow() takes per-subflow locks of its own, and the
	 * MPTE lock (held throughout) keeps the subflow list stable.
	 */
	MPTS_UNLOCK(mpts);
	mpts_alt = mptcp_get_subflow(mpte, mpts);

	/*
	 * If there is no alternate eligible subflow, ignore the
	 * failover hint.
	 */
	if (mpts_alt == NULL) {
		mptcplog2((LOG_WARNING, "%s: no alternate path\n", __func__));
		MPTS_LOCK(mpts);
		goto done;
	}
	MPTS_LOCK(mpts_alt);
	altpath_exists = 1;
	so = mpts_alt->mpts_socket;
	if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
		socket_lock(so, 1);
		/* All data acknowledged */
		if (so->so_snd.sb_cc == 0) {
			so->so_flags &= ~SOF_MP_TRYFAILOVER;
			mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
		} else {
			/* no alternate path available */
			altpath_exists = 0;
		}
		socket_unlock(so, 1);
	}
	if (altpath_exists) {
		mpts_alt->mpts_flags |= MPTSF_ACTIVE;
		struct mptcb *mp_tp = mpte->mpte_mptcb;
		/* Bring the subflow's notion of snd_nxt into the send window */
		MPT_LOCK(mp_tp);
		mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
		MPT_UNLOCK(mp_tp);
		mpte->mpte_active_sub = mpts_alt;
		socket_lock(so, 1);
		sowwakeup(so);
		socket_unlock(so, 1);
	}
	MPTS_UNLOCK(mpts_alt);

	if (altpath_exists) {
		/* tell the MP socket client that connection info changed */
		soevent(mp_so,
		    SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
		mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx switched from "
		    "%d to %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, mpts_alt->mpts_connid));
		tcpstat.tcps_mp_switches++;
	}

	MPTS_LOCK(mpts);
	if (altpath_exists) {
		mpts->mpts_flags |= MPTSF_FAILINGOVER;
		mpts->mpts_flags &= ~MPTSF_ACTIVE;
	} else {
		/* no alternate: cancel the failover attempt on this subflow */
		so = mpts->mpts_socket;
		socket_lock(so, 1);
		so->so_flags &= ~SOF_MP_TRYFAILOVER;
		socket_unlock(so, 1);
	}
done:
	MPTS_LOCK_ASSERT_HELD(mpts);
	return (MPTS_EVRET_OK);
}

/*
 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
 */
static ev_ret_t
mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	boolean_t linger;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;

	/* keep the subflow around unless it was explicitly marked for deletion */
	linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
	    !(mp_so->so_flags & SOF_PCBCLEARING));

	mptcplog((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__,
	    mpts->mpts_connid, (linger ? "YES" : "NO")));

	if (mpts->mpts_soerror == 0)
		mpts->mpts_soerror = EHOSTUNREACH;

	/*
	 * The subflow connection cannot use the outgoing interface.
	 *
	 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
	 * client if the MPTCP connection has not been established.  If it
	 * has been established, let the upper layer call disconnectx.
	 */
	mptcp_subflow_disconnect(mpte, mpts, !linger);
	/* drop the subflow lock while posting the event on the MP socket */
	MPTS_UNLOCK(mpts);

	soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		mp_so->so_error = EHOSTUNREACH;
	}
	MPT_UNLOCK(mp_tp);

	MPTS_LOCK(mpts);
	/*
	 * Keep the subflow socket around, unless the MPTCP socket has
	 * been detached or the subflow has been disconnected explicitly,
	 * in which case it should be deleted right away.
	 */
	return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
}

/*
 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
 */
static ev_ret_t
mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	/* the subflow connection is being flow controlled */
	mpts->mpts_flags |= MPTSF_SUSPENDED;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
	    mpts->mpts_connid));

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_RESUME subflow socket event.
 */
static ev_ret_t
mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPTS_LOCK_ASSERT_HELD(mpts);

	so = mpts->mpts_socket;

	/* the subflow connection is no longer flow controlled */
	mpts->mpts_flags &= ~MPTSF_SUSPENDED;

	mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid));

	return (MPTS_EVRET_OK);	/* keep the subflow socket around */
}

/*
 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2353 */ 2354static ev_ret_t 2355mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts) 2356{ 2357 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN]; 2358 struct sockaddr_entry *src_se, *dst_se; 2359 struct sockaddr_storage src; 2360 struct socket *mp_so, *so; 2361 struct mptcb *mp_tp; 2362 struct ifnet *outifp; 2363 int af, error = 0; 2364 boolean_t mpok = FALSE; 2365 2366 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2367 VERIFY(mpte->mpte_mppcb != NULL); 2368 mp_so = mpte->mpte_mppcb->mpp_socket; 2369 mp_tp = mpte->mpte_mptcb; 2370 2371 MPTS_LOCK_ASSERT_HELD(mpts); 2372 so = mpts->mpts_socket; 2373 af = mpts->mpts_family; 2374 2375 if (mpts->mpts_flags & MPTSF_CONNECTED) 2376 return (MPTS_EVRET_OK); 2377 2378 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) || 2379 (mpts->mpts_flags & MPTSF_DISCONNECTING)) { 2380 return (MPTS_EVRET_OK); 2381 } 2382 2383 /* 2384 * The subflow connection has been connected. Find out whether it 2385 * is connected as a regular TCP or as a MPTCP subflow. The idea is: 2386 * 2387 * a. If MPTCP connection is not yet established, then this must be 2388 * the first subflow connection. If MPTCP failed to negotiate, 2389 * indicate to the MPTCP socket client via EPROTO, that the 2390 * underlying TCP connection may be peeled off via peeloff(2). 2391 * Otherwise, mark the MPTCP socket as connected. 2392 * 2393 * b. If MPTCP connection has been established, then this must be 2394 * one of the subsequent subflow connections. If MPTCP failed 2395 * to negotiate, disconnect the connection since peeloff(2) 2396 * is no longer possible. 2397 * 2398 * Right now, we simply unblock any waiters at the MPTCP socket layer 2399 * if the MPTCP connection has not been established. 2400 */ 2401 socket_lock(so, 0); 2402 2403 if (so->so_state & SS_ISDISCONNECTED) { 2404 /* 2405 * With MPTCP joins, a connection is connected at the subflow 2406 * level, but the 4th ACK from the server elevates the MPTCP 2407 * subflow to connected state. 
So there is a small window 2408 * where the subflow could get disconnected before the 2409 * connected event is processed. 2410 */ 2411 socket_unlock(so, 0); 2412 return (MPTS_EVRET_OK); 2413 } 2414 2415 mpts->mpts_soerror = 0; 2416 mpts->mpts_flags &= ~MPTSF_CONNECTING; 2417 mpts->mpts_flags |= MPTSF_CONNECTED; 2418 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) 2419 mpts->mpts_flags |= MPTSF_MP_CAPABLE; 2420 2421 VERIFY(mpts->mpts_dst_sl != NULL); 2422 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head); 2423 VERIFY(dst_se != NULL && dst_se->se_addr != NULL && 2424 dst_se->se_addr->sa_family == af); 2425 2426 VERIFY(mpts->mpts_src_sl != NULL); 2427 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head); 2428 VERIFY(src_se != NULL && src_se->se_addr != NULL && 2429 src_se->se_addr->sa_family == af); 2430 2431 /* get/check source IP address */ 2432 switch (af) { 2433 case AF_INET: { 2434 error = in_getsockaddr_s(so, &src); 2435 if (error == 0) { 2436 struct sockaddr_in *ms = SIN(src_se->se_addr); 2437 struct sockaddr_in *s = SIN(&src); 2438 2439 VERIFY(s->sin_len == ms->sin_len); 2440 VERIFY(ms->sin_family == AF_INET); 2441 2442 if ((mpts->mpts_flags & MPTSF_BOUND_IP) && 2443 bcmp(&ms->sin_addr, &s->sin_addr, 2444 sizeof (ms->sin_addr)) != 0) { 2445 mptcplog((LOG_ERR, "%s: cid %d local " 2446 "address %s (expected %s)\n", __func__, 2447 mpts->mpts_connid, inet_ntop(AF_INET, 2448 (void *)&s->sin_addr.s_addr, buf0, 2449 sizeof (buf0)), inet_ntop(AF_INET, 2450 (void *)&ms->sin_addr.s_addr, buf1, 2451 sizeof (buf1)))); 2452 } 2453 bcopy(s, ms, sizeof (*s)); 2454 } 2455 break; 2456 } 2457#if INET6 2458 case AF_INET6: { 2459 error = in6_getsockaddr_s(so, &src); 2460 if (error == 0) { 2461 struct sockaddr_in6 *ms = SIN6(src_se->se_addr); 2462 struct sockaddr_in6 *s = SIN6(&src); 2463 2464 VERIFY(s->sin6_len == ms->sin6_len); 2465 VERIFY(ms->sin6_family == AF_INET6); 2466 2467 if ((mpts->mpts_flags & MPTSF_BOUND_IP) && 2468 bcmp(&ms->sin6_addr, &s->sin6_addr, 2469 sizeof 
(ms->sin6_addr)) != 0) { 2470 mptcplog((LOG_ERR, "%s: cid %d local " 2471 "address %s (expected %s)\n", __func__, 2472 mpts->mpts_connid, inet_ntop(AF_INET6, 2473 (void *)&s->sin6_addr, buf0, 2474 sizeof (buf0)), inet_ntop(AF_INET6, 2475 (void *)&ms->sin6_addr, buf1, 2476 sizeof (buf1)))); 2477 } 2478 bcopy(s, ms, sizeof (*s)); 2479 } 2480 break; 2481 } 2482#endif /* INET6 */ 2483 default: 2484 VERIFY(0); 2485 /* NOTREACHED */ 2486 } 2487 2488 if (error != 0) { 2489 mptcplog((LOG_ERR, "%s: cid %d getsockaddr failed (%d)\n", 2490 __func__, mpts->mpts_connid, error)); 2491 } 2492 2493 /* get/verify the outbound interface */ 2494 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */ 2495 if (mpts->mpts_flags & MPTSF_BOUND_IF) { 2496 VERIFY(mpts->mpts_outif != NULL); 2497 if (mpts->mpts_outif != outifp) { 2498 mptcplog((LOG_ERR, "%s: cid %d outif %s " 2499 "(expected %s)\n", __func__, mpts->mpts_connid, 2500 ((outifp != NULL) ? outifp->if_xname : "NULL"), 2501 mpts->mpts_outif->if_xname)); 2502 if (outifp == NULL) 2503 outifp = mpts->mpts_outif; 2504 } 2505 } else { 2506 mpts->mpts_outif = outifp; 2507 } 2508 2509 socket_unlock(so, 0); 2510 2511 mptcplog((LOG_DEBUG, "%s: cid %d outif %s %s[%d] -> %s[%d] " 2512 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ? 2513 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ? 2514 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr : 2515 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)), 2516 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) : 2517 ntohs(SIN6(src_se->se_addr)->sin6_port)), 2518 inet_ntop(af, ((af == AF_INET) ? 2519 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr : 2520 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)), 2521 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) : 2522 ntohs(SIN6(dst_se->se_addr)->sin6_port)), 2523 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? 
2524 "MPTCP capable" : "a regular TCP"))); 2525 2526 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE); 2527 MPTS_UNLOCK(mpts); 2528 2529 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED); 2530 2531 MPT_LOCK(mp_tp); 2532 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { 2533 /* case (a) above */ 2534 if (!mpok) { 2535 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF; 2536 (void) mptcp_drop(mpte, mp_tp, EPROTO); 2537 MPT_UNLOCK(mp_tp); 2538 } else { 2539 if (mptcp_init_authparms(mp_tp) != 0) { 2540 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF; 2541 (void) mptcp_drop(mpte, mp_tp, EPROTO); 2542 MPT_UNLOCK(mp_tp); 2543 mpok = FALSE; 2544 } else { 2545 mp_tp->mpt_state = MPTCPS_ESTABLISHED; 2546 mpte->mpte_associd = mpts->mpts_connid; 2547 DTRACE_MPTCP2(state__change, 2548 struct mptcb *, mp_tp, 2549 uint32_t, 0 /* event */); 2550 mptcp_init_statevars(mp_tp); 2551 MPT_UNLOCK(mp_tp); 2552 2553 (void) mptcp_setconnorder(mpte, 2554 mpts->mpts_connid, 1); 2555 soisconnected(mp_so); 2556 } 2557 } 2558 MPTS_LOCK(mpts); 2559 if (mpok) { 2560 /* Initialize the relative sequence number */ 2561 mpts->mpts_rel_seq = 1; 2562 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET; 2563 mpte->mpte_nummpcapflows++; 2564 MPT_LOCK_SPIN(mp_tp); 2565 mpts->mpts_sndnxt = mp_tp->mpt_snduna; 2566 MPT_UNLOCK(mp_tp); 2567 } 2568 } else if (mpok) { 2569 MPT_UNLOCK(mp_tp); 2570 /* 2571 * case (b) above 2572 * In case of additional flows, the MPTCP socket is not 2573 * MPTSF_MP_CAPABLE until an ACK is received from server 2574 * for 3-way handshake. TCP would have guaranteed that this 2575 * is an MPTCP subflow. 
2576 */ 2577 MPTS_LOCK(mpts); 2578 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET; 2579 mpte->mpte_nummpcapflows++; 2580 mpts->mpts_rel_seq = 1; 2581 MPT_LOCK_SPIN(mp_tp); 2582 mpts->mpts_sndnxt = mp_tp->mpt_snduna; 2583 MPT_UNLOCK(mp_tp); 2584 } 2585 MPTS_LOCK_ASSERT_HELD(mpts); 2586 2587 return (MPTS_EVRET_OK); /* keep the subflow socket around */ 2588} 2589 2590/* 2591 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event. 2592 */ 2593static ev_ret_t 2594mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts) 2595{ 2596 struct socket *mp_so, *so; 2597 struct mptcb *mp_tp; 2598 boolean_t linger; 2599 2600 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2601 MPTS_LOCK_ASSERT_HELD(mpts); 2602 VERIFY(mpte->mpte_mppcb != NULL); 2603 mp_so = mpte->mpte_mppcb->mpp_socket; 2604 mp_tp = mpte->mpte_mptcb; 2605 so = mpts->mpts_socket; 2606 2607 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) && 2608 !(mp_so->so_flags & SOF_PCBCLEARING)); 2609 2610 mptcplog2((LOG_DEBUG, "%s: cid %d [linger %s]\n", __func__, 2611 mpts->mpts_connid, (linger ? "YES" : "NO"))); 2612 2613 if (mpts->mpts_flags & MPTSF_DISCONNECTED) 2614 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE); 2615 2616 /* 2617 * Clear flags that are used by getconninfo to return state. 2618 * Retain like MPTSF_DELETEOK, MPTSF_ACTIVE for internal purposes. 2619 */ 2620 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING| 2621 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED| 2622 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED| 2623 MPTSF_SUSPENDED|MPTSF_ACTIVE); 2624 mpts->mpts_flags |= MPTSF_DISCONNECTED; 2625 2626 /* 2627 * The subflow connection has been disconnected. 2628 * 2629 * Right now, we simply unblock any waiters at the MPTCP socket layer 2630 * if the MPTCP connection has not been established. 
2631 */ 2632 MPTS_UNLOCK(mpts); 2633 2634 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED); 2635 2636 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) { 2637 mpte->mpte_nummpcapflows--; 2638 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET; 2639 } 2640 2641 MPT_LOCK(mp_tp); 2642 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { 2643 MPT_UNLOCK(mp_tp); 2644 soisdisconnected(mp_so); 2645 } else { 2646 MPT_UNLOCK(mp_tp); 2647 } 2648 2649 MPTS_LOCK(mpts); 2650 /* 2651 * The underlying subflow socket has been disconnected; 2652 * it is no longer useful to us. Keep the subflow socket 2653 * around, unless the MPTCP socket has been detached or 2654 * the subflow has been disconnected explicitly, in which 2655 * case it should be deleted right away. 2656 */ 2657 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE); 2658} 2659 2660/* 2661 * Handle SO_FILT_HINT_MPSTATUS subflow socket event 2662 */ 2663static ev_ret_t 2664mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts) 2665{ 2666 struct socket *mp_so, *so; 2667 struct mptcb *mp_tp; 2668 ev_ret_t ret = MPTS_EVRET_OK_UPDATE; 2669 2670 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2671 VERIFY(mpte->mpte_mppcb != NULL); 2672 mp_so = mpte->mpte_mppcb->mpp_socket; 2673 mp_tp = mpte->mpte_mptcb; 2674 2675 MPTS_LOCK_ASSERT_HELD(mpts); 2676 so = mpts->mpts_socket; 2677 2678 socket_lock(so, 0); 2679 MPT_LOCK(mp_tp); 2680 2681 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) 2682 mpts->mpts_flags |= MPTSF_MP_CAPABLE; 2683 else 2684 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE; 2685 2686 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) { 2687 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) 2688 goto done; 2689 mpts->mpts_flags |= MPTSF_MP_DEGRADED; 2690 } 2691 else 2692 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED; 2693 2694 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) 2695 mpts->mpts_flags |= MPTSF_MP_READY; 2696 else 2697 mpts->mpts_flags &= ~MPTSF_MP_READY; 2698 2699 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) { 2700 
mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP; 2701 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY; 2702 } 2703 2704 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) { 2705 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)); 2706 ret = MPTS_EVRET_DISCONNECT_FALLBACK; 2707 } else if (mpts->mpts_flags & MPTSF_MP_READY) { 2708 mp_tp->mpt_flags |= MPTCPF_JOIN_READY; 2709 ret = MPTS_EVRET_CONNECT_PENDING; 2710 } 2711 2712 mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d " 2713 "mptsf=%b\n", __func__, 2714 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket), 2715 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid, 2716 mpts->mpts_flags, MPTSF_BITS)); 2717done: 2718 MPT_UNLOCK(mp_tp); 2719 socket_unlock(so, 0); 2720 2721 return (ret); 2722} 2723 2724/* 2725 * Handle SO_FILT_HINT_MUSTRST subflow socket event 2726 */ 2727static ev_ret_t 2728mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts) 2729{ 2730 struct socket *mp_so, *so; 2731 struct mptcb *mp_tp; 2732 boolean_t linger; 2733 2734 2735 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 2736 MPTS_LOCK_ASSERT_HELD(mpts); 2737 VERIFY(mpte->mpte_mppcb != NULL); 2738 mp_so = mpte->mpte_mppcb->mpp_socket; 2739 mp_tp = mpte->mpte_mptcb; 2740 so = mpts->mpts_socket; 2741 2742 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) && 2743 !(mp_so->so_flags & SOF_PCBCLEARING)); 2744 2745 if (mpts->mpts_soerror == 0) 2746 mpts->mpts_soerror = ECONNABORTED; 2747 2748 so->so_error = ECONNABORTED; 2749 2750 /* We got an invalid option or a fast close */ 2751 socket_lock(so, 0); 2752 struct tcptemp *t_template; 2753 struct inpcb *inp = sotoinpcb(so); 2754 struct tcpcb *tp = NULL; 2755 2756 tp = intotcpcb(inp); 2757 2758 t_template = tcp_maketemplate(tp); 2759 if (t_template) { 2760 unsigned int ifscope, nocell = 0; 2761 2762 if (inp->inp_flags & INP_BOUND_IF) 2763 ifscope = inp->inp_boundifp->if_index; 2764 else 2765 ifscope = IFSCOPE_NONE; 2766 2767 if (inp->inp_flags & INP_NO_IFT_CELLULAR) 2768 nocell = 1; 2769 2770 
tcp_respond(tp, t_template->tt_ipgen, 2771 &t_template->tt_t, (struct mbuf *)NULL, 2772 tp->rcv_nxt, tp->snd_una, TH_RST, ifscope, nocell); 2773 (void) m_free(dtom(t_template)); 2774 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d \n", 2775 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 2776 so, mpts->mpts_connid)); 2777 } 2778 socket_unlock(so, 0); 2779 mptcp_subflow_disconnect(mpte, mpts, !linger); 2780 MPTS_UNLOCK(mpts); 2781 2782 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED); 2783 2784 MPT_LOCK(mp_tp); 2785 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { 2786 mp_so->so_error = ECONNABORTED; 2787 } 2788 MPT_UNLOCK(mp_tp); 2789 2790 MPTS_LOCK(mpts); 2791 /* 2792 * Keep the subflow socket around unless the subflow has been 2793 * disconnected explicitly. 2794 */ 2795 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE); 2796} 2797 2798static const char * 2799mptcp_evret2str(ev_ret_t ret) 2800{ 2801 const char *c = "UNKNOWN"; 2802 2803 switch (ret) { 2804 case MPTS_EVRET_DELETE: 2805 c = "MPTS_EVRET_DELETE"; 2806 break; 2807 case MPTS_EVRET_CONNECT_PENDING: 2808 c = "MPTS_EVRET_CONNECT_PENDING"; 2809 break; 2810 case MPTS_EVRET_DISCONNECT_FALLBACK: 2811 c = "MPTS_EVRET_DISCONNECT_FALLBACK"; 2812 break; 2813 case MPTS_EVRET_OK: 2814 c = "MPTS_EVRET_OK"; 2815 break; 2816 case MPTS_EVRET_OK_UPDATE: 2817 c = "MPTS_EVRET_OK_UPDATE"; 2818 break; 2819 } 2820 return (c); 2821} 2822 2823/* 2824 * Add a reference to a subflow structure; used by MPTS_ADDREF(). 
 */
void
mptcp_subflow_addref(struct mptsub *mpts, int locked)
{
	/* caller indicates via `locked' whether it already holds the lock */
	if (!locked)
		MPTS_LOCK(mpts);
	else
		MPTS_LOCK_ASSERT_HELD(mpts);

	/* refcnt is unsigned; reaching 0 after ++ means it wrapped around */
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (!locked)
		MPTS_UNLOCK(mpts);
}

/*
 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
 */
void
mptcp_subflow_remref(struct mptsub *mpts)
{
	MPTS_LOCK(mpts);
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0) {
		MPTS_UNLOCK(mpts);
		return;
	}
	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}

/*
 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 */
int
mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	char buf[32];		/* scratch space for mptcp_sopt2str() */
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	/* the option is being applied now; it is no longer pending */
	mpo->mpo_flags &= ~MPOF_INTERIM;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/* build an in-kernel sockopt pointing at the cached option value */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_SET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sosetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
		    "val %d set successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval));
	} else {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s "
		    "val %d set error %d\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval, error));
	}
	return (error);
}

/*
 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
 * caller must ensure that the option can be issued on subflow sockets, via
 * MPOF_SUBFLOW_OK flag.
 */
int
mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
    struct mptopt *mpo)
{
	struct socket *mp_so;
	struct sockopt sopt;
	char buf[32];		/* scratch space for mptcp_sopt2str() */
	int error;

	VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	mp_so = mpte->mpte_mppcb->mpp_socket;

	/* build an in-kernel sockopt; result lands in mpo->mpo_intval */
	bzero(&sopt, sizeof (sopt));
	sopt.sopt_dir = SOPT_GET;
	sopt.sopt_level = mpo->mpo_level;
	sopt.sopt_name = mpo->mpo_name;
	sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
	sopt.sopt_valsize = sizeof (int);
	sopt.sopt_p = kernproc;

	error = sogetoptlock(so, &sopt, 0);	/* already locked */
	if (error == 0) {
		mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
		    "val %d get successful\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
		    buf, sizeof (buf)), mpo->mpo_intval));
	} else {
		mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s get error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mptcp_sopt2str(mpo->mpo_level,
		    mpo->mpo_name, buf, sizeof (buf)), error));
	}
	return (error);
}


/*
 * MPTCP garbage collector.
 *
 * This routine is called by the MP domain on-demand, periodic callout,
 * which is triggered when a MPTCP socket is closed.  The callout will
 * repeat as long as this routine returns a non-zero value.
 */
static uint32_t
mptcp_gc(struct mppcbinfo *mppi)
{
	struct mppcb *mpp, *tmpp;
	uint32_t active = 0;	/* count of PCBs still needing GC attention */

	lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);

	mptcplog3((LOG_DEBUG, "%s: running\n", __func__));

	TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
		struct socket *mp_so;
		struct mptses *mpte;
		struct mptcb *mp_tp;

		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mp_so = mpp->mpp_socket;
		VERIFY(mp_so != NULL);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);

		mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx found "
		    "(u=%d,r=%d,s=%d)\n", __func__,
		    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
		    mp_so->so_retaincnt, mpp->mpp_state));

		/*
		 * Try-lock only: never block the GC behind a busy PCB;
		 * a contended PCB is retried on the next callout.
		 */
		if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "(u=%d,r=%d)\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			active++;
			continue;
		}

		/* check again under the lock */
		if (mp_so->so_usecount > 1) {
			boolean_t wakeup = FALSE;
			struct mptsub *mpts, *tmpts;

			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d] %d %d\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mp_tp->mpt_gc_ticks,
			    mp_tp->mpt_state));
			MPT_LOCK(mp_tp);
			/*
			 * Once the connection is winding down, count down
			 * the grace ticks; when they expire, force the
			 * remaining subflows to disconnect below.
			 */
			if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
				if (mp_tp->mpt_gc_ticks > 0)
					mp_tp->mpt_gc_ticks--;
				if (mp_tp->mpt_gc_ticks == 0) {
					wakeup = TRUE;
					if (mp_tp->mpt_localkey != NULL) {
						mptcp_free_key(
						    mp_tp->mpt_localkey);
						mp_tp->mpt_localkey = NULL;
					}
				}
			}
			MPT_UNLOCK(mp_tp);
			if (wakeup) {
				/* time is up: tear down each subflow */
				TAILQ_FOREACH_SAFE(mpts,
				    &mpte->mpte_subflows, mpts_entry, tmpts) {
					MPTS_LOCK(mpts);
					mpts->mpts_flags |= MPTSF_DELETEOK;
					if (mpts->mpts_soerror == 0)
						mpts->mpts_soerror = ETIMEDOUT;
					mptcp_subflow_eupcall(mpts->mpts_socket,
					    mpts, SO_FILT_HINT_DISCONNECTED);
					MPTS_UNLOCK(mpts);
				}
			}
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		if (mpp->mpp_state != MPPCB_STATE_DEAD) {
			mptcplog3((LOG_DEBUG, "%s: mp_so 0x%llx skipped "
			    "[u=%d,r=%d,s=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt,
			    mpp->mpp_state));
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		/*
		 * The PCB has been detached, and there is exactly 1 refcnt
		 * held by the MPTCP thread. Signal that thread to terminate,
		 * after which the last refcnt will be released. That will
		 * allow it to be destroyed below during the next round.
		 */
		if (mp_so->so_usecount == 1) {
			mptcplog2((LOG_DEBUG, "%s: mp_so 0x%llx scheduled for "
			    "termination [u=%d,r=%d]\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mp_so->so_usecount, mp_so->so_retaincnt));
			/* signal MPTCP thread to terminate */
			mptcp_thread_terminate_signal(mpte);
			lck_mtx_unlock(&mpp->mpp_lock);
			active++;
			continue;
		}

		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mp_so->so_usecount, mp_so->so_retaincnt));
		DTRACE_MPTCP4(dispose, struct socket *, mp_so,
		    struct sockbuf *, &mp_so->so_rcv,
		    struct sockbuf *, &mp_so->so_snd,
		    struct mppcb *, mpp);

		mp_pcbdispose(mpp);
	}

	return (active);
}

/*
 * Drop a MPTCP connection, reporting the specified error.
 */
struct mptses *
mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
{
	struct socket *mp_so;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;

	mp_tp->mpt_state = MPTCPS_CLOSED;
	DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
	    uint32_t, 0 /* event */);

	/* prefer a previously recorded soft error over a bare timeout */
	if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
		errno = mp_tp->mpt_softerror;
	mp_so->so_error = errno;

	return (mptcp_close(mpte, mp_tp));
}

/*
 * Close a MPTCP control block.  Called and returns with the MP socket
 * lock and mp_tp lock held; the mp_tp lock is dropped and re-taken
 * around the socket-layer and per-subflow work (on the MPTCPF_PEEL_OFF
 * path the early return happens with the mp_tp lock held as well).
 */
struct mptses *
mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;

	MPTE_LOCK_ASSERT_HELD(mpte);	/* same as MP socket lock */
	MPT_LOCK_ASSERT_HELD(mp_tp);
	VERIFY(mpte->mpte_mptcb == mp_tp);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	/* the local key is no longer needed once the connection closes */
	if (mp_tp->mpt_localkey != NULL) {
		mptcp_free_key(mp_tp->mpt_localkey);
		mp_tp->mpt_localkey = NULL;
	}

	MPT_UNLOCK(mp_tp);
	soisdisconnected(mp_so);

	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
		/* peeled-off: subflow cleanup is not ours to do */
		return (NULL);
	}
	MPT_UNLOCK(mp_tp);

	/* Clean up all subflows */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		mptcp_subflow_disconnect(mpte, mpts, TRUE);
		MPTS_UNLOCK(mpts);
		mptcp_subflow_del(mpte, mpts, TRUE);
	}
	MPT_LOCK(mp_tp);

	return (NULL);
}

/* Post a disconnect event on the MP socket. */
void
mptcp_notify_close(struct socket *so)
{
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
}

/*
 * Signal MPTCP thread to wake up.
 */
void
mptcp_thread_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	mptcp_thread_signal_locked(mpte);
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * Signal MPTCP thread to wake up (locked version)
 */
static void
mptcp_thread_signal_locked(struct mptses *mpte)
{
	lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

	/* bump the request count; wake the thread only if it is idle */
	mpte->mpte_thread_reqs++;
	if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
		wakeup_one((caddr_t)&mpte->mpte_thread);
}

/*
 * Signal MPTCP thread to terminate.  Clearing mpte_thread is the
 * termination indication that mptcp_thread_func() checks for.
 */
static void
mptcp_thread_terminate_signal(struct mptses *mpte)
{
	lck_mtx_lock(&mpte->mpte_thread_lock);
	if (mpte->mpte_thread != THREAD_NULL) {
		mpte->mpte_thread = THREAD_NULL;
		mpte->mpte_thread_reqs++;
		if (!mpte->mpte_thread_active)
			wakeup_one((caddr_t)&mpte->mpte_thread);
	}
	lck_mtx_unlock(&mpte->mpte_thread_lock);
}

/*
 * MPTCP thread workloop.
 */
static void
mptcp_thread_dowork(struct mptses *mpte)
{
	struct socket *mp_so;
	struct mptsub *mpts, *tmpts;
	boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
	boolean_t conninfo_update = FALSE;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_mppcb != NULL);
	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);

	/* Pass 1: per-subflow input/output and event collection */
	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		ev_ret_t ret;

		MPTS_LOCK(mpts);
		MPTS_ADDREF_LOCKED(mpts);	/* for us */

		/* Update process ownership based on parent mptcp socket */
		mptcp_update_last_owner(mpts, mp_so);

		mptcp_subflow_input(mpte, mpts);
		ret = mptcp_subflow_events(mpte, mpts);

		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			mptcplog3((LOG_INFO, "%s: cid %d \n", __func__,
			    mpts->mpts_connid));
			(void) mptcp_subflow_output(mpte, mpts);
		}

		/*
		 * If MPTCP socket is closed, disconnect all subflows.
		 * This will generate a disconnect event which will
		 * be handled during the next iteration, causing a
		 * non-zero error to be returned above.
		 */
		if (mp_so->so_flags & SOF_PCBCLEARING)
			mptcp_subflow_disconnect(mpte, mpts, FALSE);
		MPTS_UNLOCK(mpts);

		switch (ret) {
		case MPTS_EVRET_OK_UPDATE:
			conninfo_update = TRUE;
			break;
		case MPTS_EVRET_OK:
			/* nothing to do */
			break;
		case MPTS_EVRET_DELETE:
			if (mptcp_delete_ok(mpte, mpts)) {
				mptcp_subflow_del(mpte, mpts, TRUE);
			}
			break;
		case MPTS_EVRET_CONNECT_PENDING:
			connect_pending = TRUE;
			break;
		case MPTS_EVRET_DISCONNECT_FALLBACK:
			disconnect_fallback = TRUE;
			break;
		}
		MPTS_REMREF(mpts);		/* ours */
	}

	if (conninfo_update) {
		soevent(mp_so, SO_FILT_HINT_LOCKED |
		    SO_FILT_HINT_CONNINFO_UPDATED);
	}

	/* Pass 2 only runs when an event above requires it */
	if (!connect_pending && !disconnect_fallback) {
		MPTE_UNLOCK(mpte);
		return;
	}

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		MPTS_LOCK(mpts);
		if (disconnect_fallback) {
			struct socket *so = NULL;
			struct inpcb *inp = NULL;
			struct tcpcb *tp = NULL;

			if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
				MPTS_UNLOCK(mpts);
				continue;
			}

			mpts->mpts_flags |= MPTSF_MP_DEGRADED;

			if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
			    MPTSF_DISCONNECTED)) {
				MPTS_UNLOCK(mpts);
				continue;
			}
			so = mpts->mpts_socket;

			/*
			 * The MPTCP connection has degraded to a fallback
			 * mode, so there is no point in keeping this subflow
			 * regardless of its MPTCP-readiness state, unless it
			 * is the primary one which we use for fallback. This
			 * assumes that the subflow used for fallback is the
			 * ACTIVE one.
			 */

			socket_lock(so, 1);
			inp = sotoinpcb(so);
			tp = intotcpcb(inp);
			tp->t_mpflags &=
			    ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
			tp->t_mpflags |= TMPF_TCP_FALLBACK;
			if (mpts->mpts_flags & MPTSF_ACTIVE) {
				/* keep the active subflow for fallback */
				socket_unlock(so, 1);
				MPTS_UNLOCK(mpts);
				continue;
			}
			tp->t_mpflags |= TMPF_RESET;
			soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
			socket_unlock(so, 1);

		} else if (connect_pending) {
			/*
			 * The MPTCP connection has progressed to a state
			 * where it supports full multipath semantics; allow
			 * additional joins to be attempted for all subflows
			 * that are in the PENDING state.
			 */
			if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
				(void) mptcp_subflow_soconnectx(mpte, mpts);
			}
		}
		MPTS_UNLOCK(mpts);
	}

	MPTE_UNLOCK(mpte);
}

/*
 * MPTCP thread.
 */
static void
mptcp_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct mptses *mpte = v;
	struct timespec *ts = NULL;

	VERIFY(mpte != NULL);

	lck_mtx_lock_spin(&mpte->mpte_thread_lock);

	for (;;) {
		lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);

		/* sleep until signalled, unless termination was requested */
		if (mpte->mpte_thread != THREAD_NULL) {
			(void) msleep(&mpte->mpte_thread,
			    &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
			    __func__, ts);
		}

		/* MPTCP socket is closed? */
		if (mpte->mpte_thread == THREAD_NULL) {
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			/* callee will destroy thread lock */
			mptcp_thread_destroy(mpte);
			/* NOTREACHED */
			return;
		}

		mpte->mpte_thread_active = 1;
		for (;;) {
			uint32_t reqs = mpte->mpte_thread_reqs;

			/* drop the spin lock while doing real work */
			lck_mtx_unlock(&mpte->mpte_thread_lock);
			mptcp_thread_dowork(mpte);
			lck_mtx_lock_spin(&mpte->mpte_thread_lock);

			/* if there's no pending request, we're done */
			if (reqs == mpte->mpte_thread_reqs ||
			    mpte->mpte_thread == THREAD_NULL)
				break;
		}
		mpte->mpte_thread_reqs = 0;
		mpte->mpte_thread_active = 0;
	}
}

/*
 * Destroy a MPTCP thread, to be called in the MPTCP thread context
 * upon receiving an indication to self-terminate. This routine
 * will not return, as the current thread is terminated at the end.
 */
static void
mptcp_thread_destroy(struct mptses *mpte)
{
	struct socket *mp_so;

	MPTE_LOCK(mpte);		/* same as MP socket lock */
	VERIFY(mpte->mpte_thread == THREAD_NULL);
	VERIFY(mpte->mpte_mppcb != NULL);

	mptcp_sesdestroy(mpte);

	mp_so = mpte->mpte_mppcb->mpp_socket;
	VERIFY(mp_so != NULL);
	VERIFY(mp_so->so_usecount != 0);
	mp_so->so_usecount--;		/* for thread */
	mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
	MPTE_UNLOCK(mpte);

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());
	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}

/*
 * Protocol pr_lock callback.
 */
int
mptcp_lock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	/* record the caller for lock debugging history */
	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
		    mp_so, lr_saved, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_lock(&mpp->mpp_lock);

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (refcount != 0)
		mp_so->so_usecount++;
	mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
	mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;

	return (0);
}

/*
 * Protocol pr_unlock callback.
 */
int
mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
{
	struct mppcb *mpp = sotomppcb(mp_so);
	void *lr_saved;

	if (lr == NULL)
		lr_saved = __builtin_return_address(0);
	else
		lr_saved = lr;

	if (mpp == NULL) {
		panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, lr_saved,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);

	if (refcount != 0)
		mp_so->so_usecount--;

	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
	mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
	lck_mtx_unlock(&mpp->mpp_lock);

	return (0);
}

/*
 * Protocol pr_getlock callback.
 */
lck_mtx_t *
mptcp_getlock(struct socket *mp_so, int locktype)
{
#pragma unused(locktype)
	struct mppcb *mpp = sotomppcb(mp_so);

	if (mpp == NULL) {
		panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
		    solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	if (mp_so->so_usecount < 0) {
		panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
		    mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
		/* NOTREACHED */
	}
	return (&mpp->mpp_lock);
}

/*
 * Key generation functions
 */
static void
mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
{
	struct mptcp_key_entry *key_elm;
try_again:
	/* draw a non-zero random key, then derive its SHA1 digest */
	read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
	if (key_entry->mkey_value == 0)
		goto try_again;
	mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
	    sizeof (key_entry->mkey_digest));

	/*
	 * Retry if either the key or the first 32 bits of its digest
	 * (the token) collide with an existing pool entry.
	 */
	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_value == key_entry->mkey_value) {
			goto try_again;
		}
		if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
		    0) {
			goto try_again;
		}
	}
}

/*
 * Reserve a key from the pool, allocating a fresh entry if no free
 * one is available.  Returns a pointer to the key value embedded in
 * its pool entry (see mptcp_get_stored_digest / mptcp_free_key).
 */
static mptcp_key_t *
mptcp_reserve_key(void)
{
	struct mptcp_key_entry *key_elm;
	struct mptcp_key_entry *found_elm = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
		if (key_elm->mkey_flags == MKEYF_FREE) {
			key_elm->mkey_flags = MKEYF_INUSE;
			found_elm = key_elm;
			break;
		}
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);

	if (found_elm) {
		return (&found_elm->mkey_value);
	}

	/* no free entry: allocate and insert a brand-new one */
	key_elm = (struct mptcp_key_entry *)
	    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
	key_elm->mkey_flags = MKEYF_INUSE;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	mptcp_generate_unique_key(key_elm);
	LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
	mptcp_keys_pool.mkph_count += 1;
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (&key_elm->mkey_value);
}

/*
 * Return the cached SHA1 digest of an in-use key by backing up from
 * the key pointer to its enclosing pool entry.
 */
static caddr_t
mptcp_get_stored_digest(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	caddr_t digest = NULL;

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	if (key_holder->mkey_flags != MKEYF_INUSE)
		panic_plain("%s", __func__);
	digest = &key_holder->mkey_digest[0];
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
	return (digest);
}

/*
 * Release a key back to the pool; half of the time the entry is freed
 * outright, otherwise it is re-inserted at a random position so that a
 * recently used key is not immediately handed out again.
 */
void
mptcp_free_key(mptcp_key_t *key)
{
	struct mptcp_key_entry *key_holder;
	struct mptcp_key_entry *key_elm;
	int pt = RandomULong();

	mptcplog((LOG_INFO, "%s\n", __func__));

	lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
	key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
	    offsetof(struct mptcp_key_entry, mkey_value));
	key_holder->mkey_flags = MKEYF_FREE;

	LIST_REMOVE(key_holder, mkey_next);
	mptcp_keys_pool.mkph_count -= 1;

	/* Free half the time */
	if (pt & 0x01) {
		zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
	} else {
		/* Insert it at random point to avoid early reuse */
		int i = 0;
		if (mptcp_keys_pool.mkph_count > 1) {
			pt = pt % (mptcp_keys_pool.mkph_count - 1);
			LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
				if (++i >= pt) {
					LIST_INSERT_AFTER(key_elm, key_holder,
					    mkey_next);
					break;
				}
			}
			if (i < pt)
				panic("missed insertion");
		} else {
			LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
			    mkey_next);
		}
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
}

/* One-time setup of the key pool zone and its preallocated entries. */
static void
mptcp_key_pool_init(void)
{
	int i;
	struct mptcp_key_entry *key_entry;

	LIST_INIT(&mptcp_keys_pool);
	mptcp_keys_pool.mkph_count = 0;

	mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
	    (sizeof (struct mptcp_key_entry));
	mptcp_keys_pool.mkph_key_entry_zone = zinit(
	    mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
	    MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
	if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
		panic("%s: unable to allocate MPTCP keys zone \n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
	zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);

	/* prime the pool with a batch of free, unique keys */
	for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
		key_entry = (struct mptcp_key_entry *)
		    zalloc(mptcp_keys_pool.mkph_key_entry_zone);
		key_entry->mkey_flags = MKEYF_FREE;
		mptcp_generate_unique_key(key_entry);
		LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
		mptcp_keys_pool.mkph_count += 1;
	}
	lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);
}

/*
 * MPTCP Join support
 */

/*
 * Bind a subflow's tcpcb to the MPTCP control block and allocate its
 * per-subflow authentication entry (address ID + local random).
 */
static void
mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
    connid_t conn_id)
{
	struct tcpcb *tp = sototcpcb(so);
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK_SPIN(mp_tp);
	tp->t_mptcb = mp_tp;
	MPT_UNLOCK(mp_tp);
	/*
	 * As long as the mpts_connid is unique it can be used as the
	 * address ID for additional subflows.
	 * The address ID of the first flow is implicitly 0.
	 */
	if (mp_tp->mpt_state == MPTCPS_CLOSED) {
		tp->t_local_aid = 0;
	} else {
		tp->t_local_aid = conn_id;
		tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
		so->so_flags |= SOF_MP_SEC_SUBFLOW;
	}
	sauth_entry = zalloc(mpt_subauth_zone);
	sauth_entry->msae_laddr_id = tp->t_local_aid;
	sauth_entry->msae_raddr_id = 0;
	sauth_entry->msae_raddr_rand = 0;
try_again:
	/* the local random must be non-zero (0 means "unset") */
	sauth_entry->msae_laddr_rand = RandomULong();
	if (sauth_entry->msae_laddr_rand == 0)
		goto try_again;
	LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
}

/*
 * Undo mptcp_attach_to_subf(): remove and free the subflow's auth
 * entry (if present) and detach the tcpcb from the MPTCP PCB.
 */
static void
mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	struct tcpcb *tp = sototcpcb(so);
	int found = 0;

	if (tp == NULL)
		return;

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
			found = 1;
			break;
		}
	}
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
	tp->t_mptcb = NULL;
	MPT_UNLOCK(mp_tp);
}

/*
 * Look up the local and remote randoms recorded for a given local
 * address ID; either output pointer may be NULL.
 */
void
mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
    u_int32_t *rrand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == addr_id) {
			if (lrand)
				*lrand = sauth_entry->msae_laddr_rand;
			if (rrand)
				*rrand = sauth_entry->msae_raddr_rand;
			break;
		}
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Record the peer's address ID and random for the subflow identified
 * by laddr_id, rejecting mismatched duplicates.
 */
void
mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
    mptcp_addr_id raddr_id, u_int32_t raddr_rand)
{
	struct mptcp_subf_auth_entry *sauth_entry;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	MPT_LOCK(mp_tp);
	LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
		if (sauth_entry->msae_laddr_id == laddr_id) {
			/* a different remote address ID was already set */
			if ((sauth_entry->msae_raddr_id != 0) &&
			    (sauth_entry->msae_raddr_id != raddr_id)) {
				mptcplog((LOG_ERR, "MPTCP ERROR %s: mismatched"
				    " address ids %d %d \n", __func__, raddr_id,
				    sauth_entry->msae_raddr_id));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_id = raddr_id;
			/* a different remote random was already recorded */
			if ((sauth_entry->msae_raddr_rand != 0) &&
			    (sauth_entry->msae_raddr_rand != raddr_rand)) {
				mptcplog((LOG_ERR, "%s: dup SYN_ACK %d %d \n",
				    __func__, raddr_rand,
				    sauth_entry->msae_raddr_rand));
				MPT_UNLOCK(mp_tp);
				return;
			}
			sauth_entry->msae_raddr_rand = raddr_rand;
			MPT_UNLOCK(mp_tp);
			return;
		}
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * SHA1 support for MPTCP
 */
static int
mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
{
	SHA1_CTX sha1ctxt;
	const unsigned char *sha1_base;
	int sha1_size;

	/* output buffer must hold a full SHA1 digest */
	if (digest_len != SHA1_RESULTLEN) {
		return (FALSE);
	}

	sha1_base = (const unsigned char *) key;
	sha1_size = sizeof (mptcp_key_t);
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, sha1_base, sha1_size);
	SHA1Final(sha_digest, &sha1ctxt);
	return (TRUE);
}

/*
 * HMAC-SHA1 over the two 32-bit randoms, keyed by the concatenation of
 * the two 64-bit keys (the 16-byte key fits in one 64-byte SHA1 block,
 * so no pre-hashing of the key is needed).
 */
void
mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
    u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
{
	SHA1_CTX sha1ctxt;
	mptcp_key_t key_ipad[8] = {0};	/* key XOR'd with inner pad */
	mptcp_key_t key_opad[8] = {0};	/* key XOR'd with outer pad */
	u_int32_t data[2];
	int i;

	bzero(digest, digest_len);

	/* Set up the Key for HMAC */
	key_ipad[0] = key1;
	key_ipad[1] = key2;

	key_opad[0] = key1;
	key_opad[1] = key2;

	/* Set up the message for HMAC */
	data[0] = rand1;
	data[1] = rand2;

	/* Key is 512 block length, so no need to compute hash */

	/* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */

	for (i = 0; i < 8; i++) {
		key_ipad[i] ^= 0x3636363636363636;
		key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
	}

	/* Perform inner SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
	SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
	SHA1Final(digest, &sha1ctxt);

	/* Perform outer SHA1 */
	SHA1Init(&sha1ctxt);
	SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
	SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
	SHA1Final(digest, &sha1ctxt);
}

/*
 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
 */
void
mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
    int digest_len)
{
	uint32_t lrand, rrand;
	mptcp_key_t localkey, remotekey;
	MPT_LOCK_ASSERT_NOTHELD(mp_tp);

	if (digest_len != SHA1_RESULTLEN)
		return;

	lrand = rrand = 0;
	mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
	/* snapshot the keys under the lock, compute the HMAC outside it */
	MPT_LOCK_SPIN(mp_tp);
	localkey = *mp_tp->mpt_localkey;
	remotekey = mp_tp->mpt_remotekey;
	MPT_UNLOCK(mp_tp);
	mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
	    digest_len);
}

/* Return the leftmost 64 bits of the subflow HMAC. */
u_int64_t
mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
{
	u_char digest[SHA1_RESULTLEN];
	u_int64_t trunced_digest;

	mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
	bcopy(digest, &trunced_digest, 8);
	return (trunced_digest);
}

/*
 * Authentication data generation
 */
int
mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
    int token_len)
{
	VERIFY(token_len == sizeof (u_int32_t));
	VERIFY(sha_digest_len ==
SHA1_RESULTLEN); 3879 3880 /* Most significant 32 bits of the SHA1 hash */ 3881 bcopy(sha_digest, token, sizeof (u_int32_t)); 3882 return (TRUE); 3883} 3884 3885int 3886mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn, 3887 int idsn_len) 3888{ 3889 VERIFY(idsn_len == sizeof (u_int64_t)); 3890 VERIFY(sha_digest_len == SHA1_RESULTLEN); 3891 3892 /* 3893 * Least significant 64 bits of the SHA1 hash 3894 */ 3895 3896 idsn[7] = sha_digest[12]; 3897 idsn[6] = sha_digest[13]; 3898 idsn[5] = sha_digest[14]; 3899 idsn[4] = sha_digest[15]; 3900 idsn[3] = sha_digest[16]; 3901 idsn[2] = sha_digest[17]; 3902 idsn[1] = sha_digest[18]; 3903 idsn[0] = sha_digest[19]; 3904 return (TRUE); 3905} 3906 3907static int 3908mptcp_init_authparms(struct mptcb *mp_tp) 3909{ 3910 caddr_t local_digest = NULL; 3911 char remote_digest[MPTCP_SHA1_RESULTLEN]; 3912 MPT_LOCK_ASSERT_HELD(mp_tp); 3913 3914 /* Only Version 0 is supported for auth purposes */ 3915 if (mp_tp->mpt_version != MP_DRAFT_VERSION_12) 3916 return (-1); 3917 3918 /* Setup local and remote tokens and Initial DSNs */ 3919 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey); 3920 mptcp_generate_token(local_digest, SHA1_RESULTLEN, 3921 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken)); 3922 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN, 3923 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t)); 3924 3925 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest, 3926 SHA1_RESULTLEN)) { 3927 mptcplog((LOG_ERR, "MPTCP ERROR %s: unexpected failure", 3928 __func__)); 3929 return (-1); 3930 } 3931 mptcp_generate_token(remote_digest, SHA1_RESULTLEN, 3932 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_localtoken)); 3933 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN, 3934 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t)); 3935 return (0); 3936} 3937 3938static void 3939mptcp_init_statevars(struct mptcb *mp_tp) 3940{ 3941 MPT_LOCK_ASSERT_HELD(mp_tp); 3942 3943 /* The subflow SYN is 
	   also first MPTCP byte */
	mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
	mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;

	mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
}

/* Initialize per-connection MPTCP properties from global settings. */
static void
mptcp_conn_properties(struct mptcb *mp_tp)
{
	/* There is only Version 0 at this time */
	mp_tp->mpt_version = MP_DRAFT_VERSION_12;

	/* Set DSS checksum flag */
	if (mptcp_dss_csum)
		mp_tp->mpt_flags |= MPTCPF_CHECKSUM;

	/* Set up receive window */
	mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

	/* Set up gc ticks */
	mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
}

/*
 * Helper Functions
 */
mptcp_token_t
mptcp_get_localtoken(void* mptcb_arg)
{
	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
	return (mp_tp->mpt_localtoken);
}

mptcp_token_t
mptcp_get_remotetoken(void* mptcb_arg)
{
	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
	return (mp_tp->mpt_remotetoken);
}

u_int64_t
mptcp_get_localkey(void* mptcb_arg)
{
	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
	/* the local key may already have been released at close time */
	if (mp_tp->mpt_localkey != NULL)
		return (*mp_tp->mpt_localkey);
	else
		return (0);
}

u_int64_t
mptcp_get_remotekey(void* mptcb_arg)
{
	struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
	return (mp_tp->mpt_remotekey);
}

/*
 * Mark a subflow tcpcb to emit a DATA_FIN, unless the subflow is
 * already slated for reset.
 */
void
mptcp_send_dfin(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = NULL;

	inp = sotoinpcb(so);
	if (!inp)
		return;

	tp = intotcpcb(inp);
	if (!tp)
		return;

	if (!(tp->t_mpflags & TMPF_RESET))
		tp->t_mpflags |= TMPF_SEND_DFIN;
}

/*
 * Data Sequence Mapping routines
 */

/*
 * Stamp every mbuf of an outgoing chain with its DSN mapping and
 * advance the connection-level send max accordingly.
 */
void
mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
{
	struct mptcb *mp_tp;

	if (m == NULL)
		return;

	/* the MPTCP PCB is co-allocated with the mppcb (struct mpp_mtp) */
	mp_tp = &((struct mpp_mtp *)mpp)->mtcb;
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
		MPT_UNLOCK(mp_tp);
		panic("%s: data write before establishment.",
		    __func__);
		return;
	}

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
		m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
		m->m_pkthdr.mp_rlen = m_pktlen(m);
		mp_tp->mpt_sndmax += m_pktlen(m);
		m = m->m_next;
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Adjust the DSN mappings in an mbuf chain before dropping `len' bytes
 * from the head of the socket buffer, so the remaining mappings stay
 * aligned with the data that is left.
 */
void
mptcp_preproc_sbdrop(struct mbuf *m, unsigned int len)
{
	u_int32_t sub_len = 0;

	while (m) {
		VERIFY(m->m_flags & M_PKTHDR);

		if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
			sub_len = m->m_pkthdr.mp_rlen;

			if (sub_len < len) {
				/* entire mapping consumed by the drop */
				m->m_pkthdr.mp_dsn += sub_len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += sub_len;
				}
				m->m_pkthdr.mp_rlen = 0;
				len -= sub_len;
			} else {
				/* sub_len >= len */
				m->m_pkthdr.mp_dsn += len;
				if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
					m->m_pkthdr.mp_rseq += len;
				}
				mptcplog3((LOG_INFO,
				    "%s: %llu %u %d %d\n", __func__,
				    m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
				    m->m_pkthdr.mp_rlen, len));
				m->m_pkthdr.mp_rlen -= len;
				return;
			}
		} else {
			panic("%s: MPTCP tag not set", __func__);
			/* NOTREACHED */
		}
		m = m->m_next;
	}
}

/* Obtain the DSN mapping stored in the mbuf */
void
mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
    u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
{
	u_int64_t dsn64;

	/* 32-bit wrapper: compute the 64-bit mapping, return its low half */
	mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
	*dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
	*dsn64p = dsn64;
}

/*
 * Find the DSN mapping covering `off' bytes into the subflow's send
 * buffer, and report how much of `datalen' is covered by a contiguous
 * DSN run starting there.
 */
void
mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
    u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
{
	struct mbuf *m = so->so_snd.sb_mb;
	struct mbuf *mnext = NULL;
	uint32_t runlen = 0;
	u_int64_t dsn64;
	uint32_t contig_len = 0;

	if (m == NULL)
		return;

	if (off < 0)
		return;
	/*
	 * In the subflow socket, the DSN sequencing can be discontiguous,
	 * but the subflow sequence mapping is contiguous. Use the subflow
	 * sequence property to find the right mbuf and corresponding dsn
	 * mapping.
	 */

	while (m) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);

		if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
			off -= m->m_pkthdr.mp_rlen;
			m = m->m_next;
		} else {
			break;
		}
	}

	if (m == NULL) {
		panic("%s: bad offset", __func__);
		/* NOTREACHED */
	}

	dsn64 = m->m_pkthdr.mp_dsn + off;
	*dsn = dsn64;
	*relseq = m->m_pkthdr.mp_rseq + off;

	/*
	 * Now find the last contiguous byte and its length from
	 * start.
	 */
	runlen = m->m_pkthdr.mp_rlen - off;
	contig_len = runlen;

	/* If datalen does not span multiple mbufs, return */
	if (datalen <= runlen) {
		*data_len = min(datalen, UINT16_MAX);
		return;
	}

	mnext = m->m_next;
	while (datalen > runlen) {
		if (mnext == NULL) {
			panic("%s: bad datalen = %d, %d %d", __func__, datalen,
			    runlen, off);
			/* NOTREACHED */
		}
		VERIFY(mnext->m_flags & M_PKTHDR);
		VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);

		/*
		 * case A. contiguous DSN stream
		 * case B. discontiguous DSN stream
		 */
		if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
			/* case A */
			runlen += mnext->m_pkthdr.mp_rlen;
			contig_len += mnext->m_pkthdr.mp_rlen;
			mptcplog3((LOG_INFO, "%s: contig \n",
			    __func__));
		} else {
			/* case B */
			mptcplog((LOG_INFO, "%s: discontig %d %d \n",
			    __func__, datalen, contig_len));
			break;
		}
		mnext = mnext->m_next;
	}
	datalen = min(datalen, UINT16_MAX);
	*data_len = min(datalen, contig_len);
	mptcplog3((LOG_INFO, "%s: %llu %u %d %d \n", __func__,
	    *dsn, *relseq, *data_len, off));
}

/*
 * MPTCP's notion of the next insequence Data Sequence number is adjusted
 * here. It must be called from mptcp_adj_rmap() which is called only after
 * reassembly of out of order data. The rcvnxt variable must
 * be updated only when atleast some insequence new data is received.
 */
static void
mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
{
	struct mptcb *mp_tp = tptomptp(tp);

	if (mp_tp == NULL)
		return;
	MPT_LOCK(mp_tp);
	/* advance rcvnxt only if this mapping overlaps it */
	if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
	    (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
	    m->m_pkthdr.mp_rlen)))) {
		mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
	}
	MPT_UNLOCK(mp_tp);
}

/*
 * Note that this is called only from tcp_input() which may trim data
 * after the dsn mapping is inserted into the mbuf. When it trims data
 * tcp_input calls m_adj() which does not remove the m_pkthdr even if the
 * m_len becomes 0 as a result of trimming the mbuf. The dsn map insertion
 * cannot be delayed after trim, because data can be in the reassembly
 * queue for a while and the DSN option info in tp will be overwritten for
 * every new packet received.
 * The dsn map will be adjusted just prior to appending to subflow sockbuf
 * with mptcp_adj_rmap()
 */
void
mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
{
	VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));

	/*
	 * Copy the most recently received DSS mapping (stashed in the
	 * tcpcb by option processing) into the mbuf's packet header,
	 * then consume the TMPF_EMBED_DSN hand-off flag.
	 */
	if (tp->t_mpflags & TMPF_EMBED_DSN) {
		VERIFY(m->m_flags & M_PKTHDR);
		m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
		m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
		m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
		m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
		tp->t_mpflags &= ~TMPF_EMBED_DSN;
		tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
	}
}

/*
 * Reconcile the mbuf's DSS mapping with any trimming done by tcp_input()
 * after the mapping was inserted, then advance the MPTCP-level rcvnxt.
 * An mbuf carrying data without a mapping triggers a fallback
 * notification instead.
 */
void
mptcp_adj_rmap(struct socket *so, struct mbuf *m)
{
	u_int64_t dsn;
	u_int32_t sseq, datalen;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	u_int32_t old_rcvnxt = 0;

	if (m_pktlen(m) == 0)
		return;

	if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
		VERIFY(m->m_flags & M_PKTHDR);

		dsn = m->m_pkthdr.mp_dsn;
		/* mp_rseq is relative; convert to absolute subflow seq */
		sseq = m->m_pkthdr.mp_rseq + tp->irs;
		datalen = m->m_pkthdr.mp_rlen;
	} else {
		/* data arrived without a DSS option mapping */
		mptcp_notify_mpfail(so);
		return;
	}

	/* In the common case, data is in window and in sequence */
	if (m->m_pkthdr.len == (int)datalen) {
		mptcp_adj_rcvnxt(tp, m);
		return;
	}

	if (m->m_pkthdr.len > (int)datalen) {
		panic("%s: mbuf len = %d expected = %d", __func__,
		    m->m_pkthdr.len, datalen);
	}

	old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
	if (SEQ_GT(old_rcvnxt, sseq)) {
		/* data trimmed from the left */
		int off = old_rcvnxt - sseq;
		m->m_pkthdr.mp_dsn += off;
		m->m_pkthdr.mp_rseq += off;
		m->m_pkthdr.mp_rlen -= off;
	} else if (old_rcvnxt == sseq) {
		/*
		 * Data was trimmed from the right
		 */
		m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
	} else {
		/* XXX handle gracefully with reass or fallback in January */
		panic("%s: partial map %u %u", __func__, old_rcvnxt, sseq);
		/* NOTREACHED */
	}
	mptcp_adj_rcvnxt(tp, m);

}

/*
 * Following routines help with failure detection and failover of data
 * transfer from one subflow to another.
 */

/*
 * React to a transmit failure on a subflow: mark the subflow socket for
 * failover (SOF_MP_TRYFAILOVER) and post an MPFAILOVER socket event.
 * No-op if a failover attempt is already pending on this subflow.
 */
void
mptcp_act_on_txfail(struct socket *so)
{
	struct tcpcb *tp = NULL;
	struct inpcb *inp = sotoinpcb(so);

	if (inp == NULL)
		return;

	tp = intotcpcb(inp);
	if (tp == NULL)
		return;

	if (tp->t_state != TCPS_ESTABLISHED)
		mptcplog((LOG_INFO, "%s: state = %d \n", __func__,
		    tp->t_state));

	if (so->so_flags & SOF_MP_TRYFAILOVER) {
		return;
	}

	so->so_flags |= SOF_MP_TRYFAILOVER;
	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
}

/*
 * Support for MP_FAIL option
 *
 * Search the subflow send buffer for the DSN mapping that covers
 * dsn_fail and return the corresponding relative subflow sequence
 * number through *tcp_seq.  Returns 0 on success, -1 when no mapping
 * covers the failed DSN (e.g. after a fallback to TCP).
 */
int
mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
{
	struct mbuf *m = so->so_snd.sb_mb;
	u_int64_t dsn;
	int off = 0;
	u_int32_t datalen;

	if (m == NULL)
		return (-1);

	while (m != NULL) {
		VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
		VERIFY(m->m_flags & M_PKTHDR);
		dsn = m->m_pkthdr.mp_dsn;
		datalen = m->m_pkthdr.mp_rlen;
		if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
		    (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
			off = dsn_fail - dsn;
			*tcp_seq = m->m_pkthdr.mp_rseq + off;
			return (0);
		}

		m = m->m_next;
	}

	/*
	 * If there was no mbuf data and a fallback to TCP occurred, there's
	 * not much else to do.
	 */

	mptcplog((LOG_ERR, "%s: %llu not found \n", __func__, dsn_fail));
	return (-1);
}

/*
 * Support for sending contiguous MPTCP bytes in subflow
 *
 * Returns the length (16-bit capped by the mapping lookup) of the
 * contiguous DSN run available at offset `off' in the subflow's send
 * buffer, so at most that much is sent under one DSS mapping.
 */
int32_t
mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
{
	u_int64_t mdss_dsn = 0;
	u_int32_t mdss_subflow_seq = 0;
	u_int16_t mdss_data_len = 0;

	if (len == 0)
		return (len);

	mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
	    &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);

	return (mdss_data_len);
}

/*
 * Compute the available space in the MPTCP socket's receive buffer:
 * the lesser of the free byte count and the free mbuf accounting,
 * clamped at zero.  Both the mptcb and the mptses locks must be held.
 */
int32_t
mptcp_sbspace(struct mptcb *mpt)
{
	struct sockbuf *sb;
	uint32_t rcvbuf;
	int32_t space;

	MPT_LOCK_ASSERT_HELD(mpt);
	MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);

	sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
	rcvbuf = sb->sb_hiwat;
	space = ((int32_t)imin((rcvbuf - sb->sb_cc),
	    (sb->sb_mbmax - sb->sb_mbcnt)));
	if (space < 0)
		space = 0;
	/* XXX check if it's too small? */

	return (space);
}

/*
 * Support Fallback to Regular TCP
 */

/*
 * Mark a subflow as usable for MPTCP again (clears TMPF_TCP_FALLBACK,
 * sets TMPF_MPTCP_READY) and post an MPSTATUS event.  No-op unless the
 * subflow negotiated MPTCP, and idempotent if already marked ready.
 */
void
mptcp_notify_mpready(struct socket *so)
{
	struct tcpcb *tp = NULL;

	if (so == NULL)
		return;

	tp = intotcpcb(sotoinpcb(so));

	if (tp == NULL)
		return;

	DTRACE_MPTCP4(multipath__ready, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
	    struct tcpcb *, tp);

	if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
		return;

	if (tp->t_mpflags & TMPF_MPTCP_READY)
		return;

	tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
	tp->t_mpflags |= TMPF_MPTCP_READY;

	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
}

/*
 * Mark a subflow as fallen back to regular TCP (clears the MPTCP
 * ready/true flags, sets TMPF_TCP_FALLBACK) and post an MPSTATUS event.
 * Idempotent if the subflow already fell back.
 */
void
mptcp_notify_mpfail(struct socket *so)
{
	struct tcpcb *tp = NULL;

	if (so == NULL)
		return;

	tp = intotcpcb(sotoinpcb(so));

	if (tp == NULL)
		return;

	DTRACE_MPTCP4(multipath__failed, struct socket *, so,
	    struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
	    struct tcpcb *, tp);

	if (tp->t_mpflags & TMPF_TCP_FALLBACK)
		return;

	tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
	tp->t_mpflags |= TMPF_TCP_FALLBACK;

	soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
}

/*
 * Keepalive helper function
 *
 * Keepalives are permitted only while the MPTCP connection is below
 * CLOSE_WAIT; once closing has begun this returns false.
 */
boolean_t
mptcp_ok_to_keepalive(struct mptcb *mp_tp)
{
	boolean_t ret = 1;
	VERIFY(mp_tp != NULL);
	MPT_LOCK(mp_tp);
	if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
		ret = 0;
	}
	MPT_UNLOCK(mp_tp);
	return (ret);
}

/*
 * MPTCP t_maxseg adjustment function
 *
 * Returns the number of bytes by which t_maxseg should be lowered to
 * leave room for the most common MPTCP DSS option, or 0 when the tcpcb
 * has no MPTCP control block.  Note the macro adds 2 in either branch:
 * 2 for the DSS checksum field, or 2 of padding to reach a 32-bit
 * boundary plus EOL.
 */
int
mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
{
	int mss_lower = 0;
	struct mptcb *mp_tp = tptomptp(tp);

#define MPTCP_COMPUTE_LEN { \
	mss_lower = sizeof (struct mptcp_dss_ack_opt); \
	MPT_LOCK(mp_tp); \
	if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
		mss_lower += 2; \
	else \
		/* adjust to 32-bit boundary + EOL */ \
		mss_lower += 2; \
	MPT_UNLOCK(mp_tp); \
}
	if (mp_tp == NULL)
		return (0);

	/*
	 * For the first subflow and subsequent subflows, adjust mss for
	 * most common MPTCP option size, for case where tcp_mss is called
	 * during option processing and MTU discovery.
	 */
	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
		MPTCP_COMPUTE_LEN;
	}

	if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
	    (tp->t_mpflags & TMPF_SENT_JOIN)) {
		MPTCP_COMPUTE_LEN;
	}

	if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
		MPTCP_COMPUTE_LEN;
	}

	return (mss_lower);
}

/*
 * Update the pid, upid, uuid of the subflow so, based on parent so
 */
void
mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
{
	struct socket *subflow_so = mpts->mpts_socket;

	MPTS_LOCK_ASSERT_HELD(mpts);

	socket_lock(subflow_so, 0);
	if ((subflow_so->last_pid != parent_mpso->last_pid) ||
	    (subflow_so->last_upid != parent_mpso->last_upid)) {
		subflow_so->last_upid = parent_mpso->last_upid;
		subflow_so->last_pid = parent_mpso->last_pid;
		uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
	}
	so_update_policy(subflow_so);
	socket_unlock(subflow_so, 0);
}

/*
 * Populate an mptcp_flow_t with the connection info, endpoint
 * addresses/ports, flags and connection id of the given subflow.
 */
static void
fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
{
	struct inpcb *inp;

	tcp_getconninfo(so, &flow->flow_ci);
	inp = sotoinpcb(so);
#if INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		flow->flow_src.ss_family = AF_INET6;
		flow->flow_dst.ss_family = AF_INET6;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
		SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
		SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
		SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
		SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
	} else
#endif
	{
		flow->flow_src.ss_family = AF_INET;
		flow->flow_dst.ss_family = AF_INET;
		flow->flow_src.ss_len = sizeof(struct sockaddr_in);
		flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
		SIN(&flow->flow_src)->sin_port = inp->inp_lport;
		SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
		SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
		SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
	}
	flow->flow_flags = mpts->mpts_flags;
	flow->flow_cid = mpts->mpts_connid;
}

/*
 * Read-only sysctl handler: export every active MPTCP connection as a
 * conninfo_mptcp_t followed by one mptcp_flow_t per subflow.  When the
 * caller only probes for the size (oldptr == NULL), estimate generously
 * (n + n/8 connections, 4 flows each).
 */
static int
mptcp_pcblist SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int error = 0, f;
	size_t n, len;
	struct mppcb *mpp;
	struct mptses *mpte;
	struct mptcb *mp_tp;
	struct mptsub *mpts;
	struct socket *so;
	conninfo_mptcp_t mptcpci;
	mptcp_flow_t *flows;

	/* writes are not allowed */
	if (req->newptr != USER_ADDR_NULL)
		return (EPERM);

	lck_mtx_lock(&mtcbinfo.mppi_lock);
	n = mtcbinfo.mppi_count;
	if (req->oldptr == USER_ADDR_NULL) {
		lck_mtx_unlock(&mtcbinfo.mppi_lock);
		req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
		    4 * (n + n/8) * sizeof(mptcp_flow_t);
		return (0);
	}
	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		bzero(&mptcpci, sizeof(mptcpci));
		lck_mtx_lock(&mpp->mpp_lock);
		VERIFY(mpp->mpp_flags & MPP_ATTACHED);
		mpte = mptompte(mpp);
		VERIFY(mpte != NULL);
		mp_tp = mpte->mpte_mptcb;
		VERIFY(mp_tp != NULL);
		len = sizeof(*flows) * mpte->mpte_numflows;
		flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
		if (flows == NULL) {
			lck_mtx_unlock(&mpp->mpp_lock);
			break;
		}
		/* N.B. we don't take the mpt_lock just for the state. */
		mptcpci.mptcpci_state = mp_tp->mpt_state;
		mptcpci.mptcpci_nflows = mpte->mpte_numflows;
		/* mptcpci_len covers the struct plus the trailing flows */
		mptcpci.mptcpci_len = sizeof(mptcpci) +
		    sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
		error = SYSCTL_OUT(req, &mptcpci,
		    sizeof(mptcpci) - sizeof(*flows));
		if (error) {
			lck_mtx_unlock(&mpp->mpp_lock);
			FREE(flows, M_TEMP);
			break;
		}
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			MPTS_LOCK(mpts);
			so = mpts->mpts_socket;
			socket_lock(so, 0);
			fill_mptcp_subflow(so, &flows[f], mpts);
			socket_unlock(so, 0);
			MPTS_UNLOCK(mpts);
			f++;
		}
		lck_mtx_unlock(&mpp->mpp_lock);
		error = SYSCTL_OUT(req, flows, len);
		FREE(flows, M_TEMP);
		if (error)
			break;
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}

SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");