1/* 2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/param.h> 30#include <sys/systm.h> 31#include <sys/kernel.h> 32#include <sys/socket.h> 33#include <sys/socketvar.h> 34#include <sys/protosw.h> 35#include <sys/mcache.h> 36#include <sys/syslog.h> 37#include <sys/proc.h> 38#include <sys/proc_internal.h> 39#include <sys/resourcevar.h> 40 41#include <net/if.h> 42#include <netinet/in.h> 43#include <netinet/in_var.h> 44#include <netinet/tcp.h> 45#include <netinet/tcp_fsm.h> 46#include <netinet/tcp_seq.h> 47#include <netinet/tcp_var.h> 48#include <netinet/tcp_timer.h> 49#include <netinet/mptcp_var.h> 50#include <netinet/mptcp_timer.h> 51 52#include <mach/sdt.h> 53 54static int mptcp_usr_attach(struct socket *, int, struct proc *); 55static int mptcp_usr_detach(struct socket *); 56static int mptcp_attach(struct socket *, struct proc *); 57static int mptcp_detach(struct socket *, struct mppcb *); 58static int mptcp_connectx(struct mptses *, struct sockaddr_list **, 59 struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *, 60 uint32_t, void *, uint32_t); 61static int mptcp_usr_connectx(struct socket *, struct sockaddr_list **, 62 struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *, 63 uint32_t, void *, uint32_t); 64static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t); 65static int mptcp_getconnids(struct mptses *, associd_t, uint32_t *, 66 user_addr_t); 67static int mptcp_getconninfo(struct mptses *, connid_t *, uint32_t *, 68 uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *, 69 uint32_t *, user_addr_t, uint32_t *); 70static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *, 71 struct proc *); 72static int mptcp_disconnectx(struct mptses *, associd_t, connid_t); 73static int mptcp_usr_disconnectx(struct socket *, associd_t, connid_t); 74static struct mptses *mptcp_usrclosed(struct mptses *); 75static int mptcp_usr_peeloff(struct socket *, associd_t, struct socket **); 76static int mptcp_peeloff(struct mptses *, associd_t, struct socket **); 77static int mptcp_usr_rcvd(struct socket *, int); 78static int mptcp_usr_send(struct socket *, int, struct mbuf *, 79 struct sockaddr *, struct mbuf *, struct proc *); 80static int mptcp_usr_shutdown(struct socket *); 81static int mptcp_uiotombuf(struct uio *, int, int, uint32_t, struct mbuf **); 82static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *, 83 struct mbuf *, struct mbuf *, int); 84static int mptcp_usr_socheckopt(struct socket *, struct sockopt *); 85static int mptcp_setopt_apply(struct mptses *, struct mptopt *); 86static int mptcp_setopt(struct mptses *, struct sockopt *); 87static int mptcp_getopt(struct mptses *, struct sockopt *); 88static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *); 89static void mptcp_connorder_helper(struct mptsub *mpts); 90 91struct pr_usrreqs mptcp_usrreqs = { 92 .pru_attach = mptcp_usr_attach, 93 .pru_connectx = mptcp_usr_connectx, 94 .pru_control = mptcp_usr_control, 95 .pru_detach = mptcp_usr_detach, 96 .pru_disconnectx = mptcp_usr_disconnectx, 97 .pru_peeloff = mptcp_usr_peeloff, 98 .pru_rcvd = mptcp_usr_rcvd, 99 .pru_send = mptcp_usr_send, 100 .pru_shutdown = mptcp_usr_shutdown, 101 .pru_sosend = mptcp_usr_sosend, 102 .pru_soreceive = soreceive, 103 .pru_socheckopt = mptcp_usr_socheckopt, 104}; 105 106/* 107 * Attaches an MPTCP control block to a socket. 108 */ 109static int 110mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p) 111{ 112#pragma unused(proto) 113 int error; 114 115 VERIFY(sotomppcb(mp_so) == NULL); 116 117 error = mptcp_attach(mp_so, p); 118 if (error != 0) 119 goto out; 120 /* 121 * XXX: adi@apple.com 122 * 123 * Might want to use a different SO_LINGER timeout than TCP's? 124 */ 125 if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0) 126 mp_so->so_linger = TCP_LINGERTIME * hz; 127out: 128 return (error); 129} 130 131/* 132 * Detaches an MPTCP control block from a socket. 133 */ 134static int 135mptcp_usr_detach(struct socket *mp_so) 136{ 137 struct mppcb *mpp = sotomppcb(mp_so); 138 int error = 0; 139 140 VERIFY(mpp != NULL); 141 VERIFY(mpp->mpp_socket != NULL); 142 143 error = mptcp_detach(mp_so, mpp); 144 return (error); 145} 146 147/* 148 * Attach MPTCP protocol to socket, allocating MP control block, 149 * MPTCP session, control block, buffer space, etc. 150 */ 151static int 152mptcp_attach(struct socket *mp_so, struct proc *p) 153{ 154#pragma unused(p) 155 struct mptses *mpte; 156 struct mptcb *mp_tp; 157 struct mppcb *mpp; 158 int error = 0; 159 160 if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) { 161 error = soreserve(mp_so, tcp_sendspace, MPTCP_RWIN_MAX); 162 if (error != 0) 163 goto out; 164 } 165 166 /* 167 * MPTCP socket buffers cannot be compressed, due to the 168 * fact that each mbuf chained via m_next is a M_PKTHDR 169 * which carries some MPTCP metadata. 170 */ 171 mp_so->so_snd.sb_flags |= SB_NOCOMPRESS; 172 mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS; 173 174 /* Disable socket buffer auto-tuning. */ 175 mp_so->so_rcv.sb_flags &= ~SB_AUTOSIZE; 176 mp_so->so_snd.sb_flags &= ~SB_AUTOSIZE; 177 178 if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) 179 goto out; 180 181 mpp = sotomppcb(mp_so); 182 VERIFY(mpp != NULL); 183 184 mpte = mptcp_sescreate(mp_so, mpp); 185 if (mpte == NULL) { 186 mp_pcbdetach(mpp); 187 error = ENOBUFS; 188 goto out; 189 } 190 mp_tp = mpte->mpte_mptcb; 191 VERIFY(mp_tp != NULL); 192 193 MPT_LOCK(mp_tp); 194 mp_tp->mpt_state = MPTCPS_CLOSED; 195 MPT_UNLOCK(mp_tp); 196 197out: 198 return (error); 199} 200 201/* 202 * Called when the socket layer loses its final reference to the socket; 203 * at this point, there is only one case in which we will keep things 204 * around: time wait. 205 */ 206static int 207mptcp_detach(struct socket *mp_so, struct mppcb *mpp) 208{ 209 struct mptses *mpte; 210 struct mppcbinfo *mppi; 211 212 VERIFY(mp_so->so_pcb == mpp); 213 VERIFY(mpp->mpp_socket == mp_so); 214 215 mppi = mpp->mpp_pcbinfo; 216 VERIFY(mppi != NULL); 217 218 mpte = &((struct mpp_mtp *)mpp)->mpp_ses; 219 VERIFY(mpte->mpte_mppcb == mpp); 220 221 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 222 223 /* 224 * We are done with this MPTCP socket (it has been closed); 225 * trigger all subflows to be disconnected, if not already, 226 * by initiating the PCB detach sequence (SOF_PCBCLEARING 227 * will be set.) 228 */ 229 mp_pcbdetach(mpp); 230 231 (void) mptcp_disconnectx(mpte, ASSOCID_ALL, CONNID_ALL); 232 233 /* 234 * XXX: adi@apple.com 235 * 236 * Here, we would want to handle time wait state. 237 */ 238 239 return (0); 240} 241 242/* 243 * Common subroutine to open a MPTCP connection to one of the remote hosts 244 * specified by dst_sl. This includes allocating and establishing a 245 * subflow TCP connection, either initially to establish MPTCP connection, 246 * or to join an existing one. Returns a connection handle upon success. 247 */ 248static int 249mptcp_connectx(struct mptses *mpte, struct sockaddr_list **src_sl, 250 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope, 251 associd_t aid, connid_t *pcid, uint32_t flags, void *arg, 252 uint32_t arglen) 253{ 254#pragma unused(p, aid, flags, arg, arglen) 255 struct mptsub *mpts; 256 struct socket *mp_so; 257 int error = 0; 258 259 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 260 mp_so = mpte->mpte_mppcb->mpp_socket; 261 262 VERIFY(dst_sl != NULL && *dst_sl != NULL); 263 VERIFY(pcid != NULL); 264 265 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__, 266 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so))); 267 DTRACE_MPTCP3(connectx, struct mptses *, mpte, associd_t, aid, 268 struct socket *, mp_so); 269 270 mpts = mptcp_subflow_alloc(M_WAITOK); 271 if (mpts == NULL) { 272 error = ENOBUFS; 273 goto out; 274 } 275 MPTS_ADDREF(mpts); /* for this routine */ 276 277 if (src_sl != NULL) { 278 mpts->mpts_src_sl = *src_sl; 279 *src_sl = NULL; 280 } 281 mpts->mpts_dst_sl = *dst_sl; 282 *dst_sl = NULL; 283 284 error = mptcp_subflow_add(mpte, mpts, p, ifscope); 285 if (error == 0 && pcid != NULL) 286 *pcid = mpts->mpts_connid; 287 288out: 289 if (mpts != NULL) { 290 if ((error != 0) && (error != EWOULDBLOCK)) { 291 MPTS_LOCK(mpts); 292 if (mpts->mpts_flags & MPTSF_ATTACHED) { 293 MPTS_UNLOCK(mpts); 294 MPTS_REMREF(mpts); 295 mptcp_subflow_del(mpte, mpts, TRUE); 296 return (error); 297 } 298 MPTS_UNLOCK(mpts); 299 } 300 MPTS_REMREF(mpts); 301 } 302 303 return (error); 304} 305 306/* 307 * User-protocol pru_connectx callback. 308 */ 309static int 310mptcp_usr_connectx(struct socket *mp_so, struct sockaddr_list **src_sl, 311 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope, 312 associd_t aid, connid_t *pcid, uint32_t flags, void *arg, 313 uint32_t arglen) 314{ 315#pragma unused(arg, arglen) 316 struct mppcb *mpp = sotomppcb(mp_so); 317 struct mptses *mpte; 318 int error = 0; 319 320 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 321 error = EINVAL; 322 goto out; 323 } 324 mpte = mptompte(mpp); 325 VERIFY(mpte != NULL); 326 327 error = mptcp_connectx(mpte, src_sl, dst_sl, p, ifscope, 328 aid, pcid, flags, arg, arglen); 329out: 330 return (error); 331} 332 333/* 334 * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain. 335 */ 336static int 337mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp) 338{ 339 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 340 341 /* MPTCP has at most 1 association */ 342 *cnt = (mpte->mpte_associd != ASSOCID_ANY) ? 1 : 0; 343 344 /* just asking how many there are? */ 345 if (aidp == USER_ADDR_NULL) 346 return (0); 347 348 return (copyout(&mpte->mpte_associd, aidp, 349 sizeof (mpte->mpte_associd))); 350} 351 352/* 353 * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain. 354 */ 355static int 356mptcp_getconnids(struct mptses *mpte, associd_t aid, uint32_t *cnt, 357 user_addr_t cidp) 358{ 359 struct mptsub *mpts; 360 int error = 0; 361 362 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 363 364 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL && 365 aid != mpte->mpte_associd) 366 return (EINVAL); 367 368 *cnt = mpte->mpte_numflows; 369 370 /* just asking how many there are? */ 371 if (cidp == USER_ADDR_NULL) 372 return (0); 373 374 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 375 if ((error = copyout(&mpts->mpts_connid, cidp, 376 sizeof (mpts->mpts_connid))) != 0) 377 break; 378 379 cidp += sizeof (mpts->mpts_connid); 380 } 381 382 return (error); 383} 384 385/* 386 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain. 387 */ 388static int 389mptcp_getconninfo(struct mptses *mpte, connid_t *cid, uint32_t *flags, 390 uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len, 391 user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type, 392 user_addr_t aux_data, uint32_t *aux_len) 393{ 394#pragma unused(aux_data) 395 struct sockaddr_entry *se; 396 struct ifnet *ifp = NULL; 397 struct mptsub *mpts; 398 int error = 0; 399 400 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 401 402 if (*cid == CONNID_ALL) 403 return (EINVAL); 404 405 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 406 if (mpts->mpts_connid == *cid || *cid == CONNID_ANY) 407 break; 408 } 409 if (mpts == NULL) 410 return ((*cid == CONNID_ANY) ? ENXIO : EINVAL); 411 412 MPTS_LOCK(mpts); 413 ifp = mpts->mpts_outif; 414 *cid = mpts->mpts_connid; 415 *ifindex = ((ifp != NULL) ? ifp->if_index : 0); 416 *soerror = mpts->mpts_soerror; 417 *flags = 0; 418 if (mpts->mpts_flags & MPTSF_CONNECTING) 419 *flags |= CIF_CONNECTING; 420 if (mpts->mpts_flags & MPTSF_CONNECTED) 421 *flags |= CIF_CONNECTED; 422 if (mpts->mpts_flags & MPTSF_DISCONNECTING) 423 *flags |= CIF_DISCONNECTING; 424 if (mpts->mpts_flags & MPTSF_DISCONNECTED) 425 *flags |= CIF_DISCONNECTED; 426 if (mpts->mpts_flags & MPTSF_BOUND_IF) 427 *flags |= CIF_BOUND_IF; 428 if (mpts->mpts_flags & MPTSF_BOUND_IP) 429 *flags |= CIF_BOUND_IP; 430 if (mpts->mpts_flags & MPTSF_BOUND_PORT) 431 *flags |= CIF_BOUND_PORT; 432 if (mpts->mpts_flags & MPTSF_PREFERRED) 433 *flags |= CIF_PREFERRED; 434 if (mpts->mpts_flags & MPTSF_MP_CAPABLE) 435 *flags |= CIF_MP_CAPABLE; 436 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) 437 *flags |= CIF_MP_DEGRADED; 438 if (mpts->mpts_flags & MPTSF_MP_READY) 439 *flags |= CIF_MP_READY; 440 if (mpts->mpts_flags & MPTSF_ACTIVE) 441 *flags |= CIF_MP_ACTIVE; 442 443 VERIFY(mpts->mpts_src_sl != NULL); 444 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head); 445 VERIFY(se != NULL && se->se_addr != NULL); 446 *src_len = se->se_addr->sa_len; 447 if (src != USER_ADDR_NULL) { 448 error = copyout(se->se_addr, src, se->se_addr->sa_len); 449 if (error != 0) 450 goto out; 451 } 452 453 VERIFY(mpts->mpts_dst_sl != NULL); 454 se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head); 455 VERIFY(se != NULL && se->se_addr != NULL); 456 *dst_len = se->se_addr->sa_len; 457 if (dst != USER_ADDR_NULL) { 458 error = copyout(se->se_addr, dst, se->se_addr->sa_len); 459 if (error != 0) 460 goto out; 461 } 462 463 *aux_type = 0; 464 *aux_len = 0; 465 if (mpts->mpts_socket != NULL) { 466 struct conninfo_tcp tcp_ci; 467 468 *aux_type = CIAUX_TCP; 469 *aux_len = sizeof (tcp_ci); 470 471 if (aux_data != USER_ADDR_NULL) { 472 struct socket *so = mpts->mpts_socket; 473 474 VERIFY(SOCK_PROTO(so) == IPPROTO_TCP); 475 bzero(&tcp_ci, sizeof (tcp_ci)); 476 socket_lock(so, 0); 477 tcp_getconninfo(so, &tcp_ci); 478 socket_unlock(so, 0); 479 error = copyout(&tcp_ci, aux_data, sizeof (tcp_ci)); 480 if (error != 0) 481 goto out; 482 } 483 } 484out: 485 MPTS_UNLOCK(mpts); 486 return (error); 487} 488 489/* 490 * Handle SIOCSCONNORDER 491 */ 492int 493mptcp_setconnorder(struct mptses *mpte, connid_t cid, uint32_t rank) 494{ 495 struct mptsub *mpts, *mpts1; 496 int error = 0; 497 498 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 499 mptcplog((LOG_DEBUG, "%s: cid %d rank %d \n", __func__, cid, rank)); 500 501 if (cid == CONNID_ANY || cid == CONNID_ALL) { 502 error = EINVAL; 503 goto out; 504 } 505 506 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 507 if (mpts->mpts_connid == cid) 508 break; 509 } 510 if (mpts == NULL) { 511 error = ENXIO; 512 goto out; 513 } 514 515 if (rank == 0 || rank > 1) { 516 /* 517 * If rank is 0, determine whether this should be the 518 * primary or backup subflow, depending on what we have. 519 * 520 * Otherwise, if greater than 0, make it a backup flow. 521 */ 522 TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) { 523 MPTS_LOCK(mpts1); 524 if (mpts1->mpts_flags & MPTSF_PREFERRED) { 525 MPTS_UNLOCK(mpts1); 526 break; 527 } 528 MPTS_UNLOCK(mpts1); 529 } 530 531 MPTS_LOCK(mpts); 532 mpts->mpts_flags &= ~MPTSF_PREFERRED; 533 mpts->mpts_rank = rank; 534 if (mpts1 != NULL && mpts != mpts1) { 535 /* preferred subflow found; set rank as necessary */ 536 if (rank == 0) 537 mpts->mpts_rank = (mpts1->mpts_rank + 1); 538 } else if (rank == 0) { 539 /* no preferred one found; promote this */ 540 rank = 1; 541 } 542 MPTS_UNLOCK(mpts); 543 } 544 545 if (rank == 1) { 546 /* 547 * If rank is 1, promote this subflow to be preferred. 548 */ 549 TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) { 550 MPTS_LOCK(mpts1); 551 if (mpts1 != mpts && 552 (mpts1->mpts_flags & MPTSF_PREFERRED)) { 553 mpts1->mpts_flags &= ~MPTSF_PREFERRED; 554 if (mpte->mpte_nummpcapflows > 1) 555 mptcp_connorder_helper(mpts1); 556 } else if (mpts1 == mpts) { 557 mpts1->mpts_rank = 1; 558 if (mpts1->mpts_flags & MPTSF_MP_CAPABLE) { 559 mpts1->mpts_flags |= MPTSF_PREFERRED; 560 if (mpte->mpte_nummpcapflows > 1) 561 mptcp_connorder_helper(mpts1); 562 } 563 } 564 MPTS_UNLOCK(mpts1); 565 } 566 } 567 568out: 569 return (error); 570} 571 572static void 573mptcp_connorder_helper(struct mptsub *mpts) 574{ 575 struct socket *so = mpts->mpts_socket; 576 struct tcpcb *tp = NULL; 577 578 socket_lock(so, 0); 579 580 tp = intotcpcb(sotoinpcb(so)); 581 tp->t_mpflags |= TMPF_SND_MPPRIO; 582 if (mpts->mpts_flags & MPTSF_PREFERRED) 583 tp->t_mpflags &= ~TMPF_BACKUP_PATH; 584 else 585 tp->t_mpflags |= TMPF_BACKUP_PATH; 586 mptcplog((LOG_DEBUG, "%s cid %d flags %x", __func__, 587 mpts->mpts_connid, mpts->mpts_flags)); 588 socket_unlock(so, 0); 589 590} 591 592/* 593 * Handle SIOCSGONNORDER 594 */ 595int 596mptcp_getconnorder(struct mptses *mpte, connid_t cid, uint32_t *rank) 597{ 598 struct mptsub *mpts; 599 int error = 0; 600 601 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 602 VERIFY(rank != NULL); 603 *rank = 0; 604 605 if (cid == CONNID_ANY || cid == CONNID_ALL) { 606 error = EINVAL; 607 goto out; 608 } 609 610 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 611 if (mpts->mpts_connid == cid) 612 break; 613 } 614 if (mpts == NULL) { 615 error = ENXIO; 616 goto out; 617 } 618 619 MPTS_LOCK(mpts); 620 *rank = mpts->mpts_rank; 621 MPTS_UNLOCK(mpts); 622out: 623 return (error); 624} 625 626/* 627 * User-protocol pru_control callback. 628 */ 629static int 630mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data, 631 struct ifnet *ifp, struct proc *p) 632{ 633#pragma unused(ifp, p) 634 struct mppcb *mpp = sotomppcb(mp_so); 635 struct mptses *mpte; 636 int error = 0; 637 638 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 639 error = EINVAL; 640 goto out; 641 } 642 mpte = mptompte(mpp); 643 VERIFY(mpte != NULL); 644 645 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 646 647 switch (cmd) { 648 case SIOCGASSOCIDS32: { /* struct so_aidreq32 */ 649 struct so_aidreq32 aidr; 650 bcopy(data, &aidr, sizeof (aidr)); 651 error = mptcp_getassocids(mpte, &aidr.sar_cnt, 652 aidr.sar_aidp); 653 if (error == 0) 654 bcopy(&aidr, data, sizeof (aidr)); 655 break; 656 } 657 658 case SIOCGASSOCIDS64: { /* struct so_aidreq64 */ 659 struct so_aidreq64 aidr; 660 bcopy(data, &aidr, sizeof (aidr)); 661 error = mptcp_getassocids(mpte, &aidr.sar_cnt, 662 aidr.sar_aidp); 663 if (error == 0) 664 bcopy(&aidr, data, sizeof (aidr)); 665 break; 666 } 667 668 case SIOCGCONNIDS32: { /* struct so_cidreq32 */ 669 struct so_cidreq32 cidr; 670 bcopy(data, &cidr, sizeof (cidr)); 671 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt, 672 cidr.scr_cidp); 673 if (error == 0) 674 bcopy(&cidr, data, sizeof (cidr)); 675 break; 676 } 677 678 case SIOCGCONNIDS64: { /* struct so_cidreq64 */ 679 struct so_cidreq64 cidr; 680 bcopy(data, &cidr, sizeof (cidr)); 681 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt, 682 cidr.scr_cidp); 683 if (error == 0) 684 bcopy(&cidr, data, sizeof (cidr)); 685 break; 686 } 687 688 case SIOCGCONNINFO32: { /* struct so_cinforeq32 */ 689 struct so_cinforeq32 cifr; 690 bcopy(data, &cifr, sizeof (cifr)); 691 error = mptcp_getconninfo(mpte, &cifr.scir_cid, 692 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error, 693 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst, 694 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data, 695 &cifr.scir_aux_len); 696 if (error == 0) 697 bcopy(&cifr, data, sizeof (cifr)); 698 break; 699 } 700 701 case SIOCGCONNINFO64: { /* struct so_cinforeq64 */ 702 struct so_cinforeq64 cifr; 703 bcopy(data, &cifr, sizeof (cifr)); 704 error = mptcp_getconninfo(mpte, &cifr.scir_cid, 705 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error, 706 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst, 707 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data, 708 &cifr.scir_aux_len); 709 if (error == 0) 710 bcopy(&cifr, data, sizeof (cifr)); 711 break; 712 } 713 714 case SIOCSCONNORDER: { /* struct so_cordreq */ 715 struct so_cordreq cor; 716 bcopy(data, &cor, sizeof (cor)); 717 error = mptcp_setconnorder(mpte, cor.sco_cid, cor.sco_rank); 718 if (error == 0) 719 bcopy(&cor, data, sizeof (cor)); 720 break; 721 } 722 723 case SIOCGCONNORDER: { /* struct so_cordreq */ 724 struct so_cordreq cor; 725 bcopy(data, &cor, sizeof (cor)); 726 error = mptcp_getconnorder(mpte, cor.sco_cid, &cor.sco_rank); 727 if (error == 0) 728 bcopy(&cor, data, sizeof (cor)); 729 break; 730 } 731 732 default: 733 error = EOPNOTSUPP; 734 break; 735 } 736out: 737 return (error); 738} 739 740/* 741 * Initiate a disconnect. MPTCP-level disconnection is specified by 742 * CONNID_{ANY,ALL}. Otherwise, selectively disconnect a subflow 743 * connection while keeping the MPTCP-level connection (association). 744 */ 745static int 746mptcp_disconnectx(struct mptses *mpte, associd_t aid, connid_t cid) 747{ 748 struct mptsub *mpts; 749 struct socket *mp_so; 750 struct mptcb *mp_tp; 751 int error = 0; 752 753 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 754 755 mp_so = mpte->mpte_mppcb->mpp_socket; 756 mp_tp = mpte->mpte_mptcb; 757 758 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx aid %d cid %d\n", __func__, 759 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), aid, cid)); 760 DTRACE_MPTCP5(disconnectx, struct mptses *, mpte, associd_t, aid, 761 connid_t, cid, struct socket *, mp_so, struct mptcb *, mp_tp); 762 763 VERIFY(aid == ASSOCID_ANY || aid == ASSOCID_ALL || 764 aid == mpte->mpte_associd); 765 766 /* terminate the association? */ 767 if (cid == CONNID_ANY || cid == CONNID_ALL) { 768 /* if we're not detached, go thru socket state checks */ 769 if (!(mp_so->so_flags & SOF_PCBCLEARING)) { 770 if (!(mp_so->so_state & (SS_ISCONNECTED| 771 SS_ISCONNECTING))) { 772 error = ENOTCONN; 773 goto out; 774 } 775 if (mp_so->so_state & SS_ISDISCONNECTING) { 776 error = EALREADY; 777 goto out; 778 } 779 } 780 MPT_LOCK(mp_tp); 781 mptcp_cancel_all_timers(mp_tp); 782 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { 783 (void) mptcp_close(mpte, mp_tp); 784 MPT_UNLOCK(mp_tp); 785 } else if ((mp_so->so_options & SO_LINGER) && 786 mp_so->so_linger == 0) { 787 (void) mptcp_drop(mpte, mp_tp, 0); 788 MPT_UNLOCK(mp_tp); 789 } else { 790 MPT_UNLOCK(mp_tp); 791 soisdisconnecting(mp_so); 792 sbflush(&mp_so->so_rcv); 793 if (mptcp_usrclosed(mpte) != NULL) 794 (void) mptcp_output(mpte); 795 } 796 } else { 797 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 798 if (mpts->mpts_connid != cid) 799 continue; 800 MPTS_LOCK(mpts); 801 mptcp_subflow_disconnect(mpte, mpts, FALSE); 802 MPTS_UNLOCK(mpts); 803 break; 804 } 805 806 if (mpts == NULL) { 807 error = EINVAL; 808 goto out; 809 } 810 } 811 812 if (error == 0) 813 mptcp_thread_signal(mpte); 814 815 if ((mp_so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == 816 (SS_CANTRCVMORE | SS_CANTSENDMORE)) { 817 /* the socket has been shutdown, no more sockopt's */ 818 mptcp_flush_sopts(mpte); 819 } 820 821out: 822 return (error); 823} 824 825/* 826 * User-protocol pru_disconnectx callback. 827 */ 828static int 829mptcp_usr_disconnectx(struct socket *mp_so, associd_t aid, connid_t cid) 830{ 831 struct mppcb *mpp = sotomppcb(mp_so); 832 struct mptses *mpte; 833 int error = 0; 834 835 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 836 error = EINVAL; 837 goto out; 838 } 839 mpte = mptompte(mpp); 840 VERIFY(mpte != NULL); 841 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 842 843 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL && 844 aid != mpte->mpte_associd) { 845 error = EINVAL; 846 goto out; 847 } 848 849 error = mptcp_disconnectx(mpte, aid, cid); 850out: 851 return (error); 852} 853 854/* 855 * User issued close, and wish to trail thru shutdown states. 856 */ 857static struct mptses * 858mptcp_usrclosed(struct mptses *mpte) 859{ 860 struct socket *mp_so; 861 struct mptcb *mp_tp; 862 struct mptsub *mpts; 863 864 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 865 mp_so = mpte->mpte_mppcb->mpp_socket; 866 mp_tp = mpte->mpte_mptcb; 867 868 MPT_LOCK(mp_tp); 869 mptcp_close_fsm(mp_tp, MPCE_CLOSE); 870 871 if (mp_tp->mpt_state == TCPS_CLOSED) { 872 mpte = mptcp_close(mpte, mp_tp); 873 MPT_UNLOCK(mp_tp); 874 } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) { 875 MPT_UNLOCK(mp_tp); 876 soisdisconnected(mp_so); 877 } else { 878 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */ 879 MPT_UNLOCK(mp_tp); 880 881 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 882 MPTS_LOCK(mpts); 883 mptcp_subflow_disconnect(mpte, mpts, FALSE); 884 MPTS_UNLOCK(mpts); 885 } 886 } 887 /* 888 * XXX: adi@apple.com 889 * 890 * Do we need to handle time wait specially here? We need to handle 891 * the case where MPTCP has been established, but we have not usable 892 * subflow to use. Do we want to wait a while before forcibly 893 * tearing this MPTCP down, in case we have one or more subflows 894 * that are flow controlled? 895 */ 896 897 return (mpte); 898} 899 900/* 901 * User-protocol pru_peeloff callback. 902 */ 903static int 904mptcp_usr_peeloff(struct socket *mp_so, associd_t aid, struct socket **psop) 905{ 906 struct mppcb *mpp = sotomppcb(mp_so); 907 struct mptses *mpte; 908 int error = 0; 909 910 VERIFY(psop != NULL); 911 912 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 913 error = EINVAL; 914 goto out; 915 } 916 mpte = mptompte(mpp); 917 VERIFY(mpte != NULL); 918 919 error = mptcp_peeloff(mpte, aid, psop); 920out: 921 return (error); 922} 923 924/* 925 * Transform a previously connected TCP subflow connection which has 926 * failed to negotiate MPTCP to its own socket which can be externalized 927 * with a file descriptor. Valid only when the MPTCP socket is not 928 * yet associated (MPTCP-level connection has not been established.) 929 */ 930static int 931mptcp_peeloff(struct mptses *mpte, associd_t aid, struct socket **psop) 932{ 933 struct socket *so = NULL, *mp_so; 934 struct mptsub *mpts; 935 int error = 0; 936 937 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 938 mp_so = mpte->mpte_mppcb->mpp_socket; 939 940 VERIFY(psop != NULL); 941 *psop = NULL; 942 943 DTRACE_MPTCP3(peeloff, struct mptses *, mpte, associd_t, aid, 944 struct socket *, mp_so); 945 946 /* peeloff cannot happen after an association is established */ 947 if (mpte->mpte_associd != ASSOCID_ANY) { 948 error = EINVAL; 949 goto out; 950 } 951 952 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL) { 953 error = EINVAL; 954 goto out; 955 } 956 957 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 958 MPTS_LOCK(mpts); 959 if (mpts->mpts_flags & MPTSF_MP_CAPABLE) { 960 panic("%s: so %p is MPTCP capable but mp_so %p " 961 "aid is %d\n", __func__, so, mp_so, 962 mpte->mpte_associd); 963 /* NOTREACHED */ 964 } 965 MPTS_ADDREF_LOCKED(mpts); /* for us */ 966 so = mpts->mpts_socket; 967 VERIFY(so != NULL); 968 /* 969 * This subflow socket is about to be externalized; make it 970 * appear as if it has the same properties as the MPTCP socket, 971 * undo what's done earlier in mptcp_subflow_add(). 972 */ 973 mptcp_subflow_sopeeloff(mpte, mpts, so); 974 MPTS_UNLOCK(mpts); 975 976 mptcp_subflow_del(mpte, mpts, FALSE); 977 MPTS_REMREF(mpts); /* ours */ 978 /* 979 * XXX adi@apple.com 980 * 981 * Here we need to make sure the subflow socket is not 982 * flow controlled; need to clear both INP_FLOW_CONTROLLED 983 * and INP_FLOW_SUSPENDED on the subflow socket, since 984 * we will no longer be monitoring its events. 985 */ 986 break; 987 } 988 989 if (so == NULL) { 990 error = EINVAL; 991 goto out; 992 } 993 *psop = so; 994 995 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__, 996 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so))); 997out: 998 return (error); 999} 1000 1001/* 1002 * After a receive, possible send some update to peer. 1003 */ 1004static int 1005mptcp_usr_rcvd(struct socket *mp_so, int flags) 1006{ 1007#pragma unused(flags) 1008 struct mppcb *mpp = sotomppcb(mp_so); 1009 struct mptses *mpte; 1010 int error = 0; 1011 1012 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 1013 error = EINVAL; 1014 goto out; 1015 } 1016 mpte = mptompte(mpp); 1017 VERIFY(mpte != NULL); 1018 1019 error = mptcp_output(mpte); 1020out: 1021 return (error); 1022} 1023 1024/* 1025 * Do a send by putting data in the output queue. 1026 */ 1027static int 1028mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m, 1029 struct sockaddr *nam, struct mbuf *control, struct proc *p) 1030{ 1031#pragma unused(nam, p) 1032 struct mppcb *mpp = sotomppcb(mp_so); 1033 struct mptses *mpte; 1034 int error = 0; 1035 1036 if (prus_flags & (PRUS_OOB|PRUS_EOF)) { 1037 error = EOPNOTSUPP; 1038 goto out; 1039 } 1040 1041 if (nam != NULL) { 1042 error = EOPNOTSUPP; 1043 goto out; 1044 } 1045 1046 if (control != NULL && control->m_len != 0) { 1047 error = EOPNOTSUPP; 1048 goto out; 1049 } 1050 1051 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 1052 error = ECONNRESET; 1053 goto out; 1054 } 1055 mpte = mptompte(mpp); 1056 VERIFY(mpte != NULL); 1057 1058 if (!(mp_so->so_state & SS_ISCONNECTED)) { 1059 error = ENOTCONN; 1060 goto out; 1061 } 1062 1063 mptcp_insert_dsn(mpp, m); 1064 VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS); 1065 (void) sbappendstream(&mp_so->so_snd, m); 1066 m = NULL; 1067 1068 if (mpte != NULL) { 1069 /* 1070 * XXX: adi@apple.com 1071 * 1072 * PRUS_MORETOCOME could be set, but we don't check it now. 1073 */ 1074 error = mptcp_output(mpte); 1075 } 1076 1077out: 1078 if (error) { 1079 if (m != NULL) 1080 m_freem(m); 1081 if (control != NULL) 1082 m_freem(control); 1083 } 1084 return (error); 1085} 1086 1087/* 1088 * Mark the MPTCP connection as being incapable of further output. 1089 */ 1090static int 1091mptcp_usr_shutdown(struct socket *mp_so) 1092{ 1093 struct mppcb *mpp = sotomppcb(mp_so); 1094 struct mptses *mpte; 1095 int error = 0; 1096 1097 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 1098 error = EINVAL; 1099 goto out; 1100 } 1101 mpte = mptompte(mpp); 1102 VERIFY(mpte != NULL); 1103 1104 socantsendmore(mp_so); 1105 1106 mpte = mptcp_usrclosed(mpte); 1107 if (mpte != NULL) 1108 error = mptcp_output(mpte); 1109out: 1110 return (error); 1111} 1112 1113/* 1114 * Copy the contents of uio into a properly sized mbuf chain. 1115 */ 1116static int 1117mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align, 1118 struct mbuf **top) 1119{ 1120 struct mbuf *m, *mb, *nm = NULL, *mtail = NULL; 1121 user_ssize_t resid, tot, len, progress; /* must be user_ssize_t */ 1122 int error; 1123 1124 VERIFY(top != NULL && *top == NULL); 1125 1126 /* 1127 * space can be zero or an arbitrary large value bound by 1128 * the total data supplied by the uio. 1129 */ 1130 resid = uio_resid(uio); 1131 if (space > 0) 1132 tot = imin(resid, space); 1133 else 1134 tot = resid; 1135 1136 /* 1137 * The smallest unit is a single mbuf with pkthdr. 1138 * We can't align past it. 1139 */ 1140 if (align >= MHLEN) 1141 return (EINVAL); 1142 1143 /* 1144 * Give us the full allocation or nothing. 1145 * If space is zero return the smallest empty mbuf. 1146 */ 1147 if ((len = tot + align) == 0) 1148 len = 1; 1149 1150 /* Loop and append maximum sized mbufs to the chain tail. */ 1151 while (len > 0) { 1152 uint32_t m_needed = 1; 1153 1154 if (njcl > 0 && len > MBIGCLBYTES) 1155 mb = m_getpackets_internal(&m_needed, 1, 1156 how, 1, M16KCLBYTES); 1157 else if (len > MCLBYTES) 1158 mb = m_getpackets_internal(&m_needed, 1, 1159 how, 1, MBIGCLBYTES); 1160 else if (len >= (signed)MINCLSIZE) 1161 mb = m_getpackets_internal(&m_needed, 1, 1162 how, 1, MCLBYTES); 1163 else 1164 mb = m_gethdr(how, MT_DATA); 1165 1166 /* Fail the whole operation if one mbuf can't be allocated. */ 1167 if (mb == NULL) { 1168 if (nm != NULL) 1169 m_freem(nm); 1170 return (ENOBUFS); 1171 } 1172 1173 /* Book keeping. */ 1174 VERIFY(mb->m_flags & M_PKTHDR); 1175 len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN); 1176 if (mtail != NULL) 1177 mtail->m_next = mb; 1178 else 1179 nm = mb; 1180 mtail = mb; 1181 } 1182 1183 m = nm; 1184 m->m_data += align; 1185 1186 progress = 0; 1187 /* Fill all mbufs with uio data and update header information. */ 1188 for (mb = m; mb != NULL; mb = mb->m_next) { 1189 len = imin(M_TRAILINGSPACE(mb), tot - progress); 1190 1191 error = uiomove(mtod(mb, char *), len, uio); 1192 if (error != 0) { 1193 m_freem(m); 1194 return (error); 1195 } 1196 1197 /* each mbuf is M_PKTHDR chained via m_next */ 1198 mb->m_len = len; 1199 mb->m_pkthdr.len = len; 1200 1201 progress += len; 1202 } 1203 VERIFY(progress == tot); 1204 *top = m; 1205 return (0); 1206} 1207 1208/* 1209 * MPTCP socket protocol-user socket send routine, derived from sosend(). 1210 */ 1211static int 1212mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio, 1213 struct mbuf *top, struct mbuf *control, int flags) 1214{ 1215#pragma unused(addr) 1216 int32_t space; 1217 user_ssize_t resid; 1218 int error, sendflags; 1219 struct proc *p = current_proc(); 1220 int sblocked = 0; 1221 1222 /* UIO is required for now, due to per-mbuf M_PKTHDR constrains */ 1223 if (uio == NULL || top != NULL) { 1224 error = EINVAL; 1225 goto out; 1226 } 1227 resid = uio_resid(uio); 1228 1229 socket_lock(mp_so, 1); 1230 so_update_last_owner_locked(mp_so, p); 1231 so_update_policy(mp_so); 1232 1233 VERIFY(mp_so->so_type == SOCK_STREAM); 1234 VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW)); 1235 1236 if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) || 1237 (mp_so->so_flags & SOF_ENABLE_MSGS)) { 1238 error = EOPNOTSUPP; 1239 socket_unlock(mp_so, 1); 1240 goto out; 1241 } 1242 1243 /* 1244 * In theory resid should be unsigned. However, space must be 1245 * signed, as it might be less than 0 if we over-committed, and we 1246 * must use a signed comparison of space and resid. On the other 1247 * hand, a negative resid causes us to loop sending 0-length 1248 * segments to the protocol. 1249 */ 1250 if (resid < 0 || (flags & MSG_EOR) || control != NULL) { 1251 error = EINVAL; 1252 socket_unlock(mp_so, 1); 1253 goto out; 1254 } 1255 1256 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd); 1257 1258 do { 1259 error = sosendcheck(mp_so, NULL, resid, 0, 0, flags, 1260 &sblocked, NULL); 1261 if (error != 0) 1262 goto release; 1263 1264 space = sbspace(&mp_so->so_snd); 1265 do { 1266 socket_unlock(mp_so, 0); 1267 /* 1268 * Copy the data from userland into an mbuf chain. 1269 */ 1270 error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top); 1271 if (error != 0) { 1272 socket_lock(mp_so, 0); 1273 goto release; 1274 } 1275 VERIFY(top != NULL); 1276 space -= resid - uio_resid(uio); 1277 resid = uio_resid(uio); 1278 socket_lock(mp_so, 0); 1279 1280 /* 1281 * Compute flags here, for pru_send and NKEs. 1282 */ 1283 sendflags = (resid > 0 && space > 0) ? 1284 PRUS_MORETOCOME : 0; 1285 1286 /* 1287 * Socket filter processing 1288 */ 1289 VERIFY(control == NULL); 1290 error = sflt_data_out(mp_so, NULL, &top, &control, 0); 1291 if (error != 0) { 1292 if (error == EJUSTRETURN) { 1293 error = 0; 1294 top = NULL; 1295 /* always free control if any */ 1296 } 1297 goto release; 1298 } 1299 if (control != NULL) { 1300 m_freem(control); 1301 control = NULL; 1302 } 1303 1304 /* 1305 * Pass data to protocol. 1306 */ 1307 error = (*mp_so->so_proto->pr_usrreqs->pru_send) 1308 (mp_so, sendflags, top, NULL, NULL, p); 1309 1310 top = NULL; 1311 if (error != 0) 1312 goto release; 1313 } while (resid != 0 && space > 0); 1314 } while (resid != 0); 1315 1316release: 1317 if (sblocked) 1318 sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */ 1319 else 1320 socket_unlock(mp_so, 1); 1321out: 1322 if (top != NULL) 1323 m_freem(top); 1324 if (control != NULL) 1325 m_freem(control); 1326 1327 return (error); 1328} 1329 1330/* 1331 * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options. 1332 * This routine simply indicates to the caller whether or not to proceed 1333 * further with the given socket option. This is invoked by sosetoptlock() 1334 * and sogetoptlock(). 1335 */ 1336static int 1337mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt) 1338{ 1339#pragma unused(mp_so) 1340 int error = 0; 1341 1342 VERIFY(sopt->sopt_level == SOL_SOCKET); 1343 1344 /* 1345 * We could check for sopt_dir (set/get) here, but we'll just 1346 * let the caller deal with it as appropriate; therefore the 1347 * following is a superset of the socket options which we 1348 * allow for set/get. 1349 * 1350 * XXX: adi@apple.com 1351 * 1352 * Need to consider the following cases: 1353 * 1354 * a. In the event peeloff(2) occurs on the subflow socket, 1355 * we may want to issue those options which are now 1356 * handled at the MP socket. In that case, we will need 1357 * to record them in mptcp_setopt() so that they can 1358 * be replayed during peeloff. 1359 * 1360 * b. Certain socket options don't have a clear definition 1361 * on the expected behavior post connect(2). At the time 1362 * those options are issued on the MP socket, there may 1363 * be existing subflow sockets that are already connected. 1364 */ 1365 switch (sopt->sopt_name) { 1366 case SO_LINGER: /* MP */ 1367 case SO_LINGER_SEC: /* MP */ 1368 case SO_TYPE: /* MP */ 1369 case SO_NREAD: /* MP */ 1370 case SO_NWRITE: /* MP */ 1371 case SO_ERROR: /* MP */ 1372 case SO_SNDBUF: /* MP */ 1373 case SO_RCVBUF: /* MP */ 1374 case SO_SNDLOWAT: /* MP */ 1375 case SO_RCVLOWAT: /* MP */ 1376 case SO_SNDTIMEO: /* MP */ 1377 case SO_RCVTIMEO: /* MP */ 1378 case SO_NKE: /* MP */ 1379 case SO_NOSIGPIPE: /* MP */ 1380 case SO_NOADDRERR: /* MP */ 1381 case SO_LABEL: /* MP */ 1382 case SO_PEERLABEL: /* MP */ 1383 case SO_DEFUNCTOK: /* MP */ 1384 case SO_ISDEFUNCT: /* MP */ 1385 case SO_TRAFFIC_CLASS_DBG: /* MP */ 1386 /* 1387 * Tell the caller that these options are to be processed. 1388 */ 1389 break; 1390 1391 case SO_DEBUG: /* MP + subflow */ 1392 case SO_KEEPALIVE: /* MP + subflow */ 1393 case SO_USELOOPBACK: /* MP + subflow */ 1394 case SO_RANDOMPORT: /* MP + subflow */ 1395 case SO_TRAFFIC_CLASS: /* MP + subflow */ 1396 case SO_RECV_TRAFFIC_CLASS: /* MP + subflow */ 1397 case SO_PRIVILEGED_TRAFFIC_CLASS: /* MP + subflow */ 1398 case SO_RECV_ANYIF: /* MP + subflow */ 1399 case SO_RESTRICTIONS: /* MP + subflow */ 1400 case SO_FLUSH: /* MP + subflow */ 1401 /* 1402 * Tell the caller that these options are to be processed; 1403 * these will also be recorded later by mptcp_setopt(). 1404 * 1405 * NOTE: Only support integer option value for now. 1406 */ 1407 if (sopt->sopt_valsize != sizeof (int)) 1408 error = EINVAL; 1409 break; 1410 1411 default: 1412 /* 1413 * Tell the caller to stop immediately and return an error. 1414 */ 1415 error = ENOPROTOOPT; 1416 break; 1417 } 1418 1419 return (error); 1420} 1421 1422/* 1423 * Issue SOPT_SET for all MPTCP subflows (for integer option values.) 1424 */ 1425static int 1426mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo) 1427{ 1428 struct socket *mp_so; 1429 struct mptsub *mpts; 1430 struct mptopt smpo; 1431 int error = 0; 1432 1433 /* just bail now if this isn't applicable to subflow sockets */ 1434 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) { 1435 error = ENOPROTOOPT; 1436 goto out; 1437 } 1438 1439 /* 1440 * Skip those that are handled internally; these options 1441 * should not have been recorded and marked with the 1442 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case. 1443 */ 1444 if (mpo->mpo_level == SOL_SOCKET && 1445 (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) { 1446 error = ENOPROTOOPT; 1447 goto out; 1448 } 1449 1450 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1451 mp_so = mpte->mpte_mppcb->mpp_socket; 1452 1453 /* 1454 * Don't bother going further if there's no subflow; mark the option 1455 * with MPOF_INTERIM so that we know whether or not to remove this 1456 * option upon encountering an error while issuing it during subflow 1457 * socket creation. 1458 */ 1459 if (mpte->mpte_numflows == 0) { 1460 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows)); 1461 mpo->mpo_flags |= MPOF_INTERIM; 1462 /* return success */ 1463 goto out; 1464 } 1465 1466 bzero(&smpo, sizeof (smpo)); 1467 smpo.mpo_flags |= MPOF_SUBFLOW_OK; 1468 smpo.mpo_level = mpo->mpo_level; 1469 smpo.mpo_name = mpo->mpo_name; 1470 1471 /* grab exisiting values in case we need to rollback */ 1472 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 1473 struct socket *so; 1474 1475 MPTS_LOCK(mpts); 1476 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG); 1477 mpts->mpts_oldintval = 0; 1478 smpo.mpo_intval = 0; 1479 VERIFY(mpts->mpts_socket != NULL); 1480 so = mpts->mpts_socket; 1481 socket_lock(so, 0); 1482 if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) { 1483 mpts->mpts_flags |= MPTSF_SOPT_OLDVAL; 1484 mpts->mpts_oldintval = smpo.mpo_intval; 1485 } 1486 socket_unlock(so, 0); 1487 MPTS_UNLOCK(mpts); 1488 } 1489 1490 /* apply socket option */ 1491 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 1492 struct socket *so; 1493 1494 MPTS_LOCK(mpts); 1495 mpts->mpts_flags |= MPTSF_SOPT_INPROG; 1496 VERIFY(mpts->mpts_socket != NULL); 1497 so = mpts->mpts_socket; 1498 socket_lock(so, 0); 1499 error = mptcp_subflow_sosetopt(mpte, so, mpo); 1500 socket_unlock(so, 0); 1501 MPTS_UNLOCK(mpts); 1502 if (error != 0) 1503 break; 1504 } 1505 1506 /* cleanup, and rollback if needed */ 1507 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { 1508 struct socket *so; 1509 1510 MPTS_LOCK(mpts); 1511 if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) { 1512 /* clear in case it's set */ 1513 mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL; 1514 mpts->mpts_oldintval = 0; 1515 MPTS_UNLOCK(mpts); 1516 continue; 1517 } 1518 if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) { 1519 mpts->mpts_flags &= ~MPTSF_SOPT_INPROG; 1520 VERIFY(mpts->mpts_oldintval == 0); 1521 MPTS_UNLOCK(mpts); 1522 continue; 1523 } 1524 /* error during sosetopt, so roll it back */ 1525 if (error != 0) { 1526 VERIFY(mpts->mpts_socket != NULL); 1527 so = mpts->mpts_socket; 1528 socket_lock(so, 0); 1529 smpo.mpo_intval = mpts->mpts_oldintval; 1530 (void) mptcp_subflow_sosetopt(mpte, so, &smpo); 1531 socket_unlock(so, 0); 1532 } 1533 mpts->mpts_oldintval = 0; 1534 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG); 1535 MPTS_UNLOCK(mpts); 1536 } 1537 1538out: 1539 return (error); 1540} 1541 1542/* 1543 * Handle SOPT_SET for socket options issued on MP socket. 1544 */ 1545static int 1546mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) 1547{ 1548 int error = 0, optval, level, optname, rec = 1; 1549 struct mptopt smpo, *mpo = NULL; 1550 struct socket *mp_so; 1551 char buf[32]; 1552 1553 level = sopt->sopt_level; 1554 optname = sopt->sopt_name; 1555 1556 VERIFY(sopt->sopt_dir == SOPT_SET); 1557 VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP); 1558 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1559 mp_so = mpte->mpte_mppcb->mpp_socket; 1560 1561 /* 1562 * Record socket options which are applicable to subflow sockets so 1563 * that we can replay them for new ones; see mptcp_usr_socheckopt() 1564 * for the list of eligible socket-level options. 1565 */ 1566 if (level == SOL_SOCKET) { 1567 switch (optname) { 1568 case SO_DEBUG: 1569 case SO_KEEPALIVE: 1570 case SO_USELOOPBACK: 1571 case SO_RANDOMPORT: 1572 case SO_TRAFFIC_CLASS: 1573 case SO_RECV_TRAFFIC_CLASS: 1574 case SO_PRIVILEGED_TRAFFIC_CLASS: 1575 case SO_RECV_ANYIF: 1576 case SO_RESTRICTIONS: 1577 /* record it */ 1578 break; 1579 case SO_FLUSH: 1580 /* don't record it */ 1581 rec = 0; 1582 break; 1583 default: 1584 /* nothing to do; just return success */ 1585 goto out; 1586 } 1587 } else { 1588 switch (optname) { 1589 case TCP_NODELAY: 1590 case TCP_RXT_FINDROP: 1591 case TCP_KEEPALIVE: 1592 case TCP_KEEPINTVL: 1593 case TCP_KEEPCNT: 1594 case TCP_CONNECTIONTIMEOUT: 1595 case TCP_RXT_CONNDROPTIME: 1596 case PERSIST_TIMEOUT: 1597 /* eligible; record it */ 1598 break; 1599 default: 1600 /* not eligible */ 1601 error = ENOPROTOOPT; 1602 goto out; 1603 } 1604 } 1605 1606 if ((error = sooptcopyin(sopt, &optval, sizeof (optval), 1607 sizeof (optval))) != 0) 1608 goto out; 1609 1610 if (rec) { 1611 /* search for an existing one; if not found, allocate */ 1612 if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL) 1613 mpo = mptcp_sopt_alloc(M_WAITOK); 1614 1615 if (mpo == NULL) { 1616 error = ENOBUFS; 1617 } else { 1618 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s " 1619 "val %d %s\n", __func__, 1620 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 1621 mptcp_sopt2str(level, optname, buf, 1622 sizeof (buf)), optval, 1623 (mpo->mpo_flags & MPOF_ATTACHED) ? 1624 "updated" : "recorded")); 1625 1626 /* initialize or update, as needed */ 1627 mpo->mpo_intval = optval; 1628 if (!(mpo->mpo_flags & MPOF_ATTACHED)) { 1629 mpo->mpo_level = level; 1630 mpo->mpo_name = optname; 1631 mptcp_sopt_insert(mpte, mpo); 1632 } 1633 VERIFY(mpo->mpo_flags & MPOF_ATTACHED); 1634 /* this can be issued on the subflow socket */ 1635 mpo->mpo_flags |= MPOF_SUBFLOW_OK; 1636 } 1637 } else { 1638 bzero(&smpo, sizeof (smpo)); 1639 mpo = &smpo; 1640 mpo->mpo_flags |= MPOF_SUBFLOW_OK; 1641 mpo->mpo_level = level; 1642 mpo->mpo_name = optname; 1643 mpo->mpo_intval = optval; 1644 } 1645 VERIFY(mpo == NULL || error == 0); 1646 1647 /* issue this socket option on existing subflows */ 1648 if (error == 0) { 1649 error = mptcp_setopt_apply(mpte, mpo); 1650 if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) { 1651 VERIFY(mpo != &smpo); 1652 mptcp_sopt_remove(mpte, mpo); 1653 mptcp_sopt_free(mpo); 1654 } 1655 if (mpo == &smpo) 1656 mpo->mpo_flags &= ~MPOF_INTERIM; 1657 } 1658out: 1659 if (error == 0 && mpo != NULL) { 1660 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d set %s\n", 1661 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 1662 mptcp_sopt2str(level, optname, buf, 1663 sizeof (buf)), optval, (mpo->mpo_flags & MPOF_INTERIM) ? 1664 "pending" : "successful")); 1665 } else if (error != 0) { 1666 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s can't be issued " 1667 "error %d\n", __func__, 1668 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mptcp_sopt2str(level, 1669 optname, buf, sizeof (buf)), error)); 1670 } 1671 return (error); 1672} 1673 1674/* 1675 * Handle SOPT_GET for socket options issued on MP socket. 1676 */ 1677static int 1678mptcp_getopt(struct mptses *mpte, struct sockopt *sopt) 1679{ 1680 int error = 0, optval; 1681 1682 VERIFY(sopt->sopt_dir == SOPT_GET); 1683 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1684 1685 /* 1686 * We only handle SOPT_GET for TCP level socket options; we should 1687 * not get here for socket level options since they are already 1688 * handled at the socket layer. 1689 */ 1690 if (sopt->sopt_level != IPPROTO_TCP) { 1691 error = ENOPROTOOPT; 1692 goto out; 1693 } 1694 1695 switch (sopt->sopt_name) { 1696 case TCP_NODELAY: 1697 case TCP_RXT_FINDROP: 1698 case TCP_KEEPALIVE: 1699 case TCP_KEEPINTVL: 1700 case TCP_KEEPCNT: 1701 case TCP_CONNECTIONTIMEOUT: 1702 case TCP_RXT_CONNDROPTIME: 1703 case PERSIST_TIMEOUT: 1704 /* eligible; get the default value just in case */ 1705 error = mptcp_default_tcp_optval(mpte, sopt, &optval); 1706 break; 1707 default: 1708 /* not eligible */ 1709 error = ENOPROTOOPT; 1710 break; 1711 } 1712 1713 /* 1714 * Search for a previously-issued TCP level socket option and 1715 * return the recorded option value. This assumes that the 1716 * value did not get modified by the lower layer after it was 1717 * issued at setsockopt(2) time. If not found, we'll return 1718 * the default value obtained ealier. 1719 */ 1720 if (error == 0) { 1721 struct mptopt *mpo; 1722 1723 if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL) 1724 optval = mpo->mpo_intval; 1725 1726 error = sooptcopyout(sopt, &optval, sizeof (int)); 1727 } 1728out: 1729 return (error); 1730} 1731 1732/* 1733 * Return default values for TCP socket options. Ideally we would query the 1734 * subflow TCP socket, but that requires creating a subflow socket before 1735 * connectx(2) time. To simplify things, just return the default values 1736 * that we know of. 1737 */ 1738static int 1739mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval) 1740{ 1741 int error = 0; 1742 1743 VERIFY(sopt->sopt_level == IPPROTO_TCP); 1744 VERIFY(sopt->sopt_dir == SOPT_GET); 1745 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1746 1747 /* try to do what tcp_newtcpcb() does */ 1748 switch (sopt->sopt_name) { 1749 case TCP_NODELAY: 1750 case TCP_RXT_FINDROP: 1751 case TCP_KEEPINTVL: 1752 case TCP_KEEPCNT: 1753 case TCP_CONNECTIONTIMEOUT: 1754 case TCP_RXT_CONNDROPTIME: 1755 *optval = 0; 1756 break; 1757 1758 case TCP_KEEPALIVE: 1759 *optval = mptcp_subflow_keeptime; 1760 break; 1761 1762 case PERSIST_TIMEOUT: 1763 *optval = tcp_max_persist_timeout; 1764 break; 1765 1766 default: 1767 error = ENOPROTOOPT; 1768 break; 1769 } 1770 return (error); 1771} 1772 1773/* 1774 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP 1775 * socket, at SOL_SOCKET and IPPROTO_TCP levels. The former is restricted 1776 * to those that are allowed by mptcp_usr_socheckopt(). 1777 */ 1778int 1779mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt) 1780{ 1781 struct mppcb *mpp = sotomppcb(mp_so); 1782 struct mptses *mpte; 1783 int error = 0; 1784 1785 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { 1786 error = EINVAL; 1787 goto out; 1788 } 1789 mpte = mptompte(mpp); 1790 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ 1791 1792 /* we only handle socket and TCP-level socket options for MPTCP */ 1793 if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) { 1794 char buf[32]; 1795 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s level not " 1796 "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), 1797 mptcp_sopt2str(sopt->sopt_level, 1798 sopt->sopt_name, buf, sizeof (buf)))); 1799 error = EINVAL; 1800 goto out; 1801 } 1802 1803 switch (sopt->sopt_dir) { 1804 case SOPT_SET: 1805 error = mptcp_setopt(mpte, sopt); 1806 break; 1807 1808 case SOPT_GET: 1809 error = mptcp_getopt(mpte, sopt); 1810 break; 1811 } 1812out: 1813 return (error); 1814} 1815 1816/* 1817 * Return a string representation of <sopt_level,sopt_name> 1818 */ 1819const char * 1820mptcp_sopt2str(int level, int optname, char *dst, int size) 1821{ 1822 char lbuf[32], obuf[32]; 1823 const char *l = lbuf, *o = obuf; 1824 1825 (void) snprintf(lbuf, sizeof (lbuf), "0x%x", level); 1826 (void) snprintf(obuf, sizeof (obuf), "0x%x", optname); 1827 1828 switch (level) { 1829 case SOL_SOCKET: 1830 l = "SOL_SOCKET"; 1831 switch (optname) { 1832 case SO_LINGER: 1833 o = "SO_LINGER"; 1834 break; 1835 case SO_LINGER_SEC: 1836 o = "SO_LINGER_SEC"; 1837 break; 1838 case SO_DEBUG: 1839 o = "SO_DEBUG"; 1840 break; 1841 case SO_KEEPALIVE: 1842 o = "SO_KEEPALIVE"; 1843 break; 1844 case SO_USELOOPBACK: 1845 o = "SO_USELOOPBACK"; 1846 break; 1847 case SO_TYPE: 1848 o = "SO_TYPE"; 1849 break; 1850 case SO_NREAD: 1851 o = "SO_NREAD"; 1852 break; 1853 case SO_NWRITE: 1854 o = "SO_NWRITE"; 1855 break; 1856 case SO_ERROR: 1857 o = "SO_ERROR"; 1858 break; 1859 case SO_SNDBUF: 1860 o = "SO_SNDBUF"; 1861 break; 1862 case SO_RCVBUF: 1863 o = "SO_RCVBUF"; 1864 break; 1865 case SO_SNDLOWAT: 1866 o = "SO_SNDLOWAT"; 1867 break; 1868 case SO_RCVLOWAT: 1869 o = "SO_RCVLOWAT"; 1870 break; 1871 case SO_SNDTIMEO: 1872 o = "SO_SNDTIMEO"; 1873 break; 1874 case SO_RCVTIMEO: 1875 o = "SO_RCVTIMEO"; 1876 break; 1877 case SO_NKE: 1878 o = "SO_NKE"; 1879 break; 1880 case SO_NOSIGPIPE: 1881 o = "SO_NOSIGPIPE"; 1882 break; 1883 case SO_NOADDRERR: 1884 o = "SO_NOADDRERR"; 1885 break; 1886 case SO_RESTRICTIONS: 1887 o = "SO_RESTRICTIONS"; 1888 break; 1889 case SO_LABEL: 1890 o = "SO_LABEL"; 1891 break; 1892 case SO_PEERLABEL: 1893 o = "SO_PEERLABEL"; 1894 break; 1895 case SO_RANDOMPORT: 1896 o = "SO_RANDOMPORT"; 1897 break; 1898 case SO_TRAFFIC_CLASS: 1899 o = "SO_TRAFFIC_CLASS"; 1900 break; 1901 case SO_RECV_TRAFFIC_CLASS: 1902 o = "SO_RECV_TRAFFIC_CLASS"; 1903 break; 1904 case SO_TRAFFIC_CLASS_DBG: 1905 o = "SO_TRAFFIC_CLASS_DBG"; 1906 break; 1907 case SO_PRIVILEGED_TRAFFIC_CLASS: 1908 o = "SO_PRIVILEGED_TRAFFIC_CLASS"; 1909 break; 1910 case SO_DEFUNCTOK: 1911 o = "SO_DEFUNCTOK"; 1912 break; 1913 case SO_ISDEFUNCT: 1914 o = "SO_ISDEFUNCT"; 1915 break; 1916 case SO_OPPORTUNISTIC: 1917 o = "SO_OPPORTUNISTIC"; 1918 break; 1919 case SO_FLUSH: 1920 o = "SO_FLUSH"; 1921 break; 1922 case SO_RECV_ANYIF: 1923 o = "SO_RECV_ANYIF"; 1924 break; 1925 } 1926 break; 1927 case IPPROTO_TCP: 1928 l = "IPPROTO_TCP"; 1929 switch (optname) { 1930 case TCP_KEEPALIVE: 1931 o = "TCP_KEEPALIVE"; 1932 break; 1933 case TCP_KEEPINTVL: 1934 o = "TCP_KEEPINTVL"; 1935 break; 1936 case TCP_KEEPCNT: 1937 o = "TCP_KEEPCNT"; 1938 break; 1939 case TCP_CONNECTIONTIMEOUT: 1940 o = "TCP_CONNECTIONTIMEOUT"; 1941 break; 1942 case TCP_RXT_CONNDROPTIME: 1943 o = "TCP_RXT_CONNDROPTIME"; 1944 break; 1945 case PERSIST_TIMEOUT: 1946 o = "PERSIST_TIMEOUT"; 1947 break; 1948 } 1949 break; 1950 } 1951 1952 (void) snprintf(dst, size, "<%s,%s>", l, o); 1953 return (dst); 1954} 1955