1/*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 5 * The Regents of the University of California. All rights reserved. 6 * Copyright (c) 2004 The FreeBSD Foundation. All rights reserved. 7 * Copyright (c) 2004-2008 Robert N. M. Watson. All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 3. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c 34 */ 35 36/* 37 * 38 * Copyright (c) 2010 Isilon Systems, Inc. 
39 * Copyright (c) 2010 iX Systems, Inc. 40 * Copyright (c) 2010 Panasas, Inc. 41 * All rights reserved. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice unmodified, this list of conditions, and the following 48 * disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 53 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 54 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 55 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 56 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 57 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 58 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 59 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 60 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 61 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 62 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>

#include "sdp.h"

#include <net/if.h>
#include <net/route.h>
#include <net/vnet.h>
#include <sys/sysctl.h>

/* Zone from which struct sdp_sock PCBs are allocated. */
uma_zone_t sdp_zone;
/* Protects sdp_list and sdp_count. */
struct rwlock sdp_lock;
/* Global list of all SDP PCBs. */
LIST_HEAD(, sdp_sock) sdp_list;

/* Receive-completion workqueue shared by all SDP sockets. */
struct workqueue_struct *rx_comp_wq;

RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
#define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
#define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
#define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
#define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
#define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
#define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
#define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)

MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");

static void sdp_stop_keepalive_timer(struct socket *so);

/*
 * SDP protocol interface to socket abstraction.
 */
/*
 * sdp_sendspace and sdp_recvspace are the default send and receive window
 * sizes, respectively.
 */
u_long	sdp_sendspace = 1024*32;
u_long	sdp_recvspace = 1024*64;

/* Number of PCBs currently on sdp_list; protected by sdp_lock. */
static int sdp_count;

/*
 * Disable async. CMA events for sockets which are being torn down.
 *
 * Destroys and clears the socket's rdma_cm_id, if any; a no-op when no
 * id has been created yet.
 */
static void
sdp_destroy_cma(struct sdp_sock *ssk)
{

	if (ssk->id == NULL)
		return;
	rdma_destroy_id(ssk->id);
	ssk->id = NULL;
}

/*
 * Bind the PCB to a local address/port via the RDMA CM.  nam may be NULL,
 * in which case an INADDR_ANY/ephemeral-port binding is requested.
 *
 * Called with the PCB write lock held; the lock is dropped around the
 * (potentially sleeping) rdma_create_id()/rdma_bind_addr() calls and
 * reacquired before return.  Returns 0 on success or a positive errno.
 * NOTE(review): 'cred' is currently unused here — confirm intended.
 */
static int
sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
{
	struct sockaddr_in *sin;
	struct sockaddr_in null;
	int error;

	SDP_WLOCK_ASSERT(ssk);

	/* Refuse to re-bind an already-bound socket. */
	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
		return (EINVAL);
	/* rdma_bind_addr handles bind races.  */
	SDP_WUNLOCK(ssk);
	if (ssk->id == NULL)
		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
	if (ssk->id == NULL) {
		SDP_WLOCK(ssk);
		return (ENOMEM);
	}
	if (nam == NULL) {
		/* No address supplied: bind to INADDR_ANY, port 0. */
		null.sin_family = AF_INET;
		null.sin_len = sizeof(null);
		null.sin_addr.s_addr = INADDR_ANY;
		null.sin_port = 0;
		bzero(&null.sin_zero, sizeof(null.sin_zero));
		nam = (struct sockaddr *)&null;
	}
	/* rdma_* returns negative errno; flip the sign for BSD convention. */
	error = -rdma_bind_addr(ssk->id, nam);
	SDP_WLOCK(ssk);
	if (error == 0) {
		/* Record the address/port the CM actually selected. */
		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
		ssk->laddr = sin->sin_addr.s_addr;
		ssk->lport = sin->sin_port;
	} else
		sdp_destroy_cma(ssk);
	return (error);
}

/*
 * Final teardown of a PCB: unlink it from the global list and release all
 * RDMA and lock resources.  The caller must hold the PCB write lock; the
 * socket must already be detached and SDP_DESTROY must not yet be set.
 * The PCB memory is returned to sdp_zone — 'ssk' is invalid on return.
 */
static void
sdp_pcbfree(struct sdp_sock *ssk)
{

	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
	KASSERT((ssk->flags & SDP_DESTROY) == 0,
	    ("ssk %p already destroyed", ssk));

	sdp_dbg(ssk->socket, "Freeing pcb");
	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_DESTROY;
	SDP_WUNLOCK(ssk);
	SDP_LIST_WLOCK();
	sdp_count--;
	LIST_REMOVE(ssk, list);
	SDP_LIST_WUNLOCK();
	crfree(ssk->cred);
	ssk->qp_active = 0;
	if (ssk->qp) {
		ib_destroy_qp(ssk->qp);
		ssk->qp = NULL;
	}
	sdp_tx_ring_destroy(ssk);
	sdp_rx_ring_destroy(ssk);
	sdp_destroy_cma(ssk);
	rw_destroy(&ssk->rx_ring.destroyed_lock);
	rw_destroy(&ssk->lock);
	uma_zfree(sdp_zone, ssk);
}

/*
 * Common routines to return a socket address.
 */

/*
 * Allocate and fill in a sockaddr_in for the given port/address pair
 * (both already in network byte order).  The caller owns the returned
 * M_SONAME allocation and must free it.
 */
static struct sockaddr *
sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME,
	    M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

/*
 * pru_sockaddr: return the local address/port of the socket in *nam.
 * Snapshot is taken under the PCB read lock.
 */
static int
sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->lport;
	addr.s_addr = ssk->laddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

/*
 * pru_peeraddr: return the foreign address/port of the socket in *nam.
 * Snapshot is taken under the PCB read lock.
 */
static int
sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct sdp_sock *ssk;
	struct in_addr addr;
	in_port_t port;

	ssk = sdp_sk(so);
	SDP_RLOCK(ssk);
	port = ssk->fport;
	addr.s_addr = ssk->faddr;
	SDP_RUNLOCK(ssk);

	*nam = sdp_sockaddr(port, &addr);
	return 0;
}

/*
 * Invoke 'notify' on every attached PCB connected to foreign address
 * 'faddr'.  The callback is entered with the PCB write-locked; by
 * convention it returns non-NULL when it left the lock held (we release
 * it) and NULL when it already dropped it.
 *
 * NOTE(review): when SDP_DESTROY is set the write lock acquired above is
 * never released before the loop continues — this looks like a lock leak;
 * confirm whether that path is actually reachable.
 */
static void
sdp_pcbnotifyall(struct in_addr faddr, int errno,
    struct sdp_sock *(*notify)(struct sdp_sock *, int))
{
	struct sdp_sock *ssk, *ssk_temp;

	SDP_LIST_WLOCK();
	LIST_FOREACH_SAFE(ssk, &sdp_list, list, ssk_temp) {
		SDP_WLOCK(ssk);
		if (ssk->faddr != faddr.s_addr || ssk->socket == NULL) {
			SDP_WUNLOCK(ssk);
			continue;
		}
		if ((ssk->flags & SDP_DESTROY) == 0)
			if ((*notify)(ssk, errno))
				SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
}

#if 0
/* Apply 'func' to every PCB under the list read lock (currently unused). */
static void
sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
{
	struct sdp_sock *ssk;

	SDP_LIST_RLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		SDP_WLOCK(ssk);
		func(ssk, arg);
		SDP_WUNLOCK(ssk);
	}
	SDP_LIST_RUNLOCK();
}
#endif

/*
 * Abort the connection at the RDMA level and move to TCPS_CLOSED.  The
 * PCB write lock is dropped around the (sleeping) rdma_disconnect() call;
 * qp_active is cleared first so completions stop being posted.
 */
static void
sdp_output_reset(struct sdp_sock *ssk)
{
	struct rdma_cm_id *id;

	SDP_WLOCK_ASSERT(ssk);
	if (ssk->id) {
		id = ssk->id;
		ssk->qp_active = 0;
		SDP_WUNLOCK(ssk);
		rdma_disconnect(id);
		SDP_WLOCK(ssk);
	}
	ssk->state = TCPS_CLOSED;
}

/*
 * Attempt to close a SDP socket, marking it as dropped, and freeing
 * the socket if we hold the only reference.
 *
 * Returns NULL (with the PCB unlocked) when the SDP_SOCKREF reference was
 * the last one and the socket was freed; otherwise returns 'ssk' with the
 * write lock still held.
 */
static struct sdp_sock *
sdp_closed(struct sdp_sock *ssk)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);

	ssk->flags |= SDP_DROPPED;
	so = ssk->socket;
	soisdisconnected(so);
	if (ssk->flags & SDP_SOCKREF) {
		KASSERT(so->so_state & SS_PROTOREF,
		    ("sdp_closed: !SS_PROTOREF"));
		ssk->flags &= ~SDP_SOCKREF;
		SDP_WUNLOCK(ssk);
		SOCK_LOCK(so);
		so->so_state &= ~SS_PROTOREF;
		/* sofree() drops the socket lock. */
		sofree(so);
		return (NULL);
	}
	return (ssk);
}

/*
 * Perform timer based shutdowns which can not operate in
 * callout context.
 *
 * Runs from taskqueue_thread; clears the TIMEWAIT/DREQWAIT flags, resets
 * the connection if a disconnect is pending, and frees the PCB when the
 * socket has already detached.
 */
static void
sdp_shutdown_task(void *data, int pending)
{
	struct sdp_sock *ssk;

	ssk = data;
	SDP_WLOCK(ssk);
	/*
	 * I don't think this can race with another call to pcbfree()
	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
	 */
	if (ssk->flags & SDP_DESTROY)
		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
		    ssk);
	if (ssk->flags & SDP_DISCON)
		sdp_output_reset(ssk);
	/* We have to clear this so sdp_detach() will call pcbfree(). */
	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
	if ((ssk->flags & SDP_DROPPED) == 0 &&
	    sdp_closed(ssk) == NULL)
		/* sdp_closed() freed the socket and dropped the lock. */
		return;
	if (ssk->socket == NULL) {
		sdp_pcbfree(ssk);
		return;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * 2msl has expired, schedule the shutdown task.
 *
 * Callout handler; the callout was initialized CALLOUT_RETURNUNLOCKED on
 * the PCB lock, so we enter with the write lock held and must release it
 * before returning.
 */
static void
sdp_2msl_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Should be impossible, defensive programming. */
	if ((ssk->flags & SDP_TIMEWAIT) == 0)
		goto out;
	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
out:
	SDP_WUNLOCK(ssk);
	return;
}

/*
 * Schedule the 2msl wait timer.
 *
 * Marks the PCB TIMEWAIT, disconnects the socket, and arms keep2msl to
 * fire sdp_2msl_timeout() after TCPTV_MSL ticks.
 */
static void
sdp_2msl_wait(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	ssk->flags |= SDP_TIMEWAIT;
	ssk->state = TCPS_TIME_WAIT;
	soisdisconnected(ssk->socket);
	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
}

/*
 * Timed out waiting for the final fin/ack from rdma_disconnect().
 *
 * Callout handler (CALLOUT_RETURNUNLOCKED): entered with the PCB write
 * lock held, must release it on every path.  Converts DREQWAIT into
 * DISCON and enters the 2msl wait.
 */
static void
sdp_dreq_timeout(void *data)
{
	struct sdp_sock *ssk;

	ssk = data;
	/* Callout canceled. */
	if (!callout_active(&ssk->keep2msl))
		goto out;
	/* Callout rescheduled, probably as a different timer. */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
		goto out;
	if ((ssk->flags & SDP_DREQWAIT) == 0)
		goto out;
	ssk->flags &= ~SDP_DREQWAIT;
	ssk->flags |= SDP_DISCON;
	sdp_2msl_wait(ssk);
	ssk->qp_active = 0;
out:
	SDP_WUNLOCK(ssk);
}

/*
 * Received the final fin/ack.  Cancel the 2msl.
 */
/*
 * Peer completed the disconnect handshake before the DREQ timer fired:
 * drop the DREQWAIT state and move straight to the 2msl wait.  Caller
 * holds the PCB write lock (required by sdp_2msl_wait()).
 */
void
sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
{
	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
	ssk->flags &= ~SDP_DREQWAIT;
	sdp_2msl_wait(ssk);
}

/*
 * One-time initialization of the per-socket SDP state: the 2msl callout
 * (tied to the PCB lock, CALLOUT_RETURNUNLOCKED), the shutdown task, and
 * the rx/tx rings.  Always returns 0.
 */
static int
sdp_init_sock(struct socket *sk)
{
	struct sdp_sock *ssk = sdp_sk(sk);

	sdp_dbg(sk, "%s\n", __func__);

	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
#ifdef SDP_ZCOPY
	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
	ssk->tx_ring.rdma_inflight = NULL;
#endif
	atomic_set(&ssk->mseq_ack, 0);
	sdp_rx_ring_init(ssk);
	ssk->tx_ring.buffer = NULL;

	return 0;
}

/*
 * Allocate an sdp_sock for the socket and reserve socket buffer space.
 *
 * pru_attach: reserves send/receive buffer space if not already done,
 * allocates and initializes the PCB, links it to the socket, and inserts
 * it on the global list.  Returns 0 or ENOBUFS/soreserve() error.
 */
static int
sdp_attach(struct socket *so, int proto, struct thread *td)
{
	struct sdp_sock *ssk;
	int error;

	ssk = sdp_sk(so);
	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
		error = soreserve(so, sdp_sendspace, sdp_recvspace);
		if (error)
			return (error);
	}
	so->so_rcv.sb_flags |= SB_AUTOSIZE;
	so->so_snd.sb_flags |= SB_AUTOSIZE;
	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
	if (ssk == NULL)
		return (ENOBUFS);
	rw_init(&ssk->lock, "sdpsock");
	ssk->socket = so;
	ssk->cred = crhold(so->so_cred);
	so->so_pcb = (caddr_t)ssk;
	sdp_init_sock(so);
	ssk->flags = 0;
	ssk->qp_active = 0;
	ssk->state = TCPS_CLOSED;
	mbufq_init(&ssk->rxctlq, INT_MAX);
	SDP_LIST_WLOCK();
	LIST_INSERT_HEAD(&sdp_list, ssk, list);
	sdp_count++;
	SDP_LIST_WUNLOCK();

	return (0);
}

/*
 * Detach SDP from the socket, potentially leaving it around for the
 * timewait to expire.
 */
/*
 * pru_detach: sever the socket<->PCB linkage.  If a timer still owns the
 * PCB (TIMEWAIT/DREQWAIT) it is left for sdp_shutdown_task() to free;
 * otherwise the PCB must already be dropped or never-connected and is
 * freed here.  Any other state is a bug.
 */
static void
sdp_detach(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
	ssk->socket->so_pcb = NULL;
	ssk->socket = NULL;
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
		SDP_WUNLOCK(ssk);
	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
		sdp_pcbfree(ssk);
	else
		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
}

/*
 * Allocate a local address for the socket.
 *
 * pru_bind: validates the AF_INET, non-multicast address and delegates to
 * sdp_pcbbind().  Returns 0 or a positive errno.
 */
static int
sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)nam;
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EINVAL);
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		return (EAFNOSUPPORT);

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	error = sdp_pcbbind(ssk, nam, td->td_ucred);
out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Prepare to accept connections.
 *
 * pru_listen: binds an ephemeral port if needed, flips the socket into
 * the listen state, and finally asks the CM to listen on the id.
 * NOTE(review): rdma_listen() is called after the PCB lock has been
 * dropped, reading ssk->id unlocked — presumably safe because no
 * concurrent teardown can run at listen time; confirm.
 */
static int
sdp_listen(struct socket *so, int backlog, struct thread *td)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = EINVAL;
		goto out;
	}
	if (error == 0 && ssk->lport == 0)
		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
	SOCK_LOCK(so);
	if (error == 0)
		error = solisten_proto_check(so);
	if (error == 0) {
		solisten_proto(so, backlog);
		ssk->state = TCPS_LISTEN;
	}
	SOCK_UNLOCK(so);

out:
	SDP_WUNLOCK(ssk);
	if (error == 0)
		error = -rdma_listen(ssk->id, backlog);
	return (error);
}

/*
 * Initiate a SDP connection to nam.
576 */ 577static int 578sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td) 579{ 580 struct sockaddr_in src; 581 struct socket *so; 582 int error; 583 584 so = ssk->socket; 585 586 SDP_WLOCK_ASSERT(ssk); 587 if (ssk->lport == 0) { 588 error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred); 589 if (error) 590 return error; 591 } 592 src.sin_family = AF_INET; 593 src.sin_len = sizeof(src); 594 bzero(&src.sin_zero, sizeof(src.sin_zero)); 595 src.sin_port = ssk->lport; 596 src.sin_addr.s_addr = ssk->laddr; 597 soisconnecting(so); 598 SDP_WUNLOCK(ssk); 599 error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam, 600 SDP_RESOLVE_TIMEOUT); 601 SDP_WLOCK(ssk); 602 if (error == 0) 603 ssk->state = TCPS_SYN_SENT; 604 605 return 0; 606} 607 608/* 609 * Initiate SDP connection. 610 */ 611static int 612sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 613{ 614 int error = 0; 615 struct sdp_sock *ssk; 616 struct sockaddr_in *sin; 617 618 sin = (struct sockaddr_in *)nam; 619 if (nam->sa_len != sizeof (*sin)) 620 return (EINVAL); 621 if (sin->sin_family != AF_INET) 622 return (EINVAL); 623 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) 624 return (EAFNOSUPPORT); 625 if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0) 626 return (error); 627 ssk = sdp_sk(so); 628 SDP_WLOCK(ssk); 629 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) 630 error = EINVAL; 631 else 632 error = sdp_start_connect(ssk, nam, td); 633 SDP_WUNLOCK(ssk); 634 return (error); 635} 636 637/* 638 * Drop a SDP socket, reporting 639 * the specified error. If connection is synchronized, 640 * then send a RST to peer. 
 */
/*
 * Abort the connection with error 'errno' (substituting any recorded
 * soft error for ETIMEDOUT), resetting the RDMA side if a SYN has been
 * received.  Returns sdp_closed()'s result: NULL when the socket was
 * freed, otherwise the still-locked PCB.
 */
static struct sdp_sock *
sdp_drop(struct sdp_sock *ssk, int errno)
{
	struct socket *so;

	SDP_WLOCK_ASSERT(ssk);
	so = ssk->socket;
	if (TCPS_HAVERCVDSYN(ssk->state))
		sdp_output_reset(ssk);
	if (errno == ETIMEDOUT && ssk->softerror)
		errno = ssk->softerror;
	so->so_error = errno;
	return (sdp_closed(ssk));
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it.  If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state.  In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
static void
sdp_usrclosed(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);

	switch (ssk->state) {
	case TCPS_LISTEN:
		ssk->state = TCPS_CLOSED;
		/* Lock dropped around the sleeping CM teardown. */
		SDP_WUNLOCK(ssk);
		sdp_destroy_cma(ssk);
		SDP_WLOCK(ssk);
		/* FALLTHROUGH */
	case TCPS_CLOSED:
		ssk = sdp_closed(ssk);
		/*
		 * sdp_closed() should never return NULL here as the socket is
		 * still open.
		 */
		KASSERT(ssk != NULL,
		    ("sdp_usrclosed: sdp_closed() returned NULL"));
		break;

	case TCPS_SYN_SENT:
		/* FALLTHROUGH */
	case TCPS_SYN_RECEIVED:
		ssk->flags |= SDP_NEEDFIN;
		break;

	case TCPS_ESTABLISHED:
		ssk->flags |= SDP_NEEDFIN;
		ssk->state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		ssk->state = TCPS_LAST_ACK;
		break;
	}
	if (ssk->state >= TCPS_FIN_WAIT_2) {
		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
		if (ssk->state == TCPS_FIN_WAIT_2)
			sdp_2msl_wait(ssk);
		else
			soisdisconnected(ssk->socket);
	}
}

/*
 * Send our FIN-equivalent (DREQ): arm the dreq timeout, mark the PCB as
 * needing a FIN and waiting for the peer's reply, and push out pending
 * sends.  Caller holds the PCB write lock.
 */
static void
sdp_output_disconnect(struct sdp_sock *ssk)
{

	SDP_WLOCK_ASSERT(ssk);
	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
	    sdp_dreq_timeout, ssk);
	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
	sdp_post_sends(ssk, M_NOWAIT);
}

/*
 * Initiate or continue a disconnect.
 * If embryonic state, just send reset (once).
 * If in ``let data drain'' option and linger null, just drop.
 * Otherwise (hard), mark socket disconnecting and drop
 * current input data; switch states based on user close, and
 * send segment to peer (with FIN).
 */
static void
sdp_start_disconnect(struct sdp_sock *ssk)
{
	struct socket *so;
	int unread;

	so = ssk->socket;
	SDP_WLOCK_ASSERT(ssk);
	sdp_stop_keepalive_timer(so);
	/*
	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
	 * socket is still open.
	 */
	if (ssk->state < TCPS_ESTABLISHED) {
		ssk = sdp_closed(ssk);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_close() returned NULL"));
	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
		ssk = sdp_drop(ssk, 0);
		KASSERT(ssk != NULL,
		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
	} else {
		soisdisconnecting(so);
		/* Unread receive data forces a hard reset, not a FIN. */
		unread = sbused(&so->so_rcv);
		sbflush(&so->so_rcv);
		sdp_usrclosed(ssk);
		if (!(ssk->flags & SDP_DROPPED)) {
			if (unread)
				sdp_output_reset(ssk);
			else
				sdp_output_disconnect(ssk);
		}
	}
}

/*
 * User initiated disconnect.
770 */ 771static int 772sdp_disconnect(struct socket *so) 773{ 774 struct sdp_sock *ssk; 775 int error = 0; 776 777 ssk = sdp_sk(so); 778 SDP_WLOCK(ssk); 779 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 780 error = ECONNRESET; 781 goto out; 782 } 783 sdp_start_disconnect(ssk); 784out: 785 SDP_WUNLOCK(ssk); 786 return (error); 787} 788 789/* 790 * Accept a connection. Essentially all the work is done at higher levels; 791 * just return the address of the peer, storing through addr. 792 * 793 * 794 * XXX This is broken XXX 795 * 796 * The rationale for acquiring the sdp lock here is somewhat complicated, 797 * and is described in detail in the commit log entry for r175612. Acquiring 798 * it delays an accept(2) racing with sonewconn(), which inserts the socket 799 * before the address/port fields are initialized. A better fix would 800 * prevent the socket from being placed in the listen queue until all fields 801 * are fully initialized. 802 */ 803static int 804sdp_accept(struct socket *so, struct sockaddr **nam) 805{ 806 struct sdp_sock *ssk = NULL; 807 struct in_addr addr; 808 in_port_t port; 809 int error; 810 811 if (so->so_state & SS_ISDISCONNECTED) 812 return (ECONNABORTED); 813 814 port = 0; 815 addr.s_addr = 0; 816 error = 0; 817 ssk = sdp_sk(so); 818 SDP_WLOCK(ssk); 819 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 820 error = ECONNABORTED; 821 goto out; 822 } 823 port = ssk->fport; 824 addr.s_addr = ssk->faddr; 825out: 826 SDP_WUNLOCK(ssk); 827 if (error == 0) 828 *nam = sdp_sockaddr(port, &addr); 829 return error; 830} 831 832/* 833 * Mark the connection as being incapable of further output. 
 */
/*
 * pru_shutdown: disable further sends, walk the close state machine, and
 * emit the DREQ if the connection was not dropped along the way.
 */
static int
sdp_shutdown(struct socket *so)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	socantsendmore(so);
	sdp_usrclosed(ssk);
	if (!(ssk->flags & SDP_DROPPED))
		sdp_output_disconnect(ssk);

out:
	SDP_WUNLOCK(ssk);

	return (error);
}

/*
 * Enqueue the packet-headed mbuf chain 'mb' ('cnt' is the number of
 * m_next links in it) on send buffer 'sb', coalescing it into the last
 * unsent record when the combined chain still fits one SDP packet.
 * Caller holds the sockbuf lock.
 */
static void
sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
{
	struct mbuf *n;
	int ncnt;

	SOCKBUF_LOCK_ASSERT(sb);
	SBLASTRECORDCHK(sb);
	KASSERT(mb->m_flags & M_PKTHDR,
	    ("sdp_append: %p Missing packet header.\n", mb));
	n = sb->sb_lastrecord;
	/*
	 * If the queue is empty just set all pointers and proceed.
	 */
	if (n == NULL) {
		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
		for (; mb; mb = mb->m_next) {
			sb->sb_mbtail = mb;
			sballoc(sb, mb);
		}
		return;
	}
	/*
	 * Count the number of mbufs in the current tail.
	 */
	for (ncnt = 0; n->m_next; n = n->m_next)
		ncnt++;
	n = sb->sb_lastrecord;
	/*
	 * If the two chains can fit in a single sdp packet and
	 * the last record has not been sent yet (WRITABLE) coalesce
	 * them.  The lastrecord remains the same but we must strip the
	 * packet header and then let sbcompress do the hard part.
	 */
	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
	    ssk->xmit_size_goal) {
		/* Drop mb's SDP header; its payload joins record 'n'. */
		m_adj(mb, SDP_HEAD_SIZE);
		n->m_pkthdr.len += mb->m_pkthdr.len;
		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
		m_demote(mb, 1, 0);
		sbcompress(sb, mb, sb->sb_mbtail);
		return;
	}
	/*
	 * Not compressible, just append to the end and adjust counters.
	 */
	sb->sb_lastrecord->m_flags |= M_PUSH;
	sb->sb_lastrecord->m_nextpkt = mb;
	sb->sb_lastrecord = mb;
	if (sb->sb_sndptr == NULL)
		sb->sb_sndptr = mb;
	for (; mb; mb = mb->m_next) {
		sb->sb_mbtail = mb;
		sballoc(sb, mb);
	}
}

/*
 * Do a send by putting data in output queue and updating urgent
 * marker if URG set.  Possibly send more data.  Unlike the other
 * pru_*() routines, the mbuf chains are our responsibility.  We
 * must either enqueue them or free them.  The other pru_* routines
 * generally are caller-frees.
 *
 * This comes from sendfile, normal sends will come from sdp_sosend().
 */
static int
sdp_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
	struct sdp_sock *ssk;
	struct mbuf *n;
	int error;
	int cnt;

	error = 0;
	ssk = sdp_sk(so);
	KASSERT(m->m_flags & M_PKTHDR,
	    ("sdp_send: %p no packet header", m));
	/* Prepend room for the SDP BSDH header and stamp it as data. */
	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA;
	/* Count chain links; collapse if it exceeds the SGE limit. */
	for (n = m, cnt = 0; n->m_next; n = n->m_next)
		cnt++;
	if (cnt > SDP_MAX_SEND_SGES) {
		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
		if (n == NULL) {
			m_freem(m);
			return (EMSGSIZE);
		}
		m = n;
		for (cnt = 0; n->m_next; n = n->m_next)
			cnt++;
	}
	SDP_WLOCK(ssk);
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		if (control)
			m_freem(control);
		if (m)
			m_freem(m);
		error = ECONNRESET;
		goto out;
	}
	if (control) {
		/* SDP doesn't support control messages.  */
		if (control->m_len) {
			m_freem(control);
			if (m)
				m_freem(m);
			error = EINVAL;
			goto out;
		}
		m_freem(control);	/* empty control, just free it */
	}
	if (!(flags & PRUS_OOB)) {
		SOCKBUF_LOCK(&so->so_snd);
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		if (flags & PRUS_EOF) {
			/*
			 * Close the send side of the connection after
			 * the data is sent.
			 */
			socantsendmore(so);
			sdp_usrclosed(ssk);
			if (!(ssk->flags & SDP_DROPPED))
				sdp_output_disconnect(ssk);
		} else if (!(ssk->flags & SDP_DROPPED) &&
		    !(flags & PRUS_MORETOCOME))
			sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	} else {
		SOCKBUF_LOCK(&so->so_snd);
		if (sbspace(&so->so_snd) < -512) {
			SOCKBUF_UNLOCK(&so->so_snd);
			m_freem(m);
			error = ENOBUFS;
			goto out;
		}
		/*
		 * According to RFC961 (Assigned Protocols),
		 * the urgent pointer points to the last octet
		 * of urgent data.  We continue, however,
		 * to consider it to indicate the first octet
		 * of data past the urgent section.
		 * Otherwise, snd_up should be one lower.
		 */
		m->m_flags |= M_URG | M_PUSH;
		sdp_append(ssk, &so->so_snd, m, cnt);
		SOCKBUF_UNLOCK(&so->so_snd);
		if (nam && ssk->state < TCPS_SYN_SENT) {
			/*
			 * Do implied connect if not yet connected.
			 */
			error = sdp_start_connect(ssk, nam, td);
			if (error)
				goto out;
		}
		sdp_post_sends(ssk, M_NOWAIT);
		SDP_WUNLOCK(ssk);
		return (0);
	}
out:
	SDP_WUNLOCK(ssk);
	return (error);
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.  Lock against other senders.  If must go
 * all at once and not enough room now, then inform user that this would
 * block and do nothing.  Otherwise, if nonblocking, send as much as
 * possible.  The data to be sent is described by "uio" if nonzero, otherwise
 * by the mbuf chain "top" (which must be null if uio is not).  Data provided
 * in mbuf chain must be small enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers must check for short
 * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
 * on return.
 */
static int
sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
	struct sdp_sock *ssk;
	long space, resid;
	int atomic;
	int error;
	int copy;

	if (uio != NULL)
		resid = uio->uio_resid;
	else
		resid = top->m_pkthdr.len;
	atomic = top != NULL;
	/* SDP has no control-message support; empty control is tolerated. */
	if (control != NULL) {
		if (control->m_len) {
			m_freem(control);
			if (top)
				m_freem(top);
			return (EINVAL);
		}
		m_freem(control);
		control = NULL;
	}
	/*
	 * In theory resid should be unsigned.  However, space must be
	 * signed, as it might be less than 0 if we over-committed, and we
	 * must use a signed comparison of space and resid.  On the other
	 * hand, a negative resid causes us to loop sending 0-length
	 * segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out;
	}
	if (td != NULL)
		td->td_ru.ru_msgsnd++;

	ssk = sdp_sk(so);
	/* Serialize against other senders on this socket. */
	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
	if (error)
		goto out;

restart:
	do {
		SOCKBUF_LOCK(&so->so_snd);
		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EPIPE;
			goto release;
		}
		if (so->so_error) {
			error = so->so_error;
			so->so_error = 0;
			SOCKBUF_UNLOCK(&so->so_snd);
			goto release;
		}
		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = ENOTCONN;
			goto release;
		}
		space = sbspace(&so->so_snd);
		if (flags & MSG_OOB)
			space += 1024;
		/* Atomic sends must fit into a single SDP payload. */
		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
			SOCKBUF_UNLOCK(&so->so_snd);
			error = EMSGSIZE;
			goto release;
		}
		if (space < resid &&
		    (atomic || space < so->so_snd.sb_lowat)) {
			if ((so->so_state & SS_NBIO) ||
			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
				SOCKBUF_UNLOCK(&so->so_snd);
				error = EWOULDBLOCK;
				goto release;
			}
			/* Wait for send space; sbwait drops/retakes sb lock. */
			error = sbwait(&so->so_snd);
			SOCKBUF_UNLOCK(&so->so_snd);
			if (error)
				goto release;
			goto restart;
		}
		SOCKBUF_UNLOCK(&so->so_snd);
		do {
			if (uio == NULL) {
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				/*
				 * Copy the data from userland into a mbuf
				 * chain.  If no data is to be copied in,
				 * a single empty mbuf is returned.
				 */
				copy = min(space,
				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
				top = m_uiotombuf(uio, M_WAITOK, copy,
				    0, M_PKTHDR |
				    ((flags & MSG_EOR) ? M_EOR : 0));
				if (top == NULL) {
					/* only possible error */
					error = EFAULT;
					goto release;
				}
				space -= resid - uio->uio_resid;
				resid = uio->uio_resid;
			}
			/*
			 * XXX all the SBS_CANTSENDMORE checks previously
			 * done could be out of date after dropping the
			 * socket lock.
			 */
			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * Set EOF on the last send if the user specified
			     * MSG_EOF.
			     */
			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME. */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
			    top, addr, NULL, td);
			/* sdp_send() consumed 'top' whether or not it failed. */
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	sbunlock(&so->so_snd);
out:
	if (top != NULL)
		m_freem(top);
	return (error);
}

/*
 * The part of soreceive() that implements reading non-inline out-of-band
 * data from a socket.  For more complete comments, see soreceive(), from
 * which this code originated.
 *
 * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
 * unable to return an mbuf chain to the caller.
 */
static int
soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
{
	struct protosw *pr = so->so_proto;
	struct mbuf *m;
	int error;

	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));

	m = m_get(M_WAITOK, MT_DATA);
	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
	if (error)
		goto bad;
	do {
		error = uiomove(mtod(m, void *),
		    (int) min(uio->uio_resid, m->m_len), uio);
		m = m_free(m);
	} while (uio->uio_resid && error == 0 && m);
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Optimized version of soreceive() for stream (TCP) sockets.
 */
static int
sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	int len = 0, error = 0, flags, oresid;
	struct sockbuf *sb;
	struct mbuf *m, *n = NULL;
	struct sdp_sock *ssk;

	/* We only do stream sockets. */
	if (so->so_type != SOCK_STREAM)
		return (EINVAL);
	if (psa != NULL)
		*psa = NULL;
	/* Control data is not supported on this path. */
	if (controlp != NULL)
		return (EINVAL);
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;
	if (flags & MSG_OOB)
		return (soreceive_rcvoob(so, uio, flags));
	if (mp0 != NULL)
		*mp0 = NULL;

	sb = &so->so_rcv;
	ssk = sdp_sk(so);

	/* Prevent other readers from entering the socket. */
	error = sblock(sb, SBLOCKWAIT(flags));
	if (error)
		goto out;
	SOCKBUF_LOCK(sb);

	/* Easy one, no space to copyout anything. */
	if (uio->uio_resid == 0) {
		error = EINVAL;
		goto out;
	}
	/* Remember the requested length to detect partial progress later. */
	oresid = uio->uio_resid;

	/* We will never ever get anything unless we are connected. */
	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
		/* When disconnecting there may be still some data left. */
		if (sbavail(sb))
			goto deliver;
		if (!(so->so_state & SS_ISDISCONNECTED))
			error = ENOTCONN;
		goto out;
	}

	/* Socket buffer is empty and we shall not block. */
	if (sbavail(sb) == 0 &&
	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
		error = EAGAIN;
		goto out;
	}

restart:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);

	/* Abort if socket has reported problems. */
	if (so->so_error) {
		if (sbavail(sb))
			goto deliver;
		/* Data already delivered this call: report it, not the error. */
		if (oresid > uio->uio_resid)
			goto out;
		error = so->so_error;
		if (!(flags & MSG_PEEK))
			so->so_error = 0;
		goto out;
	}

	/* Door is closed.  Deliver what is left, if any. */
	if (sb->sb_state & SBS_CANTRCVMORE) {
		if (sbavail(sb))
			goto deliver;
		else
			goto out;
	}

	/* Socket buffer got some data that we shall deliver now. */
	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
	    ((so->so_state & SS_NBIO) ||
	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
	     sbavail(sb) >= sb->sb_lowat ||
	     sbavail(sb) >= uio->uio_resid ||
	     sbavail(sb) >= sb->sb_hiwat) ) {
		goto deliver;
	}

	/* On MSG_WAITALL we must wait until all data or error arrives. */
	if ((flags & MSG_WAITALL) &&
	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
		goto deliver;

	/*
	 * Wait and block until (more) data comes in.
	 * NB: Drops the sockbuf lock during wait.
	 */
	error = sbwait(sb);
	if (error)
		goto out;
	goto restart;

deliver:
	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));

	/* Statistics. */
	if (uio->uio_td)
		uio->uio_td->td_ru.ru_msgrcv++;

	/* Fill uio until full or current end of socket buffer is reached. */
	len = min(uio->uio_resid, sbavail(sb));
	if (mp0 != NULL) {
		/* Dequeue as many mbufs as possible. */
		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
			for (*mp0 = m = sb->sb_mb;
			     m != NULL && m->m_len <= len;
			     m = m->m_next) {
				len -= m->m_len;
				uio->uio_resid -= m->m_len;
				sbfree(sb, m);
				/* 'n' tracks the last dequeued mbuf. */
				n = m;
			}
			sb->sb_mb = m;
			if (sb->sb_mb == NULL)
				SB_EMPTY_FIXUP(sb);
			/* Terminate the chain handed back via *mp0. */
			n->m_next = NULL;
		}
		/* Copy the remainder. */
		if (len > 0) {
			KASSERT(sb->sb_mb != NULL,
			    ("%s: len > 0 && sb->sb_mb empty", __func__));

			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
			if (m == NULL)
				len = 0;	/* Don't flush data from sockbuf.
*/ 1365 else 1366 uio->uio_resid -= m->m_len; 1367 if (*mp0 != NULL) 1368 n->m_next = m; 1369 else 1370 *mp0 = m; 1371 if (*mp0 == NULL) { 1372 error = ENOBUFS; 1373 goto out; 1374 } 1375 } 1376 } else { 1377 /* NB: Must unlock socket buffer as uiomove may sleep. */ 1378 SOCKBUF_UNLOCK(sb); 1379 error = m_mbuftouio(uio, sb->sb_mb, len); 1380 SOCKBUF_LOCK(sb); 1381 if (error) 1382 goto out; 1383 } 1384 SBLASTRECORDCHK(sb); 1385 SBLASTMBUFCHK(sb); 1386 1387 /* 1388 * Remove the delivered data from the socket buffer unless we 1389 * were only peeking. 1390 */ 1391 if (!(flags & MSG_PEEK)) { 1392 if (len > 0) 1393 sbdrop_locked(sb, len); 1394 1395 /* Notify protocol that we drained some data. */ 1396 SOCKBUF_UNLOCK(sb); 1397 SDP_WLOCK(ssk); 1398 sdp_do_posts(ssk); 1399 SDP_WUNLOCK(ssk); 1400 SOCKBUF_LOCK(sb); 1401 } 1402 1403 /* 1404 * For MSG_WAITALL we may have to loop again and wait for 1405 * more data to come in. 1406 */ 1407 if ((flags & MSG_WAITALL) && uio->uio_resid > 0) 1408 goto restart; 1409out: 1410 SOCKBUF_LOCK_ASSERT(sb); 1411 SBLASTRECORDCHK(sb); 1412 SBLASTMBUFCHK(sb); 1413 SOCKBUF_UNLOCK(sb); 1414 sbunlock(sb); 1415 return (error); 1416} 1417 1418/* 1419 * Abort is used to teardown a connection typically while sitting in 1420 * the accept queue. 1421 */ 1422void 1423sdp_abort(struct socket *so) 1424{ 1425 struct sdp_sock *ssk; 1426 1427 ssk = sdp_sk(so); 1428 SDP_WLOCK(ssk); 1429 /* 1430 * If we have not yet dropped, do it now. 1431 */ 1432 if (!(ssk->flags & SDP_TIMEWAIT) && 1433 !(ssk->flags & SDP_DROPPED)) 1434 sdp_drop(ssk, ECONNABORTED); 1435 KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X", 1436 ssk, ssk->flags)); 1437 SDP_WUNLOCK(ssk); 1438} 1439 1440/* 1441 * Close a SDP socket and initiate a friendly disconnect. 1442 */ 1443static void 1444sdp_close(struct socket *so) 1445{ 1446 struct sdp_sock *ssk; 1447 1448 ssk = sdp_sk(so); 1449 SDP_WLOCK(ssk); 1450 /* 1451 * If we have not yet dropped, do it now. 
	 */
	if (!(ssk->flags & SDP_TIMEWAIT) &&
	    !(ssk->flags & SDP_DROPPED))
		sdp_start_disconnect(ssk);

	/*
	 * If we've still not dropped let the socket layer know we're
	 * holding on to the socket and pcb for a while.
	 */
	if (!(ssk->flags & SDP_DROPPED)) {
		SOCK_LOCK(so);
		so->so_state |= SS_PROTOREF;
		SOCK_UNLOCK(so);
		ssk->flags |= SDP_SOCKREF;
	}
	SDP_WUNLOCK(ssk);
}

/*
 * User requests out-of-band data.
 *
 * Copies the single saved urgent byte (ssk->iobc) into the caller's
 * mbuf.  Returns ECONNRESET if the connection is gone, EINVAL if OOB
 * data is inline or already consumed, EWOULDBLOCK if none has arrived.
 */
static int
sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
{
	int error = 0;
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	SDP_WLOCK(ssk);
	if (!rx_ring_trylock(&ssk->rx_ring)) {
		SDP_WUNLOCK(ssk);
		return (ECONNRESET);
	}
	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
		error = ECONNRESET;
		goto out;
	}
	if ((so->so_oobmark == 0 &&
	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
	    so->so_options & SO_OOBINLINE ||
	    ssk->oobflags & SDP_HADOOB) {
		error = EINVAL;
		goto out;
	}
	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
		error = EWOULDBLOCK;
		goto out;
	}
	m->m_len = 1;
	*mtod(m, caddr_t) = ssk->iobc;
	/* Unless peeking, flip HAVEOOB off and HADOOB on (consumed). */
	if ((flags & MSG_PEEK) == 0)
		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
out:
	rx_ring_unlock(&ssk->rx_ring);
	SDP_WUNLOCK(ssk);
	return (error);
}

/*
 * Record arrival of urgent data in 'mb': set the OOB mark and, unless
 * SO_OOBINLINE, pull the final byte of the chain out of band into
 * ssk->iobc for later retrieval by sdp_rcvoob().
 */
void
sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
{
	struct mbuf *m;
	struct socket *so;

	so = ssk->socket;
	if (so == NULL)
		return;

	/* The urgent byte is the last byte of this chain. */
	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
	sohasoutofband(so);
	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
	if (!(so->so_options & SO_OOBINLINE)) {
		/* Walk to the last mbuf and strip its final byte. */
		for (m = mb; m->m_next != NULL; m = m->m_next);
		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
		ssk->oobflags |= SDP_HAVEOOB;
		m->m_len--;
		mb->m_pkthdr.len--;
	}
}

/*
 * Notify a
sdp socket of an asynchronous error. 1534 * 1535 * Do not wake up user since there currently is no mechanism for 1536 * reporting soft errors (yet - a kqueue filter may be added). 1537 */ 1538struct sdp_sock * 1539sdp_notify(struct sdp_sock *ssk, int error) 1540{ 1541 1542 SDP_WLOCK_ASSERT(ssk); 1543 1544 if ((ssk->flags & SDP_TIMEWAIT) || 1545 (ssk->flags & SDP_DROPPED)) 1546 return (ssk); 1547 1548 /* 1549 * Ignore some errors if we are hooked up. 1550 */ 1551 if (ssk->state == TCPS_ESTABLISHED && 1552 (error == EHOSTUNREACH || error == ENETUNREACH || 1553 error == EHOSTDOWN)) 1554 return (ssk); 1555 ssk->softerror = error; 1556 return sdp_drop(ssk, error); 1557} 1558 1559static void 1560sdp_ctlinput(int cmd, struct sockaddr *sa, void *vip) 1561{ 1562 struct in_addr faddr; 1563 1564 faddr = ((struct sockaddr_in *)sa)->sin_addr; 1565 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 1566 return; 1567 1568 sdp_pcbnotifyall(faddr, inetctlerrmap[cmd], sdp_notify); 1569} 1570 1571static int 1572sdp_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, 1573 struct thread *td) 1574{ 1575 return (EOPNOTSUPP); 1576} 1577 1578static void 1579sdp_keepalive_timeout(void *data) 1580{ 1581 struct sdp_sock *ssk; 1582 1583 ssk = data; 1584 /* Callout canceled. */ 1585 if (!callout_active(&ssk->keep2msl)) 1586 return; 1587 /* Callout rescheduled as a different kind of timer. 
	 */
	if (callout_pending(&ssk->keep2msl))
		goto out;
	callout_deactivate(&ssk->keep2msl);
	/* Stop if the connection dropped or keepalives were switched off. */
	if (ssk->flags & SDP_DROPPED ||
	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
		goto out;
	sdp_post_keepalive(ssk);
	/* Re-arm for the next keepalive interval. */
	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
	    sdp_keepalive_timeout, ssk);
out:
	SDP_WUNLOCK(ssk);
}


/*
 * Arm the keepalive callout for this socket unless it is already
 * pending.
 */
void
sdp_start_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	if (!callout_pending(&ssk->keep2msl))
		callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
		    sdp_keepalive_timeout, ssk);
}

/*
 * Cancel a pending keepalive callout for this socket.
 */
static void
sdp_stop_keepalive_timer(struct socket *so)
{
	struct sdp_sock *ssk;

	ssk = sdp_sk(so);
	callout_stop(&ssk->keep2msl);
}

/*
 * sdp_ctloutput() must drop the inpcb lock before performing copyin on
 * socket option arguments.  When it re-acquires the lock after the copy, it
 * has to revalidate that the connection is still valid for the socket
 * option.
1627 */ 1628#define SDP_WLOCK_RECHECK(inp) do { \ 1629 SDP_WLOCK(ssk); \ 1630 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { \ 1631 SDP_WUNLOCK(ssk); \ 1632 return (ECONNRESET); \ 1633 } \ 1634} while(0) 1635 1636static int 1637sdp_ctloutput(struct socket *so, struct sockopt *sopt) 1638{ 1639 int error, opt, optval; 1640 struct sdp_sock *ssk; 1641 1642 error = 0; 1643 ssk = sdp_sk(so); 1644 if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) { 1645 SDP_WLOCK(ssk); 1646 if (so->so_options & SO_KEEPALIVE) 1647 sdp_start_keepalive_timer(so); 1648 else 1649 sdp_stop_keepalive_timer(so); 1650 SDP_WUNLOCK(ssk); 1651 } 1652 if (sopt->sopt_level != IPPROTO_TCP) 1653 return (error); 1654 1655 SDP_WLOCK(ssk); 1656 if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) { 1657 SDP_WUNLOCK(ssk); 1658 return (ECONNRESET); 1659 } 1660 1661 switch (sopt->sopt_dir) { 1662 case SOPT_SET: 1663 switch (sopt->sopt_name) { 1664 case TCP_NODELAY: 1665 SDP_WUNLOCK(ssk); 1666 error = sooptcopyin(sopt, &optval, sizeof optval, 1667 sizeof optval); 1668 if (error) 1669 return (error); 1670 1671 SDP_WLOCK_RECHECK(ssk); 1672 opt = SDP_NODELAY; 1673 if (optval) 1674 ssk->flags |= opt; 1675 else 1676 ssk->flags &= ~opt; 1677 sdp_do_posts(ssk); 1678 SDP_WUNLOCK(ssk); 1679 break; 1680 1681 default: 1682 SDP_WUNLOCK(ssk); 1683 error = ENOPROTOOPT; 1684 break; 1685 } 1686 break; 1687 1688 case SOPT_GET: 1689 switch (sopt->sopt_name) { 1690 case TCP_NODELAY: 1691 optval = ssk->flags & SDP_NODELAY; 1692 SDP_WUNLOCK(ssk); 1693 error = sooptcopyout(sopt, &optval, sizeof optval); 1694 break; 1695 default: 1696 SDP_WUNLOCK(ssk); 1697 error = ENOPROTOOPT; 1698 break; 1699 } 1700 break; 1701 } 1702 return (error); 1703} 1704#undef SDP_WLOCK_RECHECK 1705 1706int sdp_mod_count = 0; 1707int sdp_mod_usec = 0; 1708 1709void 1710sdp_set_default_moderation(struct sdp_sock *ssk) 1711{ 1712 if (sdp_mod_count <= 0 || sdp_mod_usec <= 0) 1713 return; 1714 ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, 
	    sdp_mod_usec);
}

/*
 * InfiniBand client "add" callback: allocate per-device state (a
 * protection domain and an FMR pool) and attach it to the device.
 * On any failure the partial state is torn down and no client data
 * is registered for the device.
 */
static void
sdp_dev_add(struct ib_device *device)
{
	struct ib_fmr_pool_param param;
	struct sdp_device *sdp_dev;

	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
	sdp_dev->pd = ib_alloc_pd(device, 0);
	if (IS_ERR(sdp_dev->pd))
		goto out_pd;
	memset(&param, 0, sizeof param);
	param.max_pages_per_fmr = SDP_FMR_SIZE;
	param.page_shift = PAGE_SHIFT;
	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
	param.pool_size = SDP_FMR_POOL_SIZE;
	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
	param.cache = 1;
	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
	if (IS_ERR(sdp_dev->fmr_pool))
		goto out_fmr;
	ib_set_client_data(device, &sdp_client, sdp_dev);
	return;

out_fmr:
	ib_dealloc_pd(sdp_dev->pd);
out_pd:
	free(sdp_dev, M_SDP);
}

/*
 * InfiniBand client "remove" callback: reset every SDP connection
 * bound to the departing device, then release the per-device state
 * allocated in sdp_dev_add().
 */
static void
sdp_dev_rem(struct ib_device *device, void *client_data)
{
	struct sdp_device *sdp_dev;
	struct sdp_sock *ssk;

	SDP_LIST_WLOCK();
	LIST_FOREACH(ssk, &sdp_list, list) {
		if (ssk->ib_device != device)
			continue;
		SDP_WLOCK(ssk);
		if ((ssk->flags & SDP_DESTROY) == 0)
			/* sdp_notify() may return NULL if the pcb died. */
			ssk = sdp_notify(ssk, ECONNRESET);
		if (ssk)
			SDP_WUNLOCK(ssk);
	}
	SDP_LIST_WUNLOCK();
	/*
	 * XXX Do I need to wait between these two?
	 */
	sdp_dev = ib_get_client_data(device, &sdp_client);
	if (!sdp_dev)
		return;
	ib_flush_fmr_pool(sdp_dev->fmr_pool);
	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
	ib_dealloc_pd(sdp_dev->pd);
	free(sdp_dev, M_SDP);
}

/* Registration record tying the add/remove hooks to the IB core. */
struct ib_client sdp_client =
    { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };


/*
 * Sysctl handler that exports the list of SDP connections in the
 * same xtcpcb format netstat expects for TCP, so existing tools can
 * display SDP sockets.
 */
static int
sdp_pcblist(SYSCTL_HANDLER_ARGS)
{
	int error, n, i;
	struct sdp_sock *ssk;
	struct xinpgen xig;

	/*
	 * The process of preparing the TCB list is too time-consuming and
	 * resource-intensive to repeat twice on every request.
	 */
	if (req->oldptr == NULL) {
		/* Size estimate only; pad for connections created meanwhile. */
		n = sdp_count;
		n += imax(n / 8, 10);
		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
		return (0);
	}

	/* This sysctl is read-only. */
	if (req->newptr != NULL)
		return (EPERM);

	/*
	 * OK, now we're committed to doing something.
	 */
	SDP_LIST_RLOCK();
	n = sdp_count;
	SDP_LIST_RUNLOCK();

	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
	    + n * sizeof(struct xtcpcb));
	if (error != 0)
		return (error);

	bzero(&xig, sizeof(xig));
	xig.xig_len = sizeof xig;
	xig.xig_count = n;
	xig.xig_gen = 0;
	xig.xig_sogen = so_gencnt;
	error = SYSCTL_OUT(req, &xig, sizeof xig);
	if (error)
		return (error);

	SDP_LIST_RLOCK();
	for (ssk = LIST_FIRST(&sdp_list), i = 0;
	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
		struct xtcpcb xt;

		SDP_RLOCK(ssk);
		/* Skip sockets the requesting credential may not see. */
		if (ssk->flags & SDP_TIMEWAIT) {
			if (ssk->cred != NULL)
				error = cr_cansee(req->td->td_ucred,
				    ssk->cred);
			else
				error = EINVAL;	/* Skip this inp.
 */
		} else if (ssk->socket)
			error = cr_canseesocket(req->td->td_ucred,
			    ssk->socket);
		else
			error = EINVAL;
		if (error) {
			/* Not an export failure: just omit this entry. */
			error = 0;
			goto next;
		}

		/* Fake up an xtcpcb from the SDP pcb fields. */
		bzero(&xt, sizeof(xt));
		xt.xt_len = sizeof xt;
		xt.xt_inp.inp_gencnt = 0;
		xt.xt_inp.inp_vflag = INP_IPV4;
		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
		xt.xt_inp.inp_lport = ssk->lport;
		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
		xt.xt_inp.inp_fport = ssk->fport;
		xt.t_state = ssk->state;
		if (ssk->socket != NULL)
			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
		SDP_RUNLOCK(ssk);
		error = SYSCTL_OUT(req, &xt, sizeof xt);
		if (error)
			break;
		i++;
		continue;
next:
		SDP_RUNLOCK(ssk);
	}
	if (!error) {
		/*
		 * Give the user an updated idea of our state.
		 * If the generation differs from what we told
		 * her before, she knows that something happened
		 * while we were processing this request, and it
		 * might be necessary to retry.
		 */
		xig.xig_gen = 0;
		xig.xig_sogen = so_gencnt;
		xig.xig_count = sdp_count;
		error = SYSCTL_OUT(req, &xig, sizeof xig);
	}
	SDP_LIST_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW, 0, "SDP");

SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
    CTLFLAG_RD | CTLTYPE_STRUCT, 0, 0, sdp_pcblist, "S,xtcpcb",
    "List of active SDP connections");

/*
 * maxsockets_change event handler: track the system socket limit in
 * the SDP pcb zone.
 */
static void
sdp_zone_change(void *tag)
{

	uma_zone_set_max(sdp_zone, maxsockets);
}

/*
 * Domain initialization: set up the pcb list and zone, the receive
 * completion workqueue, and register with the InfiniBand core.
 * NOTE(review): the create_singlethread_workqueue() result is not
 * checked here — presumably it cannot fail at init time; verify.
 */
static void
sdp_init(void)
{

	LIST_INIT(&sdp_list);
	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(sdp_zone, maxsockets);
	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
	    EVENTHANDLER_PRI_ANY);
	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
	ib_register_client(&sdp_client);
}

extern struct domain sdpdomain;

/* User-request vector wiring the socket layer to the SDP handlers. */
struct pr_usrreqs sdp_usrreqs = {
	.pru_abort =		sdp_abort,
	.pru_accept =		sdp_accept,
	.pru_attach =		sdp_attach,
	.pru_bind =		sdp_bind,
	.pru_connect =		sdp_connect,
	.pru_control =		sdp_control,
	.pru_detach =		sdp_detach,
	.pru_disconnect =	sdp_disconnect,
	.pru_listen =		sdp_listen,
	.pru_peeraddr =		sdp_getpeeraddr,
	.pru_rcvoob =		sdp_rcvoob,
	.pru_send =		sdp_send,
	.pru_sosend =		sdp_sosend,
	.pru_soreceive =	sdp_sorecv,
	.pru_shutdown =		sdp_shutdown,
	.pru_sockaddr =		sdp_getsockaddr,
	.pru_close =		sdp_close,
};

/*
 * Protocol switch entries: one for the wildcard IPPROTO_IP and one
 * for IPPROTO_TCP, both mapping onto the same SDP implementation.
 */
struct protosw sdpsw[] = {
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =		&sdpdomain,
	.pr_protocol =		IPPROTO_IP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
{
	.pr_type =		SOCK_STREAM,
	.pr_domain =
		&sdpdomain,
	.pr_protocol =		IPPROTO_TCP,
	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
	.pr_ctlinput =		sdp_ctlinput,
	.pr_ctloutput =		sdp_ctloutput,
	.pr_usrreqs =		&sdp_usrreqs
},
};

/* The AF_INET_SDP domain, registered below via DOMAIN_SET(). */
struct domain sdpdomain = {
	.dom_family =		AF_INET_SDP,
	.dom_name =		"SDP",
	.dom_init =		sdp_init,
	.dom_protosw =		sdpsw,
	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
};

DOMAIN_SET(sdp);

/* Debug verbosity knobs consumed by the SDP logging macros. */
int sdp_debug_level = 1;
int sdp_data_debug_level = 0;