tcp_usrreq.c revision 194777
1/*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. 4 * Copyright (c) 2006-2007 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 32 */ 33 34#include <sys/cdefs.h> 35__FBSDID("$FreeBSD: head/sys/netinet/tcp_usrreq.c 194777 2009-06-23 22:08:55Z bz $"); 36 37#include "opt_ddb.h" 38#include "opt_inet.h" 39#include "opt_inet6.h" 40#include "opt_tcpdebug.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/malloc.h> 45#include <sys/kernel.h> 46#include <sys/sysctl.h> 47#include <sys/mbuf.h> 48#ifdef INET6 49#include <sys/domain.h> 50#endif /* INET6 */ 51#include <sys/socket.h> 52#include <sys/socketvar.h> 53#include <sys/protosw.h> 54#include <sys/proc.h> 55#include <sys/jail.h> 56#include <sys/vimage.h> 57 58#ifdef DDB 59#include <ddb/ddb.h> 60#endif 61 62#include <net/if.h> 63#include <net/route.h> 64 65#include <netinet/in.h> 66#include <netinet/in_systm.h> 67#ifdef INET6 68#include <netinet/ip6.h> 69#endif 70#include <netinet/in_pcb.h> 71#ifdef INET6 72#include <netinet6/in6_pcb.h> 73#endif 74#include <netinet/in_var.h> 75#include <netinet/ip_var.h> 76#ifdef INET6 77#include <netinet6/ip6_var.h> 78#include <netinet6/scope6_var.h> 79#endif 80#include <netinet/tcp.h> 81#include <netinet/tcp_fsm.h> 82#include <netinet/tcp_seq.h> 83#include <netinet/tcp_timer.h> 84#include <netinet/tcp_var.h> 85#include <netinet/tcpip.h> 86#ifdef TCPDEBUG 87#include <netinet/tcp_debug.h> 88#endif 89#include <netinet/tcp_offload.h> 90#include <netinet/vinet.h> 91 92/* 93 * TCP protocol interface to socket abstraction. 94 */ 95static int tcp_attach(struct socket *); 96static int tcp_connect(struct tcpcb *, struct sockaddr *, 97 struct thread *td); 98#ifdef INET6 99static int tcp6_connect(struct tcpcb *, struct sockaddr *, 100 struct thread *td); 101#endif /* INET6 */ 102static void tcp_disconnect(struct tcpcb *); 103static void tcp_usrclosed(struct tcpcb *); 104static void tcp_fill_info(struct tcpcb *, struct tcp_info *); 105 106#ifdef TCPDEBUG 107#define TCPDEBUG0 int ostate = 0 108#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 109#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 110 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 111#else 112#define TCPDEBUG0 113#define TCPDEBUG1() 114#define TCPDEBUG2(req) 115#endif 116 117/* 118 * TCP attaches to socket via pru_attach(), reserving space, 119 * and an internet control block. 120 */ 121static int 122tcp_usr_attach(struct socket *so, int proto, struct thread *td) 123{ 124 struct inpcb *inp; 125 struct tcpcb *tp = NULL; 126 int error; 127 TCPDEBUG0; 128 129 inp = sotoinpcb(so); 130 KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); 131 TCPDEBUG1(); 132 133 error = tcp_attach(so); 134 if (error) 135 goto out; 136 137 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 138 so->so_linger = TCP_LINGERTIME; 139 140 inp = sotoinpcb(so); 141 tp = intotcpcb(inp); 142out: 143 TCPDEBUG2(PRU_ATTACH); 144 return error; 145} 146 147/* 148 * tcp_detach is called when the socket layer loses its final reference 149 * to the socket, be it a file descriptor reference, a reference from TCP, 150 * etc. At this point, there is only one case in which we will keep around 151 * inpcb state: time wait. 152 * 153 * This function can probably be re-absorbed back into tcp_usr_detach() now 154 * that there is a single detach path. 155 */ 156static void 157tcp_detach(struct socket *so, struct inpcb *inp) 158{ 159 struct tcpcb *tp; 160#ifdef INVARIANTS 161 INIT_VNET_INET(so->so_vnet); 162#endif 163 164 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 165 INP_WLOCK_ASSERT(inp); 166 167 KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp")); 168 KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so")); 169 170 tp = intotcpcb(inp); 171 172 if (inp->inp_flags & INP_TIMEWAIT) { 173 /* 174 * There are two cases to handle: one in which the time wait 175 * state is being discarded (INP_DROPPED), and one in which 176 * this connection will remain in timewait. In the former, 177 * it is time to discard all state (except tcptw, which has 178 * already been discarded by the timewait close code, which 179 * should be further up the call stack somewhere). In the 180 * latter case, we detach from the socket, but leave the pcb 181 * present until timewait ends. 182 * 183 * XXXRW: Would it be cleaner to free the tcptw here? 184 */ 185 if (inp->inp_flags & INP_DROPPED) { 186 KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && " 187 "INP_DROPPED && tp != NULL")); 188 in_pcbdetach(inp); 189 in_pcbfree(inp); 190 } else { 191 in_pcbdetach(inp); 192 INP_WUNLOCK(inp); 193 } 194 } else { 195 /* 196 * If the connection is not in timewait, we consider two 197 * two conditions: one in which no further processing is 198 * necessary (dropped || embryonic), and one in which TCP is 199 * not yet done, but no longer requires the socket, so the 200 * pcb will persist for the time being. 201 * 202 * XXXRW: Does the second case still occur? 203 */ 204 if (inp->inp_flags & INP_DROPPED || 205 tp->t_state < TCPS_SYN_SENT) { 206 tcp_discardcb(tp); 207 in_pcbdetach(inp); 208 in_pcbfree(inp); 209 } else 210 in_pcbdetach(inp); 211 } 212} 213 214/* 215 * pru_detach() detaches the TCP protocol from the socket. 216 * If the protocol state is non-embryonic, then can't 217 * do this directly: have to initiate a pru_disconnect(), 218 * which may finish later; embryonic TCB's can just 219 * be discarded here. 220 */ 221static void 222tcp_usr_detach(struct socket *so) 223{ 224 INIT_VNET_INET(so->so_vnet); 225 struct inpcb *inp; 226 227 inp = sotoinpcb(so); 228 KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); 229 INP_INFO_WLOCK(&V_tcbinfo); 230 INP_WLOCK(inp); 231 KASSERT(inp->inp_socket != NULL, 232 ("tcp_usr_detach: inp_socket == NULL")); 233 tcp_detach(so, inp); 234 INP_INFO_WUNLOCK(&V_tcbinfo); 235} 236 237/* 238 * Give the socket an address. 239 */ 240static int 241tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 242{ 243 INIT_VNET_INET(so->so_vnet); 244 int error = 0; 245 struct inpcb *inp; 246 struct tcpcb *tp = NULL; 247 struct sockaddr_in *sinp; 248 249 sinp = (struct sockaddr_in *)nam; 250 if (nam->sa_len != sizeof (*sinp)) 251 return (EINVAL); 252 /* 253 * Must check for multicast addresses and disallow binding 254 * to them. 255 */ 256 if (sinp->sin_family == AF_INET && 257 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 258 return (EAFNOSUPPORT); 259 260 TCPDEBUG0; 261 INP_INFO_WLOCK(&V_tcbinfo); 262 inp = sotoinpcb(so); 263 KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); 264 INP_WLOCK(inp); 265 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 266 error = EINVAL; 267 goto out; 268 } 269 tp = intotcpcb(inp); 270 TCPDEBUG1(); 271 error = in_pcbbind(inp, nam, td->td_ucred); 272out: 273 TCPDEBUG2(PRU_BIND); 274 INP_WUNLOCK(inp); 275 INP_INFO_WUNLOCK(&V_tcbinfo); 276 277 return (error); 278} 279 280#ifdef INET6 281static int 282tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 283{ 284 INIT_VNET_INET(so->so_vnet); 285 int error = 0; 286 struct inpcb *inp; 287 struct tcpcb *tp = NULL; 288 struct sockaddr_in6 *sin6p; 289 290 sin6p = (struct sockaddr_in6 *)nam; 291 if (nam->sa_len != sizeof (*sin6p)) 292 return (EINVAL); 293 /* 294 * Must check for multicast addresses and disallow binding 295 * to them. 296 */ 297 if (sin6p->sin6_family == AF_INET6 && 298 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 299 return (EAFNOSUPPORT); 300 301 TCPDEBUG0; 302 INP_INFO_WLOCK(&V_tcbinfo); 303 inp = sotoinpcb(so); 304 KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); 305 INP_WLOCK(inp); 306 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 307 error = EINVAL; 308 goto out; 309 } 310 tp = intotcpcb(inp); 311 TCPDEBUG1(); 312 inp->inp_vflag &= ~INP_IPV4; 313 inp->inp_vflag |= INP_IPV6; 314 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 315 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 316 inp->inp_vflag |= INP_IPV4; 317 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 318 struct sockaddr_in sin; 319 320 in6_sin6_2_sin(&sin, sin6p); 321 inp->inp_vflag |= INP_IPV4; 322 inp->inp_vflag &= ~INP_IPV6; 323 error = in_pcbbind(inp, (struct sockaddr *)&sin, 324 td->td_ucred); 325 goto out; 326 } 327 } 328 error = in6_pcbbind(inp, nam, td->td_ucred); 329out: 330 TCPDEBUG2(PRU_BIND); 331 INP_WUNLOCK(inp); 332 INP_INFO_WUNLOCK(&V_tcbinfo); 333 return (error); 334} 335#endif /* INET6 */ 336 337/* 338 * Prepare to accept connections. 339 */ 340static int 341tcp_usr_listen(struct socket *so, int backlog, struct thread *td) 342{ 343 INIT_VNET_INET(so->so_vnet); 344 int error = 0; 345 struct inpcb *inp; 346 struct tcpcb *tp = NULL; 347 348 TCPDEBUG0; 349 INP_INFO_WLOCK(&V_tcbinfo); 350 inp = sotoinpcb(so); 351 KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); 352 INP_WLOCK(inp); 353 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 354 error = EINVAL; 355 goto out; 356 } 357 tp = intotcpcb(inp); 358 TCPDEBUG1(); 359 SOCK_LOCK(so); 360 error = solisten_proto_check(so); 361 if (error == 0 && inp->inp_lport == 0) 362 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 363 if (error == 0) { 364 tp->t_state = TCPS_LISTEN; 365 solisten_proto(so, backlog); 366 tcp_offload_listen_open(tp); 367 } 368 SOCK_UNLOCK(so); 369 370out: 371 TCPDEBUG2(PRU_LISTEN); 372 INP_WUNLOCK(inp); 373 INP_INFO_WUNLOCK(&V_tcbinfo); 374 return (error); 375} 376 377#ifdef INET6 378static int 379tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) 380{ 381 INIT_VNET_INET(so->so_vnet); 382 int error = 0; 383 struct inpcb *inp; 384 struct tcpcb *tp = NULL; 385 386 TCPDEBUG0; 387 INP_INFO_WLOCK(&V_tcbinfo); 388 inp = sotoinpcb(so); 389 KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); 390 INP_WLOCK(inp); 391 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 392 error = EINVAL; 393 goto out; 394 } 395 tp = intotcpcb(inp); 396 TCPDEBUG1(); 397 SOCK_LOCK(so); 398 error = solisten_proto_check(so); 399 if (error == 0 && inp->inp_lport == 0) { 400 inp->inp_vflag &= ~INP_IPV4; 401 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 402 inp->inp_vflag |= INP_IPV4; 403 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 404 } 405 if (error == 0) { 406 tp->t_state = TCPS_LISTEN; 407 solisten_proto(so, backlog); 408 } 409 SOCK_UNLOCK(so); 410 411out: 412 TCPDEBUG2(PRU_LISTEN); 413 INP_WUNLOCK(inp); 414 INP_INFO_WUNLOCK(&V_tcbinfo); 415 return (error); 416} 417#endif /* INET6 */ 418 419/* 420 * Initiate connection to peer. 421 * Create a template for use in transmissions on this connection. 422 * Enter SYN_SENT state, and mark socket as connecting. 423 * Start keep-alive timer, and seed output sequence space. 424 * Send initial segment on connection. 425 */ 426static int 427tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 428{ 429 INIT_VNET_INET(so->so_vnet); 430 int error = 0; 431 struct inpcb *inp; 432 struct tcpcb *tp = NULL; 433 struct sockaddr_in *sinp; 434 435 sinp = (struct sockaddr_in *)nam; 436 if (nam->sa_len != sizeof (*sinp)) 437 return (EINVAL); 438 /* 439 * Must disallow TCP ``connections'' to multicast addresses. 440 */ 441 if (sinp->sin_family == AF_INET 442 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 443 return (EAFNOSUPPORT); 444 if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0) 445 return (error); 446 447 TCPDEBUG0; 448 INP_INFO_WLOCK(&V_tcbinfo); 449 inp = sotoinpcb(so); 450 KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); 451 INP_WLOCK(inp); 452 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 453 error = EINVAL; 454 goto out; 455 } 456 tp = intotcpcb(inp); 457 TCPDEBUG1(); 458 if ((error = tcp_connect(tp, nam, td)) != 0) 459 goto out; 460 error = tcp_output_connect(so, nam); 461out: 462 TCPDEBUG2(PRU_CONNECT); 463 INP_WUNLOCK(inp); 464 INP_INFO_WUNLOCK(&V_tcbinfo); 465 return (error); 466} 467 468#ifdef INET6 469static int 470tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 471{ 472 INIT_VNET_INET(so->so_vnet); 473 int error = 0; 474 struct inpcb *inp; 475 struct tcpcb *tp = NULL; 476 struct sockaddr_in6 *sin6p; 477 478 TCPDEBUG0; 479 480 sin6p = (struct sockaddr_in6 *)nam; 481 if (nam->sa_len != sizeof (*sin6p)) 482 return (EINVAL); 483 /* 484 * Must disallow TCP ``connections'' to multicast addresses. 485 */ 486 if (sin6p->sin6_family == AF_INET6 487 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 488 return (EAFNOSUPPORT); 489 490 INP_INFO_WLOCK(&V_tcbinfo); 491 inp = sotoinpcb(so); 492 KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); 493 INP_WLOCK(inp); 494 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 495 error = EINVAL; 496 goto out; 497 } 498 tp = intotcpcb(inp); 499 TCPDEBUG1(); 500 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 501 struct sockaddr_in sin; 502 503 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 504 error = EINVAL; 505 goto out; 506 } 507 508 in6_sin6_2_sin(&sin, sin6p); 509 inp->inp_vflag |= INP_IPV4; 510 inp->inp_vflag &= ~INP_IPV6; 511 if ((error = prison_remote_ip4(td->td_ucred, 512 &sin.sin_addr)) != 0) 513 goto out; 514 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) 515 goto out; 516 error = tcp_output_connect(so, nam); 517 goto out; 518 } 519 inp->inp_vflag &= ~INP_IPV4; 520 inp->inp_vflag |= INP_IPV6; 521 inp->inp_inc.inc_flags |= INC_ISIPV6; 522 if ((error = prison_remote_ip6(td->td_ucred, &sin6p->sin6_addr)) != 0) 523 goto out; 524 if ((error = tcp6_connect(tp, nam, td)) != 0) 525 goto out; 526 error = tcp_output_connect(so, nam); 527 528out: 529 TCPDEBUG2(PRU_CONNECT); 530 INP_WUNLOCK(inp); 531 INP_INFO_WUNLOCK(&V_tcbinfo); 532 return (error); 533} 534#endif /* INET6 */ 535 536/* 537 * Initiate disconnect from peer. 538 * If connection never passed embryonic stage, just drop; 539 * else if don't need to let data drain, then can just drop anyways, 540 * else have to begin TCP shutdown process: mark socket disconnecting, 541 * drain unread data, state switch to reflect user close, and 542 * send segment (e.g. FIN) to peer. Socket will be really disconnected 543 * when peer sends FIN and acks ours. 544 * 545 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 546 */ 547static int 548tcp_usr_disconnect(struct socket *so) 549{ 550 INIT_VNET_INET(so->so_vnet); 551 struct inpcb *inp; 552 struct tcpcb *tp = NULL; 553 int error = 0; 554 555 TCPDEBUG0; 556 INP_INFO_WLOCK(&V_tcbinfo); 557 inp = sotoinpcb(so); 558 KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); 559 INP_WLOCK(inp); 560 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 561 error = ECONNRESET; 562 goto out; 563 } 564 tp = intotcpcb(inp); 565 TCPDEBUG1(); 566 tcp_disconnect(tp); 567out: 568 TCPDEBUG2(PRU_DISCONNECT); 569 INP_WUNLOCK(inp); 570 INP_INFO_WUNLOCK(&V_tcbinfo); 571 return (error); 572} 573 574/* 575 * Accept a connection. Essentially all the work is 576 * done at higher levels; just return the address 577 * of the peer, storing through addr. 578 */ 579static int 580tcp_usr_accept(struct socket *so, struct sockaddr **nam) 581{ 582 INIT_VNET_INET(so->so_vnet); 583 int error = 0; 584 struct inpcb *inp = NULL; 585 struct tcpcb *tp = NULL; 586 struct in_addr addr; 587 in_port_t port = 0; 588 TCPDEBUG0; 589 590 if (so->so_state & SS_ISDISCONNECTED) 591 return (ECONNABORTED); 592 593 inp = sotoinpcb(so); 594 KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); 595 INP_INFO_RLOCK(&V_tcbinfo); 596 INP_WLOCK(inp); 597 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 598 error = ECONNABORTED; 599 goto out; 600 } 601 tp = intotcpcb(inp); 602 TCPDEBUG1(); 603 604 /* 605 * We inline in_getpeeraddr and COMMON_END here, so that we can 606 * copy the data of interest and defer the malloc until after we 607 * release the lock. 608 */ 609 port = inp->inp_fport; 610 addr = inp->inp_faddr; 611 612out: 613 TCPDEBUG2(PRU_ACCEPT); 614 INP_WUNLOCK(inp); 615 INP_INFO_RUNLOCK(&V_tcbinfo); 616 if (error == 0) 617 *nam = in_sockaddr(port, &addr); 618 return error; 619} 620 621#ifdef INET6 622static int 623tcp6_usr_accept(struct socket *so, struct sockaddr **nam) 624{ 625 struct inpcb *inp = NULL; 626 int error = 0; 627 struct tcpcb *tp = NULL; 628 struct in_addr addr; 629 struct in6_addr addr6; 630 in_port_t port = 0; 631 int v4 = 0; 632 TCPDEBUG0; 633 634 if (so->so_state & SS_ISDISCONNECTED) 635 return (ECONNABORTED); 636 637 inp = sotoinpcb(so); 638 KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); 639 INP_WLOCK(inp); 640 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 641 error = ECONNABORTED; 642 goto out; 643 } 644 tp = intotcpcb(inp); 645 TCPDEBUG1(); 646 647 /* 648 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can 649 * copy the data of interest and defer the malloc until after we 650 * release the lock. 651 */ 652 if (inp->inp_vflag & INP_IPV4) { 653 v4 = 1; 654 port = inp->inp_fport; 655 addr = inp->inp_faddr; 656 } else { 657 port = inp->inp_fport; 658 addr6 = inp->in6p_faddr; 659 } 660 661out: 662 TCPDEBUG2(PRU_ACCEPT); 663 INP_WUNLOCK(inp); 664 if (error == 0) { 665 if (v4) 666 *nam = in6_v4mapsin6_sockaddr(port, &addr); 667 else 668 *nam = in6_sockaddr(port, &addr6); 669 } 670 return error; 671} 672#endif /* INET6 */ 673 674/* 675 * Mark the connection as being incapable of further output. 676 */ 677static int 678tcp_usr_shutdown(struct socket *so) 679{ 680 INIT_VNET_INET(so->so_vnet); 681 int error = 0; 682 struct inpcb *inp; 683 struct tcpcb *tp = NULL; 684 685 TCPDEBUG0; 686 INP_INFO_WLOCK(&V_tcbinfo); 687 inp = sotoinpcb(so); 688 KASSERT(inp != NULL, ("inp == NULL")); 689 INP_WLOCK(inp); 690 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 691 error = ECONNRESET; 692 goto out; 693 } 694 tp = intotcpcb(inp); 695 TCPDEBUG1(); 696 socantsendmore(so); 697 tcp_usrclosed(tp); 698 if (!(inp->inp_flags & INP_DROPPED)) 699 error = tcp_output_disconnect(tp); 700 701out: 702 TCPDEBUG2(PRU_SHUTDOWN); 703 INP_WUNLOCK(inp); 704 INP_INFO_WUNLOCK(&V_tcbinfo); 705 706 return (error); 707} 708 709/* 710 * After a receive, possibly send window update to peer. 711 */ 712static int 713tcp_usr_rcvd(struct socket *so, int flags) 714{ 715 struct inpcb *inp; 716 struct tcpcb *tp = NULL; 717 int error = 0; 718 719 TCPDEBUG0; 720 inp = sotoinpcb(so); 721 KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); 722 INP_WLOCK(inp); 723 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 724 error = ECONNRESET; 725 goto out; 726 } 727 tp = intotcpcb(inp); 728 TCPDEBUG1(); 729 tcp_output_rcvd(tp); 730 731out: 732 TCPDEBUG2(PRU_RCVD); 733 INP_WUNLOCK(inp); 734 return (error); 735} 736 737/* 738 * Do a send by putting data in output queue and updating urgent 739 * marker if URG set. Possibly send more data. Unlike the other 740 * pru_*() routines, the mbuf chains are our responsibility. We 741 * must either enqueue them or free them. The other pru_* routines 742 * generally are caller-frees. 743 */ 744static int 745tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 746 struct sockaddr *nam, struct mbuf *control, struct thread *td) 747{ 748 INIT_VNET_INET(so->so_vnet); 749 int error = 0; 750 struct inpcb *inp; 751 struct tcpcb *tp = NULL; 752 int headlocked = 0; 753#ifdef INET6 754 int isipv6; 755#endif 756 TCPDEBUG0; 757 758 /* 759 * We require the pcbinfo lock in two cases: 760 * 761 * (1) An implied connect is taking place, which can result in 762 * binding IPs and ports and hence modification of the pcb hash 763 * chains. 764 * 765 * (2) PRUS_EOF is set, resulting in explicit close on the send. 766 */ 767 if ((nam != NULL) || (flags & PRUS_EOF)) { 768 INP_INFO_WLOCK(&V_tcbinfo); 769 headlocked = 1; 770 } 771 inp = sotoinpcb(so); 772 KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); 773 INP_WLOCK(inp); 774 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 775 if (control) 776 m_freem(control); 777 if (m) 778 m_freem(m); 779 error = ECONNRESET; 780 goto out; 781 } 782#ifdef INET6 783 isipv6 = nam && nam->sa_family == AF_INET6; 784#endif /* INET6 */ 785 tp = intotcpcb(inp); 786 TCPDEBUG1(); 787 if (control) { 788 /* TCP doesn't do control messages (rights, creds, etc) */ 789 if (control->m_len) { 790 m_freem(control); 791 if (m) 792 m_freem(m); 793 error = EINVAL; 794 goto out; 795 } 796 m_freem(control); /* empty control, just free it */ 797 } 798 if (!(flags & PRUS_OOB)) { 799 sbappendstream(&so->so_snd, m); 800 if (nam && tp->t_state < TCPS_SYN_SENT) { 801 /* 802 * Do implied connect if not yet connected, 803 * initialize window to default value, and 804 * initialize maxseg/maxopd using peer's cached 805 * MSS. 806 */ 807 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 808#ifdef INET6 809 if (isipv6) 810 error = tcp6_connect(tp, nam, td); 811 else 812#endif /* INET6 */ 813 error = tcp_connect(tp, nam, td); 814 if (error) 815 goto out; 816 tp->snd_wnd = TTCP_CLIENT_SND_WND; 817 tcp_mss(tp, -1); 818 } 819 if (flags & PRUS_EOF) { 820 /* 821 * Close the send side of the connection after 822 * the data is sent. 823 */ 824 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 825 socantsendmore(so); 826 tcp_usrclosed(tp); 827 } 828 if (headlocked) { 829 INP_INFO_WUNLOCK(&V_tcbinfo); 830 headlocked = 0; 831 } 832 if (!(inp->inp_flags & INP_DROPPED)) { 833 if (flags & PRUS_MORETOCOME) 834 tp->t_flags |= TF_MORETOCOME; 835 error = tcp_output_send(tp); 836 if (flags & PRUS_MORETOCOME) 837 tp->t_flags &= ~TF_MORETOCOME; 838 } 839 } else { 840 /* 841 * XXXRW: PRUS_EOF not implemented with PRUS_OOB? 842 */ 843 SOCKBUF_LOCK(&so->so_snd); 844 if (sbspace(&so->so_snd) < -512) { 845 SOCKBUF_UNLOCK(&so->so_snd); 846 m_freem(m); 847 error = ENOBUFS; 848 goto out; 849 } 850 /* 851 * According to RFC961 (Assigned Protocols), 852 * the urgent pointer points to the last octet 853 * of urgent data. We continue, however, 854 * to consider it to indicate the first octet 855 * of data past the urgent section. 856 * Otherwise, snd_up should be one lower. 857 */ 858 sbappendstream_locked(&so->so_snd, m); 859 SOCKBUF_UNLOCK(&so->so_snd); 860 if (nam && tp->t_state < TCPS_SYN_SENT) { 861 /* 862 * Do implied connect if not yet connected, 863 * initialize window to default value, and 864 * initialize maxseg/maxopd using peer's cached 865 * MSS. 866 */ 867 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 868#ifdef INET6 869 if (isipv6) 870 error = tcp6_connect(tp, nam, td); 871 else 872#endif /* INET6 */ 873 error = tcp_connect(tp, nam, td); 874 if (error) 875 goto out; 876 tp->snd_wnd = TTCP_CLIENT_SND_WND; 877 tcp_mss(tp, -1); 878 INP_INFO_WUNLOCK(&V_tcbinfo); 879 headlocked = 0; 880 } else if (nam) { 881 INP_INFO_WUNLOCK(&V_tcbinfo); 882 headlocked = 0; 883 } 884 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 885 tp->t_flags |= TF_FORCEDATA; 886 error = tcp_output_send(tp); 887 tp->t_flags &= ~TF_FORCEDATA; 888 } 889out: 890 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : 891 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 892 INP_WUNLOCK(inp); 893 if (headlocked) 894 INP_INFO_WUNLOCK(&V_tcbinfo); 895 return (error); 896} 897 898/* 899 * Abort the TCP. Drop the connection abruptly. 900 */ 901static void 902tcp_usr_abort(struct socket *so) 903{ 904 INIT_VNET_INET(so->so_vnet); 905 struct inpcb *inp; 906 struct tcpcb *tp = NULL; 907 TCPDEBUG0; 908 909 inp = sotoinpcb(so); 910 KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); 911 912 INP_INFO_WLOCK(&V_tcbinfo); 913 INP_WLOCK(inp); 914 KASSERT(inp->inp_socket != NULL, 915 ("tcp_usr_abort: inp_socket == NULL")); 916 917 /* 918 * If we still have full TCP state, and we're not dropped, drop. 919 */ 920 if (!(inp->inp_flags & INP_TIMEWAIT) && 921 !(inp->inp_flags & INP_DROPPED)) { 922 tp = intotcpcb(inp); 923 TCPDEBUG1(); 924 tcp_drop(tp, ECONNABORTED); 925 TCPDEBUG2(PRU_ABORT); 926 } 927 if (!(inp->inp_flags & INP_DROPPED)) { 928 SOCK_LOCK(so); 929 so->so_state |= SS_PROTOREF; 930 SOCK_UNLOCK(so); 931 inp->inp_flags |= INP_SOCKREF; 932 } 933 INP_WUNLOCK(inp); 934 INP_INFO_WUNLOCK(&V_tcbinfo); 935} 936 937/* 938 * TCP socket is closed. Start friendly disconnect. 939 */ 940static void 941tcp_usr_close(struct socket *so) 942{ 943 INIT_VNET_INET(so->so_vnet); 944 struct inpcb *inp; 945 struct tcpcb *tp = NULL; 946 TCPDEBUG0; 947 948 inp = sotoinpcb(so); 949 KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); 950 951 INP_INFO_WLOCK(&V_tcbinfo); 952 INP_WLOCK(inp); 953 KASSERT(inp->inp_socket != NULL, 954 ("tcp_usr_close: inp_socket == NULL")); 955 956 /* 957 * If we still have full TCP state, and we're not dropped, initiate 958 * a disconnect. 959 */ 960 if (!(inp->inp_flags & INP_TIMEWAIT) && 961 !(inp->inp_flags & INP_DROPPED)) { 962 tp = intotcpcb(inp); 963 TCPDEBUG1(); 964 tcp_disconnect(tp); 965 TCPDEBUG2(PRU_CLOSE); 966 } 967 if (!(inp->inp_flags & INP_DROPPED)) { 968 SOCK_LOCK(so); 969 so->so_state |= SS_PROTOREF; 970 SOCK_UNLOCK(so); 971 inp->inp_flags |= INP_SOCKREF; 972 } 973 INP_WUNLOCK(inp); 974 INP_INFO_WUNLOCK(&V_tcbinfo); 975} 976 977/* 978 * Receive out-of-band data. 979 */ 980static int 981tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) 982{ 983 int error = 0; 984 struct inpcb *inp; 985 struct tcpcb *tp = NULL; 986 987 TCPDEBUG0; 988 inp = sotoinpcb(so); 989 KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); 990 INP_WLOCK(inp); 991 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 992 error = ECONNRESET; 993 goto out; 994 } 995 tp = intotcpcb(inp); 996 TCPDEBUG1(); 997 if ((so->so_oobmark == 0 && 998 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 999 so->so_options & SO_OOBINLINE || 1000 tp->t_oobflags & TCPOOB_HADDATA) { 1001 error = EINVAL; 1002 goto out; 1003 } 1004 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 1005 error = EWOULDBLOCK; 1006 goto out; 1007 } 1008 m->m_len = 1; 1009 *mtod(m, caddr_t) = tp->t_iobc; 1010 if ((flags & MSG_PEEK) == 0) 1011 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 1012 1013out: 1014 TCPDEBUG2(PRU_RCVOOB); 1015 INP_WUNLOCK(inp); 1016 return (error); 1017} 1018 1019struct pr_usrreqs tcp_usrreqs = { 1020 .pru_abort = tcp_usr_abort, 1021 .pru_accept = tcp_usr_accept, 1022 .pru_attach = tcp_usr_attach, 1023 .pru_bind = tcp_usr_bind, 1024 .pru_connect = tcp_usr_connect, 1025 .pru_control = in_control, 1026 .pru_detach = tcp_usr_detach, 1027 .pru_disconnect = tcp_usr_disconnect, 1028 .pru_listen = tcp_usr_listen, 1029 .pru_peeraddr = in_getpeeraddr, 1030 .pru_rcvd = tcp_usr_rcvd, 1031 .pru_rcvoob = tcp_usr_rcvoob, 1032 .pru_send = tcp_usr_send, 1033 .pru_shutdown = tcp_usr_shutdown, 1034 .pru_sockaddr = in_getsockaddr, 1035#if 0 1036 .pru_soreceive = soreceive_stream, 1037#endif 1038 .pru_sosetlabel = in_pcbsosetlabel, 1039 .pru_close = tcp_usr_close, 1040}; 1041 1042#ifdef INET6 1043struct pr_usrreqs tcp6_usrreqs = { 1044 .pru_abort = tcp_usr_abort, 1045 .pru_accept = tcp6_usr_accept, 1046 .pru_attach = tcp_usr_attach, 1047 .pru_bind = tcp6_usr_bind, 1048 .pru_connect = tcp6_usr_connect, 1049 .pru_control = in6_control, 1050 .pru_detach = tcp_usr_detach, 1051 .pru_disconnect = tcp_usr_disconnect, 1052 .pru_listen = tcp6_usr_listen, 1053 .pru_peeraddr = in6_mapped_peeraddr, 1054 .pru_rcvd = tcp_usr_rcvd, 1055 .pru_rcvoob = tcp_usr_rcvoob, 1056 .pru_send = tcp_usr_send, 1057 .pru_shutdown = tcp_usr_shutdown, 1058 .pru_sockaddr = in6_mapped_sockaddr, 1059#if 0 1060 .pru_soreceive = soreceive_stream, 1061#endif 1062 .pru_sosetlabel = in_pcbsosetlabel, 1063 .pru_close = tcp_usr_close, 1064}; 1065#endif /* INET6 */ 1066 1067/* 1068 * Common subroutine to open a TCP connection to remote host specified 1069 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 1070 * port number if needed. Call in_pcbconnect_setup to do the routing and 1071 * to choose a local host address (interface). If there is an existing 1072 * incarnation of the same connection in TIME-WAIT state and if the remote 1073 * host was sending CC options and if the connection duration was < MSL, then 1074 * truncate the previous TIME-WAIT state and proceed. 1075 * Initialize connection parameters and enter SYN-SENT state. 1076 */ 1077static int 1078tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) 1079{ 1080 struct inpcb *inp = tp->t_inpcb, *oinp; 1081 struct socket *so = inp->inp_socket; 1082 INIT_VNET_INET(so->so_vnet); 1083 struct in_addr laddr; 1084 u_short lport; 1085 int error; 1086 1087 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1088 INP_WLOCK_ASSERT(inp); 1089 1090 if (inp->inp_lport == 0) { 1091 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 1092 if (error) 1093 return error; 1094 } 1095 1096 /* 1097 * Cannot simply call in_pcbconnect, because there might be an 1098 * earlier incarnation of this same connection still in 1099 * TIME_WAIT state, creating an ADDRINUSE error. 1100 */ 1101 laddr = inp->inp_laddr; 1102 lport = inp->inp_lport; 1103 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 1104 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 1105 if (error && oinp == NULL) 1106 return error; 1107 if (oinp) 1108 return EADDRINUSE; 1109 inp->inp_laddr = laddr; 1110 in_pcbrehash(inp); 1111 1112 /* 1113 * Compute window scaling to request: 1114 * Scale to fit into sweet spot. See tcp_syncache.c. 1115 * XXX: This should move to tcp_output(). 1116 */ 1117 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1118 (TCP_MAXWIN << tp->request_r_scale) < sb_max) 1119 tp->request_r_scale++; 1120 1121 soisconnecting(so); 1122 TCPSTAT_INC(tcps_connattempt); 1123 tp->t_state = TCPS_SYN_SENT; 1124 tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); 1125 tp->iss = tcp_new_isn(tp); 1126 tp->t_bw_rtseq = tp->iss; 1127 tcp_sendseqinit(tp); 1128 1129 return 0; 1130} 1131 1132#ifdef INET6 1133static int 1134tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) 1135{ 1136 struct inpcb *inp = tp->t_inpcb, *oinp; 1137 struct socket *so = inp->inp_socket; 1138 INIT_VNET_INET(so->so_vnet); 1139 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 1140 struct in6_addr addr6; 1141 int error; 1142 1143 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1144 INP_WLOCK_ASSERT(inp); 1145 1146 if (inp->inp_lport == 0) { 1147 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 1148 if (error) 1149 return error; 1150 } 1151 1152 /* 1153 * Cannot simply call in_pcbconnect, because there might be an 1154 * earlier incarnation of this same connection still in 1155 * TIME_WAIT state, creating an ADDRINUSE error. 1156 * in6_pcbladdr() also handles scope zone IDs. 1157 */ 1158 error = in6_pcbladdr(inp, nam, &addr6); 1159 if (error) 1160 return error; 1161 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 1162 &sin6->sin6_addr, sin6->sin6_port, 1163 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) 1164 ? &addr6 1165 : &inp->in6p_laddr, 1166 inp->inp_lport, 0, NULL); 1167 if (oinp) 1168 return EADDRINUSE; 1169 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 1170 inp->in6p_laddr = addr6; 1171 inp->in6p_faddr = sin6->sin6_addr; 1172 inp->inp_fport = sin6->sin6_port; 1173 /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ 1174 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; 1175 if (inp->inp_flags & IN6P_AUTOFLOWLABEL) 1176 inp->inp_flow |= 1177 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); 1178 in_pcbrehash(inp); 1179 1180 /* Compute window scaling to request. */ 1181 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 1182 (TCP_MAXWIN << tp->request_r_scale) < sb_max) 1183 tp->request_r_scale++; 1184 1185 soisconnecting(so); 1186 TCPSTAT_INC(tcps_connattempt); 1187 tp->t_state = TCPS_SYN_SENT; 1188 tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); 1189 tp->iss = tcp_new_isn(tp); 1190 tp->t_bw_rtseq = tp->iss; 1191 tcp_sendseqinit(tp); 1192 1193 return 0; 1194} 1195#endif /* INET6 */ 1196 1197/* 1198 * Export TCP internal state information via a struct tcp_info, based on the 1199 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently 1200 * (TCP state machine, etc). We export all information using FreeBSD-native 1201 * constants -- for example, the numeric values for tcpi_state will differ 1202 * from Linux. 1203 */ 1204static void 1205tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) 1206{ 1207 1208 INP_WLOCK_ASSERT(tp->t_inpcb); 1209 bzero(ti, sizeof(*ti)); 1210 1211 ti->tcpi_state = tp->t_state; 1212 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 1213 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 1214 if (tp->t_flags & TF_SACK_PERMIT) 1215 ti->tcpi_options |= TCPI_OPT_SACK; 1216 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 1217 ti->tcpi_options |= TCPI_OPT_WSCALE; 1218 ti->tcpi_snd_wscale = tp->snd_scale; 1219 ti->tcpi_rcv_wscale = tp->rcv_scale; 1220 } 1221 1222 ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; 1223 ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; 1224 1225 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 1226 ti->tcpi_snd_cwnd = tp->snd_cwnd; 1227 1228 /* 1229 * FreeBSD-specific extension fields for tcp_info. 1230 */ 1231 ti->tcpi_rcv_space = tp->rcv_wnd; 1232 ti->tcpi_rcv_nxt = tp->rcv_nxt; 1233 ti->tcpi_snd_wnd = tp->snd_wnd; 1234 ti->tcpi_snd_bwnd = tp->snd_bwnd; 1235 ti->tcpi_snd_nxt = tp->snd_nxt; 1236 ti->__tcpi_snd_mss = tp->t_maxseg; 1237 ti->__tcpi_rcv_mss = tp->t_maxseg; 1238 if (tp->t_flags & TF_TOE) 1239 ti->tcpi_options |= TCPI_OPT_TOE; 1240} 1241 1242/* 1243 * tcp_ctloutput() must drop the inpcb lock before performing copyin on 1244 * socket option arguments. When it re-acquires the lock after the copy, it 1245 * has to revalidate that the connection is still valid for the socket 1246 * option. 1247 */ 1248#define INP_WLOCK_RECHECK(inp) do { \ 1249 INP_WLOCK(inp); \ 1250 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ 1251 INP_WUNLOCK(inp); \ 1252 return (ECONNRESET); \ 1253 } \ 1254 tp = intotcpcb(inp); \ 1255} while(0) 1256 1257int 1258tcp_ctloutput(struct socket *so, struct sockopt *sopt) 1259{ 1260 INIT_VNET_INET(so->so_vnet); 1261 int error, opt, optval; 1262 struct inpcb *inp; 1263 struct tcpcb *tp; 1264 struct tcp_info ti; 1265 1266 error = 0; 1267 inp = sotoinpcb(so); 1268 KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); 1269 INP_WLOCK(inp); 1270 if (sopt->sopt_level != IPPROTO_TCP) { 1271#ifdef INET6 1272 if (inp->inp_vflag & INP_IPV6PROTO) { 1273 INP_WUNLOCK(inp); 1274 error = ip6_ctloutput(so, sopt); 1275 } else { 1276#endif /* INET6 */ 1277 INP_WUNLOCK(inp); 1278 error = ip_ctloutput(so, sopt); 1279#ifdef INET6 1280 } 1281#endif 1282 return (error); 1283 } 1284 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 1285 INP_WUNLOCK(inp); 1286 return (ECONNRESET); 1287 } 1288 1289 switch (sopt->sopt_dir) { 1290 case SOPT_SET: 1291 switch (sopt->sopt_name) { 1292#ifdef TCP_SIGNATURE 1293 case TCP_MD5SIG: 1294 INP_WUNLOCK(inp); 1295 error = sooptcopyin(sopt, &optval, sizeof optval, 1296 sizeof optval); 1297 if (error) 1298 return (error); 1299 1300 INP_WLOCK_RECHECK(inp); 1301 if (optval > 0) 1302 tp->t_flags |= TF_SIGNATURE; 1303 else 1304 tp->t_flags &= ~TF_SIGNATURE; 1305 INP_WUNLOCK(inp); 1306 break; 1307#endif /* TCP_SIGNATURE */ 1308 case TCP_NODELAY: 1309 case TCP_NOOPT: 1310 INP_WUNLOCK(inp); 1311 error = sooptcopyin(sopt, &optval, sizeof optval, 1312 sizeof optval); 1313 if (error) 1314 return (error); 1315 1316 INP_WLOCK_RECHECK(inp); 1317 switch (sopt->sopt_name) { 1318 case TCP_NODELAY: 1319 opt = TF_NODELAY; 1320 break; 1321 case TCP_NOOPT: 1322 opt = TF_NOOPT; 1323 break; 1324 default: 1325 opt = 0; /* dead code to fool gcc */ 1326 break; 1327 } 1328 1329 if (optval) 1330 tp->t_flags |= opt; 1331 else 1332 tp->t_flags &= ~opt; 1333 INP_WUNLOCK(inp); 1334 break; 1335 1336 case TCP_NOPUSH: 1337 INP_WUNLOCK(inp); 1338 error = sooptcopyin(sopt, &optval, sizeof optval, 1339 sizeof optval); 1340 if (error) 1341 return (error); 1342 1343 INP_WLOCK_RECHECK(inp); 1344 if (optval) 1345 tp->t_flags |= TF_NOPUSH; 1346 else { 1347 tp->t_flags &= ~TF_NOPUSH; 1348 error = tcp_output(tp); 1349 } 1350 INP_WUNLOCK(inp); 1351 break; 1352 1353 case TCP_MAXSEG: 1354 INP_WUNLOCK(inp); 1355 error = sooptcopyin(sopt, &optval, sizeof optval, 1356 sizeof optval); 1357 if (error) 1358 return (error); 1359 1360 INP_WLOCK_RECHECK(inp); 1361 if (optval > 0 && optval <= tp->t_maxseg && 1362 optval + 40 >= V_tcp_minmss) 1363 tp->t_maxseg = optval; 1364 else 1365 error = EINVAL; 1366 INP_WUNLOCK(inp); 1367 break; 1368 1369 case TCP_INFO: 1370 INP_WUNLOCK(inp); 1371 error = EINVAL; 1372 break; 1373 1374 default: 1375 INP_WUNLOCK(inp); 1376 error = ENOPROTOOPT; 1377 break; 1378 } 1379 break; 1380 1381 case SOPT_GET: 1382 tp = intotcpcb(inp); 1383 switch (sopt->sopt_name) { 1384#ifdef TCP_SIGNATURE 1385 case TCP_MD5SIG: 1386 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1387 INP_WUNLOCK(inp); 1388 error = sooptcopyout(sopt, &optval, sizeof optval); 1389 break; 1390#endif 1391 1392 case TCP_NODELAY: 1393 optval = tp->t_flags & TF_NODELAY; 1394 INP_WUNLOCK(inp); 1395 error = sooptcopyout(sopt, &optval, sizeof optval); 1396 break; 1397 case TCP_MAXSEG: 1398 optval = tp->t_maxseg; 1399 INP_WUNLOCK(inp); 1400 error = sooptcopyout(sopt, &optval, sizeof optval); 1401 break; 1402 case TCP_NOOPT: 1403 optval = tp->t_flags & TF_NOOPT; 1404 INP_WUNLOCK(inp); 1405 error = sooptcopyout(sopt, &optval, sizeof optval); 1406 break; 1407 case TCP_NOPUSH: 1408 optval = tp->t_flags & TF_NOPUSH; 1409 INP_WUNLOCK(inp); 1410 error = sooptcopyout(sopt, &optval, sizeof optval); 1411 break; 1412 case TCP_INFO: 1413 tcp_fill_info(tp, &ti); 1414 INP_WUNLOCK(inp); 1415 error = sooptcopyout(sopt, &ti, sizeof ti); 1416 break; 1417 default: 1418 INP_WUNLOCK(inp); 1419 error = ENOPROTOOPT; 1420 break; 1421 } 1422 break; 1423 } 1424 return (error); 1425} 1426#undef INP_WLOCK_RECHECK 1427 1428/* 1429 * tcp_sendspace and tcp_recvspace are the default send and receive window 1430 * sizes, respectively. These are obsolescent (this information should 1431 * be set by the route). 1432 */ 1433u_long tcp_sendspace = 1024*32; 1434SYSCTL_ULONG(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1435 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1436u_long tcp_recvspace = 1024*64; 1437SYSCTL_ULONG(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1438 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1439 1440/* 1441 * Attach TCP protocol to socket, allocating 1442 * internet protocol control block, tcp control block, 1443 * bufer space, and entering LISTEN state if to accept connections. 1444 */ 1445static int 1446tcp_attach(struct socket *so) 1447{ 1448 INIT_VNET_INET(so->so_vnet); 1449 struct tcpcb *tp; 1450 struct inpcb *inp; 1451 int error; 1452 1453 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 1454 error = soreserve(so, tcp_sendspace, tcp_recvspace); 1455 if (error) 1456 return (error); 1457 } 1458 so->so_rcv.sb_flags |= SB_AUTOSIZE; 1459 so->so_snd.sb_flags |= SB_AUTOSIZE; 1460 INP_INFO_WLOCK(&V_tcbinfo); 1461 error = in_pcballoc(so, &V_tcbinfo); 1462 if (error) { 1463 INP_INFO_WUNLOCK(&V_tcbinfo); 1464 return (error); 1465 } 1466 inp = sotoinpcb(so); 1467#ifdef INET6 1468 if (inp->inp_vflag & INP_IPV6PROTO) { 1469 inp->inp_vflag |= INP_IPV6; 1470 inp->in6p_hops = -1; /* use kernel default */ 1471 } 1472 else 1473#endif 1474 inp->inp_vflag |= INP_IPV4; 1475 tp = tcp_newtcpcb(inp); 1476 if (tp == NULL) { 1477 in_pcbdetach(inp); 1478 in_pcbfree(inp); 1479 INP_INFO_WUNLOCK(&V_tcbinfo); 1480 return (ENOBUFS); 1481 } 1482 tp->t_state = TCPS_CLOSED; 1483 INP_WUNLOCK(inp); 1484 INP_INFO_WUNLOCK(&V_tcbinfo); 1485 return (0); 1486} 1487 1488/* 1489 * Initiate (or continue) disconnect. 1490 * If embryonic state, just send reset (once). 1491 * If in ``let data drain'' option and linger null, just drop. 1492 * Otherwise (hard), mark socket disconnecting and drop 1493 * current input data; switch states based on user close, and 1494 * send segment to peer (with FIN). 1495 */ 1496static void 1497tcp_disconnect(struct tcpcb *tp) 1498{ 1499 struct inpcb *inp = tp->t_inpcb; 1500 struct socket *so = inp->inp_socket; 1501#ifdef INVARIANTS 1502 INIT_VNET_INET(so->so_vnet); 1503#endif 1504 1505 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1506 INP_WLOCK_ASSERT(inp); 1507 1508 /* 1509 * Neither tcp_close() nor tcp_drop() should return NULL, as the 1510 * socket is still open. 1511 */ 1512 if (tp->t_state < TCPS_ESTABLISHED) { 1513 tp = tcp_close(tp); 1514 KASSERT(tp != NULL, 1515 ("tcp_disconnect: tcp_close() returned NULL")); 1516 } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) { 1517 tp = tcp_drop(tp, 0); 1518 KASSERT(tp != NULL, 1519 ("tcp_disconnect: tcp_drop() returned NULL")); 1520 } else { 1521 soisdisconnecting(so); 1522 sbflush(&so->so_rcv); 1523 tcp_usrclosed(tp); 1524 if (!(inp->inp_flags & INP_DROPPED)) 1525 tcp_output_disconnect(tp); 1526 } 1527} 1528 1529/* 1530 * User issued close, and wish to trail through shutdown states: 1531 * if never received SYN, just forget it. If got a SYN from peer, 1532 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1533 * If already got a FIN from peer, then almost done; go to LAST_ACK 1534 * state. In all other cases, have already sent FIN to peer (e.g. 1535 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1536 * for peer to send FIN or not respond to keep-alives, etc. 1537 * We can let the user exit from the close as soon as the FIN is acked. 1538 */ 1539static void 1540tcp_usrclosed(struct tcpcb *tp) 1541{ 1542#ifdef INVARIANTS 1543 INIT_VNET_INET(tp->t_inpcb->inp_vnet); 1544#endif 1545 1546 INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1547 INP_WLOCK_ASSERT(tp->t_inpcb); 1548 1549 switch (tp->t_state) { 1550 case TCPS_LISTEN: 1551 tcp_offload_listen_close(tp); 1552 /* FALLTHROUGH */ 1553 case TCPS_CLOSED: 1554 tp->t_state = TCPS_CLOSED; 1555 tp = tcp_close(tp); 1556 /* 1557 * tcp_close() should never return NULL here as the socket is 1558 * still open. 1559 */ 1560 KASSERT(tp != NULL, 1561 ("tcp_usrclosed: tcp_close() returned NULL")); 1562 break; 1563 1564 case TCPS_SYN_SENT: 1565 case TCPS_SYN_RECEIVED: 1566 tp->t_flags |= TF_NEEDFIN; 1567 break; 1568 1569 case TCPS_ESTABLISHED: 1570 tp->t_state = TCPS_FIN_WAIT_1; 1571 break; 1572 1573 case TCPS_CLOSE_WAIT: 1574 tp->t_state = TCPS_LAST_ACK; 1575 break; 1576 } 1577 if (tp->t_state >= TCPS_FIN_WAIT_2) { 1578 soisdisconnected(tp->t_inpcb->inp_socket); 1579 /* Prevent the connection hanging in FIN_WAIT_2 forever. */ 1580 if (tp->t_state == TCPS_FIN_WAIT_2) { 1581 int timeout; 1582 1583 timeout = (tcp_fast_finwait2_recycle) ? 1584 tcp_finwait2_timeout : tcp_maxidle; 1585 tcp_timer_activate(tp, TT_2MSL, timeout); 1586 } 1587 } 1588} 1589 1590#ifdef DDB 1591static void 1592db_print_indent(int indent) 1593{ 1594 int i; 1595 1596 for (i = 0; i < indent; i++) 1597 db_printf(" "); 1598} 1599 1600static void 1601db_print_tstate(int t_state) 1602{ 1603 1604 switch (t_state) { 1605 case TCPS_CLOSED: 1606 db_printf("TCPS_CLOSED"); 1607 return; 1608 1609 case TCPS_LISTEN: 1610 db_printf("TCPS_LISTEN"); 1611 return; 1612 1613 case TCPS_SYN_SENT: 1614 db_printf("TCPS_SYN_SENT"); 1615 return; 1616 1617 case TCPS_SYN_RECEIVED: 1618 db_printf("TCPS_SYN_RECEIVED"); 1619 return; 1620 1621 case TCPS_ESTABLISHED: 1622 db_printf("TCPS_ESTABLISHED"); 1623 return; 1624 1625 case TCPS_CLOSE_WAIT: 1626 db_printf("TCPS_CLOSE_WAIT"); 1627 return; 1628 1629 case TCPS_FIN_WAIT_1: 1630 db_printf("TCPS_FIN_WAIT_1"); 1631 return; 1632 1633 case TCPS_CLOSING: 1634 db_printf("TCPS_CLOSING"); 1635 return; 1636 1637 case TCPS_LAST_ACK: 1638 db_printf("TCPS_LAST_ACK"); 1639 return; 1640 1641 case TCPS_FIN_WAIT_2: 1642 db_printf("TCPS_FIN_WAIT_2"); 1643 return; 1644 1645 case TCPS_TIME_WAIT: 1646 db_printf("TCPS_TIME_WAIT"); 1647 return; 1648 1649 default: 1650 db_printf("unknown"); 1651 return; 1652 } 1653} 1654 1655static void 1656db_print_tflags(u_int t_flags) 1657{ 1658 int comma; 1659 1660 comma = 0; 1661 if (t_flags & TF_ACKNOW) { 1662 db_printf("%sTF_ACKNOW", comma ? ", " : ""); 1663 comma = 1; 1664 } 1665 if (t_flags & TF_DELACK) { 1666 db_printf("%sTF_DELACK", comma ? ", " : ""); 1667 comma = 1; 1668 } 1669 if (t_flags & TF_NODELAY) { 1670 db_printf("%sTF_NODELAY", comma ? ", " : ""); 1671 comma = 1; 1672 } 1673 if (t_flags & TF_NOOPT) { 1674 db_printf("%sTF_NOOPT", comma ? ", " : ""); 1675 comma = 1; 1676 } 1677 if (t_flags & TF_SENTFIN) { 1678 db_printf("%sTF_SENTFIN", comma ? ", " : ""); 1679 comma = 1; 1680 } 1681 if (t_flags & TF_REQ_SCALE) { 1682 db_printf("%sTF_REQ_SCALE", comma ? ", " : ""); 1683 comma = 1; 1684 } 1685 if (t_flags & TF_RCVD_SCALE) { 1686 db_printf("%sTF_RECVD_SCALE", comma ? ", " : ""); 1687 comma = 1; 1688 } 1689 if (t_flags & TF_REQ_TSTMP) { 1690 db_printf("%sTF_REQ_TSTMP", comma ? ", " : ""); 1691 comma = 1; 1692 } 1693 if (t_flags & TF_RCVD_TSTMP) { 1694 db_printf("%sTF_RCVD_TSTMP", comma ? ", " : ""); 1695 comma = 1; 1696 } 1697 if (t_flags & TF_SACK_PERMIT) { 1698 db_printf("%sTF_SACK_PERMIT", comma ? ", " : ""); 1699 comma = 1; 1700 } 1701 if (t_flags & TF_NEEDSYN) { 1702 db_printf("%sTF_NEEDSYN", comma ? ", " : ""); 1703 comma = 1; 1704 } 1705 if (t_flags & TF_NEEDFIN) { 1706 db_printf("%sTF_NEEDFIN", comma ? ", " : ""); 1707 comma = 1; 1708 } 1709 if (t_flags & TF_NOPUSH) { 1710 db_printf("%sTF_NOPUSH", comma ? ", " : ""); 1711 comma = 1; 1712 } 1713 if (t_flags & TF_NOPUSH) { 1714 db_printf("%sTF_NOPUSH", comma ? ", " : ""); 1715 comma = 1; 1716 } 1717 if (t_flags & TF_MORETOCOME) { 1718 db_printf("%sTF_MORETOCOME", comma ? ", " : ""); 1719 comma = 1; 1720 } 1721 if (t_flags & TF_LQ_OVERFLOW) { 1722 db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : ""); 1723 comma = 1; 1724 } 1725 if (t_flags & TF_LASTIDLE) { 1726 db_printf("%sTF_LASTIDLE", comma ? ", " : ""); 1727 comma = 1; 1728 } 1729 if (t_flags & TF_RXWIN0SENT) { 1730 db_printf("%sTF_RXWIN0SENT", comma ? ", " : ""); 1731 comma = 1; 1732 } 1733 if (t_flags & TF_FASTRECOVERY) { 1734 db_printf("%sTF_FASTRECOVERY", comma ? ", " : ""); 1735 comma = 1; 1736 } 1737 if (t_flags & TF_WASFRECOVERY) { 1738 db_printf("%sTF_WASFRECOVERY", comma ? ", " : ""); 1739 comma = 1; 1740 } 1741 if (t_flags & TF_SIGNATURE) { 1742 db_printf("%sTF_SIGNATURE", comma ? ", " : ""); 1743 comma = 1; 1744 } 1745 if (t_flags & TF_FORCEDATA) { 1746 db_printf("%sTF_FORCEDATA", comma ? ", " : ""); 1747 comma = 1; 1748 } 1749 if (t_flags & TF_TSO) { 1750 db_printf("%sTF_TSO", comma ? ", " : ""); 1751 comma = 1; 1752 } 1753 if (t_flags & TF_ECN_PERMIT) { 1754 db_printf("%sTF_ECN_PERMIT", comma ? ", " : ""); 1755 comma = 1; 1756 } 1757} 1758 1759static void 1760db_print_toobflags(char t_oobflags) 1761{ 1762 int comma; 1763 1764 comma = 0; 1765 if (t_oobflags & TCPOOB_HAVEDATA) { 1766 db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : ""); 1767 comma = 1; 1768 } 1769 if (t_oobflags & TCPOOB_HADDATA) { 1770 db_printf("%sTCPOOB_HADDATA", comma ? ", " : ""); 1771 comma = 1; 1772 } 1773} 1774 1775static void 1776db_print_tcpcb(struct tcpcb *tp, const char *name, int indent) 1777{ 1778 1779 db_print_indent(indent); 1780 db_printf("%s at %p\n", name, tp); 1781 1782 indent += 2; 1783 1784 db_print_indent(indent); 1785 db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", 1786 LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); 1787 1788 db_print_indent(indent); 1789 db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", 1790 &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep); 1791 1792 db_print_indent(indent); 1793 db_printf("tt_2msl: %p tt_delack: %p t_inpcb: %p\n", &tp->t_timers->tt_2msl, 1794 &tp->t_timers->tt_delack, tp->t_inpcb); 1795 1796 db_print_indent(indent); 1797 db_printf("t_state: %d (", tp->t_state); 1798 db_print_tstate(tp->t_state); 1799 db_printf(")\n"); 1800 1801 db_print_indent(indent); 1802 db_printf("t_flags: 0x%x (", tp->t_flags); 1803 db_print_tflags(tp->t_flags); 1804 db_printf(")\n"); 1805 1806 db_print_indent(indent); 1807 db_printf("snd_una: 0x%08x snd_max: 0x%08x snd_nxt: x0%08x\n", 1808 tp->snd_una, tp->snd_max, tp->snd_nxt); 1809 1810 db_print_indent(indent); 1811 db_printf("snd_up: 0x%08x snd_wl1: 0x%08x snd_wl2: 0x%08x\n", 1812 tp->snd_up, tp->snd_wl1, tp->snd_wl2); 1813 1814 db_print_indent(indent); 1815 db_printf("iss: 0x%08x irs: 0x%08x rcv_nxt: 0x%08x\n", 1816 tp->iss, tp->irs, tp->rcv_nxt); 1817 1818 db_print_indent(indent); 1819 db_printf("rcv_adv: 0x%08x rcv_wnd: %lu rcv_up: 0x%08x\n", 1820 tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); 1821 1822 db_print_indent(indent); 1823 db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", 1824 tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); 1825 1826 db_print_indent(indent); 1827 db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " 1828 "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, 1829 tp->snd_recover); 1830 1831 db_print_indent(indent); 1832 db_printf("t_maxopd: %u t_rcvtime: %u t_startime: %u\n", 1833 tp->t_maxopd, tp->t_rcvtime, tp->t_starttime); 1834 1835 db_print_indent(indent); 1836 db_printf("t_rttime: %u t_rtsq: 0x%08x t_bw_rtttime: %u\n", 1837 tp->t_rtttime, tp->t_rtseq, tp->t_bw_rtttime); 1838 1839 db_print_indent(indent); 1840 db_printf("t_bw_rtseq: 0x%08x t_rxtcur: %d t_maxseg: %u " 1841 "t_srtt: %d\n", tp->t_bw_rtseq, tp->t_rxtcur, tp->t_maxseg, 1842 tp->t_srtt); 1843 1844 db_print_indent(indent); 1845 db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " 1846 "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, 1847 tp->t_rttbest); 1848 1849 db_print_indent(indent); 1850 db_printf("t_rttupdated: %lu max_sndwnd: %lu t_softerror: %d\n", 1851 tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror); 1852 1853 db_print_indent(indent); 1854 db_printf("t_oobflags: 0x%x (", tp->t_oobflags); 1855 db_print_toobflags(tp->t_oobflags); 1856 db_printf(") t_iobc: 0x%02x\n", tp->t_iobc); 1857 1858 db_print_indent(indent); 1859 db_printf("snd_scale: %u rcv_scale: %u request_r_scale: %u\n", 1860 tp->snd_scale, tp->rcv_scale, tp->request_r_scale); 1861 1862 db_print_indent(indent); 1863 db_printf("ts_recent: %u ts_recent_age: %u\n", 1864 tp->ts_recent, tp->ts_recent_age); 1865 1866 db_print_indent(indent); 1867 db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " 1868 "%lu\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); 1869 1870 db_print_indent(indent); 1871 db_printf("snd_ssthresh_prev: %lu snd_recover_prev: 0x%08x " 1872 "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, 1873 tp->snd_recover_prev, tp->t_badrxtwin); 1874 1875 db_print_indent(indent); 1876 db_printf("snd_numholes: %d snd_holes first: %p\n", 1877 tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes)); 1878 1879 db_print_indent(indent); 1880 db_printf("snd_fack: 0x%08x rcv_numsacks: %d sack_newdata: " 1881 "0x%08x\n", tp->snd_fack, tp->rcv_numsacks, tp->sack_newdata); 1882 1883 /* Skip sackblks, sackhint. */ 1884 1885 db_print_indent(indent); 1886 db_printf("t_rttlow: %d rfbuf_ts: %u rfbuf_cnt: %d\n", 1887 tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt); 1888} 1889 1890DB_SHOW_COMMAND(tcpcb, db_show_tcpcb) 1891{ 1892 struct tcpcb *tp; 1893 1894 if (!have_addr) { 1895 db_printf("usage: show tcpcb <addr>\n"); 1896 return; 1897 } 1898 tp = (struct tcpcb *)addr; 1899 1900 db_print_tcpcb(tp, "tcpcb", 0); 1901} 1902#endif 1903