tcp_usrreq.c revision 148385
1266612Smav/*- 2266612Smav * Copyright (c) 1982, 1986, 1988, 1993 3219089Spjd * The Regents of the University of California. All rights reserved. 4266612Smav * 5266612Smav * Redistribution and use in source and binary forms, with or without 6266612Smav * modification, are permitted provided that the following conditions 7219089Spjd * are met: 8266612Smav * 1. Redistributions of source code must retain the above copyright 9266612Smav * notice, this list of conditions and the following disclaimer. 10266612Smav * 2. Redistributions in binary form must reproduce the above copyright 11266612Smav * notice, this list of conditions and the following disclaimer in the 12219089Spjd * documentation and/or other materials provided with the distribution. 13266612Smav * 4. Neither the name of the University nor the names of its contributors 14266612Smav * may be used to endorse or promote products derived from this software 15266612Smav * without specific prior written permission. 16266612Smav * 17266612Smav * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18266612Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19266612Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20266612Smav * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21266612Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22266612Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23266612Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24266612Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25266612Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26266612Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27266612Smav * SUCH DAMAGE. 28219089Spjd * 29219089Spjd * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 30219089Spjd * $FreeBSD: head/sys/netinet/tcp_usrreq.c 148385 2005-07-25 12:31:43Z ume $ 31266612Smav */ 32266612Smav 33219089Spjd#include "opt_ipsec.h" 34266612Smav#include "opt_inet.h" 35219089Spjd#include "opt_inet6.h" 36266612Smav#include "opt_tcpdebug.h" 37266612Smav 38266612Smav#include <sys/param.h> 39219089Spjd#include <sys/systm.h> 40266612Smav#include <sys/malloc.h> 41266612Smav#include <sys/kernel.h> 42266612Smav#include <sys/sysctl.h> 43266612Smav#include <sys/mbuf.h> 44266612Smav#ifdef INET6 45266612Smav#include <sys/domain.h> 46266612Smav#endif /* INET6 */ 47266612Smav#include <sys/socket.h> 48266612Smav#include <sys/socketvar.h> 49266612Smav#include <sys/protosw.h> 50266612Smav#include <sys/proc.h> 51266612Smav#include <sys/jail.h> 52266612Smav 53266612Smav#include <net/if.h> 54266612Smav#include <net/route.h> 55266612Smav 56266612Smav#include <netinet/in.h> 57266612Smav#include <netinet/in_systm.h> 58266612Smav#ifdef INET6 59266612Smav#include <netinet/ip6.h> 60266612Smav#endif 61266612Smav#include <netinet/in_pcb.h> 62266612Smav#ifdef INET6 63266612Smav#include <netinet6/in6_pcb.h> 64266612Smav#endif 65266612Smav#include <netinet/in_var.h> 66266612Smav#include <netinet/ip_var.h> 67266612Smav#ifdef INET6 68266612Smav#include <netinet6/ip6_var.h> 69266612Smav#include <netinet6/scope6_var.h> 70266612Smav#endif 71266612Smav#include <netinet/tcp.h> 72266612Smav#include <netinet/tcp_fsm.h> 73266612Smav#include <netinet/tcp_seq.h> 74266612Smav#include <netinet/tcp_timer.h> 75266612Smav#include <netinet/tcp_var.h> 76266612Smav#include <netinet/tcpip.h> 77266612Smav#ifdef TCPDEBUG 78266612Smav#include <netinet/tcp_debug.h> 79#endif 80 81#ifdef IPSEC 82#include <netinet6/ipsec.h> 83#endif /*IPSEC*/ 84 85/* 86 * TCP protocol interface to socket abstraction. 87 */ 88extern char *tcpstates[]; /* XXX ??? */ 89 90static int tcp_attach(struct socket *); 91static int tcp_connect(struct tcpcb *, struct sockaddr *, 92 struct thread *td); 93#ifdef INET6 94static int tcp6_connect(struct tcpcb *, struct sockaddr *, 95 struct thread *td); 96#endif /* INET6 */ 97static struct tcpcb * 98 tcp_disconnect(struct tcpcb *); 99static struct tcpcb * 100 tcp_usrclosed(struct tcpcb *); 101static void tcp_fill_info(struct tcpcb *, struct tcp_info *); 102 103#ifdef TCPDEBUG 104#define TCPDEBUG0 int ostate = 0 105#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 106#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 107 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 108#else 109#define TCPDEBUG0 110#define TCPDEBUG1() 111#define TCPDEBUG2(req) 112#endif 113 114/* 115 * TCP attaches to socket via pru_attach(), reserving space, 116 * and an internet control block. 117 */ 118static int 119tcp_usr_attach(struct socket *so, int proto, struct thread *td) 120{ 121 int error; 122 struct inpcb *inp; 123 struct tcpcb *tp = 0; 124 TCPDEBUG0; 125 126 INP_INFO_WLOCK(&tcbinfo); 127 TCPDEBUG1(); 128 inp = sotoinpcb(so); 129 if (inp) { 130 error = EISCONN; 131 goto out; 132 } 133 134 error = tcp_attach(so); 135 if (error) 136 goto out; 137 138 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 139 so->so_linger = TCP_LINGERTIME; 140 141 inp = sotoinpcb(so); 142 tp = intotcpcb(inp); 143out: 144 TCPDEBUG2(PRU_ATTACH); 145 INP_INFO_WUNLOCK(&tcbinfo); 146 return error; 147} 148 149/* 150 * pru_detach() detaches the TCP protocol from the socket. 151 * If the protocol state is non-embryonic, then can't 152 * do this directly: have to initiate a pru_disconnect(), 153 * which may finish later; embryonic TCB's can just 154 * be discarded here. 155 */ 156static int 157tcp_usr_detach(struct socket *so) 158{ 159 int error = 0; 160 struct inpcb *inp; 161 struct tcpcb *tp; 162 TCPDEBUG0; 163 164 INP_INFO_WLOCK(&tcbinfo); 165 inp = sotoinpcb(so); 166 if (inp == NULL) { 167 INP_INFO_WUNLOCK(&tcbinfo); 168 return error; 169 } 170 INP_LOCK(inp); 171 tp = intotcpcb(inp); 172 TCPDEBUG1(); 173 tp = tcp_disconnect(tp); 174 175 TCPDEBUG2(PRU_DETACH); 176 if (tp) 177 INP_UNLOCK(inp); 178 INP_INFO_WUNLOCK(&tcbinfo); 179 return error; 180} 181 182#define INI_NOLOCK 0 183#define INI_READ 1 184#define INI_WRITE 2 185 186#define COMMON_START() \ 187 TCPDEBUG0; \ 188 do { \ 189 if (inirw == INI_READ) \ 190 INP_INFO_RLOCK(&tcbinfo); \ 191 else if (inirw == INI_WRITE) \ 192 INP_INFO_WLOCK(&tcbinfo); \ 193 inp = sotoinpcb(so); \ 194 if (inp == 0) { \ 195 if (inirw == INI_READ) \ 196 INP_INFO_RUNLOCK(&tcbinfo); \ 197 else if (inirw == INI_WRITE) \ 198 INP_INFO_WUNLOCK(&tcbinfo); \ 199 return EINVAL; \ 200 } \ 201 INP_LOCK(inp); \ 202 if (inirw == INI_READ) \ 203 INP_INFO_RUNLOCK(&tcbinfo); \ 204 tp = intotcpcb(inp); \ 205 TCPDEBUG1(); \ 206} while(0) 207 208#define COMMON_END(req) \ 209out: TCPDEBUG2(req); \ 210 do { \ 211 if (tp) \ 212 INP_UNLOCK(inp); \ 213 if (inirw == INI_WRITE) \ 214 INP_INFO_WUNLOCK(&tcbinfo); \ 215 return error; \ 216 goto out; \ 217} while(0) 218 219/* 220 * Give the socket an address. 221 */ 222static int 223tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 224{ 225 int error = 0; 226 struct inpcb *inp; 227 struct tcpcb *tp; 228 struct sockaddr_in *sinp; 229 const int inirw = INI_WRITE; 230 231 sinp = (struct sockaddr_in *)nam; 232 if (nam->sa_len != sizeof (*sinp)) 233 return (EINVAL); 234 /* 235 * Must check for multicast addresses and disallow binding 236 * to them. 237 */ 238 if (sinp->sin_family == AF_INET && 239 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 240 return (EAFNOSUPPORT); 241 242 COMMON_START(); 243 error = in_pcbbind(inp, nam, td->td_ucred); 244 if (error) 245 goto out; 246 COMMON_END(PRU_BIND); 247} 248 249#ifdef INET6 250static int 251tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 252{ 253 int error = 0; 254 struct inpcb *inp; 255 struct tcpcb *tp; 256 struct sockaddr_in6 *sin6p; 257 const int inirw = INI_WRITE; 258 259 sin6p = (struct sockaddr_in6 *)nam; 260 if (nam->sa_len != sizeof (*sin6p)) 261 return (EINVAL); 262 /* 263 * Must check for multicast addresses and disallow binding 264 * to them. 265 */ 266 if (sin6p->sin6_family == AF_INET6 && 267 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 268 return (EAFNOSUPPORT); 269 270 COMMON_START(); 271 inp->inp_vflag &= ~INP_IPV4; 272 inp->inp_vflag |= INP_IPV6; 273 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 274 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 275 inp->inp_vflag |= INP_IPV4; 276 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 277 struct sockaddr_in sin; 278 279 in6_sin6_2_sin(&sin, sin6p); 280 inp->inp_vflag |= INP_IPV4; 281 inp->inp_vflag &= ~INP_IPV6; 282 error = in_pcbbind(inp, (struct sockaddr *)&sin, 283 td->td_ucred); 284 goto out; 285 } 286 } 287 error = in6_pcbbind(inp, nam, td->td_ucred); 288 if (error) 289 goto out; 290 COMMON_END(PRU_BIND); 291} 292#endif /* INET6 */ 293 294/* 295 * Prepare to accept connections. 296 */ 297static int 298tcp_usr_listen(struct socket *so, struct thread *td) 299{ 300 int error = 0; 301 struct inpcb *inp; 302 struct tcpcb *tp; 303 const int inirw = INI_WRITE; 304 305 COMMON_START(); 306 SOCK_LOCK(so); 307 error = solisten_proto_check(so); 308 if (error == 0 && inp->inp_lport == 0) 309 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 310 if (error == 0) { 311 tp->t_state = TCPS_LISTEN; 312 solisten_proto(so); 313 } 314 SOCK_UNLOCK(so); 315 COMMON_END(PRU_LISTEN); 316} 317 318#ifdef INET6 319static int 320tcp6_usr_listen(struct socket *so, struct thread *td) 321{ 322 int error = 0; 323 struct inpcb *inp; 324 struct tcpcb *tp; 325 const int inirw = INI_WRITE; 326 327 COMMON_START(); 328 SOCK_LOCK(so); 329 error = solisten_proto_check(so); 330 if (error == 0 && inp->inp_lport == 0) { 331 inp->inp_vflag &= ~INP_IPV4; 332 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 333 inp->inp_vflag |= INP_IPV4; 334 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 335 } 336 if (error == 0) { 337 tp->t_state = TCPS_LISTEN; 338 solisten_proto(so); 339 } 340 SOCK_UNLOCK(so); 341 COMMON_END(PRU_LISTEN); 342} 343#endif /* INET6 */ 344 345/* 346 * Initiate connection to peer. 347 * Create a template for use in transmissions on this connection. 348 * Enter SYN_SENT state, and mark socket as connecting. 349 * Start keep-alive timer, and seed output sequence space. 350 * Send initial segment on connection. 351 */ 352static int 353tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 354{ 355 int error = 0; 356 struct inpcb *inp; 357 struct tcpcb *tp; 358 struct sockaddr_in *sinp; 359 const int inirw = INI_WRITE; 360 361 sinp = (struct sockaddr_in *)nam; 362 if (nam->sa_len != sizeof (*sinp)) 363 return (EINVAL); 364 /* 365 * Must disallow TCP ``connections'' to multicast addresses. 366 */ 367 if (sinp->sin_family == AF_INET 368 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 369 return (EAFNOSUPPORT); 370 if (jailed(td->td_ucred)) 371 prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); 372 373 COMMON_START(); 374 if ((error = tcp_connect(tp, nam, td)) != 0) 375 goto out; 376 error = tcp_output(tp); 377 COMMON_END(PRU_CONNECT); 378} 379 380#ifdef INET6 381static int 382tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 383{ 384 int error = 0; 385 struct inpcb *inp; 386 struct tcpcb *tp; 387 struct sockaddr_in6 *sin6p; 388 const int inirw = INI_WRITE; 389 390 sin6p = (struct sockaddr_in6 *)nam; 391 if (nam->sa_len != sizeof (*sin6p)) 392 return (EINVAL); 393 /* 394 * Must disallow TCP ``connections'' to multicast addresses. 395 */ 396 if (sin6p->sin6_family == AF_INET6 397 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 398 return (EAFNOSUPPORT); 399 400 COMMON_START(); 401 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 402 struct sockaddr_in sin; 403 404 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 405 error = EINVAL; 406 goto out; 407 } 408 409 in6_sin6_2_sin(&sin, sin6p); 410 inp->inp_vflag |= INP_IPV4; 411 inp->inp_vflag &= ~INP_IPV6; 412 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) 413 goto out; 414 error = tcp_output(tp); 415 goto out; 416 } 417 inp->inp_vflag &= ~INP_IPV4; 418 inp->inp_vflag |= INP_IPV6; 419 inp->inp_inc.inc_isipv6 = 1; 420 if ((error = tcp6_connect(tp, nam, td)) != 0) 421 goto out; 422 error = tcp_output(tp); 423 COMMON_END(PRU_CONNECT); 424} 425#endif /* INET6 */ 426 427/* 428 * Initiate disconnect from peer. 429 * If connection never passed embryonic stage, just drop; 430 * else if don't need to let data drain, then can just drop anyways, 431 * else have to begin TCP shutdown process: mark socket disconnecting, 432 * drain unread data, state switch to reflect user close, and 433 * send segment (e.g. FIN) to peer. Socket will be really disconnected 434 * when peer sends FIN and acks ours. 435 * 436 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 437 */ 438static int 439tcp_usr_disconnect(struct socket *so) 440{ 441 int error = 0; 442 struct inpcb *inp; 443 struct tcpcb *tp; 444 const int inirw = INI_WRITE; 445 446 COMMON_START(); 447 tp = tcp_disconnect(tp); 448 COMMON_END(PRU_DISCONNECT); 449} 450 451/* 452 * Accept a connection. Essentially all the work is 453 * done at higher levels; just return the address 454 * of the peer, storing through addr. 455 */ 456static int 457tcp_usr_accept(struct socket *so, struct sockaddr **nam) 458{ 459 int error = 0; 460 struct inpcb *inp = NULL; 461 struct tcpcb *tp = NULL; 462 struct in_addr addr; 463 in_port_t port = 0; 464 TCPDEBUG0; 465 466 if (so->so_state & SS_ISDISCONNECTED) { 467 error = ECONNABORTED; 468 goto out; 469 } 470 471 INP_INFO_RLOCK(&tcbinfo); 472 inp = sotoinpcb(so); 473 if (!inp) { 474 INP_INFO_RUNLOCK(&tcbinfo); 475 return (EINVAL); 476 } 477 INP_LOCK(inp); 478 INP_INFO_RUNLOCK(&tcbinfo); 479 tp = intotcpcb(inp); 480 TCPDEBUG1(); 481 482 /* 483 * We inline in_setpeeraddr and COMMON_END here, so that we can 484 * copy the data of interest and defer the malloc until after we 485 * release the lock. 486 */ 487 port = inp->inp_fport; 488 addr = inp->inp_faddr; 489 490out: TCPDEBUG2(PRU_ACCEPT); 491 if (tp) 492 INP_UNLOCK(inp); 493 if (error == 0) 494 *nam = in_sockaddr(port, &addr); 495 return error; 496} 497 498#ifdef INET6 499static int 500tcp6_usr_accept(struct socket *so, struct sockaddr **nam) 501{ 502 struct inpcb *inp = NULL; 503 int error = 0; 504 struct tcpcb *tp = NULL; 505 struct in_addr addr; 506 struct in6_addr addr6; 507 in_port_t port = 0; 508 int v4 = 0; 509 TCPDEBUG0; 510 511 if (so->so_state & SS_ISDISCONNECTED) { 512 error = ECONNABORTED; 513 goto out; 514 } 515 516 INP_INFO_RLOCK(&tcbinfo); 517 inp = sotoinpcb(so); 518 if (inp == 0) { 519 INP_INFO_RUNLOCK(&tcbinfo); 520 return (EINVAL); 521 } 522 INP_LOCK(inp); 523 INP_INFO_RUNLOCK(&tcbinfo); 524 tp = intotcpcb(inp); 525 TCPDEBUG1(); 526 /* 527 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can 528 * copy the data of interest and defer the malloc until after we 529 * release the lock. 530 */ 531 if (inp->inp_vflag & INP_IPV4) { 532 v4 = 1; 533 port = inp->inp_fport; 534 addr = inp->inp_faddr; 535 } else { 536 port = inp->inp_fport; 537 addr6 = inp->in6p_faddr; 538 } 539 540out: TCPDEBUG2(PRU_ACCEPT); 541 if (tp) 542 INP_UNLOCK(inp); 543 if (error == 0) { 544 if (v4) 545 *nam = in6_v4mapsin6_sockaddr(port, &addr); 546 else 547 *nam = in6_sockaddr(port, &addr6); 548 } 549 return error; 550} 551#endif /* INET6 */ 552 553/* 554 * This is the wrapper function for in_setsockaddr. We just pass down 555 * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking 556 * here because in_setsockaddr will call malloc and can block. 557 */ 558static int 559tcp_sockaddr(struct socket *so, struct sockaddr **nam) 560{ 561 return (in_setsockaddr(so, nam, &tcbinfo)); 562} 563 564/* 565 * This is the wrapper function for in_setpeeraddr. We just pass down 566 * the pcbinfo for in_setpeeraddr to lock. 567 */ 568static int 569tcp_peeraddr(struct socket *so, struct sockaddr **nam) 570{ 571 return (in_setpeeraddr(so, nam, &tcbinfo)); 572} 573 574/* 575 * Mark the connection as being incapable of further output. 576 */ 577static int 578tcp_usr_shutdown(struct socket *so) 579{ 580 int error = 0; 581 struct inpcb *inp; 582 struct tcpcb *tp; 583 const int inirw = INI_WRITE; 584 585 COMMON_START(); 586 socantsendmore(so); 587 tp = tcp_usrclosed(tp); 588 if (tp) 589 error = tcp_output(tp); 590 COMMON_END(PRU_SHUTDOWN); 591} 592 593/* 594 * After a receive, possibly send window update to peer. 595 */ 596static int 597tcp_usr_rcvd(struct socket *so, int flags) 598{ 599 int error = 0; 600 struct inpcb *inp; 601 struct tcpcb *tp; 602 const int inirw = INI_READ; 603 604 COMMON_START(); 605 tcp_output(tp); 606 COMMON_END(PRU_RCVD); 607} 608 609/* 610 * Do a send by putting data in output queue and updating urgent 611 * marker if URG set. Possibly send more data. Unlike the other 612 * pru_*() routines, the mbuf chains are our responsibility. We 613 * must either enqueue them or free them. The other pru_* routines 614 * generally are caller-frees. 615 */ 616static int 617tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 618 struct sockaddr *nam, struct mbuf *control, struct thread *td) 619{ 620 int error = 0; 621 struct inpcb *inp; 622 struct tcpcb *tp; 623 int unlocked = 0; 624#ifdef INET6 625 int isipv6; 626#endif 627 TCPDEBUG0; 628 629 /* 630 * Need write lock here because this function might call 631 * tcp_connect or tcp_usrclosed. 632 * We really want to have to this function upgrade from read lock 633 * to write lock. XXX 634 */ 635 INP_INFO_WLOCK(&tcbinfo); 636 inp = sotoinpcb(so); 637 if (inp == NULL) { 638 /* 639 * OOPS! we lost a race, the TCP session got reset after 640 * we checked SBS_CANTSENDMORE, eg: while doing uiomove or a 641 * network interrupt in the non-splnet() section of sosend(). 642 */ 643 if (m) 644 m_freem(m); 645 if (control) 646 m_freem(control); 647 error = ECONNRESET; /* XXX EPIPE? */ 648 tp = NULL; 649 TCPDEBUG1(); 650 goto out; 651 } 652 INP_LOCK(inp); 653#ifdef INET6 654 isipv6 = nam && nam->sa_family == AF_INET6; 655#endif /* INET6 */ 656 tp = intotcpcb(inp); 657 TCPDEBUG1(); 658 if (control) { 659 /* TCP doesn't do control messages (rights, creds, etc) */ 660 if (control->m_len) { 661 m_freem(control); 662 if (m) 663 m_freem(m); 664 error = EINVAL; 665 goto out; 666 } 667 m_freem(control); /* empty control, just free it */ 668 } 669 if (!(flags & PRUS_OOB)) { 670 sbappendstream(&so->so_snd, m); 671 if (nam && tp->t_state < TCPS_SYN_SENT) { 672 /* 673 * Do implied connect if not yet connected, 674 * initialize window to default value, and 675 * initialize maxseg/maxopd using peer's cached 676 * MSS. 677 */ 678#ifdef INET6 679 if (isipv6) 680 error = tcp6_connect(tp, nam, td); 681 else 682#endif /* INET6 */ 683 error = tcp_connect(tp, nam, td); 684 if (error) 685 goto out; 686 tp->snd_wnd = TTCP_CLIENT_SND_WND; 687 tcp_mss(tp, -1); 688 } 689 690 if (flags & PRUS_EOF) { 691 /* 692 * Close the send side of the connection after 693 * the data is sent. 694 */ 695 socantsendmore(so); 696 tp = tcp_usrclosed(tp); 697 } 698 INP_INFO_WUNLOCK(&tcbinfo); 699 unlocked = 1; 700 if (tp != NULL) { 701 if (flags & PRUS_MORETOCOME) 702 tp->t_flags |= TF_MORETOCOME; 703 error = tcp_output(tp); 704 if (flags & PRUS_MORETOCOME) 705 tp->t_flags &= ~TF_MORETOCOME; 706 } 707 } else { 708 SOCKBUF_LOCK(&so->so_snd); 709 if (sbspace(&so->so_snd) < -512) { 710 SOCKBUF_UNLOCK(&so->so_snd); 711 m_freem(m); 712 error = ENOBUFS; 713 goto out; 714 } 715 /* 716 * According to RFC961 (Assigned Protocols), 717 * the urgent pointer points to the last octet 718 * of urgent data. We continue, however, 719 * to consider it to indicate the first octet 720 * of data past the urgent section. 721 * Otherwise, snd_up should be one lower. 722 */ 723 sbappendstream_locked(&so->so_snd, m); 724 SOCKBUF_UNLOCK(&so->so_snd); 725 if (nam && tp->t_state < TCPS_SYN_SENT) { 726 /* 727 * Do implied connect if not yet connected, 728 * initialize window to default value, and 729 * initialize maxseg/maxopd using peer's cached 730 * MSS. 731 */ 732#ifdef INET6 733 if (isipv6) 734 error = tcp6_connect(tp, nam, td); 735 else 736#endif /* INET6 */ 737 error = tcp_connect(tp, nam, td); 738 if (error) 739 goto out; 740 tp->snd_wnd = TTCP_CLIENT_SND_WND; 741 tcp_mss(tp, -1); 742 } 743 INP_INFO_WUNLOCK(&tcbinfo); 744 unlocked = 1; 745 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 746 tp->t_flags |= TF_FORCEDATA; 747 error = tcp_output(tp); 748 tp->t_flags &= ~TF_FORCEDATA; 749 } 750out: 751 TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : 752 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 753 if (tp) 754 INP_UNLOCK(inp); 755 if (!unlocked) 756 INP_INFO_WUNLOCK(&tcbinfo); 757 return (error); 758} 759 760/* 761 * Abort the TCP. 762 */ 763static int 764tcp_usr_abort(struct socket *so) 765{ 766 int error = 0; 767 struct inpcb *inp; 768 struct tcpcb *tp; 769 const int inirw = INI_WRITE; 770 771 COMMON_START(); 772 tp = tcp_drop(tp, ECONNABORTED); 773 COMMON_END(PRU_ABORT); 774} 775 776/* 777 * Receive out-of-band data. 778 */ 779static int 780tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) 781{ 782 int error = 0; 783 struct inpcb *inp; 784 struct tcpcb *tp; 785 const int inirw = INI_READ; 786 787 COMMON_START(); 788 if ((so->so_oobmark == 0 && 789 (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || 790 so->so_options & SO_OOBINLINE || 791 tp->t_oobflags & TCPOOB_HADDATA) { 792 error = EINVAL; 793 goto out; 794 } 795 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 796 error = EWOULDBLOCK; 797 goto out; 798 } 799 m->m_len = 1; 800 *mtod(m, caddr_t) = tp->t_iobc; 801 if ((flags & MSG_PEEK) == 0) 802 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 803 COMMON_END(PRU_RCVOOB); 804} 805 806struct pr_usrreqs tcp_usrreqs = { 807 .pru_abort = tcp_usr_abort, 808 .pru_accept = tcp_usr_accept, 809 .pru_attach = tcp_usr_attach, 810 .pru_bind = tcp_usr_bind, 811 .pru_connect = tcp_usr_connect, 812 .pru_control = in_control, 813 .pru_detach = tcp_usr_detach, 814 .pru_disconnect = tcp_usr_disconnect, 815 .pru_listen = tcp_usr_listen, 816 .pru_peeraddr = tcp_peeraddr, 817 .pru_rcvd = tcp_usr_rcvd, 818 .pru_rcvoob = tcp_usr_rcvoob, 819 .pru_send = tcp_usr_send, 820 .pru_shutdown = tcp_usr_shutdown, 821 .pru_sockaddr = tcp_sockaddr, 822 .pru_sosetlabel = in_pcbsosetlabel 823}; 824 825#ifdef INET6 826struct pr_usrreqs tcp6_usrreqs = { 827 .pru_abort = tcp_usr_abort, 828 .pru_accept = tcp6_usr_accept, 829 .pru_attach = tcp_usr_attach, 830 .pru_bind = tcp6_usr_bind, 831 .pru_connect = tcp6_usr_connect, 832 .pru_control = in6_control, 833 .pru_detach = tcp_usr_detach, 834 .pru_disconnect = tcp_usr_disconnect, 835 .pru_listen = tcp6_usr_listen, 836 .pru_peeraddr = in6_mapped_peeraddr, 837 .pru_rcvd = tcp_usr_rcvd, 838 .pru_rcvoob = tcp_usr_rcvoob, 839 .pru_send = tcp_usr_send, 840 .pru_shutdown = tcp_usr_shutdown, 841 .pru_sockaddr = in6_mapped_sockaddr, 842 .pru_sosetlabel = in_pcbsosetlabel 843}; 844#endif /* INET6 */ 845 846/* 847 * Common subroutine to open a TCP connection to remote host specified 848 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 849 * port number if needed. Call in_pcbconnect_setup to do the routing and 850 * to choose a local host address (interface). If there is an existing 851 * incarnation of the same connection in TIME-WAIT state and if the remote 852 * host was sending CC options and if the connection duration was < MSL, then 853 * truncate the previous TIME-WAIT state and proceed. 854 * Initialize connection parameters and enter SYN-SENT state. 855 */ 856static int 857tcp_connect(tp, nam, td) 858 register struct tcpcb *tp; 859 struct sockaddr *nam; 860 struct thread *td; 861{ 862 struct inpcb *inp = tp->t_inpcb, *oinp; 863 struct socket *so = inp->inp_socket; 864 struct in_addr laddr; 865 u_short lport; 866 int error; 867 868 if (inp->inp_lport == 0) { 869 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 870 if (error) 871 return error; 872 } 873 874 /* 875 * Cannot simply call in_pcbconnect, because there might be an 876 * earlier incarnation of this same connection still in 877 * TIME_WAIT state, creating an ADDRINUSE error. 878 */ 879 laddr = inp->inp_laddr; 880 lport = inp->inp_lport; 881 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 882 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 883 if (error && oinp == NULL) 884 return error; 885 if (oinp) 886 return EADDRINUSE; 887 inp->inp_laddr = laddr; 888 in_pcbrehash(inp); 889 890 /* Compute window scaling to request. */ 891 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 892 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 893 tp->request_r_scale++; 894 895 soisconnecting(so); 896 tcpstat.tcps_connattempt++; 897 tp->t_state = TCPS_SYN_SENT; 898 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 899 tp->iss = tcp_new_isn(tp); 900 tp->t_bw_rtseq = tp->iss; 901 tcp_sendseqinit(tp); 902 903 return 0; 904} 905 906#ifdef INET6 907static int 908tcp6_connect(tp, nam, td) 909 register struct tcpcb *tp; 910 struct sockaddr *nam; 911 struct thread *td; 912{ 913 struct inpcb *inp = tp->t_inpcb, *oinp; 914 struct socket *so = inp->inp_socket; 915 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 916 struct in6_addr *addr6; 917 int error; 918 919 if (inp->inp_lport == 0) { 920 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 921 if (error) 922 return error; 923 } 924 925 /* 926 * Cannot simply call in_pcbconnect, because there might be an 927 * earlier incarnation of this same connection still in 928 * TIME_WAIT state, creating an ADDRINUSE error. 929 * in6_pcbladdr() also handles scope zone IDs. 930 */ 931 error = in6_pcbladdr(inp, nam, &addr6); 932 if (error) 933 return error; 934 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 935 &sin6->sin6_addr, sin6->sin6_port, 936 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) 937 ? addr6 938 : &inp->in6p_laddr, 939 inp->inp_lport, 0, NULL); 940 if (oinp) 941 return EADDRINUSE; 942 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 943 inp->in6p_laddr = *addr6; 944 inp->in6p_faddr = sin6->sin6_addr; 945 inp->inp_fport = sin6->sin6_port; 946 /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ 947 inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK; 948 if (inp->in6p_flags & IN6P_AUTOFLOWLABEL) 949 inp->in6p_flowinfo |= 950 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); 951 in_pcbrehash(inp); 952 953 /* Compute window scaling to request. */ 954 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 955 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 956 tp->request_r_scale++; 957 958 soisconnecting(so); 959 tcpstat.tcps_connattempt++; 960 tp->t_state = TCPS_SYN_SENT; 961 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 962 tp->iss = tcp_new_isn(tp); 963 tp->t_bw_rtseq = tp->iss; 964 tcp_sendseqinit(tp); 965 966 return 0; 967} 968#endif /* INET6 */ 969 970/* 971 * Export TCP internal state information via a struct tcp_info, based on the 972 * Linux 2.6 API. Not ABI compatible as our constants are mapped differently 973 * (TCP state machine, etc). We export all information using FreeBSD-native 974 * constants -- for example, the numeric values for tcpi_state will differ 975 * from Linux. 976 */ 977static void 978tcp_fill_info(tp, ti) 979 struct tcpcb *tp; 980 struct tcp_info *ti; 981{ 982 983 INP_LOCK_ASSERT(tp->t_inpcb); 984 bzero(ti, sizeof(*ti)); 985 986 ti->tcpi_state = tp->t_state; 987 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 988 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 989 if (tp->sack_enable) 990 ti->tcpi_options |= TCPI_OPT_SACK; 991 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 992 ti->tcpi_options |= TCPI_OPT_WSCALE; 993 ti->tcpi_snd_wscale = tp->snd_scale; 994 ti->tcpi_rcv_wscale = tp->rcv_scale; 995 } 996 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 997 ti->tcpi_snd_cwnd = tp->snd_cwnd; 998 999 /* 1000 * FreeBSD-specific extension fields for tcp_info. 1001 */ 1002 ti->tcpi_rcv_space = tp->rcv_wnd; 1003 ti->tcpi_snd_wnd = tp->snd_wnd; 1004 ti->tcpi_snd_bwnd = tp->snd_bwnd; 1005} 1006 1007/* 1008 * The new sockopt interface makes it possible for us to block in the 1009 * copyin/out step (if we take a page fault). Taking a page fault at 1010 * splnet() is probably a Bad Thing. (Since sockets and pcbs both now 1011 * use TSM, there probably isn't any need for this function to run at 1012 * splnet() any more. This needs more examination.) 1013 * 1014 * XXXRW: The locking here is wrong; we may take a page fault while holding 1015 * the inpcb lock. 1016 */ 1017int 1018tcp_ctloutput(so, sopt) 1019 struct socket *so; 1020 struct sockopt *sopt; 1021{ 1022 int error, opt, optval; 1023 struct inpcb *inp; 1024 struct tcpcb *tp; 1025 struct tcp_info ti; 1026 1027 error = 0; 1028 INP_INFO_RLOCK(&tcbinfo); 1029 inp = sotoinpcb(so); 1030 if (inp == NULL) { 1031 INP_INFO_RUNLOCK(&tcbinfo); 1032 return (ECONNRESET); 1033 } 1034 INP_LOCK(inp); 1035 INP_INFO_RUNLOCK(&tcbinfo); 1036 if (sopt->sopt_level != IPPROTO_TCP) { 1037 INP_UNLOCK(inp); 1038#ifdef INET6 1039 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1040 error = ip6_ctloutput(so, sopt); 1041 else 1042#endif /* INET6 */ 1043 error = ip_ctloutput(so, sopt); 1044 return (error); 1045 } 1046 tp = intotcpcb(inp); 1047 1048 switch (sopt->sopt_dir) { 1049 case SOPT_SET: 1050 switch (sopt->sopt_name) { 1051#ifdef TCP_SIGNATURE 1052 case TCP_MD5SIG: 1053 error = sooptcopyin(sopt, &optval, sizeof optval, 1054 sizeof optval); 1055 if (error) 1056 break; 1057 1058 if (optval > 0) 1059 tp->t_flags |= TF_SIGNATURE; 1060 else 1061 tp->t_flags &= ~TF_SIGNATURE; 1062 break; 1063#endif /* TCP_SIGNATURE */ 1064 case TCP_NODELAY: 1065 case TCP_NOOPT: 1066 error = sooptcopyin(sopt, &optval, sizeof optval, 1067 sizeof optval); 1068 if (error) 1069 break; 1070 1071 switch (sopt->sopt_name) { 1072 case TCP_NODELAY: 1073 opt = TF_NODELAY; 1074 break; 1075 case TCP_NOOPT: 1076 opt = TF_NOOPT; 1077 break; 1078 default: 1079 opt = 0; /* dead code to fool gcc */ 1080 break; 1081 } 1082 1083 if (optval) 1084 tp->t_flags |= opt; 1085 else 1086 tp->t_flags &= ~opt; 1087 break; 1088 1089 case TCP_NOPUSH: 1090 error = sooptcopyin(sopt, &optval, sizeof optval, 1091 sizeof optval); 1092 if (error) 1093 break; 1094 1095 if (optval) 1096 tp->t_flags |= TF_NOPUSH; 1097 else { 1098 tp->t_flags &= ~TF_NOPUSH; 1099 error = tcp_output(tp); 1100 } 1101 break; 1102 1103 case TCP_MAXSEG: 1104 error = sooptcopyin(sopt, &optval, sizeof optval, 1105 sizeof optval); 1106 if (error) 1107 break; 1108 1109 if (optval > 0 && optval <= tp->t_maxseg && 1110 optval + 40 >= tcp_minmss) 1111 tp->t_maxseg = optval; 1112 else 1113 error = EINVAL; 1114 break; 1115 1116 case TCP_INFO: 1117 error = EINVAL; 1118 break; 1119 1120 default: 1121 error = ENOPROTOOPT; 1122 break; 1123 } 1124 break; 1125 1126 case SOPT_GET: 1127 switch (sopt->sopt_name) { 1128#ifdef TCP_SIGNATURE 1129 case TCP_MD5SIG: 1130 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1131 error = sooptcopyout(sopt, &optval, sizeof optval); 1132 break; 1133#endif 1134 case TCP_NODELAY: 1135 optval = tp->t_flags & TF_NODELAY; 1136 error = sooptcopyout(sopt, &optval, sizeof optval); 1137 break; 1138 case TCP_MAXSEG: 1139 optval = tp->t_maxseg; 1140 error = sooptcopyout(sopt, &optval, sizeof optval); 1141 break; 1142 case TCP_NOOPT: 1143 optval = tp->t_flags & TF_NOOPT; 1144 error = sooptcopyout(sopt, &optval, sizeof optval); 1145 break; 1146 case TCP_NOPUSH: 1147 optval = tp->t_flags & TF_NOPUSH; 1148 error = sooptcopyout(sopt, &optval, sizeof optval); 1149 break; 1150 case TCP_INFO: 1151 tcp_fill_info(tp, &ti); 1152 error = sooptcopyout(sopt, &ti, sizeof ti); 1153 break; 1154 default: 1155 error = ENOPROTOOPT; 1156 break; 1157 } 1158 break; 1159 } 1160 INP_UNLOCK(inp); 1161 return (error); 1162} 1163 1164/* 1165 * tcp_sendspace and tcp_recvspace are the default send and receive window 1166 * sizes, respectively. These are obsolescent (this information should 1167 * be set by the route). 1168 */ 1169u_long tcp_sendspace = 1024*32; 1170SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1171 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1172u_long tcp_recvspace = 1024*64; 1173SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1174 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1175 1176/* 1177 * Attach TCP protocol to socket, allocating 1178 * internet protocol control block, tcp control block, 1179 * bufer space, and entering LISTEN state if to accept connections. 1180 */ 1181static int 1182tcp_attach(so) 1183 struct socket *so; 1184{ 1185 register struct tcpcb *tp; 1186 struct inpcb *inp; 1187 int error; 1188#ifdef INET6 1189 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1190#endif 1191 1192 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1193 1194 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 1195 error = soreserve(so, tcp_sendspace, tcp_recvspace); 1196 if (error) 1197 return (error); 1198 } 1199 error = in_pcballoc(so, &tcbinfo, "tcpinp"); 1200 if (error) 1201 return (error); 1202 inp = sotoinpcb(so); 1203#ifdef INET6 1204 if (isipv6) { 1205 inp->inp_vflag |= INP_IPV6; 1206 inp->in6p_hops = -1; /* use kernel default */ 1207 } 1208 else 1209#endif 1210 inp->inp_vflag |= INP_IPV4; 1211 tp = tcp_newtcpcb(inp); 1212 if (tp == 0) { 1213 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 1214 1215 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 1216 1217 INP_LOCK(inp); 1218#ifdef INET6 1219 if (isipv6) 1220 in6_pcbdetach(inp); 1221 else 1222#endif 1223 in_pcbdetach(inp); 1224 so->so_state |= nofd; 1225 return (ENOBUFS); 1226 } 1227 tp->t_state = TCPS_CLOSED; 1228 return (0); 1229} 1230 1231/* 1232 * Initiate (or continue) disconnect. 1233 * If embryonic state, just send reset (once). 1234 * If in ``let data drain'' option and linger null, just drop. 1235 * Otherwise (hard), mark socket disconnecting and drop 1236 * current input data; switch states based on user close, and 1237 * send segment to peer (with FIN). 1238 */ 1239static struct tcpcb * 1240tcp_disconnect(tp) 1241 register struct tcpcb *tp; 1242{ 1243 struct inpcb *inp = tp->t_inpcb; 1244 struct socket *so = inp->inp_socket; 1245 1246 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1247 INP_LOCK_ASSERT(inp); 1248 1249 if (tp->t_state < TCPS_ESTABLISHED) 1250 tp = tcp_close(tp); 1251 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1252 tp = tcp_drop(tp, 0); 1253 else { 1254 soisdisconnecting(so); 1255 sbflush(&so->so_rcv); 1256 tp = tcp_usrclosed(tp); 1257 if (tp) 1258 (void) tcp_output(tp); 1259 } 1260 return (tp); 1261} 1262 1263/* 1264 * User issued close, and wish to trail through shutdown states: 1265 * if never received SYN, just forget it. If got a SYN from peer, 1266 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1267 * If already got a FIN from peer, then almost done; go to LAST_ACK 1268 * state. In all other cases, have already sent FIN to peer (e.g. 1269 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1270 * for peer to send FIN or not respond to keep-alives, etc. 1271 * We can let the user exit from the close as soon as the FIN is acked. 1272 */ 1273static struct tcpcb * 1274tcp_usrclosed(tp) 1275 register struct tcpcb *tp; 1276{ 1277 1278 INP_INFO_WLOCK_ASSERT(&tcbinfo); 1279 INP_LOCK_ASSERT(tp->t_inpcb); 1280 1281 switch (tp->t_state) { 1282 1283 case TCPS_CLOSED: 1284 case TCPS_LISTEN: 1285 tp->t_state = TCPS_CLOSED; 1286 tp = tcp_close(tp); 1287 break; 1288 1289 case TCPS_SYN_SENT: 1290 case TCPS_SYN_RECEIVED: 1291 tp->t_flags |= TF_NEEDFIN; 1292 break; 1293 1294 case TCPS_ESTABLISHED: 1295 tp->t_state = TCPS_FIN_WAIT_1; 1296 break; 1297 1298 case TCPS_CLOSE_WAIT: 1299 tp->t_state = TCPS_LAST_ACK; 1300 break; 1301 } 1302 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1303 soisdisconnected(tp->t_inpcb->inp_socket); 1304 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1305 if (tp->t_state == TCPS_FIN_WAIT_2) 1306 callout_reset(tp->tt_2msl, tcp_maxidle, 1307 tcp_timer_2msl, tp); 1308 } 1309 return (tp); 1310} 1311