tcp_usrreq.c revision 127862
/*
 * Copyright (c) 1982, 1986, 1988, 1993
 * The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 * must display the following acknowledgement:
 * This product includes software developed by the University of
 * California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
32 * 33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $FreeBSD: head/sys/netinet/tcp_usrreq.c 127862 2004-04-04 20:14:55Z pjd $ 35 */ 36 37#include "opt_ipsec.h" 38#include "opt_inet.h" 39#include "opt_inet6.h" 40#include "opt_tcpdebug.h" 41 42#include <sys/param.h> 43#include <sys/systm.h> 44#include <sys/malloc.h> 45#include <sys/kernel.h> 46#include <sys/sysctl.h> 47#include <sys/mbuf.h> 48#ifdef INET6 49#include <sys/domain.h> 50#endif /* INET6 */ 51#include <sys/socket.h> 52#include <sys/socketvar.h> 53#include <sys/protosw.h> 54#include <sys/proc.h> 55#include <sys/jail.h> 56 57#include <net/if.h> 58#include <net/route.h> 59 60#include <netinet/in.h> 61#include <netinet/in_systm.h> 62#ifdef INET6 63#include <netinet/ip6.h> 64#endif 65#include <netinet/in_pcb.h> 66#ifdef INET6 67#include <netinet6/in6_pcb.h> 68#endif 69#include <netinet/in_var.h> 70#include <netinet/ip_var.h> 71#ifdef INET6 72#include <netinet6/ip6_var.h> 73#endif 74#include <netinet/tcp.h> 75#include <netinet/tcp_fsm.h> 76#include <netinet/tcp_seq.h> 77#include <netinet/tcp_timer.h> 78#include <netinet/tcp_var.h> 79#include <netinet/tcpip.h> 80#ifdef TCPDEBUG 81#include <netinet/tcp_debug.h> 82#endif 83 84#ifdef IPSEC 85#include <netinet6/ipsec.h> 86#endif /*IPSEC*/ 87 88/* 89 * TCP protocol interface to socket abstraction. 90 */ 91extern char *tcpstates[]; /* XXX ??? */ 92 93static int tcp_attach(struct socket *); 94static int tcp_connect(struct tcpcb *, struct sockaddr *, 95 struct thread *td); 96#ifdef INET6 97static int tcp6_connect(struct tcpcb *, struct sockaddr *, 98 struct thread *td); 99#endif /* INET6 */ 100static struct tcpcb * 101 tcp_disconnect(struct tcpcb *); 102static struct tcpcb * 103 tcp_usrclosed(struct tcpcb *); 104 105#ifdef TCPDEBUG 106#define TCPDEBUG0 int ostate = 0 107#define TCPDEBUG1() ostate = tp ? 
tp->t_state : 0 108#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ 109 tcp_trace(TA_USER, ostate, tp, 0, 0, req) 110#else 111#define TCPDEBUG0 112#define TCPDEBUG1() 113#define TCPDEBUG2(req) 114#endif 115 116/* 117 * TCP attaches to socket via pru_attach(), reserving space, 118 * and an internet control block. 119 */ 120static int 121tcp_usr_attach(struct socket *so, int proto, struct thread *td) 122{ 123 int s = splnet(); 124 int error; 125 struct inpcb *inp; 126 struct tcpcb *tp = 0; 127 TCPDEBUG0; 128 129 INP_INFO_WLOCK(&tcbinfo); 130 TCPDEBUG1(); 131 inp = sotoinpcb(so); 132 if (inp) { 133 error = EISCONN; 134 goto out; 135 } 136 137 error = tcp_attach(so); 138 if (error) 139 goto out; 140 141 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 142 so->so_linger = TCP_LINGERTIME; 143 144 inp = sotoinpcb(so); 145 tp = intotcpcb(inp); 146out: 147 TCPDEBUG2(PRU_ATTACH); 148 INP_INFO_WUNLOCK(&tcbinfo); 149 splx(s); 150 return error; 151} 152 153/* 154 * pru_detach() detaches the TCP protocol from the socket. 155 * If the protocol state is non-embryonic, then can't 156 * do this directly: have to initiate a pru_disconnect(), 157 * which may finish later; embryonic TCB's can just 158 * be discarded here. 
159 */ 160static int 161tcp_usr_detach(struct socket *so) 162{ 163 int s = splnet(); 164 int error = 0; 165 struct inpcb *inp; 166 struct tcpcb *tp; 167 TCPDEBUG0; 168 169 INP_INFO_WLOCK(&tcbinfo); 170 inp = sotoinpcb(so); 171 if (inp == 0) { 172 INP_INFO_WUNLOCK(&tcbinfo); 173 splx(s); 174 return EINVAL; /* XXX */ 175 } 176 INP_LOCK(inp); 177 tp = intotcpcb(inp); 178 TCPDEBUG1(); 179 tp = tcp_disconnect(tp); 180 181 TCPDEBUG2(PRU_DETACH); 182 if (tp) 183 INP_UNLOCK(inp); 184 INP_INFO_WUNLOCK(&tcbinfo); 185 splx(s); 186 return error; 187} 188 189#define INI_NOLOCK 0 190#define INI_READ 1 191#define INI_WRITE 2 192 193#define COMMON_START() \ 194 TCPDEBUG0; \ 195 do { \ 196 if (inirw == INI_READ) \ 197 INP_INFO_RLOCK(&tcbinfo); \ 198 else if (inirw == INI_WRITE) \ 199 INP_INFO_WLOCK(&tcbinfo); \ 200 inp = sotoinpcb(so); \ 201 if (inp == 0) { \ 202 if (inirw == INI_READ) \ 203 INP_INFO_RUNLOCK(&tcbinfo); \ 204 else if (inirw == INI_WRITE) \ 205 INP_INFO_WUNLOCK(&tcbinfo); \ 206 splx(s); \ 207 return EINVAL; \ 208 } \ 209 INP_LOCK(inp); \ 210 if (inirw == INI_READ) \ 211 INP_INFO_RUNLOCK(&tcbinfo); \ 212 tp = intotcpcb(inp); \ 213 TCPDEBUG1(); \ 214} while(0) 215 216#define COMMON_END(req) \ 217out: TCPDEBUG2(req); \ 218 do { \ 219 if (tp) \ 220 INP_UNLOCK(inp); \ 221 if (inirw == INI_WRITE) \ 222 INP_INFO_WUNLOCK(&tcbinfo); \ 223 splx(s); \ 224 return error; \ 225 goto out; \ 226} while(0) 227 228/* 229 * Give the socket an address. 230 */ 231static int 232tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 233{ 234 int s = splnet(); 235 int error = 0; 236 struct inpcb *inp; 237 struct tcpcb *tp; 238 struct sockaddr_in *sinp; 239 const int inirw = INI_WRITE; 240 241 sinp = (struct sockaddr_in *)nam; 242 if (nam->sa_len != sizeof (*sinp)) 243 return (EINVAL); 244 /* 245 * Must check for multicast addresses and disallow binding 246 * to them. 
247 */ 248 if (sinp->sin_family == AF_INET && 249 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 250 return (EAFNOSUPPORT); 251 252 COMMON_START(); 253 error = in_pcbbind(inp, nam, td->td_ucred); 254 if (error) 255 goto out; 256 COMMON_END(PRU_BIND); 257} 258 259#ifdef INET6 260static int 261tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 262{ 263 int s = splnet(); 264 int error = 0; 265 struct inpcb *inp; 266 struct tcpcb *tp; 267 struct sockaddr_in6 *sin6p; 268 const int inirw = INI_WRITE; 269 270 sin6p = (struct sockaddr_in6 *)nam; 271 if (nam->sa_len != sizeof (*sin6p)) 272 return (EINVAL); 273 /* 274 * Must check for multicast addresses and disallow binding 275 * to them. 276 */ 277 if (sin6p->sin6_family == AF_INET6 && 278 IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 279 return (EAFNOSUPPORT); 280 281 COMMON_START(); 282 inp->inp_vflag &= ~INP_IPV4; 283 inp->inp_vflag |= INP_IPV6; 284 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { 285 if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) 286 inp->inp_vflag |= INP_IPV4; 287 else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 288 struct sockaddr_in sin; 289 290 in6_sin6_2_sin(&sin, sin6p); 291 inp->inp_vflag |= INP_IPV4; 292 inp->inp_vflag &= ~INP_IPV6; 293 error = in_pcbbind(inp, (struct sockaddr *)&sin, 294 td->td_ucred); 295 goto out; 296 } 297 } 298 error = in6_pcbbind(inp, nam, td->td_ucred); 299 if (error) 300 goto out; 301 COMMON_END(PRU_BIND); 302} 303#endif /* INET6 */ 304 305/* 306 * Prepare to accept connections. 
307 */ 308static int 309tcp_usr_listen(struct socket *so, struct thread *td) 310{ 311 int s = splnet(); 312 int error = 0; 313 struct inpcb *inp; 314 struct tcpcb *tp; 315 const int inirw = INI_WRITE; 316 317 COMMON_START(); 318 if (inp->inp_lport == 0) 319 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 320 if (error == 0) 321 tp->t_state = TCPS_LISTEN; 322 COMMON_END(PRU_LISTEN); 323} 324 325#ifdef INET6 326static int 327tcp6_usr_listen(struct socket *so, struct thread *td) 328{ 329 int s = splnet(); 330 int error = 0; 331 struct inpcb *inp; 332 struct tcpcb *tp; 333 const int inirw = INI_WRITE; 334 335 COMMON_START(); 336 if (inp->inp_lport == 0) { 337 inp->inp_vflag &= ~INP_IPV4; 338 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) 339 inp->inp_vflag |= INP_IPV4; 340 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 341 } 342 if (error == 0) 343 tp->t_state = TCPS_LISTEN; 344 COMMON_END(PRU_LISTEN); 345} 346#endif /* INET6 */ 347 348/* 349 * Initiate connection to peer. 350 * Create a template for use in transmissions on this connection. 351 * Enter SYN_SENT state, and mark socket as connecting. 352 * Start keep-alive timer, and seed output sequence space. 353 * Send initial segment on connection. 354 */ 355static int 356tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 357{ 358 int s = splnet(); 359 int error = 0; 360 struct inpcb *inp; 361 struct tcpcb *tp; 362 struct sockaddr_in *sinp; 363 const int inirw = INI_WRITE; 364 365 sinp = (struct sockaddr_in *)nam; 366 if (nam->sa_len != sizeof (*sinp)) 367 return (EINVAL); 368 /* 369 * Must disallow TCP ``connections'' to multicast addresses. 
370 */ 371 if (sinp->sin_family == AF_INET 372 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) 373 return (EAFNOSUPPORT); 374 if (td && jailed(td->td_ucred)) 375 prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); 376 377 COMMON_START(); 378 if ((error = tcp_connect(tp, nam, td)) != 0) 379 goto out; 380 error = tcp_output(tp); 381 COMMON_END(PRU_CONNECT); 382} 383 384#ifdef INET6 385static int 386tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) 387{ 388 int s = splnet(); 389 int error = 0; 390 struct inpcb *inp; 391 struct tcpcb *tp; 392 struct sockaddr_in6 *sin6p; 393 const int inirw = INI_WRITE; 394 395 sin6p = (struct sockaddr_in6 *)nam; 396 if (nam->sa_len != sizeof (*sin6p)) 397 return (EINVAL); 398 /* 399 * Must disallow TCP ``connections'' to multicast addresses. 400 */ 401 if (sin6p->sin6_family == AF_INET6 402 && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) 403 return (EAFNOSUPPORT); 404 405 COMMON_START(); 406 if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { 407 struct sockaddr_in sin; 408 409 if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { 410 error = EINVAL; 411 goto out; 412 } 413 414 in6_sin6_2_sin(&sin, sin6p); 415 inp->inp_vflag |= INP_IPV4; 416 inp->inp_vflag &= ~INP_IPV6; 417 if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) 418 goto out; 419 error = tcp_output(tp); 420 goto out; 421 } 422 inp->inp_vflag &= ~INP_IPV4; 423 inp->inp_vflag |= INP_IPV6; 424 inp->inp_inc.inc_isipv6 = 1; 425 if ((error = tcp6_connect(tp, nam, td)) != 0) 426 goto out; 427 error = tcp_output(tp); 428 COMMON_END(PRU_CONNECT); 429} 430#endif /* INET6 */ 431 432/* 433 * Initiate disconnect from peer. 434 * If connection never passed embryonic stage, just drop; 435 * else if don't need to let data drain, then can just drop anyways, 436 * else have to begin TCP shutdown process: mark socket disconnecting, 437 * drain unread data, state switch to reflect user close, and 438 * send segment (e.g. FIN) to peer. 
Socket will be really disconnected 439 * when peer sends FIN and acks ours. 440 * 441 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 442 */ 443static int 444tcp_usr_disconnect(struct socket *so) 445{ 446 int s = splnet(); 447 int error = 0; 448 struct inpcb *inp; 449 struct tcpcb *tp; 450 const int inirw = INI_WRITE; 451 452 COMMON_START(); 453 tp = tcp_disconnect(tp); 454 COMMON_END(PRU_DISCONNECT); 455} 456 457/* 458 * Accept a connection. Essentially all the work is 459 * done at higher levels; just return the address 460 * of the peer, storing through addr. 461 */ 462static int 463tcp_usr_accept(struct socket *so, struct sockaddr **nam) 464{ 465 int s; 466 int error = 0; 467 struct inpcb *inp = NULL; 468 struct tcpcb *tp = NULL; 469 struct in_addr addr; 470 in_port_t port = 0; 471 TCPDEBUG0; 472 473 if (so->so_state & SS_ISDISCONNECTED) { 474 error = ECONNABORTED; 475 goto out; 476 } 477 478 s = splnet(); 479 INP_INFO_RLOCK(&tcbinfo); 480 inp = sotoinpcb(so); 481 if (!inp) { 482 INP_INFO_RUNLOCK(&tcbinfo); 483 splx(s); 484 return (EINVAL); 485 } 486 INP_LOCK(inp); 487 INP_INFO_RUNLOCK(&tcbinfo); 488 tp = intotcpcb(inp); 489 TCPDEBUG1(); 490 491 /* 492 * We inline in_setpeeraddr and COMMON_END here, so that we can 493 * copy the data of interest and defer the malloc until after we 494 * release the lock. 
495 */ 496 port = inp->inp_fport; 497 addr = inp->inp_faddr; 498 499out: TCPDEBUG2(PRU_ACCEPT); 500 if (tp) 501 INP_UNLOCK(inp); 502 splx(s); 503 if (error == 0) 504 *nam = in_sockaddr(port, &addr); 505 return error; 506} 507 508#ifdef INET6 509static int 510tcp6_usr_accept(struct socket *so, struct sockaddr **nam) 511{ 512 int s; 513 struct inpcb *inp = NULL; 514 int error = 0; 515 struct tcpcb *tp = NULL; 516 struct in_addr addr; 517 struct in6_addr addr6; 518 in_port_t port = 0; 519 int v4 = 0; 520 TCPDEBUG0; 521 522 if (so->so_state & SS_ISDISCONNECTED) { 523 error = ECONNABORTED; 524 goto out; 525 } 526 527 s = splnet(); 528 INP_INFO_RLOCK(&tcbinfo); 529 inp = sotoinpcb(so); 530 if (inp == 0) { 531 INP_INFO_RUNLOCK(&tcbinfo); 532 splx(s); 533 return (EINVAL); 534 } 535 INP_LOCK(inp); 536 INP_INFO_RUNLOCK(&tcbinfo); 537 tp = intotcpcb(inp); 538 TCPDEBUG1(); 539 /* 540 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can 541 * copy the data of interest and defer the malloc until after we 542 * release the lock. 543 */ 544 if (inp->inp_vflag & INP_IPV4) { 545 v4 = 1; 546 port = inp->inp_fport; 547 addr = inp->inp_faddr; 548 } else { 549 port = inp->inp_fport; 550 addr6 = inp->in6p_faddr; 551 } 552 553out: TCPDEBUG2(PRU_ACCEPT); 554 if (tp) 555 INP_UNLOCK(inp); 556 splx(s); 557 if (error == 0) { 558 if (v4) 559 *nam = in6_v4mapsin6_sockaddr(port, &addr); 560 else 561 *nam = in6_sockaddr(port, &addr6); 562 } 563 return error; 564} 565#endif /* INET6 */ 566 567/* 568 * This is the wrapper function for in_setsockaddr. We just pass down 569 * the pcbinfo for in_setsockaddr to lock. We don't want to do the locking 570 * here because in_setsockaddr will call malloc and can block. 571 */ 572static int 573tcp_sockaddr(struct socket *so, struct sockaddr **nam) 574{ 575 return (in_setsockaddr(so, nam, &tcbinfo)); 576} 577 578/* 579 * This is the wrapper function for in_setpeeraddr. We just pass down 580 * the pcbinfo for in_setpeeraddr to lock. 
581 */ 582static int 583tcp_peeraddr(struct socket *so, struct sockaddr **nam) 584{ 585 return (in_setpeeraddr(so, nam, &tcbinfo)); 586} 587 588/* 589 * Mark the connection as being incapable of further output. 590 */ 591static int 592tcp_usr_shutdown(struct socket *so) 593{ 594 int s = splnet(); 595 int error = 0; 596 struct inpcb *inp; 597 struct tcpcb *tp; 598 const int inirw = INI_WRITE; 599 600 COMMON_START(); 601 socantsendmore(so); 602 tp = tcp_usrclosed(tp); 603 if (tp) 604 error = tcp_output(tp); 605 COMMON_END(PRU_SHUTDOWN); 606} 607 608/* 609 * After a receive, possibly send window update to peer. 610 */ 611static int 612tcp_usr_rcvd(struct socket *so, int flags) 613{ 614 int s = splnet(); 615 int error = 0; 616 struct inpcb *inp; 617 struct tcpcb *tp; 618 const int inirw = INI_READ; 619 620 COMMON_START(); 621 tcp_output(tp); 622 COMMON_END(PRU_RCVD); 623} 624 625/* 626 * Do a send by putting data in output queue and updating urgent 627 * marker if URG set. Possibly send more data. Unlike the other 628 * pru_*() routines, the mbuf chains are our responsibility. We 629 * must either enqueue them or free them. The other pru_* routines 630 * generally are caller-frees. 631 */ 632static int 633tcp_usr_send(struct socket *so, int flags, struct mbuf *m, 634 struct sockaddr *nam, struct mbuf *control, struct thread *td) 635{ 636 int s = splnet(); 637 int error = 0; 638 struct inpcb *inp; 639 struct tcpcb *tp; 640 const int inirw = INI_WRITE; 641#ifdef INET6 642 int isipv6; 643#endif 644 TCPDEBUG0; 645 646 /* 647 * Need write lock here because this function might call 648 * tcp_connect or tcp_usrclosed. 649 * We really want to have to this function upgrade from read lock 650 * to write lock. XXX 651 */ 652 INP_INFO_WLOCK(&tcbinfo); 653 inp = sotoinpcb(so); 654 if (inp == NULL) { 655 /* 656 * OOPS! 
we lost a race, the TCP session got reset after 657 * we checked SS_CANTSENDMORE, eg: while doing uiomove or a 658 * network interrupt in the non-splnet() section of sosend(). 659 */ 660 if (m) 661 m_freem(m); 662 if (control) 663 m_freem(control); 664 error = ECONNRESET; /* XXX EPIPE? */ 665 tp = NULL; 666 TCPDEBUG1(); 667 goto out; 668 } 669 INP_LOCK(inp); 670#ifdef INET6 671 isipv6 = nam && nam->sa_family == AF_INET6; 672#endif /* INET6 */ 673 tp = intotcpcb(inp); 674 TCPDEBUG1(); 675 if (control) { 676 /* TCP doesn't do control messages (rights, creds, etc) */ 677 if (control->m_len) { 678 m_freem(control); 679 if (m) 680 m_freem(m); 681 error = EINVAL; 682 goto out; 683 } 684 m_freem(control); /* empty control, just free it */ 685 } 686 if (!(flags & PRUS_OOB)) { 687 sbappendstream(&so->so_snd, m); 688 if (nam && tp->t_state < TCPS_SYN_SENT) { 689 /* 690 * Do implied connect if not yet connected, 691 * initialize window to default value, and 692 * initialize maxseg/maxopd using peer's cached 693 * MSS. 694 */ 695#ifdef INET6 696 if (isipv6) 697 error = tcp6_connect(tp, nam, td); 698 else 699#endif /* INET6 */ 700 error = tcp_connect(tp, nam, td); 701 if (error) 702 goto out; 703 tp->snd_wnd = TTCP_CLIENT_SND_WND; 704 tcp_mss(tp, -1); 705 } 706 707 if (flags & PRUS_EOF) { 708 /* 709 * Close the send side of the connection after 710 * the data is sent. 711 */ 712 socantsendmore(so); 713 tp = tcp_usrclosed(tp); 714 } 715 if (tp != NULL) { 716 if (flags & PRUS_MORETOCOME) 717 tp->t_flags |= TF_MORETOCOME; 718 error = tcp_output(tp); 719 if (flags & PRUS_MORETOCOME) 720 tp->t_flags &= ~TF_MORETOCOME; 721 } 722 } else { 723 if (sbspace(&so->so_snd) < -512) { 724 m_freem(m); 725 error = ENOBUFS; 726 goto out; 727 } 728 /* 729 * According to RFC961 (Assigned Protocols), 730 * the urgent pointer points to the last octet 731 * of urgent data. We continue, however, 732 * to consider it to indicate the first octet 733 * of data past the urgent section. 
734 * Otherwise, snd_up should be one lower. 735 */ 736 sbappendstream(&so->so_snd, m); 737 if (nam && tp->t_state < TCPS_SYN_SENT) { 738 /* 739 * Do implied connect if not yet connected, 740 * initialize window to default value, and 741 * initialize maxseg/maxopd using peer's cached 742 * MSS. 743 */ 744#ifdef INET6 745 if (isipv6) 746 error = tcp6_connect(tp, nam, td); 747 else 748#endif /* INET6 */ 749 error = tcp_connect(tp, nam, td); 750 if (error) 751 goto out; 752 tp->snd_wnd = TTCP_CLIENT_SND_WND; 753 tcp_mss(tp, -1); 754 } 755 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 756 tp->t_force = 1; 757 error = tcp_output(tp); 758 tp->t_force = 0; 759 } 760 COMMON_END((flags & PRUS_OOB) ? PRU_SENDOOB : 761 ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); 762} 763 764/* 765 * Abort the TCP. 766 */ 767static int 768tcp_usr_abort(struct socket *so) 769{ 770 int s = splnet(); 771 int error = 0; 772 struct inpcb *inp; 773 struct tcpcb *tp; 774 const int inirw = INI_WRITE; 775 776 COMMON_START(); 777 tp = tcp_drop(tp, ECONNABORTED); 778 COMMON_END(PRU_ABORT); 779} 780 781/* 782 * Receive out-of-band data. 
783 */ 784static int 785tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) 786{ 787 int s = splnet(); 788 int error = 0; 789 struct inpcb *inp; 790 struct tcpcb *tp; 791 const int inirw = INI_READ; 792 793 COMMON_START(); 794 if ((so->so_oobmark == 0 && 795 (so->so_state & SS_RCVATMARK) == 0) || 796 so->so_options & SO_OOBINLINE || 797 tp->t_oobflags & TCPOOB_HADDATA) { 798 error = EINVAL; 799 goto out; 800 } 801 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 802 error = EWOULDBLOCK; 803 goto out; 804 } 805 m->m_len = 1; 806 *mtod(m, caddr_t) = tp->t_iobc; 807 if ((flags & MSG_PEEK) == 0) 808 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 809 COMMON_END(PRU_RCVOOB); 810} 811 812/* xxx - should be const */ 813struct pr_usrreqs tcp_usrreqs = { 814 tcp_usr_abort, tcp_usr_accept, tcp_usr_attach, tcp_usr_bind, 815 tcp_usr_connect, pru_connect2_notsupp, in_control, tcp_usr_detach, 816 tcp_usr_disconnect, tcp_usr_listen, tcp_peeraddr, tcp_usr_rcvd, 817 tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, 818 tcp_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel 819}; 820 821#ifdef INET6 822struct pr_usrreqs tcp6_usrreqs = { 823 tcp_usr_abort, tcp6_usr_accept, tcp_usr_attach, tcp6_usr_bind, 824 tcp6_usr_connect, pru_connect2_notsupp, in6_control, tcp_usr_detach, 825 tcp_usr_disconnect, tcp6_usr_listen, in6_mapped_peeraddr, tcp_usr_rcvd, 826 tcp_usr_rcvoob, tcp_usr_send, pru_sense_null, tcp_usr_shutdown, 827 in6_mapped_sockaddr, sosend, soreceive, sopoll, in_pcbsosetlabel 828}; 829#endif /* INET6 */ 830 831/* 832 * Common subroutine to open a TCP connection to remote host specified 833 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 834 * port number if needed. Call in_pcbconnect_setup to do the routing and 835 * to choose a local host address (interface). 
If there is an existing 836 * incarnation of the same connection in TIME-WAIT state and if the remote 837 * host was sending CC options and if the connection duration was < MSL, then 838 * truncate the previous TIME-WAIT state and proceed. 839 * Initialize connection parameters and enter SYN-SENT state. 840 */ 841static int 842tcp_connect(tp, nam, td) 843 register struct tcpcb *tp; 844 struct sockaddr *nam; 845 struct thread *td; 846{ 847 struct inpcb *inp = tp->t_inpcb, *oinp; 848 struct socket *so = inp->inp_socket; 849 struct tcptw *otw; 850 struct rmxp_tao tao; 851 struct in_addr laddr; 852 u_short lport; 853 int error; 854 855 bzero(&tao, sizeof(tao)); 856 857 if (inp->inp_lport == 0) { 858 error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 859 if (error) 860 return error; 861 } 862 863 /* 864 * Cannot simply call in_pcbconnect, because there might be an 865 * earlier incarnation of this same connection still in 866 * TIME_WAIT state, creating an ADDRINUSE error. 867 */ 868 laddr = inp->inp_laddr; 869 lport = inp->inp_lport; 870 error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, 871 &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); 872 if (error && oinp == NULL) 873 return error; 874 if (oinp) { 875 if (oinp != inp && 876 (oinp->inp_vflag & INP_TIMEWAIT) && 877 (ticks - (otw = intotw(oinp))->t_starttime) < tcp_msl && 878 otw->cc_recv != 0) { 879 inp->inp_faddr = oinp->inp_faddr; 880 inp->inp_fport = oinp->inp_fport; 881 (void) tcp_twclose(otw, 0); 882 } else 883 return EADDRINUSE; 884 } 885 inp->inp_laddr = laddr; 886 in_pcbrehash(inp); 887 888 /* Compute window scaling to request. 
*/ 889 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 890 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 891 tp->request_r_scale++; 892 893 soisconnecting(so); 894 tcpstat.tcps_connattempt++; 895 tp->t_state = TCPS_SYN_SENT; 896 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 897 tp->iss = tcp_new_isn(tp); 898 tp->t_bw_rtseq = tp->iss; 899 tcp_sendseqinit(tp); 900 901 /* 902 * Generate a CC value for this connection and 903 * check whether CC or CCnew should be used. 904 */ 905 if (tcp_do_rfc1644) 906 tcp_hc_gettao(&inp->inp_inc, &tao); 907 908 tp->cc_send = CC_INC(tcp_ccgen); 909 if (tao.tao_ccsent != 0 && 910 CC_GEQ(tp->cc_send, tao.tao_ccsent)) { 911 tao.tao_ccsent = tp->cc_send; 912 } else { 913 tao.tao_ccsent = 0; 914 tp->t_flags |= TF_SENDCCNEW; 915 } 916 917 if (tcp_do_rfc1644) 918 tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, 919 tao.tao_ccsent, 0); 920 921 return 0; 922} 923 924#ifdef INET6 925static int 926tcp6_connect(tp, nam, td) 927 register struct tcpcb *tp; 928 struct sockaddr *nam; 929 struct thread *td; 930{ 931 struct inpcb *inp = tp->t_inpcb, *oinp; 932 struct socket *so = inp->inp_socket; 933 struct tcptw *otw; 934 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; 935 struct in6_addr *addr6; 936 struct rmxp_tao tao; 937 int error; 938 939 bzero(&tao, sizeof(tao)); 940 941 if (inp->inp_lport == 0) { 942 error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); 943 if (error) 944 return error; 945 } 946 947 /* 948 * Cannot simply call in_pcbconnect, because there might be an 949 * earlier incarnation of this same connection still in 950 * TIME_WAIT state, creating an ADDRINUSE error. 951 */ 952 error = in6_pcbladdr(inp, nam, &addr6); 953 if (error) 954 return error; 955 oinp = in6_pcblookup_hash(inp->inp_pcbinfo, 956 &sin6->sin6_addr, sin6->sin6_port, 957 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) 958 ? 
addr6 959 : &inp->in6p_laddr, 960 inp->inp_lport, 0, NULL); 961 if (oinp) { 962 if (oinp != inp && 963 (oinp->inp_vflag & INP_TIMEWAIT) && 964 (ticks - (otw = intotw(oinp))->t_starttime) < tcp_msl && 965 otw->cc_recv != 0) { 966 inp->inp_faddr = oinp->inp_faddr; 967 inp->inp_fport = oinp->inp_fport; 968 (void) tcp_twclose(otw, 0); 969 } else 970 return EADDRINUSE; 971 } 972 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 973 inp->in6p_laddr = *addr6; 974 inp->in6p_faddr = sin6->sin6_addr; 975 inp->inp_fport = sin6->sin6_port; 976 if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0) 977 inp->in6p_flowinfo = sin6->sin6_flowinfo; 978 in_pcbrehash(inp); 979 980 /* Compute window scaling to request. */ 981 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 982 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 983 tp->request_r_scale++; 984 985 soisconnecting(so); 986 tcpstat.tcps_connattempt++; 987 tp->t_state = TCPS_SYN_SENT; 988 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); 989 tp->iss = tcp_new_isn(tp); 990 tp->t_bw_rtseq = tp->iss; 991 tcp_sendseqinit(tp); 992 993 /* 994 * Generate a CC value for this connection and 995 * check whether CC or CCnew should be used. 996 */ 997 if (tcp_do_rfc1644) 998 tcp_hc_gettao(&inp->inp_inc, &tao); 999 1000 tp->cc_send = CC_INC(tcp_ccgen); 1001 if (tao.tao_ccsent != 0 && 1002 CC_GEQ(tp->cc_send, tao.tao_ccsent)) { 1003 tao.tao_ccsent = tp->cc_send; 1004 } else { 1005 tao.tao_ccsent = 0; 1006 tp->t_flags |= TF_SENDCCNEW; 1007 } 1008 if (tcp_do_rfc1644) 1009 tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, 1010 tao.tao_ccsent, 0); 1011 1012 return 0; 1013} 1014#endif /* INET6 */ 1015 1016/* 1017 * The new sockopt interface makes it possible for us to block in the 1018 * copyin/out step (if we take a page fault). Taking a page fault at 1019 * splnet() is probably a Bad Thing. (Since sockets and pcbs both now 1020 * use TSM, there probably isn't any need for this function to run at 1021 * splnet() any more. 
This needs more examination.) 1022 */ 1023int 1024tcp_ctloutput(so, sopt) 1025 struct socket *so; 1026 struct sockopt *sopt; 1027{ 1028 int error, opt, optval, s; 1029 struct inpcb *inp; 1030 struct tcpcb *tp; 1031 1032 error = 0; 1033 s = splnet(); /* XXX */ 1034 INP_INFO_RLOCK(&tcbinfo); 1035 inp = sotoinpcb(so); 1036 if (inp == NULL) { 1037 INP_INFO_RUNLOCK(&tcbinfo); 1038 splx(s); 1039 return (ECONNRESET); 1040 } 1041 INP_LOCK(inp); 1042 INP_INFO_RUNLOCK(&tcbinfo); 1043 if (sopt->sopt_level != IPPROTO_TCP) { 1044#ifdef INET6 1045 if (INP_CHECK_SOCKAF(so, AF_INET6)) 1046 error = ip6_ctloutput(so, sopt); 1047 else 1048#endif /* INET6 */ 1049 error = ip_ctloutput(so, sopt); 1050 INP_UNLOCK(inp); 1051 splx(s); 1052 return (error); 1053 } 1054 tp = intotcpcb(inp); 1055 1056 switch (sopt->sopt_dir) { 1057 case SOPT_SET: 1058 switch (sopt->sopt_name) { 1059#ifdef TCP_SIGNATURE 1060 case TCP_MD5SIG: 1061 error = sooptcopyin(sopt, &optval, sizeof optval, 1062 sizeof optval); 1063 if (error) 1064 break; 1065 1066 if (optval > 0) 1067 tp->t_flags |= TF_SIGNATURE; 1068 else 1069 tp->t_flags &= ~TF_SIGNATURE; 1070 break; 1071#endif /* TCP_SIGNATURE */ 1072 case TCP_NODELAY: 1073 case TCP_NOOPT: 1074 error = sooptcopyin(sopt, &optval, sizeof optval, 1075 sizeof optval); 1076 if (error) 1077 break; 1078 1079 switch (sopt->sopt_name) { 1080 case TCP_NODELAY: 1081 opt = TF_NODELAY; 1082 break; 1083 case TCP_NOOPT: 1084 opt = TF_NOOPT; 1085 break; 1086 default: 1087 opt = 0; /* dead code to fool gcc */ 1088 break; 1089 } 1090 1091 if (optval) 1092 tp->t_flags |= opt; 1093 else 1094 tp->t_flags &= ~opt; 1095 break; 1096 1097 case TCP_NOPUSH: 1098 error = sooptcopyin(sopt, &optval, sizeof optval, 1099 sizeof optval); 1100 if (error) 1101 break; 1102 1103 if (optval) 1104 tp->t_flags |= TF_NOPUSH; 1105 else { 1106 tp->t_flags &= ~TF_NOPUSH; 1107 error = tcp_output(tp); 1108 } 1109 break; 1110 1111 case TCP_MAXSEG: 1112 error = sooptcopyin(sopt, &optval, sizeof optval, 1113 sizeof 
optval); 1114 if (error) 1115 break; 1116 1117 if (optval > 0 && optval <= tp->t_maxseg && 1118 optval + 40 >= tcp_minmss) 1119 tp->t_maxseg = optval; 1120 else 1121 error = EINVAL; 1122 break; 1123 1124 default: 1125 error = ENOPROTOOPT; 1126 break; 1127 } 1128 break; 1129 1130 case SOPT_GET: 1131 switch (sopt->sopt_name) { 1132#ifdef TCP_SIGNATURE 1133 case TCP_MD5SIG: 1134 optval = (tp->t_flags & TF_SIGNATURE) ? 1 : 0; 1135 break; 1136#endif 1137 case TCP_NODELAY: 1138 optval = tp->t_flags & TF_NODELAY; 1139 break; 1140 case TCP_MAXSEG: 1141 optval = tp->t_maxseg; 1142 break; 1143 case TCP_NOOPT: 1144 optval = tp->t_flags & TF_NOOPT; 1145 break; 1146 case TCP_NOPUSH: 1147 optval = tp->t_flags & TF_NOPUSH; 1148 break; 1149 default: 1150 error = ENOPROTOOPT; 1151 break; 1152 } 1153 if (error == 0) 1154 error = sooptcopyout(sopt, &optval, sizeof optval); 1155 break; 1156 } 1157 INP_UNLOCK(inp); 1158 splx(s); 1159 return (error); 1160} 1161 1162/* 1163 * tcp_sendspace and tcp_recvspace are the default send and receive window 1164 * sizes, respectively. These are obsolescent (this information should 1165 * be set by the route). 1166 */ 1167u_long tcp_sendspace = 1024*32; 1168SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW, 1169 &tcp_sendspace , 0, "Maximum outgoing TCP datagram size"); 1170u_long tcp_recvspace = 1024*64; 1171SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW, 1172 &tcp_recvspace , 0, "Maximum incoming TCP datagram size"); 1173 1174/* 1175 * Attach TCP protocol to socket, allocating 1176 * internet protocol control block, tcp control block, 1177 * bufer space, and entering LISTEN state if to accept connections. 
1178 */ 1179static int 1180tcp_attach(so) 1181 struct socket *so; 1182{ 1183 register struct tcpcb *tp; 1184 struct inpcb *inp; 1185 int error; 1186#ifdef INET6 1187 int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0; 1188#endif 1189 1190 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 1191 error = soreserve(so, tcp_sendspace, tcp_recvspace); 1192 if (error) 1193 return (error); 1194 } 1195 error = in_pcballoc(so, &tcbinfo, "tcpinp"); 1196 if (error) 1197 return (error); 1198 inp = sotoinpcb(so); 1199#ifdef INET6 1200 if (isipv6) { 1201 inp->inp_vflag |= INP_IPV6; 1202 inp->in6p_hops = -1; /* use kernel default */ 1203 } 1204 else 1205#endif 1206 inp->inp_vflag |= INP_IPV4; 1207 tp = tcp_newtcpcb(inp); 1208 if (tp == 0) { 1209 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 1210 1211 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 1212#ifdef INET6 1213 if (isipv6) 1214 in6_pcbdetach(inp); 1215 else 1216#endif 1217 in_pcbdetach(inp); 1218 so->so_state |= nofd; 1219 return (ENOBUFS); 1220 } 1221 tp->t_state = TCPS_CLOSED; 1222 return (0); 1223} 1224 1225/* 1226 * Initiate (or continue) disconnect. 1227 * If embryonic state, just send reset (once). 1228 * If in ``let data drain'' option and linger null, just drop. 1229 * Otherwise (hard), mark socket disconnecting and drop 1230 * current input data; switch states based on user close, and 1231 * send segment to peer (with FIN). 
1232 */ 1233static struct tcpcb * 1234tcp_disconnect(tp) 1235 register struct tcpcb *tp; 1236{ 1237 struct socket *so = tp->t_inpcb->inp_socket; 1238 1239 if (tp->t_state < TCPS_ESTABLISHED) 1240 tp = tcp_close(tp); 1241 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1242 tp = tcp_drop(tp, 0); 1243 else { 1244 soisdisconnecting(so); 1245 sbflush(&so->so_rcv); 1246 tp = tcp_usrclosed(tp); 1247 if (tp) 1248 (void) tcp_output(tp); 1249 } 1250 return (tp); 1251} 1252 1253/* 1254 * User issued close, and wish to trail through shutdown states: 1255 * if never received SYN, just forget it. If got a SYN from peer, 1256 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1257 * If already got a FIN from peer, then almost done; go to LAST_ACK 1258 * state. In all other cases, have already sent FIN to peer (e.g. 1259 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1260 * for peer to send FIN or not respond to keep-alives, etc. 1261 * We can let the user exit from the close as soon as the FIN is acked. 1262 */ 1263static struct tcpcb * 1264tcp_usrclosed(tp) 1265 register struct tcpcb *tp; 1266{ 1267 1268 switch (tp->t_state) { 1269 1270 case TCPS_CLOSED: 1271 case TCPS_LISTEN: 1272 tp->t_state = TCPS_CLOSED; 1273 tp = tcp_close(tp); 1274 break; 1275 1276 case TCPS_SYN_SENT: 1277 case TCPS_SYN_RECEIVED: 1278 tp->t_flags |= TF_NEEDFIN; 1279 break; 1280 1281 case TCPS_ESTABLISHED: 1282 tp->t_state = TCPS_FIN_WAIT_1; 1283 break; 1284 1285 case TCPS_CLOSE_WAIT: 1286 tp->t_state = TCPS_LAST_ACK; 1287 break; 1288 } 1289 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1290 soisdisconnected(tp->t_inpcb->inp_socket); 1291 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 1292 if (tp->t_state == TCPS_FIN_WAIT_2) 1293 callout_reset(tp->tt_2msl, tcp_maxidle, 1294 tcp_timer_2msl, tp); 1295 } 1296 return (tp); 1297} 1298 1299