tcp_usrreq.c revision 8876
1/* 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $Id: tcp_usrreq.c,v 1.13 1995/04/09 01:29:28 davidg Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/kernel.h> 40#include <sys/malloc.h> 41#include <sys/mbuf.h> 42#include <sys/socket.h> 43#include <sys/socketvar.h> 44#include <sys/protosw.h> 45#include <sys/errno.h> 46#include <sys/stat.h> 47#include <vm/vm.h> 48#include <sys/sysctl.h> 49 50#include <net/if.h> 51#include <net/route.h> 52 53#include <netinet/in.h> 54#include <netinet/in_systm.h> 55#include <netinet/ip.h> 56#include <netinet/in_pcb.h> 57#include <netinet/in_var.h> 58#include <netinet/ip_var.h> 59#include <netinet/tcp.h> 60#include <netinet/tcp_fsm.h> 61#include <netinet/tcp_seq.h> 62#include <netinet/tcp_timer.h> 63#include <netinet/tcp_var.h> 64#include <netinet/tcpip.h> 65#ifdef TCPDEBUG 66#include <netinet/tcp_debug.h> 67#endif 68 69/* 70 * TCP protocol interface to socket abstraction. 71 */ 72extern char *tcpstates[]; 73 74/* 75 * Process a TCP user request for TCP tb. If this is a send request 76 * then m is the mbuf chain of send data. If this is a timer expiration 77 * (called from the software clock routine), then timertype tells which timer. 78 */ 79/*ARGSUSED*/ 80int 81tcp_usrreq(so, req, m, nam, control) 82 struct socket *so; 83 int req; 84 struct mbuf *m, *nam, *control; 85{ 86 register struct inpcb *inp; 87 register struct tcpcb *tp = 0; 88 struct sockaddr_in *sinp; 89 int s; 90 int error = 0; 91#ifdef TCPDEBUG 92 int ostate; 93#endif 94 95 if (req == PRU_CONTROL) 96 return (in_control(so, (int)m, (caddr_t)nam, 97 (struct ifnet *)control)); 98 if (control && control->m_len) { 99 m_freem(control); 100 if (m) 101 m_freem(m); 102 return (EINVAL); 103 } 104 105 s = splnet(); 106 inp = sotoinpcb(so); 107 /* 108 * When a TCP is attached to a socket, then there will be 109 * a (struct inpcb) pointed at by the socket, and this 110 * structure will point at a subsidary (struct tcpcb). 111 */ 112 if (inp == 0 && req != PRU_ATTACH) { 113 splx(s); 114 return (EINVAL); /* XXX */ 115 } 116 if (inp) { 117 tp = intotcpcb(inp); 118 /* WHAT IF TP IS 0? */ 119#ifdef KPROF 120 tcp_acounts[tp->t_state][req]++; 121#endif 122#ifdef TCPDEBUG 123 ostate = tp->t_state; 124 } else 125 ostate = 0; 126#else /* TCPDEBUG */ 127 } 128#endif /* TCPDEBUG */ 129 130 switch (req) { 131 132 /* 133 * TCP attaches to socket via PRU_ATTACH, reserving space, 134 * and an internet control block. 135 */ 136 case PRU_ATTACH: 137 if (inp) { 138 error = EISCONN; 139 break; 140 } 141 error = tcp_attach(so); 142 if (error) 143 break; 144 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 145 so->so_linger = TCP_LINGERTIME * hz; 146 tp = sototcpcb(so); 147 break; 148 149 /* 150 * PRU_DETACH detaches the TCP protocol from the socket. 151 * If the protocol state is non-embryonic, then can't 152 * do this directly: have to initiate a PRU_DISCONNECT, 153 * which may finish later; embryonic TCB's can just 154 * be discarded here. 155 */ 156 case PRU_DETACH: 157 if (tp->t_state > TCPS_LISTEN) 158 tp = tcp_disconnect(tp); 159 else 160 tp = tcp_close(tp); 161 break; 162 163 /* 164 * Give the socket an address. 165 */ 166 case PRU_BIND: 167 /* 168 * Must check for multicast addresses and disallow binding 169 * to them. 170 */ 171 sinp = mtod(nam, struct sockaddr_in *); 172 if (sinp->sin_family == AF_INET && 173 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 174 error = EAFNOSUPPORT; 175 break; 176 } 177 error = in_pcbbind(inp, nam); 178 if (error) 179 break; 180 break; 181 182 /* 183 * Prepare to accept connections. 184 */ 185 case PRU_LISTEN: 186 if (inp->inp_lport == 0) 187 error = in_pcbbind(inp, NULL); 188 if (error == 0) 189 tp->t_state = TCPS_LISTEN; 190 break; 191 192 /* 193 * Initiate connection to peer. 194 * Create a template for use in transmissions on this connection. 195 * Enter SYN_SENT state, and mark socket as connecting. 196 * Start keep-alive timer, and seed output sequence space. 197 * Send initial segment on connection. 198 */ 199 case PRU_CONNECT: 200 /* 201 * Must disallow TCP ``connections'' to multicast addresses. 202 */ 203 sinp = mtod(nam, struct sockaddr_in *); 204 if (sinp->sin_family == AF_INET 205 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 206 error = EAFNOSUPPORT; 207 break; 208 } 209 210 if ((error = tcp_connect(tp, nam)) != 0) 211 break; 212 error = tcp_output(tp); 213 break; 214 215 /* 216 * Create a TCP connection between two sockets. 217 */ 218 case PRU_CONNECT2: 219 error = EOPNOTSUPP; 220 break; 221 222 /* 223 * Initiate disconnect from peer. 224 * If connection never passed embryonic stage, just drop; 225 * else if don't need to let data drain, then can just drop anyways, 226 * else have to begin TCP shutdown process: mark socket disconnecting, 227 * drain unread data, state switch to reflect user close, and 228 * send segment (e.g. FIN) to peer. Socket will be really disconnected 229 * when peer sends FIN and acks ours. 230 * 231 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 232 */ 233 case PRU_DISCONNECT: 234 tp = tcp_disconnect(tp); 235 break; 236 237 /* 238 * Accept a connection. Essentially all the work is 239 * done at higher levels; just return the address 240 * of the peer, storing through addr. 241 */ 242 case PRU_ACCEPT: 243 in_setpeeraddr(inp, nam); 244 break; 245 246 /* 247 * Mark the connection as being incapable of further output. 248 */ 249 case PRU_SHUTDOWN: 250 socantsendmore(so); 251 tp = tcp_usrclosed(tp); 252 if (tp) 253 error = tcp_output(tp); 254 break; 255 256 /* 257 * After a receive, possibly send window update to peer. 258 */ 259 case PRU_RCVD: 260 (void) tcp_output(tp); 261 break; 262 263 /* 264 * Do a send by putting data in output queue and updating urgent 265 * marker if URG set. Possibly send more data. 266 */ 267 case PRU_SEND_EOF: 268 case PRU_SEND: 269 sbappend(&so->so_snd, m); 270 if (nam && tp->t_state < TCPS_SYN_SENT) { 271 /* 272 * Do implied connect if not yet connected, 273 * initialize window to default value, and 274 * initialize maxseg/maxopd using peer's cached 275 * MSS. 276 */ 277 error = tcp_connect(tp, nam); 278 if (error) 279 break; 280 tp->snd_wnd = TTCP_CLIENT_SND_WND; 281 tcp_mss(tp, -1); 282 } 283 284 if (req == PRU_SEND_EOF) { 285 /* 286 * Close the send side of the connection after 287 * the data is sent. 288 */ 289 socantsendmore(so); 290 tp = tcp_usrclosed(tp); 291 } 292 if (tp != NULL) 293 error = tcp_output(tp); 294 break; 295 296 /* 297 * Abort the TCP. 298 */ 299 case PRU_ABORT: 300 tp = tcp_drop(tp, ECONNABORTED); 301 break; 302 303 case PRU_SENSE: 304 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 305 (void) splx(s); 306 return (0); 307 308 case PRU_RCVOOB: 309 if ((so->so_oobmark == 0 && 310 (so->so_state & SS_RCVATMARK) == 0) || 311 so->so_options & SO_OOBINLINE || 312 tp->t_oobflags & TCPOOB_HADDATA) { 313 error = EINVAL; 314 break; 315 } 316 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 317 error = EWOULDBLOCK; 318 break; 319 } 320 m->m_len = 1; 321 *mtod(m, caddr_t) = tp->t_iobc; 322 if (((int)nam & MSG_PEEK) == 0) 323 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 324 break; 325 326 case PRU_SENDOOB: 327 if (sbspace(&so->so_snd) < -512) { 328 m_freem(m); 329 error = ENOBUFS; 330 break; 331 } 332 /* 333 * According to RFC961 (Assigned Protocols), 334 * the urgent pointer points to the last octet 335 * of urgent data. We continue, however, 336 * to consider it to indicate the first octet 337 * of data past the urgent section. 338 * Otherwise, snd_up should be one lower. 339 */ 340 sbappend(&so->so_snd, m); 341 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 342 tp->t_force = 1; 343 error = tcp_output(tp); 344 tp->t_force = 0; 345 break; 346 347 case PRU_SOCKADDR: 348 in_setsockaddr(inp, nam); 349 break; 350 351 case PRU_PEERADDR: 352 in_setpeeraddr(inp, nam); 353 break; 354 355 /* 356 * TCP slow timer went off; going through this 357 * routine for tracing's sake. 358 */ 359 case PRU_SLOWTIMO: 360 tp = tcp_timers(tp, (int)nam); 361#ifdef TCPDEBUG 362 req |= (int)nam << 8; /* for debug's sake */ 363#endif 364 break; 365 366 default: 367 panic("tcp_usrreq"); 368 } 369#ifdef TCPDEBUG 370 if (tp && (so->so_options & SO_DEBUG)) 371 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); 372#endif 373 splx(s); 374 return (error); 375} 376 377/* 378 * Common subroutine to open a TCP connection to remote host specified 379 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 380 * port number if needed. Call in_pcbladdr to do the routing and to choose 381 * a local host address (interface). If there is an existing incarnation 382 * of the same connection in TIME-WAIT state and if the remote host was 383 * sending CC options and if the connection duration was < MSL, then 384 * truncate the previous TIME-WAIT state and proceed. 385 * Initialize connection parameters and enter SYN-SENT state. 386 */ 387int 388tcp_connect(tp, nam) 389 register struct tcpcb *tp; 390 struct mbuf *nam; 391{ 392 struct inpcb *inp = tp->t_inpcb, *oinp; 393 struct socket *so = inp->inp_socket; 394 struct tcpcb *otp; 395 struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); 396 struct sockaddr_in *ifaddr; 397 int error; 398 399 if (inp->inp_lport == 0) { 400 error = in_pcbbind(inp, NULL); 401 if (error) 402 return error; 403 } 404 405 /* 406 * Cannot simply call in_pcbconnect, because there might be an 407 * earlier incarnation of this same connection still in 408 * TIME_WAIT state, creating an ADDRINUSE error. 409 */ 410 error = in_pcbladdr(inp, nam, &ifaddr); 411 oinp = in_pcblookup(inp->inp_pcbinfo->listhead, 412 sin->sin_addr, sin->sin_port, 413 inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr 414 : ifaddr->sin_addr, 415 inp->inp_lport, 0); 416 if (oinp) { 417 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && 418 otp->t_state == TCPS_TIME_WAIT && 419 otp->t_duration < TCPTV_MSL && 420 (otp->t_flags & TF_RCVD_CC)) 421 otp = tcp_close(otp); 422 else 423 return EADDRINUSE; 424 } 425 if (inp->inp_laddr.s_addr == INADDR_ANY) 426 inp->inp_laddr = ifaddr->sin_addr; 427 inp->inp_faddr = sin->sin_addr; 428 inp->inp_fport = sin->sin_port; 429 in_pcbrehash(inp); 430 431 tp->t_template = tcp_template(tp); 432 if (tp->t_template == 0) { 433 in_pcbdisconnect(inp); 434 return ENOBUFS; 435 } 436 437 /* Compute window scaling to request. */ 438 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 439 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 440 tp->request_r_scale++; 441 442 soisconnecting(so); 443 tcpstat.tcps_connattempt++; 444 tp->t_state = TCPS_SYN_SENT; 445 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 446 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; 447 tcp_sendseqinit(tp); 448 tp->cc_send = CC_INC(tcp_ccgen); 449 450 return 0; 451} 452 453int 454tcp_ctloutput(op, so, level, optname, mp) 455 int op; 456 struct socket *so; 457 int level, optname; 458 struct mbuf **mp; 459{ 460 int error = 0, s; 461 struct inpcb *inp; 462 register struct tcpcb *tp; 463 register struct mbuf *m; 464 register int i; 465 466 s = splnet(); 467 inp = sotoinpcb(so); 468 if (inp == NULL) { 469 splx(s); 470 if (op == PRCO_SETOPT && *mp) 471 (void) m_free(*mp); 472 return (ECONNRESET); 473 } 474 if (level != IPPROTO_TCP) { 475 error = ip_ctloutput(op, so, level, optname, mp); 476 splx(s); 477 return (error); 478 } 479 tp = intotcpcb(inp); 480 481 switch (op) { 482 483 case PRCO_SETOPT: 484 m = *mp; 485 switch (optname) { 486 487 case TCP_NODELAY: 488 if (m == NULL || m->m_len < sizeof (int)) 489 error = EINVAL; 490 else if (*mtod(m, int *)) 491 tp->t_flags |= TF_NODELAY; 492 else 493 tp->t_flags &= ~TF_NODELAY; 494 break; 495 496 case TCP_MAXSEG: 497 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) 498 tp->t_maxseg = i; 499 else 500 error = EINVAL; 501 break; 502 503 case TCP_NOOPT: 504 if (m == NULL || m->m_len < sizeof (int)) 505 error = EINVAL; 506 else if (*mtod(m, int *)) 507 tp->t_flags |= TF_NOOPT; 508 else 509 tp->t_flags &= ~TF_NOOPT; 510 break; 511 512 case TCP_NOPUSH: 513 if (m == NULL || m->m_len < sizeof (int)) 514 error = EINVAL; 515 else if (*mtod(m, int *)) 516 tp->t_flags |= TF_NOPUSH; 517 else 518 tp->t_flags &= ~TF_NOPUSH; 519 break; 520 521 default: 522 error = ENOPROTOOPT; 523 break; 524 } 525 if (m) 526 (void) m_free(m); 527 break; 528 529 case PRCO_GETOPT: 530 *mp = m = m_get(M_WAIT, MT_SOOPTS); 531 m->m_len = sizeof(int); 532 533 switch (optname) { 534 case TCP_NODELAY: 535 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 536 break; 537 case TCP_MAXSEG: 538 *mtod(m, int *) = tp->t_maxseg; 539 break; 540 case TCP_NOOPT: 541 *mtod(m, int *) = tp->t_flags & TF_NOOPT; 542 break; 543 case TCP_NOPUSH: 544 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 545 break; 546 default: 547 error = ENOPROTOOPT; 548 break; 549 } 550 break; 551 } 552 splx(s); 553 return (error); 554} 555 556/* 557 * tcp_sendspace and tcp_recvspace are the default send and receive window 558 * sizes, respectively. These are obsolescent (this information should 559 * be set by the route). 560 */ 561u_long tcp_sendspace = 1024*16; 562u_long tcp_recvspace = 1024*16; 563 564/* 565 * Attach TCP protocol to socket, allocating 566 * internet protocol control block, tcp control block, 567 * bufer space, and entering LISTEN state if to accept connections. 568 */ 569int 570tcp_attach(so) 571 struct socket *so; 572{ 573 register struct tcpcb *tp; 574 struct inpcb *inp; 575 int error; 576 577 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 578 error = soreserve(so, tcp_sendspace, tcp_recvspace); 579 if (error) 580 return (error); 581 } 582 error = in_pcballoc(so, &tcbinfo); 583 if (error) 584 return (error); 585 inp = sotoinpcb(so); 586 tp = tcp_newtcpcb(inp); 587 if (tp == 0) { 588 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 589 590 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 591 in_pcbdetach(inp); 592 so->so_state |= nofd; 593 return (ENOBUFS); 594 } 595 tp->t_state = TCPS_CLOSED; 596 return (0); 597} 598 599/* 600 * Initiate (or continue) disconnect. 601 * If embryonic state, just send reset (once). 602 * If in ``let data drain'' option and linger null, just drop. 603 * Otherwise (hard), mark socket disconnecting and drop 604 * current input data; switch states based on user close, and 605 * send segment to peer (with FIN). 606 */ 607struct tcpcb * 608tcp_disconnect(tp) 609 register struct tcpcb *tp; 610{ 611 struct socket *so = tp->t_inpcb->inp_socket; 612 613 if (tp->t_state < TCPS_ESTABLISHED) 614 tp = tcp_close(tp); 615 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 616 tp = tcp_drop(tp, 0); 617 else { 618 soisdisconnecting(so); 619 sbflush(&so->so_rcv); 620 tp = tcp_usrclosed(tp); 621 if (tp) 622 (void) tcp_output(tp); 623 } 624 return (tp); 625} 626 627/* 628 * User issued close, and wish to trail through shutdown states: 629 * if never received SYN, just forget it. If got a SYN from peer, 630 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 631 * If already got a FIN from peer, then almost done; go to LAST_ACK 632 * state. In all other cases, have already sent FIN to peer (e.g. 633 * after PRU_SHUTDOWN), and just have to play tedious game waiting 634 * for peer to send FIN or not respond to keep-alives, etc. 635 * We can let the user exit from the close as soon as the FIN is acked. 636 */ 637struct tcpcb * 638tcp_usrclosed(tp) 639 register struct tcpcb *tp; 640{ 641 642 switch (tp->t_state) { 643 644 case TCPS_CLOSED: 645 case TCPS_LISTEN: 646 tp->t_state = TCPS_CLOSED; 647 tp = tcp_close(tp); 648 break; 649 650 case TCPS_SYN_SENT: 651 case TCPS_SYN_RECEIVED: 652 tp->t_flags |= TF_NEEDFIN; 653 break; 654 655 case TCPS_ESTABLISHED: 656 tp->t_state = TCPS_FIN_WAIT_1; 657 break; 658 659 case TCPS_CLOSE_WAIT: 660 tp->t_state = TCPS_LAST_ACK; 661 break; 662 } 663 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) 664 soisdisconnected(tp->t_inpcb->inp_socket); 665 return (tp); 666} 667 668/* 669 * Sysctl for tcp variables. 670 */ 671int 672tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) 673 int *name; 674 u_int namelen; 675 void *oldp; 676 size_t *oldlenp; 677 void *newp; 678 size_t newlen; 679{ 680 /* All sysctl names at this level are terminal. */ 681 if (namelen != 1) 682 return (ENOTDIR); 683 684 switch (name[0]) { 685 case TCPCTL_DO_RFC1323: 686 return (sysctl_int(oldp, oldlenp, newp, newlen, 687 &tcp_do_rfc1323)); 688 case TCPCTL_DO_RFC1644: 689 return (sysctl_int(oldp, oldlenp, newp, newlen, 690 &tcp_do_rfc1644)); 691 case TCPCTL_MSSDFLT: 692 return (sysctl_int(oldp, oldlenp, newp, newlen, 693 &tcp_mssdflt)); 694 case TCPCTL_STATS: 695 return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat, 696 sizeof tcpstat)); 697 case TCPCTL_RTTDFLT: 698 return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt)); 699 case TCPCTL_KEEPIDLE: 700 return (sysctl_int(oldp, oldlenp, newp, newlen, 701 &tcp_keepidle)); 702 case TCPCTL_KEEPINTVL: 703 return (sysctl_int(oldp, oldlenp, newp, newlen, 704 &tcp_keepintvl)); 705 case TCPCTL_SENDSPACE: 706 return (sysctl_int(oldp, oldlenp, newp, newlen, 707 (int *)&tcp_sendspace)); /* XXX */ 708 case TCPCTL_RECVSPACE: 709 return (sysctl_int(oldp, oldlenp, newp, newlen, 710 (int *)&tcp_recvspace)); /* XXX */ 711 default: 712 return (ENOPROTOOPT); 713 } 714 /* NOTREACHED */ 715} 716