/* tcp_usrreq.c revision 12657 */
1/* 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $Id: tcp_usrreq.c,v 1.20 1995/11/14 20:34:47 phk Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/kernel.h> 40#include <sys/sysctl.h> 41#include <sys/malloc.h> 42#include <sys/mbuf.h> 43#include <sys/socket.h> 44#include <sys/socketvar.h> 45#include <sys/protosw.h> 46#include <sys/errno.h> 47#include <sys/stat.h> 48 49#include <net/if.h> 50#include <net/route.h> 51 52#include <netinet/in.h> 53#include <netinet/in_systm.h> 54#include <netinet/ip.h> 55#include <netinet/in_pcb.h> 56#include <netinet/in_var.h> 57#include <netinet/ip_var.h> 58#include <netinet/tcp.h> 59#include <netinet/tcp_fsm.h> 60#include <netinet/tcp_seq.h> 61#include <netinet/tcp_timer.h> 62#include <netinet/tcp_var.h> 63#include <netinet/tcpip.h> 64#ifdef TCPDEBUG 65#include <netinet/tcp_debug.h> 66#endif 67 68/* 69 * TCP protocol interface to socket abstraction. 70 */ 71extern char *tcpstates[]; 72 73static int tcp_attach __P((struct socket *)); 74static int tcp_connect __P((struct tcpcb *, struct mbuf *)); 75static struct tcpcb * 76 tcp_disconnect __P((struct tcpcb *)); 77static struct tcpcb * 78 tcp_usrclosed __P((struct tcpcb *)); 79/* 80 * Process a TCP user request for TCP tb. If this is a send request 81 * then m is the mbuf chain of send data. If this is a timer expiration 82 * (called from the software clock routine), then timertype tells which timer. 
83 */ 84/*ARGSUSED*/ 85int 86tcp_usrreq(so, req, m, nam, control) 87 struct socket *so; 88 int req; 89 struct mbuf *m, *nam, *control; 90{ 91 register struct inpcb *inp; 92 register struct tcpcb *tp = 0; 93 struct sockaddr_in *sinp; 94 int s; 95 int error = 0; 96#ifdef TCPDEBUG 97 int ostate; 98#endif 99 100 if (req == PRU_CONTROL) 101 return (in_control(so, (u_long)m, (caddr_t)nam, 102 (struct ifnet *)control)); 103 if (control && control->m_len) { 104 m_freem(control); 105 if (m) 106 m_freem(m); 107 return (EINVAL); 108 } 109 110 s = splnet(); 111 inp = sotoinpcb(so); 112 /* 113 * When a TCP is attached to a socket, then there will be 114 * a (struct inpcb) pointed at by the socket, and this 115 * structure will point at a subsidary (struct tcpcb). 116 */ 117 if (inp == 0 && req != PRU_ATTACH) { 118 splx(s); 119#if 0 120 /* 121 * The following corrects an mbuf leak under rare 122 * circumstances, but has not been fully tested. 123 */ 124 if (m && req != PRU_SENSE) 125 m_freem(m); 126#else 127 /* safer version of fix for mbuf leak */ 128 if (m && (req == PRU_SEND || req == PRU_SENDOOB)) 129 m_freem(m); 130#endif 131 return (EINVAL); /* XXX */ 132 } 133 if (inp) { 134 tp = intotcpcb(inp); 135 /* WHAT IF TP IS 0? */ 136#ifdef KPROF 137 tcp_acounts[tp->t_state][req]++; 138#endif 139#ifdef TCPDEBUG 140 ostate = tp->t_state; 141 } else 142 ostate = 0; 143#else /* TCPDEBUG */ 144 } 145#endif /* TCPDEBUG */ 146 147 switch (req) { 148 149 /* 150 * TCP attaches to socket via PRU_ATTACH, reserving space, 151 * and an internet control block. 152 */ 153 case PRU_ATTACH: 154 if (inp) { 155 error = EISCONN; 156 break; 157 } 158 error = tcp_attach(so); 159 if (error) 160 break; 161 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 162 so->so_linger = TCP_LINGERTIME * hz; 163 tp = sototcpcb(so); 164 break; 165 166 /* 167 * PRU_DETACH detaches the TCP protocol from the socket. 
168 * If the protocol state is non-embryonic, then can't 169 * do this directly: have to initiate a PRU_DISCONNECT, 170 * which may finish later; embryonic TCB's can just 171 * be discarded here. 172 */ 173 case PRU_DETACH: 174 if (tp->t_state > TCPS_LISTEN) 175 tp = tcp_disconnect(tp); 176 else 177 tp = tcp_close(tp); 178 break; 179 180 /* 181 * Give the socket an address. 182 */ 183 case PRU_BIND: 184 /* 185 * Must check for multicast addresses and disallow binding 186 * to them. 187 */ 188 sinp = mtod(nam, struct sockaddr_in *); 189 if (sinp->sin_family == AF_INET && 190 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 191 error = EAFNOSUPPORT; 192 break; 193 } 194 error = in_pcbbind(inp, nam); 195 if (error) 196 break; 197 break; 198 199 /* 200 * Prepare to accept connections. 201 */ 202 case PRU_LISTEN: 203 if (inp->inp_lport == 0) 204 error = in_pcbbind(inp, NULL); 205 if (error == 0) 206 tp->t_state = TCPS_LISTEN; 207 break; 208 209 /* 210 * Initiate connection to peer. 211 * Create a template for use in transmissions on this connection. 212 * Enter SYN_SENT state, and mark socket as connecting. 213 * Start keep-alive timer, and seed output sequence space. 214 * Send initial segment on connection. 215 */ 216 case PRU_CONNECT: 217 /* 218 * Must disallow TCP ``connections'' to multicast addresses. 219 */ 220 sinp = mtod(nam, struct sockaddr_in *); 221 if (sinp->sin_family == AF_INET 222 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 223 error = EAFNOSUPPORT; 224 break; 225 } 226 227 if ((error = tcp_connect(tp, nam)) != 0) 228 break; 229 error = tcp_output(tp); 230 break; 231 232 /* 233 * Create a TCP connection between two sockets. 234 */ 235 case PRU_CONNECT2: 236 error = EOPNOTSUPP; 237 break; 238 239 /* 240 * Initiate disconnect from peer. 
241 * If connection never passed embryonic stage, just drop; 242 * else if don't need to let data drain, then can just drop anyways, 243 * else have to begin TCP shutdown process: mark socket disconnecting, 244 * drain unread data, state switch to reflect user close, and 245 * send segment (e.g. FIN) to peer. Socket will be really disconnected 246 * when peer sends FIN and acks ours. 247 * 248 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 249 */ 250 case PRU_DISCONNECT: 251 tp = tcp_disconnect(tp); 252 break; 253 254 /* 255 * Accept a connection. Essentially all the work is 256 * done at higher levels; just return the address 257 * of the peer, storing through addr. 258 */ 259 case PRU_ACCEPT: 260 in_setpeeraddr(inp, nam); 261 break; 262 263 /* 264 * Mark the connection as being incapable of further output. 265 */ 266 case PRU_SHUTDOWN: 267 socantsendmore(so); 268 tp = tcp_usrclosed(tp); 269 if (tp) 270 error = tcp_output(tp); 271 break; 272 273 /* 274 * After a receive, possibly send window update to peer. 275 */ 276 case PRU_RCVD: 277 (void) tcp_output(tp); 278 break; 279 280 /* 281 * Do a send by putting data in output queue and updating urgent 282 * marker if URG set. Possibly send more data. 283 */ 284 case PRU_SEND_EOF: 285 case PRU_SEND: 286 sbappend(&so->so_snd, m); 287 if (nam && tp->t_state < TCPS_SYN_SENT) { 288 /* 289 * Do implied connect if not yet connected, 290 * initialize window to default value, and 291 * initialize maxseg/maxopd using peer's cached 292 * MSS. 293 */ 294 error = tcp_connect(tp, nam); 295 if (error) 296 break; 297 tp->snd_wnd = TTCP_CLIENT_SND_WND; 298 tcp_mss(tp, -1); 299 } 300 301 if (req == PRU_SEND_EOF) { 302 /* 303 * Close the send side of the connection after 304 * the data is sent. 305 */ 306 socantsendmore(so); 307 tp = tcp_usrclosed(tp); 308 } 309 if (tp != NULL) 310 error = tcp_output(tp); 311 break; 312 313 /* 314 * Abort the TCP. 
315 */ 316 case PRU_ABORT: 317 tp = tcp_drop(tp, ECONNABORTED); 318 break; 319 320 case PRU_SENSE: 321 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 322 (void) splx(s); 323 return (0); 324 325 case PRU_RCVOOB: 326 if ((so->so_oobmark == 0 && 327 (so->so_state & SS_RCVATMARK) == 0) || 328 so->so_options & SO_OOBINLINE || 329 tp->t_oobflags & TCPOOB_HADDATA) { 330 error = EINVAL; 331 break; 332 } 333 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 334 error = EWOULDBLOCK; 335 break; 336 } 337 m->m_len = 1; 338 *mtod(m, caddr_t) = tp->t_iobc; 339 if (((int)nam & MSG_PEEK) == 0) 340 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 341 break; 342 343 case PRU_SENDOOB: 344 if (sbspace(&so->so_snd) < -512) { 345 m_freem(m); 346 error = ENOBUFS; 347 break; 348 } 349 /* 350 * According to RFC961 (Assigned Protocols), 351 * the urgent pointer points to the last octet 352 * of urgent data. We continue, however, 353 * to consider it to indicate the first octet 354 * of data past the urgent section. 355 * Otherwise, snd_up should be one lower. 356 */ 357 sbappend(&so->so_snd, m); 358 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 359 tp->t_force = 1; 360 error = tcp_output(tp); 361 tp->t_force = 0; 362 break; 363 364 case PRU_SOCKADDR: 365 in_setsockaddr(inp, nam); 366 break; 367 368 case PRU_PEERADDR: 369 in_setpeeraddr(inp, nam); 370 break; 371 372 /* 373 * TCP slow timer went off; going through this 374 * routine for tracing's sake. 375 */ 376 case PRU_SLOWTIMO: 377 tp = tcp_timers(tp, (int)nam); 378#ifdef TCPDEBUG 379 req |= (int)nam << 8; /* for debug's sake */ 380#endif 381 break; 382 383 default: 384 panic("tcp_usrreq"); 385 } 386#ifdef TCPDEBUG 387 if (tp && (so->so_options & SO_DEBUG)) 388 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); 389#endif 390 splx(s); 391 return (error); 392} 393 394/* 395 * Common subroutine to open a TCP connection to remote host specified 396 * by struct sockaddr_in in mbuf *nam. 
Call in_pcbbind to assign a local 397 * port number if needed. Call in_pcbladdr to do the routing and to choose 398 * a local host address (interface). If there is an existing incarnation 399 * of the same connection in TIME-WAIT state and if the remote host was 400 * sending CC options and if the connection duration was < MSL, then 401 * truncate the previous TIME-WAIT state and proceed. 402 * Initialize connection parameters and enter SYN-SENT state. 403 */ 404static int 405tcp_connect(tp, nam) 406 register struct tcpcb *tp; 407 struct mbuf *nam; 408{ 409 struct inpcb *inp = tp->t_inpcb, *oinp; 410 struct socket *so = inp->inp_socket; 411 struct tcpcb *otp; 412 struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); 413 struct sockaddr_in *ifaddr; 414 int error; 415 struct rmxp_tao *taop; 416 struct rmxp_tao tao_noncached; 417 418 if (inp->inp_lport == 0) { 419 error = in_pcbbind(inp, NULL); 420 if (error) 421 return error; 422 } 423 424 /* 425 * Cannot simply call in_pcbconnect, because there might be an 426 * earlier incarnation of this same connection still in 427 * TIME_WAIT state, creating an ADDRINUSE error. 428 */ 429 error = in_pcbladdr(inp, nam, &ifaddr); 430 if (error) 431 return error; 432 oinp = in_pcblookup(inp->inp_pcbinfo->listhead, 433 sin->sin_addr, sin->sin_port, 434 inp->inp_laddr.s_addr != INADDR_ANY ? 
inp->inp_laddr 435 : ifaddr->sin_addr, 436 inp->inp_lport, 0); 437 if (oinp) { 438 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && 439 otp->t_state == TCPS_TIME_WAIT && 440 otp->t_duration < TCPTV_MSL && 441 (otp->t_flags & TF_RCVD_CC)) 442 otp = tcp_close(otp); 443 else 444 return EADDRINUSE; 445 } 446 if (inp->inp_laddr.s_addr == INADDR_ANY) 447 inp->inp_laddr = ifaddr->sin_addr; 448 inp->inp_faddr = sin->sin_addr; 449 inp->inp_fport = sin->sin_port; 450 in_pcbrehash(inp); 451 452 tp->t_template = tcp_template(tp); 453 if (tp->t_template == 0) { 454 in_pcbdisconnect(inp); 455 return ENOBUFS; 456 } 457 458 /* Compute window scaling to request. */ 459 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 460 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 461 tp->request_r_scale++; 462 463 soisconnecting(so); 464 tcpstat.tcps_connattempt++; 465 tp->t_state = TCPS_SYN_SENT; 466 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 467 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; 468 tcp_sendseqinit(tp); 469 470 /* 471 * Generate a CC value for this connection and 472 * check whether CC or CCnew should be used. 
473 */ 474 if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) { 475 taop = &tao_noncached; 476 bzero(taop, sizeof(*taop)); 477 } 478 479 tp->cc_send = CC_INC(tcp_ccgen); 480 if (taop->tao_ccsent != 0 && 481 CC_GEQ(tp->cc_send, taop->tao_ccsent)) { 482 taop->tao_ccsent = tp->cc_send; 483 } else { 484 taop->tao_ccsent = 0; 485 tp->t_flags |= TF_SENDCCNEW; 486 } 487 488 return 0; 489} 490 491int 492tcp_ctloutput(op, so, level, optname, mp) 493 int op; 494 struct socket *so; 495 int level, optname; 496 struct mbuf **mp; 497{ 498 int error = 0, s; 499 struct inpcb *inp; 500 register struct tcpcb *tp; 501 register struct mbuf *m; 502 register int i; 503 504 s = splnet(); 505 inp = sotoinpcb(so); 506 if (inp == NULL) { 507 splx(s); 508 if (op == PRCO_SETOPT && *mp) 509 (void) m_free(*mp); 510 return (ECONNRESET); 511 } 512 if (level != IPPROTO_TCP) { 513 error = ip_ctloutput(op, so, level, optname, mp); 514 splx(s); 515 return (error); 516 } 517 tp = intotcpcb(inp); 518 519 switch (op) { 520 521 case PRCO_SETOPT: 522 m = *mp; 523 switch (optname) { 524 525 case TCP_NODELAY: 526 if (m == NULL || m->m_len < sizeof (int)) 527 error = EINVAL; 528 else if (*mtod(m, int *)) 529 tp->t_flags |= TF_NODELAY; 530 else 531 tp->t_flags &= ~TF_NODELAY; 532 break; 533 534 case TCP_MAXSEG: 535 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) 536 tp->t_maxseg = i; 537 else 538 error = EINVAL; 539 break; 540 541 case TCP_NOOPT: 542 if (m == NULL || m->m_len < sizeof (int)) 543 error = EINVAL; 544 else if (*mtod(m, int *)) 545 tp->t_flags |= TF_NOOPT; 546 else 547 tp->t_flags &= ~TF_NOOPT; 548 break; 549 550 case TCP_NOPUSH: 551 if (m == NULL || m->m_len < sizeof (int)) 552 error = EINVAL; 553 else if (*mtod(m, int *)) 554 tp->t_flags |= TF_NOPUSH; 555 else 556 tp->t_flags &= ~TF_NOPUSH; 557 break; 558 559 default: 560 error = ENOPROTOOPT; 561 break; 562 } 563 if (m) 564 (void) m_free(m); 565 break; 566 567 case PRCO_GETOPT: 568 *mp = m = m_get(M_WAIT, MT_SOOPTS); 569 m->m_len = 
sizeof(int); 570 571 switch (optname) { 572 case TCP_NODELAY: 573 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 574 break; 575 case TCP_MAXSEG: 576 *mtod(m, int *) = tp->t_maxseg; 577 break; 578 case TCP_NOOPT: 579 *mtod(m, int *) = tp->t_flags & TF_NOOPT; 580 break; 581 case TCP_NOPUSH: 582 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 583 break; 584 default: 585 error = ENOPROTOOPT; 586 break; 587 } 588 break; 589 } 590 splx(s); 591 return (error); 592} 593 594/* 595 * tcp_sendspace and tcp_recvspace are the default send and receive window 596 * sizes, respectively. These are obsolescent (this information should 597 * be set by the route). 598 */ 599u_long tcp_sendspace = 1024*16; 600SYSCTL_INT(_net_inet_tcp, TCPCTL_SENDSPACE, sendspace, 601 CTLFLAG_RW, &tcp_sendspace , 0, ""); 602u_long tcp_recvspace = 1024*16; 603SYSCTL_INT(_net_inet_tcp, TCPCTL_RECVSPACE, recvspace, 604 CTLFLAG_RW, &tcp_recvspace , 0, ""); 605 606/* 607 * Attach TCP protocol to socket, allocating 608 * internet protocol control block, tcp control block, 609 * bufer space, and entering LISTEN state if to accept connections. 610 */ 611static int 612tcp_attach(so) 613 struct socket *so; 614{ 615 register struct tcpcb *tp; 616 struct inpcb *inp; 617 int error; 618 619 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 620 error = soreserve(so, tcp_sendspace, tcp_recvspace); 621 if (error) 622 return (error); 623 } 624 error = in_pcballoc(so, &tcbinfo); 625 if (error) 626 return (error); 627 inp = sotoinpcb(so); 628 tp = tcp_newtcpcb(inp); 629 if (tp == 0) { 630 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 631 632 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 633 in_pcbdetach(inp); 634 so->so_state |= nofd; 635 return (ENOBUFS); 636 } 637 tp->t_state = TCPS_CLOSED; 638 return (0); 639} 640 641/* 642 * Initiate (or continue) disconnect. 643 * If embryonic state, just send reset (once). 644 * If in ``let data drain'' option and linger null, just drop. 
645 * Otherwise (hard), mark socket disconnecting and drop 646 * current input data; switch states based on user close, and 647 * send segment to peer (with FIN). 648 */ 649static struct tcpcb * 650tcp_disconnect(tp) 651 register struct tcpcb *tp; 652{ 653 struct socket *so = tp->t_inpcb->inp_socket; 654 655 if (tp->t_state < TCPS_ESTABLISHED) 656 tp = tcp_close(tp); 657 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 658 tp = tcp_drop(tp, 0); 659 else { 660 soisdisconnecting(so); 661 sbflush(&so->so_rcv); 662 tp = tcp_usrclosed(tp); 663 if (tp) 664 (void) tcp_output(tp); 665 } 666 return (tp); 667} 668 669/* 670 * User issued close, and wish to trail through shutdown states: 671 * if never received SYN, just forget it. If got a SYN from peer, 672 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 673 * If already got a FIN from peer, then almost done; go to LAST_ACK 674 * state. In all other cases, have already sent FIN to peer (e.g. 675 * after PRU_SHUTDOWN), and just have to play tedious game waiting 676 * for peer to send FIN or not respond to keep-alives, etc. 677 * We can let the user exit from the close as soon as the FIN is acked. 678 */ 679static struct tcpcb * 680tcp_usrclosed(tp) 681 register struct tcpcb *tp; 682{ 683 684 switch (tp->t_state) { 685 686 case TCPS_CLOSED: 687 case TCPS_LISTEN: 688 tp->t_state = TCPS_CLOSED; 689 tp = tcp_close(tp); 690 break; 691 692 case TCPS_SYN_SENT: 693 case TCPS_SYN_RECEIVED: 694 tp->t_flags |= TF_NEEDFIN; 695 break; 696 697 case TCPS_ESTABLISHED: 698 tp->t_state = TCPS_FIN_WAIT_1; 699 break; 700 701 case TCPS_CLOSE_WAIT: 702 tp->t_state = TCPS_LAST_ACK; 703 break; 704 } 705 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 706 soisdisconnected(tp->t_inpcb->inp_socket); 707 /* To prevent the connection hanging in FIN_WAIT_2 forever. */ 708 if (tp->t_state == TCPS_FIN_WAIT_2) 709 tp->t_timer[TCPT_2MSL] = tcp_maxidle; 710 } 711 return (tp); 712} 713 714