tcp_usrreq.c revision 6475
1/* 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $Id: tcp_usrreq.c,v 1.7 1995/02/09 23:13:27 wollman Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/malloc.h> 40#include <sys/mbuf.h> 41#include <sys/socket.h> 42#include <sys/socketvar.h> 43#include <sys/protosw.h> 44#include <sys/errno.h> 45#include <sys/stat.h> 46 47#include <net/if.h> 48#include <net/route.h> 49 50#include <netinet/in.h> 51#include <netinet/in_systm.h> 52#include <netinet/ip.h> 53#include <netinet/in_pcb.h> 54#include <netinet/ip_var.h> 55#include <netinet/tcp.h> 56#include <netinet/tcp_fsm.h> 57#include <netinet/tcp_seq.h> 58#include <netinet/tcp_timer.h> 59#include <netinet/tcp_var.h> 60#include <netinet/tcpip.h> 61#ifdef TCPDEBUG 62#include <netinet/tcp_debug.h> 63#endif 64 65/* 66 * TCP protocol interface to socket abstraction. 67 */ 68extern char *tcpstates[]; 69 70/* 71 * Process a TCP user request for TCP tb. If this is a send request 72 * then m is the mbuf chain of send data. If this is a timer expiration 73 * (called from the software clock routine), then timertype tells which timer. 74 */ 75/*ARGSUSED*/ 76int 77tcp_usrreq(so, req, m, nam, control) 78 struct socket *so; 79 int req; 80 struct mbuf *m, *nam, *control; 81{ 82 register struct inpcb *inp; 83 register struct tcpcb *tp = 0; 84 struct sockaddr_in *sinp; 85 int s; 86 int error = 0; 87#ifdef TCPDEBUG 88 int ostate; 89#endif 90 91 if (req == PRU_CONTROL) 92 return (in_control(so, (int)m, (caddr_t)nam, 93 (struct ifnet *)control)); 94 if (control && control->m_len) { 95 m_freem(control); 96 if (m) 97 m_freem(m); 98 return (EINVAL); 99 } 100 101 s = splnet(); 102 inp = sotoinpcb(so); 103 /* 104 * When a TCP is attached to a socket, then there will be 105 * a (struct inpcb) pointed at by the socket, and this 106 * structure will point at a subsidary (struct tcpcb). 107 */ 108 if (inp == 0 && req != PRU_ATTACH) { 109 splx(s); 110 return (EINVAL); /* XXX */ 111 } 112 if (inp) { 113 tp = intotcpcb(inp); 114 /* WHAT IF TP IS 0? */ 115#ifdef KPROF 116 tcp_acounts[tp->t_state][req]++; 117#endif 118#ifdef TCPDEBUG 119 ostate = tp->t_state; 120 } else 121 ostate = 0; 122#else /* TCPDEBUG */ 123 } 124#endif /* TCPDEBUG */ 125 126 switch (req) { 127 128 /* 129 * TCP attaches to socket via PRU_ATTACH, reserving space, 130 * and an internet control block. 131 */ 132 case PRU_ATTACH: 133 if (inp) { 134 error = EISCONN; 135 break; 136 } 137 error = tcp_attach(so); 138 if (error) 139 break; 140 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 141 so->so_linger = TCP_LINGERTIME; 142 tp = sototcpcb(so); 143 break; 144 145 /* 146 * PRU_DETACH detaches the TCP protocol from the socket. 147 * If the protocol state is non-embryonic, then can't 148 * do this directly: have to initiate a PRU_DISCONNECT, 149 * which may finish later; embryonic TCB's can just 150 * be discarded here. 151 */ 152 case PRU_DETACH: 153 if (tp->t_state > TCPS_LISTEN) 154 tp = tcp_disconnect(tp); 155 else 156 tp = tcp_close(tp); 157 break; 158 159 /* 160 * Give the socket an address. 161 */ 162 case PRU_BIND: 163 /* 164 * Must check for multicast addresses and disallow binding 165 * to them. 166 */ 167 sinp = mtod(nam, struct sockaddr_in *); 168 if (sinp->sin_family == AF_INET && 169 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 170 error = EAFNOSUPPORT; 171 break; 172 } 173 error = in_pcbbind(inp, nam); 174 if (error) 175 break; 176 break; 177 178 /* 179 * Prepare to accept connections. 180 */ 181 case PRU_LISTEN: 182 if (inp->inp_lport == 0) 183 error = in_pcbbind(inp, (struct mbuf *)0); 184 if (error == 0) 185 tp->t_state = TCPS_LISTEN; 186 break; 187 188 /* 189 * Initiate connection to peer. 190 * Create a template for use in transmissions on this connection. 191 * Enter SYN_SENT state, and mark socket as connecting. 192 * Start keep-alive timer, and seed output sequence space. 193 * Send initial segment on connection. 194 */ 195 case PRU_CONNECT: 196 /* 197 * Must disallow TCP ``connections'' to multicast addresses. 198 */ 199 sinp = mtod(nam, struct sockaddr_in *); 200 if (sinp->sin_family == AF_INET 201 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 202 error = EAFNOSUPPORT; 203 break; 204 } 205 206 if ((error = tcp_connect(tp, nam)) != 0) 207 break; 208 error = tcp_output(tp); 209 break; 210 211 /* 212 * Create a TCP connection between two sockets. 213 */ 214 case PRU_CONNECT2: 215 error = EOPNOTSUPP; 216 break; 217 218 /* 219 * Initiate disconnect from peer. 220 * If connection never passed embryonic stage, just drop; 221 * else if don't need to let data drain, then can just drop anyways, 222 * else have to begin TCP shutdown process: mark socket disconnecting, 223 * drain unread data, state switch to reflect user close, and 224 * send segment (e.g. FIN) to peer. Socket will be really disconnected 225 * when peer sends FIN and acks ours. 226 * 227 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 228 */ 229 case PRU_DISCONNECT: 230 tp = tcp_disconnect(tp); 231 break; 232 233 /* 234 * Accept a connection. Essentially all the work is 235 * done at higher levels; just return the address 236 * of the peer, storing through addr. 237 */ 238 case PRU_ACCEPT: 239 in_setpeeraddr(inp, nam); 240 break; 241 242 /* 243 * Mark the connection as being incapable of further output. 244 */ 245 case PRU_SHUTDOWN: 246 socantsendmore(so); 247 tp = tcp_usrclosed(tp); 248 if (tp) 249 error = tcp_output(tp); 250 break; 251 252 /* 253 * After a receive, possibly send window update to peer. 254 */ 255 case PRU_RCVD: 256 (void) tcp_output(tp); 257 break; 258 259 /* 260 * Do a send by putting data in output queue and updating urgent 261 * marker if URG set. Possibly send more data. 262 */ 263 case PRU_SEND_EOF: 264 case PRU_SEND: 265 sbappend(&so->so_snd, m); 266 if (nam && tp->t_state < TCPS_SYN_SENT) { 267 /* 268 * Do implied connect if not yet connected, 269 * initialize window to default value, and 270 * initialize maxseg/maxopd using peer's cached 271 * MSS. 272 */ 273 error = tcp_connect(tp, nam); 274 if (error) 275 break; 276 tp->snd_wnd = TTCP_CLIENT_SND_WND; 277 tcp_mss(tp, -1); 278 } 279 280 if (req == PRU_SEND_EOF) { 281 /* 282 * Close the send side of the connection after 283 * the data is sent. 284 */ 285 socantsendmore(so); 286 tp = tcp_usrclosed(tp); 287 } 288 if (tp != NULL) 289 error = tcp_output(tp); 290 break; 291 292 /* 293 * Abort the TCP. 294 */ 295 case PRU_ABORT: 296 tp = tcp_drop(tp, ECONNABORTED); 297 break; 298 299 case PRU_SENSE: 300 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 301 (void) splx(s); 302 return (0); 303 304 case PRU_RCVOOB: 305 if ((so->so_oobmark == 0 && 306 (so->so_state & SS_RCVATMARK) == 0) || 307 so->so_options & SO_OOBINLINE || 308 tp->t_oobflags & TCPOOB_HADDATA) { 309 error = EINVAL; 310 break; 311 } 312 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 313 error = EWOULDBLOCK; 314 break; 315 } 316 m->m_len = 1; 317 *mtod(m, caddr_t) = tp->t_iobc; 318 if (((int)nam & MSG_PEEK) == 0) 319 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 320 break; 321 322 case PRU_SENDOOB: 323 if (sbspace(&so->so_snd) < -512) { 324 m_freem(m); 325 error = ENOBUFS; 326 break; 327 } 328 /* 329 * According to RFC961 (Assigned Protocols), 330 * the urgent pointer points to the last octet 331 * of urgent data. We continue, however, 332 * to consider it to indicate the first octet 333 * of data past the urgent section. 334 * Otherwise, snd_up should be one lower. 335 */ 336 sbappend(&so->so_snd, m); 337 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 338 tp->t_force = 1; 339 error = tcp_output(tp); 340 tp->t_force = 0; 341 break; 342 343 case PRU_SOCKADDR: 344 in_setsockaddr(inp, nam); 345 break; 346 347 case PRU_PEERADDR: 348 in_setpeeraddr(inp, nam); 349 break; 350 351 /* 352 * TCP slow timer went off; going through this 353 * routine for tracing's sake. 354 */ 355 case PRU_SLOWTIMO: 356 tp = tcp_timers(tp, (int)nam); 357#ifdef TCPDEBUG 358 req |= (int)nam << 8; /* for debug's sake */ 359#endif 360 break; 361 362 default: 363 panic("tcp_usrreq"); 364 } 365#ifdef TCPDEBUG 366 if (tp && (so->so_options & SO_DEBUG)) 367 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); 368#endif 369 splx(s); 370 return (error); 371} 372 373/* 374 * Common subroutine to open a TCP connection to remote host specified 375 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 376 * port number if needed. Call in_pcbladdr to do the routing and to choose 377 * a local host address (interface). If there is an existing incarnation 378 * of the same connection in TIME-WAIT state and if the remote host was 379 * sending CC options and if the connection duration was < MSL, then 380 * truncate the previous TIME-WAIT state and proceed. 381 * Initialize connection parameters and enter SYN-SENT state. 382 */ 383int 384tcp_connect(tp, nam) 385 register struct tcpcb *tp; 386 struct mbuf *nam; 387{ 388 struct inpcb *inp = tp->t_inpcb, *oinp; 389 struct socket *so = inp->inp_socket; 390 struct tcpcb *otp; 391 struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); 392 struct sockaddr_in *ifaddr; 393 int error; 394 395 if (inp->inp_lport == 0) { 396 error = in_pcbbind(inp, NULL); 397 if (error) 398 return error; 399 } 400 401 /* 402 * Cannot simply call in_pcbconnect, because there might be an 403 * earlier incarnation of this same connection still in 404 * TIME_WAIT state, creating an ADDRINUSE error. 405 */ 406 error = in_pcbladdr(inp, nam, &ifaddr); 407 oinp = in_pcblookup(inp->inp_head, 408 sin->sin_addr, sin->sin_port, 409 inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr 410 : ifaddr->sin_addr, 411 inp->inp_lport, 0); 412 if (oinp) { 413 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && 414 otp->t_state == TCPS_TIME_WAIT && 415 otp->t_duration < TCPTV_MSL && 416 (otp->t_flags & TF_RCVD_CC)) 417 otp = tcp_close(otp); 418 else 419 return EADDRINUSE; 420 } 421 if (inp->inp_laddr.s_addr == INADDR_ANY) 422 inp->inp_laddr = ifaddr->sin_addr; 423 inp->inp_faddr = sin->sin_addr; 424 inp->inp_fport = sin->sin_port; 425 426 tp->t_template = tcp_template(tp); 427 if (tp->t_template == 0) { 428 in_pcbdisconnect(inp); 429 return ENOBUFS; 430 } 431 432 /* Compute window scaling to request. */ 433 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 434 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 435 tp->request_r_scale++; 436 437 soisconnecting(so); 438 tcpstat.tcps_connattempt++; 439 tp->t_state = TCPS_SYN_SENT; 440 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 441 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; 442 tcp_sendseqinit(tp); 443 tp->cc_send = CC_INC(tcp_ccgen); 444 445 return 0; 446} 447 448int 449tcp_ctloutput(op, so, level, optname, mp) 450 int op; 451 struct socket *so; 452 int level, optname; 453 struct mbuf **mp; 454{ 455 int error = 0, s; 456 struct inpcb *inp; 457 register struct tcpcb *tp; 458 register struct mbuf *m; 459 register int i; 460 461 s = splnet(); 462 inp = sotoinpcb(so); 463 if (inp == NULL) { 464 splx(s); 465 if (op == PRCO_SETOPT && *mp) 466 (void) m_free(*mp); 467 return (ECONNRESET); 468 } 469 if (level != IPPROTO_TCP) { 470 error = ip_ctloutput(op, so, level, optname, mp); 471 splx(s); 472 return (error); 473 } 474 tp = intotcpcb(inp); 475 476 switch (op) { 477 478 case PRCO_SETOPT: 479 m = *mp; 480 switch (optname) { 481 482 case TCP_NODELAY: 483 if (m == NULL || m->m_len < sizeof (int)) 484 error = EINVAL; 485 else if (*mtod(m, int *)) 486 tp->t_flags |= TF_NODELAY; 487 else 488 tp->t_flags &= ~TF_NODELAY; 489 break; 490 491 case TCP_MAXSEG: 492 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) 493 tp->t_maxseg = i; 494 else 495 error = EINVAL; 496 break; 497 498 case TCP_NOOPT: 499 if (m == NULL || m->m_len < sizeof (int)) 500 error = EINVAL; 501 else if (*mtod(m, int *)) 502 tp->t_flags |= TF_NOOPT; 503 else 504 tp->t_flags &= ~TF_NOOPT; 505 break; 506 507 case TCP_NOPUSH: 508 if (m == NULL || m->m_len < sizeof (int)) 509 error = EINVAL; 510 else if (*mtod(m, int *)) 511 tp->t_flags |= TF_NOPUSH; 512 else 513 tp->t_flags &= ~TF_NOPUSH; 514 break; 515 516 default: 517 error = ENOPROTOOPT; 518 break; 519 } 520 if (m) 521 (void) m_free(m); 522 break; 523 524 case PRCO_GETOPT: 525 *mp = m = m_get(M_WAIT, MT_SOOPTS); 526 m->m_len = sizeof(int); 527 528 switch (optname) { 529 case TCP_NODELAY: 530 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 531 break; 532 case TCP_MAXSEG: 533 *mtod(m, int *) = tp->t_maxseg; 534 break; 535 case TCP_NOOPT: 536 *mtod(m, int *) = tp->t_flags & TF_NOOPT; 537 break; 538 case TCP_NOPUSH: 539 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 540 break; 541 default: 542 error = ENOPROTOOPT; 543 break; 544 } 545 break; 546 } 547 splx(s); 548 return (error); 549} 550 551/* 552 * tcp_sendspace and tcp_recvspace are the default send and receive window 553 * sizes, respectively. These are obsolescent (this information should 554 * be set by the route). 555 */ 556u_long tcp_sendspace = 1024*16; 557u_long tcp_recvspace = 1024*16; 558 559/* 560 * Attach TCP protocol to socket, allocating 561 * internet protocol control block, tcp control block, 562 * bufer space, and entering LISTEN state if to accept connections. 563 */ 564int 565tcp_attach(so) 566 struct socket *so; 567{ 568 register struct tcpcb *tp; 569 struct inpcb *inp; 570 int error; 571 572 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 573 error = soreserve(so, tcp_sendspace, tcp_recvspace); 574 if (error) 575 return (error); 576 } 577 error = in_pcballoc(so, &tcb); 578 if (error) 579 return (error); 580 inp = sotoinpcb(so); 581 tp = tcp_newtcpcb(inp); 582 if (tp == 0) { 583 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 584 585 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 586 in_pcbdetach(inp); 587 so->so_state |= nofd; 588 return (ENOBUFS); 589 } 590 tp->t_state = TCPS_CLOSED; 591 return (0); 592} 593 594/* 595 * Initiate (or continue) disconnect. 596 * If embryonic state, just send reset (once). 597 * If in ``let data drain'' option and linger null, just drop. 598 * Otherwise (hard), mark socket disconnecting and drop 599 * current input data; switch states based on user close, and 600 * send segment to peer (with FIN). 601 */ 602struct tcpcb * 603tcp_disconnect(tp) 604 register struct tcpcb *tp; 605{ 606 struct socket *so = tp->t_inpcb->inp_socket; 607 608 if (tp->t_state < TCPS_ESTABLISHED) 609 tp = tcp_close(tp); 610 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 611 tp = tcp_drop(tp, 0); 612 else { 613 soisdisconnecting(so); 614 sbflush(&so->so_rcv); 615 tp = tcp_usrclosed(tp); 616 if (tp) 617 (void) tcp_output(tp); 618 } 619 return (tp); 620} 621 622/* 623 * User issued close, and wish to trail through shutdown states: 624 * if never received SYN, just forget it. If got a SYN from peer, 625 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 626 * If already got a FIN from peer, then almost done; go to LAST_ACK 627 * state. In all other cases, have already sent FIN to peer (e.g. 628 * after PRU_SHUTDOWN), and just have to play tedious game waiting 629 * for peer to send FIN or not respond to keep-alives, etc. 630 * We can let the user exit from the close as soon as the FIN is acked. 631 */ 632struct tcpcb * 633tcp_usrclosed(tp) 634 register struct tcpcb *tp; 635{ 636 637 switch (tp->t_state) { 638 639 case TCPS_CLOSED: 640 case TCPS_LISTEN: 641 tp->t_state = TCPS_CLOSED; 642 tp = tcp_close(tp); 643 break; 644 645 case TCPS_SYN_SENT: 646 case TCPS_SYN_RECEIVED: 647 tp->t_flags |= TF_NEEDFIN; 648 break; 649 650 case TCPS_ESTABLISHED: 651 tp->t_state = TCPS_FIN_WAIT_1; 652 break; 653 654 case TCPS_CLOSE_WAIT: 655 tp->t_state = TCPS_LAST_ACK; 656 break; 657 } 658 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) 659 soisdisconnected(tp->t_inpcb->inp_socket); 660 return (tp); 661} 662 663/* 664 * Sysctl for tcp variables. 665 */ 666int 667tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) 668 int *name; 669 u_int namelen; 670 void *oldp; 671 size_t *oldlenp; 672 void *newp; 673 size_t newlen; 674{ 675 extern int tcp_do_rfc1323; /* XXX */ 676 extern int tcp_do_rfc1644; /* XXX */ 677 extern int tcp_mssdflt; /* XXX */ 678 extern int tcp_rttdflt; /* XXX */ 679 680 /* All sysctl names at this level are terminal. */ 681 if (namelen != 1) 682 return (ENOTDIR); 683 684 switch (name[0]) { 685 case TCPCTL_DO_RFC1323: 686 return (sysctl_int(oldp, oldlenp, newp, newlen, 687 &tcp_do_rfc1323)); 688 case TCPCTL_DO_RFC1644: 689 return (sysctl_int(oldp, oldlenp, newp, newlen, 690 &tcp_do_rfc1644)); 691 case TCPCTL_MSSDFLT: 692 return (sysctl_int(oldp, oldlenp, newp, newlen, 693 &tcp_mssdflt)); 694 case TCPCTL_STATS: 695 return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat, 696 sizeof tcpstat)); 697 case TCPCTL_RTTDFLT: 698 return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt)); 699 case TCPCTL_KEEPIDLE: 700 return (sysctl_int(oldp, oldlenp, newp, newlen, 701 &tcp_keepidle)); 702 case TCPCTL_KEEPINTVL: 703 return (sysctl_int(oldp, oldlenp, newp, newlen, 704 &tcp_keepintvl)); 705 case TCPCTL_SENDSPACE: 706 return (sysctl_int(oldp, oldlenp, newp, newlen, 707 (int *)&tcp_sendspace)); /* XXX */ 708 case TCPCTL_RECVSPACE: 709 return (sysctl_int(oldp, oldlenp, newp, newlen, 710 (int *)&tcp_recvspace)); /* XXX */ 711 default: 712 return (ENOPROTOOPT); 713 } 714 /* NOTREACHED */ 715} 716