/* tcp_usrreq.c revision 7090 */
1/* 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. All advertising materials mentioning features or use of this software 14 * must display the following acknowledgement: 15 * This product includes software developed by the University of 16 * California, Berkeley and its contributors. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 34 * $Id: tcp_usrreq.c,v 1.11 1995/02/17 00:29:42 wollman Exp $ 35 */ 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/kernel.h> 40#include <sys/malloc.h> 41#include <sys/mbuf.h> 42#include <sys/socket.h> 43#include <sys/socketvar.h> 44#include <sys/protosw.h> 45#include <sys/errno.h> 46#include <sys/stat.h> 47#include <vm/vm.h> 48#include <sys/sysctl.h> 49 50#include <net/if.h> 51#include <net/route.h> 52 53#include <netinet/in.h> 54#include <netinet/in_systm.h> 55#include <netinet/ip.h> 56#include <netinet/in_pcb.h> 57#include <netinet/in_var.h> 58#include <netinet/ip_var.h> 59#include <netinet/tcp.h> 60#include <netinet/tcp_fsm.h> 61#include <netinet/tcp_seq.h> 62#include <netinet/tcp_timer.h> 63#include <netinet/tcp_var.h> 64#include <netinet/tcpip.h> 65#ifdef TCPDEBUG 66#include <netinet/tcp_debug.h> 67#endif 68 69/* 70 * TCP protocol interface to socket abstraction. 71 */ 72extern char *tcpstates[]; 73 74/* 75 * Process a TCP user request for TCP tb. If this is a send request 76 * then m is the mbuf chain of send data. If this is a timer expiration 77 * (called from the software clock routine), then timertype tells which timer. 78 */ 79/*ARGSUSED*/ 80int 81tcp_usrreq(so, req, m, nam, control) 82 struct socket *so; 83 int req; 84 struct mbuf *m, *nam, *control; 85{ 86 register struct inpcb *inp; 87 register struct tcpcb *tp = 0; 88 struct sockaddr_in *sinp; 89 int s; 90 int error = 0; 91#ifdef TCPDEBUG 92 int ostate; 93#endif 94 95 if (req == PRU_CONTROL) 96 return (in_control(so, (int)m, (caddr_t)nam, 97 (struct ifnet *)control)); 98 if (control && control->m_len) { 99 m_freem(control); 100 if (m) 101 m_freem(m); 102 return (EINVAL); 103 } 104 105 s = splnet(); 106 inp = sotoinpcb(so); 107 /* 108 * When a TCP is attached to a socket, then there will be 109 * a (struct inpcb) pointed at by the socket, and this 110 * structure will point at a subsidary (struct tcpcb). 
111 */ 112 if (inp == 0 && req != PRU_ATTACH) { 113 splx(s); 114 return (EINVAL); /* XXX */ 115 } 116 if (inp) { 117 tp = intotcpcb(inp); 118 /* WHAT IF TP IS 0? */ 119#ifdef KPROF 120 tcp_acounts[tp->t_state][req]++; 121#endif 122#ifdef TCPDEBUG 123 ostate = tp->t_state; 124 } else 125 ostate = 0; 126#else /* TCPDEBUG */ 127 } 128#endif /* TCPDEBUG */ 129 130 switch (req) { 131 132 /* 133 * TCP attaches to socket via PRU_ATTACH, reserving space, 134 * and an internet control block. 135 */ 136 case PRU_ATTACH: 137 if (inp) { 138 error = EISCONN; 139 break; 140 } 141 error = tcp_attach(so); 142 if (error) 143 break; 144 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 145 so->so_linger = TCP_LINGERTIME * hz; 146 tp = sototcpcb(so); 147 break; 148 149 /* 150 * PRU_DETACH detaches the TCP protocol from the socket. 151 * If the protocol state is non-embryonic, then can't 152 * do this directly: have to initiate a PRU_DISCONNECT, 153 * which may finish later; embryonic TCB's can just 154 * be discarded here. 155 */ 156 case PRU_DETACH: 157 if (tp->t_state > TCPS_LISTEN) 158 tp = tcp_disconnect(tp); 159 else 160 tp = tcp_close(tp); 161 break; 162 163 /* 164 * Give the socket an address. 165 */ 166 case PRU_BIND: 167 /* 168 * Must check for multicast addresses and disallow binding 169 * to them. 170 */ 171 sinp = mtod(nam, struct sockaddr_in *); 172 if (sinp->sin_family == AF_INET && 173 IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 174 error = EAFNOSUPPORT; 175 break; 176 } 177 error = in_pcbbind(inp, nam); 178 if (error) 179 break; 180 break; 181 182 /* 183 * Prepare to accept connections. 184 */ 185 case PRU_LISTEN: 186 if (inp->inp_lport == 0) 187 error = in_pcbbind(inp, (struct mbuf *)0); 188 if (error == 0) 189 tp->t_state = TCPS_LISTEN; 190 break; 191 192 /* 193 * Initiate connection to peer. 194 * Create a template for use in transmissions on this connection. 195 * Enter SYN_SENT state, and mark socket as connecting. 
196 * Start keep-alive timer, and seed output sequence space. 197 * Send initial segment on connection. 198 */ 199 case PRU_CONNECT: 200 /* 201 * Must disallow TCP ``connections'' to multicast addresses. 202 */ 203 sinp = mtod(nam, struct sockaddr_in *); 204 if (sinp->sin_family == AF_INET 205 && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) { 206 error = EAFNOSUPPORT; 207 break; 208 } 209 210 if ((error = tcp_connect(tp, nam)) != 0) 211 break; 212 error = tcp_output(tp); 213 break; 214 215 /* 216 * Create a TCP connection between two sockets. 217 */ 218 case PRU_CONNECT2: 219 error = EOPNOTSUPP; 220 break; 221 222 /* 223 * Initiate disconnect from peer. 224 * If connection never passed embryonic stage, just drop; 225 * else if don't need to let data drain, then can just drop anyways, 226 * else have to begin TCP shutdown process: mark socket disconnecting, 227 * drain unread data, state switch to reflect user close, and 228 * send segment (e.g. FIN) to peer. Socket will be really disconnected 229 * when peer sends FIN and acks ours. 230 * 231 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 232 */ 233 case PRU_DISCONNECT: 234 tp = tcp_disconnect(tp); 235 break; 236 237 /* 238 * Accept a connection. Essentially all the work is 239 * done at higher levels; just return the address 240 * of the peer, storing through addr. 241 */ 242 case PRU_ACCEPT: 243 in_setpeeraddr(inp, nam); 244 break; 245 246 /* 247 * Mark the connection as being incapable of further output. 248 */ 249 case PRU_SHUTDOWN: 250 socantsendmore(so); 251 tp = tcp_usrclosed(tp); 252 if (tp) 253 error = tcp_output(tp); 254 break; 255 256 /* 257 * After a receive, possibly send window update to peer. 258 */ 259 case PRU_RCVD: 260 (void) tcp_output(tp); 261 break; 262 263 /* 264 * Do a send by putting data in output queue and updating urgent 265 * marker if URG set. Possibly send more data. 
266 */ 267 case PRU_SEND_EOF: 268 case PRU_SEND: 269 sbappend(&so->so_snd, m); 270 if (nam && tp->t_state < TCPS_SYN_SENT) { 271 /* 272 * Do implied connect if not yet connected, 273 * initialize window to default value, and 274 * initialize maxseg/maxopd using peer's cached 275 * MSS. 276 */ 277 error = tcp_connect(tp, nam); 278 if (error) 279 break; 280 tp->snd_wnd = TTCP_CLIENT_SND_WND; 281 tcp_mss(tp, -1); 282 } 283 284 if (req == PRU_SEND_EOF) { 285 /* 286 * Close the send side of the connection after 287 * the data is sent. 288 */ 289 socantsendmore(so); 290 tp = tcp_usrclosed(tp); 291 } 292 if (tp != NULL) 293 error = tcp_output(tp); 294 break; 295 296 /* 297 * Abort the TCP. 298 */ 299 case PRU_ABORT: 300 tp = tcp_drop(tp, ECONNABORTED); 301 break; 302 303 case PRU_SENSE: 304 ((struct stat *) m)->st_blksize = so->so_snd.sb_hiwat; 305 (void) splx(s); 306 return (0); 307 308 case PRU_RCVOOB: 309 if ((so->so_oobmark == 0 && 310 (so->so_state & SS_RCVATMARK) == 0) || 311 so->so_options & SO_OOBINLINE || 312 tp->t_oobflags & TCPOOB_HADDATA) { 313 error = EINVAL; 314 break; 315 } 316 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 317 error = EWOULDBLOCK; 318 break; 319 } 320 m->m_len = 1; 321 *mtod(m, caddr_t) = tp->t_iobc; 322 if (((int)nam & MSG_PEEK) == 0) 323 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 324 break; 325 326 case PRU_SENDOOB: 327 if (sbspace(&so->so_snd) < -512) { 328 m_freem(m); 329 error = ENOBUFS; 330 break; 331 } 332 /* 333 * According to RFC961 (Assigned Protocols), 334 * the urgent pointer points to the last octet 335 * of urgent data. We continue, however, 336 * to consider it to indicate the first octet 337 * of data past the urgent section. 338 * Otherwise, snd_up should be one lower. 
339 */ 340 sbappend(&so->so_snd, m); 341 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 342 tp->t_force = 1; 343 error = tcp_output(tp); 344 tp->t_force = 0; 345 break; 346 347 case PRU_SOCKADDR: 348 in_setsockaddr(inp, nam); 349 break; 350 351 case PRU_PEERADDR: 352 in_setpeeraddr(inp, nam); 353 break; 354 355 /* 356 * TCP slow timer went off; going through this 357 * routine for tracing's sake. 358 */ 359 case PRU_SLOWTIMO: 360 tp = tcp_timers(tp, (int)nam); 361#ifdef TCPDEBUG 362 req |= (int)nam << 8; /* for debug's sake */ 363#endif 364 break; 365 366 default: 367 panic("tcp_usrreq"); 368 } 369#ifdef TCPDEBUG 370 if (tp && (so->so_options & SO_DEBUG)) 371 tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, req); 372#endif 373 splx(s); 374 return (error); 375} 376 377/* 378 * Common subroutine to open a TCP connection to remote host specified 379 * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local 380 * port number if needed. Call in_pcbladdr to do the routing and to choose 381 * a local host address (interface). If there is an existing incarnation 382 * of the same connection in TIME-WAIT state and if the remote host was 383 * sending CC options and if the connection duration was < MSL, then 384 * truncate the previous TIME-WAIT state and proceed. 385 * Initialize connection parameters and enter SYN-SENT state. 386 */ 387int 388tcp_connect(tp, nam) 389 register struct tcpcb *tp; 390 struct mbuf *nam; 391{ 392 struct inpcb *inp = tp->t_inpcb, *oinp; 393 struct socket *so = inp->inp_socket; 394 struct tcpcb *otp; 395 struct sockaddr_in *sin = mtod(nam, struct sockaddr_in *); 396 struct sockaddr_in *ifaddr; 397 int error; 398 399 if (inp->inp_lport == 0) { 400 error = in_pcbbind(inp, NULL); 401 if (error) 402 return error; 403 } 404 405 /* 406 * Cannot simply call in_pcbconnect, because there might be an 407 * earlier incarnation of this same connection still in 408 * TIME_WAIT state, creating an ADDRINUSE error. 
409 */ 410 error = in_pcbladdr(inp, nam, &ifaddr); 411 oinp = in_pcblookup(inp->inp_head, 412 sin->sin_addr, sin->sin_port, 413 inp->inp_laddr.s_addr != INADDR_ANY ? inp->inp_laddr 414 : ifaddr->sin_addr, 415 inp->inp_lport, 0); 416 if (oinp) { 417 if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && 418 otp->t_state == TCPS_TIME_WAIT && 419 otp->t_duration < TCPTV_MSL && 420 (otp->t_flags & TF_RCVD_CC)) 421 otp = tcp_close(otp); 422 else 423 return EADDRINUSE; 424 } 425 if (inp->inp_laddr.s_addr == INADDR_ANY) 426 inp->inp_laddr = ifaddr->sin_addr; 427 inp->inp_faddr = sin->sin_addr; 428 inp->inp_fport = sin->sin_port; 429 430 tp->t_template = tcp_template(tp); 431 if (tp->t_template == 0) { 432 in_pcbdisconnect(inp); 433 return ENOBUFS; 434 } 435 436 /* Compute window scaling to request. */ 437 while (tp->request_r_scale < TCP_MAX_WINSHIFT && 438 (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat) 439 tp->request_r_scale++; 440 441 soisconnecting(so); 442 tcpstat.tcps_connattempt++; 443 tp->t_state = TCPS_SYN_SENT; 444 tp->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT; 445 tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; 446 tcp_sendseqinit(tp); 447 tp->cc_send = CC_INC(tcp_ccgen); 448 449 return 0; 450} 451 452int 453tcp_ctloutput(op, so, level, optname, mp) 454 int op; 455 struct socket *so; 456 int level, optname; 457 struct mbuf **mp; 458{ 459 int error = 0, s; 460 struct inpcb *inp; 461 register struct tcpcb *tp; 462 register struct mbuf *m; 463 register int i; 464 465 s = splnet(); 466 inp = sotoinpcb(so); 467 if (inp == NULL) { 468 splx(s); 469 if (op == PRCO_SETOPT && *mp) 470 (void) m_free(*mp); 471 return (ECONNRESET); 472 } 473 if (level != IPPROTO_TCP) { 474 error = ip_ctloutput(op, so, level, optname, mp); 475 splx(s); 476 return (error); 477 } 478 tp = intotcpcb(inp); 479 480 switch (op) { 481 482 case PRCO_SETOPT: 483 m = *mp; 484 switch (optname) { 485 486 case TCP_NODELAY: 487 if (m == NULL || m->m_len < sizeof (int)) 488 error = EINVAL; 489 else if 
(*mtod(m, int *)) 490 tp->t_flags |= TF_NODELAY; 491 else 492 tp->t_flags &= ~TF_NODELAY; 493 break; 494 495 case TCP_MAXSEG: 496 if (m && (i = *mtod(m, int *)) > 0 && i <= tp->t_maxseg) 497 tp->t_maxseg = i; 498 else 499 error = EINVAL; 500 break; 501 502 case TCP_NOOPT: 503 if (m == NULL || m->m_len < sizeof (int)) 504 error = EINVAL; 505 else if (*mtod(m, int *)) 506 tp->t_flags |= TF_NOOPT; 507 else 508 tp->t_flags &= ~TF_NOOPT; 509 break; 510 511 case TCP_NOPUSH: 512 if (m == NULL || m->m_len < sizeof (int)) 513 error = EINVAL; 514 else if (*mtod(m, int *)) 515 tp->t_flags |= TF_NOPUSH; 516 else 517 tp->t_flags &= ~TF_NOPUSH; 518 break; 519 520 default: 521 error = ENOPROTOOPT; 522 break; 523 } 524 if (m) 525 (void) m_free(m); 526 break; 527 528 case PRCO_GETOPT: 529 *mp = m = m_get(M_WAIT, MT_SOOPTS); 530 m->m_len = sizeof(int); 531 532 switch (optname) { 533 case TCP_NODELAY: 534 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 535 break; 536 case TCP_MAXSEG: 537 *mtod(m, int *) = tp->t_maxseg; 538 break; 539 case TCP_NOOPT: 540 *mtod(m, int *) = tp->t_flags & TF_NOOPT; 541 break; 542 case TCP_NOPUSH: 543 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 544 break; 545 default: 546 error = ENOPROTOOPT; 547 break; 548 } 549 break; 550 } 551 splx(s); 552 return (error); 553} 554 555/* 556 * tcp_sendspace and tcp_recvspace are the default send and receive window 557 * sizes, respectively. These are obsolescent (this information should 558 * be set by the route). 559 */ 560u_long tcp_sendspace = 1024*16; 561u_long tcp_recvspace = 1024*16; 562 563/* 564 * Attach TCP protocol to socket, allocating 565 * internet protocol control block, tcp control block, 566 * bufer space, and entering LISTEN state if to accept connections. 
567 */ 568int 569tcp_attach(so) 570 struct socket *so; 571{ 572 register struct tcpcb *tp; 573 struct inpcb *inp; 574 int error; 575 576 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { 577 error = soreserve(so, tcp_sendspace, tcp_recvspace); 578 if (error) 579 return (error); 580 } 581 error = in_pcballoc(so, &tcb); 582 if (error) 583 return (error); 584 inp = sotoinpcb(so); 585 tp = tcp_newtcpcb(inp); 586 if (tp == 0) { 587 int nofd = so->so_state & SS_NOFDREF; /* XXX */ 588 589 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 590 in_pcbdetach(inp); 591 so->so_state |= nofd; 592 return (ENOBUFS); 593 } 594 tp->t_state = TCPS_CLOSED; 595 return (0); 596} 597 598/* 599 * Initiate (or continue) disconnect. 600 * If embryonic state, just send reset (once). 601 * If in ``let data drain'' option and linger null, just drop. 602 * Otherwise (hard), mark socket disconnecting and drop 603 * current input data; switch states based on user close, and 604 * send segment to peer (with FIN). 605 */ 606struct tcpcb * 607tcp_disconnect(tp) 608 register struct tcpcb *tp; 609{ 610 struct socket *so = tp->t_inpcb->inp_socket; 611 612 if (tp->t_state < TCPS_ESTABLISHED) 613 tp = tcp_close(tp); 614 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 615 tp = tcp_drop(tp, 0); 616 else { 617 soisdisconnecting(so); 618 sbflush(&so->so_rcv); 619 tp = tcp_usrclosed(tp); 620 if (tp) 621 (void) tcp_output(tp); 622 } 623 return (tp); 624} 625 626/* 627 * User issued close, and wish to trail through shutdown states: 628 * if never received SYN, just forget it. If got a SYN from peer, 629 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 630 * If already got a FIN from peer, then almost done; go to LAST_ACK 631 * state. In all other cases, have already sent FIN to peer (e.g. 632 * after PRU_SHUTDOWN), and just have to play tedious game waiting 633 * for peer to send FIN or not respond to keep-alives, etc. 
634 * We can let the user exit from the close as soon as the FIN is acked. 635 */ 636struct tcpcb * 637tcp_usrclosed(tp) 638 register struct tcpcb *tp; 639{ 640 641 switch (tp->t_state) { 642 643 case TCPS_CLOSED: 644 case TCPS_LISTEN: 645 tp->t_state = TCPS_CLOSED; 646 tp = tcp_close(tp); 647 break; 648 649 case TCPS_SYN_SENT: 650 case TCPS_SYN_RECEIVED: 651 tp->t_flags |= TF_NEEDFIN; 652 break; 653 654 case TCPS_ESTABLISHED: 655 tp->t_state = TCPS_FIN_WAIT_1; 656 break; 657 658 case TCPS_CLOSE_WAIT: 659 tp->t_state = TCPS_LAST_ACK; 660 break; 661 } 662 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) 663 soisdisconnected(tp->t_inpcb->inp_socket); 664 return (tp); 665} 666 667/* 668 * Sysctl for tcp variables. 669 */ 670int 671tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) 672 int *name; 673 u_int namelen; 674 void *oldp; 675 size_t *oldlenp; 676 void *newp; 677 size_t newlen; 678{ 679 /* All sysctl names at this level are terminal. */ 680 if (namelen != 1) 681 return (ENOTDIR); 682 683 switch (name[0]) { 684 case TCPCTL_DO_RFC1323: 685 return (sysctl_int(oldp, oldlenp, newp, newlen, 686 &tcp_do_rfc1323)); 687 case TCPCTL_DO_RFC1644: 688 return (sysctl_int(oldp, oldlenp, newp, newlen, 689 &tcp_do_rfc1644)); 690 case TCPCTL_MSSDFLT: 691 return (sysctl_int(oldp, oldlenp, newp, newlen, 692 &tcp_mssdflt)); 693 case TCPCTL_STATS: 694 return (sysctl_rdstruct(oldp, oldlenp, newp, &tcpstat, 695 sizeof tcpstat)); 696 case TCPCTL_RTTDFLT: 697 return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_rttdflt)); 698 case TCPCTL_KEEPIDLE: 699 return (sysctl_int(oldp, oldlenp, newp, newlen, 700 &tcp_keepidle)); 701 case TCPCTL_KEEPINTVL: 702 return (sysctl_int(oldp, oldlenp, newp, newlen, 703 &tcp_keepintvl)); 704 case TCPCTL_SENDSPACE: 705 return (sysctl_int(oldp, oldlenp, newp, newlen, 706 (int *)&tcp_sendspace)); /* XXX */ 707 case TCPCTL_RECVSPACE: 708 return (sysctl_int(oldp, oldlenp, newp, newlen, 709 (int *)&tcp_recvspace)); /* XXX */ 710 default: 711 return (ENOPROTOOPT); 
712 } 713 /* NOTREACHED */ 714} 715