1/* $OpenBSD: tcp_usrreq.c,v 1.231 2024/04/12 16:07:09 bluhm Exp $ */ 2/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4/* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71#include <sys/param.h> 72#include <sys/systm.h> 73#include <sys/mbuf.h> 74#include <sys/socket.h> 75#include <sys/socketvar.h> 76#include <sys/protosw.h> 77#include <sys/stat.h> 78#include <sys/sysctl.h> 79#include <sys/domain.h> 80#include <sys/kernel.h> 81#include <sys/pool.h> 82#include <sys/proc.h> 83 84#include <net/if.h> 85#include <net/if_var.h> 86#include <net/route.h> 87 88#include <netinet/in.h> 89#include <netinet/in_var.h> 90#include <netinet/ip.h> 91#include <netinet/in_pcb.h> 92#include <netinet/ip_var.h> 93#include <netinet6/ip6_var.h> 94#include <netinet/tcp.h> 95#include <netinet/tcp_fsm.h> 96#include <netinet/tcp_seq.h> 97#include <netinet/tcp_timer.h> 98#include <netinet/tcp_var.h> 99#include <netinet/tcp_debug.h> 100 101#ifdef INET6 102#include <netinet6/in6_var.h> 103#endif 104 105#ifndef TCP_SENDSPACE 106#define TCP_SENDSPACE 1024*16 107#endif 108u_int tcp_sendspace = TCP_SENDSPACE; 109#ifndef TCP_RECVSPACE 110#define TCP_RECVSPACE 1024*16 111#endif 112u_int tcp_recvspace = TCP_RECVSPACE; 113u_int tcp_autorcvbuf_inc = 16 * 1024; 114 115const struct pr_usrreqs tcp_usrreqs = { 116 .pru_attach = tcp_attach, 117 .pru_detach = tcp_detach, 118 .pru_bind = tcp_bind, 119 .pru_listen = tcp_listen, 120 .pru_connect = tcp_connect, 121 .pru_accept = tcp_accept, 122 .pru_disconnect = tcp_disconnect, 123 .pru_shutdown = tcp_shutdown, 124 .pru_rcvd = tcp_rcvd, 125 .pru_send = tcp_send, 126 .pru_abort = tcp_abort, 127 .pru_sense = tcp_sense, 128 .pru_rcvoob = tcp_rcvoob, 129 .pru_sendoob = tcp_sendoob, 130 .pru_control = in_control, 131 .pru_sockaddr = tcp_sockaddr, 132 .pru_peeraddr = tcp_peeraddr, 133}; 134 135#ifdef INET6 136const struct pr_usrreqs tcp6_usrreqs = { 137 .pru_attach = tcp_attach, 138 .pru_detach = tcp_detach, 139 .pru_bind = tcp_bind, 140 .pru_listen = tcp_listen, 141 .pru_connect = tcp_connect, 142 .pru_accept = tcp_accept, 143 .pru_disconnect = tcp_disconnect, 144 .pru_shutdown = tcp_shutdown, 145 .pru_rcvd = tcp_rcvd, 146 .pru_send = tcp_send, 147 .pru_abort = tcp_abort, 148 .pru_sense = tcp_sense, 149 .pru_rcvoob = tcp_rcvoob, 150 .pru_sendoob = tcp_sendoob, 151 .pru_control = in6_control, 152 .pru_sockaddr = tcp_sockaddr, 153 .pru_peeraddr = tcp_peeraddr, 154}; 155#endif 156 157const struct sysctl_bounded_args tcpctl_vars[] = { 158 { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, 159 { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, 160 { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, 161 { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 }, 162 { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 }, 163#ifdef TCP_ECN 164 { TCPCTL_ECN, &tcp_do_ecn, 0, 1 }, 165#endif 166 { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 }, 167 { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, 168 { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, 169 { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, 170 { TCPCTL_TSO, &tcp_do_tso, 0, 1 }, 171}; 172 173struct inpcbtable tcbtable; 174#ifdef INET6 175struct inpcbtable tcb6table; 176#endif 177 178int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *); 179int tcp_ident(void *, size_t *, void *, size_t, int); 180 181static inline int tcp_sogetpcb(struct socket *, struct inpcb **, 182 struct tcpcb **); 183 184static inline int 185tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb **rtp) 186{ 187 struct inpcb *inp; 188 struct tcpcb *tp; 189 190 /* 191 * When a TCP is attached to a socket, then there will be 192 * a (struct inpcb) pointed at by the socket, and this 193 * structure will point at a subsidiary (struct tcpcb). 194 */ 195 if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) { 196 if (so->so_error) 197 return so->so_error; 198 return EINVAL; 199 } 200 201 *rinp = inp; 202 *rtp = tp; 203 204 return 0; 205} 206 207/* 208 * Export internal TCP state information via a struct tcp_info without 209 * leaking any sensitive information. Sequence numbers are reported 210 * relative to the initial sequence number. 211 */ 212int 213tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m) 214{ 215 struct proc *p = curproc; 216 struct tcp_info *ti; 217 u_int t = 1000; /* msec => usec */ 218 uint64_t now; 219 220 if (sizeof(*ti) > MLEN) { 221 MCLGETL(m, M_WAITOK, sizeof(*ti)); 222 if (!ISSET(m->m_flags, M_EXT)) 223 return ENOMEM; 224 } 225 ti = mtod(m, struct tcp_info *); 226 m->m_len = sizeof(*ti); 227 memset(ti, 0, sizeof(*ti)); 228 now = tcp_now(); 229 230 ti->tcpi_state = tp->t_state; 231 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 232 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 233 if (tp->t_flags & TF_SACK_PERMIT) 234 ti->tcpi_options |= TCPI_OPT_SACK; 235 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 236 ti->tcpi_options |= TCPI_OPT_WSCALE; 237 ti->tcpi_snd_wscale = tp->snd_scale; 238 ti->tcpi_rcv_wscale = tp->rcv_scale; 239 } 240#ifdef TCP_ECN 241 if (tp->t_flags & TF_ECN_PERMIT) 242 ti->tcpi_options |= TCPI_OPT_ECN; 243#endif 244 245 ti->tcpi_rto = tp->t_rxtcur * t; 246 ti->tcpi_snd_mss = tp->t_maxseg; 247 ti->tcpi_rcv_mss = tp->t_peermss; 248 249 ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t; 250 ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t; 251 ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t; 252 ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t; 253 254 ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >> 255 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 256 ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >> 257 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT); 258 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 259 ti->tcpi_snd_cwnd = tp->snd_cwnd; 260 261 ti->tcpi_rcv_space = tp->rcv_wnd; 262 263 /* 264 * Provide only minimal information for unprivileged processes. 265 */ 266 if (suser(p) != 0) 267 return 0; 268 269 /* FreeBSD-specific extension fields for tcp_info. */ 270 ti->tcpi_snd_wnd = tp->snd_wnd; 271 ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss; 272 ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs; 273 /* missing tcpi_toe_tid */ 274 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 275 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 276 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 277 278 /* OpenBSD extensions */ 279 ti->tcpi_rttmin = tp->t_rttmin * t; 280 ti->tcpi_max_sndwnd = tp->max_sndwnd; 281 ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs; 282 ti->tcpi_rcv_up = tp->rcv_up - tp->irs; 283 ti->tcpi_snd_una = tp->snd_una - tp->iss; 284 ti->tcpi_snd_up = tp->snd_up - tp->iss; 285 ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss; 286 ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss; 287 ti->tcpi_snd_max = tp->snd_max - tp->iss; 288 289 ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */ 290 ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t; 291 ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt; 292 ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t; 293 294 ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc; 295 ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat; 296 ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; 297 ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; 298 ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; 299 ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; 300 ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; 301 ti->tcpi_so_snd_sb_wat = so->so_snd.sb_wat; 302 303 return 0; 304} 305 306int 307tcp_ctloutput(int op, struct socket *so, int level, int optname, 308 struct mbuf *m) 309{ 310 int error = 0; 311 struct inpcb *inp; 312 struct tcpcb *tp; 313 int i; 314 315 inp = sotoinpcb(so); 316 if (inp == NULL) 317 return (ECONNRESET); 318 if (level != IPPROTO_TCP) { 319#ifdef INET6 320 if (ISSET(inp->inp_flags, INP_IPV6)) 321 error = ip6_ctloutput(op, so, level, optname, m); 322 else 323#endif 324 error = ip_ctloutput(op, so, level, optname, m); 325 return (error); 326 } 327 tp = intotcpcb(inp); 328 329 switch (op) { 330 331 case PRCO_SETOPT: 332 switch (optname) { 333 334 case TCP_NODELAY: 335 if (m == NULL || m->m_len < sizeof (int)) 336 error = EINVAL; 337 else if (*mtod(m, int *)) 338 tp->t_flags |= TF_NODELAY; 339 else 340 tp->t_flags &= ~TF_NODELAY; 341 break; 342 343 case TCP_NOPUSH: 344 if (m == NULL || m->m_len < sizeof (int)) 345 error = EINVAL; 346 else if (*mtod(m, int *)) 347 tp->t_flags |= TF_NOPUSH; 348 else if (tp->t_flags & TF_NOPUSH) { 349 tp->t_flags &= ~TF_NOPUSH; 350 if (TCPS_HAVEESTABLISHED(tp->t_state)) 351 error = tcp_output(tp); 352 } 353 break; 354 355 case TCP_MAXSEG: 356 if (m == NULL || m->m_len < sizeof (int)) { 357 error = EINVAL; 358 break; 359 } 360 361 i = *mtod(m, int *); 362 if (i > 0 && i <= tp->t_maxseg) 363 tp->t_maxseg = i; 364 else 365 error = EINVAL; 366 break; 367 368 case TCP_SACK_ENABLE: 369 if (m == NULL || m->m_len < sizeof (int)) { 370 error = EINVAL; 371 break; 372 } 373 374 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 375 error = EPERM; 376 break; 377 } 378 379 if (tp->t_flags & TF_SIGNATURE) { 380 error = EPERM; 381 break; 382 } 383 384 if (*mtod(m, int *)) 385 tp->sack_enable = 1; 386 else 387 tp->sack_enable = 0; 388 break; 389#ifdef TCP_SIGNATURE 390 case TCP_MD5SIG: 391 if (m == NULL || m->m_len < sizeof (int)) { 392 error = EINVAL; 393 break; 394 } 395 396 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 397 error = EPERM; 398 break; 399 } 400 401 if (*mtod(m, int *)) { 402 tp->t_flags |= TF_SIGNATURE; 403 tp->sack_enable = 0; 404 } else 405 tp->t_flags &= ~TF_SIGNATURE; 406 break; 407#endif /* TCP_SIGNATURE */ 408 default: 409 error = ENOPROTOOPT; 410 break; 411 } 412 break; 413 414 case PRCO_GETOPT: 415 switch (optname) { 416 case TCP_NODELAY: 417 m->m_len = sizeof(int); 418 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 419 break; 420 case TCP_NOPUSH: 421 m->m_len = sizeof(int); 422 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 423 break; 424 case TCP_MAXSEG: 425 m->m_len = sizeof(int); 426 *mtod(m, int *) = tp->t_maxseg; 427 break; 428 case TCP_SACK_ENABLE: 429 m->m_len = sizeof(int); 430 *mtod(m, int *) = tp->sack_enable; 431 break; 432 case TCP_INFO: 433 error = tcp_fill_info(tp, so, m); 434 break; 435#ifdef TCP_SIGNATURE 436 case TCP_MD5SIG: 437 m->m_len = sizeof(int); 438 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 439 break; 440#endif 441 default: 442 error = ENOPROTOOPT; 443 break; 444 } 445 break; 446 } 447 return (error); 448} 449 450/* 451 * Attach TCP protocol to socket, allocating 452 * internet protocol control block, tcp control block, 453 * buffer space, and entering LISTEN state to accept connections. 454 */ 455int 456tcp_attach(struct socket *so, int proto, int wait) 457{ 458 struct inpcbtable *table; 459 struct tcpcb *tp; 460 struct inpcb *inp; 461 int error; 462 463 if (so->so_pcb) 464 return EISCONN; 465 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 466 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 467 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 468 error = soreserve(so, tcp_sendspace, tcp_recvspace); 469 if (error) 470 return (error); 471 } 472 473 NET_ASSERT_LOCKED(); 474#ifdef INET6 475 if (so->so_proto->pr_domain->dom_family == PF_INET6) 476 table = &tcb6table; 477 else 478#endif 479 table = &tcbtable; 480 error = in_pcballoc(so, table, wait); 481 if (error) 482 return (error); 483 inp = sotoinpcb(so); 484 tp = tcp_newtcpcb(inp, wait); 485 if (tp == NULL) { 486 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 487 488 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 489 in_pcbdetach(inp); 490 so->so_state |= nofd; 491 return (ENOBUFS); 492 } 493 tp->t_state = TCPS_CLOSED; 494#ifdef INET6 495 if (ISSET(inp->inp_flags, INP_IPV6)) 496 tp->pf = PF_INET6; 497 else 498#endif 499 tp->pf = PF_INET; 500 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 501 so->so_linger = TCP_LINGERTIME; 502 503 if (so->so_options & SO_DEBUG) 504 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0); 505 return (0); 506} 507 508int 509tcp_detach(struct socket *so) 510{ 511 struct inpcb *inp; 512 struct tcpcb *otp = NULL, *tp; 513 int error; 514 short ostate; 515 516 soassertlocked(so); 517 518 if ((error = tcp_sogetpcb(so, &inp, &tp))) 519 return (error); 520 521 if (so->so_options & SO_DEBUG) { 522 otp = tp; 523 ostate = tp->t_state; 524 } 525 526 /* 527 * Detach the TCP protocol from the socket. 528 * If the protocol state is non-embryonic, then can't 529 * do this directly: have to initiate a PRU_DISCONNECT, 530 * which may finish later; embryonic TCB's can just 531 * be discarded here. 532 */ 533 tp = tcp_dodisconnect(tp); 534 535 if (otp) 536 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0); 537 return (0); 538} 539 540/* 541 * Give the socket an address. 542 */ 543int 544tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p) 545{ 546 struct inpcb *inp; 547 struct tcpcb *tp; 548 int error; 549 short ostate; 550 551 soassertlocked(so); 552 553 if ((error = tcp_sogetpcb(so, &inp, &tp))) 554 return (error); 555 556 if (so->so_options & SO_DEBUG) 557 ostate = tp->t_state; 558 559 error = in_pcbbind(inp, nam, p); 560 561 if (so->so_options & SO_DEBUG) 562 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0); 563 return (error); 564} 565 566/* 567 * Prepare to accept connections. 568 */ 569int 570tcp_listen(struct socket *so) 571{ 572 struct inpcb *inp; 573 struct tcpcb *tp, *otp = NULL; 574 int error; 575 short ostate; 576 577 soassertlocked(so); 578 579 if ((error = tcp_sogetpcb(so, &inp, &tp))) 580 return (error); 581 582 if (so->so_options & SO_DEBUG) { 583 otp = tp; 584 ostate = tp->t_state; 585 } 586 587 if (inp->inp_lport == 0) 588 if ((error = in_pcbbind(inp, NULL, curproc))) 589 goto out; 590 591 /* 592 * If the in_pcbbind() above is called, the tp->pf 593 * should still be whatever it was before. 594 */ 595 tp->t_state = TCPS_LISTEN; 596 597out: 598 if (otp) 599 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0); 600 return (error); 601} 602 603/* 604 * Initiate connection to peer. 605 * Create a template for use in transmissions on this connection. 606 * Enter SYN_SENT state, and mark socket as connecting. 607 * Start keep-alive timer, and seed output sequence space. 608 * Send initial segment on connection. 609 */ 610int 611tcp_connect(struct socket *so, struct mbuf *nam) 612{ 613 struct inpcb *inp; 614 struct tcpcb *tp, *otp = NULL; 615 int error; 616 short ostate; 617 618 soassertlocked(so); 619 620 if ((error = tcp_sogetpcb(so, &inp, &tp))) 621 return (error); 622 623 if (so->so_options & SO_DEBUG) { 624 otp = tp; 625 ostate = tp->t_state; 626 } 627 628#ifdef INET6 629 if (ISSET(inp->inp_flags, INP_IPV6)) { 630 struct sockaddr_in6 *sin6; 631 632 if ((error = in6_nam2sin6(nam, &sin6))) 633 goto out; 634 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 635 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 636 error = EINVAL; 637 goto out; 638 } 639 } else 640#endif 641 { 642 struct sockaddr_in *sin; 643 644 if ((error = in_nam2sin(nam, &sin))) 645 goto out; 646 if ((sin->sin_addr.s_addr == INADDR_ANY) || 647 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 648 IN_MULTICAST(sin->sin_addr.s_addr) || 649 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 650 error = EINVAL; 651 goto out; 652 } 653 } 654 error = in_pcbconnect(inp, nam); 655 if (error) 656 goto out; 657 658 tp->t_template = tcp_template(tp); 659 if (tp->t_template == 0) { 660 in_pcbunset_faddr(inp); 661 in_pcbdisconnect(inp); 662 error = ENOBUFS; 663 goto out; 664 } 665 666 so->so_state |= SS_CONNECTOUT; 667 668 /* Compute window scaling to request. */ 669 tcp_rscale(tp, sb_max); 670 671 soisconnecting(so); 672 tcpstat_inc(tcps_connattempt); 673 tp->t_state = TCPS_SYN_SENT; 674 TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init); 675 tcp_set_iss_tsm(tp); 676 tcp_sendseqinit(tp); 677 tp->snd_last = tp->snd_una; 678 error = tcp_output(tp); 679 680out: 681 if (otp) 682 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0); 683 return (error); 684} 685 686/* 687 * Accept a connection. Essentially all the work is done at higher 688 * levels; just return the address of the peer, storing through addr. 689 */ 690int 691tcp_accept(struct socket *so, struct mbuf *nam) 692{ 693 struct inpcb *inp; 694 struct tcpcb *tp; 695 int error; 696 697 soassertlocked(so); 698 699 if ((error = tcp_sogetpcb(so, &inp, &tp))) 700 return (error); 701 702 in_setpeeraddr(inp, nam); 703 704 if (so->so_options & SO_DEBUG) 705 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0); 706 return (0); 707} 708 709/* 710 * Initiate disconnect from peer. 711 * If connection never passed embryonic stage, just drop; 712 * else if don't need to let data drain, then can just drop anyways, 713 * else have to begin TCP shutdown process: mark socket disconnecting, 714 * drain unread data, state switch to reflect user close, and 715 * send segment (e.g. FIN) to peer. Socket will be really disconnected 716 * when peer sends FIN and acks ours. 717 * 718 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 719 */ 720int 721tcp_disconnect(struct socket *so) 722{ 723 struct inpcb *inp; 724 struct tcpcb *tp, *otp = NULL; 725 int error; 726 short ostate; 727 728 soassertlocked(so); 729 730 if ((error = tcp_sogetpcb(so, &inp, &tp))) 731 return (error); 732 733 if (so->so_options & SO_DEBUG) { 734 otp = tp; 735 ostate = tp->t_state; 736 } 737 738 tp = tcp_dodisconnect(tp); 739 740 if (otp) 741 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0); 742 return (0); 743} 744 745/* 746 * Mark the connection as being incapable of further output. 747 */ 748int 749tcp_shutdown(struct socket *so) 750{ 751 struct inpcb *inp; 752 struct tcpcb *tp, *otp = NULL; 753 int error; 754 short ostate; 755 756 soassertlocked(so); 757 758 if ((error = tcp_sogetpcb(so, &inp, &tp))) 759 return (error); 760 761 if (so->so_options & SO_DEBUG) { 762 otp = tp; 763 ostate = tp->t_state; 764 } 765 766 if (so->so_snd.sb_state & SS_CANTSENDMORE) 767 goto out; 768 769 socantsendmore(so); 770 tp = tcp_usrclosed(tp); 771 if (tp) 772 error = tcp_output(tp); 773 774out: 775 if (otp) 776 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0); 777 return (error); 778} 779 780/* 781 * After a receive, possibly send window update to peer. 782 */ 783void 784tcp_rcvd(struct socket *so) 785{ 786 struct inpcb *inp; 787 struct tcpcb *tp; 788 short ostate; 789 790 soassertlocked(so); 791 792 if (tcp_sogetpcb(so, &inp, &tp)) 793 return; 794 795 if (so->so_options & SO_DEBUG) 796 ostate = tp->t_state; 797 798 /* 799 * soreceive() calls this function when a user receives 800 * ancillary data on a listening socket. We don't call 801 * tcp_output in such a case, since there is no header 802 * template for a listening socket and hence the kernel 803 * will panic. 804 */ 805 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 806 (void) tcp_output(tp); 807 808 if (so->so_options & SO_DEBUG) 809 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0); 810} 811 812/* 813 * Do a send by putting data in output queue and updating urgent 814 * marker if URG set. Possibly send more data. 815 */ 816int 817tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 818 struct mbuf *control) 819{ 820 struct inpcb *inp; 821 struct tcpcb *tp; 822 int error; 823 short ostate; 824 825 soassertlocked(so); 826 827 if (control && control->m_len) { 828 error = EINVAL; 829 goto out; 830 } 831 832 if ((error = tcp_sogetpcb(so, &inp, &tp))) 833 goto out; 834 835 if (so->so_options & SO_DEBUG) 836 ostate = tp->t_state; 837 838 sbappendstream(so, &so->so_snd, m); 839 m = NULL; 840 841 error = tcp_output(tp); 842 843 if (so->so_options & SO_DEBUG) 844 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0); 845 846out: 847 m_freem(control); 848 m_freem(m); 849 850 return (error); 851} 852 853/* 854 * Abort the TCP. 855 */ 856void 857tcp_abort(struct socket *so) 858{ 859 struct inpcb *inp; 860 struct tcpcb *tp, *otp = NULL; 861 short ostate; 862 863 soassertlocked(so); 864 865 if (tcp_sogetpcb(so, &inp, &tp)) 866 return; 867 868 if (so->so_options & SO_DEBUG) { 869 otp = tp; 870 ostate = tp->t_state; 871 } 872 873 tp = tcp_drop(tp, ECONNABORTED); 874 875 if (otp) 876 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0); 877} 878 879int 880tcp_sense(struct socket *so, struct stat *ub) 881{ 882 struct inpcb *inp; 883 struct tcpcb *tp; 884 int error; 885 886 soassertlocked(so); 887 888 if ((error = tcp_sogetpcb(so, &inp, &tp))) 889 return (error); 890 891 ub->st_blksize = so->so_snd.sb_hiwat; 892 893 if (so->so_options & SO_DEBUG) 894 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); 895 return (0); 896} 897 898int 899tcp_rcvoob(struct socket *so, struct mbuf *m, int flags) 900{ 901 struct inpcb *inp; 902 struct tcpcb *tp; 903 int error; 904 905 soassertlocked(so); 906 907 if ((error = tcp_sogetpcb(so, &inp, &tp))) 908 return (error); 909 910 if ((so->so_oobmark == 0 && 911 (so->so_rcv.sb_state & SS_RCVATMARK) == 0) || 912 so->so_options & SO_OOBINLINE || 913 tp->t_oobflags & TCPOOB_HADDATA) { 914 error = EINVAL; 915 goto out; 916 } 917 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 918 error = EWOULDBLOCK; 919 goto out; 920 } 921 m->m_len = 1; 922 *mtod(m, caddr_t) = tp->t_iobc; 923 if ((flags & MSG_PEEK) == 0) 924 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 925out: 926 if (so->so_options & SO_DEBUG) 927 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0); 928 return (error); 929} 930 931int 932tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam, 933 struct mbuf *control) 934{ 935 struct inpcb *inp; 936 struct tcpcb *tp; 937 int error; 938 short ostate; 939 940 soassertlocked(so); 941 942 if (control && control->m_len) { 943 error = EINVAL; 944 goto release; 945 } 946 947 if ((error = tcp_sogetpcb(so, &inp, &tp))) 948 goto release; 949 950 if (so->so_options & SO_DEBUG) 951 ostate = tp->t_state; 952 953 if (sbspace(so, &so->so_snd) < -512) { 954 error = ENOBUFS; 955 goto out; 956 } 957 958 /* 959 * According to RFC961 (Assigned Protocols), 960 * the urgent pointer points to the last octet 961 * of urgent data. We continue, however, 962 * to consider it to indicate the first octet 963 * of data past the urgent section. 964 * Otherwise, snd_up should be one lower. 965 */ 966 sbappendstream(so, &so->so_snd, m); 967 m = NULL; 968 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 969 tp->t_force = 1; 970 error = tcp_output(tp); 971 tp->t_force = 0; 972 973out: 974 if (so->so_options & SO_DEBUG) 975 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0); 976 977release: 978 m_freem(control); 979 m_freem(m); 980 981 return (error); 982} 983 984int 985tcp_sockaddr(struct socket *so, struct mbuf *nam) 986{ 987 struct inpcb *inp; 988 struct tcpcb *tp; 989 int error; 990 991 soassertlocked(so); 992 993 if ((error = tcp_sogetpcb(so, &inp, &tp))) 994 return (error); 995 996 in_setsockaddr(inp, nam); 997 998 if (so->so_options & SO_DEBUG) 999 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, 1000 PRU_SOCKADDR, 0); 1001 return (0); 1002} 1003 1004int 1005tcp_peeraddr(struct socket *so, struct mbuf *nam) 1006{ 1007 struct inpcb *inp; 1008 struct tcpcb *tp; 1009 int error; 1010 1011 soassertlocked(so); 1012 1013 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1014 return (error); 1015 1016 in_setpeeraddr(inp, nam); 1017 1018 if (so->so_options & SO_DEBUG) 1019 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0); 1020 return (0); 1021} 1022 1023/* 1024 * Initiate (or continue) disconnect. 1025 * If embryonic state, just send reset (once). 1026 * If in ``let data drain'' option and linger null, just drop. 1027 * Otherwise (hard), mark socket disconnecting and drop 1028 * current input data; switch states based on user close, and 1029 * send segment to peer (with FIN). 1030 */ 1031struct tcpcb * 1032tcp_dodisconnect(struct tcpcb *tp) 1033{ 1034 struct socket *so = tp->t_inpcb->inp_socket; 1035 1036 if (TCPS_HAVEESTABLISHED(tp->t_state) == 0) 1037 tp = tcp_close(tp); 1038 else if ((so->so_options & SO_LINGER) && so->so_linger == 0) 1039 tp = tcp_drop(tp, 0); 1040 else { 1041 soisdisconnecting(so); 1042 sbflush(so, &so->so_rcv); 1043 tp = tcp_usrclosed(tp); 1044 if (tp) 1045 (void) tcp_output(tp); 1046 } 1047 return (tp); 1048} 1049 1050/* 1051 * User issued close, and wish to trail through shutdown states: 1052 * if never received SYN, just forget it. If got a SYN from peer, 1053 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN. 1054 * If already got a FIN from peer, then almost done; go to LAST_ACK 1055 * state. In all other cases, have already sent FIN to peer (e.g. 1056 * after PRU_SHUTDOWN), and just have to play tedious game waiting 1057 * for peer to send FIN or not respond to keep-alives, etc. 1058 * We can let the user exit from the close as soon as the FIN is acked. 1059 */ 1060struct tcpcb * 1061tcp_usrclosed(struct tcpcb *tp) 1062{ 1063 1064 switch (tp->t_state) { 1065 1066 case TCPS_CLOSED: 1067 case TCPS_LISTEN: 1068 case TCPS_SYN_SENT: 1069 tp->t_state = TCPS_CLOSED; 1070 tp = tcp_close(tp); 1071 break; 1072 1073 case TCPS_SYN_RECEIVED: 1074 case TCPS_ESTABLISHED: 1075 tp->t_state = TCPS_FIN_WAIT_1; 1076 break; 1077 1078 case TCPS_CLOSE_WAIT: 1079 tp->t_state = TCPS_LAST_ACK; 1080 break; 1081 } 1082 if (tp && tp->t_state >= TCPS_FIN_WAIT_2) { 1083 soisdisconnected(tp->t_inpcb->inp_socket); 1084 /* 1085 * If we are in FIN_WAIT_2, we arrived here because the 1086 * application did a shutdown of the send side. Like the 1087 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after 1088 * a full close, we start a timer to make sure sockets are 1089 * not left in FIN_WAIT_2 forever. 1090 */ 1091 if (tp->t_state == TCPS_FIN_WAIT_2) 1092 TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle); 1093 } 1094 return (tp); 1095} 1096 1097/* 1098 * Look up a socket for ident or tcpdrop, ... 1099 */ 1100int 1101tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop) 1102{ 1103 int error = 0; 1104 struct tcp_ident_mapping tir; 1105 struct inpcb *inp; 1106 struct tcpcb *tp = NULL; 1107 struct sockaddr_in *fin, *lin; 1108#ifdef INET6 1109 struct sockaddr_in6 *fin6, *lin6; 1110 struct in6_addr f6, l6; 1111#endif 1112 1113 NET_ASSERT_LOCKED(); 1114 1115 if (dodrop) { 1116 if (oldp != NULL || *oldlenp != 0) 1117 return (EINVAL); 1118 if (newp == NULL) 1119 return (EPERM); 1120 if (newlen < sizeof(tir)) 1121 return (ENOMEM); 1122 if ((error = copyin(newp, &tir, sizeof (tir))) != 0 ) 1123 return (error); 1124 } else { 1125 if (oldp == NULL) 1126 return (EINVAL); 1127 if (*oldlenp < sizeof(tir)) 1128 return (ENOMEM); 1129 if (newp != NULL || newlen != 0) 1130 return (EINVAL); 1131 if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 ) 1132 return (error); 1133 } 1134 switch (tir.faddr.ss_family) { 1135#ifdef INET6 1136 case AF_INET6: 1137 fin6 = (struct sockaddr_in6 *)&tir.faddr; 1138 error = in6_embedscope(&f6, fin6, NULL, NULL); 1139 if (error) 1140 return EINVAL; /*?*/ 1141 lin6 = (struct sockaddr_in6 *)&tir.laddr; 1142 error = in6_embedscope(&l6, lin6, NULL, NULL); 1143 if (error) 1144 return EINVAL; /*?*/ 1145 break; 1146#endif 1147 case AF_INET: 1148 fin = (struct sockaddr_in *)&tir.faddr; 1149 lin = (struct sockaddr_in *)&tir.laddr; 1150 break; 1151 default: 1152 return (EINVAL); 1153 } 1154 1155 switch (tir.faddr.ss_family) { 1156#ifdef INET6 1157 case AF_INET6: 1158 inp = in6_pcblookup(&tcb6table, &f6, 1159 fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain); 1160 break; 1161#endif 1162 case AF_INET: 1163 inp = in_pcblookup(&tcbtable, fin->sin_addr, 1164 fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain); 1165 break; 1166 default: 1167 unhandled_af(tir.faddr.ss_family); 1168 } 1169 1170 if (dodrop) { 1171 if (inp && (tp = intotcpcb(inp)) && 1172 ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) 1173 tp = tcp_drop(tp, ECONNABORTED); 1174 else 1175 error = ESRCH; 1176 in_pcbunref(inp); 1177 return (error); 1178 } 1179 1180 if (inp == NULL) { 1181 tcpstat_inc(tcps_pcbhashmiss); 1182 switch (tir.faddr.ss_family) { 1183#ifdef INET6 1184 case AF_INET6: 1185 inp = in6_pcblookup_listen(&tcb6table, 1186 &l6, lin6->sin6_port, NULL, tir.rdomain); 1187 break; 1188#endif 1189 case AF_INET: 1190 inp = in_pcblookup_listen(&tcbtable, 1191 lin->sin_addr, lin->sin_port, NULL, tir.rdomain); 1192 break; 1193 } 1194 } 1195 1196 if (inp != NULL && (inp->inp_socket->so_state & SS_CONNECTOUT)) { 1197 tir.ruid = inp->inp_socket->so_ruid; 1198 tir.euid = inp->inp_socket->so_euid; 1199 } else { 1200 tir.ruid = -1; 1201 tir.euid = -1; 1202 } 1203 1204 *oldlenp = sizeof (tir); 1205 error = copyout((void *)&tir, oldp, sizeof (tir)); 1206 in_pcbunref(inp); 1207 return (error); 1208} 1209 1210int 1211tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp) 1212{ 1213 uint64_t counters[tcps_ncounters]; 1214 struct tcpstat tcpstat; 1215 struct syn_cache_set *set; 1216 int i = 0; 1217 1218#define ASSIGN(field) do { tcpstat.field = counters[i++]; } while (0) 1219 1220 memset(&tcpstat, 0, sizeof tcpstat); 1221 counters_read(tcpcounters, counters, nitems(counters), NULL); 1222 ASSIGN(tcps_connattempt); 1223 ASSIGN(tcps_accepts); 1224 ASSIGN(tcps_connects); 1225 ASSIGN(tcps_drops); 1226 ASSIGN(tcps_conndrops); 1227 ASSIGN(tcps_closed); 1228 ASSIGN(tcps_segstimed); 1229 ASSIGN(tcps_rttupdated); 1230 ASSIGN(tcps_delack); 1231 ASSIGN(tcps_timeoutdrop); 1232 ASSIGN(tcps_rexmttimeo); 1233 ASSIGN(tcps_persisttimeo); 1234 ASSIGN(tcps_persistdrop); 1235 ASSIGN(tcps_keeptimeo); 1236 ASSIGN(tcps_keepprobe); 1237 ASSIGN(tcps_keepdrops); 1238 ASSIGN(tcps_sndtotal); 1239 ASSIGN(tcps_sndpack); 1240 ASSIGN(tcps_sndbyte); 1241 ASSIGN(tcps_sndrexmitpack); 1242 ASSIGN(tcps_sndrexmitbyte); 1243 ASSIGN(tcps_sndrexmitfast); 1244 ASSIGN(tcps_sndacks); 1245 ASSIGN(tcps_sndprobe); 1246 ASSIGN(tcps_sndurg); 1247 ASSIGN(tcps_sndwinup); 1248 ASSIGN(tcps_sndctrl); 1249 ASSIGN(tcps_rcvtotal); 1250 ASSIGN(tcps_rcvpack); 1251 ASSIGN(tcps_rcvbyte); 1252 ASSIGN(tcps_rcvbadsum); 1253 ASSIGN(tcps_rcvbadoff); 1254 ASSIGN(tcps_rcvmemdrop); 1255 ASSIGN(tcps_rcvnosec); 1256 ASSIGN(tcps_rcvshort); 1257 ASSIGN(tcps_rcvduppack); 1258 ASSIGN(tcps_rcvdupbyte); 1259 ASSIGN(tcps_rcvpartduppack); 1260 ASSIGN(tcps_rcvpartdupbyte); 1261 ASSIGN(tcps_rcvoopack); 1262 ASSIGN(tcps_rcvoobyte); 1263 ASSIGN(tcps_rcvpackafterwin); 1264 ASSIGN(tcps_rcvbyteafterwin); 1265 ASSIGN(tcps_rcvafterclose); 1266 ASSIGN(tcps_rcvwinprobe); 1267 ASSIGN(tcps_rcvdupack); 1268 ASSIGN(tcps_rcvacktoomuch); 1269 ASSIGN(tcps_rcvacktooold); 1270 ASSIGN(tcps_rcvackpack); 1271 ASSIGN(tcps_rcvackbyte); 1272 ASSIGN(tcps_rcvwinupd); 1273 ASSIGN(tcps_pawsdrop); 1274 ASSIGN(tcps_predack); 1275 ASSIGN(tcps_preddat); 1276 ASSIGN(tcps_pcbhashmiss); 1277 ASSIGN(tcps_noport); 1278 ASSIGN(tcps_badsyn); 1279 ASSIGN(tcps_dropsyn); 1280 ASSIGN(tcps_rcvbadsig); 1281 ASSIGN(tcps_rcvgoodsig); 1282 ASSIGN(tcps_inswcsum); 1283 ASSIGN(tcps_outswcsum); 1284 ASSIGN(tcps_ecn_accepts); 1285 ASSIGN(tcps_ecn_rcvece); 1286 ASSIGN(tcps_ecn_rcvcwr); 1287 ASSIGN(tcps_ecn_rcvce); 1288 ASSIGN(tcps_ecn_sndect); 1289 ASSIGN(tcps_ecn_sndece); 1290 ASSIGN(tcps_ecn_sndcwr); 1291 ASSIGN(tcps_cwr_ecn); 1292 ASSIGN(tcps_cwr_frecovery); 1293 ASSIGN(tcps_cwr_timeout); 1294 ASSIGN(tcps_sc_added); 1295 ASSIGN(tcps_sc_completed); 1296 ASSIGN(tcps_sc_timed_out); 1297 ASSIGN(tcps_sc_overflowed); 1298 ASSIGN(tcps_sc_reset); 1299 ASSIGN(tcps_sc_unreach); 1300 ASSIGN(tcps_sc_bucketoverflow); 1301 ASSIGN(tcps_sc_aborted); 1302 ASSIGN(tcps_sc_dupesyn); 1303 ASSIGN(tcps_sc_dropped); 1304 ASSIGN(tcps_sc_collisions); 1305 ASSIGN(tcps_sc_retransmitted); 1306 ASSIGN(tcps_sc_seedrandom); 1307 ASSIGN(tcps_sc_hash_size); 1308 ASSIGN(tcps_sc_entry_count); 1309 ASSIGN(tcps_sc_entry_limit); 1310 ASSIGN(tcps_sc_bucket_maxlen); 1311 ASSIGN(tcps_sc_bucket_limit); 1312 ASSIGN(tcps_sc_uses_left); 1313 ASSIGN(tcps_conndrained); 1314 ASSIGN(tcps_sack_recovery_episode); 1315 ASSIGN(tcps_sack_rexmits); 1316 ASSIGN(tcps_sack_rexmit_bytes); 1317 ASSIGN(tcps_sack_rcv_opts); 1318 ASSIGN(tcps_sack_snd_opts); 1319 ASSIGN(tcps_sack_drop_opts); 1320 ASSIGN(tcps_outswtso); 1321 ASSIGN(tcps_outhwtso); 1322 ASSIGN(tcps_outpkttso); 1323 ASSIGN(tcps_outbadtso); 1324 ASSIGN(tcps_inswlro); 1325 ASSIGN(tcps_inhwlro); 1326 ASSIGN(tcps_inpktlro); 1327 ASSIGN(tcps_inbadlro); 1328 1329#undef ASSIGN 1330 1331 mtx_enter(&syn_cache_mtx); 1332 set = &tcp_syn_cache[tcp_syn_cache_active]; 1333 tcpstat.tcps_sc_hash_size = set->scs_size; 1334 tcpstat.tcps_sc_entry_count = set->scs_count; 1335 tcpstat.tcps_sc_entry_limit = tcp_syn_cache_limit; 1336 tcpstat.tcps_sc_bucket_maxlen = 0; 1337 for (i = 0; i < set->scs_size; i++) { 1338 if (tcpstat.tcps_sc_bucket_maxlen < 1339 set->scs_buckethead[i].sch_length) 1340 tcpstat.tcps_sc_bucket_maxlen = 1341 set->scs_buckethead[i].sch_length; 1342 } 1343 tcpstat.tcps_sc_bucket_limit = tcp_syn_bucket_limit; 1344 tcpstat.tcps_sc_uses_left = set->scs_use; 1345 mtx_leave(&syn_cache_mtx); 1346 1347 return (sysctl_rdstruct(oldp, oldlenp, newp, 1348 &tcpstat, sizeof(tcpstat))); 1349} 1350 1351/* 1352 * Sysctl for tcp variables. 1353 */ 1354int 1355tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, 1356 size_t newlen) 1357{ 1358 int error, nval; 1359 1360 /* All sysctl names at this level are terminal. */ 1361 if (namelen != 1) 1362 return (ENOTDIR); 1363 1364 switch (name[0]) { 1365 case TCPCTL_KEEPINITTIME: 1366 NET_LOCK(); 1367 nval = tcptv_keep_init / TCP_TIME(1); 1368 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1369 1, 3 * (TCPTV_KEEP_INIT / TCP_TIME(1))); 1370 if (!error) 1371 tcptv_keep_init = TCP_TIME(nval); 1372 NET_UNLOCK(); 1373 return (error); 1374 1375 case TCPCTL_KEEPIDLE: 1376 NET_LOCK(); 1377 nval = tcp_keepidle / TCP_TIME(1); 1378 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1379 1, 5 * (TCPTV_KEEP_IDLE / TCP_TIME(1))); 1380 if (!error) 1381 tcp_keepidle = TCP_TIME(nval); 1382 NET_UNLOCK(); 1383 return (error); 1384 1385 case TCPCTL_KEEPINTVL: 1386 NET_LOCK(); 1387 nval = tcp_keepintvl / TCP_TIME(1); 1388 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &nval, 1389 1, 3 * (TCPTV_KEEPINTVL / TCP_TIME(1))); 1390 if (!error) 1391 tcp_keepintvl = TCP_TIME(nval); 1392 NET_UNLOCK(); 1393 return (error); 1394 1395 case TCPCTL_BADDYNAMIC: 1396 NET_LOCK(); 1397 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1398 baddynamicports.tcp, sizeof(baddynamicports.tcp)); 1399 NET_UNLOCK(); 1400 return (error); 1401 1402 case TCPCTL_ROOTONLY: 1403 if (newp && securelevel > 0) 1404 return (EPERM); 1405 NET_LOCK(); 1406 error = sysctl_struct(oldp, oldlenp, newp, newlen, 1407 rootonlyports.tcp, sizeof(rootonlyports.tcp)); 1408 NET_UNLOCK(); 1409 return (error); 1410 1411 case TCPCTL_IDENT: 1412 NET_LOCK(); 1413 error = tcp_ident(oldp, oldlenp, newp, newlen, 0); 1414 NET_UNLOCK(); 1415 return (error); 1416 1417 case TCPCTL_DROP: 1418 NET_LOCK(); 1419 error = tcp_ident(oldp, oldlenp, newp, newlen, 1); 1420 NET_UNLOCK(); 1421 return (error); 1422 1423 case TCPCTL_REASS_LIMIT: 1424 NET_LOCK(); 1425 nval = tcp_reass_limit; 1426 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1427 if (!error && nval != tcp_reass_limit) { 1428 error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); 1429 if (!error) 1430 tcp_reass_limit = nval; 1431 } 1432 NET_UNLOCK(); 1433 return (error); 1434 1435 case TCPCTL_SACKHOLE_LIMIT: 1436 NET_LOCK(); 1437 nval = tcp_sackhole_limit; 1438 error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); 1439 if (!error && nval != tcp_sackhole_limit) { 1440 error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); 1441 if (!error) 1442 tcp_sackhole_limit = nval; 1443 } 1444 NET_UNLOCK(); 1445 return (error); 1446 1447 case TCPCTL_STATS: 1448 return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); 1449 1450 case TCPCTL_SYN_USE_LIMIT: 1451 NET_LOCK(); 1452 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1453 &tcp_syn_use_limit, 0, INT_MAX); 1454 if (!error && newp != NULL) { 1455 /* 1456 * Global tcp_syn_use_limit is used when reseeding a 1457 * new cache. Also update the value in active cache. 1458 */ 1459 mtx_enter(&syn_cache_mtx); 1460 if (tcp_syn_cache[0].scs_use > tcp_syn_use_limit) 1461 tcp_syn_cache[0].scs_use = tcp_syn_use_limit; 1462 if (tcp_syn_cache[1].scs_use > tcp_syn_use_limit) 1463 tcp_syn_cache[1].scs_use = tcp_syn_use_limit; 1464 mtx_leave(&syn_cache_mtx); 1465 } 1466 NET_UNLOCK(); 1467 return (error); 1468 1469 case TCPCTL_SYN_HASH_SIZE: 1470 NET_LOCK(); 1471 nval = tcp_syn_hash_size; 1472 error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, 1473 &nval, 1, 100000); 1474 if (!error && nval != tcp_syn_hash_size) { 1475 /* 1476 * If global hash size has been changed, 1477 * switch sets as soon as possible. Then 1478 * the actual hash array will be reallocated. 1479 */ 1480 mtx_enter(&syn_cache_mtx); 1481 if (tcp_syn_cache[0].scs_size != nval) 1482 tcp_syn_cache[0].scs_use = 0; 1483 if (tcp_syn_cache[1].scs_size != nval) 1484 tcp_syn_cache[1].scs_use = 0; 1485 tcp_syn_hash_size = nval; 1486 mtx_leave(&syn_cache_mtx); 1487 } 1488 NET_UNLOCK(); 1489 return (error); 1490 1491 default: 1492 NET_LOCK(); 1493 error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars), 1494 name, namelen, oldp, oldlenp, newp, newlen); 1495 NET_UNLOCK(); 1496 return (error); 1497 } 1498 /* NOTREACHED */ 1499} 1500 1501/* 1502 * Scale the send buffer so that inflight data is not accounted against 1503 * the limit. The buffer will scale with the congestion window, if the 1504 * the receiver stops acking data the window will shrink and therefore 1505 * the buffer size will shrink as well. 1506 * In low memory situation try to shrink the buffer to the initial size 1507 * disabling the send buffer scaling as long as the situation persists. 1508 */ 1509void 1510tcp_update_sndspace(struct tcpcb *tp) 1511{ 1512 struct socket *so = tp->t_inpcb->inp_socket; 1513 u_long nmax = so->so_snd.sb_hiwat; 1514 1515 if (sbchecklowmem()) { 1516 /* low on memory try to get rid of some */ 1517 if (tcp_sendspace < nmax) 1518 nmax = tcp_sendspace; 1519 } else if (so->so_snd.sb_wat != tcp_sendspace) 1520 /* user requested buffer size, auto-scaling disabled */ 1521 nmax = so->so_snd.sb_wat; 1522 else 1523 /* automatic buffer scaling */ 1524 nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max - 1525 tp->snd_una); 1526 1527 /* a writable socket must be preserved because of poll(2) semantics */ 1528 if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) { 1529 if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat) 1530 nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat; 1531 /* keep in sync with sbreserve() calculation */ 1532 if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat) 1533 nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8; 1534 } 1535 1536 /* round to MSS boundary */ 1537 nmax = roundup(nmax, tp->t_maxseg); 1538 1539 if (nmax != so->so_snd.sb_hiwat) 1540 sbreserve(so, &so->so_snd, nmax); 1541} 1542 1543/* 1544 * Scale the recv buffer by looking at how much data was transferred in 1545 * one approximated RTT. If more than a big part of the recv buffer was 1546 * transferred during that time we increase the buffer by a constant. 1547 * In low memory situation try to shrink the buffer to the initial size. 1548 */ 1549void 1550tcp_update_rcvspace(struct tcpcb *tp) 1551{ 1552 struct socket *so = tp->t_inpcb->inp_socket; 1553 u_long nmax = so->so_rcv.sb_hiwat; 1554 1555 if (sbchecklowmem()) { 1556 /* low on memory try to get rid of some */ 1557 if (tcp_recvspace < nmax) 1558 nmax = tcp_recvspace; 1559 } else if (so->so_rcv.sb_wat != tcp_recvspace) 1560 /* user requested buffer size, auto-scaling disabled */ 1561 nmax = so->so_rcv.sb_wat; 1562 else { 1563 /* automatic buffer scaling */ 1564 if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7) 1565 nmax = MIN(sb_max, so->so_rcv.sb_hiwat + 1566 tcp_autorcvbuf_inc); 1567 } 1568 1569 /* a readable socket must be preserved because of poll(2) semantics */ 1570 if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat && 1571 nmax < so->so_snd.sb_lowat) 1572 nmax = so->so_snd.sb_lowat; 1573 1574 if (nmax == so->so_rcv.sb_hiwat) 1575 return; 1576 1577 /* round to MSS boundary */ 1578 nmax = roundup(nmax, tp->t_maxseg); 1579 sbreserve(so, &so->so_rcv, nmax); 1580} 1581