186764Sjlemon/*- 2141063Srwatson * Copyright (c) 2001 McAfee, Inc. 3253210Sandre * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG 486764Sjlemon * All rights reserved. 586764Sjlemon * 686764Sjlemon * This software was developed for the FreeBSD Project by Jonathan Lemon 7141063Srwatson * and McAfee Research, the Security Research Division of McAfee, Inc. under 8141063Srwatson * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the 9253210Sandre * DARPA CHATS research program. [2001 McAfee, Inc.] 1086764Sjlemon * 1186764Sjlemon * Redistribution and use in source and binary forms, with or without 1286764Sjlemon * modification, are permitted provided that the following conditions 1386764Sjlemon * are met: 1486764Sjlemon * 1. Redistributions of source code must retain the above copyright 1586764Sjlemon * notice, this list of conditions and the following disclaimer. 1686764Sjlemon * 2. Redistributions in binary form must reproduce the above copyright 1786764Sjlemon * notice, this list of conditions and the following disclaimer in the 1886764Sjlemon * documentation and/or other materials provided with the distribution. 1986764Sjlemon * 2086764Sjlemon * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 2186764Sjlemon * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2286764Sjlemon * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2386764Sjlemon * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 2486764Sjlemon * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2586764Sjlemon * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2686764Sjlemon * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2786764Sjlemon * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2886764Sjlemon * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2986764Sjlemon * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3086764Sjlemon * SUCH DAMAGE. 3186764Sjlemon */ 3286764Sjlemon 33172467Ssilby#include <sys/cdefs.h> 34172467Ssilby__FBSDID("$FreeBSD: releng/10.2/sys/netinet/tcp_syncache.c 284603 2015-06-19 19:36:21Z hiren $"); 35172467Ssilby 36125680Sbms#include "opt_inet.h" 3786764Sjlemon#include "opt_inet6.h" 3886764Sjlemon#include "opt_ipsec.h" 39222748Srwatson#include "opt_pcbgroup.h" 4086764Sjlemon 4186764Sjlemon#include <sys/param.h> 4286764Sjlemon#include <sys/systm.h> 4386764Sjlemon#include <sys/kernel.h> 4486764Sjlemon#include <sys/sysctl.h> 45174775Sru#include <sys/limits.h> 46159695Sandre#include <sys/lock.h> 47159695Sandre#include <sys/mutex.h> 4886764Sjlemon#include <sys/malloc.h> 4986764Sjlemon#include <sys/mbuf.h> 5086764Sjlemon#include <sys/proc.h> /* for proc0 declaration */ 5186764Sjlemon#include <sys/random.h> 5286764Sjlemon#include <sys/socket.h> 5386764Sjlemon#include <sys/socketvar.h> 54169685Sandre#include <sys/syslog.h> 55182056Sbz#include <sys/ucred.h> 5686764Sjlemon 57253210Sandre#include <sys/md5.h> 58253210Sandre#include <crypto/siphash/siphash.h> 59253210Sandre 60162278Sandre#include <vm/uma.h> 61162278Sandre 6286764Sjlemon#include <net/if.h> 6386764Sjlemon#include <net/route.h> 64195699Srwatson#include <net/vnet.h> 6586764Sjlemon 6686764Sjlemon#include <netinet/in.h> 6786764Sjlemon#include <netinet/in_systm.h> 6886764Sjlemon#include <netinet/ip.h> 
6986764Sjlemon#include <netinet/in_var.h> 7086764Sjlemon#include <netinet/in_pcb.h> 7186764Sjlemon#include <netinet/ip_var.h> 72152592Sandre#include <netinet/ip_options.h> 7386764Sjlemon#ifdef INET6 7486764Sjlemon#include <netinet/ip6.h> 7586764Sjlemon#include <netinet/icmp6.h> 7686764Sjlemon#include <netinet6/nd6.h> 7786764Sjlemon#include <netinet6/ip6_var.h> 7886764Sjlemon#include <netinet6/in6_pcb.h> 7986764Sjlemon#endif 8086764Sjlemon#include <netinet/tcp.h> 8186764Sjlemon#include <netinet/tcp_fsm.h> 8286764Sjlemon#include <netinet/tcp_seq.h> 8386764Sjlemon#include <netinet/tcp_timer.h> 8486764Sjlemon#include <netinet/tcp_var.h> 85171605Ssilby#include <netinet/tcp_syncache.h> 8686764Sjlemon#ifdef INET6 8786764Sjlemon#include <netinet6/tcp6_var.h> 8886764Sjlemon#endif 89237263Snp#ifdef TCP_OFFLOAD 90237263Snp#include <netinet/toecore.h> 91237263Snp#endif 9286764Sjlemon 93171167Sgnn#ifdef IPSEC 94105199Ssam#include <netipsec/ipsec.h> 95105199Ssam#ifdef INET6 96105199Ssam#include <netipsec/ipsec6.h> 97105199Ssam#endif 98105199Ssam#include <netipsec/key.h> 99171167Sgnn#endif /*IPSEC*/ 100105199Ssam 10186764Sjlemon#include <machine/in_cksum.h> 10286764Sjlemon 103163606Srwatson#include <security/mac/mac_framework.h> 104163606Srwatson 105215701Sdimstatic VNET_DEFINE(int, tcp_syncookies) = 1; 106195727Srwatson#define V_tcp_syncookies VNET(tcp_syncookies) 107195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW, 108195699Srwatson &VNET_NAME(tcp_syncookies), 0, 10988180Sjlemon "Use TCP SYN cookies if the syncache overflows"); 11088180Sjlemon 111215701Sdimstatic VNET_DEFINE(int, tcp_syncookiesonly) = 0; 112207369Sbz#define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) 113195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW, 114195699Srwatson &VNET_NAME(tcp_syncookiesonly), 0, 115162277Sandre "Use only TCP SYN cookies"); 116162277Sandre 117237263Snp#ifdef TCP_OFFLOAD 118237263Snp#define ADDED_BY_TOE(sc) ((sc)->sc_tod != 
NULL) 119174704Skmacy#endif 120174704Skmacy 12186764Sjlemonstatic void syncache_drop(struct syncache *, struct syncache_head *); 12286764Sjlemonstatic void syncache_free(struct syncache *); 12388180Sjlemonstatic void syncache_insert(struct syncache *, struct syncache_head *); 124168900Sandrestatic int syncache_respond(struct syncache *); 125133874Srwatsonstatic struct socket *syncache_socket(struct syncache *, struct socket *, 12696602Srwatson struct mbuf *m); 127242254Sandrestatic int syncache_sysctl_count(SYSCTL_HANDLER_ARGS); 128171639Sandrestatic void syncache_timeout(struct syncache *sc, struct syncache_head *sch, 129171639Sandre int docallout); 13086764Sjlemonstatic void syncache_timer(void *); 131253210Sandre 132253210Sandrestatic uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, 133253210Sandre uint8_t *, uintptr_t); 134253210Sandrestatic tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); 135159697Sandrestatic struct syncache 136162277Sandre *syncookie_lookup(struct in_conninfo *, struct syncache_head *, 137253210Sandre struct syncache *, struct tcphdr *, struct tcpopt *, 138159697Sandre struct socket *); 139253210Sandrestatic void syncookie_reseed(void *); 140253210Sandre#ifdef INVARIANTS 141253210Sandrestatic int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, 142253210Sandre struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 143253210Sandre struct socket *lso); 144253210Sandre#endif 14586764Sjlemon 14686764Sjlemon/* 14786764Sjlemon * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. 148174775Sru * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, 14986764Sjlemon * the odds are that the user has given up attempting to connect by then. 
15086764Sjlemon */ 15186764Sjlemon#define SYNCACHE_MAXREXMTS 3 15286764Sjlemon 15386764Sjlemon/* Arbitrary values */ 15486764Sjlemon#define TCP_SYNCACHE_HASHSIZE 512 15586764Sjlemon#define TCP_SYNCACHE_BUCKETLIMIT 30 15686764Sjlemon 157215701Sdimstatic VNET_DEFINE(struct tcp_syncache, tcp_syncache); 158207369Sbz#define V_tcp_syncache VNET(tcp_syncache) 159207369Sbz 160227309Sedstatic SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, 161227309Sed "TCP SYN cache"); 16286764Sjlemon 163217322SmdfSYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, 164195699Srwatson &VNET_NAME(tcp_syncache.bucket_limit), 0, 165195699Srwatson "Per-bucket hash limit for syncache"); 16686764Sjlemon 167217322SmdfSYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, 168195699Srwatson &VNET_NAME(tcp_syncache.cache_limit), 0, 169195699Srwatson "Overall entry limit for syncache"); 17086764Sjlemon 171242254SandreSYSCTL_VNET_PROC(_net_inet_tcp_syncache, OID_AUTO, count, (CTLTYPE_UINT|CTLFLAG_RD), 172242254Sandre NULL, 0, &syncache_sysctl_count, "IU", 173195699Srwatson "Current number of entries in syncache"); 17486764Sjlemon 175217322SmdfSYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN, 176195699Srwatson &VNET_NAME(tcp_syncache.hashsize), 0, 177195699Srwatson "Size of TCP syncache hashtable"); 17886764Sjlemon 179217322SmdfSYSCTL_VNET_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW, 180195699Srwatson &VNET_NAME(tcp_syncache.rexmt_limit), 0, 181195699Srwatson "Limit on SYN/ACK retransmissions"); 18286764Sjlemon 183207369SbzVNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; 184195699SrwatsonSYSCTL_VNET_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, 185195699Srwatson CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, 186195699Srwatson "Send reset on socket allocation failure"); 187170055Sandre 18886764Sjlemonstatic MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); 18986764Sjlemon 
190133874Srwatson#define SYNCACHE_HASH(inc, mask) \ 191181803Sbz ((V_tcp_syncache.hash_secret ^ \ 19286764Sjlemon (inc)->inc_faddr.s_addr ^ \ 193133874Srwatson ((inc)->inc_faddr.s_addr >> 16) ^ \ 19486764Sjlemon (inc)->inc_fport ^ (inc)->inc_lport) & mask) 19586764Sjlemon 196133874Srwatson#define SYNCACHE_HASH6(inc, mask) \ 197181803Sbz ((V_tcp_syncache.hash_secret ^ \ 198133874Srwatson (inc)->inc6_faddr.s6_addr32[0] ^ \ 199133874Srwatson (inc)->inc6_faddr.s6_addr32[3] ^ \ 20086764Sjlemon (inc)->inc_fport ^ (inc)->inc_lport) & mask) 20186764Sjlemon 20286764Sjlemon#define ENDPTS_EQ(a, b) ( \ 20389667Sjlemon (a)->ie_fport == (b)->ie_fport && \ 20486764Sjlemon (a)->ie_lport == (b)->ie_lport && \ 20586764Sjlemon (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \ 20686764Sjlemon (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \ 20786764Sjlemon) 20886764Sjlemon 20986764Sjlemon#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0) 21086764Sjlemon 211159695Sandre#define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) 212159695Sandre#define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) 213159695Sandre#define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) 214159695Sandre 215159695Sandre/* 216159695Sandre * Requires the syncache entry to be already removed from the bucket list. 
217159695Sandre */ 21886764Sjlemonstatic void 21986764Sjlemonsyncache_free(struct syncache *sc) 22086764Sjlemon{ 221183550Szec 22286764Sjlemon if (sc->sc_ipopts) 22386764Sjlemon (void) m_free(sc->sc_ipopts); 224182056Sbz if (sc->sc_cred) 225182056Sbz crfree(sc->sc_cred); 226165149Scsjp#ifdef MAC 227172970Srwatson mac_syncache_destroy(&sc->sc_label); 228165149Scsjp#endif 229122922Sandre 230181803Sbz uma_zfree(V_tcp_syncache.zone, sc); 23186764Sjlemon} 23286764Sjlemon 23386764Sjlemonvoid 23486764Sjlemonsyncache_init(void) 23586764Sjlemon{ 23686764Sjlemon int i; 23786764Sjlemon 238181803Sbz V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; 239181803Sbz V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; 240181803Sbz V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; 241181803Sbz V_tcp_syncache.hash_secret = arc4random(); 24286764Sjlemon 243133874Srwatson TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", 244181803Sbz &V_tcp_syncache.hashsize); 245133874Srwatson TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", 246181803Sbz &V_tcp_syncache.bucket_limit); 247181887Sjulian if (!powerof2(V_tcp_syncache.hashsize) || 248181887Sjulian V_tcp_syncache.hashsize == 0) { 249133874Srwatson printf("WARNING: syncache hash size is not a power of 2.\n"); 250181803Sbz V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; 251133874Srwatson } 252181803Sbz V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; 25386764Sjlemon 254159695Sandre /* Set limits. */ 255181803Sbz V_tcp_syncache.cache_limit = 256181803Sbz V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; 257159695Sandre TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", 258181803Sbz &V_tcp_syncache.cache_limit); 259159695Sandre 26086764Sjlemon /* Allocate the hash table. 
*/ 261184214Sdes V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * 262184214Sdes sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); 26386764Sjlemon 264253210Sandre#ifdef VIMAGE 265253210Sandre V_tcp_syncache.vnet = curvnet; 266253210Sandre#endif 267253210Sandre 26886764Sjlemon /* Initialize the hash buckets. */ 269181803Sbz for (i = 0; i < V_tcp_syncache.hashsize; i++) { 270181803Sbz TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); 271181803Sbz mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", 272159695Sandre NULL, MTX_DEF); 273181803Sbz callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, 274181803Sbz &V_tcp_syncache.hashbase[i].sch_mtx, 0); 275181803Sbz V_tcp_syncache.hashbase[i].sch_length = 0; 276253210Sandre V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; 27786764Sjlemon } 27886764Sjlemon 279159695Sandre /* Create the syncache entry zone. */ 280181803Sbz V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), 281159695Sandre NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 282246208Sandre V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, 283246208Sandre V_tcp_syncache.cache_limit); 284253210Sandre 285253210Sandre /* Start the SYN cookie reseeder callout. */ 286253210Sandre callout_init(&V_tcp_syncache.secret.reseed, 1); 287253210Sandre arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); 288253210Sandre arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); 289253210Sandre callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, 290253210Sandre syncookie_reseed, &V_tcp_syncache); 29186764Sjlemon} 29286764Sjlemon 293193731Szec#ifdef VIMAGE 294193731Szecvoid 295193731Szecsyncache_destroy(void) 296193731Szec{ 297204143Sbz struct syncache_head *sch; 298204143Sbz struct syncache *sc, *nsc; 299204143Sbz int i; 300193731Szec 301204143Sbz /* Cleanup hash buckets: stop timers, free entries, destroy locks. 
*/ 302204143Sbz for (i = 0; i < V_tcp_syncache.hashsize; i++) { 303193731Szec 304204143Sbz sch = &V_tcp_syncache.hashbase[i]; 305204143Sbz callout_drain(&sch->sch_timer); 306204143Sbz 307204143Sbz SCH_LOCK(sch); 308204143Sbz TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) 309204143Sbz syncache_drop(sc, sch); 310204143Sbz SCH_UNLOCK(sch); 311204143Sbz KASSERT(TAILQ_EMPTY(&sch->sch_bucket), 312204143Sbz ("%s: sch->sch_bucket not empty", __func__)); 313204143Sbz KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", 314204143Sbz __func__, sch->sch_length)); 315204143Sbz mtx_destroy(&sch->sch_mtx); 316204143Sbz } 317204143Sbz 318242254Sandre KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, 319242254Sandre ("%s: cache_count not 0", __func__)); 320204143Sbz 321204143Sbz /* Free the allocated global resources. */ 322193731Szec uma_zdestroy(V_tcp_syncache.zone); 323204143Sbz free(V_tcp_syncache.hashbase, M_SYNCACHE); 324253210Sandre 325253210Sandre callout_drain(&V_tcp_syncache.secret.reseed); 326193731Szec} 327193731Szec#endif 328193731Szec 329242254Sandrestatic int 330242254Sandresyncache_sysctl_count(SYSCTL_HANDLER_ARGS) 331242254Sandre{ 332242254Sandre int count; 333242254Sandre 334242254Sandre count = uma_zone_get_cur(V_tcp_syncache.zone); 335244680Sglebius return (sysctl_handle_int(oidp, &count, 0, req)); 336242254Sandre} 337242254Sandre 338159695Sandre/* 339159695Sandre * Inserts a syncache entry into the specified bucket row. 340159695Sandre * Locks and unlocks the syncache_head autonomously. 341159695Sandre */ 34288180Sjlemonstatic void 343159697Sandresyncache_insert(struct syncache *sc, struct syncache_head *sch) 34486764Sjlemon{ 34586764Sjlemon struct syncache *sc2; 34686764Sjlemon 347159695Sandre SCH_LOCK(sch); 348122496Ssam 34986764Sjlemon /* 350159695Sandre * Make sure that we don't overflow the per-bucket limit. 351159695Sandre * If the bucket is full, toss the oldest element. 
35286764Sjlemon */ 353181803Sbz if (sch->sch_length >= V_tcp_syncache.bucket_limit) { 354159695Sandre KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), 355159695Sandre ("sch->sch_length incorrect")); 356159695Sandre sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); 35786764Sjlemon syncache_drop(sc2, sch); 358190948Srwatson TCPSTAT_INC(tcps_sc_bucketoverflow); 35986764Sjlemon } 36086764Sjlemon 36186764Sjlemon /* Put it into the bucket. */ 362159695Sandre TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); 36386764Sjlemon sch->sch_length++; 364159695Sandre 365237263Snp#ifdef TCP_OFFLOAD 366237263Snp if (ADDED_BY_TOE(sc)) { 367237263Snp struct toedev *tod = sc->sc_tod; 368237263Snp 369237263Snp tod->tod_syncache_added(tod, sc->sc_todctx); 370237263Snp } 371237263Snp#endif 372237263Snp 373159695Sandre /* Reinitialize the bucket row's timer. */ 374174775Sru if (sch->sch_length == 1) 375174775Sru sch->sch_nextc = ticks + INT_MAX; 376171639Sandre syncache_timeout(sc, sch, 1); 377159695Sandre 378159695Sandre SCH_UNLOCK(sch); 379159695Sandre 380190948Srwatson TCPSTAT_INC(tcps_sc_added); 38186764Sjlemon} 38286764Sjlemon 383159695Sandre/* 384159695Sandre * Remove and free entry from syncache bucket row. 385159695Sandre * Expects locked syncache head. 386159695Sandre */ 38786764Sjlemonstatic void 388159697Sandresyncache_drop(struct syncache *sc, struct syncache_head *sch) 38986764Sjlemon{ 39086764Sjlemon 391159695Sandre SCH_LOCK_ASSERT(sch); 39286764Sjlemon 39386764Sjlemon TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); 39486764Sjlemon sch->sch_length--; 39586764Sjlemon 396237263Snp#ifdef TCP_OFFLOAD 397237263Snp if (ADDED_BY_TOE(sc)) { 398237263Snp struct toedev *tod = sc->sc_tod; 399237263Snp 400237263Snp tod->tod_syncache_removed(tod, sc->sc_todctx); 401237263Snp } 402237263Snp#endif 403237263Snp 40486764Sjlemon syncache_free(sc); 40586764Sjlemon} 40686764Sjlemon 40786764Sjlemon/* 408171639Sandre * Engage/reengage time on bucket row. 
409171639Sandre */ 410171639Sandrestatic void 411171639Sandresyncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) 412171639Sandre{ 413171639Sandre sc->sc_rxttime = ticks + 414242261Sandre TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); 415171639Sandre sc->sc_rxmits++; 416174775Sru if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { 417171639Sandre sch->sch_nextc = sc->sc_rxttime; 418174775Sru if (docallout) 419174775Sru callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, 420174775Sru syncache_timer, (void *)sch); 421174775Sru } 422171639Sandre} 423171639Sandre 424171639Sandre/* 42586764Sjlemon * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. 42686764Sjlemon * If we have retransmitted an entry the maximum number of times, expire it. 427159695Sandre * One separate timer for each bucket row. 42886764Sjlemon */ 42986764Sjlemonstatic void 430159697Sandresyncache_timer(void *xsch) 43186764Sjlemon{ 432159695Sandre struct syncache_head *sch = (struct syncache_head *)xsch; 43386764Sjlemon struct syncache *sc, *nsc; 434159695Sandre int tick = ticks; 435169685Sandre char *s; 43686764Sjlemon 437253210Sandre CURVNET_SET(sch->sch_sc->vnet); 438185348Szec 439159695Sandre /* NB: syncache_head has already been locked by the callout. */ 440159695Sandre SCH_LOCK_ASSERT(sch); 44186764Sjlemon 442174775Sru /* 443174775Sru * In the following cycle we may remove some entries and/or 444174775Sru * advance some timeouts, so re-initialize the bucket timer. 445174775Sru */ 446174775Sru sch->sch_nextc = tick + INT_MAX; 447174775Sru 448159695Sandre TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { 449159695Sandre /* 450159695Sandre * We do not check if the listen socket still exists 451159695Sandre * and accept the case where the listen socket may be 452159695Sandre * gone by the time we resend the SYN/ACK. We do 453159695Sandre * not expect this to happens often. 
If it does, 454159695Sandre * then the RST will be sent by the time the remote 455159695Sandre * host does the SYN/ACK->ACK. 456159695Sandre */ 457174775Sru if (TSTMP_GT(sc->sc_rxttime, tick)) { 458174775Sru if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) 459159695Sandre sch->sch_nextc = sc->sc_rxttime; 460159695Sandre continue; 461159695Sandre } 462181803Sbz if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { 463169685Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 464171639Sandre log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " 465171639Sandre "giving up and removing syncache entry\n", 466169685Sandre s, __func__); 467169685Sandre free(s, M_TCPLOG); 468169685Sandre } 469159695Sandre syncache_drop(sc, sch); 470190948Srwatson TCPSTAT_INC(tcps_sc_stale); 47186764Sjlemon continue; 47286764Sjlemon } 473171639Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 474171639Sandre log(LOG_DEBUG, "%s; %s: Response timeout, " 475171639Sandre "retransmitting (%u) SYN|ACK\n", 476171639Sandre s, __func__, sc->sc_rxmits); 477171639Sandre free(s, M_TCPLOG); 478171639Sandre } 479159695Sandre 480168900Sandre (void) syncache_respond(sc); 481190948Srwatson TCPSTAT_INC(tcps_sc_retransmitted); 482171639Sandre syncache_timeout(sc, sch, 0); 48386764Sjlemon } 484159695Sandre if (!TAILQ_EMPTY(&(sch)->sch_bucket)) 485159695Sandre callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, 486159695Sandre syncache_timer, (void *)(sch)); 487185348Szec CURVNET_RESTORE(); 48886764Sjlemon} 48986764Sjlemon 49086764Sjlemon/* 49186764Sjlemon * Find an entry in the syncache. 492159695Sandre * Returns always with locked syncache_head plus a matching entry or NULL. 
49386764Sjlemon */ 494270055Sbzstatic struct syncache * 495159697Sandresyncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) 49686764Sjlemon{ 49786764Sjlemon struct syncache *sc; 49886764Sjlemon struct syncache_head *sch; 49986764Sjlemon 50086764Sjlemon#ifdef INET6 501186222Sbz if (inc->inc_flags & INC_ISIPV6) { 502181803Sbz sch = &V_tcp_syncache.hashbase[ 503181803Sbz SYNCACHE_HASH6(inc, V_tcp_syncache.hashmask)]; 50486764Sjlemon *schp = sch; 505159695Sandre 506159695Sandre SCH_LOCK(sch); 507159695Sandre 508159695Sandre /* Circle through bucket row to find matching entry. */ 50986764Sjlemon TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { 510122496Ssam if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) 51186764Sjlemon return (sc); 51286764Sjlemon } 51386764Sjlemon } else 51486764Sjlemon#endif 51586764Sjlemon { 516181803Sbz sch = &V_tcp_syncache.hashbase[ 517181803Sbz SYNCACHE_HASH(inc, V_tcp_syncache.hashmask)]; 51886764Sjlemon *schp = sch; 519159695Sandre 520159695Sandre SCH_LOCK(sch); 521159695Sandre 522159695Sandre /* Circle through bucket row to find matching entry. */ 52386764Sjlemon TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { 52486764Sjlemon#ifdef INET6 525186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) 52686764Sjlemon continue; 52786764Sjlemon#endif 528122496Ssam if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) 52986764Sjlemon return (sc); 53086764Sjlemon } 53186764Sjlemon } 532159695Sandre SCH_LOCK_ASSERT(*schp); 533159695Sandre return (NULL); /* always returns with locked sch */ 53486764Sjlemon} 53586764Sjlemon 53686764Sjlemon/* 53786764Sjlemon * This function is called when we get a RST for a 53886764Sjlemon * non-existent connection, so that we can see if the 53986764Sjlemon * connection is in the syn cache. If it is, zap it. 
54086764Sjlemon */ 54186764Sjlemonvoid 542159697Sandresyncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) 54386764Sjlemon{ 54486764Sjlemon struct syncache *sc; 54586764Sjlemon struct syncache_head *sch; 546171638Sandre char *s = NULL; 54786764Sjlemon 548159695Sandre sc = syncache_lookup(inc, &sch); /* returns locked sch */ 549159695Sandre SCH_LOCK_ASSERT(sch); 550171638Sandre 551171638Sandre /* 552171638Sandre * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. 553171638Sandre * See RFC 793 page 65, section SEGMENT ARRIVES. 554171638Sandre */ 555171638Sandre if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { 556171638Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 557171638Sandre log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " 558171638Sandre "FIN flag set, segment ignored\n", s, __func__); 559190948Srwatson TCPSTAT_INC(tcps_badrst); 560159695Sandre goto done; 561171638Sandre } 562122496Ssam 56386764Sjlemon /* 564171638Sandre * No corresponding connection was found in syncache. 565171638Sandre * If syncookies are enabled and possibly exclusively 566171638Sandre * used, or we are under memory pressure, a valid RST 567171638Sandre * may not find a syncache entry. In that case we're 568171638Sandre * done and no SYN|ACK retransmissions will happen. 569218909Sbrucec * Otherwise the RST was misdirected or spoofed. 570171638Sandre */ 571171638Sandre if (sc == NULL) { 572171638Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 573171638Sandre log(LOG_DEBUG, "%s; %s: Spurious RST without matching " 574171638Sandre "syncache entry (possibly syncookie only), " 575171638Sandre "segment ignored\n", s, __func__); 576190948Srwatson TCPSTAT_INC(tcps_badrst); 577171638Sandre goto done; 578171638Sandre } 579171638Sandre 580171638Sandre /* 58186764Sjlemon * If the RST bit is set, check the sequence number to see 58286764Sjlemon * if this is a valid reset segment. 
58386764Sjlemon * RFC 793 page 37: 58486764Sjlemon * In all states except SYN-SENT, all reset (RST) segments 58586764Sjlemon * are validated by checking their SEQ-fields. A reset is 58686764Sjlemon * valid if its sequence number is in the window. 58786764Sjlemon * 58886764Sjlemon * The sequence number in the reset segment is normally an 58986764Sjlemon * echo of our outgoing acknowlegement numbers, but some hosts 59086764Sjlemon * send a reset with the sequence number at the rightmost edge 59186764Sjlemon * of our receive window, and we have to handle this case. 59286764Sjlemon */ 59386764Sjlemon if (SEQ_GEQ(th->th_seq, sc->sc_irs) && 59486764Sjlemon SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { 59586764Sjlemon syncache_drop(sc, sch); 596171638Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 597171638Sandre log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " 598171638Sandre "connection attempt aborted by remote endpoint\n", 599171638Sandre s, __func__); 600190948Srwatson TCPSTAT_INC(tcps_sc_reset); 601178862Sjhb } else { 602178862Sjhb if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 603178862Sjhb log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " 604178862Sjhb "IRS %u (+WND %u), segment ignored\n", 605178862Sjhb s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); 606190948Srwatson TCPSTAT_INC(tcps_badrst); 60786764Sjlemon } 608171638Sandre 609159695Sandredone: 610171638Sandre if (s != NULL) 611171638Sandre free(s, M_TCPLOG); 612159695Sandre SCH_UNLOCK(sch); 61386764Sjlemon} 61486764Sjlemon 61586764Sjlemonvoid 616159697Sandresyncache_badack(struct in_conninfo *inc) 61786764Sjlemon{ 61886764Sjlemon struct syncache *sc; 61986764Sjlemon struct syncache_head *sch; 62086764Sjlemon 621159695Sandre sc = syncache_lookup(inc, &sch); /* returns locked sch */ 622159695Sandre SCH_LOCK_ASSERT(sch); 62386764Sjlemon if (sc != NULL) { 62486764Sjlemon syncache_drop(sc, sch); 625190948Srwatson TCPSTAT_INC(tcps_sc_badack); 62686764Sjlemon } 627159695Sandre SCH_UNLOCK(sch); 
62886764Sjlemon} 62986764Sjlemon 63086764Sjlemonvoid 631159697Sandresyncache_unreach(struct in_conninfo *inc, struct tcphdr *th) 63286764Sjlemon{ 63386764Sjlemon struct syncache *sc; 63486764Sjlemon struct syncache_head *sch; 63586764Sjlemon 636159695Sandre sc = syncache_lookup(inc, &sch); /* returns locked sch */ 637159695Sandre SCH_LOCK_ASSERT(sch); 63886764Sjlemon if (sc == NULL) 639159695Sandre goto done; 64086764Sjlemon 64186764Sjlemon /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ 64286764Sjlemon if (ntohl(th->th_seq) != sc->sc_iss) 643159695Sandre goto done; 64486764Sjlemon 64586764Sjlemon /* 64686764Sjlemon * If we've rertransmitted 3 times and this is our second error, 64786764Sjlemon * we remove the entry. Otherwise, we allow it to continue on. 64886764Sjlemon * This prevents us from incorrectly nuking an entry during a 64986764Sjlemon * spurious network outage. 65086764Sjlemon * 65186764Sjlemon * See tcp_notify(). 65286764Sjlemon */ 653159695Sandre if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { 65486764Sjlemon sc->sc_flags |= SCF_UNREACH; 655159695Sandre goto done; 65686764Sjlemon } 65786764Sjlemon syncache_drop(sc, sch); 658190948Srwatson TCPSTAT_INC(tcps_sc_unreach); 659159695Sandredone: 660159695Sandre SCH_UNLOCK(sch); 66186764Sjlemon} 66286764Sjlemon 66386764Sjlemon/* 66486764Sjlemon * Build a new TCP socket structure from a syncache entry. 
66586764Sjlemon */ 66686764Sjlemonstatic struct socket * 667159697Sandresyncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) 66886764Sjlemon{ 66986764Sjlemon struct inpcb *inp = NULL; 67086764Sjlemon struct socket *so; 67186764Sjlemon struct tcpcb *tp; 672211332Sandre int error; 673169685Sandre char *s; 67486764Sjlemon 675181803Sbz INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 676122496Ssam 67786764Sjlemon /* 67886764Sjlemon * Ok, create the full blown connection, and set things up 67986764Sjlemon * as they would have been set up if we had created the 68086764Sjlemon * connection when the SYN arrived. If we can't create 68186764Sjlemon * the connection, abort it. 68286764Sjlemon */ 68386764Sjlemon so = sonewconn(lso, SS_ISCONNECTED); 68486764Sjlemon if (so == NULL) { 68586764Sjlemon /* 686169685Sandre * Drop the connection; we will either send a RST or 687169685Sandre * have the peer retransmit its SYN again after its 688169685Sandre * RTO and try again. 68986764Sjlemon */ 690190948Srwatson TCPSTAT_INC(tcps_listendrop); 691169685Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 692169685Sandre log(LOG_DEBUG, "%s; %s: Socket create failed " 693169685Sandre "due to limits or memory shortage\n", 694169685Sandre s, __func__); 695169685Sandre free(s, M_TCPLOG); 696169685Sandre } 697122496Ssam goto abort2; 69886764Sjlemon } 699101106Srwatson#ifdef MAC 700172930Srwatson mac_socketpeer_set_from_mbuf(m, so); 701101106Srwatson#endif 70286764Sjlemon 70386764Sjlemon inp = sotoinpcb(so); 704195922Sjulian inp->inp_inc.inc_fibnum = so->so_fibnum; 705178285Srwatson INP_WLOCK(inp); 706222488Srwatson INP_HASH_WLOCK(&V_tcbinfo); 70786764Sjlemon 708159695Sandre /* Insert new socket into PCB hash list. 
*/ 709186222Sbz inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; 71086764Sjlemon#ifdef INET6 711186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) { 71286764Sjlemon inp->in6p_laddr = sc->sc_inc.inc6_laddr; 71386764Sjlemon } else { 71486764Sjlemon inp->inp_vflag &= ~INP_IPV6; 71586764Sjlemon inp->inp_vflag |= INP_IPV4; 71686764Sjlemon#endif 71786764Sjlemon inp->inp_laddr = sc->sc_inc.inc_laddr; 71886764Sjlemon#ifdef INET6 71986764Sjlemon } 72086764Sjlemon#endif 721222748Srwatson 722222748Srwatson /* 723261705Sadrian * If there's an mbuf and it has a flowid, then let's initialise the 724261705Sadrian * inp with that particular flowid. 725261705Sadrian */ 726281955Shiren if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { 727261705Sadrian inp->inp_flowid = m->m_pkthdr.flowid; 728284603Shiren inp->inp_flowtype = M_HASHTYPE_GET(m); 729261705Sadrian } 730261705Sadrian 731261705Sadrian /* 732222748Srwatson * Install in the reservation hash table for now, but don't yet 733222748Srwatson * install a connection group since the full 4-tuple isn't yet 734222748Srwatson * configured. 735222748Srwatson */ 73686764Sjlemon inp->inp_lport = sc->sc_inc.inc_lport; 737222748Srwatson if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { 73886764Sjlemon /* 73986764Sjlemon * Undo the assignments above if we failed to 74086764Sjlemon * put the PCB on the hash lists. 
74186764Sjlemon */ 74286764Sjlemon#ifdef INET6 743186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) 74486764Sjlemon inp->in6p_laddr = in6addr_any; 745133874Srwatson else 74686764Sjlemon#endif 74786764Sjlemon inp->inp_laddr.s_addr = INADDR_ANY; 74886764Sjlemon inp->inp_lport = 0; 749211327Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 750211327Sandre log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " 751211327Sandre "with error %i\n", 752211327Sandre s, __func__, error); 753211327Sandre free(s, M_TCPLOG); 754211327Sandre } 755222488Srwatson INP_HASH_WUNLOCK(&V_tcbinfo); 75686764Sjlemon goto abort; 75786764Sjlemon } 758171167Sgnn#ifdef IPSEC 759159697Sandre /* Copy old policy into new socket's. */ 76086764Sjlemon if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) 761159950Sandre printf("syncache_socket: could not copy policy\n"); 76286764Sjlemon#endif 76386764Sjlemon#ifdef INET6 764186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) { 76586764Sjlemon struct inpcb *oinp = sotoinpcb(lso); 76686764Sjlemon struct in6_addr laddr6; 767124847Sandre struct sockaddr_in6 sin6; 76886764Sjlemon /* 76986764Sjlemon * Inherit socket options from the listening socket. 77086764Sjlemon * Note that in6p_inputopts are not (and should not be) 77186764Sjlemon * copied, since it stores previously received options and is 77286764Sjlemon * used to detect if each new option is different than the 77386764Sjlemon * previous one and hence should be passed to a user. 774133874Srwatson * If we copied in6p_inputopts, a user would not be able to 77586764Sjlemon * receive options just after calling the accept system call. 
77686764Sjlemon */ 77786764Sjlemon inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; 77886764Sjlemon if (oinp->in6p_outputopts) 77986764Sjlemon inp->in6p_outputopts = 78086764Sjlemon ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); 78186764Sjlemon 782124847Sandre sin6.sin6_family = AF_INET6; 783124847Sandre sin6.sin6_len = sizeof(sin6); 784124847Sandre sin6.sin6_addr = sc->sc_inc.inc6_faddr; 785124847Sandre sin6.sin6_port = sc->sc_inc.inc_fport; 786124847Sandre sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; 78786764Sjlemon laddr6 = inp->in6p_laddr; 78886764Sjlemon if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) 78986764Sjlemon inp->in6p_laddr = sc->sc_inc.inc6_laddr; 790222691Srwatson if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, 791222691Srwatson thread0.td_ucred, m)) != 0) { 79286764Sjlemon inp->in6p_laddr = laddr6; 793211327Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 794211327Sandre log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " 795211327Sandre "with error %i\n", 796211327Sandre s, __func__, error); 797211327Sandre free(s, M_TCPLOG); 798211327Sandre } 799222488Srwatson INP_HASH_WUNLOCK(&V_tcbinfo); 80086764Sjlemon goto abort; 80186764Sjlemon } 802132307Sdwmalone /* Override flowlabel from in6_pcbconnect. */ 803186141Sbz inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; 804186141Sbz inp->inp_flow |= sc->sc_flowlabel; 805221250Sbz } 806221250Sbz#endif /* INET6 */ 807221250Sbz#if defined(INET) && defined(INET6) 808221250Sbz else 80986764Sjlemon#endif 810221250Sbz#ifdef INET 81186764Sjlemon { 81286764Sjlemon struct in_addr laddr; 813124847Sandre struct sockaddr_in sin; 81486764Sjlemon 815174704Skmacy inp->inp_options = (m) ? 
ip_srcroute(m) : NULL; 816174704Skmacy 81786764Sjlemon if (inp->inp_options == NULL) { 81886764Sjlemon inp->inp_options = sc->sc_ipopts; 81986764Sjlemon sc->sc_ipopts = NULL; 82086764Sjlemon } 82186764Sjlemon 822124847Sandre sin.sin_family = AF_INET; 823124847Sandre sin.sin_len = sizeof(sin); 824124847Sandre sin.sin_addr = sc->sc_inc.inc_faddr; 825124847Sandre sin.sin_port = sc->sc_inc.inc_fport; 826124847Sandre bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); 82786764Sjlemon laddr = inp->inp_laddr; 82886764Sjlemon if (inp->inp_laddr.s_addr == INADDR_ANY) 82986764Sjlemon inp->inp_laddr = sc->sc_inc.inc_laddr; 830222691Srwatson if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, 831222691Srwatson thread0.td_ucred, m)) != 0) { 83286764Sjlemon inp->inp_laddr = laddr; 833211327Sandre if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { 834211327Sandre log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " 835211327Sandre "with error %i\n", 836211327Sandre s, __func__, error); 837211327Sandre free(s, M_TCPLOG); 838211327Sandre } 839222488Srwatson INP_HASH_WUNLOCK(&V_tcbinfo); 84086764Sjlemon goto abort; 84186764Sjlemon } 84286764Sjlemon } 843221250Sbz#endif /* INET */ 844222488Srwatson INP_HASH_WUNLOCK(&V_tcbinfo); 84586764Sjlemon tp = intotcpcb(inp); 846254889Smarkj tcp_state_change(tp, TCPS_SYN_RECEIVED); 84786764Sjlemon tp->iss = sc->sc_iss; 84886764Sjlemon tp->irs = sc->sc_irs; 84986764Sjlemon tcp_rcvseqinit(tp); 85086764Sjlemon tcp_sendseqinit(tp); 85186764Sjlemon tp->snd_wl1 = sc->sc_irs; 852168368Sandre tp->snd_max = tp->iss + 1; 853168368Sandre tp->snd_nxt = tp->iss + 1; 85486764Sjlemon tp->rcv_up = sc->sc_irs + 1; 85586764Sjlemon tp->rcv_wnd = sc->sc_wnd; 85686764Sjlemon tp->rcv_adv += tp->rcv_wnd; 857168368Sandre tp->last_ack_sent = tp->rcv_nxt; 85886764Sjlemon 85990982Sjlemon tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); 86086764Sjlemon if (sc->sc_flags & SCF_NOOPT) 86186764Sjlemon tp->t_flags |= TF_NOOPT; 862159950Sandre else { 
863159950Sandre if (sc->sc_flags & SCF_WINSCALE) { 864159950Sandre tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; 865159950Sandre tp->snd_scale = sc->sc_requested_s_scale; 866159950Sandre tp->request_r_scale = sc->sc_requested_r_scale; 867159950Sandre } 868159950Sandre if (sc->sc_flags & SCF_TIMESTAMP) { 869159950Sandre tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; 870159950Sandre tp->ts_recent = sc->sc_tsreflect; 871231767Sbz tp->ts_recent_age = tcp_ts_getticks(); 872162277Sandre tp->ts_offset = sc->sc_tsoff; 873159950Sandre } 874125680Sbms#ifdef TCP_SIGNATURE 875159950Sandre if (sc->sc_flags & SCF_SIGNATURE) 876159950Sandre tp->t_flags |= TF_SIGNATURE; 877125783Sbms#endif 878169317Sandre if (sc->sc_flags & SCF_SACK) 879159950Sandre tp->t_flags |= TF_SACK_PERMIT; 880130989Sps } 881159695Sandre 882181056Srpaulo if (sc->sc_flags & SCF_ECN) 883181056Srpaulo tp->t_flags |= TF_ECN_PERMIT; 884181056Srpaulo 885122922Sandre /* 886122922Sandre * Set up MSS and get cached values from tcp_hostcache. 887122922Sandre * This might overwrite some of the defaults we just set. 888122922Sandre */ 88986764Sjlemon tcp_mss(tp, sc->sc_peer_mss); 89086764Sjlemon 89186764Sjlemon /* 892242250Sandre * If the SYN,ACK was retransmitted, indicate that CWND to be 893242250Sandre * limited to one segment in cc_conn_init(). 894210666Sandre * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. 89586764Sjlemon */ 896210666Sandre if (sc->sc_rxmits > 1) 897242250Sandre tp->snd_cwnd = 1; 89886764Sjlemon 899237263Snp#ifdef TCP_OFFLOAD 900231025Sglebius /* 901237263Snp * Allow a TOE driver to install its hooks. Note that we hold the 902237263Snp * pcbinfo lock too and that prevents tcp_usr_accept from accepting a 903237263Snp * new connection before the TOE driver has done its thing. 
904237263Snp */ 905237263Snp if (ADDED_BY_TOE(sc)) { 906237263Snp struct toedev *tod = sc->sc_tod; 907237263Snp 908237263Snp tod->tod_offload_socket(tod, sc->sc_todctx, so); 909237263Snp } 910237263Snp#endif 911237263Snp /* 912231025Sglebius * Copy and activate timers. 913231025Sglebius */ 914231025Sglebius tp->t_keepinit = sototcpcb(lso)->t_keepinit; 915231025Sglebius tp->t_keepidle = sototcpcb(lso)->t_keepidle; 916231025Sglebius tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; 917231025Sglebius tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; 918231025Sglebius tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); 919231025Sglebius 920178285Srwatson INP_WUNLOCK(inp); 921122496Ssam 922190948Srwatson TCPSTAT_INC(tcps_accepts); 92386764Sjlemon return (so); 92486764Sjlemon 92586764Sjlemonabort: 926178285Srwatson INP_WUNLOCK(inp); 927122496Ssamabort2: 92886764Sjlemon if (so != NULL) 929156763Srwatson soabort(so); 93086764Sjlemon return (NULL); 93186764Sjlemon} 93286764Sjlemon 93386764Sjlemon/* 93486764Sjlemon * This function gets called when we receive an ACK for a 93586764Sjlemon * socket in the LISTEN state. We look up the connection 93686764Sjlemon * in the syncache, and if its there, we pull it out of 93786764Sjlemon * the cache and turn it into a full-blown connection in 93886764Sjlemon * the SYN-RECEIVED state. 93986764Sjlemon */ 94086764Sjlemonint 941162277Sandresyncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, 942159697Sandre struct socket **lsop, struct mbuf *m) 94386764Sjlemon{ 94486764Sjlemon struct syncache *sc; 94586764Sjlemon struct syncache_head *sch; 946162277Sandre struct syncache scs; 947169685Sandre char *s; 94886764Sjlemon 949159695Sandre /* 950159695Sandre * Global TCP locks are held because we manipulate the PCB lists 951159695Sandre * and create a new socket. 
952159695Sandre */ 953181803Sbz INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 954169685Sandre KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, 955169685Sandre ("%s: can handle only ACK", __func__)); 956122496Ssam 957159695Sandre sc = syncache_lookup(inc, &sch); /* returns locked sch */ 958159695Sandre SCH_LOCK_ASSERT(sch); 959253210Sandre 960253210Sandre#ifdef INVARIANTS 961253210Sandre /* 962253210Sandre * Test code for syncookies comparing the syncache stored 963253210Sandre * values with the reconstructed values from the cookie. 964253210Sandre */ 965253210Sandre if (sc != NULL) 966253210Sandre syncookie_cmp(inc, sch, sc, th, to, *lsop); 967253210Sandre#endif 968253210Sandre 96988180Sjlemon if (sc == NULL) { 97088180Sjlemon /* 971133874Srwatson * There is no syncache entry, so see if this ACK is 97288180Sjlemon * a returning syncookie. To do this, first: 97388180Sjlemon * A. See if this socket has had a syncache entry dropped in 97488180Sjlemon * the past. We don't want to accept a bogus syncookie 975133874Srwatson * if we've never received a SYN. 97688180Sjlemon * B. check that the syncookie is valid. If it is, then 97788180Sjlemon * cobble up a fake syncache entry, and return. 
97888180Sjlemon */ 979185348Szec if (!V_tcp_syncookies) { 980162277Sandre SCH_UNLOCK(sch); 981169685Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 982170078Sandre log(LOG_DEBUG, "%s; %s: Spurious ACK, " 983170078Sandre "segment rejected (syncookies disabled)\n", 984169685Sandre s, __func__); 985162277Sandre goto failed; 986162277Sandre } 987162277Sandre bzero(&scs, sizeof(scs)); 988253210Sandre sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); 989159695Sandre SCH_UNLOCK(sch); 990169685Sandre if (sc == NULL) { 991169685Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 992169685Sandre log(LOG_DEBUG, "%s; %s: Segment failed " 993170078Sandre "SYNCOOKIE authentication, segment rejected " 994170078Sandre "(probably spoofed)\n", s, __func__); 995159695Sandre goto failed; 996169685Sandre } 997159695Sandre } else { 998159695Sandre /* Pull out the entry to unlock the bucket row. */ 999159695Sandre TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); 1000159695Sandre sch->sch_length--; 1001237263Snp#ifdef TCP_OFFLOAD 1002237263Snp if (ADDED_BY_TOE(sc)) { 1003237263Snp struct toedev *tod = sc->sc_tod; 1004237263Snp 1005237263Snp tod->tod_syncache_removed(tod, sc->sc_todctx); 1006237263Snp } 1007237263Snp#endif 1008159695Sandre SCH_UNLOCK(sch); 100988180Sjlemon } 101086764Sjlemon 101186764Sjlemon /* 1012169685Sandre * Segment validation: 1013169685Sandre * ACK must match our initial sequence number + 1 (the SYN|ACK). 101486764Sjlemon */ 1015237263Snp if (th->th_ack != sc->sc_iss + 1) { 1016169685Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1017170078Sandre log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " 1018170078Sandre "rejected\n", s, __func__, th->th_ack, sc->sc_iss); 1019159695Sandre goto failed; 1020169685Sandre } 1021181337Sjhb 1022169686Sandre /* 1023181337Sjhb * The SEQ must fall in the window starting at the received 1024181337Sjhb * initial receive sequence number + 1 (the SYN). 
1025169686Sandre */ 1026237263Snp if (SEQ_LEQ(th->th_seq, sc->sc_irs) || 1027237263Snp SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { 1028169686Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1029170078Sandre log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " 1030170385Sandre "rejected\n", s, __func__, th->th_seq, sc->sc_irs); 1031169686Sandre goto failed; 1032169686Sandre } 1033174545Skmacy 1034253150Sandre /* 1035253150Sandre * If timestamps were not negotiated during SYN/ACK they 1036253150Sandre * must not appear on any segment during this session. 1037253150Sandre */ 1038169686Sandre if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { 1039169686Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1040170078Sandre log(LOG_DEBUG, "%s; %s: Timestamp not expected, " 1041170078Sandre "segment rejected\n", s, __func__); 1042169686Sandre goto failed; 1043169686Sandre } 1044253150Sandre 1045169686Sandre /* 1046253150Sandre * If timestamps were negotiated during SYN/ACK they should 1047253150Sandre * appear on every segment during this session. 1048253150Sandre * XXXAO: This is only informal as there have been unverified 1049253150Sandre * reports of non-compliants stacks. 1050253150Sandre */ 1051253150Sandre if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { 1052253395Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { 1053253150Sandre log(LOG_DEBUG, "%s; %s: Timestamp missing, " 1054253150Sandre "no action\n", s, __func__); 1055253395Sandre free(s, M_TCPLOG); 1056253395Sandre s = NULL; 1057253395Sandre } 1058253150Sandre } 1059253150Sandre 1060253150Sandre /* 1061169686Sandre * If timestamps were negotiated the reflected timestamp 1062169686Sandre * must be equal to what we actually sent in the SYN|ACK. 
1063169686Sandre */ 1064237263Snp if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { 1065169686Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL))) 1066170078Sandre log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " 1067170078Sandre "segment rejected\n", 1068169686Sandre s, __func__, to->to_tsecr, sc->sc_ts); 1069169686Sandre goto failed; 1070169686Sandre } 107186764Sjlemon 1072168902Sandre *lsop = syncache_socket(sc, *lsop, m); 1073159695Sandre 1074168902Sandre if (*lsop == NULL) 1075190948Srwatson TCPSTAT_INC(tcps_sc_aborted); 1076168902Sandre else 1077190948Srwatson TCPSTAT_INC(tcps_sc_completed); 1078122922Sandre 1079178888Sjulian/* how do we find the inp for the new socket? */ 1080162277Sandre if (sc != &scs) 1081162277Sandre syncache_free(sc); 1082159695Sandre return (1); 1083159695Sandrefailed: 1084162277Sandre if (sc != NULL && sc != &scs) 108586764Sjlemon syncache_free(sc); 1086169685Sandre if (s != NULL) 1087169685Sandre free(s, M_TCPLOG); 1088168902Sandre *lsop = NULL; 1089159695Sandre return (0); 109086764Sjlemon} 109186764Sjlemon 109286764Sjlemon/* 109386764Sjlemon * Given a LISTEN socket and an inbound SYN request, add 109486764Sjlemon * this to the syn cache, and send back a segment: 109586764Sjlemon * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> 109686764Sjlemon * to the source. 109786764Sjlemon * 109886764Sjlemon * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. 109986764Sjlemon * Doing so would require that we hold onto the data and deliver it 110086764Sjlemon * to the application. However, if we are the target of a SYN-flood 110186764Sjlemon * DoS attack, an attacker could send data which would eventually 110286764Sjlemon * consume all available buffer space if it were ACKed. By not ACKing 110386764Sjlemon * the data, we avoid this DoS scenario. 
110486764Sjlemon */ 1105237263Snpvoid 1106237263Snpsyncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, 1107237263Snp struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, 1108237263Snp void *todctx) 110986764Sjlemon{ 111086764Sjlemon struct tcpcb *tp; 111186764Sjlemon struct socket *so; 111286764Sjlemon struct syncache *sc = NULL; 111386764Sjlemon struct syncache_head *sch; 111486764Sjlemon struct mbuf *ipopts = NULL; 1115221023Sattilio u_int ltflags; 1116221023Sattilio int win, sb_hiwat, ip_ttl, ip_tos; 1117171639Sandre char *s; 1118159701Sandre#ifdef INET6 1119159701Sandre int autoflowlabel = 0; 1120159701Sandre#endif 1121165149Scsjp#ifdef MAC 1122165149Scsjp struct label *maclabel; 1123165149Scsjp#endif 1124162277Sandre struct syncache scs; 1125182056Sbz struct ucred *cred; 112686764Sjlemon 1127181803Sbz INP_INFO_WLOCK_ASSERT(&V_tcbinfo); 1128178285Srwatson INP_WLOCK_ASSERT(inp); /* listen socket */ 1129171643Ssilby KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, 1130171639Sandre ("%s: unexpected tcp flags", __func__)); 1131122496Ssam 1132159695Sandre /* 1133159695Sandre * Combine all so/tp operations very early to drop the INP lock as 1134159695Sandre * soon as possible. 1135159695Sandre */ 1136159695Sandre so = *lsop; 113786764Sjlemon tp = sototcpcb(so); 1138182056Sbz cred = crhold(so->so_cred); 113986764Sjlemon 1140159695Sandre#ifdef INET6 1141186222Sbz if ((inc->inc_flags & INC_ISIPV6) && 1142186141Sbz (inp->inp_flags & IN6P_AUTOFLOWLABEL)) 1143159695Sandre autoflowlabel = 1; 1144159695Sandre#endif 1145159695Sandre ip_ttl = inp->inp_ip_ttl; 1146159695Sandre ip_tos = inp->inp_ip_tos; 1147159695Sandre win = sbspace(&so->so_rcv); 1148159695Sandre sb_hiwat = so->so_rcv.sb_hiwat; 1149221023Sattilio ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); 1150159695Sandre 1151182045Sbz /* By the time we drop the lock these should no longer be used. 
*/ 1152159695Sandre so = NULL; 1153159695Sandre tp = NULL; 1154159695Sandre 1155165149Scsjp#ifdef MAC 1156172970Srwatson if (mac_syncache_init(&maclabel) != 0) { 1157178285Srwatson INP_WUNLOCK(inp); 1158181803Sbz INP_INFO_WUNLOCK(&V_tcbinfo); 1159168900Sandre goto done; 1160165149Scsjp } else 1161172970Srwatson mac_syncache_create(maclabel, inp); 1162165149Scsjp#endif 1163178285Srwatson INP_WUNLOCK(inp); 1164181803Sbz INP_INFO_WUNLOCK(&V_tcbinfo); 1165159695Sandre 116686764Sjlemon /* 116786764Sjlemon * Remember the IP options, if any. 116886764Sjlemon */ 116986764Sjlemon#ifdef INET6 1170186222Sbz if (!(inc->inc_flags & INC_ISIPV6)) 117186764Sjlemon#endif 1172221250Sbz#ifdef INET 1173174704Skmacy ipopts = (m) ? ip_srcroute(m) : NULL; 1174221250Sbz#else 1175221250Sbz ipopts = NULL; 1176221250Sbz#endif 117786764Sjlemon 117886764Sjlemon /* 117986764Sjlemon * See if we already have an entry for this connection. 118086764Sjlemon * If we do, resend the SYN,ACK, and reset the retransmit timer. 118186764Sjlemon * 1182159697Sandre * XXX: should the syncache be re-initialized with the contents 118386764Sjlemon * of the new SYN here (which may have different options?) 1184171639Sandre * 1185171639Sandre * XXX: We do not check the sequence number to see if this is a 1186171639Sandre * real retransmit or a new connection attempt. The question is 1187171639Sandre * how to handle such a case; either ignore it as spoofed, or 1188171639Sandre * drop the current entry and create a new one? 118986764Sjlemon */ 1190159695Sandre sc = syncache_lookup(inc, &sch); /* returns locked entry */ 1191159695Sandre SCH_LOCK_ASSERT(sch); 119286764Sjlemon if (sc != NULL) { 1193190948Srwatson TCPSTAT_INC(tcps_sc_dupsyn); 119486764Sjlemon if (ipopts) { 119586764Sjlemon /* 119686764Sjlemon * If we were remembering a previous source route, 119786764Sjlemon * forget it and use the new one we've been given. 
119886764Sjlemon */ 119986764Sjlemon if (sc->sc_ipopts) 120086764Sjlemon (void) m_free(sc->sc_ipopts); 120186764Sjlemon sc->sc_ipopts = ipopts; 120286764Sjlemon } 120386764Sjlemon /* 120486764Sjlemon * Update timestamp if present. 120586764Sjlemon */ 1206168901Sandre if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) 1207159950Sandre sc->sc_tsreflect = to->to_tsval; 1208168901Sandre else 1209168901Sandre sc->sc_flags &= ~SCF_TIMESTAMP; 1210165149Scsjp#ifdef MAC 1211165149Scsjp /* 1212165149Scsjp * Since we have already unconditionally allocated label 1213165149Scsjp * storage, free it up. The syncache entry will already 1214165149Scsjp * have an initialized label we can use. 1215165149Scsjp */ 1216172970Srwatson mac_syncache_destroy(&maclabel); 1217165149Scsjp#endif 1218171639Sandre /* Retransmit SYN|ACK and reset retransmit count. */ 1219171639Sandre if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { 1220171652Sbmah log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " 1221171639Sandre "resetting timer and retransmitting SYN|ACK\n", 1222171639Sandre s, __func__); 1223171639Sandre free(s, M_TCPLOG); 1224171639Sandre } 1225237263Snp if (syncache_respond(sc) == 0) { 1226171639Sandre sc->sc_rxmits = 0; 1227171639Sandre syncache_timeout(sc, sch, 1); 1228190948Srwatson TCPSTAT_INC(tcps_sndacks); 1229190948Srwatson TCPSTAT_INC(tcps_sndtotal); 123086764Sjlemon } 1231159695Sandre SCH_UNLOCK(sch); 1232159695Sandre goto done; 123386764Sjlemon } 123486764Sjlemon 1235181803Sbz sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); 123686764Sjlemon if (sc == NULL) { 123786764Sjlemon /* 123886764Sjlemon * The zone allocator couldn't provide more entries. 1239133874Srwatson * Treat this as if the cache was full; drop the oldest 124086764Sjlemon * entry and insert the new one. 
124186764Sjlemon */ 1242190948Srwatson TCPSTAT_INC(tcps_sc_zonefail); 1243168817Sandre if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) 1244168817Sandre syncache_drop(sc, sch); 1245181803Sbz sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); 124686764Sjlemon if (sc == NULL) { 1247185348Szec if (V_tcp_syncookies) { 1248162277Sandre bzero(&scs, sizeof(scs)); 1249162277Sandre sc = &scs; 1250162277Sandre } else { 1251162277Sandre SCH_UNLOCK(sch); 1252162277Sandre if (ipopts) 1253162277Sandre (void) m_free(ipopts); 1254162277Sandre goto done; 1255162277Sandre } 125686764Sjlemon } 1257162277Sandre } 1258174558Skmacy 125986764Sjlemon /* 126086764Sjlemon * Fill in the syncache values. 126186764Sjlemon */ 1262165149Scsjp#ifdef MAC 1263165149Scsjp sc->sc_label = maclabel; 1264165149Scsjp#endif 1265182056Sbz sc->sc_cred = cred; 1266182056Sbz cred = NULL; 126786764Sjlemon sc->sc_ipopts = ipopts; 1268159950Sandre bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); 126986764Sjlemon#ifdef INET6 1270186222Sbz if (!(inc->inc_flags & INC_ISIPV6)) 127186764Sjlemon#endif 127286764Sjlemon { 1273159695Sandre sc->sc_ip_tos = ip_tos; 1274159695Sandre sc->sc_ip_ttl = ip_ttl; 127586764Sjlemon } 1276237263Snp#ifdef TCP_OFFLOAD 1277237263Snp sc->sc_tod = tod; 1278237263Snp sc->sc_todctx = todctx; 1279174558Skmacy#endif 128086764Sjlemon sc->sc_irs = th->th_seq; 1281162277Sandre sc->sc_iss = arc4random(); 1282110023Ssilby sc->sc_flags = 0; 1283132307Sdwmalone sc->sc_flowlabel = 0; 128486764Sjlemon 1285159695Sandre /* 1286159695Sandre * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. 1287159695Sandre * win was derived from socket earlier in the function. 
1288159695Sandre */ 128986764Sjlemon win = imax(win, 0); 129086764Sjlemon win = imin(win, TCP_MAXWIN); 129186764Sjlemon sc->sc_wnd = win; 129286764Sjlemon 1293181803Sbz if (V_tcp_do_rfc1323) { 129486764Sjlemon /* 129586764Sjlemon * A timestamp received in a SYN makes 129686764Sjlemon * it ok to send timestamp requests and replies. 129786764Sjlemon */ 129886764Sjlemon if (to->to_flags & TOF_TS) { 1299159950Sandre sc->sc_tsreflect = to->to_tsval; 1300231767Sbz sc->sc_ts = tcp_ts_getticks(); 130186764Sjlemon sc->sc_flags |= SCF_TIMESTAMP; 130286764Sjlemon } 130386764Sjlemon if (to->to_flags & TOF_SCALE) { 130486764Sjlemon int wscale = 0; 130586764Sjlemon 1306166403Sandre /* 1307172795Ssilby * Pick the smallest possible scaling factor that 1308172795Ssilby * will still allow us to scale up to sb_max, aka 1309172795Ssilby * kern.ipc.maxsockbuf. 1310167606Sandre * 1311172795Ssilby * We do this because there are broken firewalls that 1312172795Ssilby * will corrupt the window scale option, leading to 1313172795Ssilby * the other endpoint believing that our advertised 1314172795Ssilby * window is unscaled. At scale factors larger than 1315172795Ssilby * 5 the unscaled window will drop below 1500 bytes, 1316172795Ssilby * leading to serious problems when traversing these 1317172795Ssilby * broken firewalls. 1318172795Ssilby * 1319172795Ssilby * With the default maxsockbuf of 256K, a scale factor 1320172795Ssilby * of 3 will be chosen by this algorithm. Those who 1321172795Ssilby * choose a larger maxsockbuf should watch out 1322172795Ssilby * for the compatiblity problems mentioned above. 1323172795Ssilby * 1324167606Sandre * RFC1323: The Window field in a SYN (i.e., a <SYN> 1325167606Sandre * or <SYN,ACK>) segment itself is never scaled. 
1326166403Sandre */ 132786764Sjlemon while (wscale < TCP_MAX_WINSHIFT && 1328172795Ssilby (TCP_MAXWIN << wscale) < sb_max) 132986764Sjlemon wscale++; 1330159950Sandre sc->sc_requested_r_scale = wscale; 1331167606Sandre sc->sc_requested_s_scale = to->to_wscale; 133286764Sjlemon sc->sc_flags |= SCF_WINSCALE; 133386764Sjlemon } 133486764Sjlemon } 1335125680Sbms#ifdef TCP_SIGNATURE 1336125680Sbms /* 1337145371Sps * If listening socket requested TCP digests, and received SYN 1338145371Sps * contains the option, flag this in the syncache so that 1339145371Sps * syncache_respond() will do the right thing with the SYN+ACK. 1340159697Sandre * XXX: Currently we always record the option by default and will 1341145371Sps * attempt to use it in syncache_respond(). 1342125680Sbms */ 1343221023Sattilio if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE) 1344150131Sandre sc->sc_flags |= SCF_SIGNATURE; 1345125783Sbms#endif 1346174248Ssilby if (to->to_flags & TOF_SACKPERM) 1347130989Sps sc->sc_flags |= SCF_SACK; 1348162277Sandre if (to->to_flags & TOF_MSS) 1349162277Sandre sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ 1350221023Sattilio if (ltflags & TF_NOOPT) 1351159727Sandre sc->sc_flags |= SCF_NOOPT; 1352181803Sbz if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) 1353181056Srpaulo sc->sc_flags |= SCF_ECN; 1354130989Sps 1355253210Sandre if (V_tcp_syncookies) 1356253210Sandre sc->sc_iss = syncookie_generate(sch, sc); 1357162277Sandre#ifdef INET6 1358253210Sandre if (autoflowlabel) { 1359253210Sandre if (V_tcp_syncookies) 1360253210Sandre sc->sc_flowlabel = sc->sc_iss; 1361253210Sandre else 1362253210Sandre sc->sc_flowlabel = ip6_randomflowlabel(); 1363253210Sandre sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; 1364253210Sandre } 1365162277Sandre#endif 1366162277Sandre SCH_UNLOCK(sch); 1367162277Sandre 136886764Sjlemon /* 1369137139Sandre * Do a standard 3-way handshake. 
137086764Sjlemon */ 1371237263Snp if (syncache_respond(sc) == 0) { 1372185348Szec if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) 1373162277Sandre syncache_free(sc); 1374162277Sandre else if (sc != &scs) 1375162277Sandre syncache_insert(sc, sch); /* locks and unlocks sch */ 1376190948Srwatson TCPSTAT_INC(tcps_sndacks); 1377190948Srwatson TCPSTAT_INC(tcps_sndtotal); 137886764Sjlemon } else { 1379165149Scsjp if (sc != &scs) 1380165149Scsjp syncache_free(sc); 1381190948Srwatson TCPSTAT_INC(tcps_sc_dropped); 138286764Sjlemon } 1383159695Sandre 1384159695Sandredone: 1385182056Sbz if (cred != NULL) 1386182056Sbz crfree(cred); 1387168900Sandre#ifdef MAC 1388168900Sandre if (sc == &scs) 1389172970Srwatson mac_syncache_destroy(&maclabel); 1390168900Sandre#endif 1391174558Skmacy if (m) { 1392174558Skmacy 1393174558Skmacy *lsop = NULL; 1394174558Skmacy m_freem(m); 1395174558Skmacy } 139686764Sjlemon} 139786764Sjlemon 139886764Sjlemonstatic int 1399168900Sandresyncache_respond(struct syncache *sc) 140086764Sjlemon{ 1401159950Sandre struct ip *ip = NULL; 1402168900Sandre struct mbuf *m; 1403221250Sbz struct tcphdr *th = NULL; 1404221250Sbz int optlen, error = 0; /* Make compiler happy */ 1405167606Sandre u_int16_t hlen, tlen, mssopt; 1406167606Sandre struct tcpopt to; 140786764Sjlemon#ifdef INET6 140886764Sjlemon struct ip6_hdr *ip6 = NULL; 140986764Sjlemon#endif 141086764Sjlemon 1411122922Sandre hlen = 141286764Sjlemon#ifdef INET6 1413186222Sbz (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : 141486764Sjlemon#endif 1415122922Sandre sizeof(struct ip); 1416167606Sandre tlen = hlen + sizeof(struct tcphdr); 141786764Sjlemon 1418159697Sandre /* Determine MSS we advertize to other end of connection. 
*/ 1419122922Sandre mssopt = tcp_mssopt(&sc->sc_inc); 1420159955Sandre if (sc->sc_peer_mss) 1421181803Sbz mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); 1422122922Sandre 1423167606Sandre /* XXX: Assume that the entire packet will fit in a header mbuf. */ 1424168904Sandre KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, 1425167606Sandre ("syncache: mbuf too small")); 142686764Sjlemon 1427159695Sandre /* Create the IP+TCP header from scratch. */ 1428243882Sglebius m = m_gethdr(M_NOWAIT, MT_DATA); 142986764Sjlemon if (m == NULL) 143086764Sjlemon return (ENOBUFS); 1431165149Scsjp#ifdef MAC 1432172970Srwatson mac_syncache_create_mbuf(sc->sc_label, m); 1433165149Scsjp#endif 143486764Sjlemon m->m_data += max_linkhdr; 143586764Sjlemon m->m_len = tlen; 143686764Sjlemon m->m_pkthdr.len = tlen; 143786764Sjlemon m->m_pkthdr.rcvif = NULL; 1438159695Sandre 143986764Sjlemon#ifdef INET6 1440186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) { 144186764Sjlemon ip6 = mtod(m, struct ip6_hdr *); 144286764Sjlemon ip6->ip6_vfc = IPV6_VERSION; 144386764Sjlemon ip6->ip6_nxt = IPPROTO_TCP; 144486764Sjlemon ip6->ip6_src = sc->sc_inc.inc6_laddr; 144586764Sjlemon ip6->ip6_dst = sc->sc_inc.inc6_faddr; 144686764Sjlemon ip6->ip6_plen = htons(tlen - hlen); 144786764Sjlemon /* ip6_hlim is set after checksum */ 1448132307Sdwmalone ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; 1449132307Sdwmalone ip6->ip6_flow |= sc->sc_flowlabel; 145086764Sjlemon 145186764Sjlemon th = (struct tcphdr *)(ip6 + 1); 1452221250Sbz } 145386764Sjlemon#endif 1454221250Sbz#if defined(INET6) && defined(INET) 1455221250Sbz else 1456221250Sbz#endif 1457221250Sbz#ifdef INET 145886764Sjlemon { 145986764Sjlemon ip = mtod(m, struct ip *); 146086764Sjlemon ip->ip_v = IPVERSION; 146186764Sjlemon ip->ip_hl = sizeof(struct ip) >> 2; 1462241913Sglebius ip->ip_len = htons(tlen); 146386764Sjlemon ip->ip_id = 0; 146486764Sjlemon ip->ip_off = 0; 146586764Sjlemon ip->ip_sum = 0; 146686764Sjlemon ip->ip_p = IPPROTO_TCP; 146786764Sjlemon 
ip->ip_src = sc->sc_inc.inc_laddr; 146886764Sjlemon ip->ip_dst = sc->sc_inc.inc_faddr; 1469159695Sandre ip->ip_ttl = sc->sc_ip_ttl; 1470159695Sandre ip->ip_tos = sc->sc_ip_tos; 147186764Sjlemon 147298204Ssilby /* 1473108125Shsu * See if we should do MTU discovery. Route lookups are 1474108125Shsu * expensive, so we will only unset the DF bit if: 1475101405Ssilby * 1476101405Ssilby * 1) path_mtu_discovery is disabled 1477101405Ssilby * 2) the SCF_UNREACH flag has been set 147898204Ssilby */ 1479181803Sbz if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) 1480241913Sglebius ip->ip_off |= htons(IP_DF); 148198204Ssilby 148286764Sjlemon th = (struct tcphdr *)(ip + 1); 148386764Sjlemon } 1484221250Sbz#endif /* INET */ 148586764Sjlemon th->th_sport = sc->sc_inc.inc_lport; 148686764Sjlemon th->th_dport = sc->sc_inc.inc_fport; 148786764Sjlemon 148886764Sjlemon th->th_seq = htonl(sc->sc_iss); 148986764Sjlemon th->th_ack = htonl(sc->sc_irs + 1); 1490167606Sandre th->th_off = sizeof(struct tcphdr) >> 2; 149186764Sjlemon th->th_x2 = 0; 149286764Sjlemon th->th_flags = TH_SYN|TH_ACK; 149386764Sjlemon th->th_win = htons(sc->sc_wnd); 149486764Sjlemon th->th_urp = 0; 149586764Sjlemon 1496181056Srpaulo if (sc->sc_flags & SCF_ECN) { 1497181056Srpaulo th->th_flags |= TH_ECE; 1498190948Srwatson TCPSTAT_INC(tcps_ecn_shs); 1499181056Srpaulo } 1500181056Srpaulo 150186764Sjlemon /* Tack on the TCP options. */ 1502167606Sandre if ((sc->sc_flags & SCF_NOOPT) == 0) { 1503167606Sandre to.to_flags = 0; 150486764Sjlemon 1505167606Sandre to.to_mss = mssopt; 1506167606Sandre to.to_flags = TOF_MSS; 1507108125Shsu if (sc->sc_flags & SCF_WINSCALE) { 1508167606Sandre to.to_wscale = sc->sc_requested_r_scale; 1509167606Sandre to.to_flags |= TOF_SCALE; 1510108125Shsu } 1511108125Shsu if (sc->sc_flags & SCF_TIMESTAMP) { 1512167606Sandre /* Virgin timestamp or TCP cookie enhanced one. 
*/ 1513169686Sandre to.to_tsval = sc->sc_ts; 1514167606Sandre to.to_tsecr = sc->sc_tsreflect; 1515167606Sandre to.to_flags |= TOF_TS; 1516108125Shsu } 1517167606Sandre if (sc->sc_flags & SCF_SACK) 1518167606Sandre to.to_flags |= TOF_SACKPERM; 1519167606Sandre#ifdef TCP_SIGNATURE 1520167606Sandre if (sc->sc_flags & SCF_SIGNATURE) 1521167606Sandre to.to_flags |= TOF_SIGNATURE; 1522167606Sandre#endif 1523167606Sandre optlen = tcp_addoptions(&to, (u_char *)(th + 1)); 152486764Sjlemon 1525167606Sandre /* Adjust headers by option size. */ 1526167606Sandre th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; 1527167606Sandre m->m_len += optlen; 1528167606Sandre m->m_pkthdr.len += optlen; 1529174119Sbz 1530174119Sbz#ifdef TCP_SIGNATURE 1531174119Sbz if (sc->sc_flags & SCF_SIGNATURE) 1532183001Sbz tcp_signature_compute(m, 0, 0, optlen, 1533174119Sbz to.to_signature, IPSEC_DIR_OUTBOUND); 1534174119Sbz#endif 1535167606Sandre#ifdef INET6 1536186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) 1537167606Sandre ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); 1538167658Skmacy else 1539167606Sandre#endif 1540241913Sglebius ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 1541167606Sandre } else 1542167606Sandre optlen = 0; 1543130989Sps 1544195922Sjulian M_SETFIB(m, sc->sc_inc.inc_fibnum); 1545235961Sbz m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); 154686764Sjlemon#ifdef INET6 1547186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) { 1548236170Sbz m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; 1549235961Sbz th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, 1550235961Sbz IPPROTO_TCP, 0); 1551122922Sandre ip6->ip6_hlim = in6_selecthlim(NULL, NULL); 1552245919Snp#ifdef TCP_OFFLOAD 1553245919Snp if (ADDED_BY_TOE(sc)) { 1554245919Snp struct toedev *tod = sc->sc_tod; 1555245919Snp 1556245919Snp error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); 1557245919Snp 1558245919Snp return (error); 1559245919Snp } 1560245919Snp#endif 1561159695Sandre error = ip6_output(m, NULL, 
NULL, 0, NULL, NULL, NULL); 1562221250Sbz } 156386764Sjlemon#endif 1564221250Sbz#if defined(INET6) && defined(INET) 1565221250Sbz else 1566221250Sbz#endif 1567221250Sbz#ifdef INET 156886764Sjlemon { 1569236170Sbz m->m_pkthdr.csum_flags = CSUM_TCP; 1570133874Srwatson th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, 1571167606Sandre htons(tlen + optlen - hlen + IPPROTO_TCP)); 1572237263Snp#ifdef TCP_OFFLOAD 1573237263Snp if (ADDED_BY_TOE(sc)) { 1574237263Snp struct toedev *tod = sc->sc_tod; 1575237263Snp 1576237263Snp error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); 1577237263Snp 1578237263Snp return (error); 1579237263Snp } 1580237263Snp#endif 1581159695Sandre error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); 158286764Sjlemon } 1583221250Sbz#endif 158486764Sjlemon return (error); 158586764Sjlemon} 158688180Sjlemon 158788180Sjlemon/* 1588253210Sandre * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks 1589253210Sandre * that exceed the capacity of the syncache by avoiding the storage of any 1590253210Sandre * of the SYNs we receive. Syncookies defend against blind SYN flooding 1591253210Sandre * attacks where the attacker does not have access to our responses. 159288180Sjlemon * 1593253210Sandre * Syncookies encode and include all necessary information about the 1594253210Sandre * connection setup within the SYN|ACK that we send back. That way we 1595253210Sandre * can avoid keeping any local state until the ACK to our SYN|ACK returns 1596253210Sandre * (if ever). Normally the syncache and syncookies are running in parallel 1597253210Sandre * with the latter taking over when the former is exhausted. When matching 1598253210Sandre * syncache entry is found the syncookie is ignored. 1599162277Sandre * 1600253210Sandre * The only reliable information persisting the 3WHS is our inital sequence 1601253210Sandre * number ISS of 32 bits. 
Syncookies embed a cryptographically sufficiently 1602253210Sandre * strong hash (MAC) value and a few bits of TCP SYN options in the ISS 1603253210Sandre * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK 1604253210Sandre * returns and signifies a legitimate connection if it matches the ACK. 1605162277Sandre * 1606253210Sandre * The available space of 32 bits to store the hash and to encode the SYN 1607253210Sandre * option information is very tight and we should have at least 24 bits for 1608253210Sandre * the MAC to keep the number of guesses by blind spoofing reasonably high. 1609162277Sandre * 1610253210Sandre * SYN option information we have to encode to fully restore a connection: 1611253210Sandre * MSS: is important to choose an optimal segment size to avoid IP level 1612253210Sandre * fragmentation along the path. The common MSS values can be encoded 1613253210Sandre * in a 3-bit table. Uncommon values are captured by the next lower value 1614253210Sandre * in the table leading to a slight increase in packetization overhead. 1615253210Sandre * WSCALE: is necessary to allow large windows to be used for high delay- 1616253210Sandre * bandwidth product links. Not scaling the window when it was initially 1617253210Sandre * negotiated is bad for performance as lack of scaling further decreases 1618253210Sandre * the apparent available send window. We only need to encode the WSCALE 1619253210Sandre * we received from the remote end. Our end can be recalculated at any 1620253210Sandre * time. The common WSCALE values can be encoded in a 3-bit table. 1621253210Sandre * Uncommon values are captured by the next lower value in the table 1622253210Sandre * making us under-estimate the available window size halving our 1623253210Sandre * theoretically possible maximum throughput for that connection. 1624253210Sandre * SACK: Greatly assists in packet loss recovery and requires 1 bit. 
1625253210Sandre * TIMESTAMP and SIGNATURE are not encoded because they are permanent options 1626253210Sandre * that are included in all segments on a connection. We enable them when 1627253210Sandre * the ACK has them. 1628162277Sandre * 1629253210Sandre * Security of syncookies and attack vectors: 1630162277Sandre * 1631253210Sandre * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) 1632253210Sandre * together with the global secret to make it unique per connection attempt. 1633253210Sandre * Thus any change of any of those parameters results in a different MAC output 1634253210Sandre * in an unpredictable way unless a collision is encountered. 24 bits of the 1635253210Sandre * MAC are embedded into the ISS. 1636162277Sandre * 1637253210Sandre * To prevent replay attacks two rotating global secrets are updated with a 1638253210Sandre * new random value every 15 seconds. The life-time of a syncookie is thus 1639253210Sandre * 15-30 seconds. 1640162277Sandre * 1641253210Sandre * Vector 1: Attacking the secret. This requires finding a weakness in the 1642253210Sandre * MAC itself or the way it is used here. The attacker can do a chosen plain 1643253210Sandre * text attack by varying and testing all the parameters under his control. 1644253210Sandre * The strength depends on the size and randomness of the secret, and the 1645253210Sandre * cryptographic security of the MAC function. Due to the constant updating 1646253210Sandre * of the secret the attacker has at most 29.999 seconds to find the secret 1647253210Sandre * and launch spoofed connections. After that he has to start all over again. 1648162277Sandre * 1649253210Sandre * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC 1650253210Sandre * size an average of 4,823 attempts are required for a 50% chance of success 1651253210Sandre * to spoof a single syncookie (birthday collision paradox). 
However the 1652253210Sandre * attacker is blind and doesn't know if one of his attempts succeeded unless 1653253210Sandre * he has a side channel to infer success from. A single connection setup 1654253210Sandre * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. 1655253210Sandre * This many attempts are required for each blind spoofed connection. For 1656253210Sandre * every additional spoofed connection he has to launch another N attempts. 1657253210Sandre * Thus for a sustained rate of 100 spoofed connections per second approximately 1658253210Sandre * 1,800,000 packets per second would have to be sent. 165988180Sjlemon * 1660253210Sandre * NB: The MAC function should be fast so that it doesn't become a CPU 1661253210Sandre * exhaustion attack vector itself. 1662253210Sandre * 1663253210Sandre * References: 1664253210Sandre * RFC4987 TCP SYN Flooding Attacks and Common Mitigations 1665253210Sandre * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 1666253210Sandre * http://cr.yp.to/syncookies.html (overview) 1667253210Sandre * http://cr.yp.to/syncookies/archive (details) 1668253210Sandre * 1669253210Sandre * 1670253210Sandre * Schematic construction of a syncookie enabled Initial Sequence Number: 1671253210Sandre * 0 1 2 3 1672253210Sandre * 12345678901234567890123456789012 1673253210Sandre * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| 1674253210Sandre * 1675253210Sandre * x 24 MAC (truncated) 1676253210Sandre * W 3 Send Window Scale index 1677253210Sandre * M 3 MSS index 1678253210Sandre * S 1 SACK permitted 1679253210Sandre * P 1 Odd/even secret 168088180Sjlemon */ 168188180Sjlemon 1682253210Sandre/* 1683253210Sandre * Distribution and probability of certain MSS values. Those in between are 1684253210Sandre * rounded down to the next lower one. 1685253210Sandre * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. 
Nelson, 2011] 1686253210Sandre * .2% .3% 5% 7% 7% 20% 15% 45% 1687253210Sandre */ 1688253210Sandrestatic int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; 1689253210Sandre 1690253210Sandre/* 1691253210Sandre * Distribution and probability of certain WSCALE values. We have to map the 1692253210Sandre * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 1693253210Sandre * bits based on prevalence of certain values. Where we don't have an exact 1694253210Sandre * match for are rounded down to the next lower one letting us under-estimate 1695253210Sandre * the true available window. At the moment this would happen only for the 1696253210Sandre * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer 1697253210Sandre * and window size). The absence of the WSCALE option (no scaling in either 1698253210Sandre * direction) is encoded with index zero. 1699253210Sandre * [WSCALE values histograms, Allman, 2012] 1700253210Sandre * X 10 10 35 5 6 14 10% by host 1701253210Sandre * X 11 4 5 5 18 49 3% by connections 1702253210Sandre */ 1703253210Sandrestatic int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; 1704253210Sandre 1705253210Sandre/* 1706253210Sandre * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed 1707253210Sandre * and good cryptographic properties. 
1708253210Sandre */ 1709253210Sandrestatic uint32_t 1710253210Sandresyncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, 1711253210Sandre uint8_t *secbits, uintptr_t secmod) 1712162277Sandre{ 1713253210Sandre SIPHASH_CTX ctx; 1714253210Sandre uint32_t siphash[2]; 1715159695Sandre 1716253210Sandre SipHash24_Init(&ctx); 1717253210Sandre SipHash_SetKey(&ctx, secbits); 1718253210Sandre switch (inc->inc_flags & INC_ISIPV6) { 1719253210Sandre#ifdef INET 1720253210Sandre case 0: 1721253210Sandre SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); 1722253210Sandre SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); 1723253210Sandre break; 1724253210Sandre#endif 1725253210Sandre#ifdef INET6 1726253210Sandre case INC_ISIPV6: 1727253210Sandre SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); 1728253210Sandre SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); 1729253210Sandre break; 1730253210Sandre#endif 1731253210Sandre } 1732253210Sandre SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); 1733253210Sandre SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); 1734253210Sandre SipHash_Update(&ctx, &flags, sizeof(flags)); 1735253210Sandre SipHash_Update(&ctx, &secmod, sizeof(secmod)); 1736253210Sandre SipHash_Final((u_int8_t *)&siphash, &ctx); 1737253210Sandre 1738253210Sandre return (siphash[0] ^ siphash[1]); 1739253210Sandre} 1740253210Sandre 1741253210Sandrestatic tcp_seq 1742253210Sandresyncookie_generate(struct syncache_head *sch, struct syncache *sc) 1743253210Sandre{ 1744253210Sandre u_int i, mss, secbit, wscale; 1745253210Sandre uint32_t iss, hash; 1746253210Sandre uint8_t *secbits; 1747253210Sandre union syncookie cookie; 1748253210Sandre 1749162277Sandre SCH_LOCK_ASSERT(sch); 1750162277Sandre 1751253210Sandre cookie.cookie = 0; 1752162277Sandre 1753253210Sandre /* Map our computed MSS into the 3-bit index. 
*/ 1754253210Sandre mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); 1755253210Sandre for (i = sizeof(tcp_sc_msstab) / sizeof(*tcp_sc_msstab) - 1; 1756253210Sandre tcp_sc_msstab[i] > mss && i > 0; 1757253210Sandre i--) 1758253210Sandre ; 1759253210Sandre cookie.flags.mss_idx = i; 1760253210Sandre 1761253210Sandre /* 1762253210Sandre * Map the send window scale into the 3-bit index but only if 1763253210Sandre * the wscale option was received. 1764253210Sandre */ 1765253210Sandre if (sc->sc_flags & SCF_WINSCALE) { 1766253210Sandre wscale = sc->sc_requested_s_scale; 1767253210Sandre for (i = sizeof(tcp_sc_wstab) / sizeof(*tcp_sc_wstab) - 1; 1768253210Sandre tcp_sc_wstab[i] > wscale && i > 0; 1769253210Sandre i--) 1770253210Sandre ; 1771253210Sandre cookie.flags.wscale_idx = i; 1772159695Sandre } 1773159695Sandre 1774253210Sandre /* Can we do SACK? */ 1775253210Sandre if (sc->sc_flags & SCF_SACK) 1776253210Sandre cookie.flags.sack_ok = 1; 177788180Sjlemon 1778253210Sandre /* Which of the two secrets to use. */ 1779253210Sandre secbit = sch->sch_sc->secret.oddeven & 0x1; 1780253210Sandre cookie.flags.odd_even = secbit; 1781162277Sandre 1782253210Sandre secbits = sch->sch_sc->secret.key[secbit]; 1783253210Sandre hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, 1784253210Sandre (uintptr_t)sch); 1785162277Sandre 1786253210Sandre /* 1787253210Sandre * Put the flags into the hash and XOR them to get better ISS number 1788253210Sandre * variance. This doesn't enhance the cryptographic strength and is 1789253210Sandre * done to prevent the 8 cookie bits from showing up directly on the 1790253210Sandre * wire. 1791253210Sandre */ 1792253210Sandre iss = hash & ~0xff; 1793253210Sandre iss |= cookie.cookie ^ (hash >> 24); 1794162277Sandre 1795253210Sandre /* Randomize the timestamp. 
*/ 1796162277Sandre if (sc->sc_flags & SCF_TIMESTAMP) { 1797253210Sandre sc->sc_ts = arc4random(); 1798253210Sandre sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); 1799169686Sandre } 1800162277Sandre 1801190948Srwatson TCPSTAT_INC(tcps_sc_sendcookie); 1802253210Sandre return (iss); 180388180Sjlemon} 180488180Sjlemon 180588180Sjlemonstatic struct syncache * 1806162277Sandresyncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, 1807253210Sandre struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 1808253210Sandre struct socket *lso) 180988180Sjlemon{ 1810253210Sandre uint32_t hash; 1811253210Sandre uint8_t *secbits; 1812162277Sandre tcp_seq ack, seq; 1813253210Sandre int wnd, wscale = 0; 1814253210Sandre union syncookie cookie; 181588180Sjlemon 1816162277Sandre SCH_LOCK_ASSERT(sch); 1817162277Sandre 1818162277Sandre /* 1819253210Sandre * Pull information out of SYN-ACK/ACK and revert sequence number 1820253210Sandre * advances. 1821162277Sandre */ 1822162277Sandre ack = th->th_ack - 1; 1823162277Sandre seq = th->th_seq - 1; 1824162277Sandre 1825162277Sandre /* 1826253210Sandre * Unpack the flags containing enough information to restore the 1827253210Sandre * connection. 1828162277Sandre */ 1829253210Sandre cookie.cookie = (ack & 0xff) ^ (ack >> 24); 1830162277Sandre 1831253210Sandre /* Which of the two secrets to use. */ 1832253210Sandre secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; 1833162277Sandre 1834253210Sandre hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); 1835253210Sandre 1836253210Sandre /* The recomputed hash matches the ACK if this was a genuine cookie. */ 1837253210Sandre if ((ack & ~0xff) != (hash & ~0xff)) 183888180Sjlemon return (NULL); 183988180Sjlemon 1840162277Sandre /* Fill in the syncache values. 
*/ 1841253210Sandre sc->sc_flags = 0; 1842162277Sandre bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); 184388180Sjlemon sc->sc_ipopts = NULL; 1844162277Sandre 1845162277Sandre sc->sc_irs = seq; 1846162277Sandre sc->sc_iss = ack; 1847162277Sandre 1848253210Sandre switch (inc->inc_flags & INC_ISIPV6) { 1849253210Sandre#ifdef INET 1850253210Sandre case 0: 1851253210Sandre sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; 1852253210Sandre sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; 1853253210Sandre break; 1854253210Sandre#endif 185588180Sjlemon#ifdef INET6 1856253210Sandre case INC_ISIPV6: 1857253210Sandre if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) 1858253210Sandre sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; 1859253210Sandre break; 186088180Sjlemon#endif 186188180Sjlemon } 1862162277Sandre 1863253210Sandre sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; 1864253210Sandre 1865253210Sandre /* We can simply recompute receive window scale we sent earlier. */ 1866253210Sandre while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) 1867253210Sandre wscale++; 1868253210Sandre 1869253210Sandre /* Only use wscale if it was enabled in the orignal SYN. 
*/ 1870253210Sandre if (cookie.flags.wscale_idx > 0) { 1871253210Sandre sc->sc_requested_r_scale = wscale; 1872253210Sandre sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; 1873253210Sandre sc->sc_flags |= SCF_WINSCALE; 1874253210Sandre } 1875253210Sandre 1876253210Sandre wnd = sbspace(&lso->so_rcv); 1877253210Sandre wnd = imax(wnd, 0); 1878253210Sandre wnd = imin(wnd, TCP_MAXWIN); 1879253210Sandre sc->sc_wnd = wnd; 1880253210Sandre 1881253210Sandre if (cookie.flags.sack_ok) 1882253210Sandre sc->sc_flags |= SCF_SACK; 1883253210Sandre 1884253210Sandre if (to->to_flags & TOF_TS) { 1885162277Sandre sc->sc_flags |= SCF_TIMESTAMP; 1886162277Sandre sc->sc_tsreflect = to->to_tsval; 1887169686Sandre sc->sc_ts = to->to_tsecr; 1888231767Sbz sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); 1889253210Sandre } 1890162277Sandre 1891253210Sandre if (to->to_flags & TOF_SIGNATURE) 1892253210Sandre sc->sc_flags |= SCF_SIGNATURE; 1893162277Sandre 1894159695Sandre sc->sc_rxmits = 0; 1895162277Sandre 1896190948Srwatson TCPSTAT_INC(tcps_sc_recvcookie); 189788180Sjlemon return (sc); 189888180Sjlemon} 1899171605Ssilby 1900253210Sandre#ifdef INVARIANTS 1901253210Sandrestatic int 1902253210Sandresyncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, 1903253210Sandre struct syncache *sc, struct tcphdr *th, struct tcpopt *to, 1904253210Sandre struct socket *lso) 1905253210Sandre{ 1906253210Sandre struct syncache scs, *scx; 1907253210Sandre char *s; 1908253210Sandre 1909253210Sandre bzero(&scs, sizeof(scs)); 1910253210Sandre scx = syncookie_lookup(inc, sch, &scs, th, to, lso); 1911253210Sandre 1912253210Sandre if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) 1913253210Sandre return (0); 1914253210Sandre 1915253210Sandre if (scx != NULL) { 1916253210Sandre if (sc->sc_peer_mss != scx->sc_peer_mss) 1917253210Sandre log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", 1918253210Sandre s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); 1919253210Sandre 1920253210Sandre 
if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) 1921253210Sandre log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", 1922253210Sandre s, __func__, sc->sc_requested_r_scale, 1923253210Sandre scx->sc_requested_r_scale); 1924253210Sandre 1925253210Sandre if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) 1926253210Sandre log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", 1927253210Sandre s, __func__, sc->sc_requested_s_scale, 1928253210Sandre scx->sc_requested_s_scale); 1929253210Sandre 1930253210Sandre if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) 1931253210Sandre log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); 1932253210Sandre } 1933253210Sandre 1934253210Sandre if (s != NULL) 1935253210Sandre free(s, M_TCPLOG); 1936253210Sandre return (0); 1937253210Sandre} 1938253210Sandre#endif /* INVARIANTS */ 1939253210Sandre 1940253210Sandrestatic void 1941253210Sandresyncookie_reseed(void *arg) 1942253210Sandre{ 1943253210Sandre struct tcp_syncache *sc = arg; 1944253210Sandre uint8_t *secbits; 1945253210Sandre int secbit; 1946253210Sandre 1947253210Sandre /* 1948253210Sandre * Reseeding the secret doesn't have to be protected by a lock. 1949253210Sandre * It only must be ensured that the new random values are visible 1950253210Sandre * to all CPUs in a SMP environment. The atomic with release 1951253210Sandre * semantics ensures that. 1952253210Sandre */ 1953253210Sandre secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; 1954253210Sandre secbits = sc->secret.key[secbit]; 1955253210Sandre arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); 1956253210Sandre atomic_add_rel_int(&sc->secret.oddeven, 1); 1957253210Sandre 1958253210Sandre /* Reschedule ourself. */ 1959253210Sandre callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); 1960253210Sandre} 1961253210Sandre 1962171605Ssilby/* 1963171605Ssilby * Returns the current number of syncache entries. 
This number 1964171605Ssilby * will probably change before you get around to calling 1965171605Ssilby * syncache_pcblist. 1966171605Ssilby */ 1967171605Ssilbyint 1968171605Ssilbysyncache_pcbcount(void) 1969171605Ssilby{ 1970171605Ssilby struct syncache_head *sch; 1971171605Ssilby int count, i; 1972171605Ssilby 1973181803Sbz for (count = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { 1974171605Ssilby /* No need to lock for a read. */ 1975181803Sbz sch = &V_tcp_syncache.hashbase[i]; 1976171605Ssilby count += sch->sch_length; 1977171605Ssilby } 1978171605Ssilby return count; 1979171605Ssilby} 1980171605Ssilby 1981171605Ssilby/* 1982171605Ssilby * Exports the syncache entries to userland so that netstat can display 1983171605Ssilby * them alongside the other sockets. This function is intended to be 1984171605Ssilby * called only from tcp_pcblist. 1985171605Ssilby * 1986171605Ssilby * Due to concurrency on an active system, the number of pcbs exported 1987171605Ssilby * may have no relation to max_pcbs. max_pcbs merely indicates the 1988171605Ssilby * amount of space the caller allocated for this function to use. 
1989171605Ssilby */ 1990171605Ssilbyint 1991171605Ssilbysyncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) 1992171605Ssilby{ 1993171605Ssilby struct xtcpcb xt; 1994171605Ssilby struct syncache *sc; 1995171605Ssilby struct syncache_head *sch; 1996171605Ssilby int count, error, i; 1997171605Ssilby 1998181803Sbz for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { 1999181803Sbz sch = &V_tcp_syncache.hashbase[i]; 2000171605Ssilby SCH_LOCK(sch); 2001171605Ssilby TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { 2002171605Ssilby if (count >= max_pcbs) { 2003171605Ssilby SCH_UNLOCK(sch); 2004171605Ssilby goto exit; 2005171605Ssilby } 2006182056Sbz if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) 2007182056Sbz continue; 2008171605Ssilby bzero(&xt, sizeof(xt)); 2009171605Ssilby xt.xt_len = sizeof(xt); 2010186222Sbz if (sc->sc_inc.inc_flags & INC_ISIPV6) 2011171605Ssilby xt.xt_inp.inp_vflag = INP_IPV6; 2012171605Ssilby else 2013171605Ssilby xt.xt_inp.inp_vflag = INP_IPV4; 2014171605Ssilby bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); 2015171605Ssilby xt.xt_tp.t_inpcb = &xt.xt_inp; 2016171605Ssilby xt.xt_tp.t_state = TCPS_SYN_RECEIVED; 2017171605Ssilby xt.xt_socket.xso_protocol = IPPROTO_TCP; 2018171605Ssilby xt.xt_socket.xso_len = sizeof (struct xsocket); 2019171605Ssilby xt.xt_socket.so_type = SOCK_STREAM; 2020171605Ssilby xt.xt_socket.so_state = SS_ISCONNECTING; 2021171605Ssilby error = SYSCTL_OUT(req, &xt, sizeof xt); 2022171605Ssilby if (error) { 2023171605Ssilby SCH_UNLOCK(sch); 2024171605Ssilby goto exit; 2025171605Ssilby } 2026171605Ssilby count++; 2027171605Ssilby } 2028171605Ssilby SCH_UNLOCK(sch); 2029171605Ssilby } 2030171605Ssilbyexit: 2031171605Ssilby *pcbs_exported = count; 2032171605Ssilby return error; 2033171605Ssilby} 2034