tcp_syncache.c revision 163606
186764Sjlemon/*-
2141063Srwatson * Copyright (c) 2001 McAfee, Inc.
3159695Sandre * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
486764Sjlemon * All rights reserved.
586764Sjlemon *
686764Sjlemon * This software was developed for the FreeBSD Project by Jonathan Lemon
7141063Srwatson * and McAfee Research, the Security Research Division of McAfee, Inc. under
8141063Srwatson * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
986764Sjlemon * DARPA CHATS research program.
1086764Sjlemon *
1186764Sjlemon * Redistribution and use in source and binary forms, with or without
1286764Sjlemon * modification, are permitted provided that the following conditions
1386764Sjlemon * are met:
1486764Sjlemon * 1. Redistributions of source code must retain the above copyright
1586764Sjlemon *    notice, this list of conditions and the following disclaimer.
1686764Sjlemon * 2. Redistributions in binary form must reproduce the above copyright
1786764Sjlemon *    notice, this list of conditions and the following disclaimer in the
1886764Sjlemon *    documentation and/or other materials provided with the distribution.
1986764Sjlemon *
2086764Sjlemon * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
2186764Sjlemon * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2286764Sjlemon * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2386764Sjlemon * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
2486764Sjlemon * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2586764Sjlemon * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2686764Sjlemon * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2786764Sjlemon * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2886764Sjlemon * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2986764Sjlemon * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3086764Sjlemon * SUCH DAMAGE.
3186764Sjlemon *
3286764Sjlemon * $FreeBSD: head/sys/netinet/tcp_syncache.c 163606 2006-10-22 11:52:19Z rwatson $
3386764Sjlemon */
3486764Sjlemon
35125680Sbms#include "opt_inet.h"
3686764Sjlemon#include "opt_inet6.h"
3786764Sjlemon#include "opt_ipsec.h"
38101106Srwatson#include "opt_mac.h"
3986764Sjlemon
4086764Sjlemon#include <sys/param.h>
4186764Sjlemon#include <sys/systm.h>
4286764Sjlemon#include <sys/kernel.h>
4386764Sjlemon#include <sys/sysctl.h>
44159695Sandre#include <sys/lock.h>
45159695Sandre#include <sys/mutex.h>
4686764Sjlemon#include <sys/malloc.h>
4786764Sjlemon#include <sys/mbuf.h>
4886764Sjlemon#include <sys/md5.h>
4986764Sjlemon#include <sys/proc.h>		/* for proc0 declaration */
5086764Sjlemon#include <sys/random.h>
5186764Sjlemon#include <sys/socket.h>
5286764Sjlemon#include <sys/socketvar.h>
5386764Sjlemon
54162278Sandre#include <vm/uma.h>
55162278Sandre
5686764Sjlemon#include <net/if.h>
5786764Sjlemon#include <net/route.h>
5886764Sjlemon
5986764Sjlemon#include <netinet/in.h>
6086764Sjlemon#include <netinet/in_systm.h>
6186764Sjlemon#include <netinet/ip.h>
6286764Sjlemon#include <netinet/in_var.h>
6386764Sjlemon#include <netinet/in_pcb.h>
6486764Sjlemon#include <netinet/ip_var.h>
65152592Sandre#include <netinet/ip_options.h>
6686764Sjlemon#ifdef INET6
6786764Sjlemon#include <netinet/ip6.h>
6886764Sjlemon#include <netinet/icmp6.h>
6986764Sjlemon#include <netinet6/nd6.h>
7086764Sjlemon#include <netinet6/ip6_var.h>
7186764Sjlemon#include <netinet6/in6_pcb.h>
7286764Sjlemon#endif
7386764Sjlemon#include <netinet/tcp.h>
7486764Sjlemon#include <netinet/tcp_fsm.h>
7586764Sjlemon#include <netinet/tcp_seq.h>
7686764Sjlemon#include <netinet/tcp_timer.h>
7786764Sjlemon#include <netinet/tcp_var.h>
7886764Sjlemon#ifdef INET6
7986764Sjlemon#include <netinet6/tcp6_var.h>
8086764Sjlemon#endif
8186764Sjlemon
8286764Sjlemon#ifdef IPSEC
8386764Sjlemon#include <netinet6/ipsec.h>
8486764Sjlemon#ifdef INET6
8586764Sjlemon#include <netinet6/ipsec6.h>
8686764Sjlemon#endif
8786764Sjlemon#endif /*IPSEC*/
8886764Sjlemon
89105199Ssam#ifdef FAST_IPSEC
90105199Ssam#include <netipsec/ipsec.h>
91105199Ssam#ifdef INET6
92105199Ssam#include <netipsec/ipsec6.h>
93105199Ssam#endif
94105199Ssam#include <netipsec/key.h>
95105199Ssam#endif /*FAST_IPSEC*/
96105199Ssam
9786764Sjlemon#include <machine/in_cksum.h>
9886764Sjlemon
99163606Srwatson#include <security/mac/mac_framework.h>
100163606Srwatson
10188180Sjlemonstatic int tcp_syncookies = 1;
10288180SjlemonSYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
103133874Srwatson    &tcp_syncookies, 0,
10488180Sjlemon    "Use TCP SYN cookies if the syncache overflows");
10588180Sjlemon
106162277Sandrestatic int tcp_syncookiesonly = 0;
107162277SandreSYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_RW,
108162277Sandre    &tcp_syncookiesonly, 0,
109162277Sandre    "Use only TCP SYN cookies");
110162277Sandre
111162277Sandre#define	SYNCOOKIE_SECRET_SIZE	8	/* dwords */
112162277Sandre#define	SYNCOOKIE_LIFETIME	16	/* seconds */
113162277Sandre
114159725Sandrestruct syncache {
115159725Sandre	TAILQ_ENTRY(syncache)	sc_hash;
116159725Sandre	struct		in_conninfo sc_inc;	/* addresses */
117159725Sandre	u_long		sc_rxttime;		/* retransmit time */
118159725Sandre	u_int16_t	sc_rxmits;		/* retransmit counter */
119159725Sandre
120159950Sandre	u_int32_t	sc_tsreflect;		/* timestamp to reflect */
121162277Sandre	u_int32_t	sc_ts;			/* our timestamp to send */
122162277Sandre	u_int32_t	sc_tsoff;		/* ts offset w/ syncookies */
123159725Sandre	u_int32_t	sc_flowlabel;		/* IPv6 flowlabel */
124159725Sandre	tcp_seq		sc_irs;			/* seq from peer */
125159725Sandre	tcp_seq		sc_iss;			/* our ISS */
126159725Sandre	struct		mbuf *sc_ipopts;	/* source route */
127159725Sandre
128159725Sandre	u_int16_t	sc_peer_mss;		/* peer's MSS */
129159725Sandre	u_int16_t	sc_wnd;			/* advertised window */
130159725Sandre	u_int8_t	sc_ip_ttl;		/* IPv4 TTL */
131159725Sandre	u_int8_t	sc_ip_tos;		/* IPv4 TOS */
132159725Sandre	u_int8_t	sc_requested_s_scale:4,
133159950Sandre			sc_requested_r_scale:4;
134159725Sandre	u_int8_t	sc_flags;
135159725Sandre#define SCF_NOOPT	0x01			/* no TCP options */
136159725Sandre#define SCF_WINSCALE	0x02			/* negotiated window scaling */
137159725Sandre#define SCF_TIMESTAMP	0x04			/* negotiated timestamps */
138159950Sandre						/* MSS is implicit */
139159725Sandre#define SCF_UNREACH	0x10			/* icmp unreachable received */
140159725Sandre#define SCF_SIGNATURE	0x20			/* send MD5 digests */
141159725Sandre#define SCF_SACK	0x80			/* send SACK option */
142159725Sandre};
143159725Sandre
144159725Sandrestruct syncache_head {
145159725Sandre	struct mtx	sch_mtx;
146159725Sandre	TAILQ_HEAD(sch_head, syncache)	sch_bucket;
147159725Sandre	struct callout	sch_timer;
148159725Sandre	int		sch_nextc;
149159725Sandre	u_int		sch_length;
150162277Sandre	u_int		sch_oddeven;
151162277Sandre	u_int32_t	sch_secbits_odd[SYNCOOKIE_SECRET_SIZE];
152162277Sandre	u_int32_t	sch_secbits_even[SYNCOOKIE_SECRET_SIZE];
153162277Sandre	u_int		sch_reseed;		/* time_uptime, seconds */
154159725Sandre};
155159725Sandre
15686764Sjlemonstatic void	 syncache_drop(struct syncache *, struct syncache_head *);
15786764Sjlemonstatic void	 syncache_free(struct syncache *);
15888180Sjlemonstatic void	 syncache_insert(struct syncache *, struct syncache_head *);
15986764Sjlemonstruct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
16086764Sjlemonstatic int	 syncache_respond(struct syncache *, struct mbuf *);
161133874Srwatsonstatic struct	 socket *syncache_socket(struct syncache *, struct socket *,
16296602Srwatson		    struct mbuf *m);
16386764Sjlemonstatic void	 syncache_timer(void *);
164162277Sandrestatic void	 syncookie_generate(struct syncache_head *, struct syncache *,
165162277Sandre		    u_int32_t *);
166159697Sandrestatic struct syncache
167162277Sandre		*syncookie_lookup(struct in_conninfo *, struct syncache_head *,
168162277Sandre		    struct syncache *, struct tcpopt *, struct tcphdr *,
169159697Sandre		    struct socket *);
17086764Sjlemon
/*
 * The SYN,ACK is retransmitted fewer times than TCP_MAXRXTSHIFT would allow.
 * Three retransmits correspond to a timeout of (1 + 2 + 4 + 8 == 15) seconds;
 * by then the user has most likely given up on the connection attempt.
 */
#define SYNCACHE_MAXREXMTS		3

/* Default table geometry; the values are arbitrary. */
#define TCP_SYNCACHE_HASHSIZE		512
#define TCP_SYNCACHE_BUCKETLIMIT	30
18186764Sjlemon
18286764Sjlemonstruct tcp_syncache {
18386764Sjlemon	struct	syncache_head *hashbase;
18492760Sjeff	uma_zone_t zone;
18586764Sjlemon	u_int	hashsize;
18686764Sjlemon	u_int	hashmask;
18786764Sjlemon	u_int	bucket_limit;
188159695Sandre	u_int	cache_count;		/* XXX: unprotected */
18986764Sjlemon	u_int	cache_limit;
19086764Sjlemon	u_int	rexmt_limit;
19186764Sjlemon	u_int	hash_secret;
19286764Sjlemon};
19386764Sjlemonstatic struct tcp_syncache tcp_syncache;
19486764Sjlemon
19586764SjlemonSYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
19686764Sjlemon
197121307SsilbySYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
19886764Sjlemon     &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
19986764Sjlemon
200121307SsilbySYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
20186764Sjlemon     &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
20286764Sjlemon
20386764SjlemonSYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
20486764Sjlemon     &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
20586764Sjlemon
206121307SsilbySYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
20786764Sjlemon     &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
20886764Sjlemon
20986764SjlemonSYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
21086764Sjlemon     &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
21186764Sjlemon
21286764Sjlemonstatic MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
21386764Sjlemon
/*
 * Hash an IPv4 connection (foreign address and both ports, mixed with the
 * per-boot secret) into a bucket row index.  `inc' is a pointer to a
 * struct in_conninfo, `mask' is hashsize - 1.  Both arguments are fully
 * parenthesized so that arbitrary expressions may be passed.
 */
#define SYNCACHE_HASH(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc_faddr.s_addr ^					\
	  ((inc)->inc_faddr.s_addr >> 16) ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/* IPv6 variant: uses the first and last 32-bit words of the address. */
#define SYNCACHE_HASH6(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc6_faddr.s6_addr32[0] ^				\
	  (inc)->inc6_faddr.s6_addr32[3] ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/* True if two IPv4 endpoint descriptions name the same connection. */
#define ENDPTS_EQ(a, b) (						\
	(a)->ie_fport == (b)->ie_fport &&				\
	(a)->ie_lport == (b)->ie_lport &&				\
	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
)

/* IPv6 variant: bytewise comparison of the whole endpoint structure. */
#define ENDPTS6_EQ(a, b) (memcmp((a), (b), sizeof(*(a))) == 0)
23486764Sjlemon
/*
 * Account for one more transmission of the SYN,ACK of `sc' and compute the
 * time (in ticks) of its next retransmit using the TCP backoff table.  The
 * bucket row `sch' must be locked by the caller.
 *
 *  co != 0: the caller is not the bucket timer (e.g. the insert path).
 *           Arm the bucket callout if it is not pending, or pull it in if
 *           this entry is due earlier than what is currently scheduled.
 *  co == 0: the caller is the bucket timer handler, which re-arms the
 *           callout itself once it has walked the row; only remember the
 *           deadline if it is earlier than the recorded one.
 *
 * The previous version tested !(co), which inverted this: the insert path
 * (co == 1) never started the callout at all.  Deadlines are compared via
 * a signed difference so that a wrap of `ticks' is handled.
 */
#define SYNCACHE_TIMEOUT(sc, sch, co) do {				\
	(sc)->sc_rxmits++;						\
	(sc)->sc_rxttime = ticks +					\
		TCPTV_RTOBASE * tcp_backoff[(sc)->sc_rxmits - 1];	\
	if (co) {							\
		if (!callout_pending(&(sch)->sch_timer) ||		\
		    (int)((sc)->sc_rxttime -				\
		    (u_long)(sch)->sch_nextc) < 0) {			\
			(sch)->sch_nextc = (sc)->sc_rxttime;		\
			callout_reset(&(sch)->sch_timer,		\
				(sch)->sch_nextc - ticks,		\
				syncache_timer, (void *)(sch));		\
		}							\
	} else if ((int)((sc)->sc_rxttime -				\
	    (u_long)(sch)->sch_nextc) < 0)				\
		(sch)->sch_nextc = (sc)->sc_rxttime;			\
} while (0)

/* Per bucket row locking. */
#define	SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
#define	SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
#define	SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)
250159695Sandre
251159695Sandre/*
252159695Sandre * Requires the syncache entry to be already removed from the bucket list.
253159695Sandre */
25486764Sjlemonstatic void
25586764Sjlemonsyncache_free(struct syncache *sc)
25686764Sjlemon{
25786764Sjlemon	if (sc->sc_ipopts)
25886764Sjlemon		(void) m_free(sc->sc_ipopts);
259122922Sandre
26092760Sjeff	uma_zfree(tcp_syncache.zone, sc);
26186764Sjlemon}
26286764Sjlemon
26386764Sjlemonvoid
26486764Sjlemonsyncache_init(void)
26586764Sjlemon{
26686764Sjlemon	int i;
26786764Sjlemon
26886764Sjlemon	tcp_syncache.cache_count = 0;
26986764Sjlemon	tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
27086764Sjlemon	tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
27186764Sjlemon	tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
27286764Sjlemon	tcp_syncache.hash_secret = arc4random();
27386764Sjlemon
274133874Srwatson	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
27586764Sjlemon	    &tcp_syncache.hashsize);
276133874Srwatson	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
27786764Sjlemon	    &tcp_syncache.bucket_limit);
278149455Sglebius	if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
279133874Srwatson		printf("WARNING: syncache hash size is not a power of 2.\n");
280149455Sglebius		tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
281133874Srwatson	}
28286764Sjlemon	tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
28386764Sjlemon
284159695Sandre	/* Set limits. */
285159695Sandre	tcp_syncache.cache_limit =
286159695Sandre	    tcp_syncache.hashsize * tcp_syncache.bucket_limit;
287159695Sandre	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
288159695Sandre	    &tcp_syncache.cache_limit);
289159695Sandre
29086764Sjlemon	/* Allocate the hash table. */
29186764Sjlemon	MALLOC(tcp_syncache.hashbase, struct syncache_head *,
29286764Sjlemon	    tcp_syncache.hashsize * sizeof(struct syncache_head),
293159787Sandre	    M_SYNCACHE, M_WAITOK | M_ZERO);
29486764Sjlemon
29586764Sjlemon	/* Initialize the hash buckets. */
29686764Sjlemon	for (i = 0; i < tcp_syncache.hashsize; i++) {
29786764Sjlemon		TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
298159695Sandre		mtx_init(&tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
299159695Sandre			 NULL, MTX_DEF);
300159695Sandre		callout_init_mtx(&tcp_syncache.hashbase[i].sch_timer,
301159695Sandre			 &tcp_syncache.hashbase[i].sch_mtx, 0);
30286764Sjlemon		tcp_syncache.hashbase[i].sch_length = 0;
30386764Sjlemon	}
30486764Sjlemon
305159695Sandre	/* Create the syncache entry zone. */
30692760Sjeff	tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
307159695Sandre	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
30892760Sjeff	uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
30986764Sjlemon}
31086764Sjlemon
311159695Sandre/*
312159695Sandre * Inserts a syncache entry into the specified bucket row.
313159695Sandre * Locks and unlocks the syncache_head autonomously.
314159695Sandre */
31588180Sjlemonstatic void
316159697Sandresyncache_insert(struct syncache *sc, struct syncache_head *sch)
31786764Sjlemon{
31886764Sjlemon	struct syncache *sc2;
31986764Sjlemon
320159695Sandre	SCH_LOCK(sch);
321122496Ssam
32286764Sjlemon	/*
323159695Sandre	 * Make sure that we don't overflow the per-bucket limit.
324159695Sandre	 * If the bucket is full, toss the oldest element.
32586764Sjlemon	 */
32686764Sjlemon	if (sch->sch_length >= tcp_syncache.bucket_limit) {
327159695Sandre		KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
328159695Sandre			("sch->sch_length incorrect"));
329159695Sandre		sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
33086764Sjlemon		syncache_drop(sc2, sch);
33186764Sjlemon		tcpstat.tcps_sc_bucketoverflow++;
33286764Sjlemon	}
33386764Sjlemon
33486764Sjlemon	/* Put it into the bucket. */
335159695Sandre	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
33686764Sjlemon	sch->sch_length++;
337159695Sandre
338159695Sandre	/* Reinitialize the bucket row's timer. */
339159695Sandre	SYNCACHE_TIMEOUT(sc, sch, 1);
340159695Sandre
341159695Sandre	SCH_UNLOCK(sch);
342159695Sandre
34386764Sjlemon	tcp_syncache.cache_count++;
34486764Sjlemon	tcpstat.tcps_sc_added++;
34586764Sjlemon}
34686764Sjlemon
347159695Sandre/*
348159695Sandre * Remove and free entry from syncache bucket row.
349159695Sandre * Expects locked syncache head.
350159695Sandre */
35186764Sjlemonstatic void
352159697Sandresyncache_drop(struct syncache *sc, struct syncache_head *sch)
35386764Sjlemon{
35486764Sjlemon
355159695Sandre	SCH_LOCK_ASSERT(sch);
35686764Sjlemon
35786764Sjlemon	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
35886764Sjlemon	sch->sch_length--;
35986764Sjlemon
36086764Sjlemon	syncache_free(sc);
361159695Sandre	tcp_syncache.cache_count--;
36286764Sjlemon}
36386764Sjlemon
36486764Sjlemon/*
36586764Sjlemon * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
36686764Sjlemon * If we have retransmitted an entry the maximum number of times, expire it.
367159695Sandre * One separate timer for each bucket row.
36886764Sjlemon */
36986764Sjlemonstatic void
370159697Sandresyncache_timer(void *xsch)
37186764Sjlemon{
372159695Sandre	struct syncache_head *sch = (struct syncache_head *)xsch;
37386764Sjlemon	struct syncache *sc, *nsc;
374159695Sandre	int tick = ticks;
37586764Sjlemon
376159695Sandre	/* NB: syncache_head has already been locked by the callout. */
377159695Sandre	SCH_LOCK_ASSERT(sch);
37886764Sjlemon
379159695Sandre	TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
380159695Sandre		/*
381159695Sandre		 * We do not check if the listen socket still exists
382159695Sandre		 * and accept the case where the listen socket may be
383159695Sandre		 * gone by the time we resend the SYN/ACK.  We do
384159695Sandre		 * not expect this to happens often. If it does,
385159695Sandre		 * then the RST will be sent by the time the remote
386159695Sandre		 * host does the SYN/ACK->ACK.
387159695Sandre		 */
388159695Sandre		if (sc->sc_rxttime >= tick) {
389159695Sandre			if (sc->sc_rxttime < sch->sch_nextc)
390159695Sandre				sch->sch_nextc = sc->sc_rxttime;
391159695Sandre			continue;
392159695Sandre		}
393159695Sandre
394159695Sandre		if (sc->sc_rxmits > tcp_syncache.rexmt_limit) {
395159695Sandre			syncache_drop(sc, sch);
39686764Sjlemon			tcpstat.tcps_sc_stale++;
39786764Sjlemon			continue;
39886764Sjlemon		}
399159695Sandre
40086764Sjlemon		(void) syncache_respond(sc, NULL);
40186764Sjlemon		tcpstat.tcps_sc_retransmitted++;
402159695Sandre		SYNCACHE_TIMEOUT(sc, sch, 0);
40386764Sjlemon	}
404159695Sandre	if (!TAILQ_EMPTY(&(sch)->sch_bucket))
405159695Sandre		callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick,
406159695Sandre			syncache_timer, (void *)(sch));
40786764Sjlemon}
40886764Sjlemon
40986764Sjlemon/*
41086764Sjlemon * Find an entry in the syncache.
411159695Sandre * Returns always with locked syncache_head plus a matching entry or NULL.
41286764Sjlemon */
41386764Sjlemonstruct syncache *
414159697Sandresyncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
41586764Sjlemon{
41686764Sjlemon	struct syncache *sc;
41786764Sjlemon	struct syncache_head *sch;
41886764Sjlemon
41986764Sjlemon#ifdef INET6
42086764Sjlemon	if (inc->inc_isipv6) {
42186764Sjlemon		sch = &tcp_syncache.hashbase[
42286764Sjlemon		    SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
42386764Sjlemon		*schp = sch;
424159695Sandre
425159695Sandre		SCH_LOCK(sch);
426159695Sandre
427159695Sandre		/* Circle through bucket row to find matching entry. */
42886764Sjlemon		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
429122496Ssam			if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
43086764Sjlemon				return (sc);
43186764Sjlemon		}
43286764Sjlemon	} else
43386764Sjlemon#endif
43486764Sjlemon	{
43586764Sjlemon		sch = &tcp_syncache.hashbase[
43686764Sjlemon		    SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
43786764Sjlemon		*schp = sch;
438159695Sandre
439159695Sandre		SCH_LOCK(sch);
440159695Sandre
441159695Sandre		/* Circle through bucket row to find matching entry. */
44286764Sjlemon		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
44386764Sjlemon#ifdef INET6
44486764Sjlemon			if (sc->sc_inc.inc_isipv6)
44586764Sjlemon				continue;
44686764Sjlemon#endif
447122496Ssam			if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
44886764Sjlemon				return (sc);
44986764Sjlemon		}
45086764Sjlemon	}
451159695Sandre	SCH_LOCK_ASSERT(*schp);
452159695Sandre	return (NULL);			/* always returns with locked sch */
45386764Sjlemon}
45486764Sjlemon
45586764Sjlemon/*
45686764Sjlemon * This function is called when we get a RST for a
45786764Sjlemon * non-existent connection, so that we can see if the
45886764Sjlemon * connection is in the syn cache.  If it is, zap it.
45986764Sjlemon */
46086764Sjlemonvoid
461159697Sandresyncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
46286764Sjlemon{
46386764Sjlemon	struct syncache *sc;
46486764Sjlemon	struct syncache_head *sch;
46586764Sjlemon
466159695Sandre	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
467159695Sandre	SCH_LOCK_ASSERT(sch);
468159695Sandre	if (sc == NULL)
469159695Sandre		goto done;
470122496Ssam
47186764Sjlemon	/*
47286764Sjlemon	 * If the RST bit is set, check the sequence number to see
47386764Sjlemon	 * if this is a valid reset segment.
47486764Sjlemon	 * RFC 793 page 37:
47586764Sjlemon	 *   In all states except SYN-SENT, all reset (RST) segments
47686764Sjlemon	 *   are validated by checking their SEQ-fields.  A reset is
47786764Sjlemon	 *   valid if its sequence number is in the window.
47886764Sjlemon	 *
47986764Sjlemon	 *   The sequence number in the reset segment is normally an
48086764Sjlemon	 *   echo of our outgoing acknowlegement numbers, but some hosts
48186764Sjlemon	 *   send a reset with the sequence number at the rightmost edge
48286764Sjlemon	 *   of our receive window, and we have to handle this case.
48386764Sjlemon	 */
48486764Sjlemon	if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
48586764Sjlemon	    SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
48686764Sjlemon		syncache_drop(sc, sch);
48786764Sjlemon		tcpstat.tcps_sc_reset++;
48886764Sjlemon	}
489159695Sandredone:
490159695Sandre	SCH_UNLOCK(sch);
49186764Sjlemon}
49286764Sjlemon
49386764Sjlemonvoid
494159697Sandresyncache_badack(struct in_conninfo *inc)
49586764Sjlemon{
49686764Sjlemon	struct syncache *sc;
49786764Sjlemon	struct syncache_head *sch;
49886764Sjlemon
499159695Sandre	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
500159695Sandre	SCH_LOCK_ASSERT(sch);
50186764Sjlemon	if (sc != NULL) {
50286764Sjlemon		syncache_drop(sc, sch);
50386764Sjlemon		tcpstat.tcps_sc_badack++;
50486764Sjlemon	}
505159695Sandre	SCH_UNLOCK(sch);
50686764Sjlemon}
50786764Sjlemon
50886764Sjlemonvoid
509159697Sandresyncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
51086764Sjlemon{
51186764Sjlemon	struct syncache *sc;
51286764Sjlemon	struct syncache_head *sch;
51386764Sjlemon
514159695Sandre	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
515159695Sandre	SCH_LOCK_ASSERT(sch);
51686764Sjlemon	if (sc == NULL)
517159695Sandre		goto done;
51886764Sjlemon
51986764Sjlemon	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
52086764Sjlemon	if (ntohl(th->th_seq) != sc->sc_iss)
521159695Sandre		goto done;
52286764Sjlemon
52386764Sjlemon	/*
52486764Sjlemon	 * If we've rertransmitted 3 times and this is our second error,
52586764Sjlemon	 * we remove the entry.  Otherwise, we allow it to continue on.
52686764Sjlemon	 * This prevents us from incorrectly nuking an entry during a
52786764Sjlemon	 * spurious network outage.
52886764Sjlemon	 *
52986764Sjlemon	 * See tcp_notify().
53086764Sjlemon	 */
531159695Sandre	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
53286764Sjlemon		sc->sc_flags |= SCF_UNREACH;
533159695Sandre		goto done;
53486764Sjlemon	}
53586764Sjlemon	syncache_drop(sc, sch);
53686764Sjlemon	tcpstat.tcps_sc_unreach++;
537159695Sandredone:
538159695Sandre	SCH_UNLOCK(sch);
53986764Sjlemon}
54086764Sjlemon
54186764Sjlemon/*
54286764Sjlemon * Build a new TCP socket structure from a syncache entry.
54386764Sjlemon */
54486764Sjlemonstatic struct socket *
545159697Sandresyncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
54686764Sjlemon{
54786764Sjlemon	struct inpcb *inp = NULL;
54886764Sjlemon	struct socket *so;
54986764Sjlemon	struct tcpcb *tp;
55086764Sjlemon
551130555Srwatson	NET_ASSERT_GIANT();
552122496Ssam	INP_INFO_WLOCK_ASSERT(&tcbinfo);
553122496Ssam
55486764Sjlemon	/*
55586764Sjlemon	 * Ok, create the full blown connection, and set things up
55686764Sjlemon	 * as they would have been set up if we had created the
55786764Sjlemon	 * connection when the SYN arrived.  If we can't create
55886764Sjlemon	 * the connection, abort it.
55986764Sjlemon	 */
56086764Sjlemon	so = sonewconn(lso, SS_ISCONNECTED);
56186764Sjlemon	if (so == NULL) {
56286764Sjlemon		/*
56386764Sjlemon		 * Drop the connection; we will send a RST if the peer
56486764Sjlemon		 * retransmits the ACK,
56586764Sjlemon		 */
56686764Sjlemon		tcpstat.tcps_listendrop++;
567122496Ssam		goto abort2;
56886764Sjlemon	}
569101106Srwatson#ifdef MAC
570130398Srwatson	SOCK_LOCK(so);
571101106Srwatson	mac_set_socket_peer_from_mbuf(m, so);
572130398Srwatson	SOCK_UNLOCK(so);
573101106Srwatson#endif
57486764Sjlemon
57586764Sjlemon	inp = sotoinpcb(so);
576122496Ssam	INP_LOCK(inp);
57786764Sjlemon
578159695Sandre	/* Insert new socket into PCB hash list. */
57991492Sume	inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
58086764Sjlemon#ifdef INET6
58186764Sjlemon	if (sc->sc_inc.inc_isipv6) {
58286764Sjlemon		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
58386764Sjlemon	} else {
58486764Sjlemon		inp->inp_vflag &= ~INP_IPV6;
58586764Sjlemon		inp->inp_vflag |= INP_IPV4;
58686764Sjlemon#endif
58786764Sjlemon		inp->inp_laddr = sc->sc_inc.inc_laddr;
58886764Sjlemon#ifdef INET6
58986764Sjlemon	}
59086764Sjlemon#endif
59186764Sjlemon	inp->inp_lport = sc->sc_inc.inc_lport;
59286764Sjlemon	if (in_pcbinshash(inp) != 0) {
59386764Sjlemon		/*
59486764Sjlemon		 * Undo the assignments above if we failed to
59586764Sjlemon		 * put the PCB on the hash lists.
59686764Sjlemon		 */
59786764Sjlemon#ifdef INET6
59886764Sjlemon		if (sc->sc_inc.inc_isipv6)
59986764Sjlemon			inp->in6p_laddr = in6addr_any;
600133874Srwatson		else
60186764Sjlemon#endif
60286764Sjlemon			inp->inp_laddr.s_addr = INADDR_ANY;
60386764Sjlemon		inp->inp_lport = 0;
60486764Sjlemon		goto abort;
60586764Sjlemon	}
60686764Sjlemon#ifdef IPSEC
607159697Sandre	/* Copy old policy into new socket's. */
608122062Sume	if (ipsec_copy_pcbpolicy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
609159950Sandre		printf("syncache_socket: could not copy policy\n");
610122062Sume#endif
611122062Sume#ifdef FAST_IPSEC
612159697Sandre	/* Copy old policy into new socket's. */
61386764Sjlemon	if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
614159950Sandre		printf("syncache_socket: could not copy policy\n");
61586764Sjlemon#endif
61686764Sjlemon#ifdef INET6
61786764Sjlemon	if (sc->sc_inc.inc_isipv6) {
61886764Sjlemon		struct inpcb *oinp = sotoinpcb(lso);
61986764Sjlemon		struct in6_addr laddr6;
620124847Sandre		struct sockaddr_in6 sin6;
62186764Sjlemon		/*
62286764Sjlemon		 * Inherit socket options from the listening socket.
62386764Sjlemon		 * Note that in6p_inputopts are not (and should not be)
62486764Sjlemon		 * copied, since it stores previously received options and is
62586764Sjlemon		 * used to detect if each new option is different than the
62686764Sjlemon		 * previous one and hence should be passed to a user.
627133874Srwatson		 * If we copied in6p_inputopts, a user would not be able to
62886764Sjlemon		 * receive options just after calling the accept system call.
62986764Sjlemon		 */
63086764Sjlemon		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
63186764Sjlemon		if (oinp->in6p_outputopts)
63286764Sjlemon			inp->in6p_outputopts =
63386764Sjlemon			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
63486764Sjlemon
635124847Sandre		sin6.sin6_family = AF_INET6;
636124847Sandre		sin6.sin6_len = sizeof(sin6);
637124847Sandre		sin6.sin6_addr = sc->sc_inc.inc6_faddr;
638124847Sandre		sin6.sin6_port = sc->sc_inc.inc_fport;
639124847Sandre		sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
64086764Sjlemon		laddr6 = inp->in6p_laddr;
64186764Sjlemon		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
64286764Sjlemon			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
643127505Spjd		if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
644127505Spjd		    thread0.td_ucred)) {
64586764Sjlemon			inp->in6p_laddr = laddr6;
64686764Sjlemon			goto abort;
64786764Sjlemon		}
648132307Sdwmalone		/* Override flowlabel from in6_pcbconnect. */
649132307Sdwmalone		inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
650132307Sdwmalone		inp->in6p_flowinfo |= sc->sc_flowlabel;
65186764Sjlemon	} else
65286764Sjlemon#endif
65386764Sjlemon	{
65486764Sjlemon		struct in_addr laddr;
655124847Sandre		struct sockaddr_in sin;
65686764Sjlemon
657135274Sandre		inp->inp_options = ip_srcroute(m);
65886764Sjlemon		if (inp->inp_options == NULL) {
65986764Sjlemon			inp->inp_options = sc->sc_ipopts;
66086764Sjlemon			sc->sc_ipopts = NULL;
66186764Sjlemon		}
66286764Sjlemon
663124847Sandre		sin.sin_family = AF_INET;
664124847Sandre		sin.sin_len = sizeof(sin);
665124847Sandre		sin.sin_addr = sc->sc_inc.inc_faddr;
666124847Sandre		sin.sin_port = sc->sc_inc.inc_fport;
667124847Sandre		bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
66886764Sjlemon		laddr = inp->inp_laddr;
66986764Sjlemon		if (inp->inp_laddr.s_addr == INADDR_ANY)
67086764Sjlemon			inp->inp_laddr = sc->sc_inc.inc_laddr;
671127505Spjd		if (in_pcbconnect(inp, (struct sockaddr *)&sin,
672127505Spjd		    thread0.td_ucred)) {
67386764Sjlemon			inp->inp_laddr = laddr;
67486764Sjlemon			goto abort;
67586764Sjlemon		}
67686764Sjlemon	}
67786764Sjlemon	tp = intotcpcb(inp);
67886764Sjlemon	tp->t_state = TCPS_SYN_RECEIVED;
67986764Sjlemon	tp->iss = sc->sc_iss;
68086764Sjlemon	tp->irs = sc->sc_irs;
68186764Sjlemon	tcp_rcvseqinit(tp);
68286764Sjlemon	tcp_sendseqinit(tp);
68386764Sjlemon	tp->snd_wl1 = sc->sc_irs;
68486764Sjlemon	tp->rcv_up = sc->sc_irs + 1;
68586764Sjlemon	tp->rcv_wnd = sc->sc_wnd;
68686764Sjlemon	tp->rcv_adv += tp->rcv_wnd;
68786764Sjlemon
68890982Sjlemon	tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
68986764Sjlemon	if (sc->sc_flags & SCF_NOOPT)
69086764Sjlemon		tp->t_flags |= TF_NOOPT;
691159950Sandre	else {
692159950Sandre		if (sc->sc_flags & SCF_WINSCALE) {
693159950Sandre			tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
694159950Sandre			tp->snd_scale = sc->sc_requested_s_scale;
695159950Sandre			tp->request_r_scale = sc->sc_requested_r_scale;
696159950Sandre		}
697159950Sandre		if (sc->sc_flags & SCF_TIMESTAMP) {
698159950Sandre			tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
699159950Sandre			tp->ts_recent = sc->sc_tsreflect;
700159950Sandre			tp->ts_recent_age = ticks;
701162277Sandre			tp->ts_offset = sc->sc_tsoff;
702159950Sandre		}
703125680Sbms#ifdef TCP_SIGNATURE
704159950Sandre		if (sc->sc_flags & SCF_SIGNATURE)
705159950Sandre			tp->t_flags |= TF_SIGNATURE;
706125783Sbms#endif
707159950Sandre		if (sc->sc_flags & SCF_SACK) {
708159950Sandre			tp->sack_enable = 1;
709159950Sandre			tp->t_flags |= TF_SACK_PERMIT;
710159950Sandre		}
711130989Sps	}
712159695Sandre
713122922Sandre	/*
714122922Sandre	 * Set up MSS and get cached values from tcp_hostcache.
715122922Sandre	 * This might overwrite some of the defaults we just set.
716122922Sandre	 */
71786764Sjlemon	tcp_mss(tp, sc->sc_peer_mss);
71886764Sjlemon
71986764Sjlemon	/*
72086764Sjlemon	 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
72186764Sjlemon	 */
722159695Sandre	if (sc->sc_rxmits > 1)
723133874Srwatson		tp->snd_cwnd = tp->t_maxseg;
72486764Sjlemon	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
72586764Sjlemon
726122496Ssam	INP_UNLOCK(inp);
727122496Ssam
72886764Sjlemon	tcpstat.tcps_accepts++;
72986764Sjlemon	return (so);
73086764Sjlemon
73186764Sjlemonabort:
732122496Ssam	INP_UNLOCK(inp);
733122496Ssamabort2:
73486764Sjlemon	if (so != NULL)
735156763Srwatson		soabort(so);
73686764Sjlemon	return (NULL);
73786764Sjlemon}
73886764Sjlemon
73986764Sjlemon/*
74086764Sjlemon * This function gets called when we receive an ACK for a
74186764Sjlemon * socket in the LISTEN state.  We look up the connection
74286764Sjlemon * in the syncache, and if its there, we pull it out of
74386764Sjlemon * the cache and turn it into a full-blown connection in
74486764Sjlemon * the SYN-RECEIVED state.
74586764Sjlemon */
74686764Sjlemonint
747162277Sandresyncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
748159697Sandre    struct socket **lsop, struct mbuf *m)
74986764Sjlemon{
75086764Sjlemon	struct syncache *sc;
75186764Sjlemon	struct syncache_head *sch;
75286764Sjlemon	struct socket *so;
753162277Sandre	struct syncache scs;
75486764Sjlemon
755159695Sandre	/*
756159695Sandre	 * Global TCP locks are held because we manipulate the PCB lists
757159695Sandre	 * and create a new socket.
758159695Sandre	 */
759122496Ssam	INP_INFO_WLOCK_ASSERT(&tcbinfo);
760122496Ssam
761159695Sandre	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
762159695Sandre	SCH_LOCK_ASSERT(sch);
76388180Sjlemon	if (sc == NULL) {
76488180Sjlemon		/*
765133874Srwatson		 * There is no syncache entry, so see if this ACK is
76688180Sjlemon		 * a returning syncookie.  To do this, first:
76788180Sjlemon		 *  A. See if this socket has had a syncache entry dropped in
76888180Sjlemon		 *     the past.  We don't want to accept a bogus syncookie
769133874Srwatson		 *     if we've never received a SYN.
77088180Sjlemon		 *  B. check that the syncookie is valid.  If it is, then
77188180Sjlemon		 *     cobble up a fake syncache entry, and return.
77288180Sjlemon		 */
773162277Sandre		if (!tcp_syncookies) {
774162277Sandre			SCH_UNLOCK(sch);
775162277Sandre			goto failed;
776162277Sandre		}
777162277Sandre		bzero(&scs, sizeof(scs));
778162277Sandre		sc = syncookie_lookup(inc, sch, &scs, to, th, *lsop);
779159695Sandre		SCH_UNLOCK(sch);
78088180Sjlemon		if (sc == NULL)
781159695Sandre			goto failed;
78288180Sjlemon		tcpstat.tcps_sc_recvcookie++;
783159695Sandre	} else {
784159695Sandre		/* Pull out the entry to unlock the bucket row. */
785159695Sandre		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
786159695Sandre		sch->sch_length--;
787159922Sandre		tcp_syncache.cache_count--;
788159695Sandre		SCH_UNLOCK(sch);
78988180Sjlemon	}
79086764Sjlemon
79186764Sjlemon	/*
79286764Sjlemon	 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
79386764Sjlemon	 */
794159695Sandre	if (th->th_ack != sc->sc_iss + 1)
795159695Sandre		goto failed;
79686764Sjlemon
797159695Sandre	so = syncache_socket(sc, *lsop, m);
798159695Sandre
79986764Sjlemon	if (so == NULL) {
80086764Sjlemon#if 0
80186764Sjlemonresetandabort:
80286764Sjlemon		/* XXXjlemon check this - is this correct? */
80386764Sjlemon		(void) tcp_respond(NULL, m, m, th,
80486764Sjlemon		    th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
80586764Sjlemon#endif
806159695Sandre		m_freem(m);			/* XXX: only needed for above */
80786764Sjlemon		tcpstat.tcps_sc_aborted++;
808162277Sandre		if (sc != &scs) {
809159695Sandre			syncache_insert(sc, sch);  /* try again later */
810159695Sandre			sc = NULL;
811159695Sandre		}
812159695Sandre		goto failed;
813122922Sandre	} else
81486764Sjlemon		tcpstat.tcps_sc_completed++;
815159695Sandre	*lsop = so;
816122922Sandre
817162277Sandre	if (sc != &scs)
818162277Sandre		syncache_free(sc);
819159695Sandre	return (1);
820159695Sandrefailed:
821162277Sandre	if (sc != NULL && sc != &scs)
82286764Sjlemon		syncache_free(sc);
823159695Sandre	return (0);
82486764Sjlemon}
82586764Sjlemon
82686764Sjlemon/*
82786764Sjlemon * Given a LISTEN socket and an inbound SYN request, add
82886764Sjlemon * this to the syn cache, and send back a segment:
82986764Sjlemon *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
83086764Sjlemon * to the source.
83186764Sjlemon *
83286764Sjlemon * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
83386764Sjlemon * Doing so would require that we hold onto the data and deliver it
83486764Sjlemon * to the application.  However, if we are the target of a SYN-flood
83586764Sjlemon * DoS attack, an attacker could send data which would eventually
83686764Sjlemon * consume all available buffer space if it were ACKed.  By not ACKing
83786764Sjlemon * the data, we avoid this DoS scenario.
83886764Sjlemon */
83986764Sjlemonint
840159697Sandresyncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
841159697Sandre    struct inpcb *inp, struct socket **lsop, struct mbuf *m)
84286764Sjlemon{
84386764Sjlemon	struct tcpcb *tp;
84486764Sjlemon	struct socket *so;
84586764Sjlemon	struct syncache *sc = NULL;
84686764Sjlemon	struct syncache_head *sch;
84786764Sjlemon	struct mbuf *ipopts = NULL;
848132307Sdwmalone	u_int32_t flowtmp;
849159727Sandre	int win, sb_hiwat, ip_ttl, ip_tos, noopt;
850159701Sandre#ifdef INET6
851159701Sandre	int autoflowlabel = 0;
852159701Sandre#endif
853162277Sandre	struct syncache scs;
85486764Sjlemon
855122496Ssam	INP_INFO_WLOCK_ASSERT(&tcbinfo);
856159695Sandre	INP_LOCK_ASSERT(inp);			/* listen socket */
857122496Ssam
858159695Sandre	/*
859159695Sandre	 * Combine all so/tp operations very early to drop the INP lock as
860159695Sandre	 * soon as possible.
861159695Sandre	 */
862159695Sandre	so = *lsop;
86386764Sjlemon	tp = sototcpcb(so);
86486764Sjlemon
865159695Sandre#ifdef INET6
866159695Sandre	if (inc->inc_isipv6 &&
867159695Sandre	    (inp->in6p_flags & IN6P_AUTOFLOWLABEL))
868159695Sandre		autoflowlabel = 1;
869159695Sandre#endif
870159695Sandre	ip_ttl = inp->inp_ip_ttl;
871159695Sandre	ip_tos = inp->inp_ip_tos;
872159695Sandre	win = sbspace(&so->so_rcv);
873159695Sandre	sb_hiwat = so->so_rcv.sb_hiwat;
874159727Sandre	noopt = (tp->t_flags & TF_NOOPT);
875159695Sandre
876159695Sandre	so = NULL;
877159695Sandre	tp = NULL;
878159695Sandre
879159695Sandre	INP_UNLOCK(inp);
880159695Sandre	INP_INFO_WUNLOCK(&tcbinfo);
881159695Sandre
88286764Sjlemon	/*
88386764Sjlemon	 * Remember the IP options, if any.
88486764Sjlemon	 */
88586764Sjlemon#ifdef INET6
88686764Sjlemon	if (!inc->inc_isipv6)
88786764Sjlemon#endif
888135274Sandre		ipopts = ip_srcroute(m);
88986764Sjlemon
89086764Sjlemon	/*
89186764Sjlemon	 * See if we already have an entry for this connection.
89286764Sjlemon	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
89386764Sjlemon	 *
894159697Sandre	 * XXX: should the syncache be re-initialized with the contents
89586764Sjlemon	 * of the new SYN here (which may have different options?)
89686764Sjlemon	 */
897159695Sandre	sc = syncache_lookup(inc, &sch);	/* returns locked entry */
898159695Sandre	SCH_LOCK_ASSERT(sch);
89986764Sjlemon	if (sc != NULL) {
90086764Sjlemon		tcpstat.tcps_sc_dupsyn++;
90186764Sjlemon		if (ipopts) {
90286764Sjlemon			/*
90386764Sjlemon			 * If we were remembering a previous source route,
90486764Sjlemon			 * forget it and use the new one we've been given.
90586764Sjlemon			 */
90686764Sjlemon			if (sc->sc_ipopts)
90786764Sjlemon				(void) m_free(sc->sc_ipopts);
90886764Sjlemon			sc->sc_ipopts = ipopts;
90986764Sjlemon		}
91086764Sjlemon		/*
91186764Sjlemon		 * Update timestamp if present.
91286764Sjlemon		 */
91386764Sjlemon		if (sc->sc_flags & SCF_TIMESTAMP)
914159950Sandre			sc->sc_tsreflect = to->to_tsval;
91586764Sjlemon		if (syncache_respond(sc, m) == 0) {
916159695Sandre			SYNCACHE_TIMEOUT(sc, sch, 1);
917133874Srwatson			tcpstat.tcps_sndacks++;
91886764Sjlemon			tcpstat.tcps_sndtotal++;
91986764Sjlemon		}
920159695Sandre		SCH_UNLOCK(sch);
921159695Sandre		goto done;
92286764Sjlemon	}
92386764Sjlemon
924155487Sqingli	sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
92586764Sjlemon	if (sc == NULL) {
92686764Sjlemon		/*
92786764Sjlemon		 * The zone allocator couldn't provide more entries.
928133874Srwatson		 * Treat this as if the cache was full; drop the oldest
92986764Sjlemon		 * entry and insert the new one.
93086764Sjlemon		 */
931154355Sglebius		tcpstat.tcps_sc_zonefail++;
932159695Sandre		sc = TAILQ_LAST(&sch->sch_bucket, sch_head);
933159695Sandre		syncache_drop(sc, sch);
934155487Sqingli		sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
93586764Sjlemon		if (sc == NULL) {
936162277Sandre			if (tcp_syncookies) {
937162277Sandre				bzero(&scs, sizeof(scs));
938162277Sandre				sc = &scs;
939162277Sandre			} else {
940162277Sandre				SCH_UNLOCK(sch);
941162277Sandre				if (ipopts)
942162277Sandre					(void) m_free(ipopts);
943162277Sandre				goto done;
944162277Sandre			}
94586764Sjlemon		}
946162277Sandre	}
94786764Sjlemon
94886764Sjlemon	/*
94986764Sjlemon	 * Fill in the syncache values.
95086764Sjlemon	 */
95186764Sjlemon	sc->sc_ipopts = ipopts;
952159950Sandre	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
95386764Sjlemon#ifdef INET6
954159950Sandre	if (!inc->inc_isipv6)
95586764Sjlemon#endif
95686764Sjlemon	{
957159695Sandre		sc->sc_ip_tos = ip_tos;
958159695Sandre		sc->sc_ip_ttl = ip_ttl;
95986764Sjlemon	}
960162277Sandre
96186764Sjlemon	sc->sc_irs = th->th_seq;
962162277Sandre	sc->sc_iss = arc4random();
963110023Ssilby	sc->sc_flags = 0;
964132307Sdwmalone	sc->sc_flowlabel = 0;
96586764Sjlemon
966159695Sandre	/*
967159695Sandre	 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
968159695Sandre	 * win was derived from socket earlier in the function.
969159695Sandre	 */
97086764Sjlemon	win = imax(win, 0);
97186764Sjlemon	win = imin(win, TCP_MAXWIN);
97286764Sjlemon	sc->sc_wnd = win;
97386764Sjlemon
97486764Sjlemon	if (tcp_do_rfc1323) {
97586764Sjlemon		/*
97686764Sjlemon		 * A timestamp received in a SYN makes
97786764Sjlemon		 * it ok to send timestamp requests and replies.
97886764Sjlemon		 */
97986764Sjlemon		if (to->to_flags & TOF_TS) {
980159950Sandre			sc->sc_tsreflect = to->to_tsval;
98186764Sjlemon			sc->sc_flags |= SCF_TIMESTAMP;
98286764Sjlemon		}
98386764Sjlemon		if (to->to_flags & TOF_SCALE) {
98486764Sjlemon			int wscale = 0;
98586764Sjlemon
98686764Sjlemon			/* Compute proper scaling value from buffer space */
98786764Sjlemon			while (wscale < TCP_MAX_WINSHIFT &&
988159695Sandre			    (TCP_MAXWIN << wscale) < sb_hiwat)
98986764Sjlemon				wscale++;
990159950Sandre			sc->sc_requested_r_scale = wscale;
99186764Sjlemon			sc->sc_requested_s_scale = to->to_requested_s_scale;
99286764Sjlemon			sc->sc_flags |= SCF_WINSCALE;
99386764Sjlemon		}
99486764Sjlemon	}
995125680Sbms#ifdef TCP_SIGNATURE
996125680Sbms	/*
997145371Sps	 * If listening socket requested TCP digests, and received SYN
998145371Sps	 * contains the option, flag this in the syncache so that
999145371Sps	 * syncache_respond() will do the right thing with the SYN+ACK.
1000159697Sandre	 * XXX: Currently we always record the option by default and will
1001145371Sps	 * attempt to use it in syncache_respond().
1002125680Sbms	 */
1003145371Sps	if (to->to_flags & TOF_SIGNATURE)
1004150131Sandre		sc->sc_flags |= SCF_SIGNATURE;
1005125783Sbms#endif
1006133874Srwatson	if (to->to_flags & TOF_SACK)
1007130989Sps		sc->sc_flags |= SCF_SACK;
1008162277Sandre	if (to->to_flags & TOF_MSS)
1009162277Sandre		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
1010159727Sandre	if (noopt)
1011159727Sandre		sc->sc_flags |= SCF_NOOPT;
1012130989Sps
1013162277Sandre	if (tcp_syncookies) {
1014162277Sandre		syncookie_generate(sch, sc, &flowtmp);
1015162277Sandre#ifdef INET6
1016162277Sandre		if (autoflowlabel)
1017162277Sandre			sc->sc_flowlabel = flowtmp;
1018162277Sandre#endif
1019162277Sandre	} else {
1020162277Sandre#ifdef INET6
1021162277Sandre		if (autoflowlabel)
1022162277Sandre			sc->sc_flowlabel =
1023162277Sandre			    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
1024162277Sandre#endif
1025162277Sandre	}
1026162277Sandre	SCH_UNLOCK(sch);
1027162277Sandre
102886764Sjlemon	/*
1029137139Sandre	 * Do a standard 3-way handshake.
103086764Sjlemon	 */
103188180Sjlemon	if (syncache_respond(sc, m) == 0) {
1032162277Sandre		if (tcp_syncookies && tcp_syncookiesonly && sc != &scs)
1033162277Sandre			syncache_free(sc);
1034162277Sandre		else if (sc != &scs)
1035162277Sandre			syncache_insert(sc, sch);   /* locks and unlocks sch */
103688180Sjlemon		tcpstat.tcps_sndacks++;
103788180Sjlemon		tcpstat.tcps_sndtotal++;
103886764Sjlemon	} else {
103986764Sjlemon		syncache_free(sc);
104088180Sjlemon		tcpstat.tcps_sc_dropped++;
104186764Sjlemon	}
1042159695Sandre
1043159695Sandredone:
1044159695Sandre	*lsop = NULL;
104586764Sjlemon	return (1);
104686764Sjlemon}
104786764Sjlemon
104886764Sjlemonstatic int
1049159697Sandresyncache_respond(struct syncache *sc, struct mbuf *m)
105086764Sjlemon{
1051159950Sandre	struct ip *ip = NULL;
1052159950Sandre	struct tcphdr *th;
105386764Sjlemon	int optlen, error;
105486764Sjlemon	u_int16_t tlen, hlen, mssopt;
1055159950Sandre	u_int8_t *optp;
105686764Sjlemon#ifdef INET6
105786764Sjlemon	struct ip6_hdr *ip6 = NULL;
105886764Sjlemon#endif
1059159695Sandre#ifdef MAC
1060159695Sandre	struct inpcb *inp = NULL;
1061159695Sandre#endif
106286764Sjlemon
1063122922Sandre	hlen =
106486764Sjlemon#ifdef INET6
1065133874Srwatson	       (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
106686764Sjlemon#endif
1067122922Sandre		sizeof(struct ip);
106886764Sjlemon
1069159697Sandre	/* Determine MSS we advertize to other end of connection. */
1070122922Sandre	mssopt = tcp_mssopt(&sc->sc_inc);
1071159955Sandre	if (sc->sc_peer_mss)
1072159955Sandre		mssopt = max( min(sc->sc_peer_mss, mssopt), tcp_minmss);
1073122922Sandre
107486764Sjlemon	/* Compute the size of the TCP options. */
107586764Sjlemon	if (sc->sc_flags & SCF_NOOPT) {
107686764Sjlemon		optlen = 0;
107786764Sjlemon	} else {
107886764Sjlemon		optlen = TCPOLEN_MAXSEG +
107986764Sjlemon		    ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
1080137139Sandre		    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
1081125680Sbms#ifdef TCP_SIGNATURE
1082145372Sps		if (sc->sc_flags & SCF_SIGNATURE)
1083145372Sps			optlen += TCPOLEN_SIGNATURE;
1084125783Sbms#endif
1085145372Sps		if (sc->sc_flags & SCF_SACK)
1086145372Sps			optlen += TCPOLEN_SACK_PERMITTED;
1087145372Sps		optlen = roundup2(optlen, 4);
108886764Sjlemon	}
108986764Sjlemon	tlen = hlen + sizeof(struct tcphdr) + optlen;
109086764Sjlemon
109186764Sjlemon	/*
1092159697Sandre	 * XXX: Assume that the entire packet will fit in a header mbuf.
109386764Sjlemon	 */
109486764Sjlemon	KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
109586764Sjlemon
1096159695Sandre	/* Create the IP+TCP header from scratch. */
109786764Sjlemon	if (m)
109886764Sjlemon		m_freem(m);
109986764Sjlemon
1100151967Sandre	m = m_gethdr(M_DONTWAIT, MT_DATA);
110186764Sjlemon	if (m == NULL)
110286764Sjlemon		return (ENOBUFS);
110386764Sjlemon	m->m_data += max_linkhdr;
110486764Sjlemon	m->m_len = tlen;
110586764Sjlemon	m->m_pkthdr.len = tlen;
110686764Sjlemon	m->m_pkthdr.rcvif = NULL;
1107159695Sandre
1108159695Sandre#ifdef MAC
1109159695Sandre	/*
1110159695Sandre	 * For MAC look up the inpcb to get access to the label information.
1111159695Sandre	 * We don't store the inpcb pointer in struct syncache to make locking
1112159695Sandre	 * less complicated and to save locking operations.  However for MAC
1113159695Sandre	 * this gives a slight overhead as we have to do a full pcblookup here.
1114159695Sandre	 */
1115159695Sandre	INP_INFO_RLOCK(&tcbinfo);
1116159695Sandre	if (inp == NULL) {
1117159695Sandre#ifdef INET6 /* && MAC */
1118159695Sandre		if (sc->sc_inc.inc_isipv6)
1119159695Sandre			inp = in6_pcblookup_hash(&tcbinfo,
1120159945Sandre				&sc->sc_inc.inc6_faddr, sc->sc_inc.inc_fport,
1121159695Sandre				&sc->sc_inc.inc6_laddr, sc->sc_inc.inc_lport,
1122159695Sandre				1, NULL);
1123159695Sandre		else
1124159695Sandre#endif /* INET6 */
1125159695Sandre			inp = in_pcblookup_hash(&tcbinfo,
1126159945Sandre				sc->sc_inc.inc_faddr, sc->sc_inc.inc_fport,
1127159695Sandre				sc->sc_inc.inc_laddr, sc->sc_inc.inc_lport,
1128159695Sandre				1, NULL);
1129159695Sandre		if (inp == NULL) {
1130159695Sandre			m_freem(m);
1131159695Sandre			INP_INFO_RUNLOCK(&tcbinfo);
1132159695Sandre			return (ESHUTDOWN);
1133159695Sandre		}
1134159695Sandre	}
1135122496Ssam	INP_LOCK(inp);
1136159695Sandre	if (!inp->inp_socket->so_options & SO_ACCEPTCONN) {
1137159695Sandre		m_freem(m);
1138159695Sandre		INP_UNLOCK(inp);
1139159695Sandre		INP_INFO_RUNLOCK(&tcbinfo);
1140159695Sandre		return (ESHUTDOWN);
1141159695Sandre	}
1142128905Srwatson	mac_create_mbuf_from_inpcb(inp, m);
1143159695Sandre	INP_UNLOCK(inp);
1144159695Sandre	INP_INFO_RUNLOCK(&tcbinfo);
1145159695Sandre#endif /* MAC */
114686764Sjlemon
114786764Sjlemon#ifdef INET6
114886764Sjlemon	if (sc->sc_inc.inc_isipv6) {
114986764Sjlemon		ip6 = mtod(m, struct ip6_hdr *);
115086764Sjlemon		ip6->ip6_vfc = IPV6_VERSION;
115186764Sjlemon		ip6->ip6_nxt = IPPROTO_TCP;
115286764Sjlemon		ip6->ip6_src = sc->sc_inc.inc6_laddr;
115386764Sjlemon		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
115486764Sjlemon		ip6->ip6_plen = htons(tlen - hlen);
115586764Sjlemon		/* ip6_hlim is set after checksum */
1156132307Sdwmalone		ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
1157132307Sdwmalone		ip6->ip6_flow |= sc->sc_flowlabel;
115886764Sjlemon
115986764Sjlemon		th = (struct tcphdr *)(ip6 + 1);
116086764Sjlemon	} else
116186764Sjlemon#endif
116286764Sjlemon	{
116386764Sjlemon		ip = mtod(m, struct ip *);
116486764Sjlemon		ip->ip_v = IPVERSION;
116586764Sjlemon		ip->ip_hl = sizeof(struct ip) >> 2;
116686764Sjlemon		ip->ip_len = tlen;
116786764Sjlemon		ip->ip_id = 0;
116886764Sjlemon		ip->ip_off = 0;
116986764Sjlemon		ip->ip_sum = 0;
117086764Sjlemon		ip->ip_p = IPPROTO_TCP;
117186764Sjlemon		ip->ip_src = sc->sc_inc.inc_laddr;
117286764Sjlemon		ip->ip_dst = sc->sc_inc.inc_faddr;
1173159695Sandre		ip->ip_ttl = sc->sc_ip_ttl;
1174159695Sandre		ip->ip_tos = sc->sc_ip_tos;
117586764Sjlemon
117698204Ssilby		/*
1177108125Shsu		 * See if we should do MTU discovery.  Route lookups are
1178108125Shsu		 * expensive, so we will only unset the DF bit if:
1179101405Ssilby		 *
1180101405Ssilby		 *	1) path_mtu_discovery is disabled
1181101405Ssilby		 *	2) the SCF_UNREACH flag has been set
118298204Ssilby		 */
1183108125Shsu		if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
118498204Ssilby		       ip->ip_off |= IP_DF;
118598204Ssilby
118686764Sjlemon		th = (struct tcphdr *)(ip + 1);
118786764Sjlemon	}
118886764Sjlemon	th->th_sport = sc->sc_inc.inc_lport;
118986764Sjlemon	th->th_dport = sc->sc_inc.inc_fport;
119086764Sjlemon
119186764Sjlemon	th->th_seq = htonl(sc->sc_iss);
119286764Sjlemon	th->th_ack = htonl(sc->sc_irs + 1);
119386764Sjlemon	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
119486764Sjlemon	th->th_x2 = 0;
119586764Sjlemon	th->th_flags = TH_SYN|TH_ACK;
119686764Sjlemon	th->th_win = htons(sc->sc_wnd);
119786764Sjlemon	th->th_urp = 0;
119886764Sjlemon
119986764Sjlemon	/* Tack on the TCP options. */
1200108125Shsu	if (optlen != 0) {
1201108125Shsu		optp = (u_int8_t *)(th + 1);
1202108125Shsu		*optp++ = TCPOPT_MAXSEG;
1203108125Shsu		*optp++ = TCPOLEN_MAXSEG;
1204108125Shsu		*optp++ = (mssopt >> 8) & 0xff;
1205108125Shsu		*optp++ = mssopt & 0xff;
120686764Sjlemon
1207108125Shsu		if (sc->sc_flags & SCF_WINSCALE) {
1208108125Shsu			*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1209108125Shsu			    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1210159950Sandre			    sc->sc_requested_r_scale);
1211108125Shsu			optp += 4;
1212108125Shsu		}
121386764Sjlemon
1214108125Shsu		if (sc->sc_flags & SCF_TIMESTAMP) {
1215108125Shsu			u_int32_t *lp = (u_int32_t *)(optp);
121686764Sjlemon
1217108125Shsu			/* Form timestamp option per appendix A of RFC 1323. */
1218108125Shsu			*lp++ = htonl(TCPOPT_TSTAMP_HDR);
1219162277Sandre			if (sc->sc_ts)
1220162277Sandre				*lp++ = htonl(sc->sc_ts);
1221162277Sandre			else
1222162277Sandre				*lp++ = htonl(ticks);
1223159950Sandre			*lp   = htonl(sc->sc_tsreflect);
1224108125Shsu			optp += TCPOLEN_TSTAMP_APPA;
1225108125Shsu		}
122686764Sjlemon
1227125680Sbms#ifdef TCP_SIGNATURE
1228125680Sbms		/*
1229125680Sbms		 * Handle TCP-MD5 passive opener response.
1230125680Sbms		 */
1231125680Sbms		if (sc->sc_flags & SCF_SIGNATURE) {
1232125680Sbms			u_int8_t *bp = optp;
1233125680Sbms			int i;
1234125680Sbms
1235125680Sbms			*bp++ = TCPOPT_SIGNATURE;
1236125680Sbms			*bp++ = TCPOLEN_SIGNATURE;
1237125680Sbms			for (i = 0; i < TCP_SIGLEN; i++)
1238125680Sbms				*bp++ = 0;
1239125783Sbms			tcp_signature_compute(m, sizeof(struct ip), 0, optlen,
1240125680Sbms			    optp + 2, IPSEC_DIR_OUTBOUND);
1241145372Sps			optp += TCPOLEN_SIGNATURE;
1242125680Sbms		}
1243125680Sbms#endif /* TCP_SIGNATURE */
1244130989Sps
1245145372Sps		if (sc->sc_flags & SCF_SACK) {
1246145372Sps			*optp++ = TCPOPT_SACK_PERMITTED;
1247145372Sps			*optp++ = TCPOLEN_SACK_PERMITTED;
1248145372Sps		}
1249145372Sps
1250145372Sps		{
1251145372Sps			/* Pad TCP options to a 4 byte boundary */
1252145372Sps			int padlen = optlen - (optp - (u_int8_t *)(th + 1));
1253145372Sps			while (padlen-- > 0)
1254145372Sps				*optp++ = TCPOPT_EOL;
1255145372Sps		}
125686764Sjlemon	}
125786764Sjlemon
125886764Sjlemon#ifdef INET6
125986764Sjlemon	if (sc->sc_inc.inc_isipv6) {
126086764Sjlemon		th->th_sum = 0;
126186764Sjlemon		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1262122922Sandre		ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
1263159695Sandre		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
126486764Sjlemon	} else
126586764Sjlemon#endif
126686764Sjlemon	{
1267133874Srwatson		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
126886764Sjlemon		    htons(tlen - hlen + IPPROTO_TCP));
126986764Sjlemon		m->m_pkthdr.csum_flags = CSUM_TCP;
127086764Sjlemon		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1271159695Sandre		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
127286764Sjlemon	}
127386764Sjlemon	return (error);
127486764Sjlemon}
127588180Sjlemon
127688180Sjlemon/*
1277162277Sandre * The purpose of SYN cookies is to avoid keeping track of all SYN's we
1278162277Sandre * receive and to be able to handle SYN floods from bogus source addresses
1279162277Sandre * (where we will never receive any reply).  SYN floods try to exhaust all
1280162277Sandre * our memory and available slots in the SYN cache table to cause a denial
1281162277Sandre * of service to legitimate users of the local host.
128288180Sjlemon *
1283162277Sandre * The idea of SYN cookies is to encode and include all necessary information
1284162277Sandre * about the connection setup state within the SYN-ACK we send back and thus
1285162277Sandre * to get along without keeping any local state until the ACK to the SYN-ACK
1286162277Sandre * arrives (if ever).  Everything we need to know should be available from
1287162277Sandre * the information we encoded in the SYN-ACK.
1288162277Sandre *
1289162277Sandre * More information about the theory behind SYN cookies and its first
1290162277Sandre * discussion and specification can be found at:
1291162277Sandre *  http://cr.yp.to/syncookies.html    (overview)
1292162277Sandre *  http://cr.yp.to/syncookies/archive (gory details)
1293162277Sandre *
 * This implementation extends the original idea and first implementation
1295162277Sandre * of FreeBSD by using not only the initial sequence number field to store
1296162277Sandre * information but also the timestamp field if present.  This way we can
1297162277Sandre * keep track of the entire state we need to know to recreate the session in
1298162277Sandre * its original form.  Almost all TCP speakers implement RFC1323 timestamps
1299162277Sandre * these days.  For those that do not we still have to live with the known
1300162277Sandre * shortcomings of the ISN only SYN cookies.
1301162277Sandre *
1302162277Sandre * Cookie layers:
1303162277Sandre *
1304162277Sandre * Initial sequence number we send:
1305162277Sandre * 31|................................|0
1306162277Sandre *    DDDDDDDDDDDDDDDDDDDDDDDDDMMMRRRP
1307162277Sandre *    D = MD5 Digest (first dword)
1308162277Sandre *    M = MSS index
1309162277Sandre *    R = Rotation of secret
1310162277Sandre *    P = Odd or Even secret
1311162277Sandre *
 * The MD5 Digest is computed over the following parameters:
1313162277Sandre *  a) randomly rotated secret
1314162277Sandre *  b) struct in_conninfo containing the remote/local ip/port (IPv4&IPv6)
1315162277Sandre *  c) the received initial sequence number from remote host
1316162277Sandre *  d) the rotation offset and odd/even bit
1317162277Sandre *
1318162277Sandre * Timestamp we send:
1319162277Sandre * 31|................................|0
 *    DDDDDDDDDDDDDDDDDDDDDDRRRRSSSSA5
1321162277Sandre *    D = MD5 Digest (third dword) (only as filler)
1322162277Sandre *    S = Requested send window scale
1323162277Sandre *    R = Requested receive window scale
1324162277Sandre *    A = SACK allowed
1325162277Sandre *    5 = TCP-MD5 enabled (not implemented yet)
 *    XORed with MD5 Digest (fourth dword)
1327162277Sandre *
1328162277Sandre * The timestamp isn't cryptographically secure and doesn't need to be.
1329162277Sandre * The double use of the MD5 digest dwords ties it to a specific remote/
1330162277Sandre * local host/port, remote initial sequence number and our local time
1331162277Sandre * limited secret.  A received timestamp is reverted (XORed) and then
1332162277Sandre * the contained MD5 dword is compared to the computed one to ensure the
1333162277Sandre * timestamp belongs to the SYN-ACK we sent.  The other parameters may
1334162277Sandre * have been tampered with but this isn't different from supplying bogus
1335162277Sandre * values in the SYN in the first place.
1336162277Sandre *
1337162277Sandre * Some problems with SYN cookies remain however:
133888180Sjlemon * Consider the problem of a recreated (and retransmitted) cookie.  If the
1339133874Srwatson * original SYN was accepted, the connection is established.  The second
1340133874Srwatson * SYN is inflight, and if it arrives with an ISN that falls within the
1341133874Srwatson * receive window, the connection is killed.
134288180Sjlemon *
1343162277Sandre * Notes:
1344162277Sandre * A heuristic to determine when to accept syn cookies is not necessary.
1345162277Sandre * An ACK flood would cause the syncookie verification to be attempted,
1346162277Sandre * but a SYN flood causes syncookies to be generated.  Both are of equal
1347162277Sandre * cost, so there's no point in trying to optimize the ACK flood case.
1348162277Sandre * Also, if you don't process certain ACKs for some reason, then all someone
1349162277Sandre * would have to do is launch a SYN and ACK flood at the same time, which
1350162277Sandre * would stop cookie verification and defeat the entire purpose of syncookies.
135188180Sjlemon */
/* MSS values selectable through the 3-bit MSS index carried in a cookie. */
static int tcp_sc_msstab[] = {
	0, 256, 468, 536, 996, 1452, 1460, 8960
};
135388180Sjlemon
1354159695Sandrestatic void
1355162277Sandresyncookie_generate(struct syncache_head *sch, struct syncache *sc,
1356162277Sandre    u_int32_t *flowlabel)
1357162277Sandre{
1358162277Sandre	MD5_CTX ctx;
1359162277Sandre	u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
1360162277Sandre	u_int32_t data;
1361162277Sandre	u_int32_t *secbits;
1362162277Sandre	u_int off, pmss, mss;
1363162277Sandre	int i;
1364159695Sandre
1365162277Sandre	SCH_LOCK_ASSERT(sch);
1366162277Sandre
1367162277Sandre	/* Which of the two secrets to use. */
1368162277Sandre	secbits = sch->sch_oddeven ?
1369162277Sandre			sch->sch_secbits_odd : sch->sch_secbits_even;
1370162277Sandre
1371162277Sandre	/* Reseed secret if too old. */
1372162277Sandre	if (sch->sch_reseed < time_uptime) {
1373162277Sandre		sch->sch_oddeven = sch->sch_oddeven ? 0 : 1;	/* toggle */
1374162277Sandre		secbits = sch->sch_oddeven ?
1375162277Sandre				sch->sch_secbits_odd : sch->sch_secbits_even;
1376162277Sandre		for (i = 0; i < SYNCOOKIE_SECRET_SIZE; i++)
1377162277Sandre			secbits[i] = arc4random();
1378162277Sandre		sch->sch_reseed = time_uptime + SYNCOOKIE_LIFETIME;
1379159695Sandre	}
1380159695Sandre
1381162277Sandre	/* Secret rotation offset. */
1382162277Sandre	off = sc->sc_iss & 0x7;			/* iss was randomized before */
138388180Sjlemon
1384162277Sandre	/* Maximum segment size calculation. */
1385162277Sandre	pmss = max( min(sc->sc_peer_mss, tcp_mssopt(&sc->sc_inc)), tcp_minmss);
1386162277Sandre	for (mss = sizeof(tcp_sc_msstab) / sizeof(int) - 1; mss > 0; mss--)
1387162277Sandre		if (tcp_sc_msstab[mss] <= pmss)
138888180Sjlemon			break;
1389162277Sandre
1390162277Sandre	/* Fold parameters and MD5 digest into the ISN we will send. */
1391162277Sandre	data = sch->sch_oddeven;/* odd or even secret, 1 bit */
1392162277Sandre	data |= off << 1;	/* secret offset, derived from iss, 3 bits */
1393162277Sandre	data |= mss << 4;	/* mss, 3 bits */
1394162277Sandre
1395162277Sandre	MD5Init(&ctx);
1396162277Sandre	MD5Update(&ctx, ((u_int8_t *)secbits) + off,
1397162277Sandre	    SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
1398162277Sandre	MD5Update(&ctx, secbits, off);
1399162277Sandre	MD5Update(&ctx, &sc->sc_inc, sizeof(sc->sc_inc));
1400162277Sandre	MD5Update(&ctx, &sc->sc_irs, sizeof(sc->sc_irs));
1401162277Sandre	MD5Update(&ctx, &data, sizeof(data));
1402162277Sandre	MD5Final((u_int8_t *)&md5_buffer, &ctx);
1403162277Sandre
1404162277Sandre	data |= (md5_buffer[0] << 7);
1405162277Sandre	sc->sc_iss = data;
1406162277Sandre
1407162306Sache#ifdef INET6
1408162277Sandre	*flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
1409162306Sache#endif
1410162277Sandre
1411162277Sandre	/* Additional parameters are stored in the timestamp if present. */
1412162277Sandre	if (sc->sc_flags & SCF_TIMESTAMP) {
1413162277Sandre		data =  ((sc->sc_flags & SCF_SIGNATURE) ? 1 : 0); /* TCP-MD5, 1 bit */
1414162277Sandre		data |= ((sc->sc_flags & SCF_SACK) ? 1 : 0) << 1; /* SACK, 1 bit */
1415162277Sandre		data |= sc->sc_requested_s_scale << 2;  /* SWIN scale, 4 bits */
1416162277Sandre		data |= sc->sc_requested_r_scale << 6;  /* RWIN scale, 4 bits */
1417162277Sandre		data |= md5_buffer[2] << 10;		/* more digest bits */
1418162277Sandre		data ^= md5_buffer[3];
1419162277Sandre		sc->sc_ts = data;
1420162277Sandre		sc->sc_tsoff = data - ticks;		/* after XOR */
142188180Sjlemon	} else
1422162277Sandre		sc->sc_ts = 0;
1423162277Sandre
1424162277Sandre	return;
142588180Sjlemon}
142688180Sjlemon
142788180Sjlemonstatic struct syncache *
1428162277Sandresyncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch,
1429162277Sandre    struct syncache *sc, struct tcpopt *to, struct tcphdr *th,
1430162277Sandre    struct socket *so)
143188180Sjlemon{
1432162277Sandre	MD5_CTX ctx;
1433162277Sandre	u_int32_t md5_buffer[MD5_DIGEST_LENGTH / sizeof(u_int32_t)];
1434162277Sandre	u_int32_t data = 0;
1435162277Sandre	u_int32_t *secbits;
1436162277Sandre	tcp_seq ack, seq;
1437162277Sandre	int off, mss, wnd, flags;
143888180Sjlemon
1439162277Sandre	SCH_LOCK_ASSERT(sch);
1440162277Sandre
1441162277Sandre	/*
1442162277Sandre	 * Pull information out of SYN-ACK/ACK and
1443162277Sandre	 * revert sequence number advances.
1444162277Sandre	 */
1445162277Sandre	ack = th->th_ack - 1;
1446162277Sandre	seq = th->th_seq - 1;
1447162277Sandre	off = (ack >> 1) & 0x7;
1448162277Sandre	mss = (ack >> 4) & 0x7;
1449162277Sandre	flags = ack & 0x7f;
1450162277Sandre
1451162277Sandre	/* Which of the two secrets to use. */
1452162277Sandre	secbits = (flags & 0x1) ? sch->sch_secbits_odd : sch->sch_secbits_even;
1453162277Sandre
1454162277Sandre	/*
1455162277Sandre	 * The secret wasn't updated for the lifetime of a syncookie,
1456162277Sandre	 * so this SYN-ACK/ACK is either too old (replay) or totally bogus.
1457162277Sandre	 */
1458162277Sandre	if (sch->sch_reseed < time_uptime) {
145988180Sjlemon		return (NULL);
1460159695Sandre	}
1461162277Sandre
1462162277Sandre	/* Recompute the digest so we can compare it. */
1463162277Sandre	MD5Init(&ctx);
1464162277Sandre	MD5Update(&ctx, ((u_int8_t *)secbits) + off,
1465162277Sandre	    SYNCOOKIE_SECRET_SIZE * sizeof(*secbits) - off);
1466162277Sandre	MD5Update(&ctx, secbits, off);
1467162277Sandre	MD5Update(&ctx, inc, sizeof(*inc));
1468162277Sandre	MD5Update(&ctx, &seq, sizeof(seq));
1469162277Sandre	MD5Update(&ctx, &flags, sizeof(flags));
1470162277Sandre	MD5Final((u_int8_t *)&md5_buffer, &ctx);
1471162277Sandre
1472162277Sandre	/* Does the digest part of or ACK'ed ISS match? */
1473162277Sandre	if ((ack & (~0x7f)) != (md5_buffer[0] << 7))
147488180Sjlemon		return (NULL);
147588180Sjlemon
1476162277Sandre	/* Does the digest part of our reflected timestamp match? */
1477162277Sandre	if (to->to_flags & TOF_TS) {
1478162277Sandre		data = md5_buffer[3] ^ to->to_tsecr;
1479162277Sandre		if ((data & (~0x3ff)) != (md5_buffer[2] << 10))
1480162277Sandre			return (NULL);
1481162277Sandre	}
1482162277Sandre
1483162277Sandre	/* Fill in the syncache values. */
1484162277Sandre	bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo));
148588180Sjlemon	sc->sc_ipopts = NULL;
1486162277Sandre
1487162277Sandre	sc->sc_irs = seq;
1488162277Sandre	sc->sc_iss = ack;
1489162277Sandre
149088180Sjlemon#ifdef INET6
149188180Sjlemon	if (inc->inc_isipv6) {
1492159695Sandre		if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL)
1493132307Sdwmalone			sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
149488180Sjlemon	} else
149588180Sjlemon#endif
149688180Sjlemon	{
1497159695Sandre		sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
1498159695Sandre		sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
149988180Sjlemon	}
1500162277Sandre
1501162277Sandre	/* Additional parameters that were encoded in the timestamp. */
1502162277Sandre	if (data) {
1503162277Sandre		sc->sc_flags |= SCF_TIMESTAMP;
1504162277Sandre		sc->sc_tsreflect = to->to_tsval;
1505162277Sandre		sc->sc_tsoff = to->to_tsecr - ticks;
1506162277Sandre		sc->sc_flags |= (data & 0x1) ? SCF_SIGNATURE : 0;
1507162277Sandre		sc->sc_flags |= ((data >> 1) & 0x1) ? SCF_SACK : 0;
1508162277Sandre		sc->sc_requested_s_scale = min((data >> 2) & 0xf,
1509162277Sandre		    TCP_MAX_WINSHIFT);
1510162277Sandre		sc->sc_requested_r_scale = min((data >> 6) & 0xf,
1511162277Sandre		    TCP_MAX_WINSHIFT);
1512162277Sandre		if (sc->sc_requested_s_scale || sc->sc_requested_r_scale)
1513162277Sandre			sc->sc_flags |= SCF_WINSCALE;
1514162277Sandre	} else
1515162277Sandre		sc->sc_flags |= SCF_NOOPT;
1516162277Sandre
151788180Sjlemon	wnd = sbspace(&so->so_rcv);
151888180Sjlemon	wnd = imax(wnd, 0);
151988180Sjlemon	wnd = imin(wnd, TCP_MAXWIN);
152088180Sjlemon	sc->sc_wnd = wnd;
1521162277Sandre
1522159695Sandre	sc->sc_rxmits = 0;
1523162277Sandre	sc->sc_peer_mss = tcp_sc_msstab[mss];
1524162277Sandre
152588180Sjlemon	return (sc);
152688180Sjlemon}
1527