tcp_syncache.c revision 159945
/*-
 * Copyright (c) 2001 McAfee, Inc.
 * Copyright (c) 2006 Andre Oppermann, Internet Business Solutions AG
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jonathan Lemon
 * and McAfee Research, the Security Research Division of McAfee, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/netinet/tcp_syncache.c 159945 2006-06-26 09:43:55Z andre $
 */

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_sack.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mac.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/random.h>
#include <sys/rwlock.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#ifdef TCPDEBUG
#include <netinet/tcpip.h>
#endif
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /*IPSEC*/

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif /*FAST_IPSEC*/

#include <machine/in_cksum.h>
#include <vm/uma.h>

static int tcp_syncookies = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
    &tcp_syncookies, 0,
    "Use TCP SYN cookies if the syncache overflows");

struct syncache {
	TAILQ_ENTRY(syncache)	sc_hash;
	struct		in_conninfo sc_inc;	/* addresses */
	u_long		sc_rxttime;		/* retransmit time */
	u_int16_t	sc_rxmits;		/* retransmit counter */

	u_int32_t	sc_tsrecent;
	u_int32_t	sc_flowlabel;		/* IPv6 flowlabel */
	tcp_seq		sc_irs;			/* seq from peer */
	tcp_seq		sc_iss;			/* our ISS */
	struct		mbuf *sc_ipopts;	/* source route */

	u_int16_t	sc_peer_mss;		/* peer's MSS */
	u_int16_t	sc_wnd;			/* advertised window */
	u_int8_t	sc_ip_ttl;		/* IPv4 TTL */
	u_int8_t	sc_ip_tos;		/* IPv4 TOS */
	u_int8_t	sc_requested_s_scale:4,
			sc_request_r_scale:4;
	u_int8_t	sc_flags;
#define SCF_NOOPT	0x01			/* no TCP options */
#define SCF_WINSCALE	0x02			/* negotiated window scaling */
#define SCF_TIMESTAMP	0x04			/* negotiated timestamps */
#define SCF_UNREACH	0x10			/* icmp unreachable received */
#define SCF_SIGNATURE	0x20			/* send MD5 digests */
#define SCF_SACK	0x80			/* send SACK option */
};

struct syncache_head {
	struct mtx	sch_mtx;
	TAILQ_HEAD(sch_head, syncache)	sch_bucket;
	struct callout	sch_timer;
	int		sch_nextc;
	u_int		sch_length;
};
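
/*
 * Each bucket row has its own mutex and callout, so lookups, inserts
 * and timer processing contend per row rather than on a single global
 * syncache lock.  sch_nextc caches the earliest sc_rxttime in the row
 * and is the deadline the row's callout is armed for.
 */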

static void	 syncache_drop(struct syncache *, struct syncache_head *);
static void	 syncache_free(struct syncache *);
static void	 syncache_insert(struct syncache *, struct syncache_head *);
struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
static int	 syncache_respond(struct syncache *, struct mbuf *);
static struct	 socket *syncache_socket(struct syncache *, struct socket *,
		    struct mbuf *m);
static void	 syncache_timer(void *);
static void	 syncookie_init(void);
static u_int32_t syncookie_generate(struct syncache *, u_int32_t *);
static struct syncache
		 *syncookie_lookup(struct in_conninfo *, struct tcphdr *,
		    struct socket *);

/*
 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
 * Three retransmits correspond to a timeout of (1 + 2 + 4 + 8 == 15)
 * seconds; by then the odds are that the user has given up attempting
 * to connect.
 */
#define SYNCACHE_MAXREXMTS		3

/* Arbitrary values */
#define TCP_SYNCACHE_HASHSIZE		512
#define TCP_SYNCACHE_BUCKETLIMIT	30

struct tcp_syncache {
	struct	syncache_head *hashbase;
	uma_zone_t zone;
	u_int	hashsize;
	u_int	hashmask;
	u_int	bucket_limit;
	u_int	cache_count;		/* XXX: unprotected */
	u_int	cache_limit;
	u_int	rexmt_limit;
	u_int	hash_secret;
};
static struct tcp_syncache tcp_syncache;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
     &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
     &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
     &tcp_syncache.cache_count, 0, "Current number of entries in syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
     &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
     &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");

static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");

#define SYNCACHE_HASH(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc_faddr.s_addr ^					\
	  ((inc)->inc_faddr.s_addr >> 16) ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)

#define SYNCACHE_HASH6(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc6_faddr.s6_addr32[0] ^				\
	  (inc)->inc6_faddr.s6_addr32[3] ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & mask)
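
/*
 * The bucket row is selected by mixing the randomly seeded hash_secret
 * with the foreign address and both ports, then masking with hashmask.
 * The mask only distributes entries evenly because syncache_init()
 * forces hashsize to a power of 2: with the default hashsize of 512,
 * hashmask is 0x1ff and the hash keeps the low 9 bits.  The local
 * address is not mixed in, so all listeners on the same port share
 * bucket rows.
 */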

#define ENDPTS_EQ(a, b) (						\
	(a)->ie_fport == (b)->ie_fport &&				\
	(a)->ie_lport == (b)->ie_lport &&				\
	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
)

#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)

#define SYNCACHE_TIMEOUT(sc, sch, co) do {				\
	(sc)->sc_rxmits++;						\
	(sc)->sc_rxttime = ticks +					\
		TCPTV_RTOBASE * tcp_backoff[(sc)->sc_rxmits - 1];	\
	if ((sch)->sch_nextc > (sc)->sc_rxttime)			\
		(sch)->sch_nextc = (sc)->sc_rxttime;			\
	if (!TAILQ_EMPTY(&(sch)->sch_bucket) && (co))			\
		callout_reset(&(sch)->sch_timer,			\
			(sch)->sch_nextc - ticks,			\
			syncache_timer, (void *)(sch));			\
} while (0)
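
/*
 * Retransmit scheduling in a nutshell: each SYNCACHE_TIMEOUT charges
 * one (re)transmission to the entry and pushes its next deadline out
 * by TCPTV_RTOBASE * tcp_backoff[sc_rxmits - 1] ticks, the same
 * exponential backoff the TCP retransmit timer uses.  sch_nextc
 * tracks the earliest deadline in the bucket row; a nonzero co arms
 * the row's callout to that deadline immediately, while
 * syncache_timer() passes 0 and rearms the callout itself once the
 * whole row has been scanned.
 */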

#define	SCH_LOCK(sch)		mtx_lock(&(sch)->sch_mtx)
#define	SCH_UNLOCK(sch)		mtx_unlock(&(sch)->sch_mtx)
#define	SCH_LOCK_ASSERT(sch)	mtx_assert(&(sch)->sch_mtx, MA_OWNED)

/*
 * Requires that the syncache entry has already been removed from the
 * bucket list.
 */
static void
syncache_free(struct syncache *sc)
{
	if (sc->sc_ipopts)
		(void) m_free(sc->sc_ipopts);

	uma_zfree(tcp_syncache.zone, sc);
}

void
syncache_init(void)
{
	int i;

	tcp_syncache.cache_count = 0;
	tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
	tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
	tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
	tcp_syncache.hash_secret = arc4random();

	TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
	    &tcp_syncache.hashsize);
	TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
	    &tcp_syncache.bucket_limit);
	if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
		printf("WARNING: syncache hash size is not a power of 2.\n");
		tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
	}
	tcp_syncache.hashmask = tcp_syncache.hashsize - 1;

	/* Set limits. */
	tcp_syncache.cache_limit =
	    tcp_syncache.hashsize * tcp_syncache.bucket_limit;
	TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
	    &tcp_syncache.cache_limit);

	/* Allocate the hash table. */
	MALLOC(tcp_syncache.hashbase, struct syncache_head *,
	    tcp_syncache.hashsize * sizeof(struct syncache_head),
	    M_SYNCACHE, M_WAITOK | M_ZERO);

	/* Initialize the hash buckets. */
	for (i = 0; i < tcp_syncache.hashsize; i++) {
		TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
		mtx_init(&tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head",
			 NULL, MTX_DEF);
		callout_init_mtx(&tcp_syncache.hashbase[i].sch_timer,
			 &tcp_syncache.hashbase[i].sch_mtx, 0);
		tcp_syncache.hashbase[i].sch_length = 0;
	}

	syncookie_init();

	/* Create the syncache entry zone. */
	tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
}

/*
 * Inserts a syncache entry into the specified bucket row.
 * Locks and unlocks the syncache_head autonomously.
 */
static void
syncache_insert(struct syncache *sc, struct syncache_head *sch)
{
	struct syncache *sc2;

	SCH_LOCK(sch);

	/*
	 * Make sure that we don't overflow the per-bucket limit.
	 * If the bucket is full, toss the oldest element.
	 */
	if (sch->sch_length >= tcp_syncache.bucket_limit) {
		KASSERT(!TAILQ_EMPTY(&sch->sch_bucket),
			("sch->sch_length incorrect"));
		sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head);
		syncache_drop(sc2, sch);
		tcpstat.tcps_sc_bucketoverflow++;
	}

	/* Put it into the bucket. */
	TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash);
	sch->sch_length++;

	/* Reinitialize the bucket row's timer. */
	SYNCACHE_TIMEOUT(sc, sch, 1);

	SCH_UNLOCK(sch);

	tcp_syncache.cache_count++;
	tcpstat.tcps_sc_added++;
}

/*
 * Remove and free entry from syncache bucket row.
 * Expects locked syncache head.
 */
static void
syncache_drop(struct syncache *sc, struct syncache_head *sch)
{

	SCH_LOCK_ASSERT(sch);

	TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
	sch->sch_length--;

	syncache_free(sc);
	tcp_syncache.cache_count--;
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire it.
 * One separate timer for each bucket row.
 */
static void
syncache_timer(void *xsch)
{
	struct syncache_head *sch = (struct syncache_head *)xsch;
	struct syncache *sc, *nsc;
	int tick = ticks;

	/* NB: syncache_head has already been locked by the callout. */
	SCH_LOCK_ASSERT(sch);

	TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) {
		/*
		 * We do not check if the listen socket still exists
		 * and accept the case where the listen socket may be
		 * gone by the time we resend the SYN/ACK.  We do
		 * not expect this to happen often.  If it does,
		 * then the RST will be sent by the time the remote
		 * host does the SYN/ACK->ACK.
		 */
		if (sc->sc_rxttime >= tick) {
			if (sc->sc_rxttime < sch->sch_nextc)
				sch->sch_nextc = sc->sc_rxttime;
			continue;
		}

		if (sc->sc_rxmits > tcp_syncache.rexmt_limit) {
			syncache_drop(sc, sch);
			tcpstat.tcps_sc_stale++;
			continue;
		}

		(void) syncache_respond(sc, NULL);
		tcpstat.tcps_sc_retransmitted++;
		SYNCACHE_TIMEOUT(sc, sch, 0);
	}
	if (!TAILQ_EMPTY(&sch->sch_bucket))
		callout_reset(&sch->sch_timer, sch->sch_nextc - tick,
			syncache_timer, (void *)sch);
}

/*
 * Find an entry in the syncache.
 * Always returns with a locked syncache_head, plus a matching entry
 * or NULL.
 */
struct syncache *
syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp)
{
	struct syncache *sc;
	struct syncache_head *sch;

#ifdef INET6
	if (inc->inc_isipv6) {
		sch = &tcp_syncache.hashbase[
		    SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
		*schp = sch;

		SCH_LOCK(sch);

		/* Circle through bucket row to find matching entry. */
		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
			if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
				return (sc);
		}
	} else
#endif
	{
		sch = &tcp_syncache.hashbase[
		    SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
		*schp = sch;

		SCH_LOCK(sch);

		/* Circle through bucket row to find matching entry. */
		TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
#ifdef INET6
			if (sc->sc_inc.inc_isipv6)
				continue;
#endif
			if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
				return (sc);
		}
	}
	SCH_LOCK_ASSERT(*schp);
	return (NULL);			/* always returns with locked sch */
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */
void
syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th)
{
	struct syncache *sc;
	struct syncache_head *sch;

	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
	SCH_LOCK_ASSERT(sch);
	if (sc == NULL)
		goto done;

	/*
	 * If the RST bit is set, check the sequence number to see
	 * if this is a valid reset segment.
	 * RFC 793 page 37:
	 *   In all states except SYN-SENT, all reset (RST) segments
	 *   are validated by checking their SEQ-fields.  A reset is
	 *   valid if its sequence number is in the window.
	 *
	 *   The sequence number in the reset segment is normally an
	 *   echo of our outgoing acknowledgement numbers, but some hosts
	 *   send a reset with the sequence number at the rightmost edge
	 *   of our receive window, and we have to handle this case.
	 */
	if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
	    SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
		syncache_drop(sc, sch);
		tcpstat.tcps_sc_reset++;
	}
done:
	SCH_UNLOCK(sch);
}

void
syncache_badack(struct in_conninfo *inc)
{
	struct syncache *sc;
	struct syncache_head *sch;

	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
	SCH_LOCK_ASSERT(sch);
	if (sc != NULL) {
		syncache_drop(sc, sch);
		tcpstat.tcps_sc_badack++;
	}
	SCH_UNLOCK(sch);
}

void
syncache_unreach(struct in_conninfo *inc, struct tcphdr *th)
{
	struct syncache *sc;
	struct syncache_head *sch;

	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
	SCH_LOCK_ASSERT(sch);
	if (sc == NULL)
		goto done;

	/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
	if (ntohl(th->th_seq) != sc->sc_iss)
		goto done;

	/*
	 * If we've retransmitted 3 times and this is our second error,
	 * we remove the entry.  Otherwise, we allow it to continue on.
	 * This prevents us from incorrectly nuking an entry during a
	 * spurious network outage.
	 *
	 * See tcp_notify().
	 */
	if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) {
		sc->sc_flags |= SCF_UNREACH;
		goto done;
	}
	syncache_drop(sc, sch);
	tcpstat.tcps_sc_unreach++;
done:
	SCH_UNLOCK(sch);
}

/*
 * Build a new TCP socket structure from a syncache entry.
 */
static struct socket *
syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m)
{
	struct inpcb *inp = NULL;
	struct socket *so;
	struct tcpcb *tp;

	NET_ASSERT_GIANT();
	INP_INFO_WLOCK_ASSERT(&tcbinfo);

	/*
	 * Ok, create the full blown connection, and set things up
	 * as they would have been set up if we had created the
	 * connection when the SYN arrived.  If we can't create
	 * the connection, abort it.
	 */
	so = sonewconn(lso, SS_ISCONNECTED);
	if (so == NULL) {
		/*
		 * Drop the connection; we will send a RST if the peer
		 * retransmits the ACK.
		 */
		tcpstat.tcps_listendrop++;
		goto abort2;
	}
#ifdef MAC
	SOCK_LOCK(so);
	mac_set_socket_peer_from_mbuf(m, so);
	SOCK_UNLOCK(so);
#endif

	inp = sotoinpcb(so);
	INP_LOCK(inp);

	/* Insert new socket into PCB hash list. */
	inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		inp->in6p_laddr = sc->sc_inc.inc6_laddr;
	} else {
		inp->inp_vflag &= ~INP_IPV6;
		inp->inp_vflag |= INP_IPV4;
#endif
		inp->inp_laddr = sc->sc_inc.inc_laddr;
#ifdef INET6
	}
#endif
	inp->inp_lport = sc->sc_inc.inc_lport;
	if (in_pcbinshash(inp) != 0) {
		/*
		 * Undo the assignments above if we failed to
		 * put the PCB on the hash lists.
		 */
#ifdef INET6
		if (sc->sc_inc.inc_isipv6)
			inp->in6p_laddr = in6addr_any;
		else
#endif
			inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		goto abort;
	}
#ifdef IPSEC
	/* Copy old policy into new socket's. */
	if (ipsec_copy_pcbpolicy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
		printf("syncache_expand: could not copy policy\n");
#endif
#ifdef FAST_IPSEC
	/* Copy old policy into new socket's. */
	if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
		printf("syncache_expand: could not copy policy\n");
#endif
#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		struct inpcb *oinp = sotoinpcb(lso);
		struct in6_addr laddr6;
		struct sockaddr_in6 sin6;
		/*
		 * Inherit socket options from the listening socket.
		 * Note that in6p_inputopts are not (and should not be)
		 * copied, since it stores previously received options and is
		 * used to detect if each new option is different than the
		 * previous one and hence should be passed to a user.
		 * If we copied in6p_inputopts, a user would not be able to
		 * receive options just after calling the accept system call.
		 */
		inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
		if (oinp->in6p_outputopts)
			inp->in6p_outputopts =
			    ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);

		sin6.sin6_family = AF_INET6;
		sin6.sin6_len = sizeof(sin6);
		sin6.sin6_addr = sc->sc_inc.inc6_faddr;
		sin6.sin6_port = sc->sc_inc.inc_fport;
		sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
		laddr6 = inp->in6p_laddr;
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
			inp->in6p_laddr = sc->sc_inc.inc6_laddr;
		if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
		    thread0.td_ucred)) {
			inp->in6p_laddr = laddr6;
			goto abort;
		}
		/* Override flowlabel from in6_pcbconnect. */
		inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
		inp->in6p_flowinfo |= sc->sc_flowlabel;
	} else
#endif
	{
		struct in_addr laddr;
		struct sockaddr_in sin;

		inp->inp_options = ip_srcroute(m);
		if (inp->inp_options == NULL) {
			inp->inp_options = sc->sc_ipopts;
			sc->sc_ipopts = NULL;
		}

		sin.sin_family = AF_INET;
		sin.sin_len = sizeof(sin);
		sin.sin_addr = sc->sc_inc.inc_faddr;
		sin.sin_port = sc->sc_inc.inc_fport;
		bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
		laddr = inp->inp_laddr;
		if (inp->inp_laddr.s_addr == INADDR_ANY)
			inp->inp_laddr = sc->sc_inc.inc_laddr;
		if (in_pcbconnect(inp, (struct sockaddr *)&sin,
		    thread0.td_ucred)) {
			inp->inp_laddr = laddr;
			goto abort;
		}
	}
	tp = intotcpcb(inp);
	tp->t_state = TCPS_SYN_RECEIVED;
	tp->iss = sc->sc_iss;
	tp->irs = sc->sc_irs;
	tcp_rcvseqinit(tp);
	tcp_sendseqinit(tp);
	tp->snd_wl1 = sc->sc_irs;
	tp->rcv_up = sc->sc_irs + 1;
	tp->rcv_wnd = sc->sc_wnd;
	tp->rcv_adv += tp->rcv_wnd;

	tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
	if (sc->sc_flags & SCF_NOOPT)
		tp->t_flags |= TF_NOOPT;
	if (sc->sc_flags & SCF_WINSCALE) {
		tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
		tp->snd_scale = sc->sc_requested_s_scale;
		tp->request_r_scale = sc->sc_request_r_scale;
	}
	if (sc->sc_flags & SCF_TIMESTAMP) {
		tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
		tp->ts_recent = sc->sc_tsrecent;
		tp->ts_recent_age = ticks;
	}
#ifdef TCP_SIGNATURE
	if (sc->sc_flags & SCF_SIGNATURE)
		tp->t_flags |= TF_SIGNATURE;
#endif
	if (sc->sc_flags & SCF_SACK) {
		tp->sack_enable = 1;
		tp->t_flags |= TF_SACK_PERMIT;
	}

	/*
	 * Set up MSS and get cached values from tcp_hostcache.
	 * This might overwrite some of the defaults we just set.
	 */
	tcp_mss(tp, sc->sc_peer_mss);

	/*
	 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
	 */
	if (sc->sc_rxmits > 1)
		tp->snd_cwnd = tp->t_maxseg;
	callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);

	INP_UNLOCK(inp);

	tcpstat.tcps_accepts++;
	return (so);

abort:
	INP_UNLOCK(inp);
abort2:
	if (so != NULL)
		soabort(so);
	return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syncache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 */
int
syncache_expand(struct in_conninfo *inc, struct tcphdr *th,
    struct socket **lsop, struct mbuf *m)
{
	struct syncache *sc;
	struct syncache_head *sch;
	struct socket *so;

	/*
	 * Global TCP locks are held because we manipulate the PCB lists
	 * and create a new socket.
	 */
	INP_INFO_WLOCK_ASSERT(&tcbinfo);

	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
	SCH_LOCK_ASSERT(sch);
	if (sc == NULL) {
		/*
		 * There is no syncache entry, so see if this ACK is
		 * a returning syncookie.  To do this, first:
		 *  A. See if this socket has had a syncache entry dropped in
		 *     the past.  We don't want to accept a bogus syncookie
		 *     if we've never received a SYN.
		 *  B. Check that the syncookie is valid.  If it is, then
		 *     cobble up a fake syncache entry, and return.
		 */
		SCH_UNLOCK(sch);
		sch = NULL;

		if (!tcp_syncookies)
			goto failed;
		sc = syncookie_lookup(inc, th, *lsop);
		if (sc == NULL)
			goto failed;
		tcpstat.tcps_sc_recvcookie++;
	} else {
		/* Pull out the entry to unlock the bucket row. */
		TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
		sch->sch_length--;
		tcp_syncache.cache_count--;
		SCH_UNLOCK(sch);
	}

	/*
	 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
	 */
	if (th->th_ack != sc->sc_iss + 1)
		goto failed;

	so = syncache_socket(sc, *lsop, m);

	if (so == NULL) {
#if 0
resetandabort:
		/* XXXjlemon check this - is this correct? */
		(void) tcp_respond(NULL, m, m, th,
		    th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
#endif
		m_freem(m);			/* XXX: only needed for above */
		tcpstat.tcps_sc_aborted++;
		if (sch != NULL) {
			syncache_insert(sc, sch);  /* try again later */
			sc = NULL;
		}
		goto failed;
	} else
		tcpstat.tcps_sc_completed++;
	*lsop = so;

	syncache_free(sc);
	return (1);
failed:
	if (sc != NULL)
		syncache_free(sc);
	return (0);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *	<SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */
int
syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th,
    struct inpcb *inp, struct socket **lsop, struct mbuf *m)
{
	struct tcpcb *tp;
	struct socket *so;
	struct syncache *sc = NULL;
	struct syncache_head *sch;
	struct mbuf *ipopts = NULL;
	u_int32_t flowtmp;
	int win, sb_hiwat, ip_ttl, ip_tos, noopt;
#ifdef INET6
	int autoflowlabel = 0;
#endif

	INP_INFO_WLOCK_ASSERT(&tcbinfo);
	INP_LOCK_ASSERT(inp);			/* listen socket */

	/*
	 * Combine all so/tp operations very early to drop the INP lock as
	 * soon as possible.
	 */
	so = *lsop;
	tp = sototcpcb(so);

#ifdef INET6
	if (inc->inc_isipv6 &&
	    (inp->in6p_flags & IN6P_AUTOFLOWLABEL))
		autoflowlabel = 1;
#endif
	ip_ttl = inp->inp_ip_ttl;
	ip_tos = inp->inp_ip_tos;
	win = sbspace(&so->so_rcv);
	sb_hiwat = so->so_rcv.sb_hiwat;
	noopt = (tp->t_flags & TF_NOOPT);

	so = NULL;
	tp = NULL;

	INP_UNLOCK(inp);
	INP_INFO_WUNLOCK(&tcbinfo);

	/*
	 * Remember the IP options, if any.
	 */
#ifdef INET6
	if (!inc->inc_isipv6)
#endif
		ipopts = ip_srcroute(m);

	/*
	 * See if we already have an entry for this connection.
	 * If we do, resend the SYN,ACK, and reset the retransmit timer.
	 *
	 * XXX: should the syncache be re-initialized with the contents
	 * of the new SYN here (which may have different options?)
	 */
	sc = syncache_lookup(inc, &sch);	/* returns locked sch */
	SCH_LOCK_ASSERT(sch);
	if (sc != NULL) {
		tcpstat.tcps_sc_dupsyn++;
		if (ipopts) {
			/*
			 * If we were remembering a previous source route,
			 * forget it and use the new one we've been given.
			 */
			if (sc->sc_ipopts)
				(void) m_free(sc->sc_ipopts);
			sc->sc_ipopts = ipopts;
		}
		/*
		 * Update timestamp if present.
		 */
		if (sc->sc_flags & SCF_TIMESTAMP)
			sc->sc_tsrecent = to->to_tsval;
		if (syncache_respond(sc, m) == 0) {
			SYNCACHE_TIMEOUT(sc, sch, 1);
			tcpstat.tcps_sndacks++;
			tcpstat.tcps_sndtotal++;
		}
		SCH_UNLOCK(sch);
		goto done;
	}

	sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
	if (sc == NULL) {
		/*
		 * The zone allocator couldn't provide more entries.
		 * Treat this as if the cache was full; drop the oldest
		 * entry and insert the new one.
		 */
		tcpstat.tcps_sc_zonefail++;
		sc = TAILQ_LAST(&sch->sch_bucket, sch_head);
		syncache_drop(sc, sch);
		SCH_UNLOCK(sch);
		sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
		if (sc == NULL) {
			if (ipopts)
				(void) m_free(ipopts);
			goto done;
		}
	} else
		SCH_UNLOCK(sch);

	/*
	 * Fill in the syncache values.
	 */
	sc->sc_ipopts = ipopts;
	sc->sc_inc.inc_fport = inc->inc_fport;
	sc->sc_inc.inc_lport = inc->inc_lport;
#ifdef INET6
	sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
	if (inc->inc_isipv6) {
		sc->sc_inc.inc6_faddr = inc->inc6_faddr;
		sc->sc_inc.inc6_laddr = inc->inc6_laddr;
	} else
#endif
	{
		sc->sc_inc.inc_faddr = inc->inc_faddr;
		sc->sc_inc.inc_laddr = inc->inc_laddr;
		sc->sc_ip_tos = ip_tos;
		sc->sc_ip_ttl = ip_ttl;
	}
	sc->sc_irs = th->th_seq;
	sc->sc_flags = 0;
	sc->sc_peer_mss = (to->to_flags & TOF_MSS) ? to->to_mss : 0;
	sc->sc_flowlabel = 0;
	if (tcp_syncookies) {
		sc->sc_iss = syncookie_generate(sc, &flowtmp);
#ifdef INET6
		if (autoflowlabel)
			sc->sc_flowlabel = flowtmp & IPV6_FLOWLABEL_MASK;
#endif
	} else {
		sc->sc_iss = arc4random();
#ifdef INET6
		if (autoflowlabel)
			sc->sc_flowlabel =
			    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#endif
	}

	/*
	 * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN].
	 * win was derived from socket earlier in the function.
	 */
	win = imax(win, 0);
	win = imin(win, TCP_MAXWIN);
	sc->sc_wnd = win;

	if (tcp_do_rfc1323) {
		/*
		 * A timestamp received in a SYN makes
		 * it ok to send timestamp requests and replies.
		 */
		if (to->to_flags & TOF_TS) {
			sc->sc_tsrecent = to->to_tsval;
			sc->sc_flags |= SCF_TIMESTAMP;
		}
		if (to->to_flags & TOF_SCALE) {
			int wscale = 0;

			/* Compute proper scaling value from buffer space */
			while (wscale < TCP_MAX_WINSHIFT &&
			    (TCP_MAXWIN << wscale) < sb_hiwat)
				wscale++;
			sc->sc_request_r_scale = wscale;
			sc->sc_requested_s_scale = to->to_requested_s_scale;
			sc->sc_flags |= SCF_WINSCALE;
		}
	}
#ifdef TCP_SIGNATURE
	/*
	 * If listening socket requested TCP digests, and received SYN
	 * contains the option, flag this in the syncache so that
	 * syncache_respond() will do the right thing with the SYN+ACK.
	 * XXX: Currently we always record the option by default and will
	 * attempt to use it in syncache_respond().
	 */
	if (to->to_flags & TOF_SIGNATURE)
		sc->sc_flags |= SCF_SIGNATURE;
#endif

	if (to->to_flags & TOF_SACK)
		sc->sc_flags |= SCF_SACK;
	if (noopt)
		sc->sc_flags |= SCF_NOOPT;

	/*
	 * Do a standard 3-way handshake.
	 */
	if (syncache_respond(sc, m) == 0) {
		syncache_insert(sc, sch);	/* locks and unlocks sch */
		tcpstat.tcps_sndacks++;
		tcpstat.tcps_sndtotal++;
	} else {
		syncache_free(sc);
		tcpstat.tcps_sc_dropped++;
	}

done:
	*lsop = NULL;
	return (1);
}

static int
syncache_respond(struct syncache *sc, struct mbuf *m)
{
	u_int8_t *optp;
	int optlen, error;
	u_int16_t tlen, hlen, mssopt;
	struct ip *ip = NULL;
	struct tcphdr *th;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
#endif
#ifdef MAC
	struct inpcb *inp = NULL;
#endif

	hlen =
#ifdef INET6
	       (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
#endif
		sizeof(struct ip);

	KASSERT(sc != NULL, ("syncache_respond with NULL syncache pointer"));

	/* Determine MSS we advertise to other end of connection. */
	mssopt = tcp_mssopt(&sc->sc_inc);

	/* Compute the size of the TCP options. */
	if (sc->sc_flags & SCF_NOOPT) {
		optlen = 0;
	} else {
		optlen = TCPOLEN_MAXSEG +
		    ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
		    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
#ifdef TCP_SIGNATURE
		if (sc->sc_flags & SCF_SIGNATURE)
			optlen += TCPOLEN_SIGNATURE;
#endif
		if (sc->sc_flags & SCF_SACK)
			optlen += TCPOLEN_SACK_PERMITTED;
		optlen = roundup2(optlen, 4);
	}
	tlen = hlen + sizeof(struct tcphdr) + optlen;

	/*
	 * XXX: Assume that the entire packet will fit in a header mbuf.
	 */
	KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));

	/* Create the IP+TCP header from scratch. */
	if (m)
		m_freem(m);

	m = m_gethdr(M_DONTWAIT, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = NULL;

#ifdef MAC
	/*
	 * For MAC look up the inpcb to get access to the label information.
	 * We don't store the inpcb pointer in struct syncache to make locking
	 * less complicated and to save locking operations.  However for MAC
	 * this gives a slight overhead as we have to do a full pcblookup here.
	 */
	INP_INFO_RLOCK(&tcbinfo);
	if (inp == NULL) {
#ifdef INET6 /* && MAC */
		if (sc->sc_inc.inc_isipv6)
			inp = in6_pcblookup_hash(&tcbinfo,
				&sc->sc_inc.inc6_faddr, sc->sc_inc.inc_fport,
				&sc->sc_inc.inc6_laddr, sc->sc_inc.inc_lport,
				1, NULL);
		else
#endif /* INET6 */
			inp = in_pcblookup_hash(&tcbinfo,
				sc->sc_inc.inc_faddr, sc->sc_inc.inc_fport,
				sc->sc_inc.inc_laddr, sc->sc_inc.inc_lport,
				1, NULL);
		if (inp == NULL) {
			m_freem(m);
			INP_INFO_RUNLOCK(&tcbinfo);
			return (ESHUTDOWN);
		}
	}
	INP_LOCK(inp);
	if ((inp->inp_socket->so_options & SO_ACCEPTCONN) == 0) {
		m_freem(m);
		INP_UNLOCK(inp);
		INP_INFO_RUNLOCK(&tcbinfo);
		return (ESHUTDOWN);
	}
	mac_create_mbuf_from_inpcb(inp, m);
	INP_UNLOCK(inp);
	INP_INFO_RUNLOCK(&tcbinfo);
#endif /* MAC */

#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		ip6 = mtod(m, struct ip6_hdr *);
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_src = sc->sc_inc.inc6_laddr;
		ip6->ip6_dst = sc->sc_inc.inc6_faddr;
		ip6->ip6_plen = htons(tlen - hlen);
		/* ip6_hlim is set after checksum */
		ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
		ip6->ip6_flow |= sc->sc_flowlabel;

		th = (struct tcphdr *)(ip6 + 1);
	} else
#endif
	{
		ip = mtod(m, struct ip *);
		ip->ip_v = IPVERSION;
		ip->ip_hl = sizeof(struct ip) >> 2;
		ip->ip_len = tlen;
		ip->ip_id = 0;
		ip->ip_off = 0;
		ip->ip_sum = 0;
		ip->ip_p = IPPROTO_TCP;
		ip->ip_src = sc->sc_inc.inc_laddr;
		ip->ip_dst = sc->sc_inc.inc_faddr;
		ip->ip_ttl = sc->sc_ip_ttl;
		ip->ip_tos = sc->sc_ip_tos;

		/*
		 * See if we should do MTU discovery.  Route lookups are
		 * expensive, so we will only unset the DF bit if:
		 *
		 *	1) path_mtu_discovery is disabled
		 *	2) the SCF_UNREACH flag has been set
		 */
		if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
			ip->ip_off |= IP_DF;

		th = (struct tcphdr *)(ip + 1);
	}
	th->th_sport = sc->sc_inc.inc_lport;
	th->th_dport = sc->sc_inc.inc_fport;

	th->th_seq = htonl(sc->sc_iss);
	th->th_ack = htonl(sc->sc_irs + 1);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_x2 = 0;
	th->th_flags = TH_SYN|TH_ACK;
	th->th_win = htons(sc->sc_wnd);
	th->th_urp = 0;

	/* Tack on the TCP options. */
	if (optlen != 0) {
		optp = (u_int8_t *)(th + 1);
		*optp++ = TCPOPT_MAXSEG;
		*optp++ = TCPOLEN_MAXSEG;
		*optp++ = (mssopt >> 8) & 0xff;
		*optp++ = mssopt & 0xff;

		if (sc->sc_flags & SCF_WINSCALE) {
			*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
			    TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
			    sc->sc_request_r_scale);
			optp += 4;
		}

		if (sc->sc_flags & SCF_TIMESTAMP) {
			u_int32_t *lp = (u_int32_t *)(optp);

			/* Form timestamp option per appendix A of RFC 1323. */
			*lp++ = htonl(TCPOPT_TSTAMP_HDR);
			*lp++ = htonl(ticks);
			*lp   = htonl(sc->sc_tsrecent);
			optp += TCPOLEN_TSTAMP_APPA;
		}

#ifdef TCP_SIGNATURE
		/*
		 * Handle TCP-MD5 passive opener response.
		 */
		if (sc->sc_flags & SCF_SIGNATURE) {
			u_int8_t *bp = optp;
			int i;

			*bp++ = TCPOPT_SIGNATURE;
			*bp++ = TCPOLEN_SIGNATURE;
			for (i = 0; i < TCP_SIGLEN; i++)
				*bp++ = 0;
			tcp_signature_compute(m, sizeof(struct ip), 0, optlen,
			    optp + 2, IPSEC_DIR_OUTBOUND);
			optp += TCPOLEN_SIGNATURE;
		}
#endif /* TCP_SIGNATURE */

		if (sc->sc_flags & SCF_SACK) {
			*optp++ = TCPOPT_SACK_PERMITTED;
			*optp++ = TCPOLEN_SACK_PERMITTED;
		}

		{
			/* Pad TCP options to a 4 byte boundary */
			int padlen = optlen - (optp - (u_int8_t *)(th + 1));
			while (padlen-- > 0)
				*optp++ = TCPOPT_EOL;
		}
	}

#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
		ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
	} else
#endif
	{
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(tlen - hlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL);
	}
	return (error);
}

/*
 * cookie layers:
 *
 *	|. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
 *	| peer iss                                                      |
 *	| MD5(laddr,faddr,secret,lport,fport)             |. . . . . . .|
 *	|                     0                       |(A)|             |
 * (A): peer mss index
 */

/*
 * The values below are chosen to minimize the size of the tcp_secret
 * table, as well as providing roughly a 16 second lifetime for the cookie.
 */

#define SYNCOOKIE_WNDBITS	5	/* exposed bits for window indexing */
#define SYNCOOKIE_TIMESHIFT	1	/* scale ticks to window time units */

#define SYNCOOKIE_WNDMASK	((1 << SYNCOOKIE_WNDBITS) - 1)
#define SYNCOOKIE_NSECRETS	(1 << SYNCOOKIE_WNDBITS)
#define SYNCOOKIE_TIMEOUT \
    (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
#define SYNCOOKIE_DATAMASK	((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
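
/*
 * With SYNCOOKIE_WNDBITS == 5 and SYNCOOKIE_TIMESHIFT == 1 this works
 * out as follows: the window index advances every hz/2 ticks, i.e.
 * every half second, cycling through all 32 secrets in 16 seconds,
 * which is also SYNCOOKIE_TIMEOUT (hz * 32 / 2 ticks).  Of the 32
 * cookie bits, the low 5 carry the window index, the next 2 the MSS
 * table index (SYNCOOKIE_DATAMASK covers these 7 bits), and the
 * remaining 25 must match the MD5 output for the cookie to validate.
 */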

#define SYNCOOKIE_RLOCK(ts)	(rw_rlock(&(ts).ts_rwmtx))
#define SYNCOOKIE_RUNLOCK(ts)	(rw_runlock(&(ts).ts_rwmtx))
#define SYNCOOKIE_TRY_UPGRADE(ts)  (rw_try_upgrade(&(ts).ts_rwmtx))
#define SYNCOOKIE_DOWNGRADE(ts)	(rw_downgrade(&(ts).ts_rwmtx))

static struct {
	struct rwlock	ts_rwmtx;
	u_int		ts_expire;	/* ticks */
	u_int32_t	ts_secbits[4];
} tcp_secret[SYNCOOKIE_NSECRETS];

static int tcp_msstab[] = { 0, 536, 1460, 8960 };
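
/*
 * tcp_msstab[] holds the four MSS classes the 2-bit cookie field can
 * encode.  syncookie_generate() picks the largest table value that
 * does not exceed the peer's advertised MSS, so the MSS recovered
 * from a returning cookie is a safe underestimate of what the peer
 * actually offered.
 */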

#define MD5Add(v)	MD5Update(&syn_ctx, (u_char *)&v, sizeof(v))

struct md5_add {
	u_int32_t laddr, faddr;
	u_int32_t secbits[4];
	u_int16_t lport, fport;
};

#ifdef CTASSERT
CTASSERT(sizeof(struct md5_add) == 28);
#endif
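
/*
 * The CTASSERT above matters because MD5Add() hashes struct md5_add
 * as raw bytes: 2 * 4 (addresses) + 4 * 4 (secret bits) + 2 * 2
 * (ports) only equals 28 if the compiler inserts no padding.  Any
 * padding would mix uninitialized stack bytes into the digest and
 * make cookies generated here fail validation in syncookie_lookup().
 */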

/*
 * Consider the problem of a recreated (and retransmitted) cookie.  If the
 * original SYN was accepted, the connection is established.  The second
 * SYN is inflight, and if it arrives with an ISN that falls within the
 * receive window, the connection is killed.
 *
 * However, since cookies have other problems, this may not be worth
 * worrying about.
 */

static void
syncookie_init(void)
{
	int idx;

	for (idx = 0; idx < SYNCOOKIE_NSECRETS; idx++) {
		rw_init(&(tcp_secret[idx].ts_rwmtx), "tcp_secret");
	}
}

static u_int32_t
syncookie_generate(struct syncache *sc, u_int32_t *flowid)
{
	u_int32_t md5_buffer[4];
	u_int32_t data;
	int idx, i;
	struct md5_add add;
	MD5_CTX syn_ctx;

	idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
	SYNCOOKIE_RLOCK(tcp_secret[idx]);
	if (tcp_secret[idx].ts_expire < ticks &&
	    SYNCOOKIE_TRY_UPGRADE(tcp_secret[idx])) {
		/* need write access */
		for (i = 0; i < 4; i++)
			tcp_secret[idx].ts_secbits[i] = arc4random();
		tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
		SYNCOOKIE_DOWNGRADE(tcp_secret[idx]);
	}
	for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
		if (tcp_msstab[data] <= sc->sc_peer_mss)
			break;
	data = (data << SYNCOOKIE_WNDBITS) | idx;
	data ^= sc->sc_irs;				/* peer's iss */
	MD5Init(&syn_ctx);
#ifdef INET6
	if (sc->sc_inc.inc_isipv6) {
		MD5Add(sc->sc_inc.inc6_laddr);
		MD5Add(sc->sc_inc.inc6_faddr);
		add.laddr = 0;
		add.faddr = 0;
	} else
#endif
	{
		add.laddr = sc->sc_inc.inc_laddr.s_addr;
		add.faddr = sc->sc_inc.inc_faddr.s_addr;
	}
	add.lport = sc->sc_inc.inc_lport;
	add.fport = sc->sc_inc.inc_fport;
	add.secbits[0] = tcp_secret[idx].ts_secbits[0];
	add.secbits[1] = tcp_secret[idx].ts_secbits[1];
	add.secbits[2] = tcp_secret[idx].ts_secbits[2];
	add.secbits[3] = tcp_secret[idx].ts_secbits[3];
	SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
	MD5Add(add);
	MD5Final((u_char *)&md5_buffer, &syn_ctx);
	data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
	*flowid = md5_buffer[1];
	return (data);
}

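/*
 * Validation is the inverse of generation: th_ack - 1 is the cookie
 * we handed out as our ISS and th_seq - 1 is the peer's ISS that was
 * XORed into it, so XORing the two recovers the packed data.  The
 * low 5 bits select the secret window, the MD5 digest is recomputed
 * and XORed off, and the cookie is only accepted if the 25 high bits
 * come out zero.  All that survives into the faked entry is the
 * 2-bit MSS class; window scaling, timestamp and SACK state are
 * lost, which is the usual syncookie tradeoff.
 */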
static struct syncache *
syncookie_lookup(struct in_conninfo *inc, struct tcphdr *th, struct socket *so)
{
	u_int32_t md5_buffer[4];
	struct syncache *sc;
	u_int32_t data;
	int wnd, idx;
	struct md5_add add;
	MD5_CTX syn_ctx;

	data = (th->th_ack - 1) ^ (th->th_seq - 1);	/* remove ISS */
	idx = data & SYNCOOKIE_WNDMASK;
	SYNCOOKIE_RLOCK(tcp_secret[idx]);
	if (tcp_secret[idx].ts_expire < ticks ||
	    sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks) {
		SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
		return (NULL);
	}
	MD5Init(&syn_ctx);
#ifdef INET6
	if (inc->inc_isipv6) {
		MD5Add(inc->inc6_laddr);
		MD5Add(inc->inc6_faddr);
		add.laddr = 0;
		add.faddr = 0;
	} else
#endif
	{
		add.laddr = inc->inc_laddr.s_addr;
		add.faddr = inc->inc_faddr.s_addr;
	}
	add.lport = inc->inc_lport;
	add.fport = inc->inc_fport;
	add.secbits[0] = tcp_secret[idx].ts_secbits[0];
	add.secbits[1] = tcp_secret[idx].ts_secbits[1];
	add.secbits[2] = tcp_secret[idx].ts_secbits[2];
	add.secbits[3] = tcp_secret[idx].ts_secbits[3];
	SYNCOOKIE_RUNLOCK(tcp_secret[idx]);
	MD5Add(add);
	MD5Final((u_char *)&md5_buffer, &syn_ctx);
	data ^= md5_buffer[0];
	if ((data & ~SYNCOOKIE_DATAMASK) != 0)
		return (NULL);
	data = data >> SYNCOOKIE_WNDBITS;

	sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
	if (sc == NULL)
		return (NULL);
	/*
	 * Fill in the syncache values.
	 * XXX: duplicate code from syncache_add
	 */
	sc->sc_ipopts = NULL;
	sc->sc_inc.inc_fport = inc->inc_fport;
	sc->sc_inc.inc_lport = inc->inc_lport;
#ifdef INET6
	sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
	if (inc->inc_isipv6) {
		sc->sc_inc.inc6_faddr = inc->inc6_faddr;
		sc->sc_inc.inc6_laddr = inc->inc6_laddr;
		if (sotoinpcb(so)->in6p_flags & IN6P_AUTOFLOWLABEL)
			sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
	} else
#endif
	{
		sc->sc_inc.inc_faddr = inc->inc_faddr;
		sc->sc_inc.inc_laddr = inc->inc_laddr;
		sc->sc_ip_ttl = sotoinpcb(so)->inp_ip_ttl;
		sc->sc_ip_tos = sotoinpcb(so)->inp_ip_tos;
	}
	sc->sc_irs = th->th_seq - 1;
	sc->sc_iss = th->th_ack - 1;
	wnd = sbspace(&so->so_rcv);
	wnd = imax(wnd, 0);
	wnd = imin(wnd, TCP_MAXWIN);
	sc->sc_wnd = wnd;
	sc->sc_flags = 0;
	sc->sc_rxmits = 0;
	sc->sc_peer_mss = tcp_msstab[data];
	return (sc);
}