/*-
 * Copyright (c) 2001 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jonathan Lemon
 * and NAI Labs, the Security Research Division of Network Associates, Inc.
 * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
 * DARPA CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: head/sys/netinet/tcp_syncache.c 121307 2003-10-21 18:28:36Z silby $
 */

#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mac.h>
#include <sys/mbuf.h>
#include <sys/md5.h>
#include <sys/proc.h>           /* for proc0 declaration */
#include <sys/random.h>
#include <sys/socket.h>
#include <sys/socketvar.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/tcp.h>
#ifdef TCPDEBUG
#include <netinet/tcpip.h>
#endif
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /* IPSEC */

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#define IPSEC
#endif /* FAST_IPSEC */

#include <machine/in_cksum.h>
#include <vm/uma.h>

static int tcp_syncookies = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
    &tcp_syncookies, 0,
    "Use TCP SYN cookies if the syncache overflows");

static void syncache_drop(struct syncache *, struct syncache_head *);
static void syncache_free(struct syncache *);
static void syncache_insert(struct syncache *, struct syncache_head *);
struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
#ifdef TCPDEBUG
static int syncache_respond(struct syncache *, struct mbuf *, struct socket *);
#else
static int syncache_respond(struct syncache *, struct mbuf *);
#endif
static struct socket *syncache_socket(struct syncache *, struct socket *,
    struct mbuf *m);
static void syncache_timer(void *);
static u_int32_t syncookie_generate(struct syncache *);
static struct syncache *syncookie_lookup(struct in_conninfo *,
    struct tcphdr *, struct socket *);

/*
 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
 * 3 retransmits corresponds to a timeout of (1 + 2 + 4 + 8 == 15) seconds;
 * by then the odds are that the user has given up attempting to connect.
 */
#define SYNCACHE_MAXREXMTS              3
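
/*
 * Worked example of the resulting retransmit schedule (a sketch, not
 * normative: it assumes TCPTV_RTOBASE amounts to one second of ticks and
 * the usual tcp_backoff[] table of { 1, 2, 4, 8, ... }, as the comment
 * above implies):
 *
 *      slot 0: SYN,ACK sent,    next timeout after 1s   (t =  1s)
 *      slot 1: 1st retransmit,  next timeout after 2s   (t =  3s)
 *      slot 2: 2nd retransmit,  next timeout after 4s   (t =  7s)
 *      slot 3: 3rd retransmit,  entry expires after 8s  (t = 15s)
 */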

/* Arbitrary values */
#define TCP_SYNCACHE_HASHSIZE           512
#define TCP_SYNCACHE_BUCKETLIMIT        30

struct tcp_syncache {
        struct syncache_head *hashbase;
        uma_zone_t zone;
        u_int hashsize;
        u_int hashmask;
        u_int bucket_limit;
        u_int cache_count;
        u_int cache_limit;
        u_int rexmt_limit;
        u_int hash_secret;
        u_int next_reseed;
        TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
        struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
};
static struct tcp_syncache tcp_syncache;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
    &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
    &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
    &tcp_syncache.cache_count, 0, "Current number of entries in syncache");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
    &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");

SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
    &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");

static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");

#define SYNCACHE_HASH(inc, mask)                                        \
        ((tcp_syncache.hash_secret ^                                    \
          (inc)->inc_faddr.s_addr ^                                     \
          ((inc)->inc_faddr.s_addr >> 16) ^                             \
          (inc)->inc_fport ^ (inc)->inc_lport) & mask)

#define SYNCACHE_HASH6(inc, mask)                                       \
        ((tcp_syncache.hash_secret ^                                    \
          (inc)->inc6_faddr.s6_addr32[0] ^                              \
          (inc)->inc6_faddr.s6_addr32[3] ^                              \
          (inc)->inc_fport ^ (inc)->inc_lport) & mask)

#define ENDPTS_EQ(a, b) (                                               \
        (a)->ie_fport == (b)->ie_fport &&                               \
        (a)->ie_lport == (b)->ie_lport &&                               \
        (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&                 \
        (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr                    \
)

#define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)

#define SYNCACHE_TIMEOUT(sc, slot) do {                                 \
        sc->sc_rxtslot = (slot);                                        \
        sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[(slot)];   \
        TAILQ_INSERT_TAIL(&tcp_syncache.timerq[(slot)], sc, sc_timerq); \
        if (!callout_active(&tcp_syncache.tt_timerq[(slot)]))           \
                callout_reset(&tcp_syncache.tt_timerq[(slot)],          \
                    TCPTV_RTOBASE * tcp_backoff[(slot)],                \
                    syncache_timer, (void *)((intptr_t)(slot)));        \
} while (0)

static void
syncache_free(struct syncache *sc)
{
        struct rtentry *rt;

        if (sc->sc_ipopts)
                (void) m_free(sc->sc_ipopts);
#ifdef INET6
        if (sc->sc_inc.inc_isipv6)
                rt = sc->sc_route6.ro_rt;
        else
#endif
                rt = sc->sc_route.ro_rt;
        if (rt != NULL) {
                /*
                 * If this is the only reference to a protocol-cloned
                 * route, remove it immediately.
                 */
                if (rt->rt_flags & RTF_WASCLONED &&
                    (sc->sc_flags & SCF_KEEPROUTE) == 0 &&
                    rt->rt_refcnt == 1)
                        rtrequest(RTM_DELETE, rt_key(rt),
                            rt->rt_gateway, rt_mask(rt),
                            rt->rt_flags, NULL);
                RTFREE(rt);
        }
        uma_zfree(tcp_syncache.zone, sc);
}

void
syncache_init(void)
{
        int i;

        tcp_syncache.cache_count = 0;
        tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
        tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
        tcp_syncache.cache_limit =
            tcp_syncache.hashsize * tcp_syncache.bucket_limit;
        tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
        tcp_syncache.next_reseed = 0;
        tcp_syncache.hash_secret = arc4random();

        TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
            &tcp_syncache.hashsize);
        TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
            &tcp_syncache.cache_limit);
        TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
            &tcp_syncache.bucket_limit);
        if (!powerof2(tcp_syncache.hashsize)) {
                printf("WARNING: syncache hash size is not a power of 2.\n");
                tcp_syncache.hashsize = 512;    /* safe default */
        }
        tcp_syncache.hashmask = tcp_syncache.hashsize - 1;

        /* Allocate the hash table. */
        MALLOC(tcp_syncache.hashbase, struct syncache_head *,
            tcp_syncache.hashsize * sizeof(struct syncache_head),
            M_SYNCACHE, M_WAITOK);

        /* Initialize the hash buckets. */
        for (i = 0; i < tcp_syncache.hashsize; i++) {
                TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
                tcp_syncache.hashbase[i].sch_length = 0;
        }

        /* Initialize the timer queues. */
        for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
                TAILQ_INIT(&tcp_syncache.timerq[i]);
                callout_init(&tcp_syncache.tt_timerq[i], CALLOUT_MPSAFE);
        }

        /*
         * Allocate the syncache entries.  Allow the zone to allocate one
         * more entry than the cache limit, so a new entry can bump out an
         * older one.
         */
        tcp_syncache.cache_limit -= 1;
        tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
        uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
}
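
/*
 * Usage sketch (illustrative values, not recommendations): hashsize,
 * cachelimit and bucketlimit are CTLFLAG_RDTUN, so they can only be set
 * as loader tunables before boot, e.g. in /boot/loader.conf:
 *
 *      net.inet.tcp.syncache.hashsize="1024"
 *      net.inet.tcp.syncache.bucketlimit="60"
 *
 * hashsize must be a power of 2 or syncache_init() falls back to 512;
 * rexmtlimit is CTLFLAG_RW and may be changed at runtime via sysctl(8).
 */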

static void
syncache_insert(sc, sch)
        struct syncache *sc;
        struct syncache_head *sch;
{
        struct syncache *sc2;
        int s, i;

        /*
         * Make sure that we don't overflow the per-bucket
         * limit or the total cache size limit.
         */
        s = splnet();
        if (sch->sch_length >= tcp_syncache.bucket_limit) {
                /*
                 * The bucket is full, toss the oldest element.
                 */
                sc2 = TAILQ_FIRST(&sch->sch_bucket);
                sc2->sc_tp->ts_recent = ticks;
                syncache_drop(sc2, sch);
                tcpstat.tcps_sc_bucketoverflow++;
        } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
                /*
                 * The cache is full.  Toss the oldest entry in the
                 * entire cache.  This is the front entry in the
                 * first non-empty timer queue with the largest
                 * timeout value.
                 */
                for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
                        sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
                        if (sc2 != NULL)
                                break;
                }
                sc2->sc_tp->ts_recent = ticks;
                syncache_drop(sc2, NULL);
                tcpstat.tcps_sc_cacheoverflow++;
        }

        /* Initialize the entry's timer. */
        SYNCACHE_TIMEOUT(sc, 0);

        /* Put it into the bucket. */
        TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
        sch->sch_length++;
        tcp_syncache.cache_count++;
        tcpstat.tcps_sc_added++;
        splx(s);
}

static void
syncache_drop(sc, sch)
        struct syncache *sc;
        struct syncache_head *sch;
{
        int s;

        if (sch == NULL) {
#ifdef INET6
                if (sc->sc_inc.inc_isipv6) {
                        sch = &tcp_syncache.hashbase[
                            SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
                } else
#endif
                {
                        sch = &tcp_syncache.hashbase[
                            SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
                }
        }

        s = splnet();

        TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
        sch->sch_length--;
        tcp_syncache.cache_count--;

        TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
        if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
                callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
        splx(s);

        syncache_free(sc);
}

/*
 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 * If we have retransmitted an entry the maximum number of times, expire it.
 */
static void
syncache_timer(xslot)
        void *xslot;
{
        intptr_t slot = (intptr_t)xslot;
        struct syncache *sc, *nsc;
        struct inpcb *inp;
        int s;

        s = splnet();
        INP_INFO_WLOCK(&tcbinfo);
        if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
            !callout_active(&tcp_syncache.tt_timerq[slot])) {
                INP_INFO_WUNLOCK(&tcbinfo);
                splx(s);
                return;
        }
        callout_deactivate(&tcp_syncache.tt_timerq[slot]);

        nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
        while (nsc != NULL) {
                if (ticks < nsc->sc_rxttime)
                        break;
                sc = nsc;
                inp = sc->sc_tp->t_inpcb;
                if (slot == SYNCACHE_MAXREXMTS ||
                    slot >= tcp_syncache.rexmt_limit ||
                    inp == NULL || inp->inp_gencnt != sc->sc_inp_gencnt) {
                        nsc = TAILQ_NEXT(sc, sc_timerq);
                        syncache_drop(sc, NULL);
                        tcpstat.tcps_sc_stale++;
                        continue;
                }
                /*
                 * syncache_respond() may call back into the syncache to
                 * modify another entry, so do not obtain the next
                 * entry on the timer chain until it has completed.
                 */
#ifdef TCPDEBUG
                (void) syncache_respond(sc, NULL, NULL);
#else
                (void) syncache_respond(sc, NULL);
#endif
                nsc = TAILQ_NEXT(sc, sc_timerq);
                tcpstat.tcps_sc_retransmitted++;
                TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
                SYNCACHE_TIMEOUT(sc, slot + 1);
        }
        if (nsc != NULL)
                callout_reset(&tcp_syncache.tt_timerq[slot],
                    nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
        INP_INFO_WUNLOCK(&tcbinfo);
        splx(s);
}

/*
 * Find an entry in the syncache.
 */
struct syncache *
syncache_lookup(inc, schp)
        struct in_conninfo *inc;
        struct syncache_head **schp;
{
        struct syncache *sc;
        struct syncache_head *sch;
        int s;

#ifdef INET6
        if (inc->inc_isipv6) {
                sch = &tcp_syncache.hashbase[
                    SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
                *schp = sch;
                s = splnet();
                TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
                        if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
                                splx(s);
                                return (sc);
                        }
                }
                splx(s);
        } else
#endif
        {
                sch = &tcp_syncache.hashbase[
                    SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
                *schp = sch;
                s = splnet();
                TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
#ifdef INET6
                        if (sc->sc_inc.inc_isipv6)
                                continue;
#endif
                        if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie)) {
                                splx(s);
                                return (sc);
                        }
                }
                splx(s);
        }
        return (NULL);
}

/*
 * This function is called when we get a RST for a
 * non-existent connection, so that we can see if the
 * connection is in the syn cache.  If it is, zap it.
 */
void
syncache_chkrst(inc, th)
        struct in_conninfo *inc;
        struct tcphdr *th;
{
        struct syncache *sc;
        struct syncache_head *sch;

        sc = syncache_lookup(inc, &sch);
        if (sc == NULL)
                return;
        /*
         * If the RST bit is set, check the sequence number to see
         * if this is a valid reset segment.
         * RFC 793 page 37:
         *   In all states except SYN-SENT, all reset (RST) segments
         *   are validated by checking their SEQ-fields.  A reset is
         *   valid if its sequence number is in the window.
         *
         * The sequence number in the reset segment is normally an
         * echo of our outgoing acknowledgement numbers, but some hosts
         * send a reset with the sequence number at the rightmost edge
         * of our receive window, and we have to handle this case.
         */
        if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
            SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
                syncache_drop(sc, sch);
                tcpstat.tcps_sc_reset++;
        }
}

void
syncache_badack(inc)
        struct in_conninfo *inc;
{
        struct syncache *sc;
        struct syncache_head *sch;

        sc = syncache_lookup(inc, &sch);
        if (sc != NULL) {
                syncache_drop(sc, sch);
                tcpstat.tcps_sc_badack++;
        }
}

void
syncache_unreach(inc, th)
        struct in_conninfo *inc;
        struct tcphdr *th;
{
        struct syncache *sc;
        struct syncache_head *sch;

        /* we are called at splnet() here */
        sc = syncache_lookup(inc, &sch);
        if (sc == NULL)
                return;

        /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
        if (ntohl(th->th_seq) != sc->sc_iss)
                return;

        /*
         * If we've retransmitted 3 times and this is our second error,
         * we remove the entry.  Otherwise, we allow it to continue on.
         * This prevents us from incorrectly nuking an entry during a
         * spurious network outage.
         *
         * See tcp_notify().
         */
        if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
                sc->sc_flags |= SCF_UNREACH;
                return;
        }
        syncache_drop(sc, sch);
        tcpstat.tcps_sc_unreach++;
}

/*
 * Build a new TCP socket structure from a syncache entry.
 */
static struct socket *
syncache_socket(sc, lso, m)
        struct syncache *sc;
        struct socket *lso;
        struct mbuf *m;
{
        struct inpcb *inp = NULL;
        struct socket *so;
        struct tcpcb *tp;

        /*
         * Ok, create the full blown connection, and set things up
         * as they would have been set up if we had created the
         * connection when the SYN arrived.  If we can't create
         * the connection, abort it.
         */
        so = sonewconn(lso, SS_ISCONNECTED);
        if (so == NULL) {
                /*
                 * Drop the connection; we will send a RST if the peer
                 * retransmits the ACK.
                 */
                tcpstat.tcps_listendrop++;
                goto abort;
        }
#ifdef MAC
        mac_set_socket_peer_from_mbuf(m, so);
#endif

        inp = sotoinpcb(so);

        /*
         * Insert new socket into hash list.
         */
        inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                inp->in6p_laddr = sc->sc_inc.inc6_laddr;
        } else {
                inp->inp_vflag &= ~INP_IPV6;
                inp->inp_vflag |= INP_IPV4;
#endif
                inp->inp_laddr = sc->sc_inc.inc_laddr;
#ifdef INET6
        }
#endif
        inp->inp_lport = sc->sc_inc.inc_lport;
        if (in_pcbinshash(inp) != 0) {
                /*
                 * Undo the assignments above if we failed to
                 * put the PCB on the hash lists.
                 */
#ifdef INET6
                if (sc->sc_inc.inc_isipv6)
                        inp->in6p_laddr = in6addr_any;
                else
#endif
                        inp->inp_laddr.s_addr = INADDR_ANY;
                inp->inp_lport = 0;
                goto abort;
        }
#ifdef IPSEC
        /* copy old policy into new socket's */
        if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
                printf("syncache_expand: could not copy policy\n");
#endif
#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                struct inpcb *oinp = sotoinpcb(lso);
                struct in6_addr laddr6;
                struct sockaddr_in6 *sin6;
                /*
                 * Inherit socket options from the listening socket.
                 * Note that in6p_inputopts are not (and should not be)
                 * copied, since it stores previously received options and is
                 * used to detect if each new option is different from the
                 * previous one and hence should be passed to a user.
                 * If we copied in6p_inputopts, a user would not be able to
                 * receive options just after calling the accept system call.
                 */
                inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
                if (oinp->in6p_outputopts)
                        inp->in6p_outputopts =
                            ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
                inp->in6p_route = sc->sc_route6;
                sc->sc_route6.ro_rt = NULL;

                MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
                    M_SONAME, M_NOWAIT | M_ZERO);
                if (sin6 == NULL)
                        goto abort;
                sin6->sin6_family = AF_INET6;
                sin6->sin6_len = sizeof(*sin6);
                sin6->sin6_addr = sc->sc_inc.inc6_faddr;
                sin6->sin6_port = sc->sc_inc.inc_fport;
                laddr6 = inp->in6p_laddr;
                if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
                        inp->in6p_laddr = sc->sc_inc.inc6_laddr;
                if (in6_pcbconnect(inp, (struct sockaddr *)sin6, &thread0)) {
                        inp->in6p_laddr = laddr6;
                        FREE(sin6, M_SONAME);
                        goto abort;
                }
                FREE(sin6, M_SONAME);
        } else
#endif
        {
                struct in_addr laddr;
                struct sockaddr_in *sin;

                inp->inp_options = ip_srcroute();
                if (inp->inp_options == NULL) {
                        inp->inp_options = sc->sc_ipopts;
                        sc->sc_ipopts = NULL;
                }
                inp->inp_route = sc->sc_route;
                sc->sc_route.ro_rt = NULL;

                MALLOC(sin, struct sockaddr_in *, sizeof *sin,
                    M_SONAME, M_NOWAIT | M_ZERO);
                if (sin == NULL)
                        goto abort;
                sin->sin_family = AF_INET;
                sin->sin_len = sizeof(*sin);
                sin->sin_addr = sc->sc_inc.inc_faddr;
                sin->sin_port = sc->sc_inc.inc_fport;
                bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
                laddr = inp->inp_laddr;
                if (inp->inp_laddr.s_addr == INADDR_ANY)
                        inp->inp_laddr = sc->sc_inc.inc_laddr;
                if (in_pcbconnect(inp, (struct sockaddr *)sin, &thread0)) {
                        inp->inp_laddr = laddr;
                        FREE(sin, M_SONAME);
                        goto abort;
                }
                FREE(sin, M_SONAME);
        }

        tp = intotcpcb(inp);
        tp->t_state = TCPS_SYN_RECEIVED;
        tp->iss = sc->sc_iss;
        tp->irs = sc->sc_irs;
        tcp_rcvseqinit(tp);
        tcp_sendseqinit(tp);
        tp->snd_wl1 = sc->sc_irs;
        tp->rcv_up = sc->sc_irs + 1;
        tp->rcv_wnd = sc->sc_wnd;
        tp->rcv_adv += tp->rcv_wnd;

        tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
        if (sc->sc_flags & SCF_NOOPT)
                tp->t_flags |= TF_NOOPT;
        if (sc->sc_flags & SCF_WINSCALE) {
                tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
                tp->requested_s_scale = sc->sc_requested_s_scale;
                tp->request_r_scale = sc->sc_request_r_scale;
        }
        if (sc->sc_flags & SCF_TIMESTAMP) {
                tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
                tp->ts_recent = sc->sc_tsrecent;
                tp->ts_recent_age = ticks;
        }
        if (sc->sc_flags & SCF_CC) {
                /*
                 * Initialization of the tcpcb for transaction;
                 *   set SND.WND = SEG.WND,
                 *   initialize CCsend and CCrecv.
                 */
                tp->t_flags |= TF_REQ_CC|TF_RCVD_CC;
                tp->cc_send = sc->sc_cc_send;
                tp->cc_recv = sc->sc_cc_recv;
        }

        tcp_mss(tp, sc->sc_peer_mss);

        /*
         * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
         */
        if (sc->sc_rxtslot != 0)
                tp->snd_cwnd = tp->t_maxseg;
        callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);

        tcpstat.tcps_accepts++;
        return (so);

abort:
        if (so != NULL)
                (void) soabort(so);
        return (NULL);
}

/*
 * This function gets called when we receive an ACK for a
 * socket in the LISTEN state.  We look up the connection
 * in the syncache, and if it's there, we pull it out of
 * the cache and turn it into a full-blown connection in
 * the SYN-RECEIVED state.
 */
int
syncache_expand(inc, th, sop, m)
        struct in_conninfo *inc;
        struct tcphdr *th;
        struct socket **sop;
        struct mbuf *m;
{
        struct syncache *sc;
        struct syncache_head *sch;
        struct socket *so;

        sc = syncache_lookup(inc, &sch);
        if (sc == NULL) {
                /*
                 * There is no syncache entry, so see if this ACK is
                 * a returning syncookie.  To do this, first:
                 *  A. See if this socket has had a syncache entry dropped
                 *     in the past.  We don't want to accept a bogus
                 *     syncookie if we've never received a SYN.
                 *  B. Check that the syncookie is valid.  If it is, then
                 *     cobble up a fake syncache entry, and return.
                 */
                if (!tcp_syncookies)
                        return (0);
                sc = syncookie_lookup(inc, th, *sop);
                if (sc == NULL)
                        return (0);
                sch = NULL;
                tcpstat.tcps_sc_recvcookie++;
        }

        /*
         * If seg contains an ACK, but not for our SYN/ACK, send a RST.
         */
        if (th->th_ack != sc->sc_iss + 1)
                return (0);

        so = syncache_socket(sc, *sop, m);
        if (so == NULL) {
#if 0
resetandabort:
                /* XXXjlemon check this - is this correct? */
                (void) tcp_respond(NULL, m, m, th,
                    th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
#endif
                m_freem(m);                     /* XXX only needed for above */
                tcpstat.tcps_sc_aborted++;
        } else {
                sc->sc_flags |= SCF_KEEPROUTE;
                tcpstat.tcps_sc_completed++;
        }
        if (sch == NULL)
                syncache_free(sc);
        else
                syncache_drop(sc, sch);
        *sop = so;
        return (1);
}

/*
 * Given a LISTEN socket and an inbound SYN request, add
 * this to the syn cache, and send back a segment:
 *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 * to the source.
 *
 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 * Doing so would require that we hold onto the data and deliver it
 * to the application.  However, if we are the target of a SYN-flood
 * DoS attack, an attacker could send data which would eventually
 * consume all available buffer space if it were ACKed.  By not ACKing
 * the data, we avoid this DoS scenario.
 */
int
syncache_add(inc, to, th, sop, m)
        struct in_conninfo *inc;
        struct tcpopt *to;
        struct tcphdr *th;
        struct socket **sop;
        struct mbuf *m;
{
        struct tcpcb *tp;
        struct socket *so;
        struct syncache *sc = NULL;
        struct syncache_head *sch;
        struct mbuf *ipopts = NULL;
        struct rmxp_tao *taop;
        int i, s, win;

        so = *sop;
        tp = sototcpcb(so);

        /*
         * Remember the IP options, if any.
         */
#ifdef INET6
        if (!inc->inc_isipv6)
#endif
                ipopts = ip_srcroute();

        /*
         * See if we already have an entry for this connection.
         * If we do, resend the SYN,ACK, and reset the retransmit timer.
         *
         * XXX
         * Should the syncache be re-initialized with the contents
         * of the new SYN here (which may have different options)?
         */
        sc = syncache_lookup(inc, &sch);
        if (sc != NULL) {
                tcpstat.tcps_sc_dupsyn++;
                if (ipopts) {
                        /*
                         * If we were remembering a previous source route,
                         * forget it and use the new one we've been given.
                         */
                        if (sc->sc_ipopts)
                                (void) m_free(sc->sc_ipopts);
                        sc->sc_ipopts = ipopts;
                }
                /*
                 * Update timestamp if present.
                 */
                if (sc->sc_flags & SCF_TIMESTAMP)
                        sc->sc_tsrecent = to->to_tsval;
                /*
                 * PCB may have changed, pick up new values.
                 */
                sc->sc_tp = tp;
                sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
#ifdef TCPDEBUG
                if (syncache_respond(sc, m, so) == 0) {
#else
                if (syncache_respond(sc, m) == 0) {
#endif
                        s = splnet();
                        TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
                            sc, sc_timerq);
                        SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
                        splx(s);
                        tcpstat.tcps_sndacks++;
                        tcpstat.tcps_sndtotal++;
                }
                *sop = NULL;
                return (1);
        }

        sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
        if (sc == NULL) {
                /*
                 * The zone allocator couldn't provide more entries.
                 * Treat this as if the cache was full; drop the oldest
                 * entry and insert the new one.
                 */
                s = splnet();
                for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
                        sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
                        if (sc != NULL)
                                break;
                }
                sc->sc_tp->ts_recent = ticks;
                syncache_drop(sc, NULL);
                splx(s);
                tcpstat.tcps_sc_zonefail++;
                sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
                if (sc == NULL) {
                        if (ipopts)
                                (void) m_free(ipopts);
                        return (0);
                }
        }

        /*
         * Fill in the syncache values.
         */
        bzero(sc, sizeof(*sc));
        sc->sc_tp = tp;
        sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
        sc->sc_ipopts = ipopts;
        sc->sc_inc.inc_fport = inc->inc_fport;
        sc->sc_inc.inc_lport = inc->inc_lport;
#ifdef INET6
        sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
        if (inc->inc_isipv6) {
                sc->sc_inc.inc6_faddr = inc->inc6_faddr;
                sc->sc_inc.inc6_laddr = inc->inc6_laddr;
                sc->sc_route6.ro_rt = NULL;
        } else
#endif
        {
                sc->sc_inc.inc_faddr = inc->inc_faddr;
                sc->sc_inc.inc_laddr = inc->inc_laddr;
                sc->sc_route.ro_rt = NULL;
        }
        sc->sc_irs = th->th_seq;
        sc->sc_flags = 0;
        sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
        if (tcp_syncookies)
                sc->sc_iss = syncookie_generate(sc);
        else
                sc->sc_iss = arc4random();

        /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
        win = sbspace(&so->so_rcv);
        win = imax(win, 0);
        win = imin(win, TCP_MAXWIN);
        sc->sc_wnd = win;

        if (tcp_do_rfc1323) {
                /*
                 * A timestamp received in a SYN makes
                 * it ok to send timestamp requests and replies.
                 */
                if (to->to_flags & TOF_TS) {
                        sc->sc_tsrecent = to->to_tsval;
                        sc->sc_flags |= SCF_TIMESTAMP;
                }
                if (to->to_flags & TOF_SCALE) {
                        int wscale = 0;

                        /* Compute proper scaling value from buffer space */
                        while (wscale < TCP_MAX_WINSHIFT &&
                            (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
                                wscale++;
                        sc->sc_request_r_scale = wscale;
                        sc->sc_requested_s_scale = to->to_requested_s_scale;
                        sc->sc_flags |= SCF_WINSCALE;
                }
        }
        if (tcp_do_rfc1644) {
                /*
                 * A CC or CC.new option received in a SYN makes
                 * it ok to send CC in subsequent segments.
                 */
                if (to->to_flags & (TOF_CC|TOF_CCNEW)) {
                        sc->sc_cc_recv = to->to_cc;
                        sc->sc_cc_send = CC_INC(tcp_ccgen);
                        sc->sc_flags |= SCF_CC;
                }
        }
        if (tp->t_flags & TF_NOOPT)
                sc->sc_flags = SCF_NOOPT;

        /*
         * XXX
         * We have the option here of not doing TAO (even if the segment
         * qualifies) and instead fall back to a normal 3WHS via the syncache.
         * This allows us to apply synflood protection to TAO-qualifying SYNs
         * also.  However, there should be a heuristic to determine when to
         * do this, and none is present at the moment.
         */

        /*
         * Perform TAO test on incoming CC (SEG.CC) option, if any.
         * - compare SEG.CC against cached CC from the same host, if any.
         * - if SEG.CC > cached value, SYN must be new and is accepted
         *      immediately: save new CC in the cache, mark the socket
         *      connected, enter ESTABLISHED state, turn on flag to
         *      send a SYN in the next segment.
         *      A virtual advertised window is set in rcv_adv to
         *      initialize SWS prevention.  Then enter normal segment
         *      processing: drop SYN, process data and FIN.
         * - otherwise do a normal 3-way handshake.
         */
        taop = tcp_gettaocache(&sc->sc_inc);
        if ((to->to_flags & TOF_CC) != 0) {
                if (((tp->t_flags & TF_NOPUSH) != 0) &&
                    sc->sc_flags & SCF_CC &&
                    taop != NULL && taop->tao_cc != 0 &&
                    CC_GT(to->to_cc, taop->tao_cc)) {
                        sc->sc_rxtslot = 0;
                        so = syncache_socket(sc, *sop, m);
                        if (so != NULL) {
                                sc->sc_flags |= SCF_KEEPROUTE;
                                taop->tao_cc = to->to_cc;
                                *sop = so;
                        }
                        syncache_free(sc);
                        return (so != NULL);
                }
        } else {
                /*
                 * No CC option, but maybe CC.NEW: invalidate cached value.
                 */
                if (taop != NULL)
                        taop->tao_cc = 0;
        }
        /*
         * TAO test failed or there was no CC option,
         * do a standard 3-way handshake.
         */
#ifdef TCPDEBUG
        if (syncache_respond(sc, m, so) == 0) {
#else
        if (syncache_respond(sc, m) == 0) {
#endif
                syncache_insert(sc, sch);
                tcpstat.tcps_sndacks++;
                tcpstat.tcps_sndtotal++;
        } else {
                syncache_free(sc);
                tcpstat.tcps_sc_dropped++;
        }
        *sop = NULL;
        return (1);
}

#ifdef TCPDEBUG
static int
syncache_respond(sc, m, so)
        struct syncache *sc;
        struct mbuf *m;
        struct socket *so;
#else
static int
syncache_respond(sc, m)
        struct syncache *sc;
        struct mbuf *m;
#endif
{
        u_int8_t *optp;
        int optlen, error;
        u_int16_t tlen, hlen, mssopt;
        struct ip *ip = NULL;
        struct rtentry *rt;
        struct tcphdr *th;
#ifdef INET6
        struct ip6_hdr *ip6 = NULL;
#endif

#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                rt = tcp_rtlookup6(&sc->sc_inc);
                if (rt != NULL)
                        mssopt = rt->rt_ifp->if_mtu -
                            (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
                else
                        mssopt = tcp_v6mssdflt;
                hlen = sizeof(struct ip6_hdr);
        } else
#endif
        {
                rt = tcp_rtlookup(&sc->sc_inc);
                if (rt != NULL)
                        mssopt = rt->rt_ifp->if_mtu -
                            (sizeof(struct ip) + sizeof(struct tcphdr));
                else
                        mssopt = tcp_mssdflt;
                hlen = sizeof(struct ip);
        }

        /* Compute the size of the TCP options. */
        if (sc->sc_flags & SCF_NOOPT) {
                optlen = 0;
        } else {
                optlen = TCPOLEN_MAXSEG +
                    ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
                    ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) +
                    ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0);
        }
        tlen = hlen + sizeof(struct tcphdr) + optlen;

        /*
         * XXX
         * Assume that the entire packet will fit in a header mbuf.
         */
        KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));

        /*
         * XXX shouldn't this reuse the mbuf if possible?
         * Create the IP+TCP header from scratch.
         */
        if (m)
                m_freem(m);

        m = m_gethdr(M_DONTWAIT, MT_HEADER);
        if (m == NULL)
                return (ENOBUFS);
        m->m_data += max_linkhdr;
        m->m_len = tlen;
        m->m_pkthdr.len = tlen;
        m->m_pkthdr.rcvif = NULL;
#ifdef MAC
        mac_create_mbuf_from_socket(sc->sc_tp->t_inpcb->inp_socket, m);
#endif

#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                ip6 = mtod(m, struct ip6_hdr *);
                ip6->ip6_vfc = IPV6_VERSION;
                ip6->ip6_nxt = IPPROTO_TCP;
                ip6->ip6_src = sc->sc_inc.inc6_laddr;
                ip6->ip6_dst = sc->sc_inc.inc6_faddr;
                ip6->ip6_plen = htons(tlen - hlen);
                /* ip6_hlim is set after checksum */
                /* ip6_flow = ??? */

                th = (struct tcphdr *)(ip6 + 1);
        } else
#endif
        {
                ip = mtod(m, struct ip *);
                ip->ip_v = IPVERSION;
                ip->ip_hl = sizeof(struct ip) >> 2;
                ip->ip_len = tlen;
                ip->ip_id = 0;
                ip->ip_off = 0;
                ip->ip_sum = 0;
                ip->ip_p = IPPROTO_TCP;
                ip->ip_src = sc->sc_inc.inc_laddr;
                ip->ip_dst = sc->sc_inc.inc_faddr;
                ip->ip_ttl = sc->sc_tp->t_inpcb->inp_ip_ttl;    /* XXX */
                ip->ip_tos = sc->sc_tp->t_inpcb->inp_ip_tos;    /* XXX */

                /*
                 * See if we should do MTU discovery.  Route lookups are
                 * expensive, so we will only unset the DF bit if:
                 *
                 *      1) path_mtu_discovery is disabled
                 *      2) the SCF_UNREACH flag has been set
                 */
                if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
                        ip->ip_off |= IP_DF;

                th = (struct tcphdr *)(ip + 1);
        }
        th->th_sport = sc->sc_inc.inc_lport;
        th->th_dport = sc->sc_inc.inc_fport;

        th->th_seq = htonl(sc->sc_iss);
        th->th_ack = htonl(sc->sc_irs + 1);
        th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
        th->th_x2 = 0;
        th->th_flags = TH_SYN|TH_ACK;
        th->th_win = htons(sc->sc_wnd);
        th->th_urp = 0;

        /* Tack on the TCP options. */
        if (optlen != 0) {
                optp = (u_int8_t *)(th + 1);
                *optp++ = TCPOPT_MAXSEG;
                *optp++ = TCPOLEN_MAXSEG;
                *optp++ = (mssopt >> 8) & 0xff;
                *optp++ = mssopt & 0xff;

                if (sc->sc_flags & SCF_WINSCALE) {
                        *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
                            TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
                            sc->sc_request_r_scale);
                        optp += 4;
                }

                if (sc->sc_flags & SCF_TIMESTAMP) {
                        u_int32_t *lp = (u_int32_t *)(optp);

                        /* Form timestamp option per appendix A of RFC 1323. */
                        *lp++ = htonl(TCPOPT_TSTAMP_HDR);
                        *lp++ = htonl(ticks);
                        *lp = htonl(sc->sc_tsrecent);
                        optp += TCPOLEN_TSTAMP_APPA;
                }

                /*
                 * Send CC and CC.echo if we received CC from our peer.
                 */
                if (sc->sc_flags & SCF_CC) {
                        u_int32_t *lp = (u_int32_t *)(optp);

                        *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
                        *lp++ = htonl(sc->sc_cc_send);
                        *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CCECHO));
                        *lp = htonl(sc->sc_cc_recv);
                        optp += TCPOLEN_CC_APPA * 2;
                }
        }

#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                struct route_in6 *ro6 = &sc->sc_route6;

                th->th_sum = 0;
                th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
                ip6->ip6_hlim = in6_selecthlim(NULL,
                    ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
                error = ip6_output(m, NULL, ro6, 0, NULL, NULL,
                    sc->sc_tp->t_inpcb);
        } else
#endif
        {
                th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                    htons(tlen - hlen + IPPROTO_TCP));
                m->m_pkthdr.csum_flags = CSUM_TCP;
                m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#ifdef TCPDEBUG
                /*
                 * Trace.
                 */
                if (so != NULL && so->so_options & SO_DEBUG) {
                        struct tcpcb *tp = sototcpcb(so);
                        tcp_trace(TA_OUTPUT, tp->t_state, tp,
                            mtod(m, void *), th, 0);
                }
#endif
                error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL,
                    sc->sc_tp->t_inpcb);
        }
        return (error);
}

/*
 * cookie layers:
 *
 *      |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
 *      | peer iss                                                      |
 *      | MD5(laddr,faddr,secret,lport,fport)             |. . . . . . .|
 *      |                     0                       |(A)|             |
 *      (A): peer mss index
 */

/*
 * The values below are chosen to minimize the size of the tcp_secret
 * table, as well as providing roughly a 16 second lifetime for the cookie.
 */

#define SYNCOOKIE_WNDBITS       5       /* exposed bits for window indexing */
#define SYNCOOKIE_TIMESHIFT     1       /* scale ticks to window time units */

#define SYNCOOKIE_WNDMASK       ((1 << SYNCOOKIE_WNDBITS) - 1)
#define SYNCOOKIE_NSECRETS      (1 << SYNCOOKIE_WNDBITS)
#define SYNCOOKIE_TIMEOUT \
        (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
#define SYNCOOKIE_DATAMASK      ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
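
/*
 * Worked numbers for the defines above: SYNCOOKIE_WNDBITS == 5 gives
 * 2^5 == 32 secrets and a 5-bit window index; SYNCOOKIE_TIMESHIFT == 1
 * makes each window unit half a second (hz/2 ticks), so 32 windows span
 *
 *      SYNCOOKIE_TIMEOUT == hz * 32 / 2 == 16 * hz ticks,
 *
 * i.e. the roughly 16 second cookie lifetime mentioned above.
 * SYNCOOKIE_DATAMASK == 0x7f: bits 0-4 carry the window index and
 * bits 5-6 carry the peer MSS index into tcp_msstab[] below.
 */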

static struct {
        u_int32_t ts_secbits[4];
        u_int ts_expire;
} tcp_secret[SYNCOOKIE_NSECRETS];

static int tcp_msstab[] = { 0, 536, 1460, 8960 };
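
/*
 * Example of the MSS encoding (explanatory only): syncookie_generate()
 * picks the largest tcp_msstab[] entry <= the peer's advertised MSS, so
 * an MSS of 1460 encodes as index 2 and decodes as exactly 1460, while
 * an MSS of 1400 rounds down to index 1 and decodes as a conservative 536.
 */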

static MD5_CTX syn_ctx;

#define MD5Add(v)       MD5Update(&syn_ctx, (u_char *)&v, sizeof(v))

struct md5_add {
        u_int32_t laddr, faddr;
        u_int32_t secbits[4];
        u_int16_t lport, fport;
};

#ifdef CTASSERT
CTASSERT(sizeof(struct md5_add) == 28);
#endif

/*
 * Consider the problem of a recreated (and retransmitted) cookie.  If the
 * original SYN was accepted, the connection is established.  The second
 * SYN is in flight, and if it arrives with an ISN that falls within the
 * receive window, the connection is killed.
 *
 * However, since cookies have other problems, this may not be worth
 * worrying about.
 */
static u_int32_t
syncookie_generate(struct syncache *sc)
{
        u_int32_t md5_buffer[4];
        u_int32_t data;
        int idx, i;
        struct md5_add add;

        idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
        if (tcp_secret[idx].ts_expire < ticks) {
                for (i = 0; i < 4; i++)
                        tcp_secret[idx].ts_secbits[i] = arc4random();
                tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
        }
        for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
                if (tcp_msstab[data] <= sc->sc_peer_mss)
                        break;
        data = (data << SYNCOOKIE_WNDBITS) | idx;
        data ^= sc->sc_irs;                             /* peer's iss */
        MD5Init(&syn_ctx);
#ifdef INET6
        if (sc->sc_inc.inc_isipv6) {
                MD5Add(sc->sc_inc.inc6_laddr);
                MD5Add(sc->sc_inc.inc6_faddr);
                add.laddr = 0;
                add.faddr = 0;
        } else
#endif
        {
                add.laddr = sc->sc_inc.inc_laddr.s_addr;
                add.faddr = sc->sc_inc.inc_faddr.s_addr;
        }
        add.lport = sc->sc_inc.inc_lport;
        add.fport = sc->sc_inc.inc_fport;
        add.secbits[0] = tcp_secret[idx].ts_secbits[0];
        add.secbits[1] = tcp_secret[idx].ts_secbits[1];
        add.secbits[2] = tcp_secret[idx].ts_secbits[2];
        add.secbits[3] = tcp_secret[idx].ts_secbits[3];
        MD5Add(add);
        MD5Final((u_char *)&md5_buffer, &syn_ctx);
        data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
        return (data);
}
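
/*
 * Round-trip sketch (an explanatory note, not normative): the cookie
 * above is used as our ISS, so a returning ACK carries cookie + 1 in
 * th_ack and irs + 1 in th_seq.  syncookie_lookup() therefore computes
 *
 *      (th_ack - 1) ^ (th_seq - 1) == cookie ^ irs
 *
 * which cancels the peer ISS mixed in above; XORing the recomputed MD5
 * hash back out must then leave only the 7 bits covered by
 * SYNCOOKIE_DATAMASK, or the cookie is rejected as forged or stale.
 */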

static struct syncache *
syncookie_lookup(inc, th, so)
        struct in_conninfo *inc;
        struct tcphdr *th;
        struct socket *so;
{
        u_int32_t md5_buffer[4];
        struct syncache *sc;
        u_int32_t data;
        int wnd, idx;
        struct md5_add add;

        data = (th->th_ack - 1) ^ (th->th_seq - 1);     /* remove ISS */
        idx = data & SYNCOOKIE_WNDMASK;
        if (tcp_secret[idx].ts_expire < ticks ||
            sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
                return (NULL);
        MD5Init(&syn_ctx);
#ifdef INET6
        if (inc->inc_isipv6) {
                MD5Add(inc->inc6_laddr);
                MD5Add(inc->inc6_faddr);
                add.laddr = 0;
                add.faddr = 0;
        } else
#endif
        {
                add.laddr = inc->inc_laddr.s_addr;
                add.faddr = inc->inc_faddr.s_addr;
        }
        add.lport = inc->inc_lport;
        add.fport = inc->inc_fport;
        add.secbits[0] = tcp_secret[idx].ts_secbits[0];
        add.secbits[1] = tcp_secret[idx].ts_secbits[1];
        add.secbits[2] = tcp_secret[idx].ts_secbits[2];
        add.secbits[3] = tcp_secret[idx].ts_secbits[3];
        MD5Add(add);
        MD5Final((u_char *)&md5_buffer, &syn_ctx);
        data ^= md5_buffer[0];
        if ((data & ~SYNCOOKIE_DATAMASK) != 0)
                return (NULL);
        data = data >> SYNCOOKIE_WNDBITS;

        sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT);
        if (sc == NULL)
                return (NULL);
        /*
         * Fill in the syncache values.
         * XXX duplicate code from syncache_add
         */
        sc->sc_ipopts = NULL;
        sc->sc_inc.inc_fport = inc->inc_fport;
        sc->sc_inc.inc_lport = inc->inc_lport;
#ifdef INET6
        sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
        if (inc->inc_isipv6) {
                sc->sc_inc.inc6_faddr = inc->inc6_faddr;
                sc->sc_inc.inc6_laddr = inc->inc6_laddr;
                sc->sc_route6.ro_rt = NULL;
        } else
#endif
        {
                sc->sc_inc.inc_faddr = inc->inc_faddr;
                sc->sc_inc.inc_laddr = inc->inc_laddr;
                sc->sc_route.ro_rt = NULL;
        }
        sc->sc_irs = th->th_seq - 1;
        sc->sc_iss = th->th_ack - 1;
        wnd = sbspace(&so->so_rcv);
        wnd = imax(wnd, 0);
        wnd = imin(wnd, TCP_MAXWIN);
        sc->sc_wnd = wnd;
        sc->sc_flags = 0;
        sc->sc_rxtslot = 0;
        sc->sc_peer_mss = tcp_msstab[data];
        return (sc);
}