1/*
2 * Copyright (c) 2003-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*	$FreeBSD: src/sys/netinet6/in6_rmx.c,v 1.1.2.2 2001/07/03 11:01:52 ume Exp $	*/
30/*	$KAME: in6_rmx.c,v 1.10 2001/05/24 05:44:58 itojun Exp $	*/
31
32/*
33 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
34 * All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 * 3. Neither the name of the project nor the names of its contributors
45 *    may be used to endorse or promote products derived from this software
46 *    without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61/*
62 * Copyright 1994, 1995 Massachusetts Institute of Technology
63 *
64 * Permission to use, copy, modify, and distribute this software and
65 * its documentation for any purpose and without fee is hereby
66 * granted, provided that both the above copyright notice and this
67 * permission notice appear in all copies, that both the above
68 * copyright notice and this permission notice appear in all
69 * supporting documentation, and that the name of M.I.T. not be used
70 * in advertising or publicity pertaining to distribution of the
71 * software without specific, written prior permission.  M.I.T. makes
72 * no representations about the suitability of this software for any
73 * purpose.  It is provided "as is" without express or implied
74 * warranty.
75 *
76 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
77 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
78 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
79 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
80 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
81 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
82 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
83 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
84 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
85 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
86 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
87 * SUCH DAMAGE.
88 *
89 */
90
91/*
92 * This code does two things necessary for the enhanced TCP metrics to
93 * function in a useful manner:
94 *  1) It marks all non-host routes as `cloning', thus ensuring that
95 *     every actual reference to such a route actually gets turned
96 *     into a reference to a host route to the specific destination
97 *     requested.
98 *  2) When such routes lose all their references, it arranges for them
99 *     to be deleted in some random collection of circumstances, so that
100 *     a large quantity of stale routing data is not kept in kernel memory
101 *     indefinitely.  See in6_rtqtimo() below for the exact mechanism.
102 */
103
104#include <sys/param.h>
105#include <sys/systm.h>
106#include <sys/kernel.h>
107#include <sys/sysctl.h>
108#include <kern/queue.h>
109#include <sys/socket.h>
110#include <sys/socketvar.h>
111#include <sys/mbuf.h>
112#include <sys/syslog.h>
113#include <kern/lock.h>
114
115#include <net/if.h>
116#include <net/route.h>
117#include <netinet/in.h>
118#include <netinet/ip_var.h>
119#include <netinet/in_var.h>
120
121#include <netinet/ip6.h>
122#include <netinet6/ip6_var.h>
123
124#include <netinet/icmp6.h>
125
126#include <netinet/tcp.h>
127#include <netinet/tcp_seq.h>
128#include <netinet/tcp_timer.h>
129#include <netinet/tcp_var.h>
130
/* Routing-table initializer for the IPv6 tree; defined at the end of this file. */
extern int	in6_inithead(void **head, int off);
/* Self-rearming timer callbacks: cached-route expiry and path-MTU aging. */
static void	in6_rtqtimo(void *rock);
static void in6_mtutimo(void *rock);
/* Used below to turn a timeval interval into the delay argument of timeout(). */
extern int tvtohz(struct timeval *);

/* Forward declaration: in6_matroute() calls this before its definition. */
static struct radix_node *in6_matroute_args(void *, struct radix_node_head *,
    rn_matchf_t *, void *);

/*
 * Flag placed on a route by in6_clsroute() when it starts timing the route
 * out; cleared again by in6_matroute_args() on the next first reference.
 */
#define RTPRF_OURS		RTF_PROTO3	/* set on routes we manage */
140
141/*
142 * Do what we need to do when inserting a route.
143 */
144static struct radix_node *
145in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
146	    struct radix_node *treenodes)
147{
148	struct rtentry *rt = (struct rtentry *)treenodes;
149	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt);
150	struct radix_node *ret;
151
152	/*
153	 * For IPv6, all unicast non-host routes are automatically cloning.
154	 */
155	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
156		rt->rt_flags |= RTF_MULTICAST;
157
158	if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
159		rt->rt_flags |= RTF_PRCLONING;
160	}
161
162	/*
163	 * A little bit of help for both IPv6 output and input:
164	 *   For local addresses, we make sure that RTF_LOCAL is set,
165	 *   with the thought that this might one day be used to speed up
166	 *   ip_input().
167	 *
168	 * We also mark routes to multicast addresses as such, because
169	 * it's easy to do and might be useful (but this is much more
170	 * dubious since it's so easy to inspect the address).  (This
171	 * is done above.)
172	 *
173	 * XXX
174	 * should elaborate the code.
175	 */
176	if (rt->rt_flags & RTF_HOST) {
177		if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr)
178					->sin6_addr,
179				       &sin6->sin6_addr)) {
180			rt->rt_flags |= RTF_LOCAL;
181		}
182	}
183
184	if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
185	    && rt->rt_ifp)
186		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
187
188	ret = rn_addroute(v_arg, n_arg, head, treenodes);
189	if (ret == NULL && rt->rt_flags & RTF_HOST) {
190		struct rtentry *rt2;
191		/*
192		 * We are trying to add a host route, but can't.
193		 * Find out if it is because of an
194		 * ARP entry and delete it if so.
195		 */
196		rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0,
197				RTF_CLONING | RTF_PRCLONING);
198		if (rt2) {
199			if (rt2->rt_flags & RTF_LLINFO &&
200				rt2->rt_flags & RTF_HOST &&
201				rt2->rt_gateway &&
202				rt2->rt_gateway->sa_family == AF_LINK) {
203				rtrequest_locked(RTM_DELETE,
204					  (struct sockaddr *)rt_key(rt2),
205					  rt2->rt_gateway,
206					  rt_mask(rt2), rt2->rt_flags, 0);
207				ret = rn_addroute(v_arg, n_arg, head,
208					treenodes);
209			}
210			rtfree_locked(rt2);
211		}
212	} else if (ret == NULL && rt->rt_flags & RTF_CLONING) {
213		struct rtentry *rt2;
214		/*
215		 * We are trying to add a net route, but can't.
216		 * The following case should be allowed, so we'll make a
217		 * special check for this:
218		 *	Two IPv6 addresses with the same prefix is assigned
219		 *	to a single interrface.
220		 *	# ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1)
221		 *	# ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2)
222		 *	In this case, (*1) and (*2) want to add the same
223		 *	net route entry, 3ffe:0501:: -> if0.
224		 *	This case should not raise an error.
225		 */
226		rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0,
227				RTF_CLONING | RTF_PRCLONING);
228		if (rt2) {
229			if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY))
230					== RTF_CLONING
231			 && rt2->rt_gateway
232			 && rt2->rt_gateway->sa_family == AF_LINK
233			 && rt2->rt_ifp == rt->rt_ifp) {
234				ret = rt2->rt_nodes;
235			}
236			rtfree_locked(rt2);
237		}
238	}
239	return ret;
240}
241
242/*
243 * Similar to in6_matroute_args except without the leaf-matching parameters.
244 */
245static struct radix_node *
246in6_matroute(void *v_arg, struct radix_node_head *head)
247{
248	return (in6_matroute_args(v_arg, head, NULL, NULL));
249}
250
251/*
252 * This code is the inverse of in6_clsroute: on first reference, if we
253 * were managing the route, stop doing so and set the expiration timer
254 * back off again.
255 */
256static struct radix_node *
257in6_matroute_args(void *v_arg, struct radix_node_head *head,
258    rn_matchf_t *f, void *w)
259{
260	struct radix_node *rn = rn_match_args(v_arg, head, f, w);
261	struct rtentry *rt = (struct rtentry *)rn;
262
263	if (rt && rt->rt_refcnt == 0) { /* this is first reference */
264		if (rt->rt_flags & RTPRF_OURS) {
265			rt->rt_flags &= ~RTPRF_OURS;
266			rt->rt_rmx.rmx_expire = 0;
267		}
268	}
269	return (rn);
270}
271
SYSCTL_DECL(_net_inet6_ip6);

/*
 * Lifetime, in seconds, that in6_clsroute() gives an unreferenced cloned
 * host route (one hour is ``really old'').  A value of 0 makes
 * in6_clsroute() delete such routes immediately.  in6_rtqtimo() may lower
 * this value at run time when too many cached routes pile up.
 */
static int rtq_reallyold = 60*60;
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire,
	CTLFLAG_RW, &rtq_reallyold , 0, "");

/*
 * Floor for the automatic adjustment above: in6_rtqtimo() never cranks
 * rtq_reallyold down to less than this many seconds.
 */
static int rtq_minreallyold = 10;
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire,
	CTLFLAG_RW, &rtq_minreallyold , 0, "");

/*
 * More than this many managed routes surviving a sweep is ``too many''
 * and triggers the adjustment in in6_rtqtimo().
 */
static int rtq_toomany = 128;
SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache,
	CTLFLAG_RW, &rtq_toomany , 0, "");
288
289
290/*
291 * On last reference drop, mark the route as belong to us so that it can be
292 * timed out.
293 */
294static void
295in6_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
296{
297	struct rtentry *rt = (struct rtentry *)rn;
298
299	if (!(rt->rt_flags & RTF_UP))
300		return;		/* prophylactic measures */
301
302	if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
303		return;
304
305	if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
306		return;
307
308	/*
309	 * Delete the route immediately if RTF_DELCLONE is set or
310	 * if route caching is disabled (rtq_reallyold set to 0).
311	 * Otherwise, let it expire and be deleted by in6_rtqkill().
312	 */
313	if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) {
314		/*
315		 * Delete the route from the radix tree but since we are
316		 * called when the route's reference count is 0, don't
317		 * deallocate it until we return from this routine by
318		 * telling rtrequest that we're interested in it.
319		 */
320		if (rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt),
321		    rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt) == 0) {
322			/* Now let the caller free it */
323			rtunref(rt);
324		}
325	} else {
326		struct timeval timenow;
327
328		getmicrotime(&timenow);
329		rt->rt_flags |= RTPRF_OURS;
330		rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold;
331	}
332}
333
/*
 * Argument block passed through rnh_walktree() to in6_rtqkill().
 */
struct rtqk_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	int mode;	/* not referenced by the routines visible here */
	int updating;	/* nonzero: clamp remaining lifetimes to rtq_reallyold */
	int draining;	/* nonzero: delete managed routes even if not expired */
	int killed;	/* out: number of routes successfully deleted */
	int found;	/* out: number of RTPRF_OURS routes encountered */
	time_t nextstop;	/* in/out: lowered to the earliest surviving expiry */
};
343
344/*
345 * Get rid of old routes.  When draining, this deletes everything, even when
346 * the timeout is not expired yet.  When updating, this makes sure that
347 * nothing has a timeout longer than the current value of rtq_reallyold.
348 */
349static int
350in6_rtqkill(struct radix_node *rn, void *rock)
351{
352	struct rtqk_arg *ap = rock;
353	struct rtentry *rt = (struct rtentry *)rn;
354	int err;
355	struct timeval timenow;
356
357	getmicrotime(&timenow);
358	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
359
360	if (rt->rt_flags & RTPRF_OURS) {
361		ap->found++;
362
363		if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) {
364			if (rt->rt_refcnt > 0)
365				panic("rtqkill route really not free");
366
367			err = rtrequest_locked(RTM_DELETE,
368					(struct sockaddr *)rt_key(rt),
369					rt->rt_gateway, rt_mask(rt),
370					rt->rt_flags, 0);
371			if (err) {
372				log(LOG_WARNING, "in6_rtqkill: error %d", err);
373			} else {
374				ap->killed++;
375			}
376		} else {
377			if (ap->updating
378			   && (rt->rt_rmx.rmx_expire - timenow.tv_sec
379			       > rtq_reallyold)) {
380				rt->rt_rmx.rmx_expire = timenow.tv_sec
381					+ rtq_reallyold;
382			}
383			ap->nextstop = lmin(ap->nextstop,
384					    rt->rt_rmx.rmx_expire);
385		}
386	}
387
388	return 0;
389}
390
/*
 * Longest interval, in seconds, between two runs of in6_rtqtimo(); also the
 * minimum spacing between two automatic adjustments of rtq_reallyold.
 * The expansion is parenthesized so the macro acts as a single value when
 * used inside a larger expression (e.g. as a divisor).
 */
#define RTQ_TIMEOUT	(60*10)	/* run no less than once every ten minutes */
static int rtq_timeout = RTQ_TIMEOUT;
393
394static void
395in6_rtqtimo(void *rock)
396{
397	struct radix_node_head *rnh = rock;
398	struct rtqk_arg arg;
399	struct timeval atv;
400	static time_t last_adjusted_timeout = 0;
401	struct timeval timenow;
402
403	lck_mtx_lock(rt_mtx);
404	/* Get the timestamp after we acquire the lock for better accuracy */
405	getmicrotime(&timenow);
406
407	arg.found = arg.killed = 0;
408	arg.rnh = rnh;
409	arg.nextstop = timenow.tv_sec + rtq_timeout;
410	arg.draining = arg.updating = 0;
411	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
412
413	/*
414	 * Attempt to be somewhat dynamic about this:
415	 * If there are ``too many'' routes sitting around taking up space,
416	 * then crank down the timeout, and see if we can't make some more
417	 * go away.  However, we make sure that we will never adjust more
418	 * than once in rtq_timeout seconds, to keep from cranking down too
419	 * hard.
420	 */
421	if ((arg.found - arg.killed > rtq_toomany)
422	   && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout)
423	   && rtq_reallyold > rtq_minreallyold) {
424		rtq_reallyold = 2*rtq_reallyold / 3;
425		if (rtq_reallyold < rtq_minreallyold) {
426			rtq_reallyold = rtq_minreallyold;
427		}
428
429		last_adjusted_timeout = timenow.tv_sec;
430#if DIAGNOSTIC
431		log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d",
432		    rtq_reallyold);
433#endif
434		arg.found = arg.killed = 0;
435		arg.updating = 1;
436		rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
437	}
438
439	atv.tv_usec = 0;
440	atv.tv_sec = arg.nextstop - timenow.tv_sec;
441	lck_mtx_unlock(rt_mtx);
442	timeout(in6_rtqtimo, rock, tvtohz(&atv));
443}
444
/*
 * Age old PMTUs.
 *
 * Argument block passed through rnh_walktree() to in6_mtuexpire().
 */
struct mtuex_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	time_t nextstop;	/* in/out: lowered to the earliest pending rmx_expire */
};
452
453static int
454in6_mtuexpire(struct radix_node *rn, void *rock)
455{
456	struct rtentry *rt = (struct rtentry *)rn;
457	struct mtuex_arg *ap = rock;
458	struct timeval timenow;
459
460	getmicrotime(&timenow);
461
462	/* sanity */
463	if (!rt)
464		panic("rt == NULL in in6_mtuexpire");
465
466	if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) {
467		if (rt->rt_rmx.rmx_expire <= timenow.tv_sec) {
468			rt->rt_flags |= RTF_PROBEMTU;
469		} else {
470			ap->nextstop = lmin(ap->nextstop,
471					rt->rt_rmx.rmx_expire);
472		}
473	}
474
475	return 0;
476}
477
478#define	MTUTIMO_DEFAULT	(60*1)
479
480static void
481in6_mtutimo(void *rock)
482{
483	struct radix_node_head *rnh = rock;
484	struct mtuex_arg arg;
485	struct timeval atv;
486	struct timeval timenow;
487
488	getmicrotime(&timenow);
489
490	arg.rnh = rnh;
491	arg.nextstop = timenow.tv_sec + MTUTIMO_DEFAULT;
492	lck_mtx_lock(rt_mtx);
493	rnh->rnh_walktree(rnh, in6_mtuexpire, &arg);
494
495	atv.tv_usec = 0;
496	atv.tv_sec = arg.nextstop;
497	if (atv.tv_sec < timenow.tv_sec) {
498#if DIAGNOSTIC
499		log(LOG_DEBUG, "IPv6: invalid mtu expiration time on routing table\n");
500#endif
501		arg.nextstop = timenow.tv_sec + 30;	/*last resort*/
502	}
503	atv.tv_sec -= timenow.tv_sec;
504	lck_mtx_unlock(rt_mtx);
505	timeout(in6_mtutimo, rock, tvtohz(&atv));
506}
507
#if 0
/*
 * Compiled out.  Would walk the AF_INET6 routing table with in6_rtqkill()
 * in draining mode, so that every managed route is deleted regardless of
 * its expiration time.
 *
 * NOTE(review): this still brackets the walk with splnet()/splx(), while
 * in6_rtqkill() asserts that rt_mtx is held; the locking would have to be
 * updated before this could be re-enabled.
 */
void
in6_rtqdrain()
{
	struct radix_node_head *rnh = rt_tables[AF_INET6];
	struct rtqk_arg arg;
	int s;
	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = 0;
	arg.draining = 1;
	arg.updating = 0;
	s = splnet();
	rnh->rnh_walktree(rnh, in6_rtqkill, &arg);
	splx(s);
}
#endif
525
526/*
527 * Initialize our routing tree.
528 */
529int
530in6_inithead(void **head, int off)
531{
532	struct radix_node_head *rnh;
533
534	if (!rn_inithead(head, off))
535		return 0;
536
537	if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */
538		return 1;	/* only do this for the real routing table */
539
540	rnh = *head;
541	rnh->rnh_addaddr = in6_addroute;
542	rnh->rnh_matchaddr = in6_matroute;
543	rnh->rnh_matchaddr_args = in6_matroute_args;
544	rnh->rnh_close = in6_clsroute;
545	in6_rtqtimo(rnh);	/* kick off timeout first time */
546	in6_mtutimo(rnh);	/* kick off timeout first time */
547	return 1;
548}
549