1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright 1994, 1995 Massachusetts Institute of Technology
30 *
31 * Permission to use, copy, modify, and distribute this software and
32 * its documentation for any purpose and without fee is hereby
33 * granted, provided that both the above copyright notice and this
34 * permission notice appear in all copies, that both the above
35 * copyright notice and this permission notice appear in all
36 * supporting documentation, and that the name of M.I.T. not be used
37 * in advertising or publicity pertaining to distribution of the
38 * software without specific, written prior permission.  M.I.T. makes
39 * no representations about the suitability of this software for any
40 * purpose.  It is provided "as is" without express or implied
41 * warranty.
42 *
43 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''.  M.I.T. DISCLAIMS
44 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE,
45 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
46 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT
47 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
54 * SUCH DAMAGE.
55 *
56 * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.1 2001/05/14 08:23:49 ru Exp $
57 */
58
59/*
60 * This code does two things necessary for the enhanced TCP metrics to
61 * function in a useful manner:
62 *  1) It marks all non-host routes as `cloning', thus ensuring that
63 *     every actual reference to such a route actually gets turned
64 *     into a reference to a host route to the specific destination
65 *     requested.
66 *  2) When such routes lose all their references, it arranges for them
67 *     to be deleted in some random collection of circumstances, so that
68 *     a large quantity of stale routing data is not kept in kernel memory
69 *     indefinitely.  See in_rtqtimo() below for the exact mechanism.
70 */
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/kernel.h>
75#include <sys/sysctl.h>
76#include <sys/socket.h>
77#include <sys/mbuf.h>
78#include <sys/syslog.h>
79#include <kern/lock.h>
80
81#include <net/if.h>
82#include <net/route.h>
83#include <netinet/in.h>
84#include <netinet/in_var.h>
85
86extern int tvtohz(struct timeval *);
87extern int	in_inithead(void **head, int off);
88extern u_long route_generation;
89
90#ifdef __APPLE__
91static void in_rtqtimo(void *rock);
92#endif
93
94static struct radix_node *in_matroute_args(void *, struct radix_node_head *,
95    rn_matchf_t *f, void *);
96
97#define RTPRF_OURS		RTF_PROTO3	/* set on routes we manage */
98
99/*
100 * Do what we need to do when inserting a route.
101 */
102static struct radix_node *
103in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
104	    struct radix_node *treenodes)
105{
106	struct rtentry *rt = (struct rtentry *)treenodes;
107	struct sockaddr_in *sin = (struct sockaddr_in *)rt_key(rt);
108	struct radix_node *ret;
109
110	/*
111	 * For IP, all unicast non-host routes are automatically cloning.
112	 */
113	if(IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
114		rt->rt_flags |= RTF_MULTICAST;
115
116	if(!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
117		rt->rt_flags |= RTF_PRCLONING;
118	}
119
120	/*
121	 * A little bit of help for both IP output and input:
122	 *   For host routes, we make sure that RTF_BROADCAST
123	 *   is set for anything that looks like a broadcast address.
124	 *   This way, we can avoid an expensive call to in_broadcast()
125	 *   in ip_output() most of the time (because the route passed
126	 *   to ip_output() is almost always a host route).
127	 *
128	 *   We also do the same for local addresses, with the thought
129	 *   that this might one day be used to speed up ip_input().
130	 *
131	 * We also mark routes to multicast addresses as such, because
132	 * it's easy to do and might be useful (but this is much more
133	 * dubious since it's so easy to inspect the address).  (This
134	 * is done above.)
135	 */
136	if (rt->rt_flags & RTF_HOST) {
137		if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
138			rt->rt_flags |= RTF_BROADCAST;
139		} else {
140#define satosin(sa) ((struct sockaddr_in *)sa)
141			if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr
142			    == sin->sin_addr.s_addr)
143				rt->rt_flags |= RTF_LOCAL;
144#undef satosin
145		}
146	}
147
148	if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
149	    && rt->rt_ifp)
150		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
151
152	ret = rn_addroute(v_arg, n_arg, head, treenodes);
153	if (ret == NULL && rt->rt_flags & RTF_HOST) {
154		struct rtentry *rt2;
155		/*
156		 * We are trying to add a host route, but can't.
157		 * Find out if it is because of an
158		 * ARP entry and delete it if so.
159		 */
160		rt2 = rtalloc1_scoped_locked(rt_key(rt), 0,
161		    RTF_CLONING | RTF_PRCLONING, sa_get_ifscope(rt_key(rt)));
162		if (rt2) {
163			if (rt2->rt_flags & RTF_LLINFO &&
164				rt2->rt_flags & RTF_HOST &&
165				rt2->rt_gateway &&
166				rt2->rt_gateway->sa_family == AF_LINK) {
167				rtrequest_locked(RTM_DELETE,
168					  (struct sockaddr *)rt_key(rt2),
169					  rt2->rt_gateway,
170					  rt_mask(rt2), rt2->rt_flags, 0);
171				ret = rn_addroute(v_arg, n_arg, head,
172					treenodes);
173			}
174			rtfree_locked(rt2);
175		}
176	}
177	return ret;
178}
179
180/*
181 * Validate (unexpire) an expiring AF_INET route.
182 */
183struct radix_node *
184in_validate(struct radix_node *rn)
185{
186	struct rtentry *rt = (struct rtentry *)rn;
187
188	/* This is first reference? */
189	if (rt != NULL && rt->rt_refcnt == 0 && (rt->rt_flags & RTPRF_OURS)) {
190		rt->rt_flags &= ~RTPRF_OURS;
191		rt->rt_rmx.rmx_expire = 0;
192	}
193	return (rn);
194}
195
/*
 * Match hook without the leaf-matching parameters: forwards to
 * in_matroute_args() with a NULL match function and a NULL argument.
 */
static struct radix_node *
in_matroute(void *v_arg, struct radix_node_head *head)
{
	struct radix_node *match;

	match = in_matroute_args(v_arg, head, NULL, NULL);
	return (match);
}
204
205/*
206 * This code is the inverse of in_clsroute: on first reference, if we
207 * were managing the route, stop doing so and set the expiration timer
208 * back off again.
209 */
210static struct radix_node *
211in_matroute_args(void *v_arg, struct radix_node_head *head,
212    rn_matchf_t *f, void *w)
213{
214	struct radix_node *rn = rn_match_args(v_arg, head, f, w);
215
216	return (in_validate(rn));
217}
218
219static int rtq_reallyold = 60*60;
220	/* one hour is ``really old'' */
221SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW,
222    &rtq_reallyold , 0,
223    "Default expiration time on dynamically learned routes");
224
225static int rtq_minreallyold = 10;
226	/* never automatically crank down to less */
227SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW,
228    &rtq_minreallyold , 0,
229    "Minimum time to attempt to hold onto dynamically learned routes");
230
231static int rtq_toomany = 128;
232	/* 128 cached routes is ``too many'' */
233SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW,
234    &rtq_toomany , 0, "Upper limit on dynamically learned routes");
235
236#ifdef __APPLE__
237/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954)
238 * AOL is adding a circular route ("10.0.1.1/32 10.0.1.1") when establishing its ppp tunnel
239 * to the AP BaseStation by removing the default gateway and replacing it with their tunnel entry point.
240 * There is no apparent reason to add this route as there is a valid 10.0.1.1/24 route to the BS.
241 * That circular route was ignored on previous version of MacOS X because of a routing bug
242 * corrected with the merge to FreeBSD4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED
243 * flag set but did not have a reference to the parent route) and that entry was left in the RT. This workaround is
244 * made in order to provide binary compatibility with AOL.
245 * If we catch a process adding a circular route with a /32 from the routing socket, we error it out instead of
246 * confusing the routing table with a wrong route to the previous default gateway
247 * If for some reason a circular route is needed, turn this sysctl (net.inet.ip.check_route_selfref) to zero.
248 */
249int check_routeselfref = 1;
250SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW,
251    &check_routeselfref , 0, "");
252#endif
253
254__private_extern__ int use_routegenid = 1;
255SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW,
256    &use_routegenid , 0, "");
257
258/*
259 * On last reference drop, mark the route as belong to us so that it can be
260 * timed out.
261 */
262static void
263in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
264{
265	struct rtentry *rt = (struct rtentry *)rn;
266
267	if (!(rt->rt_flags & RTF_UP))
268		return;		/* prophylactic measures */
269
270	if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
271		return;
272
273	if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
274		return;
275
276	/*
277	 * Delete the route immediately if RTF_DELCLONE is set or
278	 * if route caching is disabled (rtq_reallyold set to 0).
279	 * Otherwise, let it expire and be deleted by in_rtqkill().
280	 */
281	if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) {
282		/*
283		 * Delete the route from the radix tree but since we are
284		 * called when the route's reference count is 0, don't
285		 * deallocate it until we return from this routine by
286		 * telling rtrequest that we're interested in it.
287		 */
288		if (rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt),
289		    rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt) == 0) {
290			/* Now let the caller free it */
291			rtunref(rt);
292		}
293	} else {
294		struct timeval timenow;
295
296		getmicrotime(&timenow);
297		rt->rt_flags |= RTPRF_OURS;
298		rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold;
299	}
300}
301
/*
 * State shared between a tree walk and in_rtqkill(), which is invoked
 * for every node visited.
 */
struct rtqk_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	int draining;			/* delete even unexpired routes */
	int killed;			/* routes successfully deleted */
	int found;			/* RTPRF_OURS routes seen */
	int updating;			/* clamp expiry to rtq_reallyold */
	time_t nextstop;		/* earliest remaining expiry seen */
};
310
311/*
312 * Get rid of old routes.  When draining, this deletes everything, even when
313 * the timeout is not expired yet.  When updating, this makes sure that
314 * nothing has a timeout longer than the current value of rtq_reallyold.
315 */
316static int
317in_rtqkill(struct radix_node *rn, void *rock)
318{
319	struct rtqk_arg *ap = rock;
320	struct rtentry *rt = (struct rtentry *)rn;
321	int err;
322	struct timeval timenow;
323
324	getmicrotime(&timenow);
325	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
326
327	if (rt->rt_flags & RTPRF_OURS) {
328		ap->found++;
329
330		if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) {
331			if (rt->rt_refcnt > 0)
332				panic("rtqkill route really not free");
333
334			err = rtrequest_locked(RTM_DELETE,
335					(struct sockaddr *)rt_key(rt),
336					rt->rt_gateway, rt_mask(rt),
337					rt->rt_flags, 0);
338			if (err) {
339				log(LOG_WARNING, "in_rtqkill: error %d\n", err);
340			} else {
341				ap->killed++;
342			}
343		} else {
344			if (ap->updating
345			   && (rt->rt_rmx.rmx_expire - timenow.tv_sec
346			       > rtq_reallyold)) {
347				rt->rt_rmx.rmx_expire = timenow.tv_sec
348					+ rtq_reallyold;
349			}
350			ap->nextstop = lmin(ap->nextstop,
351					    rt->rt_rmx.rmx_expire);
352		}
353	}
354
355	return 0;
356}
357
/*
 * Callback handed to timeout(); simply forwards its argument to
 * in_rtqtimo().
 */
static void
in_rtqtimo_funnel(void *rock)
{
	in_rtqtimo(rock);
}
/*
 * Run the route-expiry timer no less than once every ten minutes.
 * The replacement list is parenthesized so that the macro expands as a
 * single value inside larger expressions (e.g. x / RTQ_TIMEOUT).
 */
#define RTQ_TIMEOUT	(60 * 10)
static int rtq_timeout = RTQ_TIMEOUT;
366
367static void
368in_rtqtimo(void *rock)
369{
370	struct radix_node_head *rnh = rock;
371	struct rtqk_arg arg;
372	struct timeval atv;
373	static time_t last_adjusted_timeout = 0;
374	struct timeval timenow;
375
376	lck_mtx_lock(rt_mtx);
377	/* Get the timestamp after we acquire the lock for better accuracy */
378	getmicrotime(&timenow);
379
380	arg.found = arg.killed = 0;
381	arg.rnh = rnh;
382	arg.nextstop = timenow.tv_sec + rtq_timeout;
383	arg.draining = arg.updating = 0;
384	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
385
386	/*
387	 * Attempt to be somewhat dynamic about this:
388	 * If there are ``too many'' routes sitting around taking up space,
389	 * then crank down the timeout, and see if we can't make some more
390	 * go away.  However, we make sure that we will never adjust more
391	 * than once in rtq_timeout seconds, to keep from cranking down too
392	 * hard.
393	 */
394	if((arg.found - arg.killed > rtq_toomany)
395	   && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout)
396	   && rtq_reallyold > rtq_minreallyold) {
397		rtq_reallyold = 2*rtq_reallyold / 3;
398		if(rtq_reallyold < rtq_minreallyold) {
399			rtq_reallyold = rtq_minreallyold;
400		}
401
402		last_adjusted_timeout = timenow.tv_sec;
403#if DIAGNOSTIC
404		log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
405		    rtq_reallyold);
406#endif
407		arg.found = arg.killed = 0;
408		arg.updating = 1;
409		rnh->rnh_walktree(rnh, in_rtqkill, &arg);
410	}
411
412	atv.tv_usec = 0;
413	atv.tv_sec = arg.nextstop - timenow.tv_sec;
414	lck_mtx_unlock(rt_mtx);
415	timeout(in_rtqtimo_funnel, rock, tvtohz(&atv));
416}
417
418void
419in_rtqdrain(void)
420{
421	struct radix_node_head *rnh = rt_tables[AF_INET];
422	struct rtqk_arg arg;
423	arg.found = arg.killed = 0;
424	arg.rnh = rnh;
425	arg.nextstop = 0;
426	arg.draining = 1;
427	arg.updating = 0;
428	lck_mtx_lock(rt_mtx);
429	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
430	lck_mtx_unlock(rt_mtx);
431}
432
433/*
434 * Initialize our routing tree.
435 */
436int
437in_inithead(void **head, int off)
438{
439	struct radix_node_head *rnh;
440
441#ifdef __APPLE__
442	if (*head)
443		return 1;
444#endif
445
446	if(!rn_inithead(head, off))
447		return 0;
448
449	if(head != (void **)&rt_tables[AF_INET]) /* BOGUS! */
450		return 1;	/* only do this for the real routing table */
451
452	rnh = *head;
453	rnh->rnh_addaddr = in_addroute;
454	rnh->rnh_matchaddr = in_matroute;
455	rnh->rnh_matchaddr_args = in_matroute_args;
456	rnh->rnh_close = in_clsroute;
457	in_rtqtimo(rnh);	/* kick off timeout first time */
458	return 1;
459}
460
461
/*
 * The code below zaps old routes when an interface goes down or an
 * interface address is deleted.  In the latter case static routes that
 * point to the address are deleted as well; otherwise we may end up
 * using the old address in the future.  The routes we always want to
 * get rid of are things like ARP entries, since the user might down the
 * interface, walk over to a completely different network, and plug
 * back in.
 *
 * This structure carries the walk parameters from in_ifadown() to
 * in_ifadownkill().
 */
struct in_ifadown_arg {
	struct radix_node_head *rnh;	/* tree being walked */
	struct ifaddr *ifa;		/* address whose routes are removed */
	int del;			/* non-zero: remove static routes too */
};
476
477static int
478in_ifadownkill(struct radix_node *rn, void *xap)
479{
480	struct in_ifadown_arg *ap = xap;
481	struct rtentry *rt = (struct rtentry *)rn;
482	int err;
483
484	if (rt->rt_ifa == ap->ifa &&
485	    (ap->del || !(rt->rt_flags & RTF_STATIC))) {
486		/*
487		 * We need to disable the automatic prune that happens
488		 * in this case in rtrequest() because it will blow
489		 * away the pointers that rn_walktree() needs in order
490		 * continue our descent.  We will end up deleting all
491		 * the routes that rtrequest() would have in any case,
492		 * so that behavior is not needed there.
493		 */
494		rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
495		err = rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt),
496				rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
497		if (err) {
498			log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
499		}
500	}
501	return 0;
502}
503
504int
505in_ifadown(struct ifaddr *ifa, int delete)
506{
507	struct in_ifadown_arg arg;
508	struct radix_node_head *rnh;
509
510	lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED);
511
512	if (ifa->ifa_addr->sa_family != AF_INET)
513		return 1;
514
515	/* trigger route cache reevaluation */
516	if (use_routegenid)
517		route_generation++;
518
519	arg.rnh = rnh = rt_tables[AF_INET];
520	arg.ifa = ifa;
521	arg.del = delete;
522	rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
523	ifa->ifa_flags &= ~IFA_ROUTE;
524	return 0;
525}
526