1/* 2 * Copyright (c) 2000-2011 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright 1994, 1995 Massachusetts Institute of Technology 30 * 31 * Permission to use, copy, modify, and distribute this software and 32 * its documentation for any purpose and without fee is hereby 33 * granted, provided that both the above copyright notice and this 34 * permission notice appear in all copies, that both the above 35 * copyright notice and this permission notice appear in all 36 * supporting documentation, and that the name of M.I.T. not be used 37 * in advertising or publicity pertaining to distribution of the 38 * software without specific, written prior permission. M.I.T. 
makes 39 * no representations about the suitability of this software for any 40 * purpose. It is provided "as is" without express or implied 41 * warranty. 42 * 43 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS 44 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, 45 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 46 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT 47 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 48 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 49 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 50 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 51 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 52 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 53 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 54 * SUCH DAMAGE. 55 * 56 * $FreeBSD: src/sys/netinet/in_rmx.c,v 1.37.2.1 2001/05/14 08:23:49 ru Exp $ 57 */ 58 59/* 60 * This code does two things necessary for the enhanced TCP metrics to 61 * function in a useful manner: 62 * 1) It marks all non-host routes as `cloning', thus ensuring that 63 * every actual reference to such a route actually gets turned 64 * into a reference to a host route to the specific destination 65 * requested. 66 * 2) When such routes lose all their references, it arranges for them 67 * to be deleted in some random collection of circumstances, so that 68 * a large quantity of stale routing data is not kept in kernel memory 69 * indefinitely. See in_rtqtimo() below for the exact mechanism. 
*/

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/syslog.h>
#include <sys/mcache.h>
#include <kern/lock.h>

#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_arp.h>

extern int tvtohz(struct timeval *);
extern int in_inithead(void **head, int off);

#ifdef __APPLE__
static void in_rtqtimo(void *rock);
#endif

static struct radix_node *in_matroute_args(void *, struct radix_node_head *,
    rn_matchf_t *f, void *);

#define RTPRF_OURS RTF_PROTO3	/* set on routes we manage */

/*
 * Do what we need to do when inserting a route.
 *
 * Called via rnh_addaddr (installed in in_inithead below) with both
 * rnh_lock and the route's rt_lock held.  Marks multicast destinations,
 * makes unicast non-host routes protocol-cloning, flags broadcast/local
 * host routes, seeds the route MTU from the interface, and — if the
 * radix insertion collides with a stale ARP-cloned host route — deletes
 * that entry and retries the insertion once.
 */
static struct radix_node *
in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
    struct radix_node *treenodes)
{
	struct rtentry *rt = (struct rtentry *)treenodes;
	struct sockaddr_in *sin = (struct sockaddr_in *)(void *)rt_key(rt);
	struct radix_node *ret;

	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
	RT_LOCK_ASSERT_HELD(rt);

	/*
	 * For IP, all unicast non-host routes are automatically cloning.
	 */
	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
		rt->rt_flags |= RTF_MULTICAST;

	if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) {
		rt->rt_flags |= RTF_PRCLONING;
	}

	/*
	 * A little bit of help for both IP output and input:
	 * For host routes, we make sure that RTF_BROADCAST
	 * is set for anything that looks like a broadcast address.
	 * This way, we can avoid an expensive call to in_broadcast()
	 * in ip_output() most of the time (because the route passed
	 * to ip_output() is almost always a host route).
	 *
	 * We also do the same for local addresses, with the thought
	 * that this might one day be used to speed up ip_input().
	 *
	 * We also mark routes to multicast addresses as such, because
	 * it's easy to do and might be useful (but this is much more
	 * dubious since it's so easy to inspect the address).  (This
	 * is done above.)
	 */
	if (rt->rt_flags & RTF_HOST) {
		if (in_broadcast(sin->sin_addr, rt->rt_ifp)) {
			rt->rt_flags |= RTF_BROADCAST;
		} else {
			/* Become a regular mutex */
			RT_CONVERT_LOCK(rt);
			IFA_LOCK_SPIN(rt->rt_ifa);
			if (satosin(rt->rt_ifa->ifa_addr)->sin_addr.s_addr
			    == sin->sin_addr.s_addr)
				rt->rt_flags |= RTF_LOCAL;
			IFA_UNLOCK(rt->rt_ifa);
		}
	}

	/* Default the route MTU to the interface MTU unless locked or set */
	if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
	    && rt->rt_ifp)
		rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;

	ret = rn_addroute(v_arg, n_arg, head, treenodes);
	if (ret == NULL && rt->rt_flags & RTF_HOST) {
		struct rtentry *rt2;
		/*
		 * We are trying to add a host route, but can't.
		 * Find out if it is because of an
		 * ARP entry and delete it if so.
		 */
		rt2 = rtalloc1_scoped_locked(rt_key(rt), 0,
		    RTF_CLONING | RTF_PRCLONING, sin_get_ifscope(rt_key(rt)));
		if (rt2) {
			RT_LOCK(rt2);
			if ((rt2->rt_flags & RTF_LLINFO) &&
			    (rt2->rt_flags & RTF_HOST) &&
			    rt2->rt_gateway != NULL &&
			    rt2->rt_gateway->sa_family == AF_LINK) {
				/*
				 * Safe to drop rt_lock and use rt_key,
				 * rt_gateway, since holding rnh_lock here
				 * prevents another thread from calling
				 * rt_setgate() on this route.
				 */
				RT_UNLOCK(rt2);
				rtrequest_locked(RTM_DELETE, rt_key(rt2),
				    rt2->rt_gateway, rt_mask(rt2),
				    rt2->rt_flags, 0);
				ret = rn_addroute(v_arg, n_arg, head,
				    treenodes);
			} else {
				RT_UNLOCK(rt2);
			}
			rtfree_locked(rt2);
		}
	}
	return ret;
}

/*
 * Validate (unexpire) an expiring AF_INET route.
 *
 * Called with the route's rt_lock held.  Only acts on the transition to
 * the first reference (rt_refcnt == 0 at the time of lookup): routes we
 * manage (RTPRF_OURS) have their expiration cleared; ARP host routes are
 * handed to arp_validate().  Returns rn unchanged for caller convenience.
 */
struct radix_node *
in_validate(struct radix_node *rn)
{
	struct rtentry *rt = (struct rtentry *)rn;

	RT_LOCK_ASSERT_HELD(rt);

	/* This is first reference? */
	if (rt->rt_refcnt == 0) {
		if (rt->rt_flags & RTPRF_OURS) {
			/* It's one of ours; unexpire it */
			rt->rt_flags &= ~RTPRF_OURS;
			rt_setexpire(rt, 0);
		} else if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) ==
		    (RTF_LLINFO | RTF_HOST) && rt->rt_llinfo != NULL &&
		    rt->rt_gateway != NULL &&
		    rt->rt_gateway->sa_family == AF_LINK) {
			/* It's ARP; let it be handled there */
			arp_validate(rt);
		}
	}
	return (rn);
}

/*
 * Similar to in_matroute_args except without the leaf-matching parameters.
 */
static struct radix_node *
in_matroute(void *v_arg, struct radix_node_head *head)
{
	return (in_matroute_args(v_arg, head, NULL, NULL));
}

/*
 * This code is the inverse of in_clsroute: on first reference, if we
 * were managing the route, stop doing so and set the expiration timer
 * back off again.
 */
static struct radix_node *
in_matroute_args(void *v_arg, struct radix_node_head *head,
    rn_matchf_t *f, void *w)
{
	struct radix_node *rn = rn_match_args(v_arg, head, f, w);

	if (rn != NULL) {
		RT_LOCK_SPIN((struct rtentry *)rn);
		in_validate(rn);
		RT_UNLOCK((struct rtentry *)rn);
	}
	return (rn);
}

static int rtq_reallyold = 60*60;
	/* one hour is ``really old'' */
SYSCTL_INT(_net_inet_ip, IPCTL_RTEXPIRE, rtexpire, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rtq_reallyold , 0,
    "Default expiration time on dynamically learned routes");

static int rtq_minreallyold = 10;
	/* never automatically crank down to less */
SYSCTL_INT(_net_inet_ip, IPCTL_RTMINEXPIRE, rtminexpire, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rtq_minreallyold , 0,
    "Minimum time to attempt to hold onto dynamically learned routes");

static int rtq_toomany = 128;
	/* 128 cached routes is ``too many'' */
SYSCTL_INT(_net_inet_ip, IPCTL_RTMAXCACHE, rtmaxcache, CTLFLAG_RW | CTLFLAG_LOCKED,
    &rtq_toomany , 0, "Upper limit on dynamically learned routes");

#ifdef __APPLE__
/* XXX LD11JUL02 Special case for AOL 5.1.2 connectivity issue to AirPort BS (Radar 2969954)
 * AOL is adding a circular route ("10.0.1.1/32 10.0.1.1") when establishing its ppp tunnel
 * to the AP BaseStation by removing the default gateway and replacing it with their tunnel entry point.
 * There is no apparent reason to add this route as there is a valid 10.0.1.1/24 route to the BS.
 * That circular route was ignored on previous version of MacOS X because of a routing bug
 * corrected with the merge to FreeBSD4.4 (a route generated from an RTF_CLONING route had the RTF_WASCLONED
 * flag set but did not have a reference to the parent route) and that entry was left in the RT.  This workaround is
 * made in order to provide binary compatibility with AOL.
 * If we catch a process adding a circular route with a /32 from the routing socket, we error it out instead of
 * confusing the routing table with a wrong route to the previous default gateway
 * If for some reason a circular route is needed, turn this sysctl (net.inet.ip.check_route_selfref) to zero.
 */
int check_routeselfref = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, check_route_selfref, CTLFLAG_RW | CTLFLAG_LOCKED,
    &check_routeselfref , 0, "");
#endif

int use_routegenid = 1;
SYSCTL_INT(_net_inet_ip, OID_AUTO, use_route_genid, CTLFLAG_RW | CTLFLAG_LOCKED,
    &use_routegenid , 0, "");

/*
 * On last reference drop, mark the route as belong to us so that it can be
 * timed out.
 *
 * Installed as rnh_close; called with rnh_lock and the route's rt_lock
 * held.  Only cloned (RTF_WASCLONED) host routes without link-layer info
 * are candidates.  Depending on RTF_DELCLONE/rtq_reallyold, the route is
 * either deleted immediately or stamped with an expiration for
 * in_rtqkill() to reap later.
 */
static void
in_clsroute(struct radix_node *rn, __unused struct radix_node_head *head)
{
	struct rtentry *rt = (struct rtentry *)rn;

	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);
	RT_LOCK_ASSERT_HELD(rt);

	if (!(rt->rt_flags & RTF_UP))
		return;		/* prophylactic measures */

	if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST)
		return;

	if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED)
		return;

	/*
	 * Delete the route immediately if RTF_DELCLONE is set or
	 * if route caching is disabled (rtq_reallyold set to 0).
	 * Otherwise, let it expire and be deleted by in_rtqkill().
	 */
	if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) {
		/*
		 * Delete the route from the radix tree but since we are
		 * called when the route's reference count is 0, don't
		 * deallocate it until we return from this routine by
		 * telling rtrequest that we're interested in it.
		 * Safe to drop rt_lock and use rt_key, rt_gateway since
		 * holding rnh_lock here prevents another thread from
		 * calling rt_setgate() on this route.
		 */
		RT_UNLOCK(rt);
		if (rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt),
		    rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt) == 0) {
			/* Now let the caller free it */
			RT_LOCK(rt);
			RT_REMREF_LOCKED(rt);
		} else {
			RT_LOCK(rt);
		}
	} else {
		uint64_t timenow;

		timenow = net_uptime();
		rt->rt_flags |= RTPRF_OURS;
		rt_setexpire(rt,
		    rt_expiry(rt, timenow, rtq_reallyold));
	}
}

/* Walk-state for in_rtqkill(), shared by in_rtqtimo()/in_rtqdrain(). */
struct rtqk_arg {
	struct radix_node_head *rnh;
	int draining;		/* nonzero: delete even if not yet expired */
	int killed;		/* count of routes deleted this walk */
	int found;		/* count of RTPRF_OURS routes seen */
	int updating;		/* nonzero: clamp expirations to rtq_reallyold */
	uint64_t nextstop;	/* earliest pending expiration seen */
};

/*
 * Get rid of old routes.  When draining, this deletes everything, even when
 * the timeout is not expired yet.  When updating, this makes sure that
 * nothing has a timeout longer than the current value of rtq_reallyold.
 */
static int
in_rtqkill(struct radix_node *rn, void *rock)
{
	struct rtqk_arg *ap = rock;
	struct rtentry *rt = (struct rtentry *)rn;
	int err;
	uint64_t timenow;

	timenow = net_uptime();
	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);

	RT_LOCK(rt);
	if (rt->rt_flags & RTPRF_OURS) {
		ap->found++;

		/* rt_expire and rmx_expire must agree on zero/nonzero */
		VERIFY(rt->rt_expire == 0 || rt->rt_rmx.rmx_expire != 0);
		VERIFY(rt->rt_expire != 0 || rt->rt_rmx.rmx_expire == 0);
		if (ap->draining || rt->rt_expire <= timenow) {
			if (rt->rt_refcnt > 0)
				panic("rtqkill route really not free");

			/*
			 * Delete this route since we're done with it;
			 * the route may be freed afterwards, so we
			 * can no longer refer to 'rt' upon returning
			 * from rtrequest().  Safe to drop rt_lock and
			 * use rt_key, rt_gateway since holding rnh_lock
			 * here prevents another thread from calling
			 * rt_setgate() on this route.
			 */
			RT_UNLOCK(rt);
			err = rtrequest_locked(RTM_DELETE, rt_key(rt),
			    rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
			if (err) {
				log(LOG_WARNING, "in_rtqkill: error %d\n", err);
			} else {
				ap->killed++;
			}
		} else {
			if (ap->updating &&
			    (rt->rt_expire - timenow) >
			    rt_expiry(rt, 0, rtq_reallyold)) {
				rt_setexpire(rt, rt_expiry(rt,
				    timenow, rtq_reallyold));
			}
			ap->nextstop = lmin(ap->nextstop,
			    rt->rt_expire);
			RT_UNLOCK(rt);
		}
	} else {
		RT_UNLOCK(rt);
	}

	return 0;
}

/* Thin trampoline so timeout() can re-arm in_rtqtimo(). */
static void
in_rtqtimo_funnel(void *rock)
{
	in_rtqtimo(rock);

}
#define RTQ_TIMEOUT 60*10	/* run no less than once every ten minutes */
static int rtq_timeout = RTQ_TIMEOUT;

/*
 * Periodic reaper: walk the AF_INET tree deleting expired cached routes,
 * adaptively shrink rtq_reallyold when too many linger, then re-arm
 * itself via timeout() for the next earliest expiration.
 */
static void
in_rtqtimo(void *rock)
{
	struct radix_node_head *rnh = rock;
	struct rtqk_arg arg;
	struct timeval atv;
	static uint64_t last_adjusted_timeout = 0;
	uint64_t timenow;

	lck_mtx_lock(rnh_lock);
	/* Get the timestamp after we acquire the lock for better accuracy */
	timenow = net_uptime();

	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = timenow + rtq_timeout;
	arg.draining = arg.updating = 0;
	rnh->rnh_walktree(rnh, in_rtqkill, &arg);

	/*
	 * Attempt to be somewhat dynamic about this:
	 * If there are ``too many'' routes sitting around taking up space,
	 * then crank down the timeout, and see if we can't make some more
	 * go away.  However, we make sure that we will never adjust more
	 * than once in rtq_timeout seconds, to keep from cranking down too
	 * hard.
	 */
	if((arg.found - arg.killed > rtq_toomany)
	   && ((timenow - last_adjusted_timeout) >= (uint64_t)rtq_timeout)
	   && rtq_reallyold > rtq_minreallyold) {
		rtq_reallyold = 2*rtq_reallyold / 3;
		if(rtq_reallyold < rtq_minreallyold) {
			rtq_reallyold = rtq_minreallyold;
		}

		last_adjusted_timeout = timenow;
#if DIAGNOSTIC
		log(LOG_DEBUG, "in_rtqtimo: adjusted rtq_reallyold to %d\n",
		    rtq_reallyold);
#endif
		arg.found = arg.killed = 0;
		arg.updating = 1;
		rnh->rnh_walktree(rnh, in_rtqkill, &arg);
	}

	atv.tv_usec = 0;
	atv.tv_sec = arg.nextstop - timenow;
	lck_mtx_unlock(rnh_lock);
	timeout(in_rtqtimo_funnel, rock, tvtohz(&atv));
}

/*
 * Forcibly discard all cached (RTPRF_OURS) routes regardless of
 * expiration, e.g. under memory pressure.
 */
void
in_rtqdrain(void)
{
	struct radix_node_head *rnh = rt_tables[AF_INET];
	struct rtqk_arg arg;
	arg.found = arg.killed = 0;
	arg.rnh = rnh;
	arg.nextstop = 0;
	arg.draining = 1;
	arg.updating = 0;
	lck_mtx_lock(rnh_lock);
	rnh->rnh_walktree(rnh, in_rtqkill, &arg);
	lck_mtx_unlock(rnh_lock);
}

/*
 * Initialize our routing tree.
 */
int
in_inithead(void **head, int off)
{
	struct radix_node_head *rnh;

#ifdef __APPLE__
	/* already initialized; nothing to do */
	if (*head)
		return 1;
#endif

	if(!rn_inithead(head, off))
		return 0;

	if(head != (void **)&rt_tables[AF_INET])	/* BOGUS! */
		return 1;	/* only do this for the real routing table */

	/* hook our AF_INET-specific handlers into the radix head */
	rnh = *head;
	rnh->rnh_addaddr = in_addroute;
	rnh->rnh_matchaddr = in_matroute;
	rnh->rnh_matchaddr_args = in_matroute_args;
	rnh->rnh_close = in_clsroute;
	in_rtqtimo(rnh);	/* kick off timeout first time */
	return 1;
}


/*
 * This zaps old routes when the interface goes down or interface
 * address is deleted.  In the latter case, it deletes static routes
 * that point to this address.  If we don't do this, we may end up
 * using the old address in the future.  The ones we always want to
 * get rid of are things like ARP entries, since the user might down
 * the interface, walk over to a completely different network, and
 * plug back in.
 */
struct in_ifadown_arg {
	struct radix_node_head *rnh;
	struct ifaddr *ifa;	/* address whose routes are being zapped */
	int del;		/* nonzero: also delete RTF_STATIC routes */
};

/*
 * Per-node callback for in_ifadown()'s tree walk: delete any route bound
 * to the dying ifaddr (static routes only when del is set).
 */
static int
in_ifadownkill(struct radix_node *rn, void *xap)
{
	struct in_ifadown_arg *ap = xap;
	struct rtentry *rt = (struct rtentry *)rn;
	int err;

	RT_LOCK(rt);
	if (rt->rt_ifa == ap->ifa &&
	    (ap->del || !(rt->rt_flags & RTF_STATIC))) {
		/*
		 * We need to disable the automatic prune that happens
		 * in this case in rtrequest() because it will blow
		 * away the pointers that rn_walktree() needs in order
		 * continue our descent.  We will end up deleting all
		 * the routes that rtrequest() would have in any case,
		 * so that behavior is not needed there.  Safe to drop
		 * rt_lock and use rt_key, rt_gateway, since holding
		 * rnh_lock here prevents another thread from calling
		 * rt_setgate() on this route.
		 */
		rt->rt_flags &= ~(RTF_CLONING | RTF_PRCLONING);
		RT_UNLOCK(rt);
		err = rtrequest_locked(RTM_DELETE, rt_key(rt),
		    rt->rt_gateway, rt_mask(rt), rt->rt_flags, 0);
		if (err) {
			log(LOG_WARNING, "in_ifadownkill: error %d\n", err);
		}
	} else {
		RT_UNLOCK(rt);
	}
	return 0;
}

/*
 * Remove routes bound to a departing AF_INET interface address.
 * Called with rnh_lock held.  Returns 1 (no-op) for non-AF_INET
 * addresses, 0 after the walk completes and IFA_ROUTE is cleared.
 */
int
in_ifadown(struct ifaddr *ifa, int delete)
{
	struct in_ifadown_arg arg;
	struct radix_node_head *rnh;

	lck_mtx_assert(rnh_lock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Holding rnh_lock here prevents the possibility of
	 * ifa from changing (e.g. in_ifinit), so it is safe
	 * to access its ifa_addr without locking.
	 */
	if (ifa->ifa_addr->sa_family != AF_INET)
		return (1);

	/* trigger route cache reevaluation */
	if (use_routegenid)
		routegenid_update();

	arg.rnh = rnh = rt_tables[AF_INET];
	arg.ifa = ifa;
	arg.del = delete;
	rnh->rnh_walktree(rnh, in_ifadownkill, &arg);
	IFA_LOCK_SPIN(ifa);
	ifa->ifa_flags &= ~IFA_ROUTE;
	IFA_UNLOCK(ifa);
	return (0);
}