1/* 2 * Copyright (c) 2003-2008 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* $FreeBSD: src/sys/netinet6/in6_rmx.c,v 1.1.2.2 2001/07/03 11:01:52 ume Exp $ */ 30/* $KAME: in6_rmx.c,v 1.10 2001/05/24 05:44:58 itojun Exp $ */ 31 32/* 33 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. Neither the name of the project nor the names of its contributors 45 * may be used to endorse or promote products derived from this software 46 * without specific prior written permission. 47 * 48 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 51 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 58 * SUCH DAMAGE. 59 */ 60 61/* 62 * Copyright 1994, 1995 Massachusetts Institute of Technology 63 * 64 * Permission to use, copy, modify, and distribute this software and 65 * its documentation for any purpose and without fee is hereby 66 * granted, provided that both the above copyright notice and this 67 * permission notice appear in all copies, that both the above 68 * copyright notice and this permission notice appear in all 69 * supporting documentation, and that the name of M.I.T. not be used 70 * in advertising or publicity pertaining to distribution of the 71 * software without specific, written prior permission. M.I.T. makes 72 * no representations about the suitability of this software for any 73 * purpose. It is provided "as is" without express or implied 74 * warranty. 75 * 76 * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS 77 * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, 78 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 79 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT 80 * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 81 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 82 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 83 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 84 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 85 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 86 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 87 * SUCH DAMAGE. 88 * 89 */ 90 91/* 92 * This code does two things necessary for the enhanced TCP metrics to 93 * function in a useful manner: 94 * 1) It marks all non-host routes as `cloning', thus ensuring that 95 * every actual reference to such a route actually gets turned 96 * into a reference to a host route to the specific destination 97 * requested. 98 * 2) When such routes lose all their references, it arranges for them 99 * to be deleted in some random collection of circumstances, so that 100 * a large quantity of stale routing data is not kept in kernel memory 101 * indefinitely. See in6_rtqtimo() below for the exact mechanism. 102 */ 103 104#include <sys/param.h> 105#include <sys/systm.h> 106#include <sys/kernel.h> 107#include <sys/sysctl.h> 108#include <kern/queue.h> 109#include <sys/socket.h> 110#include <sys/socketvar.h> 111#include <sys/mbuf.h> 112#include <sys/syslog.h> 113#include <kern/lock.h> 114 115#include <net/if.h> 116#include <net/route.h> 117#include <netinet/in.h> 118#include <netinet/ip_var.h> 119#include <netinet/in_var.h> 120 121#include <netinet/ip6.h> 122#include <netinet6/ip6_var.h> 123 124#include <netinet/icmp6.h> 125 126#include <netinet/tcp.h> 127#include <netinet/tcp_seq.h> 128#include <netinet/tcp_timer.h> 129#include <netinet/tcp_var.h> 130 131extern int in6_inithead(void **head, int off); 132static void in6_rtqtimo(void *rock); 133static void in6_mtutimo(void *rock); 134extern int tvtohz(struct timeval *); 135 136static struct radix_node *in6_matroute_args(void *, struct radix_node_head *, 137 rn_matchf_t *, void *); 138 139#define RTPRF_OURS RTF_PROTO3 /* set on routes we manage */ 140 141/* 142 * Do what we need to do when inserting a route. 143 */ 144static struct radix_node * 145in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, 146 struct radix_node *treenodes) 147{ 148 struct rtentry *rt = (struct rtentry *)treenodes; 149 struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)rt_key(rt); 150 struct radix_node *ret; 151 152 /* 153 * For IPv6, all unicast non-host routes are automatically cloning. 154 */ 155 if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) 156 rt->rt_flags |= RTF_MULTICAST; 157 158 if (!(rt->rt_flags & (RTF_HOST | RTF_CLONING | RTF_MULTICAST))) { 159 rt->rt_flags |= RTF_PRCLONING; 160 } 161 162 /* 163 * A little bit of help for both IPv6 output and input: 164 * For local addresses, we make sure that RTF_LOCAL is set, 165 * with the thought that this might one day be used to speed up 166 * ip_input(). 167 * 168 * We also mark routes to multicast addresses as such, because 169 * it's easy to do and might be useful (but this is much more 170 * dubious since it's so easy to inspect the address). (This 171 * is done above.) 172 * 173 * XXX 174 * should elaborate the code. 175 */ 176 if (rt->rt_flags & RTF_HOST) { 177 if (IN6_ARE_ADDR_EQUAL(&satosin6(rt->rt_ifa->ifa_addr) 178 ->sin6_addr, 179 &sin6->sin6_addr)) { 180 rt->rt_flags |= RTF_LOCAL; 181 } 182 } 183 184 if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) 185 && rt->rt_ifp) 186 rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; 187 188 ret = rn_addroute(v_arg, n_arg, head, treenodes); 189 if (ret == NULL && rt->rt_flags & RTF_HOST) { 190 struct rtentry *rt2; 191 /* 192 * We are trying to add a host route, but can't. 193 * Find out if it is because of an 194 * ARP entry and delete it if so. 195 */ 196 rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, 197 RTF_CLONING | RTF_PRCLONING); 198 if (rt2) { 199 if (rt2->rt_flags & RTF_LLINFO && 200 rt2->rt_flags & RTF_HOST && 201 rt2->rt_gateway && 202 rt2->rt_gateway->sa_family == AF_LINK) { 203 rtrequest_locked(RTM_DELETE, 204 (struct sockaddr *)rt_key(rt2), 205 rt2->rt_gateway, 206 rt_mask(rt2), rt2->rt_flags, 0); 207 ret = rn_addroute(v_arg, n_arg, head, 208 treenodes); 209 } 210 rtfree_locked(rt2); 211 } 212 } else if (ret == NULL && rt->rt_flags & RTF_CLONING) { 213 struct rtentry *rt2; 214 /* 215 * We are trying to add a net route, but can't. 216 * The following case should be allowed, so we'll make a 217 * special check for this: 218 * Two IPv6 addresses with the same prefix is assigned 219 * to a single interrface. 220 * # ifconfig if0 inet6 3ffe:0501::1 prefix 64 alias (*1) 221 * # ifconfig if0 inet6 3ffe:0501::2 prefix 64 alias (*2) 222 * In this case, (*1) and (*2) want to add the same 223 * net route entry, 3ffe:0501:: -> if0. 224 * This case should not raise an error. 225 */ 226 rt2 = rtalloc1_locked((struct sockaddr *)sin6, 0, 227 RTF_CLONING | RTF_PRCLONING); 228 if (rt2) { 229 if ((rt2->rt_flags & (RTF_CLONING|RTF_HOST|RTF_GATEWAY)) 230 == RTF_CLONING 231 && rt2->rt_gateway 232 && rt2->rt_gateway->sa_family == AF_LINK 233 && rt2->rt_ifp == rt->rt_ifp) { 234 ret = rt2->rt_nodes; 235 } 236 rtfree_locked(rt2); 237 } 238 } 239 return ret; 240} 241 242/* 243 * Similar to in6_matroute_args except without the leaf-matching parameters. 244 */ 245static struct radix_node * 246in6_matroute(void *v_arg, struct radix_node_head *head) 247{ 248 return (in6_matroute_args(v_arg, head, NULL, NULL)); 249} 250 251/* 252 * This code is the inverse of in6_clsroute: on first reference, if we 253 * were managing the route, stop doing so and set the expiration timer 254 * back off again. 255 */ 256static struct radix_node * 257in6_matroute_args(void *v_arg, struct radix_node_head *head, 258 rn_matchf_t *f, void *w) 259{ 260 struct radix_node *rn = rn_match_args(v_arg, head, f, w); 261 struct rtentry *rt = (struct rtentry *)rn; 262 263 if (rt && rt->rt_refcnt == 0) { /* this is first reference */ 264 if (rt->rt_flags & RTPRF_OURS) { 265 rt->rt_flags &= ~RTPRF_OURS; 266 rt->rt_rmx.rmx_expire = 0; 267 } 268 } 269 return (rn); 270} 271 272SYSCTL_DECL(_net_inet6_ip6); 273 274static int rtq_reallyold = 60*60; 275 /* one hour is ``really old'' */ 276SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTEXPIRE, rtexpire, 277 CTLFLAG_RW, &rtq_reallyold , 0, ""); 278 279static int rtq_minreallyold = 10; 280 /* never automatically crank down to less */ 281SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMINEXPIRE, rtminexpire, 282 CTLFLAG_RW, &rtq_minreallyold , 0, ""); 283 284static int rtq_toomany = 128; 285 /* 128 cached routes is ``too many'' */ 286SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RTMAXCACHE, rtmaxcache, 287 CTLFLAG_RW, &rtq_toomany , 0, ""); 288 289 290/* 291 * On last reference drop, mark the route as belong to us so that it can be 292 * timed out. 293 */ 294static void 295in6_clsroute(struct radix_node *rn, __unused struct radix_node_head *head) 296{ 297 struct rtentry *rt = (struct rtentry *)rn; 298 299 if (!(rt->rt_flags & RTF_UP)) 300 return; /* prophylactic measures */ 301 302 if ((rt->rt_flags & (RTF_LLINFO | RTF_HOST)) != RTF_HOST) 303 return; 304 305 if ((rt->rt_flags & (RTF_WASCLONED | RTPRF_OURS)) != RTF_WASCLONED) 306 return; 307 308 /* 309 * Delete the route immediately if RTF_DELCLONE is set or 310 * if route caching is disabled (rtq_reallyold set to 0). 311 * Otherwise, let it expire and be deleted by in6_rtqkill(). 312 */ 313 if ((rt->rt_flags & RTF_DELCLONE) || rtq_reallyold == 0) { 314 /* 315 * Delete the route from the radix tree but since we are 316 * called when the route's reference count is 0, don't 317 * deallocate it until we return from this routine by 318 * telling rtrequest that we're interested in it. 319 */ 320 if (rtrequest_locked(RTM_DELETE, (struct sockaddr *)rt_key(rt), 321 rt->rt_gateway, rt_mask(rt), rt->rt_flags, &rt) == 0) { 322 /* Now let the caller free it */ 323 rtunref(rt); 324 } 325 } else { 326 struct timeval timenow; 327 328 getmicrotime(&timenow); 329 rt->rt_flags |= RTPRF_OURS; 330 rt->rt_rmx.rmx_expire = timenow.tv_sec + rtq_reallyold; 331 } 332} 333 334struct rtqk_arg { 335 struct radix_node_head *rnh; 336 int mode; 337 int updating; 338 int draining; 339 int killed; 340 int found; 341 time_t nextstop; 342}; 343 344/* 345 * Get rid of old routes. When draining, this deletes everything, even when 346 * the timeout is not expired yet. When updating, this makes sure that 347 * nothing has a timeout longer than the current value of rtq_reallyold. 348 */ 349static int 350in6_rtqkill(struct radix_node *rn, void *rock) 351{ 352 struct rtqk_arg *ap = rock; 353 struct rtentry *rt = (struct rtentry *)rn; 354 int err; 355 struct timeval timenow; 356 357 getmicrotime(&timenow); 358 lck_mtx_assert(rt_mtx, LCK_MTX_ASSERT_OWNED); 359 360 if (rt->rt_flags & RTPRF_OURS) { 361 ap->found++; 362 363 if (ap->draining || rt->rt_rmx.rmx_expire <= timenow.tv_sec) { 364 if (rt->rt_refcnt > 0) 365 panic("rtqkill route really not free"); 366 367 err = rtrequest_locked(RTM_DELETE, 368 (struct sockaddr *)rt_key(rt), 369 rt->rt_gateway, rt_mask(rt), 370 rt->rt_flags, 0); 371 if (err) { 372 log(LOG_WARNING, "in6_rtqkill: error %d", err); 373 } else { 374 ap->killed++; 375 } 376 } else { 377 if (ap->updating 378 && (rt->rt_rmx.rmx_expire - timenow.tv_sec 379 > rtq_reallyold)) { 380 rt->rt_rmx.rmx_expire = timenow.tv_sec 381 + rtq_reallyold; 382 } 383 ap->nextstop = lmin(ap->nextstop, 384 rt->rt_rmx.rmx_expire); 385 } 386 } 387 388 return 0; 389} 390 391#define RTQ_TIMEOUT 60*10 /* run no less than once every ten minutes */ 392static int rtq_timeout = RTQ_TIMEOUT; 393 394static void 395in6_rtqtimo(void *rock) 396{ 397 struct radix_node_head *rnh = rock; 398 struct rtqk_arg arg; 399 struct timeval atv; 400 static time_t last_adjusted_timeout = 0; 401 struct timeval timenow; 402 403 lck_mtx_lock(rt_mtx); 404 /* Get the timestamp after we acquire the lock for better accuracy */ 405 getmicrotime(&timenow); 406 407 arg.found = arg.killed = 0; 408 arg.rnh = rnh; 409 arg.nextstop = timenow.tv_sec + rtq_timeout; 410 arg.draining = arg.updating = 0; 411 rnh->rnh_walktree(rnh, in6_rtqkill, &arg); 412 413 /* 414 * Attempt to be somewhat dynamic about this: 415 * If there are ``too many'' routes sitting around taking up space, 416 * then crank down the timeout, and see if we can't make some more 417 * go away. However, we make sure that we will never adjust more 418 * than once in rtq_timeout seconds, to keep from cranking down too 419 * hard. 420 */ 421 if ((arg.found - arg.killed > rtq_toomany) 422 && (timenow.tv_sec - last_adjusted_timeout >= rtq_timeout) 423 && rtq_reallyold > rtq_minreallyold) { 424 rtq_reallyold = 2*rtq_reallyold / 3; 425 if (rtq_reallyold < rtq_minreallyold) { 426 rtq_reallyold = rtq_minreallyold; 427 } 428 429 last_adjusted_timeout = timenow.tv_sec; 430#if DIAGNOSTIC 431 log(LOG_DEBUG, "in6_rtqtimo: adjusted rtq_reallyold to %d", 432 rtq_reallyold); 433#endif 434 arg.found = arg.killed = 0; 435 arg.updating = 1; 436 rnh->rnh_walktree(rnh, in6_rtqkill, &arg); 437 } 438 439 atv.tv_usec = 0; 440 atv.tv_sec = arg.nextstop - timenow.tv_sec; 441 lck_mtx_unlock(rt_mtx); 442 timeout(in6_rtqtimo, rock, tvtohz(&atv)); 443} 444 445/* 446 * Age old PMTUs. 447 */ 448struct mtuex_arg { 449 struct radix_node_head *rnh; 450 time_t nextstop; 451}; 452 453static int 454in6_mtuexpire(struct radix_node *rn, void *rock) 455{ 456 struct rtentry *rt = (struct rtentry *)rn; 457 struct mtuex_arg *ap = rock; 458 struct timeval timenow; 459 460 getmicrotime(&timenow); 461 462 /* sanity */ 463 if (!rt) 464 panic("rt == NULL in in6_mtuexpire"); 465 466 if (rt->rt_rmx.rmx_expire && !(rt->rt_flags & RTF_PROBEMTU)) { 467 if (rt->rt_rmx.rmx_expire <= timenow.tv_sec) { 468 rt->rt_flags |= RTF_PROBEMTU; 469 } else { 470 ap->nextstop = lmin(ap->nextstop, 471 rt->rt_rmx.rmx_expire); 472 } 473 } 474 475 return 0; 476} 477 478#define MTUTIMO_DEFAULT (60*1) 479 480static void 481in6_mtutimo(void *rock) 482{ 483 struct radix_node_head *rnh = rock; 484 struct mtuex_arg arg; 485 struct timeval atv; 486 struct timeval timenow; 487 488 getmicrotime(&timenow); 489 490 arg.rnh = rnh; 491 arg.nextstop = timenow.tv_sec + MTUTIMO_DEFAULT; 492 lck_mtx_lock(rt_mtx); 493 rnh->rnh_walktree(rnh, in6_mtuexpire, &arg); 494 495 atv.tv_usec = 0; 496 atv.tv_sec = arg.nextstop; 497 if (atv.tv_sec < timenow.tv_sec) { 498#if DIAGNOSTIC 499 log(LOG_DEBUG, "IPv6: invalid mtu expiration time on routing table\n"); 500#endif 501 arg.nextstop = timenow.tv_sec + 30; /*last resort*/ 502 } 503 atv.tv_sec -= timenow.tv_sec; 504 lck_mtx_unlock(rt_mtx); 505 timeout(in6_mtutimo, rock, tvtohz(&atv)); 506} 507 508#if 0 509void 510in6_rtqdrain() 511{ 512 struct radix_node_head *rnh = rt_tables[AF_INET6]; 513 struct rtqk_arg arg; 514 int s; 515 arg.found = arg.killed = 0; 516 arg.rnh = rnh; 517 arg.nextstop = 0; 518 arg.draining = 1; 519 arg.updating = 0; 520 s = splnet(); 521 rnh->rnh_walktree(rnh, in6_rtqkill, &arg); 522 splx(s); 523} 524#endif 525 526/* 527 * Initialize our routing tree. 528 */ 529int 530in6_inithead(void **head, int off) 531{ 532 struct radix_node_head *rnh; 533 534 if (!rn_inithead(head, off)) 535 return 0; 536 537 if (head != (void **)&rt_tables[AF_INET6]) /* BOGUS! */ 538 return 1; /* only do this for the real routing table */ 539 540 rnh = *head; 541 rnh->rnh_addaddr = in6_addroute; 542 rnh->rnh_matchaddr = in6_matroute; 543 rnh->rnh_matchaddr_args = in6_matroute_args; 544 rnh->rnh_close = in6_clsroute; 545 in6_rtqtimo(rnh); /* kick off timeout first time */ 546 in6_mtutimo(rnh); /* kick off timeout first time */ 547 return 1; 548} 549