1/*	$NetBSD: route.c,v 1.237 2023/06/05 03:51:45 ozaki-r Exp $	*/
2
3/*-
4 * Copyright (c) 1998, 2008 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kevin M. Lahey of the Numerical Aerospace Simulation Facility,
9 * NASA Ames Research Center.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 *    notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33/*
34 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
35 * All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. Neither the name of the project nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 */
61
62/*
63 * Copyright (c) 1980, 1986, 1991, 1993
64 *	The Regents of the University of California.  All rights reserved.
65 *
66 * Redistribution and use in source and binary forms, with or without
67 * modification, are permitted provided that the following conditions
68 * are met:
69 * 1. Redistributions of source code must retain the above copyright
70 *    notice, this list of conditions and the following disclaimer.
71 * 2. Redistributions in binary form must reproduce the above copyright
72 *    notice, this list of conditions and the following disclaimer in the
73 *    documentation and/or other materials provided with the distribution.
74 * 3. Neither the name of the University nor the names of its contributors
75 *    may be used to endorse or promote products derived from this software
76 *    without specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 *
90 *	@(#)route.c	8.3 (Berkeley) 1/9/95
91 */
92
93#ifdef _KERNEL_OPT
94#include "opt_inet.h"
95#include "opt_route.h"
96#include "opt_net_mpsafe.h"
97#endif
98
99#include <sys/cdefs.h>
100__KERNEL_RCSID(0, "$NetBSD: route.c,v 1.237 2023/06/05 03:51:45 ozaki-r Exp $");
101
102#include <sys/param.h>
103#ifdef RTFLUSH_DEBUG
104#include <sys/sysctl.h>
105#endif
106#include <sys/systm.h>
107#include <sys/callout.h>
108#include <sys/proc.h>
109#include <sys/mbuf.h>
110#include <sys/socket.h>
111#include <sys/socketvar.h>
112#include <sys/domain.h>
113#include <sys/kernel.h>
114#include <sys/ioctl.h>
115#include <sys/pool.h>
116#include <sys/kauth.h>
117#include <sys/workqueue.h>
118#include <sys/syslog.h>
119#include <sys/rwlock.h>
120#include <sys/mutex.h>
121#include <sys/cpu.h>
122#include <sys/kmem.h>
123
124#include <net/if.h>
125#include <net/if_dl.h>
126#include <net/route.h>
127#if defined(INET) || defined(INET6)
128#include <net/if_llatbl.h>
129#endif
130
131#include <netinet/in.h>
132#include <netinet/in_var.h>
133
134#define	PRESERVED_RTF	(RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_DONE | RTF_MASK)
135
136#ifdef RTFLUSH_DEBUG
137#define	rtcache_debug() __predict_false(_rtcache_debug)
138#else /* RTFLUSH_DEBUG */
139#define	rtcache_debug() 0
140#endif /* RTFLUSH_DEBUG */
141
142#ifdef RT_DEBUG
143#define RT_REFCNT_TRACE(rt)	printf("%s:%d: rt=%p refcnt=%d\n", \
144				    __func__, __LINE__, (rt), (rt)->rt_refcnt)
145#else
146#define RT_REFCNT_TRACE(rt)	do {} while (0)
147#endif
148
149#ifdef RT_DEBUG
150#define dlog(level, fmt, args...)	log(level, fmt, ##args)
151#else
152#define dlog(level, fmt, args...)	do {} while (0)
153#endif
154
155struct rtstat		rtstat;
156
157static int		rttrash;	/* routes not in table but not freed */
158
159static struct pool	rtentry_pool;
160static struct pool	rttimer_pool;
161
162static struct callout	rt_timer_ch; /* callout for rt_timer_timer() */
163static struct workqueue	*rt_timer_wq;
164static struct work	rt_timer_wk;
165
166static void	rt_timer_init(void);
167static void	rt_timer_queue_remove_all(struct rttimer_queue *);
168static void	rt_timer_remove_all(struct rtentry *);
169static void	rt_timer_timer(void *);
170
171/*
172 * Locking notes:
173 * - The routing table is protected by a global rwlock
174 *   - API: RT_RLOCK and friends
175 * - rtcaches are NOT protected by the framework
176 *   - Callers must guarantee a rtcache isn't accessed simultaneously
177 *   - How the constraint is guaranteed in the wild
178 *     - Protect a rtcache by a mutex (e.g., inp_route)
179 *     - Make rtcache per-CPU and allow only accesses from softint
180 *       (e.g., ipforward_rt_percpu)
 * - References to a rtentry are managed by reference counting and psref
182 *   - Reference counting is used for temporal reference when a rtentry
183 *     is fetched from the routing table
184 *   - psref is used for temporal reference when a rtentry is fetched
185 *     from a rtcache
186 *     - struct route (rtcache) has struct psref, so we cannot obtain
187 *       a reference twice on the same struct route
188 *   - Before destroying or updating a rtentry, we have to wait for
189 *     all references left (see below for details)
190 *   - APIs
191 *     - An obtained rtentry via rtalloc1 or rtrequest* must be
192 *       unreferenced by rt_unref
193 *     - An obtained rtentry via rtcache_* must be unreferenced by
194 *       rtcache_unref
195 *   - TODO: once we get a lockless routing table, we should use only
196 *           psref for rtentries
197 * - rtentry destruction
198 *   - A rtentry is destroyed (freed) only when we call rtrequest(RTM_DELETE)
199 *   - If a caller of rtrequest grabs a reference of a rtentry, the caller
200 *     has a responsibility to destroy the rtentry by itself by calling
201 *     rt_free
202 *     - If not, rtrequest itself does that
203 *   - If rt_free is called in softint, the actual destruction routine is
204 *     deferred to a workqueue
205 * - rtentry update
206 *   - When updating a rtentry, RTF_UPDATING flag is set
207 *   - If a rtentry is set RTF_UPDATING, fetching the rtentry from
208 *     the routing table or a rtcache results in either of the following
209 *     cases:
210 *     - if the caller runs in softint, the caller fails to fetch
211 *     - otherwise, the caller waits for the update completed and retries
212 *       to fetch (probably succeed to fetch for the second time)
213 * - rtcache invalidation
214 *   - There is a global generation counter that is incremented when
215 *     any routes have been added or deleted
216 *   - When a rtcache caches a rtentry into itself, it also stores
217 *     a snapshot of the generation counter
 *   - If the snapshot equals the global counter, the cache is valid,
219 *     otherwise the cache is invalidated
220 */
221
222/*
223 * Global lock for the routing table.
224 */
225static krwlock_t		rt_lock __cacheline_aligned;
226#ifdef NET_MPSAFE
227#define RT_RLOCK()		rw_enter(&rt_lock, RW_READER)
228#define RT_WLOCK()		rw_enter(&rt_lock, RW_WRITER)
229#define RT_UNLOCK()		rw_exit(&rt_lock)
230#define RT_WLOCKED()		rw_write_held(&rt_lock)
231#define	RT_ASSERT_WLOCK()	KASSERT(rw_write_held(&rt_lock))
232#define RT_WQ_FLAGS		WQ_MPSAFE
233#else
234#define RT_RLOCK()		do {} while (0)
235#define RT_WLOCK()		do {} while (0)
236#define RT_UNLOCK()		do {} while (0)
237#define RT_WLOCKED()		true
238#define	RT_ASSERT_WLOCK()	do {} while (0)
239#define RT_WQ_FLAGS		0
240#endif
241
242static uint64_t rtcache_generation;
243
244/*
245 * mutex and cv that are used to wait for references to a rtentry left
246 * before updating the rtentry.
247 */
248static struct {
249	kmutex_t		lock;
250	kcondvar_t		cv;
251	bool			ongoing;
252	const struct lwp	*lwp;
253} rt_update_global __cacheline_aligned;
254
255/*
256 * A workqueue and stuff that are used to defer the destruction routine
257 * of rtentries.
258 */
259static struct {
260	struct workqueue	*wq;
261	struct work		wk;
262	kmutex_t		lock;
263	SLIST_HEAD(, rtentry)	queue;
264	bool			enqueued;
265} rt_free_global __cacheline_aligned;
266
267/* psref for rtentry */
268static struct psref_class *rt_psref_class __read_mostly;
269
270#ifdef RTFLUSH_DEBUG
271static int _rtcache_debug = 0;
272#endif /* RTFLUSH_DEBUG */
273
274static kauth_listener_t route_listener;
275
276static int rtdeletemsg(struct rtentry *);
277
278static void rt_maskedcopy(const struct sockaddr *,
279    struct sockaddr *, const struct sockaddr *);
280
281static void rtcache_invalidate(void);
282
283static void rt_ref(struct rtentry *);
284
285static struct rtentry *
286    rtalloc1_locked(const struct sockaddr *, int, bool, bool);
287
288static struct ifaddr *rt_getifa(struct rt_addrinfo *, struct psref *);
289static struct ifnet *rt_getifp(struct rt_addrinfo *, struct psref *);
290static struct ifaddr *ifa_ifwithroute_psref(int, const struct sockaddr *,
291    const struct sockaddr *, struct psref *);
292
293static void rtcache_ref(struct rtentry *, struct route *);
294
295#ifdef NET_MPSAFE
296static void rt_update_wait(void);
297#endif
298
299static bool rt_wait_ok(void);
300static void rt_wait_refcnt(const char *, struct rtentry *, int);
301static void rt_wait_psref(struct rtentry *);
302
303#ifdef DDB
304static void db_print_sa(const struct sockaddr *);
305static void db_print_ifa(struct ifaddr *);
306static int db_show_rtentry(struct rtentry *, void *);
307#endif
308
#ifdef RTFLUSH_DEBUG
static void sysctl_net_rtcache_setup(struct sysctllog **);
/*
 * Create the net.rtcache.debug sysctl knob so the route-cache debug
 * printfs (gated by _rtcache_debug) can be toggled at runtime.
 * Creation failures are silently ignored; the knob is simply absent.
 */
static void
sysctl_net_rtcache_setup(struct sysctllog **clog)
{
	const struct sysctlnode *rnode;

	if (sysctl_createv(clog, 0, NULL, &rnode, CTLFLAG_PERMANENT,
	    CTLTYPE_NODE,
	    "rtcache", SYSCTL_DESCR("Route cache related settings"),
	    NULL, 0, NULL, 0, CTL_NET, CTL_CREATE, CTL_EOL) != 0)
		return;
	if (sysctl_createv(clog, 0, &rnode, &rnode,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
	    "debug", SYSCTL_DESCR("Debug route caches"),
	    NULL, 0, &_rtcache_debug, 0, CTL_CREATE, CTL_EOL) != 0)
		return;
}
#endif /* RTFLUSH_DEBUG */
328
329static inline void
330rt_destroy(struct rtentry *rt)
331{
332	if (rt->_rt_key != NULL)
333		sockaddr_free(rt->_rt_key);
334	if (rt->rt_gateway != NULL)
335		sockaddr_free(rt->rt_gateway);
336	if (rt_gettag(rt) != NULL)
337		sockaddr_free(rt_gettag(rt));
338	rt->_rt_key = rt->rt_gateway = rt->rt_tag = NULL;
339}
340
341static inline const struct sockaddr *
342rt_setkey(struct rtentry *rt, const struct sockaddr *key, int flags)
343{
344	if (rt->_rt_key == key)
345		goto out;
346
347	if (rt->_rt_key != NULL)
348		sockaddr_free(rt->_rt_key);
349	rt->_rt_key = sockaddr_dup(key, flags);
350out:
351	rt->rt_nodes->rn_key = (const char *)rt->_rt_key;
352	return rt->_rt_key;
353}
354
/*
 * Return the interface address associated with rt.  If the address
 * has an ifa_getifa hook, let the protocol pick the effective address
 * for this destination and cache the result in the route.  May return
 * NULL when the hook finds no suitable address.
 */
struct ifaddr *
rt_get_ifa(struct rtentry *rt)
{
	struct ifaddr *ifa;

	ifa = rt->rt_ifa;
	/* Fast path: no indirection hook, the stored ifa is authoritative. */
	if (ifa->ifa_getifa == NULL)
		return ifa;
#if 0
	else if (ifa->ifa_seqno != NULL && *ifa->ifa_seqno == rt->rt_ifa_seqno)
		return ifa;
#endif
	else {
		ifa = (*ifa->ifa_getifa)(ifa, rt_getkey(rt));
		if (ifa == NULL)
			return NULL;
		/* Remember the protocol's choice for subsequent lookups. */
		rt_replace_ifa(rt, ifa);
		return ifa;
	}
}
375
/*
 * Install an (already referenced) ifa into rt and record its sequence
 * number when the address provides one.
 */
static void
rt_set_ifa1(struct rtentry *rt, struct ifaddr *ifa)
{
	rt->rt_ifa = ifa;
	if (ifa->ifa_seqno != NULL)
		rt->rt_ifa_seqno = *ifa->ifa_seqno;
}
383
384/*
385 * Is this route the connected route for the ifa?
386 */
387static int
388rt_ifa_connected(const struct rtentry *rt, const struct ifaddr *ifa)
389{
390	const struct sockaddr *key, *dst, *odst;
391	struct sockaddr_storage maskeddst;
392
393	key = rt_getkey(rt);
394	dst = rt->rt_flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr;
395	if (dst == NULL ||
396	    dst->sa_family != key->sa_family ||
397	    dst->sa_len != key->sa_len)
398		return 0;
399	if ((rt->rt_flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
400		odst = dst;
401		dst = (struct sockaddr *)&maskeddst;
402		rt_maskedcopy(odst, (struct sockaddr *)&maskeddst,
403		    ifa->ifa_netmask);
404	}
405	return (memcmp(dst, key, dst->sa_len) == 0);
406}
407
408void
409rt_replace_ifa(struct rtentry *rt, struct ifaddr *ifa)
410{
411	struct ifaddr *old;
412
413	if (rt->rt_ifa == ifa)
414		return;
415
416	if (rt->rt_ifa != ifa &&
417	    rt->rt_ifa->ifa_flags & IFA_ROUTE &&
418	    rt_ifa_connected(rt, rt->rt_ifa))
419	{
420		RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
421		    "replace deleted IFA_ROUTE\n",
422		    (void *)rt->_rt_key, (void *)rt->rt_ifa);
423		rt->rt_ifa->ifa_flags &= ~IFA_ROUTE;
424		if (rt_ifa_connected(rt, ifa)) {
425			RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
426			    "replace added IFA_ROUTE\n",
427			    (void *)rt->_rt_key, (void *)ifa);
428			ifa->ifa_flags |= IFA_ROUTE;
429		}
430	}
431
432	ifaref(ifa);
433	old = rt->rt_ifa;
434	rt_set_ifa1(rt, ifa);
435	ifafree(old);
436}
437
/* Install ifa into rt, taking a reference on behalf of the route. */
static void
rt_set_ifa(struct rtentry *rt, struct ifaddr *ifa)
{
	ifaref(ifa);
	rt_set_ifa1(rt, ifa);
}
444
445static int
446route_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
447    void *arg0, void *arg1, void *arg2, void *arg3)
448{
449	struct rt_msghdr *rtm;
450	int result;
451
452	result = KAUTH_RESULT_DEFER;
453	rtm = arg1;
454
455	if (action != KAUTH_NETWORK_ROUTE)
456		return result;
457
458	if (rtm->rtm_type == RTM_GET)
459		result = KAUTH_RESULT_ALLOW;
460
461	return result;
462}
463
464static void rt_free_work(struct work *, void *);
465
/*
 * Initialize the routing subsystem: the deferred-free machinery, the
 * psref class for rtentries, the update synchronization, the rtentry
 * and rttimer pools, the radix/rtbl backends, and the kauth listener
 * that authorizes routing-socket requests.
 */
void
rt_init(void)
{
	int error;

#ifdef RTFLUSH_DEBUG
	sysctl_net_rtcache_setup(NULL);
#endif

	/* Deferred destruction: rt_free() queues entries here. */
	mutex_init(&rt_free_global.lock, MUTEX_DEFAULT, IPL_SOFTNET);
	SLIST_INIT(&rt_free_global.queue);
	rt_free_global.enqueued = false;

	rt_psref_class = psref_class_create("rtentry", IPL_SOFTNET);

	error = workqueue_create(&rt_free_global.wq, "rt_free",
	    rt_free_work, NULL, PRI_SOFTNET, IPL_SOFTNET, RT_WQ_FLAGS);
	if (error)
		panic("%s: workqueue_create failed (%d)\n", __func__, error);

	/* Serialization for rt_update_prepare/rt_update_finish. */
	mutex_init(&rt_update_global.lock, MUTEX_DEFAULT, IPL_SOFTNET);
	cv_init(&rt_update_global.cv, "rt_update");

	pool_init(&rtentry_pool, sizeof(struct rtentry), 0, 0, 0, "rtentpl",
	    NULL, IPL_SOFTNET);
	pool_init(&rttimer_pool, sizeof(struct rttimer), 0, 0, 0, "rttmrpl",
	    NULL, IPL_SOFTNET);

	rn_init();	/* initialize all zeroes, all ones, mask table */
	rtbl_init();

	route_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
	    route_listener_cb, NULL);
}
500
/*
 * Invalidate all rtcaches by bumping the global generation counter;
 * caches holding an older snapshot are treated as stale on next use
 * (see the "rtcache invalidation" locking notes above).  Caller must
 * hold the routing table write lock.
 */
static void
rtcache_invalidate(void)
{

	RT_ASSERT_WLOCK();

	if (rtcache_debug())
		printf("%s: enter\n", __func__);

	rtcache_generation++;
}
512
#ifdef RT_DEBUG
/*
 * Log a one-line summary of rt: destination, gateway, flags and
 * outgoing interface.  Debug-only (RT_DEBUG).
 */
static void
dump_rt(const struct rtentry *rt)
{
	char buf[512];

	log(LOG_DEBUG, "rt: ");
	log(LOG_DEBUG, "p=%p ", rt);
	if (rt->_rt_key == NULL) {
		log(LOG_DEBUG, "dst=(NULL) ");
	} else {
		sockaddr_format(rt->_rt_key, buf, sizeof(buf));
		log(LOG_DEBUG, "dst=%s ", buf);
	}
	if (rt->rt_gateway == NULL) {
		log(LOG_DEBUG, "gw=(NULL) ");
	} else {
		/*
		 * Fix: format the gateway address here; the previous code
		 * mistakenly formatted rt->_rt_key, so "gw=" repeated the
		 * destination instead of the gateway.
		 */
		sockaddr_format(rt->rt_gateway, buf, sizeof(buf));
		log(LOG_DEBUG, "gw=%s ", buf);
	}
	log(LOG_DEBUG, "flags=%x ", rt->rt_flags);
	if (rt->rt_ifp == NULL) {
		log(LOG_DEBUG, "if=(NULL) ");
	} else {
		log(LOG_DEBUG, "if=%s ", rt->rt_ifp->if_xname);
	}
	log(LOG_DEBUG, "\n");
}
#endif /* RT_DEBUG */
542
/*
 * Packet routing routines. If success, refcnt of a returned rtentry
 * will be incremented. The caller has to rtfree it by itself.
 *
 * Caller must hold the routing table lock: read lock when wlock is
 * false, write lock when true (the flag tells us which lock to
 * re-take after waiting for a concurrent update).
 */
struct rtentry *
rtalloc1_locked(const struct sockaddr *dst, int report, bool wait_ok,
    bool wlock)
{
	rtbl_t *rtbl;
	struct rtentry *rt;
	int s;

#ifdef NET_MPSAFE
retry:
#endif
	s = splsoftnet();
	rtbl = rt_gettable(dst->sa_family);
	if (rtbl == NULL)
		goto miss;

	rt = rt_matchaddr(rtbl, dst);
	if (rt == NULL)
		goto miss;

	/* A downed route does not count as a match. */
	if (!ISSET(rt->rt_flags, RTF_UP))
		goto miss;

#ifdef NET_MPSAFE
	/*
	 * The entry is being modified.  Unless we ourselves are the
	 * updater, drop the lock, wait for the update to finish, then
	 * re-take the lock the caller asked for and retry the lookup.
	 * Callers that must not sleep treat it as a miss instead.
	 */
	if (ISSET(rt->rt_flags, RTF_UPDATING) &&
	    /* XXX updater should be always able to acquire */
	    curlwp != rt_update_global.lwp) {
		if (!wait_ok || !rt_wait_ok())
			goto miss;
		RT_UNLOCK();
		splx(s);

		/* We can wait until the update is complete */
		rt_update_wait();

		if (wlock)
			RT_WLOCK();
		else
			RT_RLOCK();
		goto retry;
	}
#endif /* NET_MPSAFE */

	rt_ref(rt);
	RT_REFCNT_TRACE(rt);

	splx(s);
	return rt;
miss:
	rtstat.rts_unreach++;
	if (report) {
		/* Tell routing-socket listeners about the failed lookup. */
		struct rt_addrinfo info;

		memset(&info, 0, sizeof(info));
		info.rti_info[RTAX_DST] = dst;
		rt_missmsg(RTM_MISS, &info, 0, 0);
	}
	splx(s);
	return NULL;
}
607
/*
 * Look up dst in the routing table under the read lock.  On success
 * the returned rtentry holds a reference; release it with rt_unref.
 */
struct rtentry *
rtalloc1(const struct sockaddr *dst, int report)
{
	struct rtentry *rt;

	RT_RLOCK();
	rt = rtalloc1_locked(dst, report, true, false);
	RT_UNLOCK();

	return rt;
}
619
/* Take a reference on rt (refcount must not have gone negative). */
static void
rt_ref(struct rtentry *rt)
{

	KASSERTMSG(rt->rt_refcnt >= 0, "rt_refcnt=%d", rt->rt_refcnt);
	atomic_inc_uint(&rt->rt_refcnt);
}
627
/*
 * Drop a reference on rt.  If the route is dying (!RTF_UP) or being
 * updated, someone may be sleeping in rt_wait_refcnt for the count to
 * drain; wake them.  Taking rt_free_global.lock makes the broadcast
 * race-free against their recheck of the count.
 */
void
rt_unref(struct rtentry *rt)
{

	KASSERT(rt != NULL);
	KASSERTMSG(rt->rt_refcnt > 0, "refcnt=%d", rt->rt_refcnt);

	atomic_dec_uint(&rt->rt_refcnt);
	if (!ISSET(rt->rt_flags, RTF_UP) || ISSET(rt->rt_flags, RTF_UPDATING)) {
		mutex_enter(&rt_free_global.lock);
		cv_broadcast(&rt->rt_cv);
		mutex_exit(&rt_free_global.lock);
	}
}
642
/*
 * Whether the current context may sleep to wait for route references
 * to drain.  Currently hard-wired to false (see below), so rt_free
 * always defers destruction to the workqueue.
 */
static bool
rt_wait_ok(void)
{

	/*
	 * This originally returned !cpu_softintr_p(), but that doesn't
	 * work: the caller may hold a lock (probably softnet lock)
	 * that a softint is waiting for, in which case waiting here
	 * would cause a deadlock.  See https://gnats.netbsd.org/56844
	 * for details.  For now, until the locking paths are sorted
	 * out, we just disable the waiting option altogether and
	 * always defer to workqueue.
	 */
	KASSERT(!cpu_intr_p());
	return false;
}
659
/*
 * Sleep until rt's reference count drops to "cnt".  Waits on
 * rt->rt_cv, which rt_unref broadcasts when a dying or updating
 * route loses a reference.  "title" is only used for debug logging.
 */
void
rt_wait_refcnt(const char *title, struct rtentry *rt, int cnt)
{
	mutex_enter(&rt_free_global.lock);
	while (rt->rt_refcnt > cnt) {
		dlog(LOG_DEBUG, "%s: %s waiting (refcnt=%d)\n",
		    __func__, title, rt->rt_refcnt);
		cv_wait(&rt->rt_cv, &rt_free_global.lock);
		dlog(LOG_DEBUG, "%s: %s waited (refcnt=%d)\n",
		    __func__, title, rt->rt_refcnt);
	}
	mutex_exit(&rt_free_global.lock);
}
673
/*
 * Wait for all psref holders of rt to drain, then re-arm the psref
 * target so the entry can be handed out again after the update.
 */
void
rt_wait_psref(struct rtentry *rt)
{

	psref_target_destroy(&rt->rt_psref, rt_psref_class);
	psref_target_init(&rt->rt_psref, rt_psref_class);
}
681
/*
 * The actual destruction routine: waits for every remaining reference
 * and psref to drain, then releases everything the rtentry owns and
 * returns it to the pool.  May sleep; rt_free defers to a workqueue
 * when sleeping is not allowed.
 */
static void
_rt_free(struct rtentry *rt)
{
	struct ifaddr *ifa;

	/*
	 * Need to avoid a deadlock on rt_wait_refcnt of update
	 * and a conflict on psref_target_destroy of update.
	 */
#ifdef NET_MPSAFE
	rt_update_wait();
#endif

	RT_REFCNT_TRACE(rt);
	KASSERTMSG(rt->rt_refcnt >= 0, "refcnt=%d", rt->rt_refcnt);
	/* Sleep until the last reference is gone. */
	rt_wait_refcnt("free", rt, 0);
#ifdef NET_MPSAFE
	psref_target_destroy(&rt->rt_psref, rt_psref_class);
#endif

	rt_assert_inactive(rt);
	rttrash--;	/* no longer an orphaned (not-in-table) entry */
	ifa = rt->rt_ifa;
	rt->rt_ifa = NULL;
	ifafree(ifa);
	rt->rt_ifp = NULL;
	cv_destroy(&rt->rt_cv);
	rt_destroy(rt);
	pool_put(&rtentry_pool, rt);
}
712
713static void
714rt_free_work(struct work *wk, void *arg)
715{
716
717	for (;;) {
718		struct rtentry *rt;
719
720		mutex_enter(&rt_free_global.lock);
721		if ((rt = SLIST_FIRST(&rt_free_global.queue)) == NULL) {
722			rt_free_global.enqueued = false;
723			mutex_exit(&rt_free_global.lock);
724			return;
725		}
726		SLIST_REMOVE_HEAD(&rt_free_global.queue, rt_free);
727		mutex_exit(&rt_free_global.lock);
728		atomic_dec_uint(&rt->rt_refcnt);
729		_rt_free(rt);
730	}
731}
732
/*
 * Release the caller's reference and destroy rt.  If sleeping is not
 * allowed (see rt_wait_ok), destruction is deferred to the rt_free
 * workqueue; the caller's reference is kept on the queued entry and
 * dropped by rt_free_work just before the actual free.
 */
void
rt_free(struct rtentry *rt)
{

	KASSERTMSG(rt->rt_refcnt > 0, "rt_refcnt=%d", rt->rt_refcnt);
	if (rt_wait_ok()) {
		atomic_dec_uint(&rt->rt_refcnt);
		_rt_free(rt);
		return;
	}

	mutex_enter(&rt_free_global.lock);
	/* No need to add a reference here. */
	SLIST_INSERT_HEAD(&rt_free_global.queue, rt, rt_free);
	if (!rt_free_global.enqueued) {
		workqueue_enqueue(rt_free_global.wq, &rt_free_global.wk, NULL);
		rt_free_global.enqueued = true;
	}
	mutex_exit(&rt_free_global.lock);
}
753
#ifdef NET_MPSAFE
/*
 * Block until no route update transaction (rt_update_prepare ..
 * rt_update_finish) is in flight.
 */
static void
rt_update_wait(void)
{

	mutex_enter(&rt_update_global.lock);
	while (rt_update_global.ongoing) {
		dlog(LOG_DEBUG, "%s: waiting lwp=%p\n", __func__, curlwp);
		cv_wait(&rt_update_global.cv, &rt_update_global.lock);
		dlog(LOG_DEBUG, "%s: waited lwp=%p\n", __func__, curlwp);
	}
	mutex_exit(&rt_update_global.lock);
}
#endif
768
/*
 * Begin an exclusive update transaction on rt: mark it RTF_UPDATING,
 * become the single global updater, and wait until all references but
 * the caller's own (and all psrefs) have drained.  Returns ESRCH if
 * the entry is already being destroyed.  Must be paired with
 * rt_update_finish.
 */
int
rt_update_prepare(struct rtentry *rt)
{

	dlog(LOG_DEBUG, "%s: updating rt=%p lwp=%p\n", __func__, rt, curlwp);

	RT_WLOCK();
	/* If the entry is being destroyed, don't proceed the update. */
	if (!ISSET(rt->rt_flags, RTF_UP)) {
		RT_UNLOCK();
		return ESRCH;
	}
	rt->rt_flags |= RTF_UPDATING;
	RT_UNLOCK();

	/* Only one updater may run at a time, globally. */
	mutex_enter(&rt_update_global.lock);
	while (rt_update_global.ongoing) {
		dlog(LOG_DEBUG, "%s: waiting ongoing updating rt=%p lwp=%p\n",
		    __func__, rt, curlwp);
		cv_wait(&rt_update_global.cv, &rt_update_global.lock);
		dlog(LOG_DEBUG, "%s: waited ongoing updating rt=%p lwp=%p\n",
		    __func__, rt, curlwp);
	}
	rt_update_global.ongoing = true;
	/* XXX need it to avoid rt_update_wait by updater itself. */
	rt_update_global.lwp = curlwp;
	mutex_exit(&rt_update_global.lock);

	/* Drain all references except the caller's, then all psrefs. */
	rt_wait_refcnt("update", rt, 1);
	rt_wait_psref(rt);

	return 0;
}
802
/*
 * End an update transaction started by rt_update_prepare: clear
 * RTF_UPDATING and wake everyone blocked in rt_update_wait or
 * rt_update_prepare.
 */
void
rt_update_finish(struct rtentry *rt)
{

	RT_WLOCK();
	rt->rt_flags &= ~RTF_UPDATING;
	RT_UNLOCK();

	mutex_enter(&rt_update_global.lock);
	rt_update_global.ongoing = false;
	rt_update_global.lwp = NULL;
	cv_broadcast(&rt_update_global.cv);
	mutex_exit(&rt_update_global.lock);

	dlog(LOG_DEBUG, "%s: updated rt=%p lwp=%p\n", __func__, rt, curlwp);
}
819
/*
 * Force a routing table entry to the specified
 * destination to go through the given gateway.
 * Normally called as a result of a routing redirect
 * message from the network layer.
 *
 * If rtp is non-NULL and the redirect is accepted, the (referenced)
 * resulting route is handed back through it; otherwise any looked-up
 * route is released here.
 *
 * N.B.: must be called at splsoftnet
 */
void
rtredirect(const struct sockaddr *dst, const struct sockaddr *gateway,
	const struct sockaddr *netmask, int flags, const struct sockaddr *src,
	struct rtentry **rtp)
{
	struct rtentry *rt;
	int error = 0;
	uint64_t *stat = NULL;
	struct rt_addrinfo info;
	struct ifaddr *ifa;
	struct psref psref;

	/* verify the gateway is directly reachable */
	if ((ifa = ifa_ifwithnet_psref(gateway, &psref)) == NULL) {
		error = ENETUNREACH;
		goto out;
	}
	rt = rtalloc1(dst, 0);
	/*
	 * If the redirect isn't from our current router for this dst,
	 * it's either old or wrong.  If it redirects us to ourselves,
	 * we have a routing loop, perhaps as a result of an interface
	 * going down recently.
	 */
	if (!(flags & RTF_DONE) && rt &&
	     (sockaddr_cmp(src, rt->rt_gateway) != 0 || rt->rt_ifa != ifa))
		error = EINVAL;
	else {
		int s = pserialize_read_enter();
		struct ifaddr *_ifa;

		/* A redirect to one of our own addresses is a loop. */
		_ifa = ifa_ifwithaddr(gateway);
		if (_ifa != NULL)
			error = EHOSTUNREACH;
		pserialize_read_exit(s);
	}
	if (error)
		goto done;
	/*
	 * Create a new entry if we just got back a wildcard entry
	 * or the lookup failed.  This is necessary for hosts
	 * which use routing redirects generated by smart gateways
	 * to dynamically build the routing tables.
	 */
	if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
		goto create;
	/*
	 * Don't listen to the redirect if it's
	 * for a route to an interface.
	 */
	if (rt->rt_flags & RTF_GATEWAY) {
		if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
			/*
			 * Changing from route to net => route to host.
			 * Create new route, rather than smashing route to net.
			 */
		create:
			if (rt != NULL)
				rt_unref(rt);
			flags |=  RTF_GATEWAY | RTF_DYNAMIC;
			memset(&info, 0, sizeof(info));
			info.rti_info[RTAX_DST] = dst;
			info.rti_info[RTAX_GATEWAY] = gateway;
			info.rti_info[RTAX_NETMASK] = netmask;
			info.rti_ifa = ifa;
			info.rti_flags = flags;
			rt = NULL;
			error = rtrequest1(RTM_ADD, &info, &rt);
			if (rt != NULL)
				flags = rt->rt_flags;
			if (error == 0)
				rt_newmsg_dynamic(RTM_ADD, rt);
			stat = &rtstat.rts_dynamic;
		} else {
			/*
			 * Smash the current notion of the gateway to
			 * this destination.  Should check about netmask!!!
			 */
#ifdef NET_MPSAFE
			KASSERT(!cpu_softintr_p());

			/* Drain other references before rewriting. */
			error = rt_update_prepare(rt);
			if (error == 0) {
#endif
				RT_WLOCK();
				error = rt_setgate(rt, gateway);
				if (error == 0) {
					rt->rt_flags |= RTF_MODIFIED;
					flags |= RTF_MODIFIED;
				}
				RT_UNLOCK();
#ifdef NET_MPSAFE
				rt_update_finish(rt);
			} else {
				/*
				 * If error != 0, the rtentry is being
				 * destroyed, so doing nothing doesn't
				 * matter.
				 */
			}
#endif
			stat = &rtstat.rts_newgateway;
		}
	} else
		error = EHOSTUNREACH;
done:
	if (rt) {
		/* Hand the referenced route to the caller if requested. */
		if (rtp != NULL && !error)
			*rtp = rt;
		else
			rt_unref(rt);
	}
out:
	/* Account the result and report it on the routing socket. */
	if (error)
		rtstat.rts_badredirect++;
	else if (stat != NULL)
		(*stat)++;
	memset(&info, 0, sizeof(info));
	info.rti_info[RTAX_DST] = dst;
	info.rti_info[RTAX_GATEWAY] = gateway;
	info.rti_info[RTAX_NETMASK] = netmask;
	info.rti_info[RTAX_AUTHOR] = src;
	rt_missmsg(RTM_REDIRECT, &info, flags, error);
	ifa_release(ifa, &psref);
}
953
954/*
955 * Delete a route and generate a message.
956 * It doesn't free a passed rt.
957 */
958static int
959rtdeletemsg(struct rtentry *rt)
960{
961	int error;
962	struct rt_addrinfo info;
963	struct rtentry *retrt;
964
965	/*
966	 * Request the new route so that the entry is not actually
967	 * deleted.  That will allow the information being reported to
968	 * be accurate (and consistent with route_output()).
969	 */
970	memset(&info, 0, sizeof(info));
971	info.rti_info[RTAX_DST] = rt_getkey(rt);
972	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
973	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
974	info.rti_flags = rt->rt_flags;
975	error = rtrequest1(RTM_DELETE, &info, &retrt);
976
977	rt_missmsg(RTM_DELETE, &info, info.rti_flags, error);
978
979	return error;
980}
981
/*
 * Find the interface address to associate with a route to dst via
 * gateway.  On success the returned ifa is held through *psref and
 * must be released by the caller; returns NULL when no suitable
 * address exists.
 */
static struct ifaddr *
ifa_ifwithroute_psref(int flags, const struct sockaddr *dst,
    const struct sockaddr *gateway, struct psref *psref)
{
	struct ifaddr *ifa = NULL;

	if ((flags & RTF_GATEWAY) == 0) {
		/*
		 * If we are adding a route to an interface,
		 * and the interface is a pt to pt link
		 * we should search for the destination
		 * as our clue to the interface.  Otherwise
		 * we can use the local address.
		 */
		if ((flags & RTF_HOST) && gateway->sa_family != AF_LINK)
			ifa = ifa_ifwithdstaddr_psref(dst, psref);
		if (ifa == NULL)
			ifa = ifa_ifwithaddr_psref(gateway, psref);
	} else {
		/*
		 * If we are adding a route to a remote net
		 * or host, the gateway may still be on the
		 * other end of a pt to pt link.
		 */
		ifa = ifa_ifwithdstaddr_psref(gateway, psref);
	}
	if (ifa == NULL)
		ifa = ifa_ifwithnet_psref(gateway, psref);
	if (ifa == NULL) {
		/* Last resort: borrow the ifa of the route to the gateway. */
		int s;
		struct rtentry *rt;

		rt = rtalloc1_locked(gateway, 0, true, true);
		if (rt == NULL)
			return NULL;
		/* The gateway itself must not sit behind another gateway. */
		if (rt->rt_flags & RTF_GATEWAY) {
			rt_unref(rt);
			return NULL;
		}
		/*
		 * Just in case. May not need to do this workaround.
		 * Revisit when working on rtentry MP-ification.
		 */
		s = pserialize_read_enter();
		IFADDR_READER_FOREACH(ifa, rt->rt_ifp) {
			if (ifa == rt->rt_ifa)
				break;
		}
		if (ifa != NULL)
			ifa_acquire(ifa, psref);
		pserialize_read_exit(s);
		rt_unref(rt);
		if (ifa == NULL)
			return NULL;
	}
	/* Prefer an address in the same family as the destination. */
	if (ifa->ifa_addr->sa_family != dst->sa_family) {
		struct ifaddr *nifa;
		int s;

		s = pserialize_read_enter();
		nifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
		if (nifa != NULL) {
			ifa_release(ifa, psref);
			ifa_acquire(nifa, psref);
			ifa = nifa;
		}
		pserialize_read_exit(s);
	}
	return ifa;
}
1052
1053/*
 * If it succeeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented.
1055 * The caller has to rtfree it by itself.
1056 */
1057int
1058rtrequest(int req, const struct sockaddr *dst, const struct sockaddr *gateway,
1059	const struct sockaddr *netmask, int flags, struct rtentry **ret_nrt)
1060{
1061	struct rt_addrinfo info;
1062
1063	memset(&info, 0, sizeof(info));
1064	info.rti_flags = flags;
1065	info.rti_info[RTAX_DST] = dst;
1066	info.rti_info[RTAX_GATEWAY] = gateway;
1067	info.rti_info[RTAX_NETMASK] = netmask;
1068	return rtrequest1(req, &info, ret_nrt);
1069}
1070
1071static struct ifnet *
1072rt_getifp(struct rt_addrinfo *info, struct psref *psref)
1073{
1074	const struct sockaddr *ifpaddr = info->rti_info[RTAX_IFP];
1075
1076	if (info->rti_ifp != NULL)
1077		return NULL;
1078	/*
1079	 * ifp may be specified by sockaddr_dl when protocol address
1080	 * is ambiguous
1081	 */
1082	if (ifpaddr != NULL && ifpaddr->sa_family == AF_LINK) {
1083		struct ifaddr *ifa;
1084		int s = pserialize_read_enter();
1085
1086		ifa = ifa_ifwithnet(ifpaddr);
1087		if (ifa != NULL)
1088			info->rti_ifp = if_get_byindex(ifa->ifa_ifp->if_index,
1089			    psref);
1090		pserialize_read_exit(s);
1091	}
1092
1093	return info->rti_ifp;
1094}
1095
/*
 * Choose an ifaddr for the route described by info.  On success the
 * returned ifaddr is referenced via psref (the *_psref lookups acquire
 * it) and info->rti_ifa / info->rti_ifp are filled in.
 */
static struct ifaddr *
rt_getifa(struct rt_addrinfo *info, struct psref *psref)
{
	struct ifaddr *ifa = NULL;
	const struct sockaddr *dst = info->rti_info[RTAX_DST];
	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
	const struct sockaddr *ifaaddr = info->rti_info[RTAX_IFA];
	int flags = info->rti_flags;
	const struct sockaddr *sa;

	/* Prefer an exact match on an explicitly supplied ifa address. */
	if (info->rti_ifa == NULL && ifaaddr != NULL) {
		ifa = ifa_ifwithaddr_psref(ifaaddr, psref);
		if (ifa != NULL)
			goto got;
	}

	/* Fall back to deriving it from ifa addr, gateway or destination. */
	sa = ifaaddr != NULL ? ifaaddr :
	    (gateway != NULL ? gateway : dst);
	if (sa != NULL && info->rti_ifp != NULL)
		ifa = ifaof_ifpforaddr_psref(sa, info->rti_ifp, psref);
	else if (dst != NULL && gateway != NULL)
		ifa = ifa_ifwithroute_psref(flags, dst, gateway, psref);
	else if (sa != NULL)
		ifa = ifa_ifwithroute_psref(flags, sa, sa, psref);
	if (ifa == NULL)
		return NULL;
got:
	if (ifa->ifa_getifa != NULL) {
		/* FIXME ifa_getifa is NOMPSAFE */
		ifa = (*ifa->ifa_getifa)(ifa, dst);
		if (ifa == NULL)
			return NULL;
		/* Take a reference on the ifaddr ifa_getifa handed back. */
		ifa_acquire(ifa, psref);
	}
	info->rti_ifa = ifa;
	if (info->rti_ifp == NULL)
		info->rti_ifp = ifa->ifa_ifp;
	return ifa;
}
1135
1136/*
1137 * If it suceeds and ret_nrt isn't NULL, refcnt of ret_nrt is incremented.
1138 * The caller has to rtfree it by itself.
1139 */
1140int
1141rtrequest1(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt)
1142{
1143	int s = splsoftnet(), ss;
1144	int error = 0, rc;
1145	struct rtentry *rt;
1146	rtbl_t *rtbl;
1147	struct ifaddr *ifa = NULL;
1148	struct sockaddr_storage maskeddst;
1149	const struct sockaddr *dst = info->rti_info[RTAX_DST];
1150	const struct sockaddr *gateway = info->rti_info[RTAX_GATEWAY];
1151	const struct sockaddr *netmask = info->rti_info[RTAX_NETMASK];
1152	int flags = info->rti_flags;
1153	struct psref psref_ifp, psref_ifa;
1154	int bound = 0;
1155	struct ifnet *ifp = NULL;
1156	bool need_to_release_ifa = true;
1157	bool need_unlock = true;
1158#define senderr(x) { error = x ; goto bad; }
1159
1160	RT_WLOCK();
1161
1162	bound = curlwp_bind();
1163	if ((rtbl = rt_gettable(dst->sa_family)) == NULL)
1164		senderr(ESRCH);
1165	if (flags & RTF_HOST)
1166		netmask = NULL;
1167	switch (req) {
1168	case RTM_DELETE:
1169		if (netmask) {
1170			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
1171			    netmask);
1172			dst = (struct sockaddr *)&maskeddst;
1173		}
1174		if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL)
1175			senderr(ESRCH);
1176		if ((rt = rt_deladdr(rtbl, dst, netmask)) == NULL)
1177			senderr(ESRCH);
1178		rt->rt_flags &= ~RTF_UP;
1179		ifa = rt->rt_ifa;
1180		if (ifa->ifa_flags & IFA_ROUTE &&
1181		    rt_ifa_connected(rt, ifa)) {
1182			RT_DPRINTF("rt->_rt_key = %p, ifa = %p, "
1183			    "deleted IFA_ROUTE\n",
1184			    (void *)rt->_rt_key, (void *)ifa);
1185			ifa->ifa_flags &= ~IFA_ROUTE;
1186		}
1187		if (ifa->ifa_rtrequest)
1188			ifa->ifa_rtrequest(RTM_DELETE, rt, info);
1189		ifa = NULL;
1190		rttrash++;
1191		if (ret_nrt) {
1192			*ret_nrt = rt;
1193			rt_ref(rt);
1194			RT_REFCNT_TRACE(rt);
1195		}
1196		rtcache_invalidate();
1197		RT_UNLOCK();
1198		need_unlock = false;
1199		rt_timer_remove_all(rt);
1200#if defined(INET) || defined(INET6)
1201		if (netmask != NULL)
1202			lltable_prefix_free(dst->sa_family, dst, netmask, 0);
1203#endif
1204		if (ret_nrt == NULL) {
1205			/* Adjust the refcount */
1206			rt_ref(rt);
1207			RT_REFCNT_TRACE(rt);
1208			rt_free(rt);
1209		}
1210		break;
1211
1212	case RTM_ADD:
1213		if (info->rti_ifa == NULL) {
1214			ifp = rt_getifp(info, &psref_ifp);
1215			ifa = rt_getifa(info, &psref_ifa);
1216			if (ifa == NULL)
1217				senderr(ENETUNREACH);
1218		} else {
1219			/* Caller should have a reference of ifa */
1220			ifa = info->rti_ifa;
1221			need_to_release_ifa = false;
1222		}
1223		rt = pool_get(&rtentry_pool, PR_NOWAIT);
1224		if (rt == NULL)
1225			senderr(ENOBUFS);
1226		memset(rt, 0, sizeof(*rt));
1227		rt->rt_flags = RTF_UP | (flags & ~RTF_DONTCHANGEIFA);
1228		LIST_INIT(&rt->rt_timer);
1229
1230		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1231		if (netmask) {
1232			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
1233			    netmask);
1234			rt_setkey(rt, (struct sockaddr *)&maskeddst, M_NOWAIT);
1235		} else {
1236			rt_setkey(rt, dst, M_NOWAIT);
1237		}
1238		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1239		if (rt_getkey(rt) == NULL ||
1240		    rt_setgate(rt, gateway) != 0) {
1241			pool_put(&rtentry_pool, rt);
1242			senderr(ENOBUFS);
1243		}
1244
1245		rt_set_ifa(rt, ifa);
1246		if (info->rti_info[RTAX_TAG] != NULL) {
1247			const struct sockaddr *tag;
1248			tag = rt_settag(rt, info->rti_info[RTAX_TAG]);
1249			if (tag == NULL)
1250				senderr(ENOBUFS);
1251		}
1252		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1253
1254		ss = pserialize_read_enter();
1255		if (info->rti_info[RTAX_IFP] != NULL) {
1256			struct ifaddr *ifa2;
1257			ifa2 = ifa_ifwithnet(info->rti_info[RTAX_IFP]);
1258			if (ifa2 != NULL)
1259				rt->rt_ifp = ifa2->ifa_ifp;
1260			else
1261				rt->rt_ifp = ifa->ifa_ifp;
1262		} else
1263			rt->rt_ifp = ifa->ifa_ifp;
1264		pserialize_read_exit(ss);
1265		cv_init(&rt->rt_cv, "rtentry");
1266		psref_target_init(&rt->rt_psref, rt_psref_class);
1267
1268		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1269		rc = rt_addaddr(rtbl, rt, netmask);
1270		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1271		if (rc != 0) {
1272			ifafree(ifa); /* for rt_set_ifa above */
1273			cv_destroy(&rt->rt_cv);
1274			rt_destroy(rt);
1275			pool_put(&rtentry_pool, rt);
1276			senderr(rc);
1277		}
1278		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1279		if (ifa->ifa_rtrequest)
1280			ifa->ifa_rtrequest(req, rt, info);
1281		if (need_to_release_ifa)
1282			ifa_release(ifa, &psref_ifa);
1283		ifa = NULL;
1284		if_put(ifp, &psref_ifp);
1285		ifp = NULL;
1286		RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
1287		if (ret_nrt) {
1288			*ret_nrt = rt;
1289			rt_ref(rt);
1290			RT_REFCNT_TRACE(rt);
1291		}
1292		rtcache_invalidate();
1293		RT_UNLOCK();
1294		need_unlock = false;
1295		break;
1296	case RTM_GET:
1297		if (netmask != NULL) {
1298			rt_maskedcopy(dst, (struct sockaddr *)&maskeddst,
1299			    netmask);
1300			dst = (struct sockaddr *)&maskeddst;
1301		}
1302		if ((rt = rt_lookup(rtbl, dst, netmask)) == NULL)
1303			senderr(ESRCH);
1304		if (ret_nrt != NULL) {
1305			*ret_nrt = rt;
1306			rt_ref(rt);
1307			RT_REFCNT_TRACE(rt);
1308		}
1309		break;
1310	}
1311bad:
1312	if (need_to_release_ifa)
1313		ifa_release(ifa, &psref_ifa);
1314	if_put(ifp, &psref_ifp);
1315	curlwp_bindx(bound);
1316	if (need_unlock)
1317		RT_UNLOCK();
1318	splx(s);
1319	return error;
1320}
1321
/*
 * Replace the gateway of rt with a private copy of gate.
 * Called with the routing table write lock held.
 */
int
rt_setgate(struct rtentry *rt, const struct sockaddr *gate)
{
	struct sockaddr *new, *old;

	KASSERT(RT_WLOCKED());
	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);

	new = sockaddr_dup(gate, M_ZERO | M_NOWAIT);
	if (new == NULL)
		return ENOMEM;

	/* Install the new gateway before freeing the old one. */
	old = rt->rt_gateway;
	rt->rt_gateway = new;
	if (old != NULL)
		sockaddr_free(old);

	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);

	if (rt->rt_flags & RTF_GATEWAY) {
		struct rtentry *gwrt;

		gwrt = rtalloc1_locked(gate, 1, false, true);
		/*
		 * If we switched gateways, grab the MTU from the new
		 * gateway route if the current MTU is greater than the
		 * MTU of the gateway.
		 * Note that, if the MTU of gateway is 0, we will reset the
		 * MTU of the route to run PMTUD again from scratch. XXX
		 */
		if (gwrt != NULL) {
			KASSERT(gwrt->_rt_key != NULL);
			RT_DPRINTF("gwrt->_rt_key = %p\n", gwrt->_rt_key);
			if ((rt->rt_rmx.rmx_locks & RTV_MTU) == 0 &&
			    rt->rt_rmx.rmx_mtu &&
			    rt->rt_rmx.rmx_mtu > gwrt->rt_rmx.rmx_mtu) {
				rt->rt_rmx.rmx_mtu = gwrt->rt_rmx.rmx_mtu;
			}
			rt_unref(gwrt);
		}
	}
	KASSERT(rt->_rt_key != NULL);
	RT_DPRINTF("rt->_rt_key = %p\n", (void *)rt->_rt_key);
	return 0;
}
1369
/*
 * Pick the new ifaddr/ifp for a route change request (rt_update).
 * On success the returned ifaddr is referenced via psref and *ifp is
 * set and referenced via psref_ifp; on failure both are NULL with no
 * references held.
 */
static struct ifaddr *
rt_update_get_ifa(const struct rt_addrinfo *info, const struct rtentry *rt,
    struct ifnet **ifp, struct psref *psref_ifp, struct psref *psref)
{
	struct ifaddr *ifa = NULL;

	*ifp = NULL;
	if (info->rti_info[RTAX_IFP] != NULL) {
		ifa = ifa_ifwithnet_psref(info->rti_info[RTAX_IFP], psref);
		if (ifa == NULL)
			goto next;
		/* An unnumbered interface cannot pin down the ifp. */
		if (ifa->ifa_ifp->if_flags & IFF_UNNUMBERED) {
			ifa_release(ifa, psref);
			ifa = NULL;
			goto next;
		}
		*ifp = ifa->ifa_ifp;
		if_acquire(*ifp, psref_ifp);
		if (info->rti_info[RTAX_IFA] == NULL &&
		    info->rti_info[RTAX_GATEWAY] == NULL)
			goto out;
		/* Keep the ifp reference but re-select the ifaddr below. */
		ifa_release(ifa, psref);
		if (info->rti_info[RTAX_IFA] == NULL) {
			/* route change <dst> <gw> -ifp <if> */
			ifa = ifaof_ifpforaddr_psref(
			    info->rti_info[RTAX_GATEWAY], *ifp, psref);
		} else {
			/* route change <dst> -ifp <if> -ifa <addr> */
			ifa = ifa_ifwithaddr_psref(info->rti_info[RTAX_IFA],
			    psref);
			if (ifa != NULL)
				goto out;
			ifa = ifaof_ifpforaddr_psref(info->rti_info[RTAX_IFA],
			    *ifp, psref);
		}
		goto out;
	}
next:
	if (info->rti_info[RTAX_IFA] != NULL) {
		/* route change <dst> <gw> -ifa <addr> */
		ifa = ifa_ifwithaddr_psref(info->rti_info[RTAX_IFA], psref);
		if (ifa != NULL)
			goto out;
	}
	if (info->rti_info[RTAX_GATEWAY] != NULL) {
		/* route change <dst> <gw> */
		ifa = ifa_ifwithroute_psref(rt->rt_flags, rt_getkey(rt),
		    info->rti_info[RTAX_GATEWAY], psref);
	}
out:
	/* Derive the ifp from the ifaddr if it wasn't fixed above. */
	if (ifa != NULL && *ifp == NULL) {
		*ifp = ifa->ifa_ifp;
		if_acquire(*ifp, psref_ifp);
	}
	/* No ifaddr found: drop any ifp reference taken so far. */
	if (ifa == NULL && *ifp != NULL) {
		if_put(*ifp, psref_ifp);
		*ifp = NULL;
	}
	return ifa;
}
1430
/*
 * Apply a route change request (gateway, tag, ifaddr/ifp, metrics,
 * flags) to an existing rtentry.  rtm is passed through to
 * rt_setmetrics().  Returns 0 on success or an errno.
 */
int
rt_update(struct rtentry *rt, struct rt_addrinfo *info, void *rtm)
{
	int error = 0;
	struct ifnet *ifp = NULL, *new_ifp = NULL;
	struct ifaddr *ifa = NULL, *new_ifa;
	struct psref psref_ifa, psref_new_ifa, psref_ifp, psref_new_ifp;
	bool newgw, ifp_changed = false;

	RT_WLOCK();
	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	newgw = info->rti_info[RTAX_GATEWAY] != NULL &&
	    sockaddr_cmp(info->rti_info[RTAX_GATEWAY], rt->rt_gateway) != 0;

	if (newgw || info->rti_info[RTAX_IFP] != NULL ||
	    info->rti_info[RTAX_IFA] != NULL) {
		ifp = rt_getifp(info, &psref_ifp);
		/* info refers ifp so we need to keep a reference */
		ifa = rt_getifa(info, &psref_ifa);
		if (ifa == NULL) {
			error = ENETUNREACH;
			goto out;
		}
	}
	if (newgw) {
		error = rt_setgate(rt, info->rti_info[RTAX_GATEWAY]);
		if (error != 0)
			goto out;
	}
	if (info->rti_info[RTAX_TAG]) {
		const struct sockaddr *tag;
		tag = rt_settag(rt, info->rti_info[RTAX_TAG]);
		if (tag == NULL) {
			error = ENOBUFS;
			goto out;
		}
	}
	/*
	 * New gateway could require new ifaddr, ifp;
	 * flags may also be different; ifp may be specified
	 * by ll sockaddr when protocol address is ambiguous
	 */
	new_ifa = rt_update_get_ifa(info, rt, &new_ifp, &psref_new_ifp,
	    &psref_new_ifa);
	if (new_ifa != NULL) {
		ifa_release(ifa, &psref_ifa);
		ifa = new_ifa;
	}
	if (ifa) {
		struct ifaddr *oifa = rt->rt_ifa;
		/* Only switch to an ifaddr/ifp that is still alive. */
		if (oifa != ifa && !ifa_is_destroying(ifa) &&
		    new_ifp != NULL && !if_is_deactivated(new_ifp)) {
			if (oifa && oifa->ifa_rtrequest)
				oifa->ifa_rtrequest(RTM_DELETE, rt, info);
			rt_replace_ifa(rt, ifa);
			rt->rt_ifp = new_ifp;
			ifp_changed = true;
		}
		if (new_ifa == NULL)
			ifa_release(ifa, &psref_ifa);
		/* To avoid ifa_release below */
		ifa = NULL;
	}
	ifa_release(new_ifa, &psref_new_ifa);
	/* The ifp may change even when the ifaddr did not. */
	if (new_ifp && rt->rt_ifp != new_ifp && !if_is_deactivated(new_ifp)) {
		rt->rt_ifp = new_ifp;
		ifp_changed = true;
	}
	rt_setmetrics(rtm, rt);
	if (rt->rt_flags != info->rti_flags) {
		rt->rt_flags = (info->rti_flags & ~PRESERVED_RTF) |
		    (rt->rt_flags & PRESERVED_RTF);
	}
	if (rt->rt_ifa->ifa_rtrequest)
		rt->rt_ifa->ifa_rtrequest(RTM_ADD, rt, info);
#if defined(INET) || defined(INET6)
	/* Stale L2 entries for the prefix belong to the old interface. */
	if (ifp_changed && rt_mask(rt) != NULL)
		lltable_prefix_free(rt_getkey(rt)->sa_family, rt_getkey(rt),
		    rt_mask(rt), 0);
#else
	(void)ifp_changed; /* XXX gcc */
#endif
out:
	ifa_release(ifa, &psref_ifa);
	if_put(new_ifp, &psref_new_ifp);
	if_put(ifp, &psref_ifp);

	RT_UNLOCK();

	return error;
}
1526
1527static void
1528rt_maskedcopy(const struct sockaddr *src, struct sockaddr *dst,
1529	const struct sockaddr *netmask)
1530{
1531	const char *netmaskp = &netmask->sa_data[0],
1532	           *srcp = &src->sa_data[0];
1533	char *dstp = &dst->sa_data[0];
1534	const char *maskend = (char *)dst + MIN(netmask->sa_len, src->sa_len);
1535	const char *srcend = (char *)dst + src->sa_len;
1536
1537	dst->sa_len = src->sa_len;
1538	dst->sa_family = src->sa_family;
1539
1540	while (dstp < maskend)
1541		*dstp++ = *srcp++ & *netmaskp++;
1542	if (dstp < srcend)
1543		memset(dstp, 0, (size_t)(srcend - dstp));
1544}
1545
1546/*
1547 * Inform the routing socket of a route change.
1548 */
1549void
1550rt_newmsg(const int cmd, const struct rtentry *rt)
1551{
1552	struct rt_addrinfo info;
1553
1554	memset((void *)&info, 0, sizeof(info));
1555	info.rti_info[RTAX_DST] = rt_getkey(rt);
1556	info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
1557	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1558	if (rt->rt_ifp) {
1559		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
1560		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1561	}
1562
1563	rt_missmsg(cmd, &info, rt->rt_flags, 0);
1564}
1565
1566/*
1567 * Inform the routing socket of a route change for RTF_DYNAMIC.
1568 */
1569void
1570rt_newmsg_dynamic(const int cmd, const struct rtentry *rt)
1571{
1572	struct rt_addrinfo info;
1573	struct sockaddr *gateway = rt->rt_gateway;
1574
1575	if (gateway == NULL)
1576		return;
1577
1578	switch(gateway->sa_family) {
1579#ifdef INET
1580	case AF_INET: {
1581		extern bool icmp_dynamic_rt_msg;
1582		if (!icmp_dynamic_rt_msg)
1583			return;
1584		break;
1585	}
1586#endif
1587#ifdef INET6
1588	case AF_INET6: {
1589		extern bool icmp6_dynamic_rt_msg;
1590		if (!icmp6_dynamic_rt_msg)
1591			return;
1592		break;
1593	}
1594#endif
1595	default:
1596		return;
1597	}
1598
1599	memset((void *)&info, 0, sizeof(info));
1600	info.rti_info[RTAX_DST] = rt_getkey(rt);
1601	info.rti_info[RTAX_GATEWAY] = gateway;
1602	info.rti_info[RTAX_NETMASK] = rt_mask(rt);
1603	if (rt->rt_ifp) {
1604		info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
1605		info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
1606	}
1607
1608	rt_missmsg(cmd, &info, rt->rt_flags, 0);
1609}
1610
1611/*
1612 * Set up or tear down a routing table entry, normally
1613 * for an interface.
1614 */
1615int
1616rtinit(struct ifaddr *ifa, int cmd, int flags)
1617{
1618	struct rtentry *rt;
1619	struct sockaddr *dst, *odst;
1620	struct sockaddr_storage maskeddst;
1621	struct rtentry *nrt = NULL;
1622	int error;
1623	struct rt_addrinfo info;
1624
1625	dst = flags & RTF_HOST ? ifa->ifa_dstaddr : ifa->ifa_addr;
1626	if (cmd == RTM_DELETE) {
1627		if ((flags & RTF_HOST) == 0 && ifa->ifa_netmask) {
1628			/* Delete subnet route for this interface */
1629			odst = dst;
1630			dst = (struct sockaddr *)&maskeddst;
1631			rt_maskedcopy(odst, dst, ifa->ifa_netmask);
1632		}
1633		if ((rt = rtalloc1(dst, 0)) != NULL) {
1634			if (rt->rt_ifa != ifa) {
1635				rt_unref(rt);
1636				return (flags & RTF_HOST) ? EHOSTUNREACH
1637							: ENETUNREACH;
1638			}
1639			rt_unref(rt);
1640		}
1641	}
1642	memset(&info, 0, sizeof(info));
1643	info.rti_ifa = ifa;
1644	info.rti_flags = flags | ifa->ifa_flags | RTF_DONTCHANGEIFA;
1645	info.rti_info[RTAX_DST] = dst;
1646	info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
1647
1648	/*
1649	 * XXX here, it seems that we are assuming that ifa_netmask is NULL
1650	 * for RTF_HOST.  bsdi4 passes NULL explicitly (via intermediate
1651	 * variable) when RTF_HOST is 1.  still not sure if i can safely
1652	 * change it to meet bsdi4 behavior.
1653	 */
1654	if (cmd != RTM_LLINFO_UPD)
1655		info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
1656	error = rtrequest1((cmd == RTM_LLINFO_UPD) ? RTM_GET : cmd, &info,
1657	    &nrt);
1658	if (error != 0)
1659		return error;
1660
1661	rt = nrt;
1662	RT_REFCNT_TRACE(rt);
1663	switch (cmd) {
1664	case RTM_DELETE:
1665		rt_newmsg(cmd, rt);
1666		rt_free(rt);
1667		break;
1668	case RTM_LLINFO_UPD:
1669		if (cmd == RTM_LLINFO_UPD && ifa->ifa_rtrequest != NULL)
1670			ifa->ifa_rtrequest(RTM_LLINFO_UPD, rt, &info);
1671		rt_newmsg(RTM_CHANGE, rt);
1672		rt_unref(rt);
1673		break;
1674	case RTM_ADD:
1675		KASSERT(rt->rt_ifa == ifa);
1676		rt_newmsg(cmd, rt);
1677		rt_unref(rt);
1678		RT_REFCNT_TRACE(rt);
1679		break;
1680	}
1681	return error;
1682}
1683
1684/*
1685 * Create a local route entry for the address.
1686 * Announce the addition of the address and the route to the routing socket.
1687 */
1688int
1689rt_ifa_addlocal(struct ifaddr *ifa)
1690{
1691	struct rtentry *rt;
1692	int e;
1693
1694	/* If there is no loopback entry, allocate one. */
1695	rt = rtalloc1(ifa->ifa_addr, 0);
1696#ifdef RT_DEBUG
1697	if (rt != NULL)
1698		dump_rt(rt);
1699#endif
1700	if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0 ||
1701	    (rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0)
1702	{
1703		struct rt_addrinfo info;
1704		struct rtentry *nrt;
1705
1706		memset(&info, 0, sizeof(info));
1707		info.rti_flags = RTF_HOST | RTF_LOCAL | RTF_DONTCHANGEIFA;
1708		info.rti_info[RTAX_DST] = ifa->ifa_addr;
1709		info.rti_info[RTAX_GATEWAY] =
1710		    (const struct sockaddr *)ifa->ifa_ifp->if_sadl;
1711		info.rti_ifa = ifa;
1712		nrt = NULL;
1713		e = rtrequest1(RTM_ADD, &info, &nrt);
1714		rt_addrmsg_rt(RTM_ADD, ifa, e, nrt);
1715		if (nrt != NULL) {
1716			KASSERT(nrt->rt_ifa == ifa);
1717#ifdef RT_DEBUG
1718			dump_rt(nrt);
1719#endif
1720			rt_unref(nrt);
1721			RT_REFCNT_TRACE(nrt);
1722		}
1723	} else {
1724		e = 0;
1725		rt_addrmsg(RTM_NEWADDR, ifa);
1726	}
1727	if (rt != NULL)
1728		rt_unref(rt);
1729	return e;
1730}
1731
1732/*
1733 * Remove the local route entry for the address.
1734 * Announce the removal of the address and the route to the routing socket.
1735 */
1736int
1737rt_ifa_remlocal(struct ifaddr *ifa, struct ifaddr *alt_ifa)
1738{
1739	struct rtentry *rt;
1740	int e = 0;
1741
1742	rt = rtalloc1(ifa->ifa_addr, 0);
1743
1744	/*
1745	 * Before deleting, check if a corresponding loopbacked
1746	 * host route surely exists.  With this check, we can avoid
1747	 * deleting an interface direct route whose destination is
1748	 * the same as the address being removed.  This can happen
1749	 * when removing a subnet-router anycast address on an
1750	 * interface attached to a shared medium.
1751	 */
1752	if (rt != NULL &&
1753	    (rt->rt_flags & RTF_HOST) &&
1754	    (rt->rt_ifp->if_flags & IFF_LOOPBACK))
1755	{
1756		/* If we cannot replace the route's ifaddr with the equivalent
1757		 * ifaddr of another interface, I believe it is safest to
1758		 * delete the route.
1759		 */
1760		if (alt_ifa == NULL) {
1761			e = rtdeletemsg(rt);
1762			if (e == 0) {
1763				rt_unref(rt);
1764				rt_free(rt);
1765				rt = NULL;
1766			}
1767			rt_addrmsg(RTM_DELADDR, ifa);
1768		} else {
1769#ifdef NET_MPSAFE
1770			int error = rt_update_prepare(rt);
1771			if (error == 0) {
1772				rt_replace_ifa(rt, alt_ifa);
1773				rt_update_finish(rt);
1774			} else {
1775				/*
1776				 * If error != 0, the rtentry is being
1777				 * destroyed, so doing nothing doesn't
1778				 * matter.
1779				 */
1780			}
1781#else
1782			rt_replace_ifa(rt, alt_ifa);
1783#endif
1784			rt_newmsg(RTM_CHANGE, rt);
1785		}
1786	} else
1787		rt_addrmsg(RTM_DELADDR, ifa);
1788	if (rt != NULL)
1789		rt_unref(rt);
1790	return e;
1791}
1792
1793/*
1794 * Route timer routines.  These routes allow functions to be called
1795 * for various routes at any time.  This is useful in supporting
1796 * path MTU discovery and redirect route deletion.
1797 *
1798 * This is similar to some BSDI internal functions, but it provides
1799 * for multiple queues for efficiency's sake...
1800 */
1801
1802LIST_HEAD(, rttimer_queue) rttimer_queue_head;
1803static int rt_init_done = 0;
1804
1805/*
1806 * Some subtle order problems with domain initialization mean that
1807 * we cannot count on this being run from rt_init before various
1808 * protocol initializations are done.  Therefore, we make sure
1809 * that this is run when the first queue is added...
1810 */
1811
1812static void rt_timer_work(struct work *, void *);
1813
/*
 * One-time initialization of the route timer machinery: the queue list,
 * the periodic callout and the workqueue that runs expirations.
 * Invoked lazily from rt_timer_queue_create() because protocol init
 * order cannot guarantee rt_init runs first (see comment above).
 */
static void
rt_timer_init(void)
{
	int error;

	assert(rt_init_done == 0);

	/* XXX should be in rt_init */
	rw_init(&rt_lock);

	LIST_INIT(&rttimer_queue_head);
	callout_init(&rt_timer_ch, CALLOUT_MPSAFE);
	error = workqueue_create(&rt_timer_wq, "rt_timer",
	    rt_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, RT_WQ_FLAGS);
	if (error)
		panic("%s: workqueue_create failed (%d)\n", __func__, error);
	/* Kick off the once-per-second expiry tick. */
	callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL);
	rt_init_done = 1;
}
1833
1834struct rttimer_queue *
1835rt_timer_queue_create(u_int timeout)
1836{
1837	struct rttimer_queue *rtq;
1838
1839	if (rt_init_done == 0)
1840		rt_timer_init();
1841
1842	R_Malloc(rtq, struct rttimer_queue *, sizeof *rtq);
1843	if (rtq == NULL)
1844		return NULL;
1845	memset(rtq, 0, sizeof(*rtq));
1846
1847	rtq->rtq_timeout = timeout;
1848	TAILQ_INIT(&rtq->rtq_head);
1849	RT_WLOCK();
1850	LIST_INSERT_HEAD(&rttimer_queue_head, rtq, rtq_link);
1851	RT_UNLOCK();
1852
1853	return rtq;
1854}
1855
/*
 * Change the timeout of an existing timer queue.  Pending entries are
 * re-evaluated against the new timeout on the next timer tick.
 */
void
rt_timer_queue_change(struct rttimer_queue *rtq, long timeout)
{

	rtq->rtq_timeout = timeout;
}
1862
/*
 * Fire and remove every timer on the queue.  Called with the routing
 * table write lock held; the lock is dropped around each callback and
 * reacquired afterwards, so the queue may be mutated concurrently.
 */
static void
rt_timer_queue_remove_all(struct rttimer_queue *rtq)
{
	struct rttimer *r;

	RT_ASSERT_WLOCK();

	while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
		/* Pin the rtentry across the unlocked callback. */
		rt_ref(r->rtt_rt); /* XXX */
		RT_REFCNT_TRACE(r->rtt_rt);
		RT_UNLOCK();
		/* The callback is responsible for rt_unref()ing rtt_rt. */
		(*r->rtt_func)(r->rtt_rt, r);
		pool_put(&rttimer_pool, r);
		RT_WLOCK();
		if (rtq->rtq_count > 0)
			rtq->rtq_count--;
		else
			printf("rt_timer_queue_remove_all: "
			    "rtq_count reached 0\n");
	}
}
1886
/*
 * Unlink a timer queue from the global list after firing all of its
 * pending timers.  The structure itself is not freed here.
 */
void
rt_timer_queue_destroy(struct rttimer_queue *rtq)
{

	RT_WLOCK();
	rt_timer_queue_remove_all(rtq);
	LIST_REMOVE(rtq, rtq_link);
	RT_UNLOCK();

	/*
	 * Caller is responsible for freeing the rttimer_queue structure.
	 */
}
1900
/* Return the number of timers currently on the queue (unlocked read). */
unsigned long
rt_timer_count(struct rttimer_queue *rtq)
{
	return rtq->rtq_count;
}
1906
/*
 * Discard every timer attached to rt without invoking the callbacks.
 * Used when the rtentry itself is being deleted.
 */
static void
rt_timer_remove_all(struct rtentry *rt)
{
	struct rttimer *r;

	RT_WLOCK();
	while ((r = LIST_FIRST(&rt->rt_timer)) != NULL) {
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
		if (r->rtt_queue->rtq_count > 0)
			r->rtt_queue->rtq_count--;
		else
			printf("rt_timer_remove_all: rtq_count reached 0\n");
		pool_put(&rttimer_pool, r);
	}
	RT_UNLOCK();
}
1924
/*
 * Arm a timer on rt that will invoke func after queue's timeout
 * expires.  At most one timer per (route, action) pair exists; an
 * existing one is recycled and its clock restarted.
 * Returns 0 or ENOBUFS.
 */
int
rt_timer_add(struct rtentry *rt,
	void (*func)(struct rtentry *, struct rttimer *),
	struct rttimer_queue *queue)
{
	struct rttimer *r;

	KASSERT(func != NULL);
	RT_WLOCK();
	/*
	 * If there's already a timer with this action, destroy it before
	 * we add a new one.
	 */
	LIST_FOREACH(r, &rt->rt_timer, rtt_link) {
		if (r->rtt_func == func)
			break;
	}
	if (r != NULL) {
		/* Recycle the existing rttimer rather than allocating. */
		LIST_REMOVE(r, rtt_link);
		TAILQ_REMOVE(&r->rtt_queue->rtq_head, r, rtt_next);
		if (r->rtt_queue->rtq_count > 0)
			r->rtt_queue->rtq_count--;
		else
			printf("rt_timer_add: rtq_count reached 0\n");
	} else {
		r = pool_get(&rttimer_pool, PR_NOWAIT);
		if (r == NULL) {
			RT_UNLOCK();
			return ENOBUFS;
		}
	}

	memset(r, 0, sizeof(*r));

	r->rtt_rt = rt;
	/* Timestamp now; the work loop compares against rtq_timeout. */
	r->rtt_time = time_uptime;
	r->rtt_func = func;
	r->rtt_queue = queue;
	LIST_INSERT_HEAD(&rt->rt_timer, r, rtt_link);
	TAILQ_INSERT_TAIL(&queue->rtq_head, r, rtt_next);
	r->rtt_queue->rtq_count++;

	RT_UNLOCK();

	return 0;
}
1971
/*
 * Workqueue handler: fire all expired timers on every queue, then
 * re-arm the periodic callout.  Queues are TAILQs ordered by insertion
 * time, so expiry scanning can stop at the first unexpired entry.
 */
static void
rt_timer_work(struct work *wk, void *arg)
{
	struct rttimer_queue *rtq;
	struct rttimer *r;

	RT_WLOCK();
	LIST_FOREACH(rtq, &rttimer_queue_head, rtq_link) {
		while ((r = TAILQ_FIRST(&rtq->rtq_head)) != NULL &&
		    (r->rtt_time + rtq->rtq_timeout) < time_uptime) {
			LIST_REMOVE(r, rtt_link);
			TAILQ_REMOVE(&rtq->rtq_head, r, rtt_next);
			/*
			 * Take a reference to avoid the rtentry is freed
			 * accidentally after RT_UNLOCK.  The callback
			 * (rtt_func) must rt_unref it by itself.
			 */
			rt_ref(r->rtt_rt);
			RT_REFCNT_TRACE(r->rtt_rt);
			RT_UNLOCK();
			(*r->rtt_func)(r->rtt_rt, r);
			pool_put(&rttimer_pool, r);
			RT_WLOCK();
			if (rtq->rtq_count > 0)
				rtq->rtq_count--;
			else
				printf("rt_timer_timer: rtq_count reached 0\n");
		}
	}
	RT_UNLOCK();

	/* Schedule the next tick one second out. */
	callout_reset(&rt_timer_ch, hz, rt_timer_timer, NULL);
}
2005
/*
 * Callout handler: defer the actual timer scan to the workqueue so the
 * callbacks can run in a sleepable thread context.
 */
static void
rt_timer_timer(void *arg)
{

	workqueue_enqueue(rt_timer_wq, &rt_timer_wk, NULL);
}
2012
/*
 * Populate an empty route cache from its stored destination.
 * flag is passed through to rtalloc1 (nonzero requests cloning).
 * Returns the cached rtentry or NULL when no usable route exists.
 */
static struct rtentry *
_rtcache_init(struct route *ro, int flag)
{
	struct rtentry *rt;

	rtcache_invariants(ro);
	KASSERT(ro->_ro_rt == NULL);

	if (rtcache_getdst(ro) == NULL)
		return NULL;
	rt = rtalloc1(rtcache_getdst(ro), flag);
	if (rt != NULL) {
		RT_RLOCK();
		/* Cache only live routes; record the generation we saw. */
		if (ISSET(rt->rt_flags, RTF_UP)) {
			ro->_ro_rt = rt;
			ro->ro_rtcache_generation = rtcache_generation;
			rtcache_ref(rt, ro);
		}
		RT_UNLOCK();
		/* Drop the rtalloc1 reference; the cache holds a psref. */
		rt_unref(rt);
	}

	rtcache_invariants(ro);
	return ro->_ro_rt;
}
2038
/* Initialize a route cache, allowing route cloning. */
struct rtentry *
rtcache_init(struct route *ro)
{

	return _rtcache_init(ro, 1);
}
2045
/* Initialize a route cache without cloning routes. */
struct rtentry *
rtcache_init_noclone(struct route *ro)
{

	return _rtcache_init(ro, 0);
}
2052
/* Discard the cached route and look the destination up again. */
struct rtentry *
rtcache_update(struct route *ro, int clone)
{

	ro->_ro_rt = NULL;
	return _rtcache_init(ro, clone);
}
2060
/*
 * Copy the destination and (validated) cached route of old_ro into
 * new_ro.  new_ro keeps its old contents if the destination can't be
 * duplicated.
 */
void
rtcache_copy(struct route *new_ro, struct route *old_ro)
{
	struct rtentry *rt;
	int ret;

	KASSERT(new_ro != old_ro);
	rtcache_invariants(new_ro);
	rtcache_invariants(old_ro);

	/* Validate under old_ro's psref; released at out. */
	rt = rtcache_validate(old_ro);

	if (rtcache_getdst(old_ro) == NULL)
		goto out;
	ret = rtcache_setdst(new_ro, rtcache_getdst(old_ro));
	if (ret != 0)
		goto out;

	RT_RLOCK();
	/* Stamp the copy with the current generation. */
	new_ro->_ro_rt = rt;
	new_ro->ro_rtcache_generation = rtcache_generation;
	RT_UNLOCK();
	rtcache_invariants(new_ro);
out:
	rtcache_unref(rt, old_ro);
	return;
}
2088
#if defined(RT_DEBUG) && defined(NET_MPSAFE)
/* Debug aid: log psref acquire/release events for a route cache. */
static void
rtcache_trace(const char *func, struct rtentry *rt, struct route *ro)
{
	char dst[64];

	sockaddr_format(ro->ro_sa, dst, 64);
	printf("trace: %s:\tdst=%s cpu=%d lwp=%p psref=%p target=%p\n", func, dst,
	    cpu_index(curcpu()), curlwp, &ro->ro_psref, &rt->rt_psref);
}
#define RTCACHE_PSREF_TRACE(rt, ro)	rtcache_trace(__func__, (rt), (ro))
#else
#define RTCACHE_PSREF_TRACE(rt, ro)	do {} while (0)
#endif
2103
/*
 * Acquire a psref on rt on behalf of the route cache ro (NET_MPSAFE
 * only; a no-op otherwise).  Binds the LWP to its CPU for psref.
 */
static void
rtcache_ref(struct rtentry *rt, struct route *ro)
{

	KASSERT(rt != NULL);

#ifdef NET_MPSAFE
	RTCACHE_PSREF_TRACE(rt, ro);
	ro->ro_bound = curlwp_bind();
	/* XXX Use a real caller's address */
	PSREF_DEBUG_FILL_RETURN_ADDRESS(&ro->ro_psref);
	psref_acquire(&ro->ro_psref, &rt->rt_psref, rt_psref_class);
#endif
}
2118
/*
 * Release the psref taken by rtcache_ref() and unbind the LWP.
 * Safe to call with rt == NULL.
 */
void
rtcache_unref(struct rtentry *rt, struct route *ro)
{

	if (rt == NULL)
		return;

#ifdef NET_MPSAFE
	psref_release(&ro->ro_psref, &rt->rt_psref, rt_psref_class);
	curlwp_bindx(ro->ro_bound);
	RTCACHE_PSREF_TRACE(rt, ro);
#endif
}
2132
/*
 * Return the cached route if it is still usable (cache generation
 * matches, route is RTF_UP and not mid-update), with a psref held via
 * rtcache_ref; otherwise NULL.  May sleep waiting for a concurrent
 * update when the context allows it.
 */
struct rtentry *
rtcache_validate(struct route *ro)
{
	struct rtentry *rt = NULL;

#ifdef NET_MPSAFE
retry:
#endif
	rtcache_invariants(ro);
	RT_RLOCK();
	if (ro->ro_rtcache_generation != rtcache_generation) {
		/* The cache is invalidated */
		rt = NULL;
		goto out;
	}

	rt = ro->_ro_rt;
	if (rt == NULL)
		goto out;

	/* A downed route must not be handed out. */
	if ((rt->rt_flags & RTF_UP) == 0) {
		rt = NULL;
		goto out;
	}
#ifdef NET_MPSAFE
	if (ISSET(rt->rt_flags, RTF_UPDATING)) {
		if (rt_wait_ok()) {
			RT_UNLOCK();

			/* We can wait until the update is complete */
			rt_update_wait();
			goto retry;
		} else {
			/* Cannot sleep here; treat as a cache miss. */
			rt = NULL;
		}
	} else
#endif
		rtcache_ref(rt, ro);
out:
	RT_UNLOCK();
	return rt;
}
2175
/*
 * Look up dst through the route cache: on a hit return the cached
 * route, otherwise reset the cache to dst and do a fresh lookup.
 * *hitp (if non-NULL) reports whether the cache was hit.
 */
struct rtentry *
rtcache_lookup2(struct route *ro, const struct sockaddr *dst,
    int clone, int *hitp)
{
	const struct sockaddr *odst;
	struct rtentry *rt = NULL;

	odst = rtcache_getdst(ro);
	if (odst == NULL)
		goto miss;

	/* Destination changed: the cached route is for someone else. */
	if (sockaddr_cmp(odst, dst) != 0) {
		rtcache_free(ro);
		goto miss;
	}

	rt = rtcache_validate(ro);
	if (rt == NULL) {
		/* Stale entry; fall through to a fresh lookup. */
		ro->_ro_rt = NULL;
		goto miss;
	}

	rtcache_invariants(ro);

	if (hitp != NULL)
		*hitp = 1;
	return rt;
miss:
	if (hitp != NULL)
		*hitp = 0;
	if (rtcache_setdst(ro, dst) == 0)
		rt = _rtcache_init(ro, clone);

	rtcache_invariants(ro);

	return rt;
}
2213
/* Empty the route cache: drop the cached route and free the sockaddr. */
void
rtcache_free(struct route *ro)
{

	ro->_ro_rt = NULL;
	if (ro->ro_sa != NULL) {
		sockaddr_free(ro->ro_sa);
		ro->ro_sa = NULL;
	}
	rtcache_invariants(ro);
}
2225
/*
 * Set the destination address of the route cache "ro" to "sa".
 *
 * If a sockaddr of the same family is already stored, the new address is
 * copied into the existing storage and only the cached rtentry pointer is
 * invalidated.  Otherwise the old cache state is freed and "sa" is
 * duplicated.  Returns 0 on success or ENOMEM when the duplicate cannot
 * be allocated (allocation is M_NOWAIT).
 */
int
rtcache_setdst(struct route *ro, const struct sockaddr *sa)
{
	KASSERT(sa != NULL);

	rtcache_invariants(ro);
	if (ro->ro_sa != NULL) {
		if (ro->ro_sa->sa_family == sa->sa_family) {
			/* Same family: reuse the existing sockaddr storage. */
			ro->_ro_rt = NULL;
			sockaddr_copy(ro->ro_sa, ro->ro_sa->sa_len, sa);
			rtcache_invariants(ro);
			return 0;
		}
		/* free ro_sa, wrong family */
		rtcache_free(ro);
	}

	KASSERT(ro->_ro_rt == NULL);

	if ((ro->ro_sa = sockaddr_dup(sa, M_ZERO | M_NOWAIT)) == NULL) {
		rtcache_invariants(ro);
		return ENOMEM;
	}
	rtcache_invariants(ro);
	return 0;
}
2252
2253static void
2254rtcache_percpu_init_cpu(void *p, void *arg __unused, struct cpu_info *ci __unused)
2255{
2256	struct route **rop = p;
2257
2258	/*
2259	 * We can't have struct route as percpu data because it can be destroyed
2260	 * over a memory enlargement processing of percpu.
2261	 */
2262	*rop = kmem_zalloc(sizeof(**rop), KM_SLEEP);
2263}
2264
2265percpu_t *
2266rtcache_percpu_alloc(void)
2267{
2268
2269	return percpu_create(sizeof(struct route *),
2270	    rtcache_percpu_init_cpu, NULL, NULL);
2271}
2272
2273const struct sockaddr *
2274rt_settag(struct rtentry *rt, const struct sockaddr *tag)
2275{
2276	if (rt->rt_tag != tag) {
2277		if (rt->rt_tag != NULL)
2278			sockaddr_free(rt->rt_tag);
2279		rt->rt_tag = sockaddr_dup(tag, M_ZERO | M_NOWAIT);
2280	}
2281	return rt->rt_tag;
2282}
2283
/* Return the route's tag sockaddr, or NULL if none is set. */
struct sockaddr *
rt_gettag(const struct rtentry *rt)
{
	return rt->rt_tag;
}
2289
2290int
2291rt_check_reject_route(const struct rtentry *rt, const struct ifnet *ifp)
2292{
2293
2294	if ((rt->rt_flags & RTF_REJECT) != 0) {
2295		/* Mimic looutput */
2296		if (ifp->if_flags & IFF_LOOPBACK)
2297			return (rt->rt_flags & RTF_HOST) ?
2298			    EHOSTUNREACH : ENETUNREACH;
2299		else if (rt->rt_rmx.rmx_expire == 0 ||
2300		    time_uptime < rt->rt_rmx.rmx_expire)
2301			return (rt->rt_flags & RTF_GATEWAY) ?
2302			    EHOSTUNREACH : EHOSTDOWN;
2303	}
2304
2305	return 0;
2306}
2307
/*
 * Delete every routing entry of "family" selected by the callback "f"
 * (as interpreted by rtbl_search_matched_entry(); "v" is its opaque
 * argument).  Each matched entry is found and referenced under the read
 * lock, then deleted with rtrequest(RTM_DELETE) after the lock has been
 * dropped.  If "notify" is true, an RTM_DELETE routing socket message is
 * sent for each deleted entry.  Loops until no matching entry remains.
 */
void
rt_delete_matched_entries(sa_family_t family, int (*f)(struct rtentry *, void *),
    void *v, bool notify)
{

	for (;;) {
		int s;
		int error;
		struct rtentry *rt, *retrt = NULL;

		RT_RLOCK();
		s = splsoftnet();
		rt = rtbl_search_matched_entry(family, f, v);
		if (rt == NULL) {
			/* Nothing (left) to delete. */
			splx(s);
			RT_UNLOCK();
			return;
		}
		/* Hold a reference across dropping the lock. */
		rt_ref(rt);
		RT_REFCNT_TRACE(rt);
		splx(s);
		RT_UNLOCK();

		error = rtrequest(RTM_DELETE, rt_getkey(rt), rt->rt_gateway,
		    rt_mask(rt), rt->rt_flags, &retrt);
		if (error == 0) {
			KASSERT(retrt == rt);
			KASSERT((retrt->rt_flags & RTF_UP) == 0);
			if (notify)
				rt_newmsg(RTM_DELETE, retrt);
			retrt->rt_ifp = NULL;
			rt_unref(rt);
			RT_REFCNT_TRACE(rt);
			rt_free(retrt);
		} else if (error == ESRCH) {
			/* Someone deleted the entry already. */
			rt_unref(rt);
			RT_REFCNT_TRACE(rt);
		} else {
			log(LOG_ERR, "%s: unable to delete rtentry @ %p, "
			    "error = %d\n", rt->rt_ifp->if_xname, rt, error);
			/* XXX how to treat this case? */
		}
	}
}
2353
2354static int
2355rt_walktree_locked(sa_family_t family, int (*f)(struct rtentry *, void *),
2356    void *v)
2357{
2358
2359	return rtbl_walktree(family, f, v);
2360}
2361
/*
 * For every routing entry of "family" selected by the callback "f" (as
 * interpreted by rtbl_search_matched_entry(); "v" is its opaque argument),
 * replace the entry's ifaddr with "ifa" and emit an RTM_CHANGE routing
 * message.  Under NET_MPSAFE the swap is bracketed by rt_update_prepare()
 * and rt_update_finish() so concurrent users observe a consistent entry.
 * Loops until no matching entry remains.
 */
void
rt_replace_ifa_matched_entries(sa_family_t family,
    int (*f)(struct rtentry *, void *), void *v, struct ifaddr *ifa)
{

	for (;;) {
		int s;
#ifdef NET_MPSAFE
		int error;
#endif
		struct rtentry *rt;

		RT_RLOCK();
		s = splsoftnet();
		rt = rtbl_search_matched_entry(family, f, v);
		if (rt == NULL) {
			/* Nothing (left) to update. */
			splx(s);
			RT_UNLOCK();
			return;
		}
		/* Hold a reference across dropping the lock. */
		rt_ref(rt);
		RT_REFCNT_TRACE(rt);
		splx(s);
		RT_UNLOCK();

#ifdef NET_MPSAFE
		error = rt_update_prepare(rt);
		if (error == 0) {
			rt_replace_ifa(rt, ifa);
			rt_update_finish(rt);
			rt_newmsg(RTM_CHANGE, rt);
		} else {
			/*
			 * If error != 0, the rtentry is being
			 * destroyed, so doing nothing doesn't
			 * matter.
			 */
		}
#else
		rt_replace_ifa(rt, ifa);
		rt_newmsg(RTM_CHANGE, rt);
#endif
		rt_unref(rt);
		RT_REFCNT_TRACE(rt);
	}
}
2408
2409int
2410rt_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v)
2411{
2412	int error;
2413
2414	RT_RLOCK();
2415	error = rt_walktree_locked(family, f, v);
2416	RT_UNLOCK();
2417
2418	return error;
2419}
2420
2421#ifdef DDB
2422
2423#include <machine/db_machdep.h>
2424#include <ddb/db_interface.h>
2425#include <ddb/db_output.h>
2426
2427#define	rt_expire rt_rmx.rmx_expire
2428
2429static void
2430db_print_sa(const struct sockaddr *sa)
2431{
2432	int len;
2433	const u_char *p;
2434
2435	if (sa == NULL) {
2436		db_printf("[NULL]");
2437		return;
2438	}
2439
2440	p = (const u_char *)sa;
2441	len = sa->sa_len;
2442	db_printf("[");
2443	while (len > 0) {
2444		db_printf("%d", *p);
2445		p++; len--;
2446		if (len) db_printf(",");
2447	}
2448	db_printf("]\n");
2449}
2450
2451static void
2452db_print_ifa(struct ifaddr *ifa)
2453{
2454	if (ifa == NULL)
2455		return;
2456	db_printf("  ifa_addr=");
2457	db_print_sa(ifa->ifa_addr);
2458	db_printf("  ifa_dsta=");
2459	db_print_sa(ifa->ifa_dstaddr);
2460	db_printf("  ifa_mask=");
2461	db_print_sa(ifa->ifa_netmask);
2462	db_printf("  flags=0x%x,refcnt=%d,metric=%d\n",
2463			  ifa->ifa_flags,
2464			  ifa->ifa_refcnt,
2465			  ifa->ifa_metric);
2466}
2467
2468/*
2469 * Function to pass to rt_walktree().
2470 * Return non-zero error to abort walk.
2471 */
2472static int
2473db_show_rtentry(struct rtentry *rt, void *w)
2474{
2475	db_printf("rtentry=%p", rt);
2476
2477	db_printf(" flags=0x%x refcnt=%d use=%"PRId64" expire=%"PRId64"\n",
2478			  rt->rt_flags, rt->rt_refcnt,
2479			  rt->rt_use, (uint64_t)rt->rt_expire);
2480
2481	db_printf(" key="); db_print_sa(rt_getkey(rt));
2482	db_printf(" mask="); db_print_sa(rt_mask(rt));
2483	db_printf(" gw="); db_print_sa(rt->rt_gateway);
2484
2485	db_printf(" ifp=%p ", rt->rt_ifp);
2486	if (rt->rt_ifp)
2487		db_printf("(%s)", rt->rt_ifp->if_xname);
2488	else
2489		db_printf("(NULL)");
2490
2491	db_printf(" ifa=%p\n", rt->rt_ifa);
2492	db_print_ifa(rt->rt_ifa);
2493
2494	db_printf(" gwroute=%p llinfo=%p\n",
2495			  rt->rt_gwroute, rt->rt_llinfo);
2496
2497	return 0;
2498}
2499
2500/*
2501 * Function to print all the route trees.
2502 * Use this from ddb:  "show routes"
2503 */
void
db_show_routes(db_expr_t addr, bool have_addr,
    db_expr_t count, const char *modif)
{

	/*
	 * The standard ddb command arguments are ignored; only the
	 * AF_INET routing table is dumped here.
	 */
	/* Taking RT_LOCK will fail if LOCKDEBUG is enabled. */
	rt_walktree_locked(AF_INET, db_show_rtentry, NULL);
}
2512#endif
2513