1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2020 Alexander V. Chernikov
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29#include "opt_inet.h"
30#include "opt_inet6.h"
31#include "opt_route.h"
32
33#include <sys/param.h>
34#include <sys/systm.h>
35#include <sys/malloc.h>
36#include <sys/mbuf.h>
37#include <sys/socket.h>
38#include <sys/sysctl.h>
39#include <sys/syslog.h>
40#include <sys/kernel.h>
41#include <sys/lock.h>
42#include <sys/rmlock.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/if_private.h>
47#include <net/if_dl.h>
48#include <net/vnet.h>
49#include <net/route.h>
50#include <net/route/route_ctl.h>
51#include <net/route/route_var.h>
52#include <net/route/nhop_utils.h>
53#include <net/route/nhop.h>
54#include <net/route/nhop_var.h>
55#include <netinet/in.h>
56#include <netinet6/scope6_var.h>
57#include <netinet6/in6_var.h>
58
59#define	DEBUG_MOD_NAME	route_ctl
60#define	DEBUG_MAX_LEVEL	LOG_DEBUG
61#include <net/route/route_debug.h>
62_DECLARE_DEBUG(LOG_INFO);
63
/*
 * This file contains the control plane routing table functions.
 *
 * All functions assume that they are called within the network epoch.
 */
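
/*
 * Example (sketch): callers are expected to wrap invocations of the
 * functions below in the network epoch, e.g.:
 *
 *	struct epoch_tracker et;
 *
 *	NET_EPOCH_ENTER(et);
 *	error = rib_add_route(fibnum, &info, &rc);
 *	NET_EPOCH_EXIT(et);
 */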
69
70union sockaddr_union {
71	struct sockaddr		sa;
72	struct sockaddr_in	sin;
73	struct sockaddr_in6	sin6;
74	char			_buf[32];
75};
76
77static int add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
78    struct rib_cmd_info *rc);
79static int change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
80    struct rt_addrinfo *info, struct route_nhop_data *nhd_orig,
81    struct rib_cmd_info *rc);
82
83static int add_route_flags(struct rib_head *rnh, struct rtentry *rt,
84    struct route_nhop_data *rnd_add, int op_flags, struct rib_cmd_info *rc);
85#ifdef ROUTE_MPATH
86static int add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
87    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
88    int op_flags, struct rib_cmd_info *rc);
89#endif
90
91static int add_route(struct rib_head *rnh, struct rtentry *rt,
92    struct route_nhop_data *rnd, struct rib_cmd_info *rc);
93static int delete_route(struct rib_head *rnh, struct rtentry *rt,
94    struct rib_cmd_info *rc);
95static int rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
96    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc);
97
98static bool fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
99    struct sockaddr **pmask);
100static int get_prio_from_info(const struct rt_addrinfo *info);
101static int nhop_get_prio(const struct nhop_object *nh);
102
103#ifdef ROUTE_MPATH
104static bool rib_can_multipath(struct rib_head *rh);
105#endif
106
107/* Per-vnet multipath routing configuration */
108SYSCTL_DECL(_net_route);
109#define	V_rib_route_multipath	VNET(rib_route_multipath)
110#ifdef ROUTE_MPATH
111#define _MP_FLAGS	CTLFLAG_RW
112#else
113#define _MP_FLAGS	CTLFLAG_RD
114#endif
115VNET_DEFINE(u_int, rib_route_multipath) = 1;
116SYSCTL_UINT(_net_route, OID_AUTO, multipath, _MP_FLAGS | CTLFLAG_VNET,
117    &VNET_NAME(rib_route_multipath), 0, "Enable route multipath");
118#undef _MP_FLAGS
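
/*
 * Example (sketch): with a ROUTE_MPATH kernel, "sysctl net.route.multipath=0"
 * disables multipath route installation for the current vnet; without
 * ROUTE_MPATH the knob is read-only.
 */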
119
120#ifdef ROUTE_MPATH
121VNET_DEFINE(u_int, fib_hash_outbound) = 0;
122SYSCTL_UINT(_net_route, OID_AUTO, hash_outbound, CTLFLAG_RD | CTLFLAG_VNET,
123    &VNET_NAME(fib_hash_outbound), 0,
124    "Compute flowid for locally-originated packets");
125
/* Default entropy to add to the hash calculation for outbound connections. */
127uint8_t mpath_entropy_key[MPATH_ENTROPY_KEY_LEN] = {
128	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
129	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
130	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
131	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
132	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
133};
134#endif
135
136#if defined(INET) && defined(INET6)
137FEATURE(ipv4_rfc5549_support, "Route IPv4 packets via IPv6 nexthops");
138#define V_rib_route_ipv6_nexthop VNET(rib_route_ipv6_nexthop)
139VNET_DEFINE_STATIC(u_int, rib_route_ipv6_nexthop) = 1;
140SYSCTL_UINT(_net_route, OID_AUTO, ipv6_nexthop, CTLFLAG_RW | CTLFLAG_VNET,
141    &VNET_NAME(rib_route_ipv6_nexthop), 0, "Enable IPv4 route via IPv6 Next Hop address");
142#endif
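
/*
 * Example (sketch): "sysctl net.route.ipv6_nexthop=0" disables installation
 * of IPv4 routes with IPv6 nexthops (RFC 5549) for the current vnet.
 */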
143
144/* Debug bits */
145SYSCTL_NODE(_net_route, OID_AUTO, debug, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
146
147static struct rib_head *
148get_rnh(uint32_t fibnum, const struct rt_addrinfo *info)
149{
150	struct rib_head *rnh;
151	struct sockaddr *dst;
152
	KASSERT((fibnum < rt_numfibs), ("%s: bad fibnum", __func__));
154
155	dst = info->rti_info[RTAX_DST];
156	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
157
158	return (rnh);
159}
160
161#if defined(INET) && defined(INET6)
162bool
163rib_can_4o6_nhop(void)
164{
165	return (!!V_rib_route_ipv6_nexthop);
166}
167#endif
168
169#ifdef ROUTE_MPATH
170static bool
171rib_can_multipath(struct rib_head *rh)
172{
173	int result;
174
175	CURVNET_SET(rh->rib_vnet);
176	result = !!V_rib_route_multipath;
177	CURVNET_RESTORE();
178
179	return (result);
180}
181
/*
 * Checks if nexthop @nh is multipath-eligible.
 * Nexthops without a gateway and redirect nexthops are not eligible.
 *
 * Returns true for a multipath-eligible nexthop,
 * false otherwise.
 */
bool
nhop_can_multipath(const struct nhop_object *nh)
{

	if ((nh->nh_flags & NHF_MULTIPATH) != 0)
		return (true);
	if ((nh->nh_flags & NHF_GATEWAY) == 0)
		return (false);
	if ((nh->nh_flags & NHF_REDIRECT) != 0)
		return (false);

	return (true);
}
202#endif
203
204static int
205get_info_weight(const struct rt_addrinfo *info, uint32_t default_weight)
206{
207	uint32_t weight;
208
209	if (info->rti_mflags & RTV_WEIGHT)
210		weight = info->rti_rmx->rmx_weight;
211	else
212		weight = default_weight;
	/* Keep the upper byte for admin distance purposes */
214	if (weight > RT_MAX_WEIGHT)
215		weight = RT_MAX_WEIGHT;
216	else if (weight == 0)
217		weight = default_weight;
218
219	return (weight);
220}
221
/*
 * File-local concept for distinguishing between normal routes and
 * RTF_PINNED routes that can override the "normal" ones.
 */
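/*
 * E.g. an RTF_PINNED route (such as an interface address prefix route)
 * keeps precedence over a protocol-installed route for the same prefix
 * unless the caller passes RTM_F_FORCE, which maps to NH_PRIORITY_HIGH
 * below.
 */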
226#define	NH_PRIORITY_HIGH	2
227#define	NH_PRIORITY_NORMAL	1
228static int
229get_prio_from_info(const struct rt_addrinfo *info)
230{
231	if (info->rti_flags & RTF_PINNED)
232		return (NH_PRIORITY_HIGH);
233	return (NH_PRIORITY_NORMAL);
234}
235
236static int
237nhop_get_prio(const struct nhop_object *nh)
238{
239	if (NH_IS_PINNED(nh))
240		return (NH_PRIORITY_HIGH);
241	return (NH_PRIORITY_NORMAL);
242}
243
244/*
245 * Check if specified @gw matches gw data in the nexthop @nh.
246 *
247 * Returns true if matches, false otherwise.
248 */
249bool
250match_nhop_gw(const struct nhop_object *nh, const struct sockaddr *gw)
251{
252
253	if (nh->gw_sa.sa_family != gw->sa_family)
254		return (false);
255
256	switch (gw->sa_family) {
257	case AF_INET:
258		return (nh->gw4_sa.sin_addr.s_addr ==
259		    ((const struct sockaddr_in *)gw)->sin_addr.s_addr);
260	case AF_INET6:
261		{
262			const struct sockaddr_in6 *gw6;
263			gw6 = (const struct sockaddr_in6 *)gw;
264
265			/*
266			 * Currently (2020-09) IPv6 gws in kernel have their
267			 * scope embedded. Once this becomes false, this code
268			 * has to be revisited.
269			 */
270			if (IN6_ARE_ADDR_EQUAL(&nh->gw6_sa.sin6_addr,
271			    &gw6->sin6_addr))
272				return (true);
273			return (false);
274		}
275	case AF_LINK:
276		{
277			const struct sockaddr_dl *sdl;
278			sdl = (const struct sockaddr_dl *)gw;
279			return (nh->gwl_sa.sdl_index == sdl->sdl_index);
280		}
281	default:
282		return (memcmp(&nh->gw_sa, gw, nh->gw_sa.sa_len) == 0);
283	}
284
285	/* NOTREACHED */
286	return (false);
287}
288
/*
 * Matches any nexthop with the given @gw.
 * Can be used as a rib_filter_f callback.
 */
293int
294rib_match_gw(const struct rtentry *rt, const struct nhop_object *nh, void *gw_sa)
295{
296	const struct sockaddr *gw = (const struct sockaddr *)gw_sa;
297
298	return (match_nhop_gw(nh, gw));
299}
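
/*
 * Example (sketch, assuming a struct sockaddr_in gw_sin filled by the
 * caller): purge every IPv4 route pointing at that gateway by passing
 * rib_match_gw() as the filter to rib_walk_del():
 *
 *	rib_walk_del(fibnum, AF_INET, rib_match_gw, (void *)&gw_sin, true);
 */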
300
301struct gw_filter_data {
302	const struct sockaddr *gw;
303	int count;
304};
305
/*
 * Matches the first occurrence of the gateway provided in @gwd.
 */
309static int
310match_gw_one(const struct rtentry *rt, const struct nhop_object *nh, void *_data)
311{
312	struct gw_filter_data *gwd = (struct gw_filter_data *)_data;
313
314	/* Return only first match to make rtsock happy */
315	if (match_nhop_gw(nh, gwd->gw) && gwd->count++ == 0)
316		return (1);
317	return (0);
318}
319
/*
 * Checks if the data in @info matches nexthop @nh.
 *
 * Returns 0 on success,
 * ESRCH if not matched,
 * ENOENT if the filter function returned false.
 */
327int
328check_info_match_nhop(const struct rt_addrinfo *info, const struct rtentry *rt,
329    const struct nhop_object *nh)
330{
331	const struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
332
	if (info->rti_filter != NULL) {
		if (info->rti_filter(rt, nh, info->rti_filterdata) == 0)
			return (ENOENT);
		else
			return (0);
	}
339	if ((gw != NULL) && !match_nhop_gw(nh, gw))
340		return (ESRCH);
341
342	return (0);
343}
344
345/*
346 * Runs exact prefix match based on @dst and @netmask.
347 * Returns matched @rtentry if found or NULL.
348 * If rtentry was found, saves nexthop / weight value into @rnd.
349 */
350static struct rtentry *
351lookup_prefix_bysa(struct rib_head *rnh, const struct sockaddr *dst,
352    const struct sockaddr *netmask, struct route_nhop_data *rnd)
353{
354	struct rtentry *rt;
355
356	RIB_LOCK_ASSERT(rnh);
357
358	rt = (struct rtentry *)rnh->rnh_lookup(dst, netmask, &rnh->head);
359	if (rt != NULL) {
360		rnd->rnd_nhop = rt->rt_nhop;
361		rnd->rnd_weight = rt->rt_weight;
362	} else {
363		rnd->rnd_nhop = NULL;
364		rnd->rnd_weight = 0;
365	}
366
367	return (rt);
368}
369
370struct rtentry *
371lookup_prefix_rt(struct rib_head *rnh, const struct rtentry *rt,
372    struct route_nhop_data *rnd)
373{
374	return (lookup_prefix_bysa(rnh, rt_key_const(rt), rt_mask_const(rt), rnd));
375}
376
377/*
378 * Runs exact prefix match based on dst/netmask from @info.
379 * Assumes RIB lock is held.
380 * Returns matched @rtentry if found or NULL.
381 * If rtentry was found, saves nexthop / weight value into @rnd.
382 */
383struct rtentry *
384lookup_prefix(struct rib_head *rnh, const struct rt_addrinfo *info,
385    struct route_nhop_data *rnd)
386{
387	struct rtentry *rt;
388
389	rt = lookup_prefix_bysa(rnh, info->rti_info[RTAX_DST],
390	    info->rti_info[RTAX_NETMASK], rnd);
391
392	return (rt);
393}
394
395const struct rtentry *
396rib_lookup_prefix_plen(struct rib_head *rnh, struct sockaddr *dst, int plen,
397    struct route_nhop_data *rnd)
398{
399	union sockaddr_union mask_storage;
400	struct sockaddr *netmask = &mask_storage.sa;
401
402	if (fill_pxmask_family(dst->sa_family, plen, dst, &netmask))
403		return (lookup_prefix_bysa(rnh, dst, netmask, rnd));
404	return (NULL);
405}
406
407static bool
408fill_pxmask_family(int family, int plen, struct sockaddr *_dst,
409    struct sockaddr **pmask)
410{
411	if (plen == -1) {
412		*pmask = NULL;
413		return (true);
414	}
415
416	switch (family) {
417#ifdef INET
418	case AF_INET:
419		{
420			struct sockaddr_in *mask = (struct sockaddr_in *)(*pmask);
			struct sockaddr_in *dst = (struct sockaddr_in *)_dst;
422
423			memset(mask, 0, sizeof(*mask));
424			mask->sin_family = family;
425			mask->sin_len = sizeof(*mask);
426			if (plen == 32)
427				*pmask = NULL;
428			else if (plen > 32 || plen < 0)
429				return (false);
430			else {
431				uint32_t daddr, maddr;
432				maddr = htonl(plen ? ~((1 << (32 - plen)) - 1) : 0);
433				mask->sin_addr.s_addr = maddr;
434				daddr = dst->sin_addr.s_addr;
435				daddr = htonl(ntohl(daddr) & ntohl(maddr));
436				dst->sin_addr.s_addr = daddr;
437			}
438			return (true);
439		}
440		break;
441#endif
442#ifdef INET6
443	case AF_INET6:
444		{
445			struct sockaddr_in6 *mask = (struct sockaddr_in6 *)(*pmask);
446			struct sockaddr_in6 *dst = (struct sockaddr_in6 *)_dst;
447
448			memset(mask, 0, sizeof(*mask));
449			mask->sin6_family = family;
450			mask->sin6_len = sizeof(*mask);
451			if (plen == 128)
452				*pmask = NULL;
453			else if (plen > 128 || plen < 0)
454				return (false);
455			else {
456				ip6_writemask(&mask->sin6_addr, plen);
457				IN6_MASK_ADDR(&dst->sin6_addr, &mask->sin6_addr);
458			}
459			return (true);
460		}
461		break;
462#endif
463	}
464	return (false);
465}
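
/*
 * For example, for AF_INET and plen == 24 the computed mask is
 * htonl(0xffffff00) (255.255.255.0) and @_dst is masked in place, so a
 * dst of 192.0.2.77 becomes the 192.0.2.0/24 prefix key.
 */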
466
/*
 * Attempts to add the @dst/plen prefix with nexthop/nexthop group data @rnd
 * to the routing table.
 *
 * @fibnum: verified kernel rtable id to insert route to
 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
 * @plen: prefix length (or -1 if host route or not applicable for AF)
 * @rnd: nexthop/nexthop group and weight to attach to the prefix
 * @op_flags: combination of RTM_F_ flags
 * @rc: storage to report operation result
 *
 * Returns 0 on success.
 */
479int
480rib_add_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
481    struct route_nhop_data *rnd, int op_flags, struct rib_cmd_info *rc)
482{
483	union sockaddr_union mask_storage;
484	struct sockaddr *netmask = &mask_storage.sa;
485	struct rtentry *rt = NULL;
486
487	NET_EPOCH_ASSERT();
488
489	bzero(rc, sizeof(struct rib_cmd_info));
490	rc->rc_cmd = RTM_ADD;
491
492	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
493	if (rnh == NULL)
494		return (EAFNOSUPPORT);
495
496	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
497		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
498		return (EINVAL);
499	}
500
501	if (op_flags & RTM_F_CREATE) {
502		if ((rt = rt_alloc(rnh, dst, netmask)) == NULL) {
503			FIB_RH_LOG(LOG_INFO, rnh, "rtentry allocation failed");
504			return (ENOMEM);
505		}
506	} else {
507		struct route_nhop_data rnd_tmp;
508		RIB_RLOCK_TRACKER;
509
510		RIB_RLOCK(rnh);
511		rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd_tmp);
512		RIB_RUNLOCK(rnh);
513
514		if (rt == NULL)
515			return (ESRCH);
516	}
517
518	return (add_route_flags(rnh, rt, rnd, op_flags, rc));
519}
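
/*
 * Example (sketch, assuming an IPv4 nexthop @nh already allocated for
 * @fibnum): install 192.0.2.0/24 via @nh, creating the prefix if needed.
 *
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_len = sizeof(struct sockaddr_in),
 *		.sin_addr.s_addr = htonl(0xC0000200),
 *	};
 *	struct route_nhop_data rnd = {
 *		.rnd_nhop = nh,
 *		.rnd_weight = RT_DEFAULT_WEIGHT,
 *	};
 *	struct rib_cmd_info rc;
 *	int error;
 *
 *	NET_EPOCH_ASSERT();
 *	error = rib_add_route_px(fibnum, (struct sockaddr *)&dst, 24,
 *	    &rnd, RTM_F_CREATE, &rc);
 */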
520
/*
 * Attempts to delete the @dst/plen prefix matching gateway @gw from the
 *  routing table.
 *
 * @fibnum: rtable id to remove route from
 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
 * @plen: prefix length (or -1 if host route or not applicable for AF)
 * @gw: gateway to match
 * @op_flags: combination of RTM_F_ flags
 * @rc: storage to report operation result
 *
 * Returns 0 on success.
 */
534int
535rib_del_route_px_gw(uint32_t fibnum, struct sockaddr *dst, int plen,
536    const struct sockaddr *gw, int op_flags, struct rib_cmd_info *rc)
537{
538	struct gw_filter_data gwd = { .gw = gw };
539
540	return (rib_del_route_px(fibnum, dst, plen, match_gw_one, &gwd, op_flags, rc));
541}
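
/*
 * Example (sketch): remove the 192.0.2.0/24 route installed in the sketch
 * above, but only if it still points at gateway @gw:
 *
 *	error = rib_del_route_px_gw(fibnum, (struct sockaddr *)&dst, 24,
 *	    gw, 0, &rc);
 */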
542
/*
 * Attempts to delete the @dst/plen prefix matching @filter_func from the
 *  routing table.
 *
 * @fibnum: rtable id to remove route from
 * @dst: verified kernel-originated sockaddr, can be masked if plen non-empty
 * @plen: prefix length (or -1 if host route or not applicable for AF)
 * @filter_func: func to be called for each nexthop of the prefix for matching
 * @filter_arg: argument to pass to @filter_func
 * @op_flags: combination of RTM_F_ flags
 * @rc: storage to report operation result
 *
 * Returns 0 on success.
 */
557int
558rib_del_route_px(uint32_t fibnum, struct sockaddr *dst, int plen,
559    rib_filter_f_t *filter_func, void *filter_arg, int op_flags,
560    struct rib_cmd_info *rc)
561{
562	union sockaddr_union mask_storage;
563	struct sockaddr *netmask = &mask_storage.sa;
564	int error;
565
566	NET_EPOCH_ASSERT();
567
568	bzero(rc, sizeof(struct rib_cmd_info));
569	rc->rc_cmd = RTM_DELETE;
570
571	struct rib_head *rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
572	if (rnh == NULL)
573		return (EAFNOSUPPORT);
574
575	if (dst->sa_len > sizeof(mask_storage)) {
576		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too big: %d", dst->sa_len);
577		return (EINVAL);
578	}
579
580	if (!fill_pxmask_family(dst->sa_family, plen, dst, &netmask)) {
581		FIB_RH_LOG(LOG_DEBUG, rnh, "error: invalid plen %d", plen);
582		return (EINVAL);
583	}
584
585	int prio = (op_flags & RTM_F_FORCE) ? NH_PRIORITY_HIGH : NH_PRIORITY_NORMAL;
586
587	RIB_WLOCK(rnh);
588	struct route_nhop_data rnd;
589	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
590	if (rt != NULL) {
591		error = rt_delete_conditional(rnh, rt, prio, filter_func,
592		    filter_arg, rc);
593	} else
594		error = ESRCH;
595	RIB_WUNLOCK(rnh);
596
597	if (error != 0)
598		return (error);
599
600	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
601
602	if (rc->rc_cmd == RTM_DELETE)
603		rt_free(rc->rc_rt);
604#ifdef ROUTE_MPATH
605	else {
606		/*
607		 * Deleting 1 path may result in RTM_CHANGE to
608		 * a different mpath group/nhop.
609		 * Free old mpath group.
610		 */
611		nhop_free_any(rc->rc_nh_old);
612	}
613#endif
614
615	return (0);
616}
617
/*
 * Tries to copy route @rt from one rtable to the rtable specified by @rh_dst.
 * @rt: route to copy.
 * @rnd_src: nhop and weight. Multipath routes are not supported.
 * @rh_dst: target rtable.
 * @rc: operation result storage
 *
 * Returns 0 on success.
 */
627int
628rib_copy_route(struct rtentry *rt, const struct route_nhop_data *rnd_src,
629    struct rib_head *rh_dst, struct rib_cmd_info *rc)
630{
631	struct nhop_object __diagused *nh_src = rnd_src->rnd_nhop;
632	int error;
633
634	MPASS((nh_src->nh_flags & NHF_MULTIPATH) == 0);
635
636	IF_DEBUG_LEVEL(LOG_DEBUG2) {
637		char nhbuf[NHOP_PRINT_BUFSIZE], rtbuf[NHOP_PRINT_BUFSIZE];
638		nhop_print_buf_any(nh_src, nhbuf, sizeof(nhbuf));
639		rt_print_buf(rt, rtbuf, sizeof(rtbuf));
640		FIB_RH_LOG(LOG_DEBUG2, rh_dst, "copying %s -> %s from fib %u",
641		    rtbuf, nhbuf, nhop_get_fibnum(nh_src));
642	}
643	struct nhop_object *nh = nhop_alloc(rh_dst->rib_fibnum, rh_dst->rib_family);
644	if (nh == NULL) {
645		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to allocate new nexthop");
646		return (ENOMEM);
647	}
648	nhop_copy(nh, rnd_src->rnd_nhop);
649	nhop_set_origin(nh, nhop_get_origin(rnd_src->rnd_nhop));
650	nhop_set_fibnum(nh, rh_dst->rib_fibnum);
651	nh = nhop_get_nhop_internal(rh_dst, nh, &error);
652	if (error != 0) {
653		FIB_RH_LOG(LOG_INFO, rh_dst,
654		    "unable to finalize new nexthop: error %d", error);
655		return (ENOMEM);
656	}
657
658	struct rtentry *rt_new = rt_alloc(rh_dst, rt_key(rt), rt_mask(rt));
659	if (rt_new == NULL) {
660		FIB_RH_LOG(LOG_INFO, rh_dst, "unable to create new rtentry");
661		nhop_free(nh);
662		return (ENOMEM);
663	}
664
665	struct route_nhop_data rnd = {
666		.rnd_nhop = nh,
667		.rnd_weight = rnd_src->rnd_weight
668	};
669	int op_flags = RTM_F_CREATE | (NH_IS_PINNED(nh) ? RTM_F_FORCE : 0);
670	error = add_route_flags(rh_dst, rt_new, &rnd, op_flags, rc);
671
672	if (error != 0) {
673		IF_DEBUG_LEVEL(LOG_DEBUG2) {
674			char buf[NHOP_PRINT_BUFSIZE];
675			rt_print_buf(rt_new, buf, sizeof(buf));
676			FIB_RH_LOG(LOG_DEBUG, rh_dst,
677			    "Unable to add route %s: error %d", buf, error);
678		}
679		nhop_free(nh);
680		rt_free_immediate(rt_new);
681	}
682	return (error);
683}
684
685/*
686 * Adds route defined by @info into the kernel table specified by @fibnum and
687 * sa_family in @info->rti_info[RTAX_DST].
688 *
689 * Returns 0 on success and fills in operation metadata into @rc.
690 */
691int
692rib_add_route(uint32_t fibnum, struct rt_addrinfo *info,
693    struct rib_cmd_info *rc)
694{
695	struct rib_head *rnh;
696	int error;
697
698	NET_EPOCH_ASSERT();
699
700	rnh = get_rnh(fibnum, info);
701	if (rnh == NULL)
702		return (EAFNOSUPPORT);
703
704	/*
705	 * Check consistency between RTF_HOST flag and netmask
706	 * existence.
707	 */
708	if (info->rti_flags & RTF_HOST)
709		info->rti_info[RTAX_NETMASK] = NULL;
710	else if (info->rti_info[RTAX_NETMASK] == NULL) {
711		FIB_RH_LOG(LOG_DEBUG, rnh, "error: no RTF_HOST and empty netmask");
712		return (EINVAL);
713	}
714
715	bzero(rc, sizeof(struct rib_cmd_info));
716	rc->rc_cmd = RTM_ADD;
717
718	error = add_route_byinfo(rnh, info, rc);
719	if (error == 0)
720		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
721
722	return (error);
723}
724
725static int
726add_route_byinfo(struct rib_head *rnh, struct rt_addrinfo *info,
727    struct rib_cmd_info *rc)
728{
729	struct route_nhop_data rnd_add;
730	struct nhop_object *nh;
731	struct rtentry *rt;
732	struct sockaddr *dst, *gateway, *netmask;
733	int error;
734
735	dst = info->rti_info[RTAX_DST];
736	gateway = info->rti_info[RTAX_GATEWAY];
737	netmask = info->rti_info[RTAX_NETMASK];
738
739	if ((info->rti_flags & RTF_GATEWAY) && !gateway) {
740		FIB_RH_LOG(LOG_DEBUG, rnh, "error: RTF_GATEWAY set with empty gw");
741		return (EINVAL);
742	}
743	if (dst && gateway && !nhop_check_gateway(dst->sa_family, gateway->sa_family)) {
744		FIB_RH_LOG(LOG_DEBUG, rnh,
745		    "error: invalid dst/gateway family combination (%d, %d)",
746		    dst->sa_family, gateway->sa_family);
747		return (EINVAL);
748	}
749
750	if (dst->sa_len > sizeof(((struct rtentry *)NULL)->rt_dstb)) {
751		FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large: %d",
752		    dst->sa_len);
753		return (EINVAL);
754	}
755
756	if (info->rti_ifa == NULL) {
757		error = rt_getifa_fib(info, rnh->rib_fibnum);
758		if (error)
759			return (error);
760	}
761
762	if ((rt = rt_alloc(rnh, dst, netmask)) == NULL)
763		return (ENOBUFS);
764
765	error = nhop_create_from_info(rnh, info, &nh);
766	if (error != 0) {
767		rt_free_immediate(rt);
768		return (error);
769	}
770
771	rnd_add.rnd_nhop = nh;
772	rnd_add.rnd_weight = get_info_weight(info, RT_DEFAULT_WEIGHT);
773
774	int op_flags = RTM_F_CREATE;
775	if (get_prio_from_info(info) == NH_PRIORITY_HIGH)
776		op_flags |= RTM_F_FORCE;
777	else
778		op_flags |= RTM_F_APPEND;
779	return (add_route_flags(rnh, rt, &rnd_add, op_flags, rc));
781}
782
783static int
784add_route_flags(struct rib_head *rnh, struct rtentry *rt, struct route_nhop_data *rnd_add,
785    int op_flags, struct rib_cmd_info *rc)
786{
787	struct route_nhop_data rnd_orig;
788	struct nhop_object *nh;
789	struct rtentry *rt_orig;
790	int error = 0;
791
792	MPASS(rt != NULL);
793
794	nh = rnd_add->rnd_nhop;
795
796	RIB_WLOCK(rnh);
797
798	rt_orig = lookup_prefix_rt(rnh, rt, &rnd_orig);
799
800	if (rt_orig == NULL) {
801		if (op_flags & RTM_F_CREATE)
802			error = add_route(rnh, rt, rnd_add, rc);
803		else
804			error = ESRCH; /* no entry but creation was not required */
805		RIB_WUNLOCK(rnh);
806		if (error != 0)
807			goto out;
808		return (0);
809	}
810
811	if (op_flags & RTM_F_EXCL) {
		/* We have an existing route in the RIB but are not allowed to replace it. */
813		RIB_WUNLOCK(rnh);
814		error = EEXIST;
815		goto out;
816	}
817
818	/* Now either append or replace */
819	if (op_flags & RTM_F_REPLACE) {
820		if (nhop_get_prio(rnd_orig.rnd_nhop) > nhop_get_prio(rnd_add->rnd_nhop)) {
821			/* Old path is "better" (e.g. has PINNED flag set) */
822			RIB_WUNLOCK(rnh);
823			error = EEXIST;
824			goto out;
825		}
826		change_route(rnh, rt_orig, rnd_add, rc);
827		RIB_WUNLOCK(rnh);
828		nh = rc->rc_nh_old;
829		goto out;
830	}
831
832	RIB_WUNLOCK(rnh);
833
834#ifdef ROUTE_MPATH
835	if ((op_flags & RTM_F_APPEND) && rib_can_multipath(rnh) &&
836	    nhop_can_multipath(rnd_add->rnd_nhop) &&
837	    nhop_can_multipath(rnd_orig.rnd_nhop)) {
838
839		for (int i = 0; i < RIB_MAX_RETRIES; i++) {
840			error = add_route_flags_mpath(rnh, rt_orig, rnd_add, &rnd_orig,
841			    op_flags, rc);
842			if (error != EAGAIN)
843				break;
844			RTSTAT_INC(rts_add_retry);
845		}
846
847		/*
848		 *  Original nhop reference is unused in any case.
849		 */
850		nhop_free_any(rnd_add->rnd_nhop);
851		if (op_flags & RTM_F_CREATE) {
852			if (error != 0 || rc->rc_cmd != RTM_ADD)
853				rt_free_immediate(rt);
854		}
855		return (error);
856	}
857#endif
858	/* Out of options - free state and return error */
859	error = EEXIST;
860out:
861	if (op_flags & RTM_F_CREATE)
862		rt_free_immediate(rt);
863	nhop_free_any(nh);
864
865	return (error);
866}
867
868#ifdef ROUTE_MPATH
869static int
870add_route_flags_mpath(struct rib_head *rnh, struct rtentry *rt,
871    struct route_nhop_data *rnd_add, struct route_nhop_data *rnd_orig,
872    int op_flags, struct rib_cmd_info *rc)
873{
874	RIB_RLOCK_TRACKER;
875	struct route_nhop_data rnd_new;
876	int error = 0;
877
878	error = nhgrp_get_addition_group(rnh, rnd_orig, rnd_add, &rnd_new);
879	if (error != 0) {
880		if (error == EAGAIN) {
881			/*
882			 * Group creation failed, most probably because
883			 * @rnd_orig data got scheduled for deletion.
884			 * Refresh @rnd_orig data and retry.
885			 */
886			RIB_RLOCK(rnh);
887			lookup_prefix_rt(rnh, rt, rnd_orig);
888			RIB_RUNLOCK(rnh);
			if (rnd_orig->rnd_nhop == NULL &&
			    !(op_flags & RTM_F_CREATE)) {
				/* In this iteration the route doesn't exist */
891				error = ENOENT;
892			}
893		}
894		return (error);
895	}
896	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
897	if (error != 0)
898		return (error);
899
900	if (V_fib_hash_outbound == 0 && NH_IS_NHGRP(rc->rc_nh_new)) {
901		/*
902		 * First multipath route got installed. Enable local
903		 * outbound connections hashing.
904		 */
905		if (bootverbose)
906			printf("FIB: enabled flowid calculation for locally-originated packets\n");
907		V_fib_hash_outbound = 1;
908	}
909
910	return (0);
911}
912#endif
913
914/*
915 * Removes route defined by @info from the kernel table specified by @fibnum and
916 * sa_family in @info->rti_info[RTAX_DST].
917 *
918 * Returns 0 on success and fills in operation metadata into @rc.
919 */
920int
921rib_del_route(uint32_t fibnum, struct rt_addrinfo *info, struct rib_cmd_info *rc)
922{
923	struct rib_head *rnh;
924	struct sockaddr *dst, *netmask;
925	struct sockaddr_storage mdst;
926	int error;
927
928	NET_EPOCH_ASSERT();
929
930	rnh = get_rnh(fibnum, info);
931	if (rnh == NULL)
932		return (EAFNOSUPPORT);
933
934	bzero(rc, sizeof(struct rib_cmd_info));
935	rc->rc_cmd = RTM_DELETE;
936
937	dst = info->rti_info[RTAX_DST];
938	netmask = info->rti_info[RTAX_NETMASK];
939
940	if (netmask != NULL) {
941		/* Ensure @dst is always properly masked */
942		if (dst->sa_len > sizeof(mdst)) {
943			FIB_RH_LOG(LOG_DEBUG, rnh, "error: dst->sa_len too large");
944			return (EINVAL);
945		}
946		rt_maskedcopy(dst, (struct sockaddr *)&mdst, netmask);
947		dst = (struct sockaddr *)&mdst;
948	}
949
950	rib_filter_f_t *filter_func = NULL;
951	void *filter_arg = NULL;
952	struct gw_filter_data gwd = { .gw = info->rti_info[RTAX_GATEWAY] };
953
954	if (info->rti_filter != NULL) {
955		filter_func = info->rti_filter;
956		filter_arg = info->rti_filterdata;
957	} else if (gwd.gw != NULL) {
958		filter_func = match_gw_one;
959		filter_arg = &gwd;
960	}
961
962	int prio = get_prio_from_info(info);
963
964	RIB_WLOCK(rnh);
965	struct route_nhop_data rnd;
966	struct rtentry *rt = lookup_prefix_bysa(rnh, dst, netmask, &rnd);
967	if (rt != NULL) {
968		error = rt_delete_conditional(rnh, rt, prio, filter_func,
969		    filter_arg, rc);
970	} else
971		error = ESRCH;
972	RIB_WUNLOCK(rnh);
973
974	if (error != 0)
975		return (error);
976
977	rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
978
979	if (rc->rc_cmd == RTM_DELETE)
980		rt_free(rc->rc_rt);
981#ifdef ROUTE_MPATH
982	else {
983		/*
984		 * Deleting 1 path may result in RTM_CHANGE to
985		 * a different mpath group/nhop.
986		 * Free old mpath group.
987		 */
988		nhop_free_any(rc->rc_nh_old);
989	}
990#endif
991
992	return (0);
993}
994
/*
 * Conditionally unlinks rtentry paths from @rnh matching @cb.
 * Returns 0 on success with the operation result stored in @rc.
 * On error, returns:
 * ESRCH - if the prefix was not found or the filter function did not match
 * EADDRINUSE - if trying to delete a higher-priority route.
 */
1002static int
1003rt_delete_conditional(struct rib_head *rnh, struct rtentry *rt,
1004    int prio, rib_filter_f_t *cb, void *cbdata, struct rib_cmd_info *rc)
1005{
1006	struct nhop_object *nh = rt->rt_nhop;
1007
1008#ifdef ROUTE_MPATH
1009	if (NH_IS_NHGRP(nh)) {
1010		struct nhgrp_object *nhg = (struct nhgrp_object *)nh;
1011		struct route_nhop_data rnd;
1012		int error;
1013
1014		if (cb == NULL)
1015			return (ESRCH);
1016		error = nhgrp_get_filtered_group(rnh, rt, nhg, cb, cbdata, &rnd);
1017		if (error == 0) {
1018			if (rnd.rnd_nhgrp == nhg) {
1019				/* No match, unreference new group and return. */
1020				nhop_free_any(rnd.rnd_nhop);
1021				return (ESRCH);
1022			}
1023			error = change_route(rnh, rt, &rnd, rc);
1024		}
1025		return (error);
1026	}
1027#endif
1028	if (cb != NULL && !cb(rt, nh, cbdata))
1029		return (ESRCH);
1030
1031	if (prio < nhop_get_prio(nh))
1032		return (EADDRINUSE);
1033
1034	return (delete_route(rnh, rt, rc));
1035}
1036
1037int
1038rib_change_route(uint32_t fibnum, struct rt_addrinfo *info,
1039    struct rib_cmd_info *rc)
1040{
1041	RIB_RLOCK_TRACKER;
1042	struct route_nhop_data rnd_orig;
1043	struct rib_head *rnh;
1044	struct rtentry *rt;
1045	int error;
1046
1047	NET_EPOCH_ASSERT();
1048
1049	rnh = get_rnh(fibnum, info);
1050	if (rnh == NULL)
1051		return (EAFNOSUPPORT);
1052
1053	bzero(rc, sizeof(struct rib_cmd_info));
1054	rc->rc_cmd = RTM_CHANGE;
1055
1056	/* Check if updated gateway exists */
1057	if ((info->rti_flags & RTF_GATEWAY) &&
1058	    (info->rti_info[RTAX_GATEWAY] == NULL)) {
1059
		/*
		 * route(8) adds the RTF_GATEWAY flag if -interface is not set.
		 * Remove RTF_GATEWAY to enforce consistency and maintain
		 * compatibility.
		 */
1065		info->rti_flags &= ~RTF_GATEWAY;
1066	}
1067
	/*
	 * A route change is done in multiple steps, dropping and
	 * reacquiring the lock in between. When multiple processes
	 * change the same route concurrently, the route may be modified
	 * between the steps. Address this by retrying the operation
	 * multiple times before failing.
	 */
1075
1076	RIB_RLOCK(rnh);
1077	rt = (struct rtentry *)rnh->rnh_lookup(info->rti_info[RTAX_DST],
1078	    info->rti_info[RTAX_NETMASK], &rnh->head);
1079
1080	if (rt == NULL) {
1081		RIB_RUNLOCK(rnh);
1082		return (ESRCH);
1083	}
1084
1085	rnd_orig.rnd_nhop = rt->rt_nhop;
1086	rnd_orig.rnd_weight = rt->rt_weight;
1087
1088	RIB_RUNLOCK(rnh);
1089
1090	for (int i = 0; i < RIB_MAX_RETRIES; i++) {
1091		error = change_route_byinfo(rnh, rt, info, &rnd_orig, rc);
1092		if (error != EAGAIN)
1093			break;
1094	}
1095
1096	return (error);
1097}
1098
1099static int
1100change_nhop(struct rib_head *rnh, struct rt_addrinfo *info,
1101    struct nhop_object *nh_orig, struct nhop_object **nh_new)
1102{
1103	int error;
1104
1105	/*
1106	 * New gateway could require new ifaddr, ifp;
1107	 * flags may also be different; ifp may be specified
1108	 * by ll sockaddr when protocol address is ambiguous
1109	 */
1110	if (((nh_orig->nh_flags & NHF_GATEWAY) &&
1111	    info->rti_info[RTAX_GATEWAY] != NULL) ||
1112	    info->rti_info[RTAX_IFP] != NULL ||
1113	    (info->rti_info[RTAX_IFA] != NULL &&
1114	     !sa_equal(info->rti_info[RTAX_IFA], nh_orig->nh_ifa->ifa_addr))) {
1115		error = rt_getifa_fib(info, rnh->rib_fibnum);
1116
1117		if (error != 0) {
1118			info->rti_ifa = NULL;
1119			return (error);
1120		}
1121	}
1122
1123	error = nhop_create_from_nhop(rnh, nh_orig, info, nh_new);
1124	info->rti_ifa = NULL;
1125
1126	return (error);
1127}
1128
1129#ifdef ROUTE_MPATH
1130static int
1131change_mpath_route(struct rib_head *rnh, struct rtentry *rt,
1132    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1133    struct rib_cmd_info *rc)
1134{
1135	int error = 0, found_idx = 0;
1136	struct nhop_object *nh_orig = NULL, *nh_new;
1137	struct route_nhop_data rnd_new = {};
1138	const struct weightened_nhop *wn = NULL;
1139	struct weightened_nhop *wn_new;
1140	uint32_t num_nhops;
1141
1142	wn = nhgrp_get_nhops(rnd_orig->rnd_nhgrp, &num_nhops);
1143	for (int i = 0; i < num_nhops; i++) {
1144		if (check_info_match_nhop(info, NULL, wn[i].nh) == 0) {
1145			nh_orig = wn[i].nh;
1146			found_idx = i;
1147			break;
1148		}
1149	}
1150
1151	if (nh_orig == NULL)
1152		return (ESRCH);
1153
1154	error = change_nhop(rnh, info, nh_orig, &nh_new);
1155	if (error != 0)
1156		return (error);
1157
1158	wn_new = mallocarray(num_nhops, sizeof(struct weightened_nhop),
1159	    M_TEMP, M_NOWAIT | M_ZERO);
1160	if (wn_new == NULL) {
1161		nhop_free(nh_new);
1162		return (EAGAIN);
1163	}
1164
1165	memcpy(wn_new, wn, num_nhops * sizeof(struct weightened_nhop));
1166	wn_new[found_idx].nh = nh_new;
1167	wn_new[found_idx].weight = get_info_weight(info, wn[found_idx].weight);
1168
1169	error = nhgrp_get_group(rnh, wn_new, num_nhops, 0, &rnd_new.rnd_nhgrp);
1170	nhop_free(nh_new);
1171	free(wn_new, M_TEMP);
1172
1173	if (error != 0)
1174		return (error);
1175
1176	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1177
1178	return (error);
1179}
1180#endif
1181
1182static int
1183change_route_byinfo(struct rib_head *rnh, struct rtentry *rt,
1184    struct rt_addrinfo *info, struct route_nhop_data *rnd_orig,
1185    struct rib_cmd_info *rc)
1186{
1187	int error = 0;
1188	struct nhop_object *nh_orig;
1189	struct route_nhop_data rnd_new;
1190
1191	nh_orig = rnd_orig->rnd_nhop;
1192	if (nh_orig == NULL)
1193		return (ESRCH);
1194
1195#ifdef ROUTE_MPATH
1196	if (NH_IS_NHGRP(nh_orig))
1197		return (change_mpath_route(rnh, rt, info, rnd_orig, rc));
1198#endif
1199
1200	rnd_new.rnd_weight = get_info_weight(info, rnd_orig->rnd_weight);
1201	error = change_nhop(rnh, info, nh_orig, &rnd_new.rnd_nhop);
1202	if (error != 0)
1203		return (error);
1204	error = change_route_conditional(rnh, rt, rnd_orig, &rnd_new, rc);
1205
1206	return (error);
1207}
1208
/*
 * Inserts @rt with nhop data from @rnd into @rnh.
 * Returns 0 on success and stores the operation result in @rc.
 */
1213static int
1214add_route(struct rib_head *rnh, struct rtentry *rt,
1215    struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1216{
1217	struct radix_node *rn;
1218
1219	RIB_WLOCK_ASSERT(rnh);
1220
1221	rt->rt_nhop = rnd->rnd_nhop;
1222	rt->rt_weight = rnd->rnd_weight;
1223	rn = rnh->rnh_addaddr(rt_key(rt), rt_mask_const(rt), &rnh->head, rt->rt_nodes);
1224
1225	if (rn != NULL) {
1226		if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1227			tmproutes_update(rnh, rt, rnd->rnd_nhop);
1228
1229		/* Finalize notification */
1230		rib_bump_gen(rnh);
1231		rnh->rnh_prefixes++;
1232
1233		rc->rc_cmd = RTM_ADD;
1234		rc->rc_rt = rt;
1235		rc->rc_nh_old = NULL;
1236		rc->rc_nh_new = rnd->rnd_nhop;
1237		rc->rc_nh_weight = rnd->rnd_weight;
1238
1239		rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1240		return (0);
1241	}
1242
1243	/* Existing route or memory allocation failure. */
1244	return (EEXIST);
1245}
1246
1247/*
1248 * Unconditionally deletes @rt from @rnh.
1249 */
1250static int
1251delete_route(struct rib_head *rnh, struct rtentry *rt, struct rib_cmd_info *rc)
1252{
1253	RIB_WLOCK_ASSERT(rnh);
1254
1255	/* Route deletion requested. */
1256	struct radix_node *rn;
1257
1258	rn = rnh->rnh_deladdr(rt_key_const(rt), rt_mask_const(rt), &rnh->head);
1259	if (rn == NULL)
1260		return (ESRCH);
1261	rt = RNTORT(rn);
1262	rt->rte_flags &= ~RTF_UP;
1263
1264	rib_bump_gen(rnh);
1265	rnh->rnh_prefixes--;
1266
1267	rc->rc_cmd = RTM_DELETE;
1268	rc->rc_rt = rt;
1269	rc->rc_nh_old = rt->rt_nhop;
1270	rc->rc_nh_new = NULL;
1271	rc->rc_nh_weight = rt->rt_weight;
1272
1273	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1274
1275	return (0);
1276}
1277
/*
 * Switches @rt nhop/weight to the ones specified in @rnd.
 * Returns 0 on success.
 */
1282int
1283change_route(struct rib_head *rnh, struct rtentry *rt,
1284    struct route_nhop_data *rnd, struct rib_cmd_info *rc)
1285{
1286	struct nhop_object *nh_orig;
1287
1288	RIB_WLOCK_ASSERT(rnh);
1289
1290	nh_orig = rt->rt_nhop;
1291
1292	if (rnd->rnd_nhop == NULL)
1293		return (delete_route(rnh, rt, rc));
1294
1295	/* Changing nexthop & weight to a new one */
1296	rt->rt_nhop = rnd->rnd_nhop;
1297	rt->rt_weight = rnd->rnd_weight;
1298	if (!NH_IS_NHGRP(rnd->rnd_nhop) && nhop_get_expire(rnd->rnd_nhop))
1299		tmproutes_update(rnh, rt, rnd->rnd_nhop);
1300
1301	/* Finalize notification */
1302	rib_bump_gen(rnh);
1303	rc->rc_cmd = RTM_CHANGE;
1304	rc->rc_rt = rt;
1305	rc->rc_nh_old = nh_orig;
1306	rc->rc_nh_new = rnd->rnd_nhop;
1307	rc->rc_nh_weight = rnd->rnd_weight;
1308
1309	rib_notify(rnh, RIB_NOTIFY_IMMEDIATE, rc);
1310
1311	return (0);
1312}
1313
/*
 * Conditionally updates the route nhop/weight IFF the data in @rnd_orig is
 *  consistent with the current route data.
 * The nexthop in @rnd_new is consumed.
 */
1319int
1320change_route_conditional(struct rib_head *rnh, struct rtentry *rt,
1321    struct route_nhop_data *rnd_orig, struct route_nhop_data *rnd_new,
1322    struct rib_cmd_info *rc)
1323{
1324	struct rtentry *rt_new;
1325	int error = 0;
1326
1327	IF_DEBUG_LEVEL(LOG_DEBUG2) {
1328		char buf_old[NHOP_PRINT_BUFSIZE], buf_new[NHOP_PRINT_BUFSIZE];
1329		nhop_print_buf_any(rnd_orig->rnd_nhop, buf_old, NHOP_PRINT_BUFSIZE);
1330		nhop_print_buf_any(rnd_new->rnd_nhop, buf_new, NHOP_PRINT_BUFSIZE);
1331		FIB_LOG(LOG_DEBUG2, rnh->rib_fibnum, rnh->rib_family,
1332		    "trying change %s -> %s", buf_old, buf_new);
1333	}
1334	RIB_WLOCK(rnh);
1335
1336	struct route_nhop_data rnd;
1337	rt_new = lookup_prefix_rt(rnh, rt, &rnd);
1338
1339	if (rt_new == NULL) {
1340		if (rnd_orig->rnd_nhop == NULL)
1341			error = add_route(rnh, rt, rnd_new, rc);
1342		else {
1343			/*
1344			 * Prefix does not exist, which was not our assumption.
1345			 * Update @rnd_orig with the new data and return
1346			 */
1347			rnd_orig->rnd_nhop = NULL;
1348			rnd_orig->rnd_weight = 0;
1349			error = EAGAIN;
1350		}
1351	} else {
1352		/* Prefix exists, try to update */
1353		if (rnd_orig->rnd_nhop == rt_new->rt_nhop) {
1354			/*
1355			 * Nhop/mpath group hasn't changed. Flip
1356			 * to the new precalculated one and return
1357			 */
1358			error = change_route(rnh, rt_new, rnd_new, rc);
1359		} else {
1360			/* Update and retry */
1361			rnd_orig->rnd_nhop = rt_new->rt_nhop;
1362			rnd_orig->rnd_weight = rt_new->rt_weight;
1363			error = EAGAIN;
1364		}
1365	}
1366
1367	RIB_WUNLOCK(rnh);
1368
1369	if (error == 0) {
1370		rib_notify(rnh, RIB_NOTIFY_DELAYED, rc);
1371
1372		if (rnd_orig->rnd_nhop != NULL)
1373			nhop_free_any(rnd_orig->rnd_nhop);
1374
1375	} else {
1376		if (rnd_new->rnd_nhop != NULL)
1377			nhop_free_any(rnd_new->rnd_nhop);
1378	}
1379
1380	return (error);
1381}
1382
/*
 * Performs the routing table modification specified by @action.
 * The table is selected by @fibnum and the sa_family in @info->rti_info[RTAX_DST].
 * Needs to be run within the network epoch.
 *
 * Returns 0 on success and fills in @rc with the action result.
 */
1390int
1391rib_action(uint32_t fibnum, int action, struct rt_addrinfo *info,
1392    struct rib_cmd_info *rc)
1393{
1394	int error;
1395
1396	switch (action) {
1397	case RTM_ADD:
1398		error = rib_add_route(fibnum, info, rc);
1399		break;
1400	case RTM_DELETE:
1401		error = rib_del_route(fibnum, info, rc);
1402		break;
1403	case RTM_CHANGE:
1404		error = rib_change_route(fibnum, info, rc);
1405		break;
1406	default:
1407		error = ENOTSUP;
1408	}
1409
1410	return (error);
1411}
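
/*
 * Example (sketch): rtsock-style deletion through the generic dispatcher,
 * assuming the caller filled @info with RTAX_DST/RTAX_NETMASK sockaddrs:
 *
 *	struct epoch_tracker et;
 *
 *	NET_EPOCH_ENTER(et);
 *	error = rib_action(fibnum, RTM_DELETE, &info, &rc);
 *	NET_EPOCH_EXIT(et);
 */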
1412
1413struct rt_delinfo
1414{
1415	struct rib_head *rnh;
1416	struct rtentry *head;
1417	rib_filter_f_t *filter_f;
1418	void *filter_arg;
1419	int prio;
1420	struct rib_cmd_info rc;
1421};
1422
/*
 * Conditionally unlinks rtentries or paths from the radix tree based
 * on the callback data passed in @arg.
 */
1427static int
1428rt_checkdelroute(struct radix_node *rn, void *arg)
1429{
1430	struct rt_delinfo *di = (struct rt_delinfo *)arg;
1431	struct rtentry *rt = (struct rtentry *)rn;
1432
1433	if (rt_delete_conditional(di->rnh, rt, di->prio,
1434	    di->filter_f, di->filter_arg, &di->rc) != 0)
1435		return (0);
1436
1437	/*
1438	 * Add deleted rtentries to the list to GC them
1439	 *  after dropping the lock.
1440	 *
1441	 * XXX: Delayed notifications not implemented
1442	 *  for nexthop updates.
1443	 */
1444	if (di->rc.rc_cmd == RTM_DELETE) {
1445		/* Add to the list and return */
1446		rt->rt_chain = di->head;
1447		di->head = rt;
1448#ifdef ROUTE_MPATH
1449	} else {
1450		/*
1451		 * RTM_CHANGE to a different nexthop or nexthop group.
1452		 * Free old multipath group.
1453		 */
1454		nhop_free_any(di->rc.rc_nh_old);
1455#endif
1456	}
1457
1458	return (0);
1459}
1460
1461/*
1462 * Iterates over a routing table specified by @fibnum and @family and
1463 *  deletes elements marked by @filter_f.
1464 * @fibnum: rtable id
1465 * @family: AF_ address family
1466 * @filter_f: function returning non-zero value for items to delete
1467 * @arg: data to pass to the @filter_f function
1468 * @report: true if rtsock notification is needed.
1469 */
1470void
1471rib_walk_del(u_int fibnum, int family, rib_filter_f_t *filter_f, void *filter_arg,
1472    bool report)
1473{
1474	struct rib_head *rnh;
1475	struct rtentry *rt;
1476	struct nhop_object *nh;
1477	struct epoch_tracker et;
1478
1479	rnh = rt_tables_get_rnh(fibnum, family);
1480	if (rnh == NULL)
1481		return;
1482
1483	struct rt_delinfo di = {
1484		.rnh = rnh,
1485		.filter_f = filter_f,
1486		.filter_arg = filter_arg,
1487		.prio = NH_PRIORITY_NORMAL,
1488	};
1489
1490	NET_EPOCH_ENTER(et);
1491
1492	RIB_WLOCK(rnh);
1493	rnh->rnh_walktree(&rnh->head, rt_checkdelroute, &di);
1494	RIB_WUNLOCK(rnh);
1495
1496	/* We might have something to reclaim. */
1497	bzero(&di.rc, sizeof(di.rc));
1498	di.rc.rc_cmd = RTM_DELETE;
1499	while (di.head != NULL) {
1500		rt = di.head;
1501		di.head = rt->rt_chain;
1502		rt->rt_chain = NULL;
1503		nh = rt->rt_nhop;
1504
1505		di.rc.rc_rt = rt;
1506		di.rc.rc_nh_old = nh;
1507		rib_notify(rnh, RIB_NOTIFY_DELAYED, &di.rc);
1508
1509		if (report) {
1510#ifdef ROUTE_MPATH
1511			struct nhgrp_object *nhg;
1512			const struct weightened_nhop *wn;
1513			uint32_t num_nhops;
1514			if (NH_IS_NHGRP(nh)) {
1515				nhg = (struct nhgrp_object *)nh;
1516				wn = nhgrp_get_nhops(nhg, &num_nhops);
1517				for (int i = 0; i < num_nhops; i++)
1518					rt_routemsg(RTM_DELETE, rt, wn[i].nh, fibnum);
1519			} else
1520#endif
1521			rt_routemsg(RTM_DELETE, rt, nh, fibnum);
1522		}
1523		rt_free(rt);
1524	}
1525
1526	NET_EPOCH_EXIT(et);
1527}
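
/*
 * Example (sketch): a filter matching every route whose nexthop egresses
 * via a given ifnet, suitable for passing to rib_walk_del() above:
 *
 *	static int
 *	match_ifp(const struct rtentry *rt, const struct nhop_object *nh,
 *	    void *arg)
 *	{
 *		return (nhop_get_ifp(nh) == (struct ifnet *)arg);
 *	}
 */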
1528
1529static int
1530rt_delete_unconditional(struct radix_node *rn, void *arg)
1531{
1532	struct rtentry *rt = RNTORT(rn);
1533	struct rib_head *rnh = (struct rib_head *)arg;
1534
1535	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), &rnh->head);
1536	if (RNTORT(rn) == rt)
1537		rt_free(rt);
1538
1539	return (0);
1540}
1541
/*
 * Removes all routes from the routing table without executing notifications.
 * rtentries will be removed after the end of the current epoch.
 */
1546static void
1547rib_flush_routes(struct rib_head *rnh)
1548{
1549	RIB_WLOCK(rnh);
1550	rnh->rnh_walktree(&rnh->head, rt_delete_unconditional, rnh);
1551	RIB_WUNLOCK(rnh);
1552}
1553
1554void
1555rib_flush_routes_family(int family)
1556{
1557	struct rib_head *rnh;
1558
1559	for (uint32_t fibnum = 0; fibnum < rt_numfibs; fibnum++) {
1560		if ((rnh = rt_tables_get_rnh(fibnum, family)) != NULL)
1561			rib_flush_routes(rnh);
1562	}
1563}
1564
1565const char *
1566rib_print_family(int family)
1567{
1568	switch (family) {
1569	case AF_INET:
1570		return ("inet");
1571	case AF_INET6:
1572		return ("inet6");
1573	case AF_LINK:
1574		return ("link");
1575	}
1576	return ("unknown");
1577}
1578
1579