1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2020 Alexander V. Chernikov
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29#include "opt_inet.h"
30#include "opt_inet6.h"
31#include "opt_route.h"
32
33#include <sys/param.h>
34#include <sys/eventhandler.h>
35#include <sys/kernel.h>
36#include <sys/sbuf.h>
37#include <sys/lock.h>
38#include <sys/rmlock.h>
39#include <sys/malloc.h>
40#include <sys/mbuf.h>
41#include <sys/module.h>
43#include <sys/priv.h>
44#include <sys/proc.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/queue.h>
50#include <net/vnet.h>
51
52#include <net/if.h>
53#include <net/if_var.h>
54
55#include <netinet/in.h>
56#include <netinet/in_var.h>
57#include <netinet/ip.h>
58#include <netinet/ip_var.h>
59#ifdef INET6
60#include <netinet/ip6.h>
61#include <netinet6/ip6_var.h>
62#endif
63
64#include <net/route.h>
65#include <net/route/nhop.h>
66#include <net/route/route_ctl.h>
67#include <net/route/route_var.h>
68#include <net/route/fib_algo.h>
69
70#include <machine/stdarg.h>
71
/*
 * Fib lookup framework.
 *
 * This framework enables accelerated longest-prefix-match lookups for the
 *  routing tables by adding the ability to dynamically attach/detach lookup
 *  algorithm implementations to/from the datapath.
 *
 * flm - fib lookup module - implementation of a particular lookup algorithm
 * fd - fib data - instance of an flm bound to a specific routing table
 *
 * This file provides the main framework functionality.
 *
 * The framework provides the following features:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
 * 2) Routing table synchronisation
 * 3) dataplane attachment points
 * 4) automatic algorithm selection based on the provided preference.
 *
 *
 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 *  structures, indexed by fib number. Each array entry contains a callback
 *  function and its argument. This function will be called with a
 *  family-specific lookup key, scope and the provided argument. The array is
 *  re-created every time a new algo instance is created. Please take a look
 *  at the replace_rtables_family() function for more details.
 *
 */
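/*
 * Illustration only (not code from this file): a dataplane consumer such as
 * fib4_lookup() conceptually performs a lookup through this array roughly as
 * follows, with V_inet_dp assumed to be the per-vnet IPv4 array of fib_dp and
 * key being the family-specific lookup key:
 *
 *	const struct fib_dp *dp = &V_inet_dp[fibnum];
 *	nh = dp->f(dp->arg, key, scopeid);
 */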
102
103SYSCTL_DECL(_net_route);
104SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
105    "Fib algorithm lookups");
106
107/* Algorithm sync policy */
108
109/* Time interval to bucket updates */
110VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
111#define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
112SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
113    &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
114
115/* Minimum update rate to delay sync */
116VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
117#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
118SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
119    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
120
121/* Max allowed delay to sync */
122VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
123#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
124SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
125    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
126
127
128#ifdef INET6
129VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
130#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
131SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
132    "IPv6 longest prefix match lookups");
133#endif
134#ifdef INET
135VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
136#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
137SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
138    "IPv4 longest prefix match lookups");
139#endif
140
141/* Fib instance counter */
142static uint32_t fib_gen = 0;
143
144struct nhop_ref_table {
145	uint32_t		count;
146	int32_t			refcnt[0];
147};
148
149enum fib_callout_action {
150	FDA_NONE,	/* No callout scheduled */
151	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
153	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
154};
155
156struct fib_sync_status {
157	struct timeval		diverge_time;	/* ts when diverged */
158	uint32_t		num_changes;	/* number of changes since sync */
159	uint32_t		bucket_changes;	/* num changes within the current bucket */
160	uint64_t		bucket_id;	/* 50ms bucket # */
161	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
162};
163
164/*
165 * Data structure for the fib lookup instance tied to the particular rib.
166 */
167struct fib_data {
168	uint32_t		number_nhops;	/* current # of nhops */
169	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
171	uint32_t		fd_dead:1;	/* Scheduled for deletion */
172	uint32_t		fd_linked:1;	/* true if linked */
173	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
174	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
175	uint8_t			fd_family;	/* family */
176	uint32_t		fd_fibnum;	/* fibnum */
177	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
178	uint32_t		fd_gen;		/* instance gen# */
179	struct callout		fd_callout;	/* rebuild callout */
180	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
181	void			*fd_algo_data;	/* algorithm data */
182	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
183	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
184	struct rib_head		*fd_rh;		/* RIB table we're attached to */
185	struct rib_subscription	*fd_rs;		/* storing table subscription */
186	struct fib_dp		fd_dp;		/* fib datapath data */
187	struct vnet		*fd_vnet;	/* vnet fib belongs to */
188	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
189	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
190	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
191	uint32_t		fd_num_changes;	/* number of changes since last callout */
192	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
193};
194
195static bool rebuild_fd(struct fib_data *fd, const char *reason);
196static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
197static void handle_fd_callout(void *_data);
198static void destroy_fd_instance_epoch(epoch_context_t ctx);
199static bool is_idx_free(struct fib_data *fd, uint32_t index);
200static void set_algo_fixed(struct rib_head *rh);
201static bool is_algo_fixed(struct rib_head *rh);
202
203static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
204static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
205
206static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
207    struct fib_lookup_module *orig_flm);
208static void fib_unref_algo(struct fib_lookup_module *flm);
209static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
210
211struct mtx fib_mtx;
212#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
213#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
214#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
215
216MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
217
218/* Algorithm has to be this percent better than the current to switch */
219#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
220/* Schedule algo re-evaluation X seconds after a change */
221#define	ALGO_EVAL_DELAY_MS	30000
222/* Force algo re-evaluation after X changes */
223#define	ALGO_EVAL_NUM_ROUTES	100
224/* Try to setup algorithm X times */
225#define	FIB_MAX_TRIES		32
226/* Max amount of supported nexthops */
227#define	FIB_MAX_NHOPS		262144
228#define	FIB_CALLOUT_DELAY_MS	50
229
230
231/* Debug */
232static int flm_debug_level = LOG_NOTICE;
233SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
234    &flm_debug_level, 0, "debuglevel");
235#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
236#ifndef	LOG_DEBUG2
237#define	LOG_DEBUG2	8
238#endif
239
240#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
241#define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
242	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
243}
244#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
245    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
246    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
247#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
248    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
249#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
250    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
251}
252#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
253#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
254    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
255    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
256}
257#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
258#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
259#else
260#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
261#endif
262#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
263#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
264#else
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
266#endif
267#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
268#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
269#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
271#endif
272#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
273#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
274#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
275
276
277/* List of all registered lookup algorithms */
278static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
279
280/* List of all fib lookup instances in the vnet */
281VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
282#define	V_fib_data_list	VNET(fib_data_list)
283
/* Data structure for storing non-transient fib lookup module failures */
285struct fib_error {
286	int				fe_family;
287	uint32_t			fe_fibnum;	/* failed rtable */
288	struct fib_lookup_module	*fe_flm;	/* failed module */
289	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
290};
291VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
292#define	V_fib_error_list VNET(fib_error_list)
293
294/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
295struct fib_dp_header {
296	struct epoch_context	fdh_epoch_ctx;
297	uint32_t		fdh_num_tables;
298	struct fib_dp		fdh_idx[0];
299};
300
301/*
302 * Tries to add new non-transient algorithm error to the list of
303 *  errors.
304 * Returns true on success.
305 */
306static bool
307flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
308{
309	struct fib_error *fe;
310
311	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
312	if (fe == NULL)
313		return (false);
314	fe->fe_flm = flm;
315	fe->fe_family = flm->flm_family;
316	fe->fe_fibnum = fibnum;
317
318	FIB_MOD_LOCK();
319	/* Avoid duplicates by checking if error already exists first */
320	if (flm_error_check(flm, fibnum)) {
321		FIB_MOD_UNLOCK();
322		free(fe, M_TEMP);
323		return (true);
324	}
325	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
326	FIB_MOD_UNLOCK();
327
328	return (true);
329}
330
331/*
332 * True if non-transient error has been registered for @flm in @fibnum.
333 */
334static bool
335flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
336{
337	const struct fib_error *fe;
338
339	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
340		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
341			return (true);
342	}
343
344	return (false);
345}
346
347/*
348 * Clear all errors of algo specified by @flm.
349 */
350static void
351fib_error_clear_flm(struct fib_lookup_module *flm)
352{
353	struct fib_error *fe, *fe_tmp;
354
355	FIB_MOD_LOCK_ASSERT();
356
357	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
358		if (fe->fe_flm == flm) {
359			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
360			free(fe, M_TEMP);
361		}
362	}
363}
364
365/*
366 * Clears all errors in current VNET.
367 */
368static void
369fib_error_clear(void)
370{
371	struct fib_error *fe, *fe_tmp;
372
373	FIB_MOD_LOCK_ASSERT();
374
375	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
376		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
377		free(fe, M_TEMP);
378	}
379}
380
381static const char *
382print_op_result(enum flm_op_result result)
383{
384	switch (result) {
385	case FLM_SUCCESS:
386		return "success";
387	case FLM_REBUILD:
388		return "rebuild";
389	case FLM_BATCH:
390		return "batch";
391	case FLM_ERROR:
392		return "error";
393	}
394
395	return "unknown";
396}
397
398static const char *
399print_family(int family)
400{
401
402	if (family == AF_INET)
403		return ("inet");
404	else if (family == AF_INET6)
405		return ("inet6");
406	else
407		return ("unknown");
408}
409
410/*
411 * Debug function used by lookup algorithms.
412 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
413 */
414void
415fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
416{
417	char buf[128];
418	va_list ap;
419
420	if (level > flm_debug_level)
421		return;
422
423	va_start(ap, fmt);
424	vsnprintf(buf, sizeof(buf), fmt, ap);
425	va_end(ap);
426
427	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
428	    fd->fd_gen, func, "%s", buf);
429}
430
431/*
432 * Outputs list of algorithms supported by the provided address family.
433 */
434static int
435print_algos_sysctl(struct sysctl_req *req, int family)
436{
437	struct fib_lookup_module *flm;
438	struct sbuf sbuf;
439	int error, count = 0;
440
441	error = sysctl_wire_old_buffer(req, 0);
442	if (error == 0) {
443		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
444		TAILQ_FOREACH(flm, &all_algo_list, entries) {
445			if (flm->flm_family == family) {
446				if (count++ > 0)
447					sbuf_cat(&sbuf, ", ");
448				sbuf_cat(&sbuf, flm->flm_name);
449			}
450		}
451		error = sbuf_finish(&sbuf);
452		sbuf_delete(&sbuf);
453	}
454	return (error);
455}
456
457#ifdef INET6
458static int
459print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
460{
461
462	return (print_algos_sysctl(req, AF_INET6));
463}
464SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
465    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
466    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
467#endif
468
469#ifdef INET
470static int
471print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
472{
473
474	return (print_algos_sysctl(req, AF_INET));
475}
476SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
477    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
478    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
479#endif
480
481/*
482 * Calculate delay between repeated failures.
483 * Returns current delay in milliseconds.
484 */
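/*
 * Example (a sketch, not normative): with FIB_CALLOUT_DELAY_MS = 50 the delay
 * grows exponentially with the number of failed rebuilds: 50ms, 100ms, 200ms,
 * ..., capped at (1 << 10) * 50 = 51200ms after 10 or more failures.
 */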
485static uint32_t
486callout_calc_delay_ms(struct fib_data *fd)
487{
488	uint32_t shift;
489
490	if (fd->fd_failed_rebuilds > 10)
491		shift = 10;
492	else
493		shift = fd->fd_failed_rebuilds;
494
495	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
496}
497
498static void
499schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
500{
501
502	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
503	fd->fd_callout_action = action;
504	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
505	    handle_fd_callout, fd, 0);
506}
507
508static void
509schedule_fd_rebuild(struct fib_data *fd, const char *reason)
510{
511
512	RIB_WLOCK_ASSERT(fd->fd_rh);
513
514	if (!fd->fd_need_rebuild) {
515		fd->fd_need_rebuild = true;
516		/* Stop batch updates */
517		fd->fd_batch = false;
518
519		/*
520		 * Potentially re-schedules pending callout
521		 *  initiated by schedule_algo_eval.
522		 */
523		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
524		    reason, fd->fd_failed_rebuilds);
525		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
526	}
527}
528
529static void
530sync_rib_gen(struct fib_data *fd)
531{
532	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
533	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
534}
535
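/* Returns the difference between @old_tv and @new_tv in milliseconds. */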
536static int64_t
537get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
538{
539	int64_t diff = 0;
540
541	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
542	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
543
544	return (diff);
545}
546
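/* Advances @tv by @ms milliseconds, normalizing tv_usec. */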
547static void
548add_tv_diff_ms(struct timeval *tv, int ms)
549{
550	tv->tv_sec += ms / 1000;
551	ms = ms % 1000;
552	if (ms * 1000 + tv->tv_usec < 1000000)
553		tv->tv_usec += ms * 1000;
554	else {
555		tv->tv_sec += 1;
556		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
557	}
558}
559
560/*
561 * Marks the time when algo state diverges from the rib state.
562 */
563static void
564mark_diverge_time(struct fib_data *fd)
565{
566	struct fib_sync_status *fd_ss = &fd->fd_ss;
567
568	getmicrouptime(&fd_ss->diverge_time);
569	fd_ss->bucket_id = 0;
570	fd_ss->bucket_changes = 0;
571}
572
/*
 * Calculates and updates the next algorithm sync time, based on the current activity.
 *
 * The intent is to provide a reasonable balance between the update
 *  latency and efficient batching when changing a large number of routes.
 *
 * The high-level algorithm works as follows:
 * 1) all changes are bucketed into 50ms intervals
 * 2) if the number of changes within the bucket exceeds the threshold,
 *   the update gets delayed, up to the maximum delay threshold.
 */
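/*
 * Worked example with the default settings (a sketch, not a guarantee):
 * bucket_time_ms=50 and bucket_change_threshold_rate=500 yield a per-bucket
 * threshold of 500 * 50 / 1000 = 25 changes. Once a bucket accumulates 25
 * changes, the sync is pushed out to (bucket_id + 2) * 50 ms after the
 * divergence point, but never beyond fib_max_sync_delay_ms (1000 ms by
 * default); past that point the already-pending callout is left as is.
 */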
584static void
585update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
586{
587	uint32_t bucket_id, new_delay = 0;
588	struct timeval tv;
589
590	/* Fetch all variables at once to ensure consistent reads */
591	uint32_t bucket_time_ms = V_update_bucket_time_ms;
592	uint32_t threshold_rate = V_bucket_change_threshold_rate;
593	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
594
595	if (bucket_time_ms == 0)
596		bucket_time_ms = 50;
597	/* calculate per-bucket threshold rate */
598	threshold_rate = threshold_rate * bucket_time_ms / 1000;
599
600	getmicrouptime(&tv);
601
602	struct fib_sync_status *fd_ss = &fd->fd_ss;
603
604	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
605
606	if (fd_ss->bucket_id == bucket_id) {
607		fd_ss->bucket_changes++;
608		if (fd_ss->bucket_changes == threshold_rate) {
609			new_delay = (bucket_id + 2) * bucket_time_ms;
610			if (new_delay <= max_delay_ms) {
				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
				    "bucket: %u, total delay: %u",
				    threshold_rate, bucket_id + 1, new_delay);
615			} else {
616				new_delay = 0;
617				FD_PRINTF(LOG_DEBUG, fd,
618				    "maximum sync delay (%u ms) reached", max_delay_ms);
619			}
620		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
621			new_delay = bucket_time_ms;
622	} else {
623		fd_ss->bucket_id = bucket_id;
624		fd_ss->bucket_changes = 1;
625	}
626
627	if (new_delay > 0) {
628		/* Calculated time has been updated */
629		struct timeval new_tv = fd_ss->diverge_time;
630		add_tv_diff_ms(&new_tv, new_delay);
631
632		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
633		schedule_callout(fd, action, delay_ms);
634	}
635}
636
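/*
 * Accounts for a single rtable change: if a batch or rebuild sync is already
 * pending, re-calculates its delay via update_rebuild_delay(); otherwise
 * schedules (or expedites) re-evaluation of the best algo for this rib.
 */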
637static void
638update_algo_state(struct fib_data *fd)
639{
640
641	RIB_WLOCK_ASSERT(fd->fd_rh);
642
643	if (fd->fd_batch || fd->fd_need_rebuild) {
644		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
645		update_rebuild_delay(fd, action);
646		return;
647	}
648
649	if (fd->fd_num_changes++ == 0) {
650		/* Start callout to consider switch */
651		if (!callout_pending(&fd->fd_callout))
652			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
653	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
654		/* Reset callout to exec immediately */
655		if (fd->fd_callout_action == FDA_EVAL)
656			schedule_callout(fd, FDA_EVAL, 1);
657	}
658}
659
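/*
 * Returns true if the change described by @rc has to be applied to the algo
 * synchronously: addition or removal of a directly-reachable (non-gateway)
 * or statically-configured route.
 */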
660static bool
661need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
662{
663	struct nhop_object *nh;
664
665	/* Sync addition/removal of interface routes */
666	switch (rc->rc_cmd) {
667	case RTM_ADD:
668		nh = rc->rc_nh_new;
669		if (!NH_IS_NHGRP(nh)) {
670			if (!(nh->nh_flags & NHF_GATEWAY))
671				return (true);
672			if (nhop_get_rtflags(nh) & RTF_STATIC)
673				return (true);
674		}
675		break;
676	case RTM_DELETE:
677		nh = rc->rc_nh_old;
678		if (!NH_IS_NHGRP(nh)) {
679			if (!(nh->nh_flags & NHF_GATEWAY))
680				return (true);
681			if (nhop_get_rtflags(nh) & RTF_STATIC)
682				return (true);
683		}
684		break;
685	}
686
687	return (false);
688}
689
690static bool
691apply_rtable_changes(struct fib_data *fd)
692{
693	enum flm_op_result result;
694	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
695
696	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
697
698	if (result == FLM_SUCCESS) {
699		sync_rib_gen(fd);
700		for (int i = 0; i < q->count; i++)
701			if (q->entries[i].nh_old)
702				fib_unref_nhop(fd, q->entries[i].nh_old);
703		q->count = 0;
704	}
705	fd->fd_batch = false;
706
707	return (result == FLM_SUCCESS);
708}
709
710static bool
711fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
712{
713	int plen = 0;
714
715	switch (fd->fd_family) {
716#ifdef INET
717	case AF_INET:
718		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
719		break;
720#endif
721#ifdef INET6
722	case AF_INET6:
723		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
724		break;
725#endif
726	}
727
728	ce->plen = plen;
729	ce->nh_old = rc->rc_nh_old;
730	ce->nh_new = rc->rc_nh_new;
731	if (ce->nh_new != NULL) {
732		if (fib_ref_nhop(fd, ce->nh_new) == 0)
733			return (false);
734	}
735
736	return (true);
737}
738
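/*
 * Appends the change described by @rc to the batch queue, growing the queue
 * (256 entries initially, doubling afterwards) as needed.
 * Returns false if the queue or a nexthop index cannot be allocated.
 */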
739static bool
740queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
741{
742	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
743
744	if (q->count >= q->size) {
745		uint32_t q_size;
746
747		if (q->size == 0)
748			q_size = 256; /* ~18k memory */
749		else
750			q_size = q->size * 2;
751
752		size_t size = q_size * sizeof(struct fib_change_entry);
753		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
754		if (a == NULL) {
755			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
756			    q_size);
757			return (false);
758		}
759		q->entries = a;
760		q->size = q_size;
761	}
762
763	return (fill_change_entry(fd, &q->entries[q->count++], rc));
764}
765
766/*
767 * Rib subscription handler. Checks if the algorithm is ready to
768 *  receive updates, handles nexthop refcounting and passes change
769 *  data to the algorithm callback.
770 */
771static void
772handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
773    void *_data)
774{
775	struct fib_data *fd = (struct fib_data *)_data;
776	enum flm_op_result result;
777
778	RIB_WLOCK_ASSERT(rnh);
779
780	/*
781	 * There is a small gap between subscribing for route changes
782	 *  and initiating rtable dump. Avoid receiving route changes
783	 *  prior to finishing rtable dump by checking `init_done`.
784	 */
785	if (!fd->init_done)
786		return;
787
788	bool immediate_sync = need_immediate_sync(fd, rc);
789
790	/* Consider scheduling algorithm re-evaluation */
791	update_algo_state(fd);
792
793	/*
794	 * If algo requested rebuild, stop sending updates by default.
795	 * This simplifies nexthop refcount handling logic.
796	 */
797	if (fd->fd_need_rebuild) {
798		if (immediate_sync)
799			rebuild_fd(fd, "rtable change type enforced sync");
800		return;
801	}
802
803	/*
804	 * Algo requested updates to be delivered in batches.
805	 * Add the current change to the queue and return.
806	 */
807	if (fd->fd_batch) {
808		if (immediate_sync) {
809			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
810				rebuild_fd(fd, "batch sync failed");
811		} else {
812			if (!queue_rtable_change(fd, rc))
813				schedule_fd_rebuild(fd, "batch queue failed");
814		}
815		return;
816	}
817
818	/*
	 * Maintain the guarantee that every nexthop returned by the dataplane
	 *  lookup has a refcount > 0, so it can be safely referenced within
	 *  the current epoch.
822	 */
823	if (rc->rc_nh_new != NULL) {
824		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
825			/* ran out of indexes */
826			schedule_fd_rebuild(fd, "ran out of nhop indexes");
827			return;
828		}
829	}
830
831	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
832
833	switch (result) {
834	case FLM_SUCCESS:
835		sync_rib_gen(fd);
836		/* Unref old nexthop on success */
837		if (rc->rc_nh_old != NULL)
838			fib_unref_nhop(fd, rc->rc_nh_old);
839		break;
840	case FLM_BATCH:
841
842		/*
843		 * Algo asks to batch the changes.
844		 */
845		if (queue_rtable_change(fd, rc)) {
846			if (!immediate_sync) {
847				fd->fd_batch = true;
848				mark_diverge_time(fd);
849				update_rebuild_delay(fd, FDA_BATCH);
850				break;
851			}
852			if (apply_rtable_changes(fd))
853				break;
854		}
855		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
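		/* FALLTHROUGH */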
856
857	case FLM_REBUILD:
858
859		/*
860		 * Algo is not able to apply the update.
861		 * Schedule algo rebuild.
862		 */
863		if (!immediate_sync) {
864			mark_diverge_time(fd);
865			schedule_fd_rebuild(fd, "algo requested rebuild");
866			break;
867		}
868
869		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
870		rebuild_fd(fd, "rtable change type enforced sync");
871		break;
872	case FLM_ERROR:
873
874		/*
875		 * Algo reported a non-recoverable error.
876		 * Record the error and schedule rebuild, which will
877		 *  trigger best algo selection.
878		 */
879		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
880		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
881			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
882		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
883	}
884}
885
886static void
887estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
888{
889
890	if (old_fd == NULL) {
891		// TODO: read from rtable
892		fd->number_nhops = 16;
893		return;
894	}
895
896	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
897		fd->number_nhops = 2 * old_fd->number_nhops;
898	else
899		fd->number_nhops = old_fd->number_nhops;
900}
901
902struct walk_cbdata {
903	struct fib_data		*fd;
904	flm_dump_t		*func;
905	enum flm_op_result	result;
906};
907
908/*
 * Handler called after all rtentries have been dumped.
910 * Performs post-dump framework checks and calls
911 * algo:flm_dump_end_cb().
912 *
913 * Updates walk_cbdata result.
914 */
915static void
916sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
917{
918	struct walk_cbdata *w = (struct walk_cbdata *)_data;
919	struct fib_data *fd = w->fd;
920
921	RIB_WLOCK_ASSERT(w->fd->fd_rh);
922
923	if (rnh->rib_dying) {
924		w->result = FLM_ERROR;
925		return;
926	}
927
928	if (fd->hit_nhops) {
929		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
930		    fd->nh_ref_table->count);
931		if (w->result == FLM_SUCCESS)
932			w->result = FLM_REBUILD;
933		return;
934	}
935
936	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
937		return;
938
939	/* Post-dump hook, dump successful */
940	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
941
942	if (w->result == FLM_SUCCESS) {
943		/* Mark init as done to allow routing updates */
944		fd->init_done = 1;
945	}
946}
947
948/*
949 * Callback for each entry in rib.
950 * Calls algo:flm_dump_rib_item_cb func as a part of initial
951 *  route table synchronisation.
952 */
953static int
954sync_algo_cb(struct rtentry *rt, void *_data)
955{
956	struct walk_cbdata *w = (struct walk_cbdata *)_data;
957
958	RIB_WLOCK_ASSERT(w->fd->fd_rh);
959
960	if (w->result == FLM_SUCCESS && w->func) {
961
962		/*
		 * Reference nexthops to maintain the guarantee that
		 *  each nexthop returned by the datapath has > 0 references
		 *  and can be safely referenced within the current epoch.
966		 */
967		struct nhop_object *nh = rt_get_raw_nhop(rt);
968		if (fib_ref_nhop(w->fd, nh) != 0)
969			w->result = w->func(rt, w->fd->fd_algo_data);
970		else
971			w->result = FLM_REBUILD;
972	}
973
974	return (0);
975}
976
977/*
978 * Dump all routing table state to the algo instance.
979 */
980static enum flm_op_result
981sync_algo(struct fib_data *fd)
982{
983	struct walk_cbdata w = {
984		.fd = fd,
985		.func = fd->fd_flm->flm_dump_rib_item_cb,
986		.result = FLM_SUCCESS,
987	};
988
989	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
990
991	FD_PRINTF(LOG_INFO, fd,
992	    "initial dump completed (rtable version: %d), result: %s",
993	    fd->fd_rh->rnh_gen, print_op_result(w.result));
994
995	return (w.result);
996}
997
998/*
999 * Schedules epoch-backed @fd instance deletion.
1000 * * Unlinks @fd from the list of active algo instances.
1001 * * Removes rib subscription.
1002 * * Stops callout.
1003 * * Schedules actual deletion.
1004 *
 * Assumes @fd is already unlinked from the datapath.
1006 */
1007static int
1008schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
1009{
1010	bool is_dead;
1011
1012	NET_EPOCH_ASSERT();
1013	RIB_WLOCK_ASSERT(fd->fd_rh);
1014
1015	FIB_MOD_LOCK();
1016	is_dead = fd->fd_dead;
1017	if (!is_dead)
1018		fd->fd_dead = true;
1019	if (fd->fd_linked) {
1020		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1021		fd->fd_linked = false;
1022	}
1023	FIB_MOD_UNLOCK();
1024	if (is_dead)
1025		return (0);
1026
1027	FD_PRINTF(LOG_INFO, fd, "DETACH");
1028
1029	if (fd->fd_rs != NULL)
1030		rib_unsubscribe_locked(fd->fd_rs);
1031
1032	/*
1033	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1034	 * will be executed, hence no _new_ callout schedules will happen.
1035	 */
1036	callout_stop(&fd->fd_callout);
1037
1038	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
1039
1040	return (0);
1041}
1042
1043/*
1044 * Wipe all fd instances from the list matching rib specified by @rh.
1045 * If @keep_first is set, remove all but the first record.
1046 */
1047static void
1048fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1049{
1050	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1051	struct fib_data *fd, *fd_tmp;
1052	struct epoch_tracker et;
1053
1054	FIB_MOD_LOCK();
1055	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1056		if (fd->fd_rh == rh) {
1057			if (keep_first) {
1058				keep_first = false;
1059				continue;
1060			}
1061			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1062			fd->fd_linked = false;
1063			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1064		}
1065	}
1066	FIB_MOD_UNLOCK();
1067
1068	/* Pass 2: remove each entry */
1069	NET_EPOCH_ENTER(et);
1070	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1071		if (!in_callout)
1072			RIB_WLOCK(fd->fd_rh);
1073		schedule_destroy_fd_instance(fd, in_callout);
1074		if (!in_callout)
1075			RIB_WUNLOCK(fd->fd_rh);
1076	}
1077	NET_EPOCH_EXIT(et);
1078}
1079
1080void
1081fib_destroy_rib(struct rib_head *rh)
1082{
1083
1084	/*
1085	 * rnh has `is_dying` flag set, so setup of new fd's will fail at
	 *  sync_algo() stage, preventing new entries from being added to the
	 *  list of active algos. Remove all existing entries for the particular rib.
1088	 */
1089	fib_cleanup_algo(rh, false, false);
1090}
1091
1092/*
1093 * Finalises fd destruction by freeing all fd resources.
1094 */
1095static void
1096destroy_fd_instance(struct fib_data *fd)
1097{
1098
1099	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1100
1101	/* Call destroy callback first */
1102	if (fd->fd_algo_data != NULL)
1103		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1104
1105	/* Nhop table */
1106	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1107		for (int i = 0; i < fd->number_nhops; i++) {
1108			if (!is_idx_free(fd, i)) {
1109				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1110				    i, fd->nh_idx[i]);
1111				nhop_free_any(fd->nh_idx[i]);
1112			}
1113		}
1114		free(fd->nh_idx, M_RTABLE);
1115	}
1116	if (fd->nh_ref_table != NULL)
1117		free(fd->nh_ref_table, M_RTABLE);
1118
1119	if (fd->fd_ss.fd_change_queue.entries != NULL)
1120		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1121
1122	fib_unref_algo(fd->fd_flm);
1123
1124	free(fd, M_RTABLE);
1125}
1126
1127/*
1128 * Epoch callback indicating fd is safe to destroy
1129 */
1130static void
1131destroy_fd_instance_epoch(epoch_context_t ctx)
1132{
1133	struct fib_data *fd;
1134
1135	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1136
1137	CURVNET_SET(fd->fd_vnet);
1138	destroy_fd_instance(fd);
1139	CURVNET_RESTORE();
1140}
1141
1142/*
1143 * Tries to setup fd instance.
1144 * - Allocates fd/nhop table
1145 * - Runs algo:flm_init_cb algo init
1146 * - Subscribes fd to the rib
1147 * - Runs rtable dump
1148 * - Adds instance to the list of active instances.
1149 *
1150 * Returns: operation result. Fills in @pfd with resulting fd on success.
1151 *
1152 */
1153static enum flm_op_result
1154try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1155    struct fib_data *old_fd, struct fib_data **pfd)
1156{
1157	struct fib_data *fd;
1158	size_t size;
1159	enum flm_op_result result;
1160
1161	/* Allocate */
1162	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1163	if (fd == NULL)  {
1164		*pfd = NULL;
1165		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1166		return (FLM_REBUILD);
1167	}
1168	*pfd = fd;
1169
1170	estimate_nhop_scale(old_fd, fd);
1171
1172	fd->fd_rh = rh;
1173	fd->fd_family = rh->rib_family;
1174	fd->fd_fibnum = rh->rib_fibnum;
1175	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1176	fd->fd_vnet = curvnet;
1177	fd->fd_flm = flm;
1178
1179	FIB_MOD_LOCK();
1180	flm->flm_refcount++;
1181	fd->fd_gen = ++fib_gen;
1182	FIB_MOD_UNLOCK();
1183
1184	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1185
1186	/* Allocate nhidx -> nhop_ptr table */
1187	size = fd->number_nhops * sizeof(void *);
1188	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1189	if (fd->nh_idx == NULL) {
1190		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1191		return (FLM_REBUILD);
1192	}
1193
1194	/* Allocate nhop index refcount table */
1195	size = sizeof(struct nhop_ref_table);
1196	size += fd->number_nhops * sizeof(uint32_t);
1197	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1198	if (fd->nh_ref_table == NULL) {
1199		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1200		return (FLM_REBUILD);
1201	}
1202	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1203
1204	/* Okay, we're ready for algo init */
1205	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1206	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1207	if (result != FLM_SUCCESS) {
1208		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1209		return (result);
1210	}
1211
1212	/* Try to subscribe */
1213	if (flm->flm_change_rib_item_cb != NULL) {
1214		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1215		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1216		if (fd->fd_rs == NULL) {
1217			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1218			return (FLM_REBUILD);
1219		}
1220	}
1221
1222	/* Dump */
1223	result = sync_algo(fd);
1224	if (result != FLM_SUCCESS) {
1225		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1226		return (result);
1227	}
1228	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1229
1230	FIB_MOD_LOCK();
1231	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 *  that the first matching entry for the AF/fib is always the active
	 *  one.
1235	 */
1236	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1237	fd->fd_linked = true;
1238	FIB_MOD_UNLOCK();
1239
1240	return (FLM_SUCCESS);
1241}
1242
1243/*
1244 * Sets up algo @flm for table @rh and links it to the datapath.
1245 *
1246 */
1247static enum flm_op_result
1248setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1249    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1250{
1251	struct fib_data *prev_fd, *new_fd;
1252	enum flm_op_result result;
1253
1254	NET_EPOCH_ASSERT();
1255	RIB_WLOCK_ASSERT(rh);
1256
1257	prev_fd = orig_fd;
1258	new_fd = NULL;
1259	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1260		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1261
1262		if ((result == FLM_SUCCESS) && attach) {
1263			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
1264				sync_rib_gen(new_fd);
1265			else
1266				result = FLM_REBUILD;
1267		}
1268
1269		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1270			schedule_destroy_fd_instance(prev_fd, false);
1271			prev_fd = NULL;
1272		}
1273
1274		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1275		    print_op_result(result));
1276
1277		if (result == FLM_REBUILD) {
1278			prev_fd = new_fd;
1279			new_fd = NULL;
1280			continue;
1281		}
1282
1283		break;
1284	}
1285
1286	if (result != FLM_SUCCESS) {
1287		RH_PRINTF(LOG_WARNING, rh,
1288		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1289		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1290		/* update failure count */
1291		FIB_MOD_LOCK();
1292		if (orig_fd != NULL)
1293			orig_fd->fd_failed_rebuilds++;
1294		FIB_MOD_UNLOCK();
1295
1296		/* Ban algo on non-recoverable error */
1297		if (result == FLM_ERROR)
1298			flm_error_add(flm, rh->rib_fibnum);
1299
1300		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1301			schedule_destroy_fd_instance(prev_fd, false);
1302		if (new_fd != NULL) {
1303			schedule_destroy_fd_instance(new_fd, false);
1304			new_fd = NULL;
1305		}
1306	}
1307
1308	*pfd = new_fd;
1309	return (result);
1310}
1311
1312/*
1313 * Tries to sync algo with the current rtable state, either
1314 * by executing batch update or rebuilding.
1315 * Returns true on success.
1316 */
1317static bool
1318execute_callout_action(struct fib_data *fd)
1319{
1320	enum fib_callout_action action = fd->fd_callout_action;
1321	struct fib_lookup_module *flm_new = NULL;
1322	bool result = true;
1323
1324	NET_EPOCH_ASSERT();
1325	RIB_WLOCK_ASSERT(fd->fd_rh);
1326
1327	fd->fd_need_rebuild = false;
1328	fd->fd_batch = false;
1329	fd->fd_num_changes = 0;
1330
1331	/* First, check if we're still OK to use this algo */
1332	if (!is_algo_fixed(fd->fd_rh))
1333		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1334	if (flm_new != NULL)
1335		action = FDA_REBUILD;
1336
1337	if (action == FDA_BATCH) {
1338		/* Try to sync */
1339		if (!apply_rtable_changes(fd))
1340			action = FDA_REBUILD;
1341	}
1342
1343	if (action == FDA_REBUILD)
1344		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1345	if (flm_new != NULL)
1346		fib_unref_algo(flm_new);
1347
1348	return (result);
1349}
1350
1351/*
1352 * Callout for all scheduled fd-related work.
1353 * - Checks if the current algo is still the best algo
 * - Synchronises algo instance to the rtable (batch use case)
1355 * - Creates a new instance of an algo for af/fib if desired.
1356 */
1357static void
1358handle_fd_callout(void *_data)
1359{
1360	struct fib_data *fd = (struct fib_data *)_data;
1361	struct epoch_tracker et;
1362
1363	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1364
1365	NET_EPOCH_ENTER(et);
1366	CURVNET_SET(fd->fd_vnet);
1367	execute_callout_action(fd);
1368	CURVNET_RESTORE();
1369	NET_EPOCH_EXIT(et);
1370}
1371
1372/*
1373 * Tries to create new algo instance based on @fd data.
1374 * Returns true on success.
1375 */
1376static bool
1377rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1378{
1379	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;
1381
1382	if (flm_new == fd->fd_flm)
1383		fd_tmp = fd;
1384	else
1385		FD_PRINTF(LOG_INFO, fd, "switching algo to %s", flm_new->flm_name);
1386
1387	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1388	if (result != FLM_SUCCESS) {
1389		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1390		return (false);
1391	}
1392	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1393
1394	/* Remove old instance */
1395	schedule_destroy_fd_instance(fd, true);
1396
1397	return (true);
1398}
1399
1400static bool
1401rebuild_fd(struct fib_data *fd, const char *reason)
1402{
1403	struct fib_lookup_module *flm_new = NULL;
1404	bool result;
1405
1406	if (!is_algo_fixed(fd->fd_rh))
1407		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1408
1409	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1410	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1411	if (flm_new != NULL)
1412		fib_unref_algo(flm_new);
1413
1414	if (!result) {
1415		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1416		schedule_fd_rebuild(fd, "sync rebuild failed");
1417	}
1418
1419	return (result);
1420}
1421
1422/*
1423 * Finds algo by name/family.
1424 * Returns referenced algo or NULL.
1425 */
1426static struct fib_lookup_module *
1427fib_find_algo(const char *algo_name, int family)
1428{
1429	struct fib_lookup_module *flm;
1430
1431	FIB_MOD_LOCK();
1432	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1433		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1434		    (family == flm->flm_family)) {
1435			flm->flm_refcount++;
1436			FIB_MOD_UNLOCK();
1437			return (flm);
1438		}
1439	}
1440	FIB_MOD_UNLOCK();
1441
1442	return (NULL);
1443}
1444
1445static void
1446fib_unref_algo(struct fib_lookup_module *flm)
1447{
1448
1449	FIB_MOD_LOCK();
1450	flm->flm_refcount--;
1451	FIB_MOD_UNLOCK();
1452}
1453
1454static int
1455set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1456{
1457	struct fib_lookup_module *flm = NULL;
1458	struct fib_data *fd = NULL;
1459	char old_algo_name[32], algo_name[32];
1460	struct rib_head *rh = NULL;
1461	enum flm_op_result result;
1462	struct epoch_tracker et;
1463	int error;
1464
1465	/* Fetch current algo/rib for af/family */
1466	FIB_MOD_LOCK();
1467	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1468		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1469			break;
1470	}
1471	if (fd == NULL) {
1472		FIB_MOD_UNLOCK();
1473		return (ENOENT);
1474	}
1475	rh = fd->fd_rh;
1476	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1477	    sizeof(old_algo_name));
1478	FIB_MOD_UNLOCK();
1479
1480	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1481	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1482	if (error != 0 || req->newptr == NULL)
1483		return (error);
1484
1485	if (strcmp(algo_name, old_algo_name) == 0)
1486		return (0);
1487
1488	/* New algorithm name is different */
1489	flm = fib_find_algo(algo_name, family);
1490	if (flm == NULL) {
1491		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1492		return (ESRCH);
1493	}
1494
1495	fd = NULL;
1496	NET_EPOCH_ENTER(et);
1497	RIB_WLOCK(rh);
1498	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1499	RIB_WUNLOCK(rh);
1500	NET_EPOCH_EXIT(et);
1501	fib_unref_algo(flm);
1502	if (result != FLM_SUCCESS)
1503		return (EINVAL);
1504
1505	/* Disable automated jumping between algos */
1506	FIB_MOD_LOCK();
1507	set_algo_fixed(rh);
1508	FIB_MOD_UNLOCK();
1509	/* Remove old instance(s) */
1510	fib_cleanup_algo(rh, true, false);
1511
1512	/* Drain cb so user can unload the module after userret if so desired */
1513	NET_EPOCH_DRAIN_CALLBACKS();
1514
1515	return (0);
1516}
1517
1518#ifdef INET
1519static int
1520set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1521{
1522
1523	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1524}
1525SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1526    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1527    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1528#endif
1529
1530#ifdef INET6
1531static int
1532set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1533{
1534
1535	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1536}
1537SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1538    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1539    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1540#endif
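
/*
 * Example usage from userland (illustrative; the available algorithm names
 * depend on the kernel configuration and loaded modules):
 *	sysctl net.route.algo.inet.algo_list
 *	sysctl net.route.algo.inet.algo=<name from the list above>
 */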
1541
1542static struct nhop_object *
1543dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
1544{
1545	return (NULL);
1546}
1547
1548static void
1549destroy_fdh_epoch(epoch_context_t ctx)
1550{
1551	struct fib_dp_header *fdh;
1552
1553	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1554	free(fdh, M_RTABLE);
1555}
1556
1557static struct fib_dp_header *
1558alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1559{
1560	size_t sz;
1561	struct fib_dp_header *fdh;
1562
1563	sz = sizeof(struct fib_dp_header);
1564	sz += sizeof(struct fib_dp) * num_tables;
1565	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1566	if (fdh != NULL) {
1567		fdh->fdh_num_tables = num_tables;
1568		/*
1569		 * Set dummy lookup function ptr always returning NULL, so
1570		 * we can delay algo init.
1571		 */
1572		for (uint32_t i = 0; i < num_tables; i++)
1573			fdh->fdh_idx[i].f = dummy_lookup;
1574	}
1575	return (fdh);
1576}
1577
1578static struct fib_dp_header *
1579get_fib_dp_header(struct fib_dp *dp)
1580{
1581
1582	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1583}
1584
1585/*
1586 * Replace per-family index pool @pdp with a new one which
1587 * contains updated callback/algo data from @fd.
1588 * Returns true on success.
1589 */
1590static bool
1591replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
1592{
1593	struct fib_dp_header *new_fdh, *old_fdh;
1594
1595	NET_EPOCH_ASSERT();
1596
1597	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1598	    curvnet, dp->f, dp->arg);
1599
1600	FIB_MOD_LOCK();
1601	old_fdh = get_fib_dp_header(*pdp);
1602
1603	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
1604		/*
1605		 * Function is the same, data pointer needs update.
1606		 * Perform in-line replace without reallocation.
1607		 */
1608		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
1609		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
1610		FIB_MOD_UNLOCK();
1611		return (true);
1612	}
1613
1614	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1615	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1616	if (new_fdh == NULL) {
1617		FIB_MOD_UNLOCK();
1618		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1619		return (false);
1620	}
1621
1622	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1623	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1624	/* Update relevant data structure for @fd */
1625	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
1626
1627	/* Ensure memcpy() writes have completed */
1628	atomic_thread_fence_rel();
1629	/* Set new datapath pointer */
1630	*pdp = &new_fdh->fdh_idx[0];
1631	FIB_MOD_UNLOCK();
1632	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1633
1634	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1635
1636	return (true);
1637}
1638
1639static struct fib_dp **
1640get_family_dp_ptr(int family)
1641{
1642	switch (family) {
1643#ifdef INET
1644	case AF_INET:
1645		return (&V_inet_dp);
1646#endif
1647#ifdef INET6
1648	case AF_INET6:
1649		return (&V_inet6_dp);
1650#endif
1651	}
1652	return (NULL);
1653}
1654
1655/*
1656 * Make datapath use fib instance @fd
1657 */
1658bool
1659fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
1660{
1661	struct fib_dp **pdp;
1662
1663	pdp = get_family_dp_ptr(fd->fd_family);
1664	return (replace_rtables_family(pdp, fd, dp));
1665}
1666
1667/*
1668 * Grow datapath pointers array.
1669 * Called from sysctl handler on growing number of routing tables.
1670 */
1671static void
1672grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1673{
1674	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1675
1676	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1677
1678	FIB_MOD_LOCK();
1679	if (*pdp != NULL) {
1680		old_fdh = get_fib_dp_header(*pdp);
1681		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1682		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1683	}
1684
1685	/* Wait till all writes completed */
1686	atomic_thread_fence_rel();
1687
1688	*pdp = &new_fdh->fdh_idx[0];
1689	FIB_MOD_UNLOCK();
1690
1691	if (old_fdh != NULL)
1692		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1693}
1694
1695/*
1696 * Grows per-AF arrays of datapath pointers for each supported family.
1697 * Called from fibs resize sysctl handler.
1698 */
1699void
1700fib_grow_rtables(uint32_t new_num_tables)
1701{
1702
1703#ifdef INET
1704	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1705#endif
1706#ifdef INET6
1707	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1708#endif
1709}
1710
1711void
1712fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1713{
1714
1715	bzero(rinfo, sizeof(struct rib_rtable_info));
1716	rinfo->num_prefixes = rh->rnh_prefixes;
1717	rinfo->num_nhops = nhops_get_count(rh);
1718#ifdef ROUTE_MPATH
1719	rinfo->num_nhgrp = nhgrp_get_count(rh);
1720#endif
1721}
1722
1723/*
1724 * Updates pointer to the algo data for the @fd.
1725 */
1726void
1727fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
1728{
1729	RIB_WLOCK_ASSERT(fd->fd_rh);
1730
1731	fd->fd_algo_data = algo_data;
1732}
1733
1734/*
1735 * Calls @callback with @ctx after the end of a current epoch.
1736 */
1737void
1738fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
1739{
1740	NET_EPOCH_CALL(callback, ctx);
1741}
1742
1743/*
1744 * Accessor to get rib instance @fd is attached to.
1745 */
1746struct rib_head *
1747fib_get_rh(struct fib_data *fd)
1748{
1749
1750	return (fd->fd_rh);
1751}
1752
1753/*
1754 * Accessor to export idx->nhop array
1755 */
1756struct nhop_object **
1757fib_get_nhop_array(struct fib_data *fd)
1758{
1759
1760	return (fd->nh_idx);
1761}
1762
1763static uint32_t
1764get_nhop_idx(struct nhop_object *nh)
1765{
1766#ifdef ROUTE_MPATH
1767	if (NH_IS_NHGRP(nh))
1768		return (nhgrp_get_idx((struct nhgrp_object *)nh));
1769	else
1770#endif
1771		return (nhop_get_idx(nh));
1772}
1773
1774uint32_t
1775fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1776{
1777
1778	return (get_nhop_idx(nh));
1779}
1780
1781static bool
1782is_idx_free(struct fib_data *fd, uint32_t index)
1783{
1784
1785	return (fd->nh_ref_table->refcnt[index] == 0);
1786}
1787
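/*
 * References nexthop @nh in the instance-private index table and returns its
 * index. Returns 0 (treated as failure by the callers) if the index does not
 * fit, after setting hit_nhops so that the next rebuild sizes a larger table.
 */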
1788static uint32_t
1789fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1790{
1791	uint32_t idx = get_nhop_idx(nh);
1792
1793	if (idx >= fd->number_nhops) {
1794		fd->hit_nhops = 1;
1795		return (0);
1796	}
1797
1798	if (is_idx_free(fd, idx)) {
1799		nhop_ref_any(nh);
1800		fd->nh_idx[idx] = nh;
1801		fd->nh_ref_table->count++;
1802		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1803	}
1804	fd->nh_ref_table->refcnt[idx]++;
1805
1806	return (idx);
1807}
1808
1809struct nhop_release_data {
1810	struct nhop_object	*nh;
1811	struct epoch_context	ctx;
1812};
1813
1814static void
1815release_nhop_epoch(epoch_context_t ctx)
1816{
1817	struct nhop_release_data *nrd;
1818
1819	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1820	nhop_free_any(nrd->nh);
1821	free(nrd, M_TEMP);
1822}
1823
1824/*
1825 * Delays nexthop refcount release.
 * The datapath may not have its data structures updated yet, so the old
1827 *  nexthop may still be returned till the end of current epoch. Delay
1828 *  refcount removal, as we may be removing the last instance, which will
1829 *  trigger nexthop deletion, rendering returned nexthop invalid.
1830 */
1831static void
1832fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1833{
1834	struct nhop_release_data *nrd;
1835
1836	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1837	if (nrd != NULL) {
1838		nrd->nh = nh;
1839		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
1840	} else {
1841		/*
1842		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1843		 *  that each nhop can be referenced.
1844		 */
1845		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1846	}
1847}
1848
1849static void
1850fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1851{
1852	uint32_t idx = get_nhop_idx(nh);
1853
1854	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
1856
1857	fd->nh_ref_table->refcnt[idx]--;
1858	if (fd->nh_ref_table->refcnt[idx] == 0) {
1859		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1860		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1861	}
1862}
1863
1864static void
1865set_algo_fixed(struct rib_head *rh)
1866{
1867	switch (rh->rib_family) {
1868#ifdef INET
1869	case AF_INET:
1870		V_algo_fixed_inet = true;
1871		break;
1872#endif
1873#ifdef INET6
1874	case AF_INET6:
1875		V_algo_fixed_inet6 = true;
1876		break;
1877#endif
1878	}
1879}
1880
1881static bool
1882is_algo_fixed(struct rib_head *rh)
1883{
1884
1885	switch (rh->rib_family) {
1886#ifdef INET
1887	case AF_INET:
1888		return (V_algo_fixed_inet);
1889#endif
1890#ifdef INET6
1891	case AF_INET6:
1892		return (V_algo_fixed_inet6);
1893#endif
1894	}
1895	return (false);
1896}
1897
1898/*
1899 * Runs the check on what would be the best algo for rib @rh, assuming
1900 *  that the current algo is the one specified by @orig_flm. Note that
1901 *  it can be NULL for initial selection.
1902 *
1903 * Returns referenced new algo or NULL if the current one is the best.
1904 */
1905static struct fib_lookup_module *
1906fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1907{
1908	uint8_t preference, curr_preference = 0, best_preference = 0;
1909	struct fib_lookup_module *flm, *best_flm = NULL;
1910	struct rib_rtable_info rinfo;
1911	int candidate_algos = 0;
1912
1913	fib_get_rtable_info(rh, &rinfo);
1914
1915	FIB_MOD_LOCK();
1916	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1917		if (flm->flm_family != rh->rib_family)
1918			continue;
1919		candidate_algos++;
1920		preference = flm->flm_get_pref(&rinfo);
1921		if (preference > best_preference) {
1922			if (!flm_error_check(flm, rh->rib_fibnum)) {
1923				best_preference = preference;
1924				best_flm = flm;
1925			}
1926		}
1927		if (flm == orig_flm)
1928			curr_preference = preference;
1929	}
1930	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1931		best_flm->flm_refcount++;
1932	else
1933		best_flm = NULL;
1934	FIB_MOD_UNLOCK();
1935
1936	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1937	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1938	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1939	    best_preference);
1940
1941	return (best_flm);
1942}
1943
1944/*
1945 * Called when new route table is created.
1946 * Selects, allocates and attaches fib algo for the table.
1947 */
1948static bool
1949fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
1950{
1951	struct fib_lookup_module *flm;
1952	struct fib_data *fd = NULL;
1953	enum flm_op_result result;
1954	struct epoch_tracker et;
1955
1956	flm = fib_check_best_algo(rh, NULL);
1957	if (flm == NULL) {
1958		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1959		return (false);
1960	}
1961	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1962
1963	NET_EPOCH_ENTER(et);
1964	RIB_WLOCK(rh);
1965	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1966	RIB_WUNLOCK(rh);
1967	NET_EPOCH_EXIT(et);
1968
1969	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1970	if (result == FLM_SUCCESS)
1971		*dp = fd->fd_dp;
1972	else
1973		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1974
1975	fib_unref_algo(flm);
1976
1977	return (result == FLM_SUCCESS);
1978}
1979
1980/*
1981 * Sets up fib algo instances for the non-initialized RIBs in the @family.
 * Allocates a temporary datapath index to amortize datapath index updates
1983 * with large @num_tables.
1984 */
1985void
1986fib_setup_family(int family, uint32_t num_tables)
1987{
1988	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
1989	if (new_fdh == NULL) {
1990		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
1991		return;
1992	}
1993
1994	for (int i = 0; i < num_tables; i++) {
1995		struct rib_head *rh = rt_tables_get_rnh(i, family);
1996		if (rh->rib_algo_init)
1997			continue;
1998		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
1999			continue;
2000
2001		rh->rib_algo_init = true;
2002	}
2003
2004	FIB_MOD_LOCK();
2005	struct fib_dp **pdp = get_family_dp_ptr(family);
2006	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
2007
2008	/* Update the items not touched by the new init, from the old data pointer */
2009	for (int i = 0; i < num_tables; i++) {
2010		if (new_fdh->fdh_idx[i].f == dummy_lookup)
2011			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
2012	}
2013
2014	/* Ensure all index writes have completed */
2015	atomic_thread_fence_rel();
2016	/* Set new datapath pointer */
2017	*pdp = &new_fdh->fdh_idx[0];
2018
2019	FIB_MOD_UNLOCK();
2020
2021	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
2022}
2023
2024/*
2025 * Registers fib lookup module within the subsystem.
2026 */
2027int
2028fib_module_register(struct fib_lookup_module *flm)
2029{
2030
2031	FIB_MOD_LOCK();
2032	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
2033	    print_family(flm->flm_family));
2034	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
2035	FIB_MOD_UNLOCK();
2036
2037	return (0);
2038}
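
/*
 * Sketch of how a lookup module is expected to hook in (illustrative only;
 * see an in-tree algorithm for the authoritative pattern): the module fills
 * in a struct fib_lookup_module with its name, family and callbacks
 * (flm_init_cb, flm_destroy_cb, flm_dump_rib_item_cb, flm_dump_end_cb,
 * flm_change_rib_item_cb, flm_get_pref, ...) and calls fib_module_register()
 * from its MOD_LOAD handler, undoing it with fib_module_unregister() on
 * MOD_UNLOAD.
 */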
2039
2040/*
2041 * Tries to unregister fib lookup module.
2042 *
2043 * Returns 0 on success, EBUSY if module is still used
2044 *  by some of the tables.
2045 */
2046int
2047fib_module_unregister(struct fib_lookup_module *flm)
2048{
2049
2050	FIB_MOD_LOCK();
2051	if (flm->flm_refcount > 0) {
2052		FIB_MOD_UNLOCK();
2053		return (EBUSY);
2054	}
2055	fib_error_clear_flm(flm);
2056	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
2057	    print_family(flm->flm_family));
2058	TAILQ_REMOVE(&all_algo_list, flm, entries);
2059	FIB_MOD_UNLOCK();
2060
2061	return (0);
2062}
2063
2064void
2065vnet_fib_init(void)
2066{
2067
2068	TAILQ_INIT(&V_fib_data_list);
2069}
2070
2071void
2072vnet_fib_destroy(void)
2073{
2074
2075	FIB_MOD_LOCK();
2076	fib_error_clear();
2077	FIB_MOD_UNLOCK();
2078}
2079