1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2020 Alexander V. Chernikov
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 * SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD$");
30#include "opt_inet.h"
31#include "opt_inet6.h"
32#include "opt_route.h"
33
34#include <sys/param.h>
35#include <sys/eventhandler.h>
36#include <sys/kernel.h>
37#include <sys/sbuf.h>
38#include <sys/lock.h>
39#include <sys/rmlock.h>
40#include <sys/malloc.h>
41#include <sys/mbuf.h>
42#include <sys/module.h>
44#include <sys/priv.h>
45#include <sys/proc.h>
46#include <sys/socket.h>
47#include <sys/socketvar.h>
48#include <sys/sysctl.h>
49#include <sys/syslog.h>
50#include <sys/queue.h>
51#include <net/vnet.h>
52
53#include <net/if.h>
54#include <net/if_var.h>
55
56#include <netinet/in.h>
57#include <netinet/in_var.h>
58#include <netinet/ip.h>
59#include <netinet/ip_var.h>
60#ifdef INET6
61#include <netinet/ip6.h>
62#include <netinet6/ip6_var.h>
63#endif
64
65#include <net/route.h>
66#include <net/route/nhop.h>
67#include <net/route/route_ctl.h>
68#include <net/route/route_var.h>
69#include <net/route/fib_algo.h>
70
71#include <machine/stdarg.h>
72
73/*
74 * Fib lookup framework.
75 *
 * This framework enables accelerated longest-prefix-match lookups for the
 *  routing tables by adding the ability to dynamically attach/detach lookup
 *  algorithm implementations to/from the datapath.
79 *
80 * flm - fib lookup modules - implementation of particular lookup algorithm
81 * fd - fib data - instance of an flm bound to specific routing table
82 *
83 * This file provides main framework functionality.
84 *
 * The following are the features provided by the framework:
 *
 * 1) nexthop abstraction -> provides transparent referencing, indexing
 *   and efficient idx->ptr mappings for nexthops and nexthop groups.
89 * 2) Routing table synchronisation
90 * 3) dataplane attachment points
91 * 4) automatic algorithm selection based on the provided preference.
92 *
93 *
94 * DATAPATH
 * For each supported address family, there is an allocated array of fib_dp
 *  structures, indexed by fib number. Each array entry contains a callback function
 *  and its argument. This function will be called with a family-specific lookup key,
 *  scope and the provided argument. This array gets re-created every time a new algo
 *  instance gets created. Please take a look at the replace_rtables_family() function
100 *  for more details.
101 *
102 */
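
/*
 * As an illustration of how the datapath array above is consumed, the IPv4
 *  lookup path does roughly the following (a simplified sketch; the exact
 *  key layout lives in fib_algo.h and the real consumer in netinet/in_fib.c):
 *
 *	struct fib_dp *dp = &V_inet_dp[fibnum];
 *	struct flm_lookup_key key = { .addr4 = dst };
 *	struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
 *
 * The function pointer and argument are whatever the currently-attached
 *  algo instance published via fib_set_datapath_ptr().
 */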
103
104SYSCTL_DECL(_net_route);
105SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
106    "Fib algorithm lookups");
107
108/* Algorithm sync policy */
109
110/* Time interval to bucket updates */
111VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
112#define	V_update_bucket_time_ms	VNET(update_bucket_time_ms)
113SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
114    &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
115
116/* Minimum update rate to delay sync */
117VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
118#define	V_bucket_change_threshold_rate	VNET(bucket_change_threshold_rate)
119SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
120    &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
121
122/* Max allowed delay to sync */
123VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
124#define	V_fib_max_sync_delay_ms	VNET(fib_max_sync_delay_ms)
125SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
126    &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
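
/*
 * Example (hypothetical values): on a box with heavy route churn the sync
 *  policy above can be relaxed from userland to favour batching over
 *  latency, e.g.:
 *
 *	sysctl net.route.algo.bucket_change_threshold_rate=100
 *	sysctl net.route.algo.fib_max_sync_delay_ms=2000
 *
 * These knobs feed the bucketing logic in update_rebuild_delay() below.
 */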
127
128
129#ifdef INET6
130VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
131#define	V_algo_fixed_inet6	VNET(algo_fixed_inet6)
132SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
133    "IPv6 longest prefix match lookups");
134#endif
135#ifdef INET
136VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
137#define	V_algo_fixed_inet	VNET(algo_fixed_inet)
138SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
139    "IPv4 longest prefix match lookups");
140#endif
141
142/* Fib instance counter */
143static uint32_t fib_gen = 0;
144
145struct nhop_ref_table {
146	uint32_t		count;
147	int32_t			refcnt[0];
148};
149
150enum fib_callout_action {
151	FDA_NONE,	/* No callout scheduled */
152	FDA_REBUILD,	/* Asks to rebuild algo instance */
	FDA_EVAL,	/* Asks to evaluate if the current algo is still the best */
154	FDA_BATCH,	/* Asks to submit batch of updates to the algo */
155};
156
157struct fib_sync_status {
158	struct timeval		diverge_time;	/* ts when diverged */
159	uint32_t		num_changes;	/* number of changes since sync */
160	uint32_t		bucket_changes;	/* num changes within the current bucket */
161	uint64_t		bucket_id;	/* 50ms bucket # */
162	struct fib_change_queue	fd_change_queue;/* list of scheduled entries */
163};
164
165/*
166 * Data structure for the fib lookup instance tied to the particular rib.
167 */
168struct fib_data {
169	uint32_t		number_nhops;	/* current # of nhops */
170	uint8_t			hit_nhops;	/* true if out of nhop limit */
	uint8_t			init_done;	/* true if init is completed */
172	uint32_t		fd_dead:1;	/* Scheduled for deletion */
173	uint32_t		fd_linked:1;	/* true if linked */
174	uint32_t		fd_need_rebuild:1;	/* true if rebuild scheduled */
175	uint32_t		fd_batch:1;	/* true if batched notification scheduled */
176	uint8_t			fd_family;	/* family */
177	uint32_t		fd_fibnum;	/* fibnum */
178	uint32_t		fd_failed_rebuilds;	/* stat: failed rebuilds */
179	uint32_t		fd_gen;		/* instance gen# */
180	struct callout		fd_callout;	/* rebuild callout */
181	enum fib_callout_action	fd_callout_action;	/* Callout action to take */
182	void			*fd_algo_data;	/* algorithm data */
183	struct nhop_object	**nh_idx;	/* nhop idx->ptr array */
184	struct nhop_ref_table	*nh_ref_table;	/* array with # of nhop references */
185	struct rib_head		*fd_rh;		/* RIB table we're attached to */
186	struct rib_subscription	*fd_rs;		/* storing table subscription */
187	struct fib_dp		fd_dp;		/* fib datapath data */
188	struct vnet		*fd_vnet;	/* vnet fib belongs to */
189	struct epoch_context	fd_epoch_ctx;	/* epoch context for deletion */
190	struct fib_lookup_module	*fd_flm;/* pointer to the lookup module */
191	struct fib_sync_status	fd_ss;		/* State relevant to the rib sync  */
192	uint32_t		fd_num_changes;	/* number of changes since last callout */
193	TAILQ_ENTRY(fib_data)	entries;	/* list of all fds in vnet */
194};
195
196static bool rebuild_fd(struct fib_data *fd, const char *reason);
197static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
198static void handle_fd_callout(void *_data);
199static void destroy_fd_instance_epoch(epoch_context_t ctx);
200static bool is_idx_free(struct fib_data *fd, uint32_t index);
201static void set_algo_fixed(struct rib_head *rh);
202static bool is_algo_fixed(struct rib_head *rh);
203
204static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
205static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
206
207static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
208    struct fib_lookup_module *orig_flm);
209static void fib_unref_algo(struct fib_lookup_module *flm);
210static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
211
212struct mtx fib_mtx;
213#define	FIB_MOD_LOCK()		mtx_lock(&fib_mtx)
214#define	FIB_MOD_UNLOCK()	mtx_unlock(&fib_mtx)
215#define	FIB_MOD_LOCK_ASSERT()	mtx_assert(&fib_mtx, MA_OWNED)
216
217MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
218
219/* Algorithm has to be this percent better than the current to switch */
220#define	BEST_DIFF_PERCENT	(5 * 256 / 100)
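/*
 * Preferences are uint8_t values (0..255), so the define above works out to
 *  (5 * 256) / 100 == 12: a candidate algo has to report a preference at
 *  least 13 points above the current one to trigger a switch
 *  (see fib_check_best_algo()).
 */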
221/* Schedule algo re-evaluation X seconds after a change */
222#define	ALGO_EVAL_DELAY_MS	30000
223/* Force algo re-evaluation after X changes */
224#define	ALGO_EVAL_NUM_ROUTES	100
225/* Try to setup algorithm X times */
226#define	FIB_MAX_TRIES		32
227/* Max amount of supported nexthops */
228#define	FIB_MAX_NHOPS		262144
229#define	FIB_CALLOUT_DELAY_MS	50
230
231
232/* Debug */
233static int flm_debug_level = LOG_NOTICE;
234SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
235    &flm_debug_level, 0, "debuglevel");
236#define	FLM_MAX_DEBUG_LEVEL	LOG_DEBUG
237#ifndef	LOG_DEBUG2
238#define	LOG_DEBUG2	8
239#endif
240
241#define	_PASS_MSG(_l)	(flm_debug_level >= (_l))
242#define	ALGO_PRINTF(_l, _fmt, ...)	if (_PASS_MSG(_l)) {		\
243	printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);	\
244}
245#define	_ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
246    printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
247    print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
248#define	_RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
249    printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
250#define	RH_PRINTF(_l, _rh, _fmt, ...)	if (_PASS_MSG(_l)) {	\
251    _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
252}
253#define	FD_PRINTF(_l, _fd, _fmt, ...)	FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
254#define	_FD_PRINTF(_l, _fd, _fmt, ...)	if (_PASS_MSG(_l)) {		\
255    _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name,	\
256    _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);			\
257}
258#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
259#define	FD_PRINTF_LOG_DEBUG2	_FD_PRINTF
260#else
261#define	FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
262#endif
263#if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
264#define	FD_PRINTF_LOG_DEBUG	_FD_PRINTF
265#else
#define	FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
267#endif
268#if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
269#define	FD_PRINTF_LOG_INFO	_FD_PRINTF
270#else
#define	FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
272#endif
273#define	FD_PRINTF_LOG_NOTICE	_FD_PRINTF
274#define	FD_PRINTF_LOG_ERR	_FD_PRINTF
275#define	FD_PRINTF_LOG_WARNING	_FD_PRINTF
276
277
278/* List of all registered lookup algorithms */
279static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
280
281/* List of all fib lookup instances in the vnet */
282VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
283#define	V_fib_data_list	VNET(fib_data_list)
284
/* Data structure for storing non-transient fib lookup module failures */
286struct fib_error {
287	int				fe_family;
288	uint32_t			fe_fibnum;	/* failed rtable */
289	struct fib_lookup_module	*fe_flm;	/* failed module */
290	TAILQ_ENTRY(fib_error)		entries;/* list of all errored entries */
291};
292VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
293#define	V_fib_error_list VNET(fib_error_list)
294
295/* Per-family array of fibnum -> {func, arg} mappings used in datapath */
296struct fib_dp_header {
297	struct epoch_context	fdh_epoch_ctx;
298	uint32_t		fdh_num_tables;
299	struct fib_dp		fdh_idx[0];
300};
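
/*
 * Note: only the &fdh_idx[0] pointer is published to the datapath; the
 *  header is recovered from it when needed via __containerof(), e.g.
 *  (a sketch of what get_fib_dp_header() below does):
 *
 *	struct fib_dp_header *fdh;
 *	fdh = __containerof((void *)dp, struct fib_dp_header, fdh_idx);
 */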
301
302/*
303 * Tries to add new non-transient algorithm error to the list of
304 *  errors.
305 * Returns true on success.
306 */
307static bool
308flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
309{
310	struct fib_error *fe;
311
312	fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
313	if (fe == NULL)
314		return (false);
315	fe->fe_flm = flm;
316	fe->fe_family = flm->flm_family;
317	fe->fe_fibnum = fibnum;
318
319	FIB_MOD_LOCK();
320	/* Avoid duplicates by checking if error already exists first */
321	if (flm_error_check(flm, fibnum)) {
322		FIB_MOD_UNLOCK();
323		free(fe, M_TEMP);
324		return (true);
325	}
326	TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
327	FIB_MOD_UNLOCK();
328
329	return (true);
330}
331
332/*
333 * True if non-transient error has been registered for @flm in @fibnum.
334 */
335static bool
336flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
337{
338	const struct fib_error *fe;
339
340	TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
341		if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
342			return (true);
343	}
344
345	return (false);
346}
347
348/*
349 * Clear all errors of algo specified by @flm.
350 */
351static void
352fib_error_clear_flm(struct fib_lookup_module *flm)
353{
354	struct fib_error *fe, *fe_tmp;
355
356	FIB_MOD_LOCK_ASSERT();
357
358	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
359		if (fe->fe_flm == flm) {
360			TAILQ_REMOVE(&V_fib_error_list, fe, entries);
361			free(fe, M_TEMP);
362		}
363	}
364}
365
366/*
367 * Clears all errors in current VNET.
368 */
369static void
370fib_error_clear(void)
371{
372	struct fib_error *fe, *fe_tmp;
373
374	FIB_MOD_LOCK_ASSERT();
375
376	TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
377		TAILQ_REMOVE(&V_fib_error_list, fe, entries);
378		free(fe, M_TEMP);
379	}
380}
381
382static const char *
383print_op_result(enum flm_op_result result)
384{
385	switch (result) {
386	case FLM_SUCCESS:
387		return "success";
388	case FLM_REBUILD:
389		return "rebuild";
390	case FLM_BATCH:
391		return "batch";
392	case FLM_ERROR:
393		return "error";
394	}
395
396	return "unknown";
397}
398
399static const char *
400print_family(int family)
401{
402
403	if (family == AF_INET)
404		return ("inet");
405	else if (family == AF_INET6)
406		return ("inet6");
407	else
408		return ("unknown");
409}
410
411/*
412 * Debug function used by lookup algorithms.
413 * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
414 */
415void
416fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
417{
418	char buf[128];
419	va_list ap;
420
421	if (level > flm_debug_level)
422		return;
423
424	va_start(ap, fmt);
425	vsnprintf(buf, sizeof(buf), fmt, ap);
426	va_end(ap);
427
428	_ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
429	    fd->fd_gen, func, "%s", buf);
430}
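
/*
 * A typical (hypothetical) call from a lookup module looks like:
 *
 *	fib_printf(LOG_DEBUG, fd, __func__, "inserted %u prefixes", count);
 *
 * which ends up prefixed as "[fib_algo] inet.0 (algoname#gen) func: ...".
 */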
431
432/*
433 * Outputs list of algorithms supported by the provided address family.
434 */
435static int
436print_algos_sysctl(struct sysctl_req *req, int family)
437{
438	struct fib_lookup_module *flm;
439	struct sbuf sbuf;
440	int error, count = 0;
441
442	error = sysctl_wire_old_buffer(req, 0);
443	if (error == 0) {
444		sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
445		TAILQ_FOREACH(flm, &all_algo_list, entries) {
446			if (flm->flm_family == family) {
447				if (count++ > 0)
448					sbuf_cat(&sbuf, ", ");
449				sbuf_cat(&sbuf, flm->flm_name);
450			}
451		}
452		error = sbuf_finish(&sbuf);
453		sbuf_delete(&sbuf);
454	}
455	return (error);
456}
457
458#ifdef INET6
459static int
460print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
461{
462
463	return (print_algos_sysctl(req, AF_INET6));
464}
465SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
466    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
467    print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
468#endif
469
470#ifdef INET
471static int
472print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
473{
474
475	return (print_algos_sysctl(req, AF_INET));
476}
477SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
478    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
479    print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
480#endif
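
/*
 * The resulting read-only sysctls can be queried from userland, e.g.:
 *
 *	sysctl net.route.algo.inet.algo_list
 *	sysctl net.route.algo.inet6.algo_list
 *
 * The output is a comma-separated list of the registered modules for the
 *  family (the exact names depend on which algo modules are compiled in).
 */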
481
482/*
483 * Calculate delay between repeated failures.
484 * Returns current delay in milliseconds.
485 */
486static uint32_t
487callout_calc_delay_ms(struct fib_data *fd)
488{
489	uint32_t shift;
490
491	if (fd->fd_failed_rebuilds > 10)
492		shift = 10;
493	else
494		shift = fd->fd_failed_rebuilds;
495
496	return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
497}
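
/*
 * For example, after 3 consecutive failed rebuilds the next attempt is
 *  scheduled (1 << 3) * 50 = 400ms out; the exponent is clamped at 10,
 *  capping the delay at roughly 51 seconds.
 */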
498
499static void
500schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
501{
502
503	FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
504	fd->fd_callout_action = action;
505	callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
506	    handle_fd_callout, fd, 0);
507}
508
509static void
510schedule_fd_rebuild(struct fib_data *fd, const char *reason)
511{
512
513	RIB_WLOCK_ASSERT(fd->fd_rh);
514
515	if (!fd->fd_need_rebuild) {
516		fd->fd_need_rebuild = true;
517		/* Stop batch updates */
518		fd->fd_batch = false;
519
520		/*
521		 * Potentially re-schedules pending callout
522		 *  initiated by schedule_algo_eval.
523		 */
524		FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
525		    reason, fd->fd_failed_rebuilds);
526		schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
527	}
528}
529
530static void
531sync_rib_gen(struct fib_data *fd)
532{
533	FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
534	fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
535}
536
537static int64_t
538get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
539{
540	int64_t diff = 0;
541
542	diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
543	diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
544
545	return (diff);
546}
547
548static void
549add_tv_diff_ms(struct timeval *tv, int ms)
550{
551	tv->tv_sec += ms / 1000;
552	ms = ms % 1000;
553	if (ms * 1000 + tv->tv_usec < 1000000)
554		tv->tv_usec += ms * 1000;
555	else {
556		tv->tv_sec += 1;
557		tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
558	}
559}
560
561/*
562 * Marks the time when algo state diverges from the rib state.
563 */
564static void
565mark_diverge_time(struct fib_data *fd)
566{
567	struct fib_sync_status *fd_ss = &fd->fd_ss;
568
569	getmicrouptime(&fd_ss->diverge_time);
570	fd_ss->bucket_id = 0;
571	fd_ss->bucket_changes = 0;
572}
573
574/*
575 * Calculates and updates the next algorithm sync time, based on the current activity.
576 *
 * The intent is to provide a reasonable balance between the update
 *  latency and efficient batching when changing a large amount of routes.
 *
 * The high-level algorithm is the following:
 * 1) all changes are bucketed in 50ms intervals
 * 2) If the number of changes within the bucket is greater than the threshold,
 *   the update gets delayed, up to the maximum delay threshold.
584 */
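/*
 * Worked example with the defaults: 50ms buckets and a 500 changes/sec
 *  threshold give 500 * 50 / 1000 = 25 changes per bucket. Once a bucket
 *  exceeds 25 changes, the sync is pushed out to the end of the following
 *  bucket, and keeps sliding while the churn continues, until the
 *  accumulated delay would exceed fib_max_sync_delay_ms (1000ms by default),
 *  at which point the already-scheduled sync is left to fire.
 */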
585static void
586update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
587{
588	uint32_t bucket_id, new_delay = 0;
589	struct timeval tv;
590
591	/* Fetch all variables at once to ensure consistent reads */
592	uint32_t bucket_time_ms = V_update_bucket_time_ms;
593	uint32_t threshold_rate = V_bucket_change_threshold_rate;
594	uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
595
596	if (bucket_time_ms == 0)
597		bucket_time_ms = 50;
598	/* calculate per-bucket threshold rate */
599	threshold_rate = threshold_rate * bucket_time_ms / 1000;
600
601	getmicrouptime(&tv);
602
603	struct fib_sync_status *fd_ss = &fd->fd_ss;
604
605	bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
606
607	if (fd_ss->bucket_id == bucket_id) {
608		fd_ss->bucket_changes++;
609		if (fd_ss->bucket_changes == threshold_rate) {
610			new_delay = (bucket_id + 2) * bucket_time_ms;
611			if (new_delay <= max_delay_ms) {
612				FD_PRINTF(LOG_DEBUG, fd,
				    "hit threshold of %u routes, delay update, "
614				    "bucket: %u, total delay: %u",
615				    threshold_rate, bucket_id + 1, new_delay);
616			} else {
617				new_delay = 0;
618				FD_PRINTF(LOG_DEBUG, fd,
619				    "maximum sync delay (%u ms) reached", max_delay_ms);
620			}
621		} else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
622			new_delay = bucket_time_ms;
623	} else {
624		fd_ss->bucket_id = bucket_id;
625		fd_ss->bucket_changes = 1;
626	}
627
628	if (new_delay > 0) {
629		/* Calculated time has been updated */
630		struct timeval new_tv = fd_ss->diverge_time;
631		add_tv_diff_ms(&new_tv, new_delay);
632
633		int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
634		schedule_callout(fd, action, delay_ms);
635	}
636}
637
638static void
639update_algo_state(struct fib_data *fd)
640{
641
642	RIB_WLOCK_ASSERT(fd->fd_rh);
643
644	if (fd->fd_batch || fd->fd_need_rebuild) {
645		enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
646		update_rebuild_delay(fd, action);
647		return;
648	}
649
650	if (fd->fd_num_changes++ == 0) {
651		/* Start callout to consider switch */
652		if (!callout_pending(&fd->fd_callout))
653			schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
654	} else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
655		/* Reset callout to exec immediately */
656		if (fd->fd_callout_action == FDA_EVAL)
657			schedule_callout(fd, FDA_EVAL, 1);
658	}
659}
660
661static bool
662need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
663{
664	struct nhop_object *nh;
665
666	/* Sync addition/removal of interface routes */
667	switch (rc->rc_cmd) {
668	case RTM_ADD:
669		nh = rc->rc_nh_new;
670		if (!NH_IS_NHGRP(nh)) {
671			if (!(nh->nh_flags & NHF_GATEWAY))
672				return (true);
673			if (nhop_get_rtflags(nh) & RTF_STATIC)
674				return (true);
675		}
676		break;
677	case RTM_DELETE:
678		nh = rc->rc_nh_old;
679		if (!NH_IS_NHGRP(nh)) {
680			if (!(nh->nh_flags & NHF_GATEWAY))
681				return (true);
682			if (nhop_get_rtflags(nh) & RTF_STATIC)
683				return (true);
684		}
685		break;
686	}
687
688	return (false);
689}
690
691static bool
692apply_rtable_changes(struct fib_data *fd)
693{
694	enum flm_op_result result;
695	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
696
697	result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
698
699	if (result == FLM_SUCCESS) {
700		sync_rib_gen(fd);
701		for (int i = 0; i < q->count; i++)
702			if (q->entries[i].nh_old)
703				fib_unref_nhop(fd, q->entries[i].nh_old);
704		q->count = 0;
705	}
706	fd->fd_batch = false;
707
708	return (result == FLM_SUCCESS);
709}
710
711static bool
712fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
713{
714	int plen = 0;
715
716	switch (fd->fd_family) {
717#ifdef INET
718	case AF_INET:
719		rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
720		break;
721#endif
722#ifdef INET6
723	case AF_INET6:
724		rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
725		break;
726#endif
727	}
728
729	ce->plen = plen;
730	ce->nh_old = rc->rc_nh_old;
731	ce->nh_new = rc->rc_nh_new;
732	if (ce->nh_new != NULL) {
733		if (fib_ref_nhop(fd, ce->nh_new) == 0)
734			return (false);
735	}
736
737	return (true);
738}
739
740static bool
741queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
742{
743	struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
744
745	if (q->count >= q->size) {
746		uint32_t q_size;
747
748		if (q->size == 0)
749			q_size = 256; /* ~18k memory */
750		else
751			q_size = q->size * 2;
752
753		size_t size = q_size * sizeof(struct fib_change_entry);
754		void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
755		if (a == NULL) {
756			FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
757			    q_size);
758			return (false);
759		}
760		q->entries = a;
761		q->size = q_size;
762	}
763
764	return (fill_change_entry(fd, &q->entries[q->count++], rc));
765}
766
767/*
768 * Rib subscription handler. Checks if the algorithm is ready to
769 *  receive updates, handles nexthop refcounting and passes change
770 *  data to the algorithm callback.
771 */
772static void
773handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
774    void *_data)
775{
776	struct fib_data *fd = (struct fib_data *)_data;
777	enum flm_op_result result;
778
779	RIB_WLOCK_ASSERT(rnh);
780
781	/*
782	 * There is a small gap between subscribing for route changes
783	 *  and initiating rtable dump. Avoid receiving route changes
784	 *  prior to finishing rtable dump by checking `init_done`.
785	 */
786	if (!fd->init_done)
787		return;
788
789	bool immediate_sync = need_immediate_sync(fd, rc);
790
791	/* Consider scheduling algorithm re-evaluation */
792	update_algo_state(fd);
793
794	/*
795	 * If algo requested rebuild, stop sending updates by default.
796	 * This simplifies nexthop refcount handling logic.
797	 */
798	if (fd->fd_need_rebuild) {
799		if (immediate_sync)
800			rebuild_fd(fd, "rtable change type enforced sync");
801		return;
802	}
803
804	/*
805	 * Algo requested updates to be delivered in batches.
806	 * Add the current change to the queue and return.
807	 */
808	if (fd->fd_batch) {
809		if (immediate_sync) {
810			if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
811				rebuild_fd(fd, "batch sync failed");
812		} else {
813			if (!queue_rtable_change(fd, rc))
814				schedule_fd_rebuild(fd, "batch queue failed");
815		}
816		return;
817	}
818
819	/*
820	 * Maintain guarantee that every nexthop returned by the dataplane
	 *  lookup has > 0 refcount, so it can be safely referenced within the current
822	 *  epoch.
823	 */
824	if (rc->rc_nh_new != NULL) {
825		if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
826			/* ran out of indexes */
827			schedule_fd_rebuild(fd, "ran out of nhop indexes");
828			return;
829		}
830	}
831
832	result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
833
834	switch (result) {
835	case FLM_SUCCESS:
836		sync_rib_gen(fd);
837		/* Unref old nexthop on success */
838		if (rc->rc_nh_old != NULL)
839			fib_unref_nhop(fd, rc->rc_nh_old);
840		break;
841	case FLM_BATCH:
842
843		/*
844		 * Algo asks to batch the changes.
845		 */
846		if (queue_rtable_change(fd, rc)) {
847			if (!immediate_sync) {
848				fd->fd_batch = true;
849				mark_diverge_time(fd);
850				update_rebuild_delay(fd, FDA_BATCH);
851				break;
852			}
853			if (apply_rtable_changes(fd))
854				break;
855		}
856		FD_PRINTF(LOG_ERR, fd, "batched sync failed, force the rebuild");
857
858	case FLM_REBUILD:
859
860		/*
861		 * Algo is not able to apply the update.
862		 * Schedule algo rebuild.
863		 */
864		if (!immediate_sync) {
865			mark_diverge_time(fd);
866			schedule_fd_rebuild(fd, "algo requested rebuild");
867			break;
868		}
869
870		FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
871		rebuild_fd(fd, "rtable change type enforced sync");
872		break;
873	case FLM_ERROR:
874
875		/*
876		 * Algo reported a non-recoverable error.
877		 * Record the error and schedule rebuild, which will
878		 *  trigger best algo selection.
879		 */
880		FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
881		if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
882			FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
883		schedule_fd_rebuild(fd, "algo reported non-recoverable error");
884	}
885}
886
887static void
888estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
889{
890
891	if (old_fd == NULL) {
892		// TODO: read from rtable
893		fd->number_nhops = 16;
894		return;
895	}
896
897	if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
898		fd->number_nhops = 2 * old_fd->number_nhops;
899	else
900		fd->number_nhops = old_fd->number_nhops;
901}
902
903struct walk_cbdata {
904	struct fib_data		*fd;
905	flm_dump_t		*func;
906	enum flm_op_result	result;
907};
908
909/*
 * Handler called after all rtentries have been dumped.
911 * Performs post-dump framework checks and calls
912 * algo:flm_dump_end_cb().
913 *
914 * Updates walk_cbdata result.
915 */
916static void
917sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
918{
919	struct walk_cbdata *w = (struct walk_cbdata *)_data;
920	struct fib_data *fd = w->fd;
921
922	RIB_WLOCK_ASSERT(w->fd->fd_rh);
923
924	if (rnh->rib_dying) {
925		w->result = FLM_ERROR;
926		return;
927	}
928
929	if (fd->hit_nhops) {
930		FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
931		    fd->nh_ref_table->count);
932		if (w->result == FLM_SUCCESS)
933			w->result = FLM_REBUILD;
934		return;
935	}
936
937	if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
938		return;
939
940	/* Post-dump hook, dump successful */
941	w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
942
943	if (w->result == FLM_SUCCESS) {
944		/* Mark init as done to allow routing updates */
945		fd->init_done = 1;
946	}
947}
948
949/*
950 * Callback for each entry in rib.
951 * Calls algo:flm_dump_rib_item_cb func as a part of initial
952 *  route table synchronisation.
953 */
954static int
955sync_algo_cb(struct rtentry *rt, void *_data)
956{
957	struct walk_cbdata *w = (struct walk_cbdata *)_data;
958
959	RIB_WLOCK_ASSERT(w->fd->fd_rh);
960
961	if (w->result == FLM_SUCCESS && w->func) {
962
963		/*
		 * Reference nexthops to maintain the guarantee that
		 *  each nexthop returned by the datapath has > 0 references
		 *  and can be safely referenced within the current epoch.
967		 */
968		struct nhop_object *nh = rt_get_raw_nhop(rt);
969		if (fib_ref_nhop(w->fd, nh) != 0)
970			w->result = w->func(rt, w->fd->fd_algo_data);
971		else
972			w->result = FLM_REBUILD;
973	}
974
975	return (0);
976}
977
978/*
979 * Dump all routing table state to the algo instance.
980 */
981static enum flm_op_result
982sync_algo(struct fib_data *fd)
983{
984	struct walk_cbdata w = {
985		.fd = fd,
986		.func = fd->fd_flm->flm_dump_rib_item_cb,
987		.result = FLM_SUCCESS,
988	};
989
990	rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
991
992	FD_PRINTF(LOG_INFO, fd,
993	    "initial dump completed (rtable version: %d), result: %s",
994	    fd->fd_rh->rnh_gen, print_op_result(w.result));
995
996	return (w.result);
997}
998
999/*
1000 * Schedules epoch-backed @fd instance deletion.
1001 * * Unlinks @fd from the list of active algo instances.
1002 * * Removes rib subscription.
1003 * * Stops callout.
1004 * * Schedules actual deletion.
1005 *
 * Assumes @fd is already unlinked from the datapath.
1007 */
1008static int
1009schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
1010{
1011	bool is_dead;
1012
1013	NET_EPOCH_ASSERT();
1014	RIB_WLOCK_ASSERT(fd->fd_rh);
1015
1016	FIB_MOD_LOCK();
1017	is_dead = fd->fd_dead;
1018	if (!is_dead)
1019		fd->fd_dead = true;
1020	if (fd->fd_linked) {
1021		TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1022		fd->fd_linked = false;
1023	}
1024	FIB_MOD_UNLOCK();
1025	if (is_dead)
1026		return (0);
1027
1028	FD_PRINTF(LOG_INFO, fd, "DETACH");
1029
1030	if (fd->fd_rs != NULL)
		rib_unsubscribe_locked(fd->fd_rs);
1032
1033	/*
1034	 * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
1035	 * will be executed, hence no _new_ callout schedules will happen.
1036	 */
1037	callout_stop(&fd->fd_callout);
1038
1039	fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
1040
1041	return (0);
1042}
1043
1044/*
1045 * Wipe all fd instances from the list matching rib specified by @rh.
1046 * If @keep_first is set, remove all but the first record.
1047 */
1048static void
1049fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
1050{
1051	struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
1052	struct fib_data *fd, *fd_tmp;
1053	struct epoch_tracker et;
1054
1055	FIB_MOD_LOCK();
1056	TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
1057		if (fd->fd_rh == rh) {
1058			if (keep_first) {
1059				keep_first = false;
1060				continue;
1061			}
1062			TAILQ_REMOVE(&V_fib_data_list, fd, entries);
1063			fd->fd_linked = false;
1064			TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
1065		}
1066	}
1067	FIB_MOD_UNLOCK();
1068
1069	/* Pass 2: remove each entry */
1070	NET_EPOCH_ENTER(et);
1071	TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
1072		if (!in_callout)
1073			RIB_WLOCK(fd->fd_rh);
1074		schedule_destroy_fd_instance(fd, in_callout);
1075		if (!in_callout)
1076			RIB_WUNLOCK(fd->fd_rh);
1077	}
1078	NET_EPOCH_EXIT(et);
1079}
1080
1081void
1082fib_destroy_rib(struct rib_head *rh)
1083{
1084
1085	/*
	 * rnh has the `rib_dying` flag set, so setup of new fds will fail at
	 *  sync_algo() stage, preventing new entries from being added to the list
1088	 *  of active algos. Remove all existing entries for the particular rib.
1089	 */
1090	fib_cleanup_algo(rh, false, false);
1091}
1092
1093/*
1094 * Finalises fd destruction by freeing all fd resources.
1095 */
1096static void
1097destroy_fd_instance(struct fib_data *fd)
1098{
1099
1100	FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
1101
1102	/* Call destroy callback first */
1103	if (fd->fd_algo_data != NULL)
1104		fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
1105
1106	/* Nhop table */
1107	if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
1108		for (int i = 0; i < fd->number_nhops; i++) {
1109			if (!is_idx_free(fd, i)) {
1110				FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
1111				    i, fd->nh_idx[i]);
1112				nhop_free_any(fd->nh_idx[i]);
1113			}
1114		}
1115		free(fd->nh_idx, M_RTABLE);
1116	}
1117	if (fd->nh_ref_table != NULL)
1118		free(fd->nh_ref_table, M_RTABLE);
1119
1120	if (fd->fd_ss.fd_change_queue.entries != NULL)
1121		free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
1122
1123	fib_unref_algo(fd->fd_flm);
1124
1125	free(fd, M_RTABLE);
1126}
1127
1128/*
1129 * Epoch callback indicating fd is safe to destroy
1130 */
1131static void
1132destroy_fd_instance_epoch(epoch_context_t ctx)
1133{
1134	struct fib_data *fd;
1135
1136	fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
1137
1138	destroy_fd_instance(fd);
1139}
1140
1141/*
1142 * Tries to setup fd instance.
1143 * - Allocates fd/nhop table
1144 * - Runs algo:flm_init_cb algo init
1145 * - Subscribes fd to the rib
1146 * - Runs rtable dump
1147 * - Adds instance to the list of active instances.
1148 *
1149 * Returns: operation result. Fills in @pfd with resulting fd on success.
1150 *
1151 */
1152static enum flm_op_result
1153try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1154    struct fib_data *old_fd, struct fib_data **pfd)
1155{
1156	struct fib_data *fd;
1157	size_t size;
1158	enum flm_op_result result;
1159
1160	/* Allocate */
1161	fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
1162	if (fd == NULL)  {
1163		*pfd = NULL;
1164		RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
1165		return (FLM_REBUILD);
1166	}
1167	*pfd = fd;
1168
1169	estimate_nhop_scale(old_fd, fd);
1170
1171	fd->fd_rh = rh;
1172	fd->fd_family = rh->rib_family;
1173	fd->fd_fibnum = rh->rib_fibnum;
1174	callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
1175	fd->fd_vnet = curvnet;
1176	fd->fd_flm = flm;
1177
1178	FIB_MOD_LOCK();
1179	flm->flm_refcount++;
1180	fd->fd_gen = ++fib_gen;
1181	FIB_MOD_UNLOCK();
1182
1183	FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
1184
1185	/* Allocate nhidx -> nhop_ptr table */
1186	size = fd->number_nhops * sizeof(void *);
1187	fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1188	if (fd->nh_idx == NULL) {
1189		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
1190		return (FLM_REBUILD);
1191	}
1192
1193	/* Allocate nhop index refcount table */
1194	size = sizeof(struct nhop_ref_table);
1195	size += fd->number_nhops * sizeof(uint32_t);
1196	fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
1197	if (fd->nh_ref_table == NULL) {
1198		FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
1199		return (FLM_REBUILD);
1200	}
1201	FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
1202
1203	/* Okay, we're ready for algo init */
1204	void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
1205	result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
1206	if (result != FLM_SUCCESS) {
1207		FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
1208		return (result);
1209	}
1210
1211	/* Try to subscribe */
1212	if (flm->flm_change_rib_item_cb != NULL) {
1213		fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
1214		    handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
1215		if (fd->fd_rs == NULL) {
1216			FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
1217			return (FLM_REBUILD);
1218		}
1219	}
1220
1221	/* Dump */
1222	result = sync_algo(fd);
1223	if (result != FLM_SUCCESS) {
1224		FD_PRINTF(LOG_INFO, fd, "rib sync failed");
1225		return (result);
1226	}
1227	FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
1228
1229	FIB_MOD_LOCK();
1230	/*
	 * Insert fd at the beginning of the list, to maintain the invariant
	 *  that the first matching entry for the AF/fib is always the active
1233	 *  one.
1234	 */
1235	TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
1236	fd->fd_linked = true;
1237	FIB_MOD_UNLOCK();
1238
1239	return (FLM_SUCCESS);
1240}
1241
1242/*
1243 * Sets up algo @flm for table @rh and links it to the datapath.
1244 *
1245 */
1246static enum flm_op_result
1247setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
1248    struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
1249{
1250	struct fib_data *prev_fd, *new_fd;
1251	enum flm_op_result result;
1252
1253	NET_EPOCH_ASSERT();
1254	RIB_WLOCK_ASSERT(rh);
1255
1256	prev_fd = orig_fd;
1257	new_fd = NULL;
1258	for (int i = 0; i < FIB_MAX_TRIES; i++) {
1259		result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
1260
1261		if ((result == FLM_SUCCESS) && attach) {
1262			if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
1263				sync_rib_gen(new_fd);
1264			else
1265				result = FLM_REBUILD;
1266		}
1267
1268		if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
1269			schedule_destroy_fd_instance(prev_fd, false);
1270			prev_fd = NULL;
1271		}
1272
1273		RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
1274		    print_op_result(result));
1275
1276		if (result == FLM_REBUILD) {
1277			prev_fd = new_fd;
1278			new_fd = NULL;
1279			continue;
1280		}
1281
1282		break;
1283	}
1284
1285	if (result != FLM_SUCCESS) {
1286		RH_PRINTF(LOG_WARNING, rh,
1287		    "%s algo instance setup failed, failures=%d", flm->flm_name,
1288		    orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
1289		/* update failure count */
1290		FIB_MOD_LOCK();
1291		if (orig_fd != NULL)
1292			orig_fd->fd_failed_rebuilds++;
1293		FIB_MOD_UNLOCK();
1294
1295		/* Ban algo on non-recoverable error */
1296		if (result == FLM_ERROR)
1297			flm_error_add(flm, rh->rib_fibnum);
1298
1299		if ((prev_fd != NULL) && (prev_fd != orig_fd))
1300			schedule_destroy_fd_instance(prev_fd, false);
1301		if (new_fd != NULL) {
1302			schedule_destroy_fd_instance(new_fd, false);
1303			new_fd = NULL;
1304		}
1305	}
1306
1307	*pfd = new_fd;
1308	return (result);
1309}
1310
1311/*
1312 * Tries to sync algo with the current rtable state, either
1313 * by executing batch update or rebuilding.
1314 * Returns true on success.
1315 */
1316static bool
1317execute_callout_action(struct fib_data *fd)
1318{
1319	enum fib_callout_action action = fd->fd_callout_action;
1320	struct fib_lookup_module *flm_new = NULL;
1321	bool result = true;
1322
1323	NET_EPOCH_ASSERT();
1324	RIB_WLOCK_ASSERT(fd->fd_rh);
1325
1326	fd->fd_need_rebuild = false;
1327	fd->fd_batch = false;
1328	fd->fd_num_changes = 0;
1329
1330	/* First, check if we're still OK to use this algo */
1331	if (!is_algo_fixed(fd->fd_rh))
1332		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1333	if (flm_new != NULL)
1334		action = FDA_REBUILD;
1335
1336	if (action == FDA_BATCH) {
1337		/* Try to sync */
1338		if (!apply_rtable_changes(fd))
1339			action = FDA_REBUILD;
1340	}
1341
1342	if (action == FDA_REBUILD)
1343		result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1344	if (flm_new != NULL)
1345		fib_unref_algo(flm_new);
1346
1347	return (result);
1348}
1349
1350/*
1351 * Callout for all scheduled fd-related work.
1352 * - Checks if the current algo is still the best algo
 * - Synchronises the algo instance to the rtable (batch use case)
1354 * - Creates a new instance of an algo for af/fib if desired.
1355 */
1356static void
1357handle_fd_callout(void *_data)
1358{
1359	struct fib_data *fd = (struct fib_data *)_data;
1360	struct epoch_tracker et;
1361
1362	FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
1363
1364	NET_EPOCH_ENTER(et);
1365	CURVNET_SET(fd->fd_vnet);
1366	execute_callout_action(fd);
1367	CURVNET_RESTORE();
1368	NET_EPOCH_EXIT(et);
1369}
1370
1371/*
1372 * Tries to create new algo instance based on @fd data.
1373 * Returns true on success.
1374 */
1375static bool
1376rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
1377{
1378	struct fib_data *fd_new, *fd_tmp = NULL;
	enum flm_op_result result;
1380
1381	if (flm_new == fd->fd_flm)
1382		fd_tmp = fd;
1383	else
1384		FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
1385
1386	result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
1387	if (result != FLM_SUCCESS) {
1388		FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
1389		return (false);
1390	}
1391	FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
1392
1393	/* Remove old instance */
1394	schedule_destroy_fd_instance(fd, true);
1395
1396	return (true);
1397}
1398
1399static bool
1400rebuild_fd(struct fib_data *fd, const char *reason)
1401{
1402	struct fib_lookup_module *flm_new = NULL;
1403	bool result;
1404
1405	if (!is_algo_fixed(fd->fd_rh))
1406		flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
1407
1408	FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
1409	result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
1410	if (flm_new != NULL)
1411		fib_unref_algo(flm_new);
1412
1413	if (!result) {
1414		FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
1415		schedule_fd_rebuild(fd, "sync rebuild failed");
1416	}
1417
1418	return (result);
1419}
1420
1421/*
1422 * Finds algo by name/family.
1423 * Returns referenced algo or NULL.
1424 */
1425static struct fib_lookup_module *
1426fib_find_algo(const char *algo_name, int family)
1427{
1428	struct fib_lookup_module *flm;
1429
1430	FIB_MOD_LOCK();
1431	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1432		if ((strcmp(flm->flm_name, algo_name) == 0) &&
1433		    (family == flm->flm_family)) {
1434			flm->flm_refcount++;
1435			FIB_MOD_UNLOCK();
1436			return (flm);
1437		}
1438	}
1439	FIB_MOD_UNLOCK();
1440
1441	return (NULL);
1442}
1443
1444static void
1445fib_unref_algo(struct fib_lookup_module *flm)
1446{
1447
1448	FIB_MOD_LOCK();
1449	flm->flm_refcount--;
1450	FIB_MOD_UNLOCK();
1451}
1452
1453static int
1454set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
1455{
1456	struct fib_lookup_module *flm = NULL;
1457	struct fib_data *fd = NULL;
1458	char old_algo_name[32], algo_name[32];
1459	struct rib_head *rh = NULL;
1460	enum flm_op_result result;
1461	struct epoch_tracker et;
1462	int error;
1463
1464	/* Fetch current algo/rib for af/family */
1465	FIB_MOD_LOCK();
1466	TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
1467		if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
1468			break;
1469	}
1470	if (fd == NULL) {
1471		FIB_MOD_UNLOCK();
1472		return (ENOENT);
1473	}
1474	rh = fd->fd_rh;
1475	strlcpy(old_algo_name, fd->fd_flm->flm_name,
1476	    sizeof(old_algo_name));
1477	FIB_MOD_UNLOCK();
1478
1479	strlcpy(algo_name, old_algo_name, sizeof(algo_name));
1480	error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
1481	if (error != 0 || req->newptr == NULL)
1482		return (error);
1483
1484	if (strcmp(algo_name, old_algo_name) == 0)
1485		return (0);
1486
1487	/* New algorithm name is different */
1488	flm = fib_find_algo(algo_name, family);
1489	if (flm == NULL) {
1490		RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
1491		return (ESRCH);
1492	}
1493
1494	fd = NULL;
1495	NET_EPOCH_ENTER(et);
1496	RIB_WLOCK(rh);
1497	result = setup_fd_instance(flm, rh, NULL, &fd, true);
1498	RIB_WUNLOCK(rh);
1499	NET_EPOCH_EXIT(et);
1500	fib_unref_algo(flm);
1501	if (result != FLM_SUCCESS)
1502		return (EINVAL);
1503
1504	/* Disable automated jumping between algos */
1505	FIB_MOD_LOCK();
1506	set_algo_fixed(rh);
1507	FIB_MOD_UNLOCK();
1508	/* Remove old instance(s) */
1509	fib_cleanup_algo(rh, true, false);
1510
1511	/* Drain cb so user can unload the module after userret if so desired */
1512	epoch_drain_callbacks(net_epoch_preempt);
1513
1514	return (0);
1515}
1516
1517#ifdef INET
1518static int
1519set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
1520{
1521
1522	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
1523}
1524SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
1525    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1526    set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
1527#endif
1528
1529#ifdef INET6
1530static int
1531set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
1532{
1533
1534	return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
1535}
1536SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
1537    CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
1538    set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
1539#endif
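
/*
 * Example: pinning a particular algorithm from userland (assuming the
 *  corresponding module, e.g. "radix4_lockless", is available):
 *
 *	sysctl net.route.algo.inet.algo=radix4_lockless
 *
 * After a successful switch the automatic algo selection for that family
 *  is disabled (see set_algo_fixed()).
 */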
1540
1541static struct nhop_object *
1542dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
1543{
1544	return (NULL);
1545}
1546
1547static void
1548destroy_fdh_epoch(epoch_context_t ctx)
1549{
1550	struct fib_dp_header *fdh;
1551
1552	fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
1553	free(fdh, M_RTABLE);
1554}
1555
1556static struct fib_dp_header *
1557alloc_fib_dp_array(uint32_t num_tables, bool waitok)
1558{
1559	size_t sz;
1560	struct fib_dp_header *fdh;
1561
1562	sz = sizeof(struct fib_dp_header);
1563	sz += sizeof(struct fib_dp) * num_tables;
1564	fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
1565	if (fdh != NULL) {
1566		fdh->fdh_num_tables = num_tables;
1567		/*
1568		 * Set dummy lookup function ptr always returning NULL, so
1569		 * we can delay algo init.
1570		 */
1571		for (uint32_t i = 0; i < num_tables; i++)
1572			fdh->fdh_idx[i].f = dummy_lookup;
1573	}
1574	return (fdh);
1575}
1576
1577static struct fib_dp_header *
1578get_fib_dp_header(struct fib_dp *dp)
1579{
1580
1581	return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
1582}
1583
1584/*
1585 * Replace per-family index pool @pdp with a new one which
1586 * contains updated callback/algo data from @fd.
1587 * Returns true on success.
1588 */
1589static bool
1590replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
1591{
1592	struct fib_dp_header *new_fdh, *old_fdh;
1593
1594	NET_EPOCH_ASSERT();
1595
1596	FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
1597	    curvnet, dp->f, dp->arg);
1598
1599	FIB_MOD_LOCK();
1600	old_fdh = get_fib_dp_header(*pdp);
1601
1602	if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
1603		/*
1604		 * Function is the same, data pointer needs update.
1605		 * Perform in-line replace without reallocation.
1606		 */
1607		old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
1608		FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
1609		FIB_MOD_UNLOCK();
1610		return (true);
1611	}
1612
1613	new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
1614	FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
1615	if (new_fdh == NULL) {
1616		FIB_MOD_UNLOCK();
1617		FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
1618		return (false);
1619	}
1620
1621	memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1622	    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1623	/* Update relevant data structure for @fd */
1624	new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
1625
1626	/* Ensure memcpy() writes have completed */
1627	atomic_thread_fence_rel();
1628	/* Set new datapath pointer */
1629	*pdp = &new_fdh->fdh_idx[0];
1630	FIB_MOD_UNLOCK();
1631	FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
1632
1633	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1634
1635	return (true);
1636}
1637
1638static struct fib_dp **
1639get_family_dp_ptr(int family)
1640{
1641	switch (family) {
1642#ifdef INET
1643	case AF_INET:
1644		return (&V_inet_dp);
1645#endif
1646#ifdef INET6
1647	case AF_INET6:
1648		return (&V_inet6_dp);
1649#endif
1650	}
1651	return (NULL);
1652}
1653
1654/*
1655 * Make datapath use fib instance @fd
1656 */
1657bool
1658fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
1659{
1660	struct fib_dp **pdp;
1661
1662	pdp = get_family_dp_ptr(fd->fd_family);
1663	return (replace_rtables_family(pdp, fd, dp));
1664}
1665
1666/*
1667 * Grow datapath pointers array.
1668 * Called from sysctl handler on growing number of routing tables.
1669 */
1670static void
1671grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
1672{
1673	struct fib_dp_header *new_fdh, *old_fdh = NULL;
1674
1675	new_fdh = alloc_fib_dp_array(new_num_tables, true);
1676
1677	FIB_MOD_LOCK();
1678	if (*pdp != NULL) {
1679		old_fdh = get_fib_dp_header(*pdp);
1680		memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
1681		    old_fdh->fdh_num_tables * sizeof(struct fib_dp));
1682	}
1683
1684	/* Wait till all writes completed */
1685	atomic_thread_fence_rel();
1686
1687	*pdp = &new_fdh->fdh_idx[0];
1688	FIB_MOD_UNLOCK();
1689
1690	if (old_fdh != NULL)
1691		fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
1692}
1693
1694/*
1695 * Grows per-AF arrays of datapath pointers for each supported family.
1696 * Called from fibs resize sysctl handler.
1697 */
1698void
1699fib_grow_rtables(uint32_t new_num_tables)
1700{
1701
1702#ifdef INET
1703	grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
1704#endif
1705#ifdef INET6
1706	grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
1707#endif
1708}
1709
1710void
1711fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
1712{
1713
1714	bzero(rinfo, sizeof(struct rib_rtable_info));
1715	rinfo->num_prefixes = rh->rnh_prefixes;
1716	rinfo->num_nhops = nhops_get_count(rh);
1717#ifdef ROUTE_MPATH
1718	rinfo->num_nhgrp = nhgrp_get_count(rh);
1719#endif
1720}
1721
1722/*
1723 * Updates pointer to the algo data for the @fd.
1724 */
1725void
1726fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
1727{
1728	RIB_WLOCK_ASSERT(fd->fd_rh);
1729
1730	fd->fd_algo_data = algo_data;
1731}
1732
1733/*
1734 * Calls @callback with @ctx after the end of a current epoch.
1735 */
1736void
1737fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
1738{
1739	epoch_call(net_epoch_preempt, callback, ctx);
1740}
1741
1742/*
1743 * Accessor to get rib instance @fd is attached to.
1744 */
1745struct rib_head *
1746fib_get_rh(struct fib_data *fd)
1747{
1748
1749	return (fd->fd_rh);
1750}
1751
1752/*
1753 * Accessor to export idx->nhop array
1754 */
1755struct nhop_object **
1756fib_get_nhop_array(struct fib_data *fd)
1757{
1758
1759	return (fd->nh_idx);
1760}
1761
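/*
 * Maps nexthops and nexthop groups into a single index space: with
 *  ROUTE_MPATH, plain nexthops get even slots (idx * 2) and nexthop groups
 *  get odd slots (idx * 2 - 1), so both kinds can share one idx->ptr table.
 */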
1762static uint32_t
1763get_nhop_idx(struct nhop_object *nh)
1764{
1765#ifdef ROUTE_MPATH
1766	if (NH_IS_NHGRP(nh))
1767		return (nhgrp_get_idx((struct nhgrp_object *)nh) * 2 - 1);
1768	else
1769		return (nhop_get_idx(nh) * 2);
1770#else
1771	return (nhop_get_idx(nh));
1772#endif
1773}
1774
1775uint32_t
1776fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
1777{
1778
1779	return (get_nhop_idx(nh));
1780}
1781
1782static bool
1783is_idx_free(struct fib_data *fd, uint32_t index)
1784{
1785
1786	return (fd->nh_ref_table->refcnt[index] == 0);
1787}
1788
1789static uint32_t
1790fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
1791{
1792	uint32_t idx = get_nhop_idx(nh);
1793
1794	if (idx >= fd->number_nhops) {
1795		fd->hit_nhops = 1;
1796		return (0);
1797	}
1798
1799	if (is_idx_free(fd, idx)) {
1800		nhop_ref_any(nh);
1801		fd->nh_idx[idx] = nh;
1802		fd->nh_ref_table->count++;
1803		FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
1804	}
1805	fd->nh_ref_table->refcnt[idx]++;
1806
1807	return (idx);
1808}
1809
1810struct nhop_release_data {
1811	struct nhop_object	*nh;
1812	struct epoch_context	ctx;
1813};
1814
1815static void
1816release_nhop_epoch(epoch_context_t ctx)
1817{
1818	struct nhop_release_data *nrd;
1819
1820	nrd = __containerof(ctx, struct nhop_release_data, ctx);
1821	nhop_free_any(nrd->nh);
1822	free(nrd, M_TEMP);
1823}
1824
1825/*
1826 * Delays nexthop refcount release.
 * The datapath may not have its data structures updated yet, so the old
 *  nexthop may still be returned until the end of the current epoch. Delay
 *  the refcount removal, as we may be removing the last reference, which will
 *  trigger nexthop deletion, rendering the returned nexthop invalid.
1831 */
1832static void
1833fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
1834{
1835	struct nhop_release_data *nrd;
1836
1837	nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
1838	if (nrd != NULL) {
1839		nrd->nh = nh;
1840		fib_epoch_call(release_nhop_epoch, &nrd->ctx);
1841	} else {
1842		/*
1843		 * Unable to allocate memory. Leak nexthop to maintain guarantee
1844		 *  that each nhop can be referenced.
1845		 */
1846		FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
1847	}
1848}
1849
1850static void
1851fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
1852{
1853	uint32_t idx = get_nhop_idx(nh);
1854
1855	KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
	KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
1857
1858	fd->nh_ref_table->refcnt[idx]--;
1859	if (fd->nh_ref_table->refcnt[idx] == 0) {
1860		FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
1861		fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
1862	}
1863}
1864
1865static void
1866set_algo_fixed(struct rib_head *rh)
1867{
1868	switch (rh->rib_family) {
1869#ifdef INET
1870	case AF_INET:
1871		V_algo_fixed_inet = true;
1872		break;
1873#endif
1874#ifdef INET6
1875	case AF_INET6:
1876		V_algo_fixed_inet6 = true;
1877		break;
1878#endif
1879	}
1880}
1881
1882static bool
1883is_algo_fixed(struct rib_head *rh)
1884{
1885
1886	switch (rh->rib_family) {
1887#ifdef INET
1888	case AF_INET:
1889		return (V_algo_fixed_inet);
1890#endif
1891#ifdef INET6
1892	case AF_INET6:
1893		return (V_algo_fixed_inet6);
1894#endif
1895	}
1896	return (false);
1897}
1898
1899/*
1900 * Runs the check on what would be the best algo for rib @rh, assuming
1901 *  that the current algo is the one specified by @orig_flm. Note that
1902 *  it can be NULL for initial selection.
1903 *
1904 * Returns referenced new algo or NULL if the current one is the best.
1905 */
1906static struct fib_lookup_module *
1907fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
1908{
1909	uint8_t preference, curr_preference = 0, best_preference = 0;
1910	struct fib_lookup_module *flm, *best_flm = NULL;
1911	struct rib_rtable_info rinfo;
1912	int candidate_algos = 0;
1913
1914	fib_get_rtable_info(rh, &rinfo);
1915
1916	FIB_MOD_LOCK();
1917	TAILQ_FOREACH(flm, &all_algo_list, entries) {
1918		if (flm->flm_family != rh->rib_family)
1919			continue;
1920		candidate_algos++;
1921		preference = flm->flm_get_pref(&rinfo);
1922		if (preference > best_preference) {
1923			if (!flm_error_check(flm, rh->rib_fibnum)) {
1924				best_preference = preference;
1925				best_flm = flm;
1926			}
1927		}
1928		if (flm == orig_flm)
1929			curr_preference = preference;
1930	}
1931	if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
1932		best_flm->flm_refcount++;
1933	else
1934		best_flm = NULL;
1935	FIB_MOD_UNLOCK();
1936
1937	RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
1938	    candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
1939	    best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
1940	    best_preference);
1941
1942	return (best_flm);
1943}
1944
1945/*
1946 * Called when new route table is created.
1947 * Selects, allocates and attaches fib algo for the table.
1948 */
1949static bool
1950fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
1951{
1952	struct fib_lookup_module *flm;
1953	struct fib_data *fd = NULL;
1954	enum flm_op_result result;
1955	struct epoch_tracker et;
1956
1957	flm = fib_check_best_algo(rh, NULL);
1958	if (flm == NULL) {
1959		RH_PRINTF(LOG_CRIT, rh, "no algo selected");
1960		return (false);
1961	}
1962	RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
1963
1964	NET_EPOCH_ENTER(et);
1965	RIB_WLOCK(rh);
1966	result = setup_fd_instance(flm, rh, NULL, &fd, false);
1967	RIB_WUNLOCK(rh);
1968	NET_EPOCH_EXIT(et);
1969
1970	RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
1971	if (result == FLM_SUCCESS)
1972		*dp = fd->fd_dp;
1973	else
1974		RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
1975
1976	fib_unref_algo(flm);
1977
1978	return (result == FLM_SUCCESS);
1979}
1980
1981/*
1982 * Sets up fib algo instances for the non-initialized RIBs in the @family.
 * Allocates a temporary datapath index to amortize datapath index updates
1984 * with large @num_tables.
1985 */
1986void
1987fib_setup_family(int family, uint32_t num_tables)
1988{
1989	struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
1990	if (new_fdh == NULL) {
1991		ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
1992		return;
1993	}
1994
1995	for (int i = 0; i < num_tables; i++) {
1996		struct rib_head *rh = rt_tables_get_rnh(i, family);
1997		if (rh->rib_algo_init)
1998			continue;
1999		if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
2000			continue;
2001
2002		rh->rib_algo_init = true;
2003	}
2004
2005	FIB_MOD_LOCK();
2006	struct fib_dp **pdp = get_family_dp_ptr(family);
2007	struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
2008
2009	/* Update the items not touched by the new init, from the old data pointer */
2010	for (int i = 0; i < num_tables; i++) {
2011		if (new_fdh->fdh_idx[i].f == dummy_lookup)
2012			new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
2013	}
2014
2015	/* Ensure all index writes have completed */
2016	atomic_thread_fence_rel();
2017	/* Set new datapath pointer */
2018	*pdp = &new_fdh->fdh_idx[0];
2019
2020	FIB_MOD_UNLOCK();
2021
2022	fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
2023}
2024
2025/*
2026 * Registers fib lookup module within the subsystem.
2027 */
2028int
2029fib_module_register(struct fib_lookup_module *flm)
2030{
2031
2032	FIB_MOD_LOCK();
2033	ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
2034	    print_family(flm->flm_family));
2035	TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
2036	FIB_MOD_UNLOCK();
2037
2038	return (0);
2039}
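
/*
 * A minimal sketch of what a lookup module registers (hypothetical names;
 *  the callback fields shown are the ones this framework invokes):
 *
 *	static struct fib_lookup_module flm_example4 = {
 *		.flm_name = "example4",
 *		.flm_family = AF_INET,
 *		.flm_init_cb = example_init,
 *		.flm_destroy_cb = example_destroy,
 *		.flm_dump_rib_item_cb = example_dump_rib_item,
 *		.flm_dump_end_cb = example_dump_end,
 *		.flm_change_rib_item_cb = example_change_rib_item,
 *		.flm_get_pref = example_get_pref,
 *	};
 *	...
 *	fib_module_register(&flm_example4);
 *
 * Unregistering fails with EBUSY while any fib still references the module.
 */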
2040
2041/*
2042 * Tries to unregister fib lookup module.
2043 *
2044 * Returns 0 on success, EBUSY if module is still used
2045 *  by some of the tables.
2046 */
2047int
2048fib_module_unregister(struct fib_lookup_module *flm)
2049{
2050
2051	FIB_MOD_LOCK();
2052	if (flm->flm_refcount > 0) {
2053		FIB_MOD_UNLOCK();
2054		return (EBUSY);
2055	}
2056	fib_error_clear_flm(flm);
2057	ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
2058	    print_family(flm->flm_family));
2059	TAILQ_REMOVE(&all_algo_list, flm, entries);
2060	FIB_MOD_UNLOCK();
2061
2062	return (0);
2063}
2064
2065void
2066vnet_fib_init(void)
2067{
2068
2069	TAILQ_INIT(&V_fib_data_list);
2070}
2071
2072void
2073vnet_fib_destroy(void)
2074{
2075
2076	FIB_MOD_LOCK();
2077	fib_error_clear();
2078	FIB_MOD_UNLOCK();
2079}
2080