/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012-2021 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
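/*
 * Worked example (illustrative only, using the DXR_TRIE_BITS == 20 default
 * configured below): the 32-bit key 0xc0a80102 (192.168.1.2) splits into a
 * 20-bit trie index 0xc0a80 and a 12-bit remainder 0x102.  With DXR2, the
 * trie index is itself consumed in two stages: in the initial D16X4 split
 * the upper 16 bits (0xc0a8) select a d_tbl entry, which locates a block of
 * 16 direct_entry descriptors in x_tbl indexed by the next 4 bits (0x0);
 * the build may later shift this split to whatever D/X ratio yields the
 * most compact tables.  The 12-bit remainder is resolved by binary search
 * over a sorted range array, unless the descriptor already encodes a
 * direct hit.
 */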

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define	DXR_TRIE_BITS		20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define	DXR2

#if DXR_TRIE_BITS > 16
#define	DXR_D			16
#else
#define	DXR_D			(DXR_TRIE_BITS - 1)
#endif
#define	DXR_X			(DXR_TRIE_BITS - DXR_D)

#define	D_TBL_SIZE		(1 << DXR_D)
#define	DIRECT_TBL_SIZE		(1 << DXR_TRIE_BITS)
#define	DXR_RANGE_MASK		(0xffffffffU >> DXR_TRIE_BITS)
#define	DXR_RANGE_SHIFT		(32 - DXR_TRIE_BITS)

#define	DESC_BASE_BITS		22
#define	DESC_FRAGMENTS_BITS	(32 - DESC_BASE_BITS)
#define	BASE_MAX		((1 << DESC_BASE_BITS) - 1)
#define	RTBL_SIZE_INCR		(BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define	FRAGS_MASK_SHORT	((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define	FRAGS_MASK_SHORT	0
#endif
#define	FRAGS_PREF_SHORT	(((1 << DESC_FRAGMENTS_BITS) - 1) & \
				 ~FRAGS_MASK_SHORT)
#define	FRAGS_MARK_XL		(FRAGS_PREF_SHORT - 1)
#define	FRAGS_MARK_HIT		(FRAGS_PREF_SHORT - 2)

#define	IS_SHORT_FORMAT(x)	((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define	IS_LONG_FORMAT(x)	((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define	IS_XL_FORMAT(x)		(x == FRAGS_MARK_XL)

#define	RE_SHORT_MAX_NH		((1 << (DXR_TRIE_BITS - 8)) - 1)
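/*
 * Summary of the direct_entry "fragments" encodings used below (numeric
 * values given for the DXR_TRIE_BITS == 20 default, for illustration only):
 * FRAGS_MARK_HIT (0x3f6) means the descriptor's base field already holds
 * the final nexthop index, so no range search is needed; FRAGS_MARK_XL
 * (0x3f7) means the fragment count is too large to fit in the descriptor
 * and is instead stored in the first range table slot of the chunk; a
 * value with all FRAGS_PREF_SHORT (0x3f8) bits set selects the packed
 * 16-bit range entry format; anything else is a plain fragment count for
 * the 32-bit (long) format.
 */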

#define	CHUNK_HASH_BITS		16
#define	CHUNK_HASH_SIZE		(1 << CHUNK_HASH_BITS)
#define	CHUNK_HASH_MASK		(CHUNK_HASH_SIZE - 1)

#define	TRIE_HASH_BITS		16
#define	TRIE_HASH_SIZE		(1 << TRIE_HASH_BITS)
#define	TRIE_HASH_MASK		(TRIE_HASH_SIZE - 1)

#define	XTBL_SIZE_INCR		(DIRECT_TBL_SIZE / 16)

/* Lookup structure elements */

struct direct_entry {
	uint32_t		fragments: DESC_FRAGMENTS_BITS,
				base: DESC_BASE_BITS;
};

struct range_entry_long {
	uint32_t		start: DXR_RANGE_SHIFT,
				nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
	uint16_t		start: DXR_RANGE_SHIFT - 8,
				nexthop: DXR_TRIE_BITS - 8;
};
#endif
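/*
 * Illustrative layout, assuming the DXR_TRIE_BITS == 20 default: a long
 * range entry packs a 12-bit range start offset and a 20-bit nexthop index
 * into 32 bits, while a short entry packs a 4-bit start (the offset with
 * its low 8 bits dropped) and a 12-bit nexthop into 16 bits, so two short
 * entries fit into a single long-entry slot of the range table.
 */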

/* Auxiliary structures */

struct heap_entry {
	uint32_t		start;
	uint32_t		end;
	uint32_t		preflen;
	uint32_t		nexthop;
};

struct chunk_desc {
	LIST_ENTRY(chunk_desc)	cd_all_le;
	LIST_ENTRY(chunk_desc)	cd_hash_le;
	uint32_t		cd_hash;
	uint32_t		cd_refcnt;
	uint32_t		cd_base;
	uint32_t		cd_cur_size;
	uint32_t		cd_max_size;
};

struct trie_desc {
	LIST_ENTRY(trie_desc)	td_all_le;
	LIST_ENTRY(trie_desc)	td_hash_le;
	uint32_t		td_hash;
	uint32_t		td_index;
	uint32_t		td_refcnt;
};

struct dxr_aux {
	/* Glue to external state */
	struct fib_data		*fd;
	uint32_t		fibnum;
	int			refcnt;

	/* Auxiliary build-time tables */
	struct direct_entry	direct_tbl[DIRECT_TBL_SIZE];
	uint16_t		d_tbl[D_TBL_SIZE];
	struct direct_entry	*x_tbl;
	union {
		struct range_entry_long	re;
		uint32_t	fragments;
	}			*range_tbl;

	/* Auxiliary internal state */
	uint32_t		updates_mask[DIRECT_TBL_SIZE / 32];
	struct trie_desc	*trietbl[D_TBL_SIZE];
	LIST_HEAD(, chunk_desc)	chunk_hashtbl[CHUNK_HASH_SIZE];
	LIST_HEAD(, chunk_desc)	all_chunks;
	LIST_HEAD(, chunk_desc) unused_chunks; /* abuses hash link entry */
	LIST_HEAD(, trie_desc)	trie_hashtbl[TRIE_HASH_SIZE];
	LIST_HEAD(, trie_desc)	all_trie;
	LIST_HEAD(, trie_desc)	unused_trie; /* abuses hash link entry */
	struct sockaddr_in	dst;
	struct sockaddr_in	mask;
	struct heap_entry	heap[33];
	uint32_t		prefixes;
	uint32_t		updates_low;
	uint32_t		updates_high;
	uint32_t		all_chunks_cnt;
	uint32_t		unused_chunks_cnt;
	uint32_t		xtbl_size;
	uint32_t		all_trie_cnt;
	uint32_t		unused_trie_cnt;
	uint32_t		trie_rebuilt_prefixes;
	uint32_t		heap_index;
	uint32_t		d_bits;
	uint32_t		rtbl_size;
	uint32_t		rtbl_top;
	uint32_t		rtbl_work_frags;
	uint32_t		work_chunk;
};

/* Main lookup structure container */

struct dxr {
	/* Lookup tables */
	uint16_t		d_shift;
	uint16_t		x_shift;
	uint32_t		x_mask;
	void			*d;
	void			*x;
	void			*r;
	struct nhop_object	**nh_tbl;

	/* Glue to external state */
	struct dxr_aux		*aux;
	struct fib_data		*fd;
	struct epoch_context	epoch_ctx;
	uint32_t		fibnum;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

VNET_DEFINE_STATIC(int, max_trie_holes) = 8;
#define	V_max_trie_holes	VNET(max_trie_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_trie_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_trie_holes), 0,
    "Trie fragmentation threshold before triggering a full rebuild");

VNET_DEFINE_STATIC(int, max_range_holes) = 16;
#define	V_max_range_holes	VNET(max_range_holes)
SYSCTL_INT(_net_route_algo_dxr, OID_AUTO, max_range_holes,
    CTLFLAG_RW | CTLFLAG_VNET, &VNET_NAME(max_range_holes), 0,
    "Range table fragmentation threshold before triggering a full rebuild");

/* Binary search for a matching address range */
#define	DXR_LOOKUP_STAGE					\
	if (masked_dst < range[middle].start) {			\
		upperbound = middle;				\
		middle = (middle + lowerbound) / 2;		\
	} else if (masked_dst < range[middle + 1].start)	\
		return (range[middle].nexthop);			\
	else {							\
		lowerbound = middle + 1;			\
		middle = (upperbound + middle + 1) / 2;		\
	}							\
	if (upperbound == lowerbound)				\
		return (range[lowerbound].nexthop);

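/*
 * Longest prefix match on a single IPv4 key.  The upper DXR_TRIE_BITS of
 * the destination index the trie (in one or two stages, depending on DXR2);
 * if the resulting descriptor is not a direct hit, the remaining bits are
 * resolved by binary search over the chunk's sorted range array, two
 * unrolled DXR_LOOKUP_STAGE iterations per loop.  Returns a nexthop index
 * suitable for indexing dxr->nh_tbl[].
 */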
static int
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
#ifdef DXR2
	uint16_t *dt = dxr->d;
	struct direct_entry *xt = dxr->x;
	int xi;
#else
	struct direct_entry *dt = dxr->d;
#endif
	struct direct_entry de;
	struct range_entry_long	*rt;
	uint32_t base;
	uint32_t upperbound;
	uint32_t middle;
	uint32_t lowerbound;
	uint32_t masked_dst;

#ifdef DXR2
	xi = (dt[dst >> dxr->d_shift] << dxr->x_shift) +
	    ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask);
	de = xt[xi];
#else
	de = dt[dst >> DXR_RANGE_SHIFT];
#endif

	if (__predict_true(de.fragments == FRAGS_MARK_HIT))
		return (de.base);

	rt = dxr->r;
	base = de.base;
	lowerbound = 0;
	masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
	if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
		upperbound = de.fragments & FRAGS_MASK_SHORT;
		struct range_entry_short *range =
		    (struct range_entry_short *) &rt[base];

		masked_dst >>= 8;
		middle = upperbound;
		upperbound = upperbound * 2 + 1;

		for (;;) {
			DXR_LOOKUP_STAGE
			DXR_LOOKUP_STAGE
		}
	}
#endif

	upperbound = de.fragments;
	middle = upperbound / 2;
	struct range_entry_long *range = &rt[base];
	if (__predict_false(IS_XL_FORMAT(de.fragments))) {
		upperbound = *((uint32_t *) range);
		range++;
		middle = upperbound / 2;
	}

	for (;;) {
		DXR_LOOKUP_STAGE
		DXR_LOOKUP_STAGE
	}
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
	struct heap_entry *fhp = &da->heap[0];
	struct rtentry *rt;
	struct route_nhop_data rnd;

	da->heap_index = 0;
	da->dst.sin_addr.s_addr = htonl(dst_u32);
	rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
	    &rnd);
	if (rt != NULL) {
		struct in_addr addr;
		uint32_t scopeid;

		rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
		fhp->start = ntohl(addr.s_addr);
		fhp->end = fhp->start;
		if (fhp->preflen < 32)
			fhp->end |= (0xffffffffU >> fhp->preflen);
		fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
	} else {
		fhp->preflen = fhp->nexthop = fhp->start = 0;
		fhp->end = 0xffffffffU;
	}
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

	if (IS_SHORT_FORMAT(fdesc->fragments))
		return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
	else if (IS_XL_FORMAT(fdesc->fragments))
		return (da->range_tbl[fdesc->base].fragments + 2);
	else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
		return (fdesc->fragments + 1);
}

static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
	uint32_t size = chunk_size(da, fdesc);
	uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
	uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
	uint32_t hash = fdesc->fragments;

	for (; p < l; p++)
		hash = (hash << 7) + (hash >> 13) + *p;

	return (hash + (hash >> 16));
}

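/*
 * Register a reference to the chunk just built at the top of the range
 * table.  If an identical chunk already exists (matched by hash, size and
 * content) it is reused and the work area is rewound; otherwise the chunk
 * is either moved into a best-fit recycled hole or recorded in place as a
 * new descriptor.  Grows range_tbl as needed; returns nonzero on allocation
 * failure or when the structural BASE_MAX limit is exceeded.
 */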
static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp, *empty_cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le) {
		if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size))
			continue;
		da->rtbl_top = fdesc->base;
		fdesc->base = cdp->cd_base;
		cdp->cd_refcnt++;
		return (0);
	}

	/* No matching chunks found. Recycle an empty or allocate a new one */
	cdp = NULL;
	LIST_FOREACH(empty_cdp, &da->unused_chunks, cd_hash_le)
		if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
		    empty_cdp->cd_max_size < cdp->cd_max_size)) {
			cdp = empty_cdp;
			if (empty_cdp->cd_max_size == size)
				break;
		}

	if (cdp != NULL) {
		/* Copy from heap into the recycled chunk */
		bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
		    size * sizeof(struct range_entry_long));
		fdesc->base = cdp->cd_base;
		da->rtbl_top -= size;
		da->unused_chunks_cnt--;
		if (cdp->cd_max_size > size + 1) {
			/* Split the range in two, need a new descriptor */
			empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
			if (empty_cdp == NULL)
				return (1);
			empty_cdp->cd_max_size = cdp->cd_max_size - size;
			empty_cdp->cd_base = cdp->cd_base + size;
			LIST_INSERT_AFTER(cdp, empty_cdp, cd_all_le);
			LIST_INSERT_AFTER(cdp, empty_cdp, cd_hash_le);
			da->all_chunks_cnt++;
			da->unused_chunks_cnt++;
			cdp->cd_max_size = size;
		}
		LIST_REMOVE(cdp, cd_hash_le);
	} else {
		/* Alloc a new descriptor */
		cdp = uma_zalloc(chunk_zone, M_NOWAIT);
		if (cdp == NULL)
			return (1);
		cdp->cd_max_size = size;
		cdp->cd_base = fdesc->base;
		LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
		da->all_chunks_cnt++;
	}

	cdp->cd_hash = hash;
	cdp->cd_refcnt = 1;
	cdp->cd_cur_size = size;
	LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
	    cd_hash_le);
	if (da->rtbl_top >= da->rtbl_size) {
		if (da->rtbl_top >= BASE_MAX) {
			FIB_PRINTF(LOG_ERR, da->fd,
			    "structural limit exceeded at %d "
			    "range table elements", da->rtbl_top);
			return (1);
		}
		da->rtbl_size += RTBL_SIZE_INCR;
		if (da->rtbl_top >= BASE_MAX / 4)
			FIB_PRINTF(LOG_WARNING, da->fd, "range table at %d%%",
			    da->rtbl_top * 100 / BASE_MAX);
		da->range_tbl = realloc(da->range_tbl,
		    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
		    M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return (1);
	}

	return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
	struct direct_entry *fdesc = &da->direct_tbl[chunk];
	struct chunk_desc *cdp;
	uint32_t base = fdesc->base;
	uint32_t size = chunk_size(da, fdesc);
	uint32_t hash = chunk_hash(da, fdesc);

	/* Find an existing descriptor */
	LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
	    cd_hash_le)
		if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
		    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
		    sizeof(struct range_entry_long) * size) == 0)
			break;

	KASSERT(cdp != NULL, ("dxr: dangling chunk"));
	if (--cdp->cd_refcnt > 0)
		return;

	LIST_REMOVE(cdp, cd_hash_le);
	da->unused_chunks_cnt++;
	if (cdp->cd_base + cdp->cd_max_size != da->rtbl_top) {
		LIST_INSERT_HEAD(&da->unused_chunks, cdp, cd_hash_le);
		return;
	}

	do {
		da->all_chunks_cnt--;
		da->unused_chunks_cnt--;
		da->rtbl_top -= cdp->cd_max_size;
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
		LIST_FOREACH(cdp, &da->unused_chunks, cd_hash_le)
			if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
				LIST_REMOVE(cdp, cd_hash_le);
				break;
			}
	} while (cdp != NULL);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
	uint32_t i, *val;
	uint32_t hash = 0;

	for (i = 0; i < (1 << dxr_x); i++) {
		hash = (hash << 3) ^ (hash >> 3);
		val = (uint32_t *)
		    (void *) &da->direct_tbl[(index << dxr_x) + i];
		hash += (*val << 5);
		hash += (*val >> 5);
	}

	return (hash + (hash >> 16));
}

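/*
 * Register a reference to the block of 2^dxr_x direct_tbl descriptors
 * starting at (index << dxr_x).  Identical blocks are deduplicated into a
 * single x_tbl slice whose index is returned for storing in d_tbl; a new
 * block reuses an unused trie descriptor if one is available, otherwise a
 * fresh descriptor is allocated and x_tbl is grown as needed.  Returns -1
 * on allocation failure.
 */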
static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp;
	uint32_t dxr_d = da->d_bits;
	uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
	uint32_t hash = trie_hash(da, dxr_x, index);

	/* Find an existing descriptor */
	LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
		if (tp->td_hash == hash &&
		    memcmp(&da->direct_tbl[index << dxr_x],
		    &da->x_tbl[tp->td_index << dxr_x],
		    sizeof(*da->x_tbl) << dxr_x) == 0) {
			tp->td_refcnt++;
			da->trietbl[index] = tp;
			return(tp->td_index);
		}

	tp = LIST_FIRST(&da->unused_trie);
	if (tp != NULL) {
		LIST_REMOVE(tp, td_hash_le);
		da->unused_trie_cnt--;
	} else {
		tp = uma_zalloc(trie_zone, M_NOWAIT);
		if (tp == NULL)
			return (-1);
		LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
		tp->td_index = da->all_trie_cnt++;
	}

	tp->td_hash = hash;
	tp->td_refcnt = 1;
	LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
	   td_hash_le);
	memcpy(&da->x_tbl[tp->td_index << dxr_x],
	    &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
	da->trietbl[index] = tp;
	if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
		da->xtbl_size += XTBL_SIZE_INCR;
		da->x_tbl = realloc(da->x_tbl,
		    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return (-1);
	}
	return(tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
	struct trie_desc *tp = da->trietbl[index];

	if (tp == NULL)
		return;
	da->trietbl[index] = NULL;
	if (--tp->td_refcnt > 0)
		return;

	LIST_REMOVE(tp, td_hash_le);
	da->unused_trie_cnt++;
	if (tp->td_index != da->all_trie_cnt - 1) {
		LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
		return;
	}

	do {
		da->all_trie_cnt--;
		da->unused_trie_cnt--;
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
		LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
			if (tp->td_index == da->all_trie_cnt - 1) {
				LIST_REMOVE(tp, td_hash_le);
				break;
			}
	} while (tp != NULL);
}
#endif

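/*
 * Insert a prefix into the stack of prefixes covering the current address,
 * kept ordered by prefix length so that da->heap[da->heap_index] is always
 * the most specific (longest) match.  A prefix length already present in
 * the heap is ignored.
 */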
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
	struct heap_entry *fhp;
	int i;

	for (i = da->heap_index; i >= 0; i--) {
		if (preflen > da->heap[i].preflen)
			break;
		else if (preflen < da->heap[i].preflen)
			da->heap[i + 1] = da->heap[i];
		else
			return;
	}

	fhp = &da->heap[i + 1];
	fhp->preflen = preflen;
	fhp->start = start;
	fhp->end = end;
	fhp->nexthop = nh;
	da->heap_index++;
}

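/*
 * rib_walk_from() callback, invoked for each prefix overlapping the chunk
 * currently being rebuilt.  Pops heap entries which end before the new
 * prefix starts, emitting a range fragment wherever the resolved nexthop
 * changes, then pushes the new prefix onto the heap.  Returns -1 to stop
 * the walk once it moves past the chunk's upper boundary.
 */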
static int
dxr_walk(struct rtentry *rt, void *arg)
{
	struct dxr_aux *da = arg;
	uint32_t chunk = da->work_chunk;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;
	struct range_entry_long *fp =
	    &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	struct heap_entry *fhp = &da->heap[da->heap_index];
	uint32_t preflen, nh, start, end, scopeid;
	struct in_addr addr;

	rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
	start = ntohl(addr.s_addr);
	if (start > last)
		return (-1);	/* Beyond chunk boundaries, we are done */
	if (start < first)
		return (0);	/* Skip this route */

	end = start;
	if (preflen < 32)
		end |= (0xffffffffU >> preflen);
	nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

	if (start == fhp->start)
		heap_inject(da, start, end, preflen, nh);
	else {
		/* start > fhp->start */
		while (start > fhp->end) {
			uint32_t oend = fhp->end;

			if (da->heap_index > 0) {
				fhp--;
				da->heap_index--;
			} else
				initheap(da, fhp->end + 1, chunk);
			if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
				fp++;
				da->rtbl_work_frags++;
				fp->start = (oend + 1) & DXR_RANGE_MASK;
				fp->nexthop = fhp->nexthop;
			}
		}
		if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
		    nh != fp->nexthop) {
			fp++;
			da->rtbl_work_frags++;
			fp->start = start & DXR_RANGE_MASK;
		} else if (da->rtbl_work_frags) {
			if ((--fp)->nexthop == nh)
				da->rtbl_work_frags--;
			else
				fp++;
		}
		fp->nexthop = nh;
		heap_inject(da, start, end, preflen, nh);
	}

	return (0);
}

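/*
 * Rebuild the range fragments for a single direct table chunk: walk all
 * prefixes falling into the chunk's address span, flush whatever remains on
 * the heap, then either mark the chunk as a direct hit (single fragment),
 * pack it into the short 16-bit entry format when possible, or fall back to
 * the long or XL encodings, and finally hand the result to chunk_ref() for
 * deduplication.
 */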
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
	struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
	struct range_entry_short *fps;
	uint32_t start, nh, i;
#endif
	struct heap_entry *fhp;
	uint32_t first = chunk << DXR_RANGE_SHIFT;
	uint32_t last = first | DXR_RANGE_MASK;

	if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
		chunk_unref(da, chunk);

	initheap(da, first, chunk);

	fp = &da->range_tbl[da->rtbl_top].re;
	da->rtbl_work_frags = 0;
	fp->start = first & DXR_RANGE_MASK;
	fp->nexthop = da->heap[0].nexthop;

	da->dst.sin_addr.s_addr = htonl(first);
	da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

	da->work_chunk = chunk;
	rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
	    (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
	    dxr_walk, da);

	/* Flush any remaining objects on the heap */
	fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
	fhp = &da->heap[da->heap_index];
	while (fhp->preflen > DXR_TRIE_BITS) {
		uint32_t oend = fhp->end;

		if (da->heap_index > 0) {
			fhp--;
			da->heap_index--;
		} else
			initheap(da, fhp->end + 1, chunk);
		if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
			/* Have we crossed the upper chunk boundary? */
			if (oend >= last)
				break;
			fp++;
			da->rtbl_work_frags++;
			fp->start = (oend + 1) & DXR_RANGE_MASK;
			fp->nexthop = fhp->nexthop;
		}
	}

	/* Direct hit if the chunk contains only a single fragment */
	if (da->rtbl_work_frags == 0) {
		da->direct_tbl[chunk].base = fp->nexthop;
		da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
		return (0);
	}

	da->direct_tbl[chunk].base = da->rtbl_top;
	da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
	/* Check whether the chunk can be more compactly encoded */
	fp = &da->range_tbl[da->rtbl_top].re;
	for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
		if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
			break;
	if (i == da->rtbl_work_frags + 1) {
		fp = &da->range_tbl[da->rtbl_top].re;
		fps = (void *) fp;
		for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
			start = fp->start;
			nh = fp->nexthop;
			fps->start = start >> 8;
			fps->nexthop = nh;
		}
		fps->start = start >> 8;
		fps->nexthop = nh;
		da->rtbl_work_frags >>= 1;
		da->direct_tbl[chunk].fragments =
		    da->rtbl_work_frags | FRAGS_PREF_SHORT;
	} else
#endif
	if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
		da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
		memmove(&da->range_tbl[da->rtbl_top + 1],
		   &da->range_tbl[da->rtbl_top],
		   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
		da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
		da->rtbl_work_frags++;
	}
	da->rtbl_top += (da->rtbl_work_frags + 1);
	return (chunk_ref(da, chunk));
}

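/*
 * (Re)build the lookup structure from the auxiliary state: refresh every
 * chunk marked dirty in updates_mask (or everything on a full rebuild),
 * then, in the DXR2 case, iteratively vary d_bits to find the most compact
 * D/X split, and finally copy the d/x/range tables into a single contiguous
 * allocation published via dxr->d, dxr->x and dxr->r.  All allocations use
 * M_NOWAIT; dxr->d is left NULL on failure so callers can schedule a
 * rebuild.
 */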
static void
dxr_build(struct dxr *dxr)
{
	struct dxr_aux *da = dxr->aux;
	struct chunk_desc *cdp;
	struct rib_rtable_info rinfo;
	struct timeval t0, t1, t2, t3;
	uint32_t r_size, dxr_tot_size;
	uint32_t i, m, range_rebuild = 0;
#ifdef DXR2
	struct trie_desc *tp;
	uint32_t d_tbl_size, dxr_x, d_size, x_size;
	uint32_t trie_rebuild = 0, prev_size = 0;
	int32_t ti;	/* signed: trie_ref() returns -1 on failure */
#endif

	KASSERT(dxr->d == NULL, ("dxr: d not free"));

	if (da == NULL) {
		da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
		if (da == NULL)
			return;
		dxr->aux = da;
		da->fibnum = dxr->fibnum;
		da->refcnt = 1;
		LIST_INIT(&da->all_chunks);
		LIST_INIT(&da->all_trie);
		da->rtbl_size = RTBL_SIZE_INCR;
		da->range_tbl = NULL;
		da->xtbl_size = XTBL_SIZE_INCR;
		da->x_tbl = NULL;
		bzero(&da->dst, sizeof(da->dst));
		bzero(&da->mask, sizeof(da->mask));
		da->dst.sin_len = sizeof(da->dst);
		da->mask.sin_len = sizeof(da->mask);
		da->dst.sin_family = AF_INET;
		da->mask.sin_family = AF_INET;
	}
	if (da->range_tbl == NULL) {
		da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
		    + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
		if (da->range_tbl == NULL)
			return;
		range_rebuild = 1;
	}
#ifdef DXR2
	if (da->x_tbl == NULL) {
		da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
		    M_DXRAUX, M_NOWAIT);
		if (da->x_tbl == NULL)
			return;
		trie_rebuild = 1;
	}
#endif
	da->fd = dxr->fd;

	microuptime(&t0);

	dxr->nh_tbl = fib_get_nhop_array(da->fd);
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);

	if (da->updates_low > da->updates_high ||
	    da->unused_chunks_cnt > V_max_range_holes)
		range_rebuild = 1;
	if (range_rebuild) {
		/* Bulk cleanup */
		bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
		while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
			LIST_REMOVE(cdp, cd_all_le);
			uma_zfree(chunk_zone, cdp);
		}
		LIST_INIT(&da->unused_chunks);
		da->all_chunks_cnt = da->unused_chunks_cnt = 0;
		da->rtbl_top = 0;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
		memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
		for (i = 0; i < DIRECT_TBL_SIZE; i++) {
			da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
			da->direct_tbl[i].base = 0;
		}
	}
	da->prefixes = rinfo.num_prefixes;

	/* DXR: construct direct & range table */
	for (i = da->updates_low; i <= da->updates_high; i++) {
		m = da->updates_mask[i >> 5] >> (i & 0x1f);
		if (m == 0)
			i |= 0x1f;
		else if (m & 1 && update_chunk(da, i) != 0)
			return;
	}
	r_size = sizeof(*da->range_tbl) * da->rtbl_top;
	microuptime(&t1);

#ifdef DXR2
	if (range_rebuild || da->unused_trie_cnt > V_max_trie_holes ||
	    abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
		trie_rebuild = 1;
	if (trie_rebuild) {
		da->trie_rebuilt_prefixes = da->prefixes;
		da->d_bits = DXR_D;
		da->updates_low = 0;
		da->updates_high = DIRECT_TBL_SIZE - 1;
	}

dxr2_try_squeeze:
	if (trie_rebuild) {
		/* Bulk cleanup */
		bzero(da->trietbl, sizeof(da->trietbl));
		bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
		while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
			LIST_REMOVE(tp, td_all_le);
			uma_zfree(trie_zone, tp);
		}
		LIST_INIT(&da->unused_trie);
		da->all_trie_cnt = da->unused_trie_cnt = 0;
	}

	/* Populate d_tbl, x_tbl */
	dxr_x = DXR_TRIE_BITS - da->d_bits;
	d_tbl_size = (1 << da->d_bits);

	for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
	    i++) {
		trie_unref(da, i);
		ti = trie_ref(da, i);
		if (ti < 0)
			return;
		da->d_tbl[i] = ti;
	}

	d_size = sizeof(*da->d_tbl) * d_tbl_size;
	x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
	    * da->all_trie_cnt;
	dxr_tot_size = d_size + x_size + r_size;

	if (trie_rebuild == 1) {
		/* Try to find a more compact D/X split */
		if (prev_size == 0 || dxr_tot_size <= prev_size)
			da->d_bits--;
		else {
			da->d_bits++;
			trie_rebuild = 2;
		}
		prev_size = dxr_tot_size;
		goto dxr2_try_squeeze;
	}
	microuptime(&t2);
#else /* !DXR2 */
	dxr_tot_size = sizeof(da->direct_tbl) + r_size;
	t2 = t1;
#endif

	dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
	if (dxr->d == NULL)
		return;
#ifdef DXR2
	memcpy(dxr->d, da->d_tbl, d_size);
	dxr->x = ((char *) dxr->d) + d_size;
	memcpy(dxr->x, da->x_tbl, x_size);
	dxr->r = ((char *) dxr->x) + x_size;
	dxr->d_shift = 32 - da->d_bits;
	dxr->x_shift = dxr_x;
	dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
#else /* !DXR2 */
	memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
	dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
#endif
	memcpy(dxr->r, da->range_tbl, r_size);

	if (da->updates_low <= da->updates_high)
		bzero(&da->updates_mask[da->updates_low / 32],
		    (da->updates_high - da->updates_low) / 8 + 1);
	da->updates_low = DIRECT_TBL_SIZE - 1;
	da->updates_high = 0;
	microuptime(&t3);

#ifdef DXR2
	FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
	    da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
#else
	FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
	    DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
#endif
	i = dxr_tot_size * 100 / rinfo.num_prefixes;
	FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
	    dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
	    i / 100, i % 100);
	i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
	    range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#ifdef DXR2
	i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
	    trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
#endif
	i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
	FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
	    i / 1000, i % 1000);
	FIB_PRINTF(LOG_INFO, da->fd, "range table: %d%%, %d chunks, %d holes",
	    da->rtbl_top * 100 / BASE_MAX, da->all_chunks_cnt,
	    da->unused_chunks_cnt);
}

/*
 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
	struct dxr *dxr = algo_data;
	uint32_t nh;

	nh = dxr_lookup(dxr, ntohl(key.addr4.s_addr));

	return (dxr->nh_tbl[nh]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
	struct dxr *old_dxr = old_data;
	struct dxr_aux *da = NULL;
	struct dxr *dxr;

	dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
	if (dxr == NULL)
		return (FLM_REBUILD);

	/* Check whether we may reuse the old auxiliary structures */
	if (old_dxr != NULL && old_dxr->aux != NULL) {
		da = old_dxr->aux;
		atomic_add_int(&da->refcnt, 1);
	}

	dxr->aux = da;
	dxr->d = NULL;
	dxr->fd = fd;
	dxr->fibnum = fibnum;
	*data = dxr;

	return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;
	struct chunk_desc *cdp;
	struct trie_desc *tp;

	if (dxr->d != NULL)
		free(dxr->d, M_DXRLPM);

	da = dxr->aux;
	free(dxr, M_DXRAUX);

	if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
		return;

	/* Release auxiliary structures */
	while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
		LIST_REMOVE(cdp, cd_all_le);
		uma_zfree(chunk_zone, cdp);
	}
	while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
		LIST_REMOVE(tp, td_all_le);
		uma_zfree(trie_zone, tp);
	}
	free(da->range_tbl, M_DXRAUX);
	free(da->x_tbl, M_DXRAUX);
	free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
	struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

	dxr_destroy(dxr);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
	struct dxr *dxr = data;
	struct dxr_aux *da;

	dxr_build(dxr);

	da = dxr->aux;
	if (da == NULL)
		return (FLM_REBUILD);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX)
		return (FLM_ERROR);

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (dxr->d == NULL)
		return (FLM_REBUILD);

	dp->f = dxr_fib_lookup;
	dp->arg = dxr;

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

	return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

	return (FLM_BATCH);
}

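/*
 * Batched RIB change handler: marks every direct table chunk touched by the
 * queued updates as dirty in updates_mask, then builds a fresh dxr snapshot
 * from the shared auxiliary state and atomically switches the datapath to
 * it, retiring the old snapshot via an epoch callback.
 */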
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
	struct dxr *dxr = data;
	struct dxr *new_dxr;
	struct dxr_aux *da;
	struct fib_dp new_dp;
	enum flm_op_result res;
	uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
	struct rib_rtable_info rinfo;
	int update_delta = 0;
#endif

	KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
	KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
	KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
	    __FUNCTION__, q->count, q->size));

	da = dxr->aux;
	KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

	FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
	for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
		if (q->entries[ui].nh_new != NULL)
			update_delta++;
		if (q->entries[ui].nh_old != NULL)
			update_delta--;
#endif
		plen = q->entries[ui].plen;
		ip = ntohl(q->entries[ui].addr4.s_addr);
		hmask = 0xffffffffU >> plen;
		start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
		end = (ip | hmask) >> DXR_RANGE_SHIFT;

		if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
			for (i = start >> 5; i <= end >> 5; i++)
				da->updates_mask[i] = 0xffffffffU;
		else
			for (i = start; i <= end; i++)
				da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
		if (start < da->updates_low)
			da->updates_low = start;
		if (end > da->updates_high)
			da->updates_high = end;
	}

#ifdef INVARIANTS
	fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
	KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
	    ("%s: update count mismatch", __FUNCTION__));
#endif

	res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
	if (res != FLM_SUCCESS)
		return (res);

	dxr_build(new_dxr);

	/* Structural limit exceeded, hard error */
	if (da->rtbl_top >= BASE_MAX) {
		dxr_destroy(new_dxr);
		return (FLM_ERROR);
	}

	/* A malloc(,, M_NOWAIT) failed somewhere, retry later */
	if (new_dxr->d == NULL) {
		dxr_destroy(new_dxr);
		return (FLM_REBUILD);
	}

	new_dp.f = dxr_fib_lookup;
	new_dp.arg = new_dxr;
	if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
		fib_set_algo_ptr(dxr->fd, new_dxr);
		fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
		return (FLM_SUCCESS);
	}

	dxr_destroy(new_dxr);
	return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

	/* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
	return (251);
}

static struct fib_lookup_module fib_dxr_mod = {
	.flm_name = "dxr",
	.flm_family = AF_INET,
	.flm_init_cb = dxr_init,
	.flm_destroy_cb = dxr_destroy,
	.flm_dump_rib_item_cb = dxr_dump_rib_item,
	.flm_dump_end_cb = dxr_dump_end,
	.flm_change_rib_item_cb = dxr_change_rib_item,
	.flm_change_rib_items_cb = dxr_change_rib_batch,
	.flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
	int error;

	switch (type) {
	case MOD_LOAD:
		chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
		fib_module_register(&fib_dxr_mod);
		return(0);
	case MOD_UNLOAD:
		error = fib_module_unregister(&fib_dxr_mod);
		if (error)
			return (error);
		uma_zdestroy(chunk_zone);
		uma_zdestroy(trie_zone);
		return(0);
	default:
		return(EOPNOTSUPP);
	}
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);