/* radix.c revision 11042:2d6e217af1b4 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 1988, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)radix.c	8.5 (Berkeley) 5/19/95
 * $FreeBSD: /repoman/r/ncvs/src/sys/net/radix.c,v 1.36.2.1 2005/01/31 23:26:23
 * imp Exp $
 */


/*
 * Routines to build and maintain radix trees for routing lookups.
 */
#include <sys/types.h>

#ifndef _RADIX_H_
#include <sys/param.h>
#ifdef	_KERNEL
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#else
#include <assert.h>
#define	ASSERT assert
#include <stdio.h>
#include <stdlib.h>
#include <syslog.h>
#include <strings.h>
#endif	/* _KERNEL */
#include <net/radix.h>
#endif

#ifndef	_KERNEL
void
panic(const char *str)
{
	fprintf(stderr, "Panic - %s\n", str);
	abort();
}
#endif	/* _KERNEL */

static int	rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
static int	rn_walktree_mt(struct radix_node_head *, walktree_f_t *,
    void *, lockf_t, lockf_t);
static struct radix_node
	*rn_insert(void *, struct radix_node_head *, int *,
	    struct radix_node [2]),
	*rn_newpair(void *, int, struct radix_node[2]),
	*rn_search(void *, struct radix_node *),
	*rn_search_m(void *, struct radix_node *, void *),
	*rn_lookup(void *, void *, struct radix_node_head *),
	*rn_match(void *, struct radix_node_head *),
	*rn_match_args(void *, struct radix_node_head *, match_leaf_t *,
	    void *),
	*rn_addmask(void *, int, int),
	*rn_addroute(void *, void *, struct radix_node_head *,
	    struct radix_node [2]),
	*rn_delete(void *, void *, struct radix_node_head *);
static	boolean_t rn_refines(void *, void *);

/*
 * IPF also uses a PATRICIA tree to manage ippools. IPF stores its own
 * structure, addrfamily_t, where sizeof (addrfamily_t) == 24.
 */
#define	MAX_KEYLEN	24
static int	max_keylen = MAX_KEYLEN;

#ifdef	_KERNEL
static struct kmem_cache *radix_mask_cache; /* for rn_mkfreelist */
static struct kmem_cache *radix_node_cache;
#else
static char *radix_mask_cache, *radix_node_cache; /* dummy vars. never inited */
#endif	/* _KERNEL */

static struct radix_mask *rn_mkfreelist;
static struct radix_node_head *mask_rnhead;
/*
 * Work area -- the following point to 2 buffers of size max_keylen,
 * allocated in this order in a block of memory malloc'ed by rn_init.
 * A third buffer of size MAX_KEYLEN is allocated from the stack.
 */
static char *rn_zeros, *rn_ones;

#define	MKGet(m)  R_Malloc(m, radix_mask_cache, sizeof (struct radix_mask))
#define	MKFree(m) Free(m, radix_mask_cache)
#define	rn_masktop (mask_rnhead->rnh_treetop)

static boolean_t	rn_lexobetter(void *m_arg, void *n_arg);
static struct radix_mask *
		rn_new_radix_mask(struct radix_node *tt,
		    struct radix_mask *next);
static boolean_t
		rn_satisfies_leaf(char *trial, struct radix_node *leaf,
		    int skip, match_leaf_t *rn_leaf_fn, void *rn_leaf_arg);

#define	RN_MATCHF(rn, f, arg)	(f == NULL || (*f)((rn), arg))

/*
 * The data structure for the keys is a radix tree with one-way
 * branching removed.  The index rn_bit at an internal node n represents a bit
 * position to be tested.  The tree is arranged so that all descendants
 * of a node n have keys whose bits all agree up to position rn_bit - 1.
 * (We say the index of n is rn_bit.)
 *
 * There is at least one descendant which has a one bit at position rn_bit,
 * and at least one with a zero there.
 *
 * A route is determined by a pair of key and mask.  We require that the
 * bit-wise logical AND of the key and mask be the key.
 * We define the index of a route associated with the mask to be
 * the first bit number in the mask where 0 occurs (with bit number 0
 * representing the highest order bit).
 *
 * We say a mask is normal if every bit past the index of the mask is 0.
 * If a node n has a descendant (k, m) with index(m) == index(n) == rn_bit,
 * and m is a normal mask, then the route applies to every descendant of n.
 * If index(m) < rn_bit, this implies that the last few bits of k before
 * bit rn_bit are all 0 (and hence the same is true of every descendant
 * of n), so the route applies to all descendants of the node as well.
 *
 * Similar logic shows that a non-normal mask m such that
 * index(m) <= index(n) could potentially apply to many children of n.
 * Thus, for each non-host route, we attach its mask to a list at an internal
 * node as high in the tree as we can go.
 *
 * The present version of the code makes use of normal routes in short-
 * circuiting an explicit mask and compare operation when testing whether
 * a key satisfies a normal route, and also in remembering the unique leaf
 * that governs a subtree.
 */
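
/*
 * Illustrative sketch (not part of the original source): computing the
 * "index" of a mask as defined above, counted the same way rn_addmask()
 * counts it -- as a bit offset from the start of the key structure, with
 * bit 0 the highest-order bit.  The helper name mask_index() and the
 * RADIX_EXAMPLES guard are hypothetical.
 */
#ifdef RADIX_EXAMPLES
static int
mask_index(const uchar_t *m, int skip)
{
	int mlen = m[0];	/* byte 0 of a key holds its total length */
	int i, b;

	for (i = skip; i < mlen; i++) {
		if (m[i] == 0xff)	/* all ones, keep scanning */
			continue;
		for (b = 0; b < 8; b++)	/* first 0 bit within this byte */
			if ((m[i] & (0x80 >> b)) == 0)
				return ((i << 3) + b);
	}
	return (mlen << 3);	/* no 0 bit before the mask ends */
}
#endif	/* RADIX_EXAMPLES */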

/*
 * Most of the functions in this code assume that the key/mask arguments
 * are sockaddr-like structures, where the first byte is a uchar_t
 * indicating the size of the entire structure.
 *
 * To make the assumption more explicit, we use the LEN() macro to access
 * this field. It is safe to pass an expression with side effects
 * to LEN() as the argument is evaluated only once.
 */
#define	LEN(x) (*(const uchar_t *)(x))

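/*
 * Illustrative sketch (not part of the original source): a minimal
 * sockaddr-like key as consumed by LEN().  The struct layout and field
 * names are hypothetical; only the convention that byte 0 holds the
 * total structure length comes from the comment above.
 */
#ifdef RADIX_EXAMPLES
struct example_key {
	uchar_t	ek_len;		/* LEN() reads this byte */
	uchar_t	ek_family;
	uchar_t	ek_addr[6];
};
/* With ek_len = sizeof (struct example_key), LEN(&k) evaluates to 8. */
#endif	/* RADIX_EXAMPLES */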

/*
 * Search a node in the tree matching the key.
 */
static struct radix_node *
rn_search(v_arg, head)
	void *v_arg;
	struct radix_node *head;
{
	struct radix_node *x;
	caddr_t v;

	for (x = head, v = v_arg; x->rn_bit >= 0; ) {
		if (x->rn_bmask & v[x->rn_offset])
			x = x->rn_right;
		else
			x = x->rn_left;
	}
	return (x);
}

/*
 * Same as above, but with an additional mask.
 */
static struct radix_node *
rn_search_m(v_arg, head, m_arg)
	struct radix_node *head;
	void *v_arg, *m_arg;
{
	struct radix_node *x;
	caddr_t v = v_arg, m = m_arg;

	for (x = head; x->rn_bit >= 0; ) {
		if ((x->rn_bmask & m[x->rn_offset]) &&
		    (x->rn_bmask & v[x->rn_offset]))
			x = x->rn_right;
		else
			x = x->rn_left;
	}
	return (x);
}

/*
 * Returns true if there are no bits set in n_arg that are zero in
 * m_arg and the masks aren't equal.  In other words, it returns true
 * when m_arg is a finer-granularity netmask -- it represents a subset
 * of the destinations implied by n_arg.
 */
static boolean_t
rn_refines(m_arg, n_arg)
	void *m_arg, *n_arg;
{
	caddr_t m = m_arg, n = n_arg;
	caddr_t lim = n + LEN(n), lim2 = lim;
	int longer = LEN(n++) - (int)LEN(m++);
	boolean_t masks_are_equal = B_TRUE;

	if (longer > 0)
		lim -= longer;
	while (n < lim) {
		if (*n & ~(*m))
			return (B_FALSE);
		if (*n++ != *m++)
			masks_are_equal = B_FALSE;
	}
	while (n < lim2)
		if (*n++)
			return (B_FALSE);
	if (masks_are_equal && (longer < 0))
		for (lim2 = m - longer; m < lim2; )
			if (*m++)
				return (B_TRUE);
	return (!masks_are_equal);
}
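
/*
 * Illustrative sketch (not part of the original source): rn_refines()
 * on two contiguous masks laid out with a leading length byte and three
 * pad bytes, roughly like a sockaddr_in.  The array contents are
 * hypothetical.  Every bit set in the /16 mask is also set in the /24
 * mask and the two differ, so rn_refines(ex_mask24, ex_mask16) is
 * B_TRUE; swapping the arguments yields B_FALSE.
 */
#ifdef RADIX_EXAMPLES
static uchar_t ex_mask24[] = { 8, 0, 0, 0, 0xff, 0xff, 0xff, 0x00 };
static uchar_t ex_mask16[] = { 8, 0, 0, 0, 0xff, 0xff, 0x00, 0x00 };
#endif	/* RADIX_EXAMPLES */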

static struct radix_node *
rn_lookup(v_arg, m_arg, head)
	void *v_arg, *m_arg;
	struct radix_node_head *head;
{
	struct radix_node *x;
	caddr_t netmask = NULL;

	if (m_arg) {
		x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_offset);
		if (x == NULL)
			return (NULL);
		netmask = x->rn_key;
	}
	x = rn_match(v_arg, head);
	if (x && netmask) {
		while (x && x->rn_mask != netmask)
			x = x->rn_dupedkey;
	}
	return (x);
}

/*
 * Returns true if address 'trial' has no bits differing from the
 * leaf's key when compared under the leaf's mask.  In other words,
 * returns true when 'trial' matches leaf.
 * In addition, if an rn_leaf_fn is passed in, it is used to find
 * a match on conditions defined by the caller of rn_match.  This is
 * used by the kernel ftable to match on IRE_MATCH_* conditions.
 */
static boolean_t
rn_satisfies_leaf(trial, leaf, skip, rn_leaf_fn, rn_leaf_arg)
	caddr_t trial;
	struct radix_node *leaf;
	int skip;
	match_leaf_t *rn_leaf_fn;
	void *rn_leaf_arg;
{
	char *cp = trial, *cp2 = leaf->rn_key, *cp3 = leaf->rn_mask;
	char *cplim;
	int length = min(LEN(cp), LEN(cp2));

	if (cp3 == 0)
		cp3 = rn_ones;
	else
		length = min(length, LEN(cp3));
	cplim = cp + length;
	cp3 += skip;
	cp2 += skip;

	for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
		if ((*cp ^ *cp2) & *cp3)
			return (B_FALSE);

	return (RN_MATCHF(leaf, rn_leaf_fn, rn_leaf_arg));
}
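
/*
 * Illustrative sketch (not part of the original source): the core test
 * rn_satisfies_leaf() performs is a masked compare -- 'trial' matches
 * the leaf iff ((trial ^ key) & mask) == 0 over the bytes covered by
 * the mask.  The helper name masked_equal() is hypothetical.
 */
#ifdef RADIX_EXAMPLES
static boolean_t
masked_equal(const char *trial, const char *key, const char *mask,
    int skip, int len)
{
	int i;

	for (i = skip; i < len; i++)
		if ((trial[i] ^ key[i]) & mask[i])
			return (B_FALSE);	/* bits differ under mask */
	return (B_TRUE);
}
#endif	/* RADIX_EXAMPLES */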

static struct radix_node *
rn_match(v_arg, head)
	void *v_arg;
	struct radix_node_head *head;
{
	return (rn_match_args(v_arg, head, NULL, NULL));
}

static struct radix_node *
rn_match_args(v_arg, head, rn_leaf_fn, rn_leaf_arg)
	void *v_arg;
	struct radix_node_head *head;
	match_leaf_t *rn_leaf_fn;
	void *rn_leaf_arg;
{
	caddr_t v = v_arg;
	struct radix_node *t = head->rnh_treetop, *x;
	caddr_t cp = v, cp2;
	caddr_t cplim;
	struct radix_node *saved_t, *top = t;
	int off = t->rn_offset, vlen = LEN(cp), matched_off;
	int test, b, rn_bit;

	/*
	 * Open code rn_search(v, top) to avoid overhead of extra
	 * subroutine call.
	 */
	for (; t->rn_bit >= 0; ) {
		if (t->rn_bmask & cp[t->rn_offset])
			t = t->rn_right;
		else
			t = t->rn_left;
	}
	/*
	 * See if we match exactly as a host destination
	 * or at least learn how many bits match, for normal mask finesse.
	 *
	 * It doesn't hurt us to limit how many bytes to check
	 * to the length of the mask, since if it matches we had a genuine
	 * match and the leaf we have is the most specific one anyway;
	 * if it didn't match with a shorter length it would fail
	 * with a long one.  This wins big for class B&C netmasks which
	 * are probably the most common case...
	 */
	if (t->rn_mask)
		vlen = LEN(t->rn_mask);
	cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
	for (; cp < cplim; cp++, cp2++)
		if (*cp != *cp2)
			goto keydiff;
	/*
	 * This extra grot is in case we are explicitly asked
	 * to look up the default.  Ugh!
	 *
	 * Never return the root node itself, it seems to cause a
	 * lot of confusion.
	 */
	if (t->rn_flags & RNF_ROOT)
		t = t->rn_dupedkey;
	if (t == NULL || RN_MATCHF(t, rn_leaf_fn, rn_leaf_arg)) {
		return (t);
	} else {
		/*
		 * Although we found an exact match on the key, rn_leaf_fn
		 * is looking for some other criteria as well. Continue
		 * looking as if the exact match failed.
		 */
		if (t->rn_dupedkey == NULL &&
		    (t->rn_parent->rn_flags & RNF_ROOT)) {
			/* no more dupedkeys and hit the top. have to give up */
			return (NULL);
		}
		b = 0;
		goto keeplooking;
	}
keydiff:
	test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
	for (b = 7; (test >>= 1) > 0; )
		b--;
keeplooking:
	matched_off = cp - v;
	b += matched_off << 3;
	rn_bit = -1 - b;

	/*
	 * If there is a host route in a duped-key chain, it will be first.
	 */
	if ((saved_t = t)->rn_mask == 0)
		t = t->rn_dupedkey;
	for (; t != NULL; t = t->rn_dupedkey) {
		/*
		 * Even if we don't match exactly as a host,
		 * we may match if the leaf we wound up at is
		 * a route to a net.
		 */
		if (t->rn_flags & RNF_NORMAL) {
			if ((rn_bit <= t->rn_bit) &&
			    RN_MATCHF(t, rn_leaf_fn, rn_leaf_arg)) {
				return (t);
			}
		} else if (rn_satisfies_leaf(v, t, matched_off, rn_leaf_fn,
		    rn_leaf_arg)) {
			return (t);
		}
	}
	t = saved_t;
	/* start searching up the tree */
	do {
		struct radix_mask *m;

		t = t->rn_parent;
		m = t->rn_mklist;
		/*
		 * If non-contiguous masks ever become important
		 * we can restore the masking and open coding of
		 * the search and satisfaction test and put the
		 * calculation of "off" back before the "do".
		 */
		while (m) {
			if (m->rm_flags & RNF_NORMAL) {
				if ((rn_bit <= m->rm_bit) &&
				    RN_MATCHF(m->rm_leaf, rn_leaf_fn,
				    rn_leaf_arg)) {
					return (m->rm_leaf);
				}
			} else {
				off = min(t->rn_offset, matched_off);
				x = rn_search_m(v, t, m->rm_mask);
				while (x != NULL && x->rn_mask != m->rm_mask)
					x = x->rn_dupedkey;
				if (x && rn_satisfies_leaf(v, x, off,
				    rn_leaf_fn, rn_leaf_arg)) {
					return (x);
				}
			}
			m = m->rm_mklist;
		}
	} while (t != top);
	return (NULL);
}

/*
 * Whenever we add a new leaf to the tree, we also add a parent node,
 * so we allocate them as an array of two elements: the first one must be
 * the leaf (see RNTORT() in route.c), the second one is the parent.
 * This routine initializes the relevant fields of the nodes, so that
 * the leaf is the left child of the parent node, and both nodes have
 * (almost) all fields filled as appropriate.
 * The function returns a pointer to the parent node.
 */

static struct radix_node *
rn_newpair(v, b, nodes)
	void *v;
	int b;
	struct radix_node nodes[2];
{
	struct radix_node *tt = nodes, *t = tt + 1;

	t->rn_bit = b;
	t->rn_bmask = 0x80 >> (b & 7);
	t->rn_left = tt;
	t->rn_offset = b >> 3;

	/*
	 * t->rn_parent, t->rn_right, tt->rn_mask, tt->rn_dupedkey
	 * and tt->rn_bmask must have been zeroed by caller.
	 */
	tt->rn_bit = -1;
	tt->rn_key = v;
	tt->rn_parent = t;
	tt->rn_flags = t->rn_flags = RNF_ACTIVE;
	tt->rn_mklist = t->rn_mklist = 0;
	return (t);
}
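
/*
 * Illustrative sketch (not part of the original source): how a bit
 * index b decomposes into the rn_offset/rn_bmask pair set up above.
 * For b = 18, rn_offset = 18 >> 3 = 2 and rn_bmask = 0x80 >> (18 & 7)
 * = 0x20, so the node tests bit 2 of byte 2 of the key.  The helper
 * name bit_is_set() is hypothetical.
 */
#ifdef RADIX_EXAMPLES
static int
bit_is_set(const uchar_t *key, int b)
{
	return ((key[b >> 3] & (0x80 >> (b & 7))) != 0);
}
#endif	/* RADIX_EXAMPLES */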

static struct radix_node *
rn_insert(v_arg, head, dupentry, nodes)
	void *v_arg;
	struct radix_node_head *head;
	int *dupentry;
	struct radix_node nodes[2];
{
	caddr_t v = v_arg;
	struct radix_node *top = head->rnh_treetop;
	struct radix_node *p, *x;
	int head_off = top->rn_offset, vlen = (int)LEN(v);
	struct radix_node *t = rn_search(v_arg, top);
	caddr_t cp = v + head_off;
	int b;
	struct radix_node *tt;
	caddr_t cp2 = t->rn_key + head_off;
	int cmp_res;
	caddr_t cplim = v + vlen;

	/*
	 * Find first bit at which v and t->rn_key differ
	 */
	while (cp < cplim)
		if (*cp2++ != *cp++)
			goto on1;
	*dupentry = 1;
	return (t);
on1:
	*dupentry = 0;
	cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
	/*
	 * (cp - v) is the number of bytes where the match is relevant.
	 * Multiply by 8 to get number of bits. Then reduce this number
	 * by the trailing bits in the last byte where we have a match
	 * by looking at (cmp_res >> 1) in each iteration below.
	 * Note that v starts at the beginning of the key, so, when key
	 * is a sockaddr structure, the preliminary len/family/port bytes
	 * are accounted for.
	 */
	for (b = (cp - v) << 3; cmp_res; b--)
		cmp_res >>= 1;
	cp = v;
	x = top;
	do {
		p = x;
		if (cp[x->rn_offset] & x->rn_bmask)
			x = x->rn_right;
		else
			x = x->rn_left;
	} while (b > (unsigned)x->rn_bit);
			/* x->rn_bit < b && x->rn_bit >= 0 */
	/*
	 * now the first bit where v and rn_key differ (b) is <
	 * x->rn_bit.
	 *
	 * We will add a new branch at p.  b cannot equal x->rn_bit
	 * because we know we didn't find a duplicated key.
	 * The tree will be re-adjusted so that t is inserted between p
	 * and x.
	 */
	t = rn_newpair(v_arg, b, nodes);
	tt = t->rn_left;
	if ((cp[p->rn_offset] & p->rn_bmask) == 0)
		p->rn_left = t;
	else
		p->rn_right = t;
	x->rn_parent = t;
	t->rn_parent = p;
	if ((cp[t->rn_offset] & t->rn_bmask) == 0) {
		t->rn_right = x;
	} else {
		t->rn_right = tt;
		t->rn_left = x;
	}
	return (tt);
}

static struct radix_node *
rn_addmask(n_arg, search, skip)
	int search, skip;
	void *n_arg;
{
	caddr_t netmask = (caddr_t)n_arg;
	struct radix_node *x;
	caddr_t cp, cplim;
	int b = 0, mlen, j;
	int maskduplicated, m0, isnormal;
	struct radix_node *saved_x;
	int last_zeroed = 0;
	char addmask_key[MAX_KEYLEN];

	if ((mlen = LEN(netmask)) > max_keylen)
		mlen = max_keylen;
	if (skip == 0)
		skip = 1;
	if (mlen <= skip)
		return (mask_rnhead->rnh_nodes);
	if (skip > 1)
		bcopy(rn_ones + 1, addmask_key + 1, skip - 1);
	if ((m0 = mlen) > skip)
		bcopy(netmask + skip, addmask_key + skip, mlen - skip);
	/*
	 * Trim trailing zeroes.
	 */
	for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0; )
		cp--;
	mlen = cp - addmask_key;
	if (mlen <= skip) {
		if (m0 >= last_zeroed)
			last_zeroed = mlen;
		return (mask_rnhead->rnh_nodes);
	}
	if (m0 < last_zeroed)
		bzero(addmask_key + m0, last_zeroed - m0);
	*addmask_key = last_zeroed = mlen;
	x = rn_search(addmask_key, rn_masktop);
	if (bcmp(addmask_key, x->rn_key, mlen) != 0)
		x = 0;
	if (x || search)
		return (x);
	R_Zalloc(x, radix_node_cache, max_keylen + 2 * sizeof (*x));

	if ((saved_x = x) == 0)
		return (0);
	netmask = cp = (caddr_t)(x + 2);
	bcopy(addmask_key, cp, mlen);
	x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
	if (maskduplicated) {
#ifdef	_KERNEL
		cmn_err(CE_WARN, "rn_addmask: mask impossibly already in tree");
#else
		syslog(LOG_ERR, "rn_addmask: mask impossibly already in tree");
#endif	/* _KERNEL */
		Free(saved_x, radix_node_cache);
		return (x);
	}
	/*
	 * Calculate index of mask, and check for normalcy.
	 * First find the first byte with a 0 bit, then if there are
	 * more bits left (remember we already trimmed the trailing 0's),
	 * the pattern must be one of those in normal_chars[], or we have
	 * a non-contiguous mask.
	 */
	cplim = netmask + mlen;
	isnormal = 1;
	for (cp = netmask + skip; (cp < cplim) && *(uchar_t *)cp == 0xff; )
		cp++;
	if (cp != cplim) {
		static uint8_t normal_chars[] = {
			0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};

		for (j = 0x80; (j & *cp) != 0; j >>= 1)
			b++;
		if (*cp != normal_chars[b] || cp != (cplim - 1))
			isnormal = 0;
	}
	b += (cp - netmask) << 3;
	x->rn_bit = -1 - b;
	if (isnormal)
		x->rn_flags |= RNF_NORMAL;
	return (x);
}
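
/*
 * Illustrative sketch (not part of the original source): the normalcy
 * test above, in isolation.  After trailing zeroes have been trimmed, a
 * normal (contiguous) mask is a run of 0xff bytes optionally followed
 * by exactly one byte drawn from normal_chars[]; e.g. { 0xff, 0xff,
 * 0xe0 } is normal while { 0xff, 0x0f } is not.  The helper name
 * mask_is_normal() is hypothetical.
 */
#ifdef RADIX_EXAMPLES
static int
mask_is_normal(const uchar_t *m, int skip, int mlen)
{
	static const uint8_t normal_chars[] = {
		0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff };
	int i = skip, j;

	while (i < mlen && m[i] == 0xff)	/* skip the all-ones run */
		i++;
	if (i == mlen)
		return (1);			/* mask is all ones */
	for (j = 0; j < 9; j++)
		if (m[i] == normal_chars[j])
			return (i == mlen - 1);	/* must be the last byte */
	return (0);				/* non-contiguous mask */
}
#endif	/* RADIX_EXAMPLES */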

/* arbitrary ordering for non-contiguous masks */
static boolean_t
rn_lexobetter(m_arg, n_arg)
	void *m_arg, *n_arg;
{
	uchar_t *mp = m_arg, *np = n_arg, *lim;

	if (LEN(mp) > LEN(np))
		/* not really, but need to check longer one first */
		return (B_TRUE);
	if (LEN(mp) == LEN(np))
		for (lim = mp + LEN(mp); mp < lim; )
			if (*mp++ > *np++)
				return (B_TRUE);
	return (B_FALSE);
}

static struct radix_mask *
rn_new_radix_mask(tt, next)
	struct radix_node *tt;
	struct radix_mask *next;
{
	struct radix_mask *m;

	MKGet(m);
	if (m == 0) {
#ifndef	_KERNEL
		syslog(LOG_ERR, "Mask for route not entered\n");
#endif	/* _KERNEL */
		return (0);
	}
	bzero(m, sizeof (*m));
	m->rm_bit = tt->rn_bit;
	m->rm_flags = tt->rn_flags;
	if (tt->rn_flags & RNF_NORMAL)
		m->rm_leaf = tt;
	else
		m->rm_mask = tt->rn_mask;
	m->rm_mklist = next;
	tt->rn_mklist = m;
	return (m);
}

static struct radix_node *
rn_addroute(v_arg, n_arg, head, treenodes)
	void *v_arg, *n_arg;
	struct radix_node_head *head;
	struct radix_node treenodes[2];
{
	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
	struct radix_node *t, *x = 0, *tt;
	struct radix_node *saved_tt, *top = head->rnh_treetop;
	short b = 0, b_leaf = 0;
	int keyduplicated;
	caddr_t mmask;
	struct radix_mask *m, **mp;

	/*
	 * In dealing with non-contiguous masks, there may be
	 * many different routes which have the same mask.
	 * We will find it useful to have a unique pointer to
	 * the mask to speed avoiding duplicate references at
	 * nodes and possibly save time in calculating indices.
	 */
	if (netmask) {
		if ((x = rn_addmask(netmask, 0, top->rn_offset)) == 0)
			return (0);
		b_leaf = x->rn_bit;
		b = -1 - x->rn_bit;
		netmask = x->rn_key;
	}
	/*
	 * Deal with duplicated keys: attach node to previous instance
	 */
	saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
	if (keyduplicated) {
		for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
			if (tt->rn_mask == netmask)
				return (0);
			if (netmask == 0 ||
			    (tt->rn_mask &&
			    /* index (netmask) > node */
			    ((b_leaf < tt->rn_bit) ||
			    rn_refines(netmask, tt->rn_mask) ||
			    rn_lexobetter(netmask, tt->rn_mask))))
				break;
		}
		/*
		 * If the mask is not duplicated, we wouldn't
		 * find it among possible duplicate key entries
		 * anyway, so the above test doesn't hurt.
		 *
		 * Insert treenodes before tt.
		 *
		 * We sort the masks for a duplicated key the same way as
		 * in a masklist -- most specific to least specific.
		 * This may require the unfortunate nuisance of relocating
		 * the head of the list.
		 *
		 * We also reverse, or doubly link the list through the
		 * parent pointer.
		 */
		if (tt == saved_tt) {
			struct	radix_node *xx = x;
			/* link in at head of list */
			(tt = treenodes)->rn_dupedkey = t;
			tt->rn_flags = t->rn_flags;
			tt->rn_parent = x = t->rn_parent;
			t->rn_parent = tt; /* parent */
			if (x->rn_left == t)
				x->rn_left = tt;
			else
				x->rn_right = tt;
			saved_tt = tt; x = xx;
		} else {
			(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
			t->rn_dupedkey = tt;
			/* Set rn_parent value for tt and tt->rn_dupedkey */
			tt->rn_parent = t;
			if (tt->rn_dupedkey)
				tt->rn_dupedkey->rn_parent = tt;
		}
		tt->rn_key = v;
		tt->rn_bit = -1;
		tt->rn_flags = RNF_ACTIVE;
	}
	/*
	 * Put mask in tree.
	 */
	if (netmask) {
		tt->rn_mask = netmask;
		tt->rn_bit = x->rn_bit;
		tt->rn_flags |= x->rn_flags & RNF_NORMAL;
	}
	/* BEGIN CSTYLED */
	/*
	 * at this point the parent-child relationship for p, t, x, tt is
	 * one of the following:
	 *		p			p
	 *		:  (left/right child)	:
	 *		:			:
	 *		t			t
	 *	       / \		       / \
	 *	      x   tt		      tt  x
	 *
	 *	tt == saved_tt returned by rn_insert().
	 */
	/* END CSTYLED */
	t = saved_tt->rn_parent;
	if (keyduplicated)
		goto key_exists;
	b_leaf = -1 - t->rn_bit;
	/*
	 * b_leaf is now normalized to be in the leaf rn_bit format
	 * (it is the rn_bit value of a leaf corresponding to netmask
	 * of t->rn_bit).
	 */
	if (t->rn_right == saved_tt)
		x = t->rn_left;
	else
		x = t->rn_right;
	/*
	 * Promote general routes from below.
	 * Identify the less specific netmasks and add them to t->rm_mklist
	 */
	if (x->rn_bit < 0) {
		/* x is the sibling node. it is a leaf node. */
		for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
			if (x->rn_mask && (x->rn_bit >= b_leaf) &&
			    x->rn_mklist == 0) {
				/*
				 * x is the first node in the dupedkey chain
				 * without a mklist, and with a shorter mask
				 * than b_leaf. Create a radix_mask
				 * corresponding to x's mask and add it to
				 * t's rn_mklist. The mask list gets created
				 * in decreasing order of mask length.
				 */
				*mp = m = rn_new_radix_mask(x, 0);
				if (m)
					mp = &m->rm_mklist;
			}
	} else if (x->rn_mklist) {
		/*
		 * Skip over masks whose index is > that of new node
		 */
		for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
			if (m->rm_bit >= b_leaf)
				break;
		t->rn_mklist = m; *mp = 0;
	}
key_exists:
	/* Add new route to highest possible ancestor's list */
	if ((netmask == 0) || (b > t->rn_bit))
		return (tt); /* can't lift at all */
	b_leaf = tt->rn_bit;
	/* b is the index of the netmask */
	do {
		x = t;
		t = t->rn_parent;
	} while (b <= t->rn_bit && x != top);
	/*
	 * Search through routes associated with node to
	 * insert new route according to index.
	 * Need same criteria as when sorting dupedkeys to avoid
	 * double loop on deletion.
	 */
	for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
		if (m->rm_bit < b_leaf)
			continue;
		if (m->rm_bit > b_leaf)
			break;
		if (m->rm_flags & RNF_NORMAL) {
			mmask = m->rm_leaf->rn_mask;
			if (tt->rn_flags & RNF_NORMAL) {
#ifdef	_KERNEL
				cmn_err(CE_WARN, "Non-unique normal route, "
				    "mask not entered\n");
#else
				syslog(LOG_ERR, "Non-unique normal route, "
				    "mask not entered\n");
#endif	/* _KERNEL */
				return (tt);
			}
		} else
			mmask = m->rm_mask;
		if (mmask == netmask) {
			m->rm_refs++;
			tt->rn_mklist = m;
			return (tt);
		}
		if (rn_refines(netmask, mmask) ||
		    rn_lexobetter(netmask, mmask))
			break;
	}
	*mp = rn_new_radix_mask(tt, *mp);
	return (tt);
}

static struct radix_node *
rn_delete(v_arg, netmask_arg, head)
	void *v_arg, *netmask_arg;
	struct radix_node_head *head;
{
	struct radix_node *t, *p, *x, *tt;
	struct radix_mask *m, *saved_m, **mp;
	struct radix_node *dupedkey, *saved_tt, *top;
	caddr_t v, netmask;
	int b, head_off, vlen;

	v = v_arg;
	netmask = netmask_arg;
	x = head->rnh_treetop;
	tt = rn_search(v, x);
	head_off = x->rn_offset;
	vlen = LEN(v);
	saved_tt = tt;
	top = x;
	if (tt == 0 ||
	    bcmp(v + head_off, tt->rn_key + head_off, vlen - head_off))
		return (0);
	/*
	 * Delete our route from mask lists.
	 */
	if (netmask) {
		if ((x = rn_addmask(netmask, 1, head_off)) == 0)
			return (0);
		netmask = x->rn_key;
		while (tt->rn_mask != netmask)
			if ((tt = tt->rn_dupedkey) == 0)
				return (0);
	}
	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
		goto on1;
	if (tt->rn_flags & RNF_NORMAL) {
		if (m->rm_leaf != tt || m->rm_refs > 0) {
#ifdef	_KERNEL
			cmn_err(CE_WARN,
			    "rn_delete: inconsistent annotation\n");
#else
			syslog(LOG_ERR, "rn_delete: inconsistent annotation\n");
#endif	/* _KERNEL */
			return (0);  /* dangling ref could cause disaster */
		}
	} else {
		if (m->rm_mask != tt->rn_mask) {
#ifdef	_KERNEL
			cmn_err(CE_WARN,
			    "rn_delete: inconsistent annotation 2\n");
#else
			syslog(LOG_ERR,
			    "rn_delete: inconsistent annotation 2\n");
#endif	/* _KERNEL */
			goto on1;
		}
		if (--m->rm_refs >= 0)
			goto on1;
	}
	b = -1 - tt->rn_bit;
	t = saved_tt->rn_parent;
	if (b > t->rn_bit)
		goto on1; /* Wasn't lifted at all */
	do {
		x = t;
		t = t->rn_parent;
	} while (b <= t->rn_bit && x != top);
	for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
		if (m == saved_m) {
			*mp = m->rm_mklist;
			MKFree(m);
			break;
		}
	if (m == 0) {
#ifdef	_KERNEL
		cmn_err(CE_WARN, "rn_delete: couldn't find our annotation\n");
#else
		syslog(LOG_ERR, "rn_delete: couldn't find our annotation\n");
#endif	/* _KERNEL */
		if (tt->rn_flags & RNF_NORMAL)
			return (0); /* Dangling ref to us */
	}
on1:
	/*
	 * Eliminate us from tree
	 */
	if (tt->rn_flags & RNF_ROOT)
		return (0);
	t = tt->rn_parent;
	dupedkey = saved_tt->rn_dupedkey;
	if (dupedkey) {
		/*
		 * Here, tt is the deletion target and
		 * saved_tt is the head of the dupekey chain.
		 */
		if (tt == saved_tt) {
			/* remove from head of chain */
			x = dupedkey; x->rn_parent = t;
			if (t->rn_left == tt)
				t->rn_left = x;
			else
				t->rn_right = x;
		} else {
			/* find node in front of tt on the chain */
			for (x = p = saved_tt; p && p->rn_dupedkey != tt; )
				p = p->rn_dupedkey;
			if (p) {
				p->rn_dupedkey = tt->rn_dupedkey;
				if (tt->rn_dupedkey)		/* parent */
					tt->rn_dupedkey->rn_parent = p;
								/* parent */
			} else
#ifdef	_KERNEL
				cmn_err(CE_WARN,
				    "rn_delete: couldn't find us\n");
#else
				syslog(LOG_ERR,
				    "rn_delete: couldn't find us\n");
#endif	/* _KERNEL */
		}
		t = tt + 1;
		if (t->rn_flags & RNF_ACTIVE) {
			*++x = *t;
			p = t->rn_parent;
			if (p->rn_left == t)
				p->rn_left = x;
			else
				p->rn_right = x;
			x->rn_left->rn_parent = x;
			x->rn_right->rn_parent = x;
		}
		goto out;
	}
	if (t->rn_left == tt)
		x = t->rn_right;
	else
		x = t->rn_left;
	p = t->rn_parent;
	if (p->rn_right == t)
		p->rn_right = x;
	else
		p->rn_left = x;
	x->rn_parent = p;
	/*
	 * Demote routes attached to us.
	 */
	if (t->rn_mklist) {
		if (x->rn_bit >= 0) {
			for (mp = &x->rn_mklist; (m = *mp) != NULL; )
				mp = &m->rm_mklist;
			*mp = t->rn_mklist;
		} else {
			/*
			 * If there are any (key, mask) pairs in a sibling
			 * duped-key chain, some subset will appear sorted
			 * in the same order attached to our mklist.
			 */
			for (m = t->rn_mklist; m && x; x = x->rn_dupedkey)
				if (m == x->rn_mklist) {
					struct radix_mask *mm = m->rm_mklist;
					x->rn_mklist = 0;
					if (--(m->rm_refs) < 0)
						MKFree(m);
					m = mm;
				}
			if (m)
#ifdef	_KERNEL
				cmn_err(CE_WARN,
				    "rn_delete: Orphaned Mask %p at %p\n",
				    (void *)m, (void *)x);
#else
				syslog(LOG_ERR,
				    "rn_delete: Orphaned Mask %p at %p\n",
				    (void *)m, (void *)x);
#endif	/* _KERNEL */
		}
	}
	/*
	 * We may be holding an active internal node in the tree.
	 */
	x = tt + 1;
	if (t != x) {
		*t = *x;
		t->rn_left->rn_parent = t;
		t->rn_right->rn_parent = t;
		p = x->rn_parent;
		if (p->rn_left == x)
			p->rn_left = t;
		else
			p->rn_right = t;
	}
out:
	tt->rn_flags &= ~RNF_ACTIVE;
	tt[1].rn_flags &= ~RNF_ACTIVE;
	return (tt);
}

/*
 * Walk the radix tree; for the kernel routing table, we hold additional
 * refs on the ire_bucket to ensure that the walk function f() does not
 * run into trashed memory. The kernel routing table is identified by
 * a rnh_treetop that has RNF_SUNW_FT set in the rn_flags.
 * Note that all refs taken in rn_walktree are released before it returns,
 * so f() will need to take its own references on any memory to be
 * passed back to the caller of rn_walktree.
 */
static int
rn_walktree(h, f, w)
	struct radix_node_head *h;
	walktree_f_t *f;
	void *w;
{
	return (rn_walktree_mt(h, f, w, NULL, NULL));
}

static int
rn_walktree_mt(h, f, w, lockf, unlockf)
	struct radix_node_head *h;
	walktree_f_t *f;
	void *w;
	lockf_t lockf, unlockf;
{
	int error;
	struct radix_node *base, *next;
	struct radix_node *rn = h->rnh_treetop;
	boolean_t is_mt = B_FALSE;

	if (lockf != NULL) {
		ASSERT(unlockf != NULL);
		is_mt = B_TRUE;
	}
	/*
	 * This gets complicated because we may delete the node
	 * while applying the function f to it, so we need to calculate
	 * the successor node in advance.
	 */
	RADIX_NODE_HEAD_RLOCK(h);
	/* First time through node, go left */
	while (rn->rn_bit >= 0) {
		rn = rn->rn_left;
	}

	if (is_mt)
		(*lockf)(rn);

	for (;;) {
		base = rn;
		/* If at right child go back up, otherwise, go right */
		while (rn->rn_parent->rn_right == rn &&
		    (rn->rn_flags & RNF_ROOT) == 0) {
			rn = rn->rn_parent;
		}
		/* Find the next *leaf* since next node might vanish, too */
		for (rn = rn->rn_parent->rn_right; rn->rn_bit >= 0; ) {
			rn = rn->rn_left;
		}
		next = rn;

		if (is_mt && next != NULL)
			(*lockf)(next);

		/* Process leaves */
		while ((rn = base) != NULL) {
			base = rn->rn_dupedkey;

			if (is_mt && base != NULL)
				(*lockf)(base);

			RADIX_NODE_HEAD_UNLOCK(h);
			if (!(rn->rn_flags & RNF_ROOT) &&
			    (error = (*f)(rn, w))) {
				if (is_mt) {
					(*unlockf)(rn);
					if (base != NULL)
						(*unlockf)(base);
					if (next != NULL)
						(*unlockf)(next);
				}
				return (error);
			}
			if (is_mt)
				(*unlockf)(rn);
			RADIX_NODE_HEAD_RLOCK(h);
		}
		rn = next;
		if (rn->rn_flags & RNF_ROOT) {
			RADIX_NODE_HEAD_UNLOCK(h);
			/*
			 * no ref to release, since we never take a ref
			 * on the root node -- it can't be deleted.
			 */
			return (0);
		}
	}
	/* NOTREACHED */
}
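
/*
 * Illustrative sketch (not part of the original source): a minimal
 * walker callback of the walktree_f_t shape, as passed to
 * rnh->rnh_walktree().  The callback name and the counter argument are
 * hypothetical.  Returning nonzero from the callback aborts the walk
 * and is passed back as rn_walktree's return value.
 */
#ifdef RADIX_EXAMPLES
static int
count_leaves(struct radix_node *rn, void *arg)
{
	(*(int *)arg)++;	/* called only for non-ROOT leaves */
	return (0);		/* keep walking */
}
/* Usage: int n = 0; (void) rnh->rnh_walktree(rnh, count_leaves, &n); */
#endif	/* RADIX_EXAMPLES */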

/*
 * Allocate and initialize an empty tree. This has 3 nodes, which are
 * part of the radix_node_head (in the order <left, root, right>) and are
 * marked RNF_ROOT so they cannot be freed.
 * The leaves have all-zero and all-one keys, with significant
 * bits starting at 'off'.
 * Return 1 on success, 0 on error.
 */
int
rn_inithead(head, off)
	void **head;
	int off;
{
	struct radix_node_head *rnh;
	struct radix_node *t, *tt, *ttt;

	if (*head)
		return (1);
	R_ZallocSleep(rnh, struct radix_node_head *, sizeof (*rnh));
	if (rnh == 0)
		return (0);
#ifdef _KERNEL
	RADIX_NODE_HEAD_LOCK_INIT(rnh);
#endif
	*head = rnh;
	t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
	ttt = rnh->rnh_nodes + 2;
	t->rn_right = ttt;
	t->rn_parent = t;
	tt = t->rn_left;	/* ... which in turn is rnh->rnh_nodes */
	tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
	tt->rn_bit = -1 - off;
	*ttt = *tt;
	ttt->rn_key = rn_ones;
	rnh->rnh_addaddr = rn_addroute;
	rnh->rnh_deladdr = rn_delete;
	rnh->rnh_matchaddr = rn_match;
	rnh->rnh_matchaddr_args = rn_match_args;
	rnh->rnh_lookup = rn_lookup;
	rnh->rnh_walktree = rn_walktree;
	rnh->rnh_walktree_mt = rn_walktree_mt;
	rnh->rnh_walktree_from = NULL;	/* not implemented */
	rnh->rnh_treetop = t;
	return (1);
}
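
/*
 * Illustrative sketch (not part of the original source): creating a
 * head and adding routes through the installed function pointers.
 * The variable names are hypothetical; the offset of 32 assumes an
 * AF_INET-style key whose address begins 4 bytes (32 bits) into the
 * structure, as with struct sockaddr_in.
 */
#ifdef RADIX_EXAMPLES
static struct radix_node_head *ex_rnh;

static void
example_setup(void)
{
	if (rn_inithead((void **)(void *)&ex_rnh, 32) == 0)
		panic("example_setup: rn_inithead failed");
	/* ... ex_rnh->rnh_addaddr(key, mask, ex_rnh, nodes) ... */
}
#endif	/* RADIX_EXAMPLES */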

void
rn_init()
{
	char *cp, *cplim;

#ifdef	_KERNEL
	radix_mask_cache = kmem_cache_create("radix_mask",
	    sizeof (struct radix_mask), 0, NULL, NULL, NULL, NULL, NULL, 0);
	radix_node_cache = kmem_cache_create("radix_node",
	    max_keylen + 2 * sizeof (struct radix_node),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
#endif /* _KERNEL */
	R_ZallocSleep(rn_zeros, char *, 2 * max_keylen);

	ASSERT(rn_zeros != NULL);
	bzero(rn_zeros, 2 * max_keylen);
	rn_ones = cp = rn_zeros + max_keylen;
	cplim = rn_ones + max_keylen;
	while (cp < cplim)
		*cp++ = -1;
	if (rn_inithead((void **)(void *)&mask_rnhead, 0) == 0)
		panic("rn_init: could not initialize mask_rnhead");
}

int
rn_freenode(n, p)
	struct radix_node *n;
	void *p;
{
	struct	radix_node_head *rnh = p;
	struct	radix_node *d;

	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
	if (d != NULL) {
		Free(d, radix_node_cache);
	}
	return (0);
}

void
rn_freehead(rnh)
	struct radix_node_head *rnh;
{
	(void) rn_walktree(rnh, rn_freenode, rnh);

	rnh->rnh_addaddr = NULL;
	rnh->rnh_deladdr = NULL;
	rnh->rnh_matchaddr = NULL;
	rnh->rnh_lookup = NULL;
	rnh->rnh_walktree = NULL;

#ifdef	_KERNEL
	RADIX_NODE_HEAD_DESTROY(rnh);
	FreeHead(rnh, sizeof (*rnh));
#else
	Free(rnh, NULL);
#endif	/* _KERNEL */
}

void
rn_fini()
{
	struct radix_mask *m;

	if (rn_zeros != NULL) {
#ifdef _KERNEL
		FreeHead(rn_zeros, 2 * max_keylen);
#else
		Free(rn_zeros, NULL);
#endif
		rn_zeros = NULL;
	}

	if (mask_rnhead != NULL) {
		rn_freehead(mask_rnhead);
		mask_rnhead = NULL;
	}

	while ((m = rn_mkfreelist) != NULL) {
		rn_mkfreelist = m->rm_mklist;
		Free(m, NULL);
	}
}