radix_mpath.c revision 191080
1177867Sjfv/*	$KAME: radix_mpath.c,v 1.17 2004/11/08 10:29:39 itojun Exp $	*/
2169240Sjfv
3269196Sjfv/*
4169240Sjfv * Copyright (C) 2001 WIDE Project.
5169240Sjfv * All rights reserved.
6169240Sjfv *
7169240Sjfv * Redistribution and use in source and binary forms, with or without
8169240Sjfv * modification, are permitted provided that the following conditions
9169240Sjfv * are met:
10169240Sjfv * 1. Redistributions of source code must retain the above copyright
11169240Sjfv *    notice, this list of conditions and the following disclaimer.
12169240Sjfv * 2. Redistributions in binary form must reproduce the above copyright
13169240Sjfv *    notice, this list of conditions and the following disclaimer in the
14169240Sjfv *    documentation and/or other materials provided with the distribution.
15169240Sjfv * 3. Neither the name of the project nor the names of its contributors
16169240Sjfv *    may be used to endorse or promote products derived from this software
17169240Sjfv *    without specific prior written permission.
18169240Sjfv *
19169240Sjfv * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
20169240Sjfv * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21169240Sjfv * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22169240Sjfv * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
23169240Sjfv * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24169240Sjfv * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25169240Sjfv * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26169240Sjfv * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27169240Sjfv * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28169240Sjfv * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29169240Sjfv * SUCH DAMAGE.
30169240Sjfv * THE AUTHORS DO NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE
31169240Sjfv * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHORS
32177867Sjfv * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL
33177867Sjfv * PROPERTIES.
34169240Sjfv */
35169240Sjfv
36169240Sjfv#include <sys/cdefs.h>
37169240Sjfv__FBSDID("$FreeBSD: head/sys/net/radix_mpath.c 191080 2009-04-14 23:05:36Z kmacy $");
38169240Sjfv
39169240Sjfv#include "opt_inet.h"
40169240Sjfv#include "opt_inet6.h"
41169240Sjfv
42169240Sjfv#include <sys/param.h>
43169240Sjfv#include <sys/systm.h>
44228386Sjfv#include <sys/malloc.h>
45228386Sjfv#include <sys/socket.h>
46228386Sjfv#include <sys/domain.h>
47228386Sjfv#include <sys/syslog.h>
48228386Sjfv#include <net/radix.h>
49228386Sjfv#include <net/radix_mpath.h>
50228386Sjfv#include <net/route.h>
51228386Sjfv#include <net/if.h>
52228386Sjfv#include <net/if_var.h>
53228386Sjfv
54228386Sjfv/*
55228386Sjfv * give some jitter to hash, to avoid synchronization between routers
56228386Sjfv */
57228386Sjfvstatic uint32_t hashjitter;
58228386Sjfv
59228386Sjfvint
60228386Sjfvrn_mpath_capable(struct radix_node_head *rnh)
61228386Sjfv{
62228386Sjfv
63228386Sjfv	return rnh->rnh_multipath;
64228386Sjfv}
65228386Sjfv
66228386Sjfvstruct radix_node *
67228386Sjfvrn_mpath_next(struct radix_node *rn)
68228386Sjfv{
69228386Sjfv	struct radix_node *next;
70228386Sjfv
71228386Sjfv	if (!rn->rn_dupedkey)
72228386Sjfv		return NULL;
73228386Sjfv	next = rn->rn_dupedkey;
74228386Sjfv	if (rn->rn_mask == next->rn_mask)
75228386Sjfv		return next;
76228386Sjfv	else
77228386Sjfv		return NULL;
78228386Sjfv}
79228386Sjfv
80228386Sjfvuint32_t
81228386Sjfvrn_mpath_count(struct radix_node *rn)
82228386Sjfv{
83228386Sjfv	uint32_t i = 0;
84228386Sjfv	struct rtentry *rt;
85228386Sjfv
86228386Sjfv	while (rn != NULL) {
87228386Sjfv		rt = (struct rtentry *)rn;
88228386Sjfv		i += rt->rt_rmx.rmx_weight;
89228386Sjfv		rn = rn_mpath_next(rn);
90228386Sjfv	}
91228386Sjfv	return (i);
92228386Sjfv}
93228386Sjfv
94228386Sjfvstruct rtentry *
95228386Sjfvrt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate)
96228386Sjfv{
97228386Sjfv	struct radix_node *rn;
98228386Sjfv
99228386Sjfv	if (!rn_mpath_next((struct radix_node *)rt))
100228386Sjfv		return rt;
101228386Sjfv
102228386Sjfv	if (!gate)
103228386Sjfv		return NULL;
104228386Sjfv
105228386Sjfv	/* beyond here, we use rn as the master copy */
106228386Sjfv	rn = (struct radix_node *)rt;
107228386Sjfv	do {
108228386Sjfv		rt = (struct rtentry *)rn;
109228386Sjfv		/*
110228386Sjfv		 * we are removing an address alias that has
111228386Sjfv		 * the same prefix as another address
112228386Sjfv		 * we need to compare the interface address because
113228386Sjfv		 * rt_gateway is a special sockadd_dl structure
114228386Sjfv		 */
115228386Sjfv		if (rt->rt_gateway->sa_family == AF_LINK) {
116228386Sjfv			if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len))
117228386Sjfv				break;
118228386Sjfv		} else {
119228386Sjfv			if (rt->rt_gateway->sa_len == gate->sa_len &&
120228386Sjfv			    !memcmp(rt->rt_gateway, gate, gate->sa_len))
121228386Sjfv				break;
122228386Sjfv		}
123228386Sjfv	} while ((rn = rn_mpath_next(rn)) != NULL);
124228386Sjfv
125228386Sjfv	return (struct rtentry *)rn;
126228386Sjfv}
127228386Sjfv
128228386Sjfv/*
129228386Sjfv * go through the chain and unlink "rt" from the list
130228386Sjfv * the caller will free "rt"
131228386Sjfv */
132247064Sjfvint
133247064Sjfvrt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt)
134247064Sjfv{
135247064Sjfv        struct radix_node *t, *tt;
136269196Sjfv
137269196Sjfv        if (!headrt || !rt)
138269196Sjfv            return (0);
139269196Sjfv        t = (struct radix_node *)headrt;
140228386Sjfv        tt = rn_mpath_next(t);
141228386Sjfv        while (tt) {
142228386Sjfv            if (tt == (struct radix_node *)rt) {
143228386Sjfv                t->rn_dupedkey = tt->rn_dupedkey;
144228386Sjfv                tt->rn_dupedkey = NULL;
145228386Sjfv    	        tt->rn_flags &= ~RNF_ACTIVE;
146228386Sjfv	        tt[1].rn_flags &= ~RNF_ACTIVE;
147228386Sjfv                return (1);
148228386Sjfv            }
149247064Sjfv            t = tt;
150228386Sjfv            tt = rn_mpath_next((struct radix_node *)t);
151247064Sjfv        }
152228386Sjfv        return (0);
153228386Sjfv}
154228386Sjfv
155228386Sjfv/*
156228386Sjfv * check if we have the same key/mask/gateway on the table already.
157228386Sjfv */
158228386Sjfvint
159228386Sjfvrt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt,
160228386Sjfv    struct sockaddr *netmask)
161228386Sjfv{
162228386Sjfv	struct radix_node *rn, *rn1;
163228386Sjfv	struct rtentry *rt1;
164228386Sjfv	char *p, *q, *eq;
165228386Sjfv	int same, l, skip;
166238148Sjfv
167238148Sjfv	rn = (struct radix_node *)rt;
168238148Sjfv	rn1 = rnh->rnh_lookup(rt_key(rt), netmask, rnh);
169238148Sjfv	if (!rn1 || rn1->rn_flags & RNF_ROOT)
170238148Sjfv		return 0;
171238148Sjfv
172256200Sjfv	/*
173256200Sjfv	 * unlike other functions we have in this file, we have to check
174238148Sjfv	 * all key/mask/gateway as rnh_lookup can match less specific entry.
175256200Sjfv	 */
176256200Sjfv	rt1 = (struct rtentry *)rn1;
177256200Sjfv
178228386Sjfv	/* compare key. */
179228386Sjfv	if (rt_key(rt1)->sa_len != rt_key(rt)->sa_len ||
180228386Sjfv	    bcmp(rt_key(rt1), rt_key(rt), rt_key(rt1)->sa_len))
181228386Sjfv		goto different;
182247064Sjfv
183228386Sjfv	/* key was the same.  compare netmask.  hairy... */
184228386Sjfv	if (rt_mask(rt1) && netmask) {
185228386Sjfv		skip = rnh->rnh_treetop->rn_offset;
186228386Sjfv		if (rt_mask(rt1)->sa_len > netmask->sa_len) {
187228386Sjfv			/*
188169240Sjfv			 * as rt_mask(rt1) is made optimal by radix.c,
189228386Sjfv			 * there must be some 1-bits on rt_mask(rt1)
190228386Sjfv			 * after netmask->sa_len.  therefore, in
191228386Sjfv			 * this case, the entries are different.
192228386Sjfv			 */
193169240Sjfv			if (rt_mask(rt1)->sa_len > skip)
194228386Sjfv				goto different;
195228386Sjfv			else {
196228386Sjfv				/* no bits to compare, i.e. same*/
197228386Sjfv				goto maskmatched;
198190872Sjfv			}
199181027Sjfv		}
200169240Sjfv
201169240Sjfv		l = rt_mask(rt1)->sa_len;
202169240Sjfv		if (skip > l) {
203169240Sjfv			/* no bits to compare, i.e. same */
204169240Sjfv			goto maskmatched;
205169240Sjfv		}
206169240Sjfv		p = (char *)rt_mask(rt1);
207169240Sjfv		q = (char *)netmask;
208169240Sjfv		if (bcmp(p + skip, q + skip, l - skip))
209169240Sjfv			goto different;
210169240Sjfv		/*
211169240Sjfv		 * need to go through all the bit, as netmask is not
212169240Sjfv		 * optimal and can contain trailing 0s
213169240Sjfv		 */
214169240Sjfv		eq = (char *)netmask + netmask->sa_len;
215169240Sjfv		q += l;
216178523Sjfv		same = 1;
217194865Sjfv		while (eq > q)
218169240Sjfv			if (*q++) {
219169240Sjfv				same = 0;
220169240Sjfv				break;
221178523Sjfv			}
222194865Sjfv		if (!same)
223213234Sjfv			goto different;
224247064Sjfv	} else if (!rt_mask(rt1) && !netmask)
225177867Sjfv		; /* no mask to compare, i.e. same */
226181027Sjfv	else {
227200243Sjfv		/* one has mask and the other does not, different */
228218530Sjfv		goto different;
229256200Sjfv	}
230238148Sjfv
231238148Sjfvmaskmatched:
232209616Sjfv
233218530Sjfv	/* key/mask were the same.  compare gateway for all multipaths */
234177867Sjfv	do {
235181027Sjfv		rt1 = (struct rtentry *)rn1;
236169240Sjfv
237181027Sjfv		/* sanity: no use in comparing the same thing */
238169240Sjfv		if (rn1 == rn)
239169240Sjfv			continue;
240169240Sjfv
241169240Sjfv		if (rt1->rt_gateway->sa_family == AF_LINK) {
242169240Sjfv			if (rt1->rt_ifa->ifa_addr->sa_len != rt->rt_ifa->ifa_addr->sa_len ||
243181027Sjfv			    bcmp(rt1->rt_ifa->ifa_addr, rt->rt_ifa->ifa_addr,
244169240Sjfv			    rt1->rt_ifa->ifa_addr->sa_len))
245181027Sjfv				continue;
246169240Sjfv		} else {
247169240Sjfv			if (rt1->rt_gateway->sa_len != rt->rt_gateway->sa_len ||
248169240Sjfv			    bcmp(rt1->rt_gateway, rt->rt_gateway,
249169240Sjfv			    rt1->rt_gateway->sa_len))
250169240Sjfv				continue;
251256200Sjfv		}
252169240Sjfv
253181027Sjfv		/* all key/mask/gateway are the same.  conflicting entry. */
254169240Sjfv		return EEXIST;
255181027Sjfv	} while ((rn1 = rn_mpath_next(rn1)) != NULL);
256169240Sjfv
257169240Sjfvdifferent:
258169240Sjfv	return 0;
259169240Sjfv}
260169240Sjfv
261181027Sjfvvoid
262169240Sjfvrtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
263181027Sjfv{
264169240Sjfv	struct radix_node *rn0, *rn;
265169240Sjfv	u_int32_t n;
266169240Sjfv	struct rtentry *rt;
267169240Sjfv	int64_t weight;
268169240Sjfv
269169240Sjfv	/*
270169240Sjfv	 * XXX we don't attempt to lookup cached route again; what should
271169240Sjfv	 * be done for sendto(3) case?
272176667Sjfv	 */
273194865Sjfv	if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP))
274194865Sjfv		return;
275213234Sjfv	ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum);
276247064Sjfv
277200243Sjfv	/* if the route does not exist or it is not multipath, don't care */
278181027Sjfv	if (ro->ro_rt == NULL)
279238148Sjfv		return;
280181027Sjfv	if (rn_mpath_next((struct radix_node *)ro->ro_rt) == NULL) {
281169240Sjfv		RT_UNLOCK(ro->ro_rt);
282181027Sjfv		return;
283169240Sjfv	}
284169240Sjfv
285169240Sjfv	/* beyond here, we use rn as the master copy */
286169240Sjfv	rn0 = rn = (struct radix_node *)ro->ro_rt;
287169240Sjfv	n = rn_mpath_count(rn0);
288181027Sjfv
289169240Sjfv	/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
290181027Sjfv	hash += hashjitter;
291169240Sjfv	hash %= n;
292169240Sjfv	for (weight = abs((int32_t)hash), rt = ro->ro_rt;
293169240Sjfv	     weight >= rt->rt_rmx.rmx_weight && rn;
294169240Sjfv	     weight -= rt->rt_rmx.rmx_weight) {
295169240Sjfv
296169240Sjfv		/* stay within the multipath routes */
297169240Sjfv		if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
298173788Sjfv			break;
299169240Sjfv		rn = rn->rn_dupedkey;
300181027Sjfv		rt = (struct rtentry *)rn;
301169240Sjfv	}
302181027Sjfv	/* XXX try filling rt_gwroute and avoid unreachable gw  */
303169240Sjfv
304169240Sjfv	/* gw selection has failed - there must be only zero weight routes */
305169240Sjfv	if (!rn) {
306169240Sjfv		RT_UNLOCK(ro->ro_rt);
307173788Sjfv		ro->ro_rt = NULL;
308169240Sjfv		return;
309169240Sjfv	}
310169240Sjfv	if (ro->ro_rt != rt) {
311181027Sjfv		RTFREE_LOCKED(ro->ro_rt);
312169240Sjfv		ro->ro_rt = (struct rtentry *)rn;
313181027Sjfv		RT_LOCK(ro->ro_rt);
314169240Sjfv		RT_ADDREF(ro->ro_rt);
315169240Sjfv
316169240Sjfv	}
317181027Sjfv	RT_UNLOCK(ro->ro_rt);
318169240Sjfv}
319181027Sjfv
320169240Sjfvextern int	in6_inithead(void **head, int off);
321169240Sjfvextern int	in_inithead(void **head, int off);
322169240Sjfv
323181027Sjfv#ifdef INET
324169240Sjfvint
325185353Sjfvrn4_mpath_inithead(void **head, int off)
326169240Sjfv{
327169240Sjfv	struct radix_node_head *rnh;
328169240Sjfv
329169240Sjfv	hashjitter = arc4random();
330169240Sjfv	if (in_inithead(head, off) == 1) {
331181027Sjfv		rnh = (struct radix_node_head *)*head;
332169240Sjfv		rnh->rnh_multipath = 1;
333181027Sjfv		return 1;
334169240Sjfv	} else
335169240Sjfv		return 0;
336169240Sjfv}
337181027Sjfv#endif
338169240Sjfv
339181027Sjfv#ifdef INET6
340169240Sjfvint
341169240Sjfvrn6_mpath_inithead(void **head, int off)
342169240Sjfv{
343169240Sjfv	struct radix_node_head *rnh;
344181027Sjfv
345169240Sjfv	hashjitter = arc4random();
346185353Sjfv	if (in6_inithead(head, off) == 1) {
347185353Sjfv		rnh = (struct radix_node_head *)*head;
348185353Sjfv		rnh->rnh_multipath = 1;
349185353Sjfv		return 1;
350185353Sjfv	} else
351185353Sjfv		return 0;
352185353Sjfv}
353185353Sjfv
354185353Sjfv#endif
355185353Sjfv