radix_mpath.c revision 191080
1177867Sjfv/* $KAME: radix_mpath.c,v 1.17 2004/11/08 10:29:39 itojun Exp $ */ 2169240Sjfv 3269196Sjfv/* 4169240Sjfv * Copyright (C) 2001 WIDE Project. 5169240Sjfv * All rights reserved. 6169240Sjfv * 7169240Sjfv * Redistribution and use in source and binary forms, with or without 8169240Sjfv * modification, are permitted provided that the following conditions 9169240Sjfv * are met: 10169240Sjfv * 1. Redistributions of source code must retain the above copyright 11169240Sjfv * notice, this list of conditions and the following disclaimer. 12169240Sjfv * 2. Redistributions in binary form must reproduce the above copyright 13169240Sjfv * notice, this list of conditions and the following disclaimer in the 14169240Sjfv * documentation and/or other materials provided with the distribution. 15169240Sjfv * 3. Neither the name of the project nor the names of its contributors 16169240Sjfv * may be used to endorse or promote products derived from this software 17169240Sjfv * without specific prior written permission. 18169240Sjfv * 19169240Sjfv * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND 20169240Sjfv * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21169240Sjfv * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22169240Sjfv * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE 23169240Sjfv * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24169240Sjfv * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25169240Sjfv * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26169240Sjfv * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27169240Sjfv * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28169240Sjfv * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29169240Sjfv * SUCH DAMAGE. 30169240Sjfv * THE AUTHORS DO NOT GUARANTEE THAT THIS SOFTWARE DOES NOT INFRINGE 31169240Sjfv * ANY OTHERS' INTELLECTUAL PROPERTIES. IN NO EVENT SHALL THE AUTHORS 32177867Sjfv * BE LIABLE FOR ANY INFRINGEMENT OF ANY OTHERS' INTELLECTUAL 33177867Sjfv * PROPERTIES. 34169240Sjfv */ 35169240Sjfv 36169240Sjfv#include <sys/cdefs.h> 37169240Sjfv__FBSDID("$FreeBSD: head/sys/net/radix_mpath.c 191080 2009-04-14 23:05:36Z kmacy $"); 38169240Sjfv 39169240Sjfv#include "opt_inet.h" 40169240Sjfv#include "opt_inet6.h" 41169240Sjfv 42169240Sjfv#include <sys/param.h> 43169240Sjfv#include <sys/systm.h> 44228386Sjfv#include <sys/malloc.h> 45228386Sjfv#include <sys/socket.h> 46228386Sjfv#include <sys/domain.h> 47228386Sjfv#include <sys/syslog.h> 48228386Sjfv#include <net/radix.h> 49228386Sjfv#include <net/radix_mpath.h> 50228386Sjfv#include <net/route.h> 51228386Sjfv#include <net/if.h> 52228386Sjfv#include <net/if_var.h> 53228386Sjfv 54228386Sjfv/* 55228386Sjfv * give some jitter to hash, to avoid synchronization between routers 56228386Sjfv */ 57228386Sjfvstatic uint32_t hashjitter; 58228386Sjfv 59228386Sjfvint 60228386Sjfvrn_mpath_capable(struct radix_node_head *rnh) 61228386Sjfv{ 62228386Sjfv 63228386Sjfv return rnh->rnh_multipath; 64228386Sjfv} 65228386Sjfv 66228386Sjfvstruct radix_node * 67228386Sjfvrn_mpath_next(struct radix_node *rn) 68228386Sjfv{ 69228386Sjfv struct radix_node *next; 70228386Sjfv 71228386Sjfv if (!rn->rn_dupedkey) 72228386Sjfv return NULL; 73228386Sjfv next = rn->rn_dupedkey; 74228386Sjfv if (rn->rn_mask == next->rn_mask) 75228386Sjfv return next; 76228386Sjfv else 77228386Sjfv return NULL; 78228386Sjfv} 79228386Sjfv 80228386Sjfvuint32_t 81228386Sjfvrn_mpath_count(struct radix_node *rn) 82228386Sjfv{ 83228386Sjfv uint32_t i = 0; 84228386Sjfv struct rtentry *rt; 85228386Sjfv 86228386Sjfv while (rn != NULL) { 87228386Sjfv rt = (struct rtentry *)rn; 88228386Sjfv i += rt->rt_rmx.rmx_weight; 89228386Sjfv rn = rn_mpath_next(rn); 90228386Sjfv } 91228386Sjfv return (i); 92228386Sjfv} 93228386Sjfv 94228386Sjfvstruct rtentry * 95228386Sjfvrt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate) 96228386Sjfv{ 97228386Sjfv struct radix_node *rn; 98228386Sjfv 99228386Sjfv if (!rn_mpath_next((struct radix_node *)rt)) 100228386Sjfv return rt; 101228386Sjfv 102228386Sjfv if (!gate) 103228386Sjfv return NULL; 104228386Sjfv 105228386Sjfv /* beyond here, we use rn as the master copy */ 106228386Sjfv rn = (struct radix_node *)rt; 107228386Sjfv do { 108228386Sjfv rt = (struct rtentry *)rn; 109228386Sjfv /* 110228386Sjfv * we are removing an address alias that has 111228386Sjfv * the same prefix as another address 112228386Sjfv * we need to compare the interface address because 113228386Sjfv * rt_gateway is a special sockadd_dl structure 114228386Sjfv */ 115228386Sjfv if (rt->rt_gateway->sa_family == AF_LINK) { 116228386Sjfv if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len)) 117228386Sjfv break; 118228386Sjfv } else { 119228386Sjfv if (rt->rt_gateway->sa_len == gate->sa_len && 120228386Sjfv !memcmp(rt->rt_gateway, gate, gate->sa_len)) 121228386Sjfv break; 122228386Sjfv } 123228386Sjfv } while ((rn = rn_mpath_next(rn)) != NULL); 124228386Sjfv 125228386Sjfv return (struct rtentry *)rn; 126228386Sjfv} 127228386Sjfv 128228386Sjfv/* 129228386Sjfv * go through the chain and unlink "rt" from the list 130228386Sjfv * the caller will free "rt" 131228386Sjfv */ 132247064Sjfvint 133247064Sjfvrt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt) 134247064Sjfv{ 135247064Sjfv struct radix_node *t, *tt; 136269196Sjfv 137269196Sjfv if (!headrt || !rt) 138269196Sjfv return (0); 139269196Sjfv t = (struct radix_node *)headrt; 140228386Sjfv tt = rn_mpath_next(t); 141228386Sjfv while (tt) { 142228386Sjfv if (tt == (struct radix_node *)rt) { 143228386Sjfv t->rn_dupedkey = tt->rn_dupedkey; 144228386Sjfv tt->rn_dupedkey = NULL; 145228386Sjfv tt->rn_flags &= ~RNF_ACTIVE; 146228386Sjfv tt[1].rn_flags &= ~RNF_ACTIVE; 147228386Sjfv return (1); 148228386Sjfv } 149247064Sjfv t = tt; 150228386Sjfv tt = rn_mpath_next((struct radix_node *)t); 151247064Sjfv } 152228386Sjfv return (0); 153228386Sjfv} 154228386Sjfv 155228386Sjfv/* 156228386Sjfv * check if we have the same key/mask/gateway on the table already. 157228386Sjfv */ 158228386Sjfvint 159228386Sjfvrt_mpath_conflict(struct radix_node_head *rnh, struct rtentry *rt, 160228386Sjfv struct sockaddr *netmask) 161228386Sjfv{ 162228386Sjfv struct radix_node *rn, *rn1; 163228386Sjfv struct rtentry *rt1; 164228386Sjfv char *p, *q, *eq; 165228386Sjfv int same, l, skip; 166238148Sjfv 167238148Sjfv rn = (struct radix_node *)rt; 168238148Sjfv rn1 = rnh->rnh_lookup(rt_key(rt), netmask, rnh); 169238148Sjfv if (!rn1 || rn1->rn_flags & RNF_ROOT) 170238148Sjfv return 0; 171238148Sjfv 172256200Sjfv /* 173256200Sjfv * unlike other functions we have in this file, we have to check 174238148Sjfv * all key/mask/gateway as rnh_lookup can match less specific entry. 175256200Sjfv */ 176256200Sjfv rt1 = (struct rtentry *)rn1; 177256200Sjfv 178228386Sjfv /* compare key. */ 179228386Sjfv if (rt_key(rt1)->sa_len != rt_key(rt)->sa_len || 180228386Sjfv bcmp(rt_key(rt1), rt_key(rt), rt_key(rt1)->sa_len)) 181228386Sjfv goto different; 182247064Sjfv 183228386Sjfv /* key was the same. compare netmask. hairy... */ 184228386Sjfv if (rt_mask(rt1) && netmask) { 185228386Sjfv skip = rnh->rnh_treetop->rn_offset; 186228386Sjfv if (rt_mask(rt1)->sa_len > netmask->sa_len) { 187228386Sjfv /* 188169240Sjfv * as rt_mask(rt1) is made optimal by radix.c, 189228386Sjfv * there must be some 1-bits on rt_mask(rt1) 190228386Sjfv * after netmask->sa_len. therefore, in 191228386Sjfv * this case, the entries are different. 192228386Sjfv */ 193169240Sjfv if (rt_mask(rt1)->sa_len > skip) 194228386Sjfv goto different; 195228386Sjfv else { 196228386Sjfv /* no bits to compare, i.e. same*/ 197228386Sjfv goto maskmatched; 198190872Sjfv } 199181027Sjfv } 200169240Sjfv 201169240Sjfv l = rt_mask(rt1)->sa_len; 202169240Sjfv if (skip > l) { 203169240Sjfv /* no bits to compare, i.e. same */ 204169240Sjfv goto maskmatched; 205169240Sjfv } 206169240Sjfv p = (char *)rt_mask(rt1); 207169240Sjfv q = (char *)netmask; 208169240Sjfv if (bcmp(p + skip, q + skip, l - skip)) 209169240Sjfv goto different; 210169240Sjfv /* 211169240Sjfv * need to go through all the bit, as netmask is not 212169240Sjfv * optimal and can contain trailing 0s 213169240Sjfv */ 214169240Sjfv eq = (char *)netmask + netmask->sa_len; 215169240Sjfv q += l; 216178523Sjfv same = 1; 217194865Sjfv while (eq > q) 218169240Sjfv if (*q++) { 219169240Sjfv same = 0; 220169240Sjfv break; 221178523Sjfv } 222194865Sjfv if (!same) 223213234Sjfv goto different; 224247064Sjfv } else if (!rt_mask(rt1) && !netmask) 225177867Sjfv ; /* no mask to compare, i.e. same */ 226181027Sjfv else { 227200243Sjfv /* one has mask and the other does not, different */ 228218530Sjfv goto different; 229256200Sjfv } 230238148Sjfv 231238148Sjfvmaskmatched: 232209616Sjfv 233218530Sjfv /* key/mask were the same. compare gateway for all multipaths */ 234177867Sjfv do { 235181027Sjfv rt1 = (struct rtentry *)rn1; 236169240Sjfv 237181027Sjfv /* sanity: no use in comparing the same thing */ 238169240Sjfv if (rn1 == rn) 239169240Sjfv continue; 240169240Sjfv 241169240Sjfv if (rt1->rt_gateway->sa_family == AF_LINK) { 242169240Sjfv if (rt1->rt_ifa->ifa_addr->sa_len != rt->rt_ifa->ifa_addr->sa_len || 243181027Sjfv bcmp(rt1->rt_ifa->ifa_addr, rt->rt_ifa->ifa_addr, 244169240Sjfv rt1->rt_ifa->ifa_addr->sa_len)) 245181027Sjfv continue; 246169240Sjfv } else { 247169240Sjfv if (rt1->rt_gateway->sa_len != rt->rt_gateway->sa_len || 248169240Sjfv bcmp(rt1->rt_gateway, rt->rt_gateway, 249169240Sjfv rt1->rt_gateway->sa_len)) 250169240Sjfv continue; 251256200Sjfv } 252169240Sjfv 253181027Sjfv /* all key/mask/gateway are the same. conflicting entry. */ 254169240Sjfv return EEXIST; 255181027Sjfv } while ((rn1 = rn_mpath_next(rn1)) != NULL); 256169240Sjfv 257169240Sjfvdifferent: 258169240Sjfv return 0; 259169240Sjfv} 260169240Sjfv 261181027Sjfvvoid 262169240Sjfvrtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum) 263181027Sjfv{ 264169240Sjfv struct radix_node *rn0, *rn; 265169240Sjfv u_int32_t n; 266169240Sjfv struct rtentry *rt; 267169240Sjfv int64_t weight; 268169240Sjfv 269169240Sjfv /* 270169240Sjfv * XXX we don't attempt to lookup cached route again; what should 271169240Sjfv * be done for sendto(3) case? 272176667Sjfv */ 273194865Sjfv if (ro->ro_rt && ro->ro_rt->rt_ifp && (ro->ro_rt->rt_flags & RTF_UP)) 274194865Sjfv return; 275213234Sjfv ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, 0, fibnum); 276247064Sjfv 277200243Sjfv /* if the route does not exist or it is not multipath, don't care */ 278181027Sjfv if (ro->ro_rt == NULL) 279238148Sjfv return; 280181027Sjfv if (rn_mpath_next((struct radix_node *)ro->ro_rt) == NULL) { 281169240Sjfv RT_UNLOCK(ro->ro_rt); 282181027Sjfv return; 283169240Sjfv } 284169240Sjfv 285169240Sjfv /* beyond here, we use rn as the master copy */ 286169240Sjfv rn0 = rn = (struct radix_node *)ro->ro_rt; 287169240Sjfv n = rn_mpath_count(rn0); 288181027Sjfv 289169240Sjfv /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */ 290181027Sjfv hash += hashjitter; 291169240Sjfv hash %= n; 292169240Sjfv for (weight = abs((int32_t)hash), rt = ro->ro_rt; 293169240Sjfv weight >= rt->rt_rmx.rmx_weight && rn; 294169240Sjfv weight -= rt->rt_rmx.rmx_weight) { 295169240Sjfv 296169240Sjfv /* stay within the multipath routes */ 297169240Sjfv if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask) 298173788Sjfv break; 299169240Sjfv rn = rn->rn_dupedkey; 300181027Sjfv rt = (struct rtentry *)rn; 301169240Sjfv } 302181027Sjfv /* XXX try filling rt_gwroute and avoid unreachable gw */ 303169240Sjfv 304169240Sjfv /* gw selection has failed - there must be only zero weight routes */ 305169240Sjfv if (!rn) { 306169240Sjfv RT_UNLOCK(ro->ro_rt); 307173788Sjfv ro->ro_rt = NULL; 308169240Sjfv return; 309169240Sjfv } 310169240Sjfv if (ro->ro_rt != rt) { 311181027Sjfv RTFREE_LOCKED(ro->ro_rt); 312169240Sjfv ro->ro_rt = (struct rtentry *)rn; 313181027Sjfv RT_LOCK(ro->ro_rt); 314169240Sjfv RT_ADDREF(ro->ro_rt); 315169240Sjfv 316169240Sjfv } 317181027Sjfv RT_UNLOCK(ro->ro_rt); 318169240Sjfv} 319181027Sjfv 320169240Sjfvextern int in6_inithead(void **head, int off); 321169240Sjfvextern int in_inithead(void **head, int off); 322169240Sjfv 323181027Sjfv#ifdef INET 324169240Sjfvint 325185353Sjfvrn4_mpath_inithead(void **head, int off) 326169240Sjfv{ 327169240Sjfv struct radix_node_head *rnh; 328169240Sjfv 329169240Sjfv hashjitter = arc4random(); 330169240Sjfv if (in_inithead(head, off) == 1) { 331181027Sjfv rnh = (struct radix_node_head *)*head; 332169240Sjfv rnh->rnh_multipath = 1; 333181027Sjfv return 1; 334169240Sjfv } else 335169240Sjfv return 0; 336169240Sjfv} 337181027Sjfv#endif 338169240Sjfv 339181027Sjfv#ifdef INET6 340169240Sjfvint 341169240Sjfvrn6_mpath_inithead(void **head, int off) 342169240Sjfv{ 343169240Sjfv struct radix_node_head *rnh; 344181027Sjfv 345169240Sjfv hashjitter = arc4random(); 346185353Sjfv if (in6_inithead(head, off) == 1) { 347185353Sjfv rnh = (struct radix_node_head *)*head; 348185353Sjfv rnh->rnh_multipath = 1; 349185353Sjfv return 1; 350185353Sjfv } else 351185353Sjfv return 0; 352185353Sjfv} 353185353Sjfv 354185353Sjfv#endif 355185353Sjfv