ip_mroute.c revision 166576
187416Sdes/*- 287416Sdes * Copyright (c) 1989 Stephen Deering 387416Sdes * Copyright (c) 1992, 1993 487416Sdes * The Regents of the University of California. All rights reserved. 587416Sdes * 687416Sdes * This code is derived from software contributed to Berkeley by 787416Sdes * Stephen Deering of Stanford University. 887416Sdes * 987416Sdes * Redistribution and use in source and binary forms, with or without 1087416Sdes * modification, are permitted provided that the following conditions 1187416Sdes * are met: 1287416Sdes * 1. Redistributions of source code must retain the above copyright 1387416Sdes * notice, this list of conditions and the following disclaimer. 1487416Sdes * 2. Redistributions in binary form must reproduce the above copyright 1587416Sdes * notice, this list of conditions and the following disclaimer in the 1687416Sdes * documentation and/or other materials provided with the distribution. 1787416Sdes * 4. Neither the name of the University nor the names of its contributors 1887416Sdes * may be used to endorse or promote products derived from this software 1987416Sdes * without specific prior written permission. 2087416Sdes * 2187416Sdes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 2287416Sdes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 2387416Sdes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 2487416Sdes * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 2587416Sdes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2687416Sdes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2787416Sdes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2887416Sdes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2987416Sdes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 3087416Sdes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 3187416Sdes * SUCH DAMAGE. 3287416Sdes * 3387416Sdes * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 3487416Sdes */ 3587416Sdes 3687416Sdes/* 3787416Sdes * IP multicast forwarding procedures 3887416Sdes * 3987416Sdes * Written by David Waitzman, BBN Labs, August 1988. 4087416Sdes * Modified by Steve Deering, Stanford, February 1989. 4187416Sdes * Modified by Mark J. Steiglitz, Stanford, May, 1991 4287416Sdes * Modified by Van Jacobson, LBL, January 1993 4387416Sdes * Modified by Ajit Thyagarajan, PARC, August 1993 4487416Sdes * Modified by Bill Fenner, PARC, April 1995 4587416Sdes * Modified by Ahmed Helmy, SGI, June 1996 4687416Sdes * Modified by George Edmond Eddy (Rusty), ISI, February 1998 4787416Sdes * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 4887416Sdes * Modified by Hitoshi Asaeda, WIDE, August 2000 4987416Sdes * Modified by Pavlin Radoslavov, ICSI, October 2002 5087416Sdes * 5187416Sdes * MROUTING Revision: 3.5 5287416Sdes * and PIM-SMv2 and PIM-DM support, advanced API support, 5387416Sdes * bandwidth metering and signaling 5487416Sdes * 5587416Sdes * $FreeBSD: head/sys/netinet/ip_mroute.c 166576 2007-02-08 23:05:08Z bms $ 5687416Sdes */ 5787416Sdes 5887416Sdes#include "opt_mac.h" 5987416Sdes#include "opt_mrouting.h" 6087416Sdes 6187416Sdes#ifdef PIM 6287416Sdes#define _PIM_VT 1 6387416Sdes#endif 6487416Sdes 6587416Sdes#include <sys/param.h> 6687416Sdes#include 
<sys/kernel.h> 6787416Sdes#include <sys/lock.h> 6887416Sdes#include <sys/malloc.h> 6987416Sdes#include <sys/mbuf.h> 7087416Sdes#include <sys/module.h> 7187416Sdes#include <sys/priv.h> 7287416Sdes#include <sys/protosw.h> 7387416Sdes#include <sys/signalvar.h> 74#include <sys/socket.h> 75#include <sys/socketvar.h> 76#include <sys/sockio.h> 77#include <sys/sx.h> 78#include <sys/sysctl.h> 79#include <sys/syslog.h> 80#include <sys/systm.h> 81#include <sys/time.h> 82#include <net/if.h> 83#include <net/netisr.h> 84#include <net/route.h> 85#include <netinet/in.h> 86#include <netinet/igmp.h> 87#include <netinet/in_systm.h> 88#include <netinet/in_var.h> 89#include <netinet/ip.h> 90#include <netinet/ip_encap.h> 91#include <netinet/ip_mroute.h> 92#include <netinet/ip_var.h> 93#include <netinet/ip_options.h> 94#ifdef PIM 95#include <netinet/pim.h> 96#include <netinet/pim_var.h> 97#endif 98#include <netinet/udp.h> 99#include <machine/in_cksum.h> 100 101#include <security/mac/mac_framework.h> 102 103/* 104 * Control debugging code for rsvp and multicast routing code. 105 * Can only set them with the debugger. 106 */ 107static u_int rsvpdebug; /* non-zero enables debugging */ 108 109static u_int mrtdebug; /* any set of the flags below */ 110#define DEBUG_MFC 0x02 111#define DEBUG_FORWARD 0x04 112#define DEBUG_EXPIRE 0x08 113#define DEBUG_XMIT 0x10 114#define DEBUG_PIM 0x20 115 116#define VIFI_INVALID ((vifi_t) -1) 117 118#define M_HASCL(m) ((m)->m_flags & M_EXT) 119 120static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); 121 122/* 123 * Locking. We use two locks: one for the virtual interface table and 124 * one for the forwarding table. These locks may be nested in which case 125 * the VIF lock must always be taken first. Note that each lock is used 126 * to cover not only the specific data structure but also related data 127 * structures. It may be better to add more fine-grained locking later; 128 * it's not clear how performance-critical this code is. 
129 * 130 * XXX: This module could particularly benefit from being cleaned 131 * up to use the <sys/queue.h> macros. 132 * 133 */ 134 135static struct mrtstat mrtstat; 136SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, 137 &mrtstat, mrtstat, 138 "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); 139 140static struct mfc *mfctable[MFCTBLSIZ]; 141SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, 142 &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", 143 "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); 144 145static struct mtx mfc_mtx; 146#define MFC_LOCK() mtx_lock(&mfc_mtx) 147#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) 148#define MFC_LOCK_ASSERT() do { \ 149 mtx_assert(&mfc_mtx, MA_OWNED); \ 150 NET_ASSERT_GIANT(); \ 151} while (0) 152#define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) 153#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) 154 155static struct vif viftable[MAXVIFS]; 156SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, 157 &viftable, sizeof(viftable), "S,vif[MAXVIFS]", 158 "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); 159 160static struct mtx vif_mtx; 161#define VIF_LOCK() mtx_lock(&vif_mtx) 162#define VIF_UNLOCK() mtx_unlock(&vif_mtx) 163#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) 164#define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) 165#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) 166 167static u_char nexpire[MFCTBLSIZ]; 168 169static eventhandler_tag if_detach_event_tag = NULL; 170 171static struct callout expire_upcalls_ch; 172 173#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 174#define UPCALL_EXPIRE 6 /* number of timeouts */ 175 176#define ENCAP_TTL 64 177 178/* 179 * Bandwidth meter variables and constants 180 */ 181static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); 182/* 183 * Pending timeouts are stored in a hash table, the key being the 184 * 
expiration time. Periodically, the entries are analysed and processed. 185 */ 186#define BW_METER_BUCKETS 1024 187static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 188static struct callout bw_meter_ch; 189#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 190 191/* 192 * Pending upcalls are stored in a vector which is flushed when 193 * full, or periodically 194 */ 195static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 196static u_int bw_upcalls_n; /* # of pending upcalls */ 197static struct callout bw_upcalls_ch; 198#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 199 200#ifdef PIM 201static struct pimstat pimstat; 202SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, 203 &pimstat, pimstat, 204 "PIM Statistics (struct pimstat, netinet/pim_var.h)"); 205 206/* 207 * Note: the PIM Register encapsulation adds the following in front of a 208 * data packet: 209 * 210 * struct pim_encap_hdr { 211 * struct ip ip; 212 * struct pim_encap_pimhdr pim; 213 * } 214 * 215 */ 216 217struct pim_encap_pimhdr { 218 struct pim pim; 219 uint32_t flags; 220}; 221 222static struct ip pim_encap_iphdr = { 223#if BYTE_ORDER == LITTLE_ENDIAN 224 sizeof(struct ip) >> 2, 225 IPVERSION, 226#else 227 IPVERSION, 228 sizeof(struct ip) >> 2, 229#endif 230 0, /* tos */ 231 sizeof(struct ip), /* total length */ 232 0, /* id */ 233 0, /* frag offset */ 234 ENCAP_TTL, 235 IPPROTO_PIM, 236 0, /* checksum */ 237}; 238 239static struct pim_encap_pimhdr pim_encap_pimhdr = { 240 { 241 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 242 0, /* reserved */ 243 0, /* checksum */ 244 }, 245 0 /* flags */ 246}; 247 248static struct ifnet multicast_register_if; 249static vifi_t reg_vif_num = VIFI_INVALID; 250#endif /* PIM */ 251 252/* 253 * Private variables. 
254 */ 255static vifi_t numvifs; 256 257static u_long X_ip_mcast_src(int vifi); 258static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, 259 struct mbuf *m, struct ip_moptions *imo); 260static int X_ip_mrouter_done(void); 261static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); 262static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); 263static int X_legal_vif_num(int vif); 264static int X_mrt_ioctl(int cmd, caddr_t data); 265 266static int get_sg_cnt(struct sioc_sg_req *); 267static int get_vif_cnt(struct sioc_vif_req *); 268static void if_detached_event(void *arg __unused, struct ifnet *); 269static int ip_mrouter_init(struct socket *, int); 270static int add_vif(struct vifctl *); 271static int del_vif_locked(vifi_t); 272static int del_vif(vifi_t); 273static int add_mfc(struct mfcctl2 *); 274static int del_mfc(struct mfcctl2 *); 275static int set_api_config(uint32_t *); /* chose API capabilities */ 276static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 277static int set_assert(int); 278static void expire_upcalls(void *); 279static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); 280static void phyint_send(struct ip *, struct vif *, struct mbuf *); 281static void send_packet(struct vif *, struct mbuf *); 282 283/* 284 * Bandwidth monitoring 285 */ 286static void free_bw_list(struct bw_meter *list); 287static int add_bw_upcall(struct bw_upcall *); 288static int del_bw_upcall(struct bw_upcall *); 289static void bw_meter_receive_packet(struct bw_meter *x, int plen, 290 struct timeval *nowp); 291static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); 292static void bw_upcalls_send(void); 293static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); 294static void unschedule_bw_meter(struct bw_meter *x); 295static void bw_meter_process(void); 296static void expire_bw_upcalls_send(void *); 297static void expire_bw_meter_process(void *); 298 299#ifdef PIM 
300static int pim_register_send(struct ip *, struct vif *, 301 struct mbuf *, struct mfc *); 302static int pim_register_send_rp(struct ip *, struct vif *, 303 struct mbuf *, struct mfc *); 304static int pim_register_send_upcall(struct ip *, struct vif *, 305 struct mbuf *, struct mfc *); 306static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 307#endif 308 309/* 310 * whether or not special PIM assert processing is enabled. 311 */ 312static int pim_assert; 313/* 314 * Rate limit for assert notification messages, in usec 315 */ 316#define ASSERT_MSG_TIME 3000000 317 318/* 319 * Kernel multicast routing API capabilities and setup. 320 * If more API capabilities are added to the kernel, they should be 321 * recorded in `mrt_api_support'. 322 */ 323static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 324 MRT_MFC_FLAGS_BORDER_VIF | 325 MRT_MFC_RP | 326 MRT_MFC_BW_UPCALL); 327static uint32_t mrt_api_config = 0; 328 329/* 330 * Hash function for a source, group entry 331 */ 332#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 333 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 334 335/* 336 * Find a route for a given origin IP address and Multicast group address 337 * Type of service parameter to be added in the future!!! 
338 * Statistics are updated by the caller if needed 339 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 340 */ 341static struct mfc * 342mfc_find(in_addr_t o, in_addr_t g) 343{ 344 struct mfc *rt; 345 346 MFC_LOCK_ASSERT(); 347 348 for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) 349 if ((rt->mfc_origin.s_addr == o) && 350 (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) 351 break; 352 return rt; 353} 354 355/* 356 * Macros to compute elapsed time efficiently 357 * Borrowed from Van Jacobson's scheduling code 358 */ 359#define TV_DELTA(a, b, delta) { \ 360 int xxs; \ 361 delta = (a).tv_usec - (b).tv_usec; \ 362 if ((xxs = (a).tv_sec - (b).tv_sec)) { \ 363 switch (xxs) { \ 364 case 2: \ 365 delta += 1000000; \ 366 /* FALLTHROUGH */ \ 367 case 1: \ 368 delta += 1000000; \ 369 break; \ 370 default: \ 371 delta += (1000000 * xxs); \ 372 } \ 373 } \ 374} 375 376#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ 377 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 378 379/* 380 * Handle MRT setsockopt commands to modify the multicast routing tables. 
381 */ 382static int 383X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) 384{ 385 int error, optval; 386 vifi_t vifi; 387 struct vifctl vifc; 388 struct mfcctl2 mfc; 389 struct bw_upcall bw_upcall; 390 uint32_t i; 391 392 if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) 393 return EPERM; 394 395 error = 0; 396 switch (sopt->sopt_name) { 397 case MRT_INIT: 398 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 399 if (error) 400 break; 401 error = ip_mrouter_init(so, optval); 402 break; 403 404 case MRT_DONE: 405 error = ip_mrouter_done(); 406 break; 407 408 case MRT_ADD_VIF: 409 error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); 410 if (error) 411 break; 412 error = add_vif(&vifc); 413 break; 414 415 case MRT_DEL_VIF: 416 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 417 if (error) 418 break; 419 error = del_vif(vifi); 420 break; 421 422 case MRT_ADD_MFC: 423 case MRT_DEL_MFC: 424 /* 425 * select data size depending on API version. 426 */ 427 if (sopt->sopt_name == MRT_ADD_MFC && 428 mrt_api_config & MRT_API_FLAGS_ALL) { 429 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), 430 sizeof(struct mfcctl2)); 431 } else { 432 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), 433 sizeof(struct mfcctl)); 434 bzero((caddr_t)&mfc + sizeof(struct mfcctl), 435 sizeof(mfc) - sizeof(struct mfcctl)); 436 } 437 if (error) 438 break; 439 if (sopt->sopt_name == MRT_ADD_MFC) 440 error = add_mfc(&mfc); 441 else 442 error = del_mfc(&mfc); 443 break; 444 445 case MRT_ASSERT: 446 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 447 if (error) 448 break; 449 set_assert(optval); 450 break; 451 452 case MRT_API_CONFIG: 453 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 454 if (!error) 455 error = set_api_config(&i); 456 if (!error) 457 error = sooptcopyout(sopt, &i, sizeof i); 458 break; 459 460 case MRT_ADD_BW_UPCALL: 461 case MRT_DEL_BW_UPCALL: 462 error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, 463 
sizeof bw_upcall); 464 if (error) 465 break; 466 if (sopt->sopt_name == MRT_ADD_BW_UPCALL) 467 error = add_bw_upcall(&bw_upcall); 468 else 469 error = del_bw_upcall(&bw_upcall); 470 break; 471 472 default: 473 error = EOPNOTSUPP; 474 break; 475 } 476 return error; 477} 478 479/* 480 * Handle MRT getsockopt commands 481 */ 482static int 483X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) 484{ 485 int error; 486 static int version = 0x0305; /* !!! why is this here? XXX */ 487 488 switch (sopt->sopt_name) { 489 case MRT_VERSION: 490 error = sooptcopyout(sopt, &version, sizeof version); 491 break; 492 493 case MRT_ASSERT: 494 error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); 495 break; 496 497 case MRT_API_SUPPORT: 498 error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); 499 break; 500 501 case MRT_API_CONFIG: 502 error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); 503 break; 504 505 default: 506 error = EOPNOTSUPP; 507 break; 508 } 509 return error; 510} 511 512/* 513 * Handle ioctl commands to obtain information from the cache 514 */ 515static int 516X_mrt_ioctl(int cmd, caddr_t data) 517{ 518 int error = 0; 519 520 /* 521 * Currently the only function calling this ioctl routine is rtioctl(). 
522 * Typically, only root can create the raw socket in order to execute 523 * this ioctl method, however the request might be coming from a prison 524 */ 525 error = priv_check(curthread, PRIV_NETINET_MROUTE); 526 if (error) 527 return (error); 528 switch (cmd) { 529 case (SIOCGETVIFCNT): 530 error = get_vif_cnt((struct sioc_vif_req *)data); 531 break; 532 533 case (SIOCGETSGCNT): 534 error = get_sg_cnt((struct sioc_sg_req *)data); 535 break; 536 537 default: 538 error = EINVAL; 539 break; 540 } 541 return error; 542} 543 544/* 545 * returns the packet, byte, rpf-failure count for the source group provided 546 */ 547static int 548get_sg_cnt(struct sioc_sg_req *req) 549{ 550 struct mfc *rt; 551 552 MFC_LOCK(); 553 rt = mfc_find(req->src.s_addr, req->grp.s_addr); 554 if (rt == NULL) { 555 MFC_UNLOCK(); 556 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 557 return EADDRNOTAVAIL; 558 } 559 req->pktcnt = rt->mfc_pkt_cnt; 560 req->bytecnt = rt->mfc_byte_cnt; 561 req->wrong_if = rt->mfc_wrong_if; 562 MFC_UNLOCK(); 563 return 0; 564} 565 566/* 567 * returns the input and output packet and byte counts on the vif provided 568 */ 569static int 570get_vif_cnt(struct sioc_vif_req *req) 571{ 572 vifi_t vifi = req->vifi; 573 574 VIF_LOCK(); 575 if (vifi >= numvifs) { 576 VIF_UNLOCK(); 577 return EINVAL; 578 } 579 580 req->icount = viftable[vifi].v_pkt_in; 581 req->ocount = viftable[vifi].v_pkt_out; 582 req->ibytes = viftable[vifi].v_bytes_in; 583 req->obytes = viftable[vifi].v_bytes_out; 584 VIF_UNLOCK(); 585 586 return 0; 587} 588 589static void 590ip_mrouter_reset(void) 591{ 592 bzero((caddr_t)mfctable, sizeof(mfctable)); 593 bzero((caddr_t)nexpire, sizeof(nexpire)); 594 595 pim_assert = 0; 596 mrt_api_config = 0; 597 598 callout_init(&expire_upcalls_ch, NET_CALLOUT_MPSAFE); 599 600 bw_upcalls_n = 0; 601 bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); 602 callout_init(&bw_upcalls_ch, NET_CALLOUT_MPSAFE); 603 callout_init(&bw_meter_ch, NET_CALLOUT_MPSAFE); 
604} 605 606static struct mtx mrouter_mtx; /* used to synch init/done work */ 607 608static void 609if_detached_event(void *arg __unused, struct ifnet *ifp) 610{ 611 vifi_t vifi; 612 int i; 613 struct mfc *mfc; 614 struct mfc *nmfc; 615 struct mfc **ppmfc; /* Pointer to previous node's next-pointer */ 616 struct rtdetq *pq; 617 struct rtdetq *npq; 618 619 mtx_lock(&mrouter_mtx); 620 if (ip_mrouter == NULL) { 621 mtx_unlock(&mrouter_mtx); 622 } 623 624 /* 625 * Tear down multicast forwarder state associated with this ifnet. 626 * 1. Walk the vif list, matching vifs against this ifnet. 627 * 2. Walk the multicast forwarding cache (mfc) looking for 628 * inner matches with this vif's index. 629 * 3. Free any pending mbufs for this mfc. 630 * 4. Free the associated mfc entry and state associated with this vif. 631 * Be very careful about unlinking from a singly-linked list whose 632 * "head node" is a pointer in a simple array. 633 * 5. Free vif state. This should disable ALLMULTI on the interface. 
634 */ 635 VIF_LOCK(); 636 MFC_LOCK(); 637 for (vifi = 0; vifi < numvifs; vifi++) { 638 if (viftable[vifi].v_ifp != ifp) 639 continue; 640 for (i = 0; i < MFCTBLSIZ; i++) { 641 ppmfc = &mfctable[i]; 642 for (mfc = mfctable[i]; mfc != NULL; ) { 643 nmfc = mfc->mfc_next; 644 if (mfc->mfc_parent == vifi) { 645 for (pq = mfc->mfc_stall; pq != NULL; ) { 646 npq = pq->next; 647 m_freem(pq->m); 648 free(pq, M_MRTABLE); 649 pq = npq; 650 } 651 free_bw_list(mfc->mfc_bw_meter); 652 free(mfc, M_MRTABLE); 653 *ppmfc = nmfc; 654 } else { 655 ppmfc = &mfc->mfc_next; 656 } 657 mfc = nmfc; 658 } 659 } 660 del_vif_locked(vifi); 661 } 662 MFC_UNLOCK(); 663 VIF_UNLOCK(); 664 665 mtx_unlock(&mrouter_mtx); 666} 667 668/* 669 * Enable multicast routing 670 */ 671static int 672ip_mrouter_init(struct socket *so, int version) 673{ 674 if (mrtdebug) 675 log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 676 so->so_type, so->so_proto->pr_protocol); 677 678 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) 679 return EOPNOTSUPP; 680 681 if (version != 1) 682 return ENOPROTOOPT; 683 684 mtx_lock(&mrouter_mtx); 685 686 if (ip_mrouter != NULL) { 687 mtx_unlock(&mrouter_mtx); 688 return EADDRINUSE; 689 } 690 691 if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, 692 if_detached_event, NULL, EVENTHANDLER_PRI_ANY); 693 if (if_detach_event_tag == NULL) 694 return (ENOMEM); 695 696 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 697 698 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 699 expire_bw_upcalls_send, NULL); 700 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 701 702 ip_mrouter = so; 703 704 mtx_unlock(&mrouter_mtx); 705 706 if (mrtdebug) 707 log(LOG_DEBUG, "ip_mrouter_init\n"); 708 709 return 0; 710} 711 712/* 713 * Disable multicast routing 714 */ 715static int 716X_ip_mrouter_done(void) 717{ 718 vifi_t vifi; 719 int i; 720 struct ifnet *ifp; 721 struct ifreq ifr; 722 struct mfc 
*rt; 723 struct rtdetq *rte; 724 725 mtx_lock(&mrouter_mtx); 726 727 if (ip_mrouter == NULL) { 728 mtx_unlock(&mrouter_mtx); 729 return EINVAL; 730 } 731 732 /* 733 * Detach/disable hooks to the reset of the system. 734 */ 735 ip_mrouter = NULL; 736 mrt_api_config = 0; 737 738 VIF_LOCK(); 739 /* 740 * For each phyint in use, disable promiscuous reception of all IP 741 * multicasts. 742 */ 743 for (vifi = 0; vifi < numvifs; vifi++) { 744 if (viftable[vifi].v_lcl_addr.s_addr != 0 && 745 !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 746 struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); 747 748 so->sin_len = sizeof(struct sockaddr_in); 749 so->sin_family = AF_INET; 750 so->sin_addr.s_addr = INADDR_ANY; 751 ifp = viftable[vifi].v_ifp; 752 if_allmulti(ifp, 0); 753 } 754 } 755 bzero((caddr_t)viftable, sizeof(viftable)); 756 numvifs = 0; 757 pim_assert = 0; 758 VIF_UNLOCK(); 759 EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); 760 761 /* 762 * Free all multicast forwarding cache entries. 
763 */ 764 callout_stop(&expire_upcalls_ch); 765 callout_stop(&bw_upcalls_ch); 766 callout_stop(&bw_meter_ch); 767 768 MFC_LOCK(); 769 for (i = 0; i < MFCTBLSIZ; i++) { 770 for (rt = mfctable[i]; rt != NULL; ) { 771 struct mfc *nr = rt->mfc_next; 772 773 for (rte = rt->mfc_stall; rte != NULL; ) { 774 struct rtdetq *n = rte->next; 775 776 m_freem(rte->m); 777 free(rte, M_MRTABLE); 778 rte = n; 779 } 780 free_bw_list(rt->mfc_bw_meter); 781 free(rt, M_MRTABLE); 782 rt = nr; 783 } 784 } 785 bzero((caddr_t)mfctable, sizeof(mfctable)); 786 bzero((caddr_t)nexpire, sizeof(nexpire)); 787 bw_upcalls_n = 0; 788 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 789 MFC_UNLOCK(); 790 791#ifdef PIM 792 reg_vif_num = VIFI_INVALID; 793#endif 794 795 mtx_unlock(&mrouter_mtx); 796 797 if (mrtdebug) 798 log(LOG_DEBUG, "ip_mrouter_done\n"); 799 800 return 0; 801} 802 803/* 804 * Set PIM assert processing global 805 */ 806static int 807set_assert(int i) 808{ 809 if ((i != 1) && (i != 0)) 810 return EINVAL; 811 812 pim_assert = i; 813 814 return 0; 815} 816 817/* 818 * Configure API capabilities 819 */ 820int 821set_api_config(uint32_t *apival) 822{ 823 int i; 824 825 /* 826 * We can set the API capabilities only if it is the first operation 827 * after MRT_INIT. 
I.e.: 828 * - there are no vifs installed 829 * - pim_assert is not enabled 830 * - the MFC table is empty 831 */ 832 if (numvifs > 0) { 833 *apival = 0; 834 return EPERM; 835 } 836 if (pim_assert) { 837 *apival = 0; 838 return EPERM; 839 } 840 for (i = 0; i < MFCTBLSIZ; i++) { 841 if (mfctable[i] != NULL) { 842 *apival = 0; 843 return EPERM; 844 } 845 } 846 847 mrt_api_config = *apival & mrt_api_support; 848 *apival = mrt_api_config; 849 850 return 0; 851} 852 853/* 854 * Add a vif to the vif table 855 */ 856static int 857add_vif(struct vifctl *vifcp) 858{ 859 struct vif *vifp = viftable + vifcp->vifc_vifi; 860 struct sockaddr_in sin = {sizeof sin, AF_INET}; 861 struct ifaddr *ifa; 862 struct ifnet *ifp; 863 int error; 864 865 VIF_LOCK(); 866 if (vifcp->vifc_vifi >= MAXVIFS) { 867 VIF_UNLOCK(); 868 return EINVAL; 869 } 870 /* rate limiting is no longer supported by this code */ 871 if (vifcp->vifc_rate_limit != 0) { 872 log(LOG_ERR, "rate limiting is no longer supported\n"); 873 VIF_UNLOCK(); 874 return EINVAL; 875 } 876 if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { 877 VIF_UNLOCK(); 878 return EADDRINUSE; 879 } 880 if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { 881 VIF_UNLOCK(); 882 return EADDRNOTAVAIL; 883 } 884 885 /* Find the interface with an address in AF_INET family */ 886#ifdef PIM 887 if (vifcp->vifc_flags & VIFF_REGISTER) { 888 /* 889 * XXX: Because VIFF_REGISTER does not really need a valid 890 * local interface (e.g. it could be 127.0.0.2), we don't 891 * check its address. 
892 */ 893 ifp = NULL; 894 } else 895#endif 896 { 897 sin.sin_addr = vifcp->vifc_lcl_addr; 898 ifa = ifa_ifwithaddr((struct sockaddr *)&sin); 899 if (ifa == NULL) { 900 VIF_UNLOCK(); 901 return EADDRNOTAVAIL; 902 } 903 ifp = ifa->ifa_ifp; 904 } 905 906 if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { 907 log(LOG_ERR, "tunnels are no longer supported\n"); 908 VIF_UNLOCK(); 909 return EOPNOTSUPP; 910#ifdef PIM 911 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 912 ifp = &multicast_register_if; 913 if (mrtdebug) 914 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 915 (void *)&multicast_register_if); 916 if (reg_vif_num == VIFI_INVALID) { 917 if_initname(&multicast_register_if, "register_vif", 0); 918 multicast_register_if.if_flags = IFF_LOOPBACK; 919 reg_vif_num = vifcp->vifc_vifi; 920 } 921#endif 922 } else { /* Make sure the interface supports multicast */ 923 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 924 VIF_UNLOCK(); 925 return EOPNOTSUPP; 926 } 927 928 /* Enable promiscuous reception of all IP multicasts from the if */ 929 error = if_allmulti(ifp, 1); 930 if (error) { 931 VIF_UNLOCK(); 932 return error; 933 } 934 } 935 936 vifp->v_flags = vifcp->vifc_flags; 937 vifp->v_threshold = vifcp->vifc_threshold; 938 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 939 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 940 vifp->v_ifp = ifp; 941 vifp->v_rsvp_on = 0; 942 vifp->v_rsvpd = NULL; 943 /* initialize per vif pkt counters */ 944 vifp->v_pkt_in = 0; 945 vifp->v_pkt_out = 0; 946 vifp->v_bytes_in = 0; 947 vifp->v_bytes_out = 0; 948 bzero(&vifp->v_route, sizeof(vifp->v_route)); 949 950 /* Adjust numvifs up if the vifi is higher than numvifs */ 951 if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; 952 953 VIF_UNLOCK(); 954 955 if (mrtdebug) 956 log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x\n", 957 vifcp->vifc_vifi, 958 (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), 959 (vifcp->vifc_flags & VIFF_TUNNEL) ? 
"rmtaddr" : "mask", 960 (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), 961 vifcp->vifc_threshold); 962 963 return 0; 964} 965 966/* 967 * Delete a vif from the vif table 968 */ 969static int 970del_vif_locked(vifi_t vifi) 971{ 972 struct vif *vifp; 973 974 VIF_LOCK_ASSERT(); 975 976 if (vifi >= numvifs) { 977 return EINVAL; 978 } 979 vifp = &viftable[vifi]; 980 if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { 981 return EADDRNOTAVAIL; 982 } 983 984 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) 985 if_allmulti(vifp->v_ifp, 0); 986 987#ifdef PIM 988 if (vifp->v_flags & VIFF_REGISTER) 989 reg_vif_num = VIFI_INVALID; 990#endif 991 992 bzero((caddr_t)vifp, sizeof (*vifp)); 993 994 if (mrtdebug) 995 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); 996 997 /* Adjust numvifs down */ 998 for (vifi = numvifs; vifi > 0; vifi--) 999 if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) 1000 break; 1001 numvifs = vifi; 1002 1003 return 0; 1004} 1005 1006static int 1007del_vif(vifi_t vifi) 1008{ 1009 int cc; 1010 1011 VIF_LOCK(); 1012 cc = del_vif_locked(vifi); 1013 VIF_UNLOCK(); 1014 1015 return cc; 1016} 1017 1018/* 1019 * update an mfc entry without resetting counters and S,G addresses. 1020 */ 1021static void 1022update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1023{ 1024 int i; 1025 1026 rt->mfc_parent = mfccp->mfcc_parent; 1027 for (i = 0; i < numvifs; i++) { 1028 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1029 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 1030 MRT_MFC_FLAGS_ALL; 1031 } 1032 /* set the RP address */ 1033 if (mrt_api_config & MRT_MFC_RP) 1034 rt->mfc_rp = mfccp->mfcc_rp; 1035 else 1036 rt->mfc_rp.s_addr = INADDR_ANY; 1037} 1038 1039/* 1040 * fully initialize an mfc entry from the parameter. 
 */
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
    /* (origin, mcastgrp) is the lookup key for this MFC entry */
    rt->mfc_origin = mfccp->mfcc_origin;
    rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;

    /* copy over the user-settable parameters */
    update_mfc_params(rt, mfccp);

    /* initialize pkt counters per src-grp */
    rt->mfc_pkt_cnt = 0;
    rt->mfc_byte_cnt = 0;
    rt->mfc_wrong_if = 0;
    rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
}


/*
 * Add an mfc entry.
 *
 * Takes both the VIF and MFC locks for the duration.  Returns 0 on
 * success (including the already-present case) or ENOBUFS if a new
 * entry could not be allocated.
 */
static int
add_mfc(struct mfcctl2 *mfccp)
{
    struct mfc *rt;
    u_long hash;
    struct rtdetq *rte;
    u_short nstl;

    VIF_LOCK();
    MFC_LOCK();

    rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);

    /* If an entry already exists, just update the fields */
    if (rt) {
	if (mrtdebug & DEBUG_MFC)
	    log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
		(u_long)ntohl(mfccp->mfcc_origin.s_addr),
		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		mfccp->mfcc_parent);

	update_mfc_params(rt, mfccp);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return 0;
    }

    /*
     * Find the entry for which the upcall was made and update.
     * Only stalled entries (mfc_stall != NULL) are candidates here;
     * mfc_find() above only returns fully-resolved entries.
     */
    hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
    for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {

	if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
	    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
	    (rt->mfc_stall != NULL)) {

	    /* more than one matching stalled entry shouldn't happen */
	    if (nstl++)
		log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
		    "multiple kernel entries",
		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent, (void *)rt->mfc_stall);

	    if (mrtdebug & DEBUG_MFC)
		log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent, (void *)rt->mfc_stall);

	    init_mfc_params(rt, mfccp);

	    rt->mfc_expire = 0;	/* Don't clean this guy up */
	    nexpire[hash]--;

	    /* free packets Qed at the end of this entry */
	    for (rte = rt->mfc_stall; rte != NULL; ) {
		struct rtdetq *n = rte->next;

		/* forward each queued packet now that we have a route */
		ip_mdq(rte->m, rte->ifp, rt, -1);
		m_freem(rte->m);
		free(rte, M_MRTABLE);
		rte = n;
	    }
	    rt->mfc_stall = NULL;
	}
    }

    /*
     * It is possible that an entry is being inserted without an upcall
     */
    if (nstl == 0) {
	if (mrtdebug & DEBUG_MFC)
	    log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
		hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		mfccp->mfcc_parent);

	for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
	    if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
		init_mfc_params(rt, mfccp);
		if (rt->mfc_expire)
		    nexpire[hash]--;
		rt->mfc_expire = 0;
		break; /* XXX */
	    }
	}
	if (rt == NULL) {	/* no upcall, so make a new entry */
	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
	    if (rt == NULL) {
		MFC_UNLOCK();
		VIF_UNLOCK();
		return ENOBUFS;
	    }

	    init_mfc_params(rt, mfccp);
	    rt->mfc_expire = 0;
	    rt->mfc_stall = NULL;

	    rt->mfc_bw_meter = NULL;
	    /* insert new entry at head of hash chain */
	    rt->mfc_next = mfctable[hash];
	    mfctable[hash] = rt;
	}
    }
    MFC_UNLOCK();
    VIF_UNLOCK();
    return 0;
}

/*
 * Delete an mfc entry.
 *
 * Only fully-resolved entries (mfc_stall == NULL) are eligible;
 * returns EADDRNOTAVAIL if no such entry matches (origin, mcastgrp).
 */
static int
del_mfc(struct mfcctl2 *mfccp)
{
    struct in_addr origin;
    struct in_addr mcastgrp;
    struct mfc *rt;
    struct mfc **nptr;
    u_long hash;
    struct bw_meter *list;

    origin = mfccp->mfcc_origin;
    mcastgrp = mfccp->mfcc_mcastgrp;

    if (mrtdebug & DEBUG_MFC)
	log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
	    (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));

    MFC_LOCK();

    hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
    for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
	if (origin.s_addr == rt->mfc_origin.s_addr &&
	    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
	    rt->mfc_stall == NULL)
	    break;
    if (rt == NULL) {
	MFC_UNLOCK();
	return EADDRNOTAVAIL;
    }

    /* unlink from the hash chain before freeing */
    *nptr = rt->mfc_next;

    /*
     * free the bw_meter entries
     */
    list = rt->mfc_bw_meter;
    rt->mfc_bw_meter = NULL;

    free(rt, M_MRTABLE);

    free_bw_list(list);

    MFC_UNLOCK();

    return 0;
}

/*
 * Send a message to the routing daemon on the multicast routing socket.
 *
 * Returns 0 if the mbuf was queued and the reader woken; otherwise
 * frees "mm" and returns -1 (socket missing or receive buffer full).
 */
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
{
    if (s) {
	SOCKBUF_LOCK(&s->so_rcv);
	if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
	    NULL) != 0) {
	    sorwakeup_locked(s);
	    return 0;
	}
	SOCKBUF_UNLOCK(&s->so_rcv);
    }
    m_freem(mm);
    return -1;
}

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 */

#define TUNNEL_LEN	12	/* # bytes of IP option for tunnel encapsulation */

static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
    struct mfc *rt;
    int error;
    vifi_t vifi;

    if (mrtdebug & DEBUG_FORWARD)
	log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
	    (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
	    (void *)ifp);

    if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
	/*
	 * Packet arrived via a physical interface or
	 * an encapsulated tunnel or a register_vif.
	 */
    } else {
	/*
	 * Packet arrived through a source-route tunnel.
	 * Source-route tunnels are no longer supported.
	 */
	static int last_log;	/* rate-limit: log at most once per second */
	if (last_log != time_uptime) {
	    last_log = time_uptime;
	    log(LOG_ERR,
		"ip_mforward: received source-routed packet from %lx\n",
		(u_long)ntohl(ip->ip_src.s_addr));
	}
	return 1;
    }

    VIF_LOCK();
    MFC_LOCK();
    if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
	if (ip->ip_ttl < 255)
	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
	    struct vif *vifp = viftable + vifi;

	    printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
		(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
		vifi,
		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
		vifp->v_ifp->if_xname);
	}
	error = ip_mdq(m, ifp, NULL, vifi);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return error;
    }
    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
	printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
	    (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
	if (!imo)
	    printf("In fact, no options were specified at all\n");
    }

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) {
	MFC_UNLOCK();
	VIF_UNLOCK();
	return 0;
    }

    /*
     * Determine forwarding vifs from the forwarding cache table
     */
    ++mrtstat.mrts_mfc_lookups;
    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
	error = ip_mdq(m, ifp, rt, -1);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return error;
    } else {
	/*
	 * If we don't have a route for packet's origin,
	 * Make a copy of the packet & send message to routing daemon
	 */

	struct mbuf *mb0;
	struct rtdetq *rte;
	u_long hash;
	int hlen = ip->ip_hl << 2;

	++mrtstat.mrts_mfc_misses;

	mrtstat.mrts_no_route++;
	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
	    log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
		(u_long)ntohl(ip->ip_src.s_addr),
		(u_long)ntohl(ip->ip_dst.s_addr));

	/*
	 * Allocate mbufs early so that we don't do extra work if we are
	 * just going to fail anyway.  Make sure to pullup the header so
	 * that other people can't step on it.
	 */
	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
	if (rte == NULL) {
	    MFC_UNLOCK();
	    VIF_UNLOCK();
	    return ENOBUFS;
	}
	mb0 = m_copypacket(m, M_DONTWAIT);
	if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
	    mb0 = m_pullup(mb0, hlen);
	if (mb0 == NULL) {
	    free(rte, M_MRTABLE);
	    MFC_UNLOCK();
	    VIF_UNLOCK();
	    return ENOBUFS;
	}

	/* is there an upcall waiting for this flow ? */
	hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
	for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
	    if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
		(ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
		(rt->mfc_stall != NULL))
		break;
	}

	if (rt == NULL) {
	    int i;
	    struct igmpmsg *im;
	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
	    struct mbuf *mm;

	    /*
	     * Locate the vifi for the incoming interface for this packet.
	     * If none found, drop packet.
	     */
	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
		;
	    if (vifi >= numvifs)	/* vif not found, drop packet */
		goto non_fatal;

	    /* no upcall, so make a new entry */
	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
	    if (rt == NULL)
		goto fail;
	    /* Make a copy of the header to send to the user level process */
	    mm = m_copy(mb0, 0, hlen);
	    if (mm == NULL)
		goto fail1;

	    /*
	     * Send message to routing daemon to install
	     * a route into the kernel table
	     */

	    im = mtod(mm, struct igmpmsg *);
	    im->im_msgtype = IGMPMSG_NOCACHE;
	    im->im_mbz = 0;
	    im->im_vif = vifi;

	    mrtstat.mrts_upcalls++;

	    k_igmpsrc.sin_addr = ip->ip_src;
	    if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
		++mrtstat.mrts_upq_sockfull;
		/*
		 * NOTE: the fail1/fail labels below are also targets of
		 * gotos from the allocation failures above; each error
		 * path frees rt, rte and mb0 exactly once.
		 */
fail1:
		free(rt, M_MRTABLE);
fail:
		free(rte, M_MRTABLE);
		m_freem(mb0);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return ENOBUFS;
	    }

	    /* insert new entry at head of hash chain */
	    rt->mfc_origin.s_addr = ip->ip_src.s_addr;
	    rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
	    rt->mfc_expire = UPCALL_EXPIRE;
	    nexpire[hash]++;
	    for (i = 0; i < numvifs; i++) {
		rt->mfc_ttls[i] = 0;
		rt->mfc_flags[i] = 0;
	    }
	    rt->mfc_parent = -1;

	    rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */

	    rt->mfc_bw_meter = NULL;

	    /* link into table */
	    rt->mfc_next = mfctable[hash];
	    mfctable[hash] = rt;
	    rt->mfc_stall = rte;

	} else {
	    /* determine if q has overflowed */
	    int npkts = 0;
	    struct rtdetq **p;

	    /*
	     * XXX ouch! we need to append to the list, but we
	     * only have a pointer to the front, so we have to
	     * scan the entire list every time.
	     */
	    for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
		npkts++;

	    if (npkts > MAX_UPQ) {
		mrtstat.mrts_upq_ovflw++;
		/* non_fatal is also reached from the vif-not-found case */
non_fatal:
		free(rte, M_MRTABLE);
		m_freem(mb0);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return 0;
	    }

	    /* Add this entry to the end of the queue */
	    *p = rte;
	}

	rte->m = mb0;
	rte->ifp = ifp;
	rte->next = NULL;

	MFC_UNLOCK();
	VIF_UNLOCK();

	return 0;
    }
}

/*
 * Clean up the cache entry if upcall is not serviced.
 * Runs from a callout and re-arms itself every EXPIRE_TIMEOUT ticks.
 */
static void
expire_upcalls(void *unused)
{
    struct rtdetq *rte;
    struct mfc *mfc, **nptr;
    int i;

    MFC_LOCK();
    for (i = 0; i < MFCTBLSIZ; i++) {
	/* nexpire[i] counts pending (stalled) entries in this bucket */
	if (nexpire[i] == 0)
	    continue;
	nptr = &mfctable[i];
	for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
	    /*
	     * Skip real cache entries
	     * Make sure it wasn't marked to not expire (shouldn't happen)
	     * If it expires now
	     */
	    if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
		--mfc->mfc_expire == 0) {
		if (mrtdebug & DEBUG_EXPIRE)
		    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
			(u_long)ntohl(mfc->mfc_origin.s_addr),
			(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
		/*
		 * drop all the packets
		 * free the mbuf with the pkt, if, timing info
		 */
		for (rte = mfc->mfc_stall; rte; ) {
		    struct rtdetq *n = rte->next;

		    m_freem(rte->m);
		    free(rte, M_MRTABLE);
		    rte = n;
		}
		++mrtstat.mrts_cache_cleanups;
		nexpire[i]--;

		/*
		 * free the bw_meter entries
		 */
		while (mfc->mfc_bw_meter != NULL) {
		    struct bw_meter *x = mfc->mfc_bw_meter;

		    mfc->mfc_bw_meter = x->bm_mfc_next;
		    free(x, M_BWMETER);
		}

		/* unlink and free; nptr stays pointing at the successor */
		*nptr = mfc->mfc_next;
		free(mfc, M_MRTABLE);
	    } else {
		nptr = &mfc->mfc_next;
	    }
	}
    }
    MFC_UNLOCK();

    callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
}

/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * Returns 1 when the packet was sent on an explicitly requested vif,
 * 0 on normal processing (including silent drops), or ENOBUFS when an
 * upcall could not be generated.
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
    struct ip *ip = mtod(m, struct ip *);
    vifi_t vifi;
    int plen = ip->ip_len;

    VIF_LOCK_ASSERT();

    /*
     * If xmt_vif is not -1, send on only the requested vif.
     *
     * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
     */
    if (xmt_vif < numvifs) {
#ifdef PIM
	if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
	    pim_register_send(ip, viftable + xmt_vif, m, rt);
	else
#endif
	phyint_send(ip, viftable + xmt_vif, m);
	return 1;
    }

    /*
     * Don't forward if it didn't arrive from the parent vif for its origin.
     */
    vifi = rt->mfc_parent;
    if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
	/* came in the wrong interface */
	if (mrtdebug & DEBUG_FORWARD)
	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
		(void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
	++mrtstat.mrts_wrong_if;
	++rt->mfc_wrong_if;
	/*
	 * If we are doing PIM assert processing, send a message
	 * to the routing daemon.
	 *
	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
	 * can complete the SPT switch, regardless of the type
	 * of the iif (broadcast media, GRE tunnel, etc).
	 */
	if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
	    struct timeval now;
	    u_long delta;

#ifdef PIM
	    if (ifp == &multicast_register_if)
		pimstat.pims_rcv_registers_wrongiif++;
#endif

	    /* Get vifi for the incoming packet */
	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
		;
	    if (vifi >= numvifs)
		return 0;	/* The iif is not found: ignore the packet. */

	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
		return 0;	/* WRONGVIF disabled: ignore the packet */

	    GET_TIME(now);

	    TV_DELTA(now, rt->mfc_last_assert, delta);

	    /* rate-limit WRONGVIF upcalls to one per ASSERT_MSG_TIME */
	    if (delta > ASSERT_MSG_TIME) {
		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
		struct igmpmsg *im;
		int hlen = ip->ip_hl << 2;
		struct mbuf *mm = m_copy(m, 0, hlen);

		if (mm && (M_HASCL(mm) || mm->m_len < hlen))
		    mm = m_pullup(mm, hlen);
		if (mm == NULL)
		    return ENOBUFS;

		rt->mfc_last_assert = now;

		im = mtod(mm, struct igmpmsg *);
		im->im_msgtype = IGMPMSG_WRONGVIF;
		im->im_mbz = 0;
		im->im_vif = vifi;

		mrtstat.mrts_upcalls++;

		k_igmpsrc.sin_addr = im->im_src;
		if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
		    log(LOG_WARNING,
			"ip_mforward: ip_mrouter socket queue full\n");
		    ++mrtstat.mrts_upq_sockfull;
		    return ENOBUFS;
		}
	    }
	}
	return 0;
    }

    /* If I sourced this packet, it counts as output, else it was input. */
    if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
	viftable[vifi].v_pkt_out++;
	viftable[vifi].v_bytes_out += plen;
    } else {
	viftable[vifi].v_pkt_in++;
	viftable[vifi].v_bytes_in += plen;
    }
    rt->mfc_pkt_cnt++;
    rt->mfc_byte_cnt += plen;

    /*
     * For each vif, decide if a copy of the packet should be forwarded.
     * Forward if:
     *		- the ttl exceeds the vif's threshold
     *		- there are group members downstream on interface
     */
    for (vifi = 0; vifi < numvifs; vifi++)
	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
	    viftable[vifi].v_pkt_out++;
	    viftable[vifi].v_bytes_out += plen;
#ifdef PIM
	    if (viftable[vifi].v_flags & VIFF_REGISTER)
		pim_register_send(ip, viftable + vifi, m, rt);
	    else
#endif
	    phyint_send(ip, viftable + vifi, m);
	}

    /*
     * Perform upcall-related bw measuring.
     */
    if (rt->mfc_bw_meter != NULL) {
	struct bw_meter *x;
	struct timeval now;

	GET_TIME(now);
	MFC_LOCK_ASSERT();
	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
	    bw_meter_receive_packet(x, plen, &now);
    }

    return 0;
}

/*
 * check if a vif number is legal/ok. This is used by ip_output.
 */
static int
X_legal_vif_num(int vif)
{
    /* XXX unlocked, matter? */
    return (vif >= 0 && vif < numvifs);
}

/*
 * Return the local address used by this vif
 */
static u_long
X_ip_mcast_src(int vifi)
{
    /* XXX unlocked, matter? */
    if (vifi >= 0 && vifi < numvifs)
	return viftable[vifi].v_lcl_addr.s_addr;
    else
	return INADDR_ANY;
}

/*
 * Send a private copy of the packet out the given physical vif.
 * The original mbuf "m" is left untouched for the caller.
 */
static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
    struct mbuf *mb_copy;
    int hlen = ip->ip_hl << 2;

    VIF_LOCK_ASSERT();

    /*
     * Make a new reference to the packet; make sure that
     * the IP header is actually copied, not just referenced,
     * so that ip_output() only scribbles on the copy.
     */
    mb_copy = m_copypacket(m, M_DONTWAIT);
    if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
	mb_copy = m_pullup(mb_copy, hlen);
    if (mb_copy == NULL)
	return;	/* out of mbufs: silently drop this copy */

    send_packet(vifp, mb_copy);
}

/*
 * Hand a multicast packet to ip_output() on the vif's interface,
 * with the TTL already accounted for; "m" is consumed.
 */
static void
send_packet(struct vif *vifp, struct mbuf *m)
{
    struct ip_moptions imo;
    struct in_multi *imm[2];
    int error;

    VIF_LOCK_ASSERT();

    imo.imo_multicast_ifp  = vifp->v_ifp;
    imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
    imo.imo_multicast_loop = 1;
    imo.imo_multicast_vif  = -1;
    imo.imo_num_memberships = 0;
    imo.imo_max_memberships = 2;
    imo.imo_membership  = &imm[0];

    /*
     * Re-entrancy should not be a problem here, because
     * the packets that we send out and are looped back at us
     * should get rejected because they appear to come from
     * the loopback interface, thus preventing looping.
     */
    error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL);
    if (mrtdebug & DEBUG_XMIT) {
	log(LOG_DEBUG, "phyint_send on vif %td err %d\n",
	    vifp - viftable, error);
    }
}

/*
 * Enable/disable RSVP processing on a vif (IP_RSVP_VIF_ON/OFF sockopt).
 */
static int
X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
{
    int error, vifi;

    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
	return EOPNOTSUPP;

    error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
    if (error)
	return error;

    VIF_LOCK();

    if (vifi < 0 || vifi >= numvifs) {	/* Error if vif is invalid */
	VIF_UNLOCK();
	return EADDRNOTAVAIL;
    }

    if (sopt->sopt_name == IP_RSVP_VIF_ON) {
	/* Check if socket is available. */
	if (viftable[vifi].v_rsvpd != NULL) {
	    VIF_UNLOCK();
	    return EADDRINUSE;
	}

	viftable[vifi].v_rsvpd = so;
	/* This may seem silly, but we need to be sure we don't over-increment
	 * the RSVP counter, in case something slips up.
	 */
	if (!viftable[vifi].v_rsvp_on) {
	    viftable[vifi].v_rsvp_on = 1;
	    rsvp_on++;
	}
    } else { /* must be VIF_OFF */
	/*
	 * XXX as an additional consistency check, one could make sure
	 * that viftable[vifi].v_rsvpd == so, otherwise passing so as
	 * first parameter is pretty useless.
	 */
	viftable[vifi].v_rsvpd = NULL;
	/*
	 * This may seem silly, but we need to be sure we don't over-decrement
	 * the RSVP counter, in case something slips up.
	 */
	if (viftable[vifi].v_rsvp_on) {
	    viftable[vifi].v_rsvp_on = 0;
	    rsvp_on--;
	}
    }
    VIF_UNLOCK();
    return 0;
}

/*
 * Detach an RSVP socket from every vif that references it.
 */
static void
X_ip_rsvp_force_done(struct socket *so)
{
    int vifi;

    /* Don't bother if it is not the right type of socket. */
    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
	return;

    VIF_LOCK();

    /* The socket may be attached to more than one vif...this
     * is perfectly legal.
     */
    for (vifi = 0; vifi < numvifs; vifi++) {
	if (viftable[vifi].v_rsvpd == so) {
	    viftable[vifi].v_rsvpd = NULL;
	    /* This may seem silly, but we need to be sure we don't
	     * over-decrement the RSVP counter, in case something slips up.
	     */
	    if (viftable[vifi].v_rsvp_on) {
		viftable[vifi].v_rsvp_on = 0;
		rsvp_on--;
	    }
	}
    }

    VIF_UNLOCK();
}

/*
 * Deliver an incoming RSVP packet to the socket registered for the
 * receiving vif, falling back to the old-style global RSVP socket.
 * Consumes "m" one way or another.
 */
static void
X_rsvp_input(struct mbuf *m, int off)
{
    int vifi;
    struct ip *ip = mtod(m, struct ip *);
    struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
    struct ifnet *ifp;

    if (rsvpdebug)
	printf("rsvp_input: rsvp_on %d\n",rsvp_on);

    /* Can still get packets with rsvp_on = 0 if there is a local member
     * of the group to which the RSVP packet is addressed.  But in this
     * case we want to throw the packet away.
     */
    if (!rsvp_on) {
	m_freem(m);
	return;
    }

    if (rsvpdebug)
	printf("rsvp_input: check vifs\n");

#ifdef DIAGNOSTIC
    M_ASSERTPKTHDR(m);
#endif

    ifp = m->m_pkthdr.rcvif;

    VIF_LOCK();
    /* Find which vif the packet arrived on. */
    for (vifi = 0; vifi < numvifs; vifi++)
	if (viftable[vifi].v_ifp == ifp)
	    break;

    if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
	/*
	 * Drop the lock here to avoid holding it across rip_input.
	 * This could make rsvpdebug printfs wrong.  If you care,
	 * record the state of stuff before dropping the lock.
	 */
	VIF_UNLOCK();
	/*
	 * If the old-style non-vif-associated socket is set,
	 * then use it.  Otherwise, drop packet since there
	 * is no specific socket for this vif.
	 */
	if (ip_rsvpd != NULL) {
	    if (rsvpdebug)
		printf("rsvp_input: Sending packet up old-style socket\n");
	    rip_input(m, off);  /* xxx */
	} else {
	    if (rsvpdebug && vifi == numvifs)
		printf("rsvp_input: Can't find vif for packet.\n");
	    else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
		printf("rsvp_input: No socket defined for vif %d\n",vifi);
	    m_freem(m);
	}
	return;
    }
    rsvp_src.sin_addr = ip->ip_src;

    if (rsvpdebug && m)
	printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
	       m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));

    if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
	if (rsvpdebug)
	    printf("rsvp_input: Failed to append to socket\n");
    } else {
	if (rsvpdebug)
	    printf("rsvp_input: send packet up\n");
    }
    VIF_UNLOCK();
}

/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
#define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))

/*
 * Translate the user-visible BW_UPCALL_* request flags into the
 * kernel-internal BW_METER_* flags.
 */
static uint32_t
compute_bw_meter_flags(struct bw_upcall *req)
{
    uint32_t flags = 0;

    if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
	flags |= BW_METER_UNIT_PACKETS;
    if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
	flags |= BW_METER_UNIT_BYTES;
    if (req->bu_flags & BW_UPCALL_GEQ)
	flags |= BW_METER_GEQ;
    if (req->bu_flags & BW_UPCALL_LEQ)
	flags |= BW_METER_LEQ;

    return flags;
}

/*
 * Add a bw_meter entry.
 *
 * Validates the request (exactly one of GEQ/LEQ, at least one
 * measurement unit, threshold interval above the minimum), then links
 * a new meter onto the MFC entry for (bu_src, bu_dst) and schedules it.
 */
static int
add_bw_upcall(struct bw_upcall *req)
{
    struct mfc *mfc;
    struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
    struct timeval now;
    struct bw_meter *x;
    uint32_t flags;

    if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
	return EOPNOTSUPP;

    /* Test if the flags are valid */
    if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
	return EINVAL;
    if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
	return EINVAL;
    if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
	return EINVAL;

    /* Test if the threshold time interval is valid */
    if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
	return EINVAL;

    flags = compute_bw_meter_flags(req);

    /*
     * Find if we have already same bw_meter entry
     */
    MFC_LOCK();
    mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
    if (mfc == NULL) {
	MFC_UNLOCK();
	return EADDRNOTAVAIL;
    }
    for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			   &req->bu_threshold.b_time, ==)) &&
	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
	    (x->bm_flags & BW_METER_USER_FLAGS) == flags) {
	    MFC_UNLOCK();
	    return 0;		/* XXX Already installed */
	}
    }

    /* Allocate the new bw_meter entry */
    x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
    if (x == NULL) {
	MFC_UNLOCK();
	return ENOBUFS;
    }

    /* Set the new bw_meter entry */
    x->bm_threshold.b_time = req->bu_threshold.b_time;
    GET_TIME(now);
    x->bm_start_time = now;
    x->bm_threshold.b_packets = req->bu_threshold.b_packets;
    x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
    x->bm_measured.b_packets = 0;
    x->bm_measured.b_bytes = 0;
    x->bm_flags = flags;
    x->bm_time_next = NULL;
    x->bm_time_hash = BW_METER_BUCKETS;	/* i.e. not on any timer list yet */

    /* Add the new bw_meter entry to the front of entries for this MFC */
    x->bm_mfc = mfc;
    x->bm_mfc_next = mfc->mfc_bw_meter;
    mfc->mfc_bw_meter = x;
    schedule_bw_meter(x, &now);
    MFC_UNLOCK();

    return 0;
}

/*
 * Free a list of bw_meter entries linked via bm_mfc_next,
 * unscheduling each from its timer bucket first.
 */
static void
free_bw_list(struct bw_meter *list)
{
    while (list != NULL) {
	struct bw_meter *x = list;

	list = list->bm_mfc_next;
	unschedule_bw_meter(x);
	free(x, M_BWMETER);
    }
}

/*
 * Delete one or multiple bw_meter entries
 */
static int
del_bw_upcall(struct bw_upcall *req)
{
    struct mfc *mfc;
    struct bw_meter *x;

    if (!(mrt_api_config & MRT_MFC_BW_UPCALL))
	return EOPNOTSUPP;

    MFC_LOCK();
    /* Find the corresponding MFC entry */
    mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr);
    if (mfc == NULL) {
	MFC_UNLOCK();
	return EADDRNOTAVAIL;
    } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
	/*
	 * Delete all bw_meter entries for this mfc
	 */
	struct bw_meter *list;

	list = mfc->mfc_bw_meter;
	mfc->mfc_bw_meter = NULL;
	free_bw_list(list);
	MFC_UNLOCK();
	return 0;
    } else {			/* Delete a single bw_meter entry */
	struct bw_meter *prev;
	uint32_t flags = 0;

	flags = compute_bw_meter_flags(req);

	/* Find the bw_meter entry to delete */
	for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
	     prev = x, x = x->bm_mfc_next) {
	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			       &req->bu_threshold.b_time, ==)) &&
		(x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
		(x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
		(x->bm_flags & BW_METER_USER_FLAGS) == flags)
		break;
	}
	if (x != NULL) { /* Delete entry from the list for this MFC */
	    if (prev != NULL)
		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
	    else
		x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */

	    unschedule_bw_meter(x);
	    MFC_UNLOCK();
	    /* Free the bw_meter entry */
	    free(x, M_BWMETER);
	    return 0;
	} else {
	    MFC_UNLOCK();
	    return EINVAL;
	}
    }
    /* NOTREACHED */
}

/*
 * Perform bandwidth measurement processing that may result in an upcall
 */
static void
bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
    struct timeval delta;

    MFC_LOCK_ASSERT();

    /* delta = time elapsed since the start of the measuring interval */
    delta = *nowp;
    BW_TIMEVALDECR(&delta, &x->bm_start_time);

    if (x->bm_flags & BW_METER_GEQ) {
	/*
	 * Processing for ">=" type of bw_meter entry
	 */
	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
	    /* Reset the bw_meter entry */
	    x->bm_start_time = *nowp;
	    x->bm_measured.b_packets = 0;
	    x->bm_measured.b_bytes = 0;
	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
	}

	/* Record that a packet is received */
	x->bm_measured.b_packets++;
	x->bm_measured.b_bytes += plen;

	/*
	 * Test if we should deliver an upcall
	 */
	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
		((x->bm_flags & BW_METER_UNIT_BYTES) &&
		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
		/* Prepare an upcall for delivery */
		bw_meter_prepare_upcall(x, nowp);
		/* at most one upcall per measuring interval */
		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
	    }
	}
    } else if (x->bm_flags & BW_METER_LEQ) {
	/*
	 * Processing for "<=" type of bw_meter entry
	 */
	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
	    /*
	     * We are behind time with the multicast forwarding table
	     * scanning for "<=" type of bw_meter entries, so test now
	     * if we should deliver an upcall.
	     */
	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
		((x->bm_flags & BW_METER_UNIT_BYTES) &&
		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
		/* Prepare an upcall for delivery */
		bw_meter_prepare_upcall(x, nowp);
	    }
	    /* Reschedule the bw_meter entry */
	    unschedule_bw_meter(x);
	    schedule_bw_meter(x, nowp);
	}

	/* Record that a packet is received */
	x->bm_measured.b_packets++;
	x->bm_measured.b_bytes += plen;

	/*
	 * Test if we should restart the measuring interval
	 */
	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
	    (x->bm_flags & BW_METER_UNIT_BYTES &&
	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
	    /* Don't restart the measuring interval */
	} else {
	    /* Do restart the measuring interval */
	    /*
	     * XXX: note that we don't unschedule and schedule, because this
	     * might be too much overhead per packet. Instead, when we process
	     * all entries for a given timer hash bin, we check whether it is
	     * really a timeout. If not, we reschedule at that time.
	     */
	    x->bm_start_time = *nowp;
	    x->bm_measured.b_packets = 0;
	    x->bm_measured.b_bytes = 0;
	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
	}
    }
}

/*
 * Prepare a bandwidth-related upcall
 */
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
    struct timeval delta;
    struct bw_upcall *u;

    MFC_LOCK_ASSERT();

    /*
     * Compute the measured time interval
     */
    delta = *nowp;
    BW_TIMEVALDECR(&delta, &x->bm_start_time);

    /*
     * If there are too many pending upcalls, deliver them now
     */
    if (bw_upcalls_n >= BW_UPCALLS_MAX)
	bw_upcalls_send();

    /*
     * Set the bw_upcall entry
     */
    u = &bw_upcalls[bw_upcalls_n++];
    u->bu_src = x->bm_mfc->mfc_origin;
    u->bu_dst = x->bm_mfc->mfc_mcastgrp;
    u->bu_threshold.b_time = x->bm_threshold.b_time;
    u->bu_threshold.b_packets = x->bm_threshold.b_packets;
    u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
    u->bu_measured.b_time = delta;
    u->bu_measured.b_packets = x->bm_measured.b_packets;
    u->bu_measured.b_bytes = x->bm_measured.b_bytes;
    u->bu_flags = 0;
    /* map internal BW_METER_* flags back to user-visible BW_UPCALL_* */
    if (x->bm_flags & BW_METER_UNIT_PACKETS)
	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
    if (x->bm_flags & BW_METER_UNIT_BYTES)
	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
    if (x->bm_flags & BW_METER_GEQ)
	u->bu_flags |= BW_UPCALL_GEQ;
    if (x->bm_flags & BW_METER_LEQ)
	u->bu_flags |= BW_UPCALL_LEQ;
}

/*
 * Send the pending bandwidth-related upcalls
 */
static void
bw_upcalls_send(void)
{
    struct mbuf *m;
    int len = bw_upcalls_n * sizeof(bw_upcalls[0]);
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
    static struct igmpmsg igmpmsg = { 0,		/* unused1 */
				      0,		/* unused2 */
				      IGMPMSG_BW_UPCALL,/* im_msgtype */
				      0,		/* im_mbz */
				      0,		/* im_vif */
				      0,		/* unused3 */
				      { 0 },		/* im_src */
				      { 0 } };		/* im_dst */

    MFC_LOCK_ASSERT();

    if (bw_upcalls_n == 0)
	return;			/* No pending upcalls */

    bw_upcalls_n = 0;

    /*
     * Allocate a new mbuf, initialize it with the header and
     * the payload for the pending calls.
     */
    MGETHDR(m, M_DONTWAIT, MT_DATA);
    if (m == NULL) {
	log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
	return;
    }

    m->m_len = m->m_pkthdr.len = 0;
    m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
    m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]);

    /*
     * Send the upcalls
     * XXX do we need to set the address in k_igmpsrc ?
     */
    mrtstat.mrts_upcalls++;
    if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) {
	log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n");
	++mrtstat.mrts_upq_sockfull;
    }
}

/*
 * Compute the timeout hash value for the bw_meter entries
 */
#define	BW_METER_TIMEHASH(bw_meter, hash)				\
    do {								\
	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
									\
	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
	(hash) = next_timeval.tv_sec;					\
	if (next_timeval.tv_usec)					\
	    (hash)++; /* XXX: make sure we don't timeout early */	\
	(hash) %= BW_METER_BUCKETS;					\
    } while (0)

/*
 * Schedule a timer to process periodically bw_meter entry of type "<="
 * by linking the entry in the proper hash bucket.
2325 */ 2326static void 2327schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2328{ 2329 int time_hash; 2330 2331 MFC_LOCK_ASSERT(); 2332 2333 if (!(x->bm_flags & BW_METER_LEQ)) 2334 return; /* XXX: we schedule timers only for "<=" entries */ 2335 2336 /* 2337 * Reset the bw_meter entry 2338 */ 2339 x->bm_start_time = *nowp; 2340 x->bm_measured.b_packets = 0; 2341 x->bm_measured.b_bytes = 0; 2342 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2343 2344 /* 2345 * Compute the timeout hash value and insert the entry 2346 */ 2347 BW_METER_TIMEHASH(x, time_hash); 2348 x->bm_time_next = bw_meter_timers[time_hash]; 2349 bw_meter_timers[time_hash] = x; 2350 x->bm_time_hash = time_hash; 2351} 2352 2353/* 2354 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2355 * by removing the entry from the proper hash bucket. 2356 */ 2357static void 2358unschedule_bw_meter(struct bw_meter *x) 2359{ 2360 int time_hash; 2361 struct bw_meter *prev, *tmp; 2362 2363 MFC_LOCK_ASSERT(); 2364 2365 if (!(x->bm_flags & BW_METER_LEQ)) 2366 return; /* XXX: we schedule timers only for "<=" entries */ 2367 2368 /* 2369 * Compute the timeout hash value and delete the entry 2370 */ 2371 time_hash = x->bm_time_hash; 2372 if (time_hash >= BW_METER_BUCKETS) 2373 return; /* Entry was not scheduled */ 2374 2375 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2376 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2377 if (tmp == x) 2378 break; 2379 2380 if (tmp == NULL) 2381 panic("unschedule_bw_meter: bw_meter entry not found"); 2382 2383 if (prev != NULL) 2384 prev->bm_time_next = x->bm_time_next; 2385 else 2386 bw_meter_timers[time_hash] = x->bm_time_next; 2387 2388 x->bm_time_next = NULL; 2389 x->bm_time_hash = BW_METER_BUCKETS; 2390} 2391 2392 2393/* 2394 * Process all "<=" type of bw_meter that should be processed now, 2395 * and for each entry prepare an upcall if necessary. Each processed 2396 * entry is rescheduled again for the (periodic) processing. 
2397 * 2398 * This is run periodically (once per second normally). On each round, 2399 * all the potentially matching entries are in the hash slot that we are 2400 * looking at. 2401 */ 2402static void 2403bw_meter_process() 2404{ 2405 static uint32_t last_tv_sec; /* last time we processed this */ 2406 2407 uint32_t loops; 2408 int i; 2409 struct timeval now, process_endtime; 2410 2411 GET_TIME(now); 2412 if (last_tv_sec == now.tv_sec) 2413 return; /* nothing to do */ 2414 2415 loops = now.tv_sec - last_tv_sec; 2416 last_tv_sec = now.tv_sec; 2417 if (loops > BW_METER_BUCKETS) 2418 loops = BW_METER_BUCKETS; 2419 2420 MFC_LOCK(); 2421 /* 2422 * Process all bins of bw_meter entries from the one after the last 2423 * processed to the current one. On entry, i points to the last bucket 2424 * visited, so we need to increment i at the beginning of the loop. 2425 */ 2426 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2427 struct bw_meter *x, *tmp_list; 2428 2429 if (++i >= BW_METER_BUCKETS) 2430 i = 0; 2431 2432 /* Disconnect the list of bw_meter entries from the bin */ 2433 tmp_list = bw_meter_timers[i]; 2434 bw_meter_timers[i] = NULL; 2435 2436 /* Process the list of bw_meter entries */ 2437 while (tmp_list != NULL) { 2438 x = tmp_list; 2439 tmp_list = tmp_list->bm_time_next; 2440 2441 /* Test if the time interval is over */ 2442 process_endtime = x->bm_start_time; 2443 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2444 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2445 /* Not yet: reschedule, but don't reset */ 2446 int time_hash; 2447 2448 BW_METER_TIMEHASH(x, time_hash); 2449 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2450 /* 2451 * XXX: somehow the bin processing is a bit ahead of time. 2452 * Put the entry in the next bin. 
2453 */ 2454 if (++time_hash >= BW_METER_BUCKETS) 2455 time_hash = 0; 2456 } 2457 x->bm_time_next = bw_meter_timers[time_hash]; 2458 bw_meter_timers[time_hash] = x; 2459 x->bm_time_hash = time_hash; 2460 2461 continue; 2462 } 2463 2464 /* 2465 * Test if we should deliver an upcall 2466 */ 2467 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2468 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2469 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2470 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2471 /* Prepare an upcall for delivery */ 2472 bw_meter_prepare_upcall(x, &now); 2473 } 2474 2475 /* 2476 * Reschedule for next processing 2477 */ 2478 schedule_bw_meter(x, &now); 2479 } 2480 } 2481 2482 /* Send all upcalls that are pending delivery */ 2483 bw_upcalls_send(); 2484 2485 MFC_UNLOCK(); 2486} 2487 2488/* 2489 * A periodic function for sending all upcalls that are pending delivery 2490 */ 2491static void 2492expire_bw_upcalls_send(void *unused) 2493{ 2494 MFC_LOCK(); 2495 bw_upcalls_send(); 2496 MFC_UNLOCK(); 2497 2498 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2499 expire_bw_upcalls_send, NULL); 2500} 2501 2502/* 2503 * A periodic function for periodic scanning of the multicast forwarding 2504 * table for processing all "<=" bw_meter entries. 
 */
static void
expire_bw_meter_process(void *unused)
{
    /* Only scan when the advanced-API bandwidth upcalls are enabled */
    if (mrt_api_config & MRT_MFC_BW_UPCALL)
	bw_meter_process();

    callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);
}

/*
 * End of bandwidth monitoring code
 */

#ifdef PIM
/*
 * Send the packet up to the user daemon, or eventually do kernel encapsulation
 *
 */
static int
pim_register_send(struct ip *ip, struct vif *vifp,
	struct mbuf *m, struct mfc *rt)
{
    struct mbuf *mb_copy, *mm;

    if (mrtdebug & DEBUG_PIM)
	log(LOG_DEBUG, "pim_register_send: ");

    /* Get a (possibly fragmented) private copy we are allowed to modify */
    mb_copy = pim_register_prepare(ip, m);
    if (mb_copy == NULL)
	return ENOBUFS;

    /*
     * Send all the fragments. Note that the mbuf for each fragment
     * is freed by the sending machinery.
     */
    for (mm = mb_copy; mm; mm = mb_copy) {
	mb_copy = mm->m_nextpkt;
	mm->m_nextpkt = 0;
	mm = m_pullup(mm, sizeof(struct ip));
	if (mm != NULL) {
	    ip = mtod(mm, struct ip *);
	    /* Encapsulate to the RP if one is configured, else upcall */
	    if ((mrt_api_config & MRT_MFC_RP) &&
		(rt->mfc_rp.s_addr != INADDR_ANY)) {
		pim_register_send_rp(ip, vifp, mm, rt);
	    } else {
		pim_register_send_upcall(ip, vifp, mm, rt);
	    }
	}
    }

    return 0;
}

/*
 * Return a copy of the data packet that is ready for PIM Register
 * encapsulation.
 * XXX: Note that in the returned copy the IP header is a valid one.
 */
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
    struct mbuf *mb_copy = NULL;
    int mtu;

    /* Take care of delayed checksums */
    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
	in_delayed_cksum(m);
	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
    }

    /*
     * Copy the old packet & pullup its IP header into the
     * new mbuf so we can modify it.
     */
    mb_copy = m_copypacket(m, M_DONTWAIT);
    if (mb_copy == NULL)
	return NULL;
    mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
    if (mb_copy == NULL)
	return NULL;

    /* take care of the TTL */
    ip = mtod(mb_copy, struct ip *);
    --ip->ip_ttl;

    /* Compute the MTU after the PIM Register encapsulation */
    mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);

    if (ip->ip_len <= mtu) {
	/*
	 * Turn the IP header into a valid one: ip_len/ip_off are in
	 * host byte order at this point, so convert them back to
	 * network order and recompute the header checksum.
	 */
	ip->ip_len = htons(ip->ip_len);
	ip->ip_off = htons(ip->ip_off);
	ip->ip_sum = 0;
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
    } else {
	/* Fragment the packet */
	if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
	    m_freem(mb_copy);
	    return NULL;
	}
    }
    return mb_copy;
}

/*
 * Send an upcall with the data packet to the user-level process.
 * Consumes mb_copy on both success and failure paths.
 */
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
	struct mbuf *mb_copy, struct mfc *rt)
{
    struct mbuf *mb_first;
    int len = ntohs(ip->ip_len);
    struct igmpmsg *im;
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };

    VIF_LOCK_ASSERT();

    /*
     * Add a new mbuf with an upcall header
     */
    MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
    if (mb_first == NULL) {
	m_freem(mb_copy);
	return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
    mb_first->m_len = sizeof(struct igmpmsg);
    mb_first->m_next = mb_copy;

    /* Send message to routing daemon */
    im = mtod(mb_first, struct igmpmsg *);
    im->im_msgtype = IGMPMSG_WHOLEPKT;
    im->im_mbz = 0;
    im->im_vif = vifp - viftable;
    im->im_src = ip->ip_src;
    im->im_dst = ip->ip_dst;

    k_igmpsrc.sin_addr = ip->ip_src;

    mrtstat.mrts_upcalls++;

    if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
	if (mrtdebug & DEBUG_PIM)
	    log(LOG_WARNING,
		"mcast: pim_register_send_upcall: ip_mrouter socket queue full");
	++mrtstat.mrts_upq_sockfull;
	return ENOBUFS;
    }

    /* Keep statistics */
    pimstat.pims_snd_registers_msgs++;
    pimstat.pims_snd_registers_bytes += len;

    return 0;
}

/*
 * Encapsulate the data packet in PIM Register message and send it to the RP.
 * Consumes mb_copy on both success and failure paths.
 */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp,
	struct mbuf *mb_copy, struct mfc *rt)
{
    struct mbuf *mb_first;
    struct ip *ip_outer;
    struct pim_encap_pimhdr *pimhdr;
    int len = ntohs(ip->ip_len);
    vifi_t vifi = rt->mfc_parent;

    VIF_LOCK_ASSERT();

    if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
	m_freem(mb_copy);
	return EADDRNOTAVAIL;		/* The iif vif is invalid */
    }

    /*
     * Add a new mbuf with the encapsulating header
     */
    MGETHDR(mb_first, M_DONTWAIT, MT_DATA);
    if (mb_first == NULL) {
	m_freem(mb_copy);
	return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    mb_first->m_next = mb_copy;

    mb_first->m_pkthdr.len = len + mb_first->m_len;

    /*
     * Fill in the encapsulating IP and PIM header
     */
    ip_outer = mtod(mb_first, struct ip *);
    *ip_outer = pim_encap_iphdr;
    ip_outer->ip_id = ip_newid();
    ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    ip_outer->ip_src = viftable[vifi].v_lcl_addr;
    ip_outer->ip_dst = rt->mfc_rp;
    /*
     * Copy the inner header TOS to the outer header, and take care of the
     * IP_DF bit.
     */
    ip_outer->ip_tos = ip->ip_tos;
    if (ntohs(ip->ip_off) & IP_DF)
	ip_outer->ip_off |= IP_DF;
    pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
					 + sizeof(pim_encap_iphdr));
    *pimhdr = pim_encap_pimhdr;
    /* If the iif crosses a border, set the Border-bit */
    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);

    /*
     * Checksum the PIM header only: temporarily advance m_data past the
     * outer IP header so in_cksum() starts at the PIM header, then
     * restore it.
     */
    mb_first->m_data += sizeof(pim_encap_iphdr);
    pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
    mb_first->m_data -= sizeof(pim_encap_iphdr);

    send_packet(vifp, mb_first);

    /* Keep statistics */
    pimstat.pims_snd_registers_msgs++;
    pimstat.pims_snd_registers_bytes += len;

    return 0;
}

/*
 * PIM-SMv2 and PIM-DM messages processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 */
void
pim_input(struct mbuf *m, int off)
{
    struct ip *ip = mtod(m, struct ip *);
    struct pim *pim;
    int minlen;
    int datalen = ip->ip_len;
    int ip_tos;
    int iphlen = off;

    /* Keep statistics */
    pimstat.pims_rcv_total_msgs++;
    pimstat.pims_rcv_total_bytes += datalen;

    /*
     * Validate lengths
     */
    if (datalen < PIM_MINLEN) {
	pimstat.pims_rcv_tooshort++;
	log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
	    datalen, (u_long)ip->ip_src.s_addr);
	m_freem(m);
	return;
    }

    /*
     * If the packet is at least as big as a REGISTER, go ahead
     * and grab the PIM REGISTER header size, to avoid another
     * possible m_pullup() later.
     *
     * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
     * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
     */
    minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
    /*
     * Get the IP and PIM headers in contiguous memory, and
     * possibly the PIM REGISTER header.
     */
    if ((m->m_flags & M_EXT || m->m_len < minlen) &&
	(m = m_pullup(m, minlen)) == 0) {
	log(LOG_ERR, "pim_input: m_pullup failure\n");
	return;
    }
    /* m_pullup() may have given us a new mbuf so reset ip. */
    ip = mtod(m, struct ip *);
    ip_tos = ip->ip_tos;

    /* adjust mbuf to point to the PIM header */
    m->m_data += iphlen;
    m->m_len -= iphlen;
    pim = mtod(m, struct pim *);

    /*
     * Validate checksum. If PIM REGISTER, exclude the data packet.
     *
     * XXX: some older PIMv2 implementations don't make this distinction,
     * so for compatibility reason perform the checksum over part of the
     * message, and if error, then over the whole message.
     */
    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
	/* do nothing, checksum okay */
    } else if (in_cksum(m, datalen)) {
	pimstat.pims_rcv_badsum++;
	if (mrtdebug & DEBUG_PIM)
	    log(LOG_DEBUG, "pim_input: invalid checksum");
	m_freem(m);
	return;
    }

    /* PIM version check */
    if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
	pimstat.pims_rcv_badversion++;
	log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
	    PIM_VT_V(pim->pim_vt), PIM_VERSION);
	m_freem(m);
	return;
    }

    /* restore mbuf back to the outer IP */
    m->m_data -= iphlen;
    m->m_len += iphlen;

    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
	/*
	 * Since this is a REGISTER, we'll make a copy of the register
	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
	 * routing daemon.
	 */
	struct sockaddr_in dst = { sizeof(dst), AF_INET };
	struct mbuf *mcp;
	struct ip *encap_ip;
	u_int32_t *reghdr;
	struct ifnet *vifp;

	VIF_LOCK();
	if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
	    VIF_UNLOCK();
	    if (mrtdebug & DEBUG_PIM)
		log(LOG_DEBUG,
		    "pim_input: register vif not set: %d\n", reg_vif_num);
	    m_freem(m);
	    return;
	}
	/* XXX need refcnt? */
	vifp = viftable[reg_vif_num].v_ifp;
	VIF_UNLOCK();

	/*
	 * Validate length
	 */
	if (datalen < PIM_REG_MINLEN) {
	    pimstat.pims_rcv_tooshort++;
	    pimstat.pims_rcv_badregisters++;
	    log(LOG_ERR,
		"pim_input: register packet size too small %d from %lx\n",
		datalen, (u_long)ip->ip_src.s_addr);
	    m_freem(m);
	    return;
	}

	reghdr = (u_int32_t *)(pim + 1);
	encap_ip = (struct ip *)(reghdr + 1);

	if (mrtdebug & DEBUG_PIM) {
	    log(LOG_DEBUG,
		"pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
		(u_long)ntohl(encap_ip->ip_src.s_addr),
		(u_long)ntohl(encap_ip->ip_dst.s_addr),
		ntohs(encap_ip->ip_len));
	}

	/* verify the version number of the inner packet */
	if (encap_ip->ip_v != IPVERSION) {
	    pimstat.pims_rcv_badregisters++;
	    if (mrtdebug & DEBUG_PIM) {
		log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
		    "of the inner packet\n", encap_ip->ip_v);
	    }
	    m_freem(m);
	    return;
	}

	/* verify the inner packet is destined to a mcast group */
	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
	    pimstat.pims_rcv_badregisters++;
	    if (mrtdebug & DEBUG_PIM)
		log(LOG_DEBUG,
		    "pim_input: inner packet of register is not "
		    "multicast %lx\n",
		    (u_long)ntohl(encap_ip->ip_dst.s_addr));
	    m_freem(m);
	    return;
	}

	/* If a NULL_REGISTER, pass it to the daemon */
	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
	    goto pim_input_to_daemon;

	/*
	 * Copy the TOS from the outer IP header to the inner IP header.
	 */
	if (encap_ip->ip_tos != ip_tos) {
	    /* Outer TOS -> inner TOS */
	    encap_ip->ip_tos = ip_tos;
	    /* Recompute the inner header checksum. Sigh... */

	    /* adjust mbuf to point to the inner IP header */
	    m->m_data += (iphlen + PIM_MINLEN);
	    m->m_len -= (iphlen + PIM_MINLEN);

	    encap_ip->ip_sum = 0;
	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);

	    /* restore mbuf to point back to the outer IP header */
	    m->m_data -= (iphlen + PIM_MINLEN);
	    m->m_len += (iphlen + PIM_MINLEN);
	}

	/*
	 * Decapsulate the inner IP packet and loopback to forward it
	 * as a normal multicast packet. Also, make a copy of the
	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
	 * to pass to the daemon later, so it can take the appropriate
	 * actions (e.g., send back PIM_REGISTER_STOP).
	 * XXX: here m->m_data points to the outer IP header.
	 */
	mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
	if (mcp == NULL) {
	    log(LOG_ERR,
		"pim_input: pim register: could not copy register head\n");
	    m_freem(m);
	    return;
	}

	/* Keep statistics */
	/* XXX: registers_bytes include only the encap. mcast pkt */
	pimstat.pims_rcv_registers_msgs++;
	pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);

	/*
	 * forward the inner ip packet; point m_data at the inner ip.
	 */
	m_adj(m, iphlen + PIM_MINLEN);

	if (mrtdebug & DEBUG_PIM) {
	    log(LOG_DEBUG,
		"pim_input: forwarding decapsulated register: "
		"src %lx, dst %lx, vif %d\n",
		(u_long)ntohl(encap_ip->ip_src.s_addr),
		(u_long)ntohl(encap_ip->ip_dst.s_addr),
		reg_vif_num);
	}
	/* NB: vifp was collected above; can it change on us?
	 */
	if_simloop(vifp, m, dst.sin_family, 0);

	/* prepare the register head to send to the mrouting daemon */
	m = mcp;
    }

pim_input_to_daemon:
    /*
     * Pass the PIM message up to the daemon; if it is a Register message,
     * pass the 'head' only up to the daemon. This includes the
     * outer IP header, PIM header, PIM-Register header and the
     * inner IP header.
     * XXX: the outer IP header pkt size of a Register is not adjusted to
     * reflect the fact that the inner multicast data is truncated.
     */
    rip_input(m, iphlen);

    return;
}
#endif /* PIM */

/*
 * Module load/unload handler: on load, initialize the locks and hook
 * the ip_mroute entry points into the global function pointers used by
 * the IPv4 stack; on unload, undo all of that.
 */
static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{
    switch (type) {
    case MOD_LOAD:
	mtx_init(&mrouter_mtx, "mrouter initialization", NULL, MTX_DEF);
	MFC_LOCK_INIT();
	VIF_LOCK_INIT();
	ip_mrouter_reset();
	ip_mcast_src = X_ip_mcast_src;
	ip_mforward = X_ip_mforward;
	ip_mrouter_done = X_ip_mrouter_done;
	ip_mrouter_get = X_ip_mrouter_get;
	ip_mrouter_set = X_ip_mrouter_set;
	ip_rsvp_force_done = X_ip_rsvp_force_done;
	ip_rsvp_vif = X_ip_rsvp_vif;
	legal_vif_num = X_legal_vif_num;
	mrt_ioctl = X_mrt_ioctl;
	rsvp_input_p = X_rsvp_input;
	break;

    case MOD_UNLOAD:
	/*
	 * Typically module unload happens after the user-level
	 * process has shutdown the kernel services (the check
	 * below ensures someone can't just yank the module out
	 * from under a running process). But if the module is
	 * just loaded and then unloaded w/o starting up a user
	 * process we still need to cleanup.
	 */
	if (ip_mrouter)
	    return EINVAL;

	X_ip_mrouter_done();
	ip_mcast_src = NULL;
	ip_mforward = NULL;
	ip_mrouter_done = NULL;
	ip_mrouter_get = NULL;
	ip_mrouter_set = NULL;
	ip_rsvp_force_done = NULL;
	ip_rsvp_vif = NULL;
	legal_vif_num = NULL;
	mrt_ioctl = NULL;
	rsvp_input_p = NULL;
	VIF_LOCK_DESTROY();
	MFC_LOCK_DESTROY();
	mtx_destroy(&mrouter_mtx);
	break;
    default:
	return EOPNOTSUPP;
    }
    return 0;
}

/* Module glue: register this file as the "ip_mroute" kernel module */
static moduledata_t ip_mroutemod = {
    "ip_mroute",
    ip_mroute_modevent,
    0
};
DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);