ip_mroute.c revision 129880
1/* 2 * IP multicast forwarding procedures 3 * 4 * Written by David Waitzman, BBN Labs, August 1988. 5 * Modified by Steve Deering, Stanford, February 1989. 6 * Modified by Mark J. Steiglitz, Stanford, May, 1991 7 * Modified by Van Jacobson, LBL, January 1993 8 * Modified by Ajit Thyagarajan, PARC, August 1993 9 * Modified by Bill Fenner, PARC, April 1995 10 * Modified by Ahmed Helmy, SGI, June 1996 11 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 12 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 13 * Modified by Hitoshi Asaeda, WIDE, August 2000 14 * Modified by Pavlin Radoslavov, ICSI, October 2002 15 * 16 * MROUTING Revision: 3.5 17 * and PIM-SMv2 and PIM-DM support, advanced API support, 18 * bandwidth metering and signaling 19 * 20 * $FreeBSD: head/sys/netinet/ip_mroute.c 129880 2004-05-30 20:27:19Z phk $ 21 */ 22 23#include "opt_mac.h" 24#include "opt_mrouting.h" 25#include "opt_random_ip_id.h" 26 27#ifdef PIM 28#define _PIM_VT 1 29#endif 30 31#include <sys/param.h> 32#include <sys/kernel.h> 33#include <sys/lock.h> 34#include <sys/mac.h> 35#include <sys/malloc.h> 36#include <sys/mbuf.h> 37#include <sys/module.h> 38#include <sys/protosw.h> 39#include <sys/signalvar.h> 40#include <sys/socket.h> 41#include <sys/socketvar.h> 42#include <sys/sockio.h> 43#include <sys/sx.h> 44#include <sys/sysctl.h> 45#include <sys/syslog.h> 46#include <sys/systm.h> 47#include <sys/time.h> 48#include <net/if.h> 49#include <net/netisr.h> 50#include <net/route.h> 51#include <netinet/in.h> 52#include <netinet/igmp.h> 53#include <netinet/in_systm.h> 54#include <netinet/in_var.h> 55#include <netinet/ip.h> 56#include <netinet/ip_encap.h> 57#include <netinet/ip_mroute.h> 58#include <netinet/ip_var.h> 59#ifdef PIM 60#include <netinet/pim.h> 61#include <netinet/pim_var.h> 62#endif 63#include <netinet/udp.h> 64#include <machine/in_cksum.h> 65 66/* 67 * Control debugging code for rsvp and multicast routing code. 
68 * Can only set them with the debugger. 69 */ 70static u_int rsvpdebug; /* non-zero enables debugging */ 71 72static u_int mrtdebug; /* any set of the flags below */ 73#define DEBUG_MFC 0x02 74#define DEBUG_FORWARD 0x04 75#define DEBUG_EXPIRE 0x08 76#define DEBUG_XMIT 0x10 77#define DEBUG_PIM 0x20 78 79#define VIFI_INVALID ((vifi_t) -1) 80 81#define M_HASCL(m) ((m)->m_flags & M_EXT) 82 83static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); 84 85/* 86 * Locking. We use two locks: one for the virtual interface table and 87 * one for the forwarding table. These locks may be nested in which case 88 * the VIF lock must always be taken first. Note that each lock is used 89 * to cover not only the specific data structure but also related data 90 * structures. It may be better to add more fine-grained locking later; 91 * it's not clear how performance-critical this code is. 92 */ 93 94static struct mrtstat mrtstat; 95SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, 96 &mrtstat, mrtstat, 97 "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); 98 99static struct mfc *mfctable[MFCTBLSIZ]; 100SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, 101 &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", 102 "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); 103 104static struct mtx mfc_mtx; 105#define MFC_LOCK() mtx_lock(&mfc_mtx) 106#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) 107#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) 108#define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) 109#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) 110 111static struct vif viftable[MAXVIFS]; 112SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, 113 &viftable, sizeof(viftable), "S,vif[MAXVIFS]", 114 "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); 115 116static struct mtx vif_mtx; 117#define VIF_LOCK() mtx_lock(&vif_mtx) 118#define VIF_UNLOCK() 
mtx_unlock(&vif_mtx) 119#define VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) 120#define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) 121#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) 122 123static u_char nexpire[MFCTBLSIZ]; 124 125static struct callout expire_upcalls_ch; 126 127#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 128#define UPCALL_EXPIRE 6 /* number of timeouts */ 129 130/* 131 * Define the token bucket filter structures 132 * tbftable -> each vif has one of these for storing info 133 */ 134 135static struct tbf tbftable[MAXVIFS]; 136#define TBF_REPROCESS (hz / 100) /* 100x / second */ 137 138/* 139 * 'Interfaces' associated with decapsulator (so we can tell 140 * packets that went through it from ones that get reflected 141 * by a broken gateway). These interfaces are never linked into 142 * the system ifnet list & no routes point to them. I.e., packets 143 * can't be sent this way. They only exist as a placeholder for 144 * multicast source verification. 145 */ 146static struct ifnet multicast_decap_if[MAXVIFS]; 147 148#define ENCAP_TTL 64 149#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ 150 151/* prototype IP hdr for encapsulated packets */ 152static struct ip multicast_encap_iphdr = { 153#if BYTE_ORDER == LITTLE_ENDIAN 154 sizeof(struct ip) >> 2, IPVERSION, 155#else 156 IPVERSION, sizeof(struct ip) >> 2, 157#endif 158 0, /* tos */ 159 sizeof(struct ip), /* total length */ 160 0, /* id */ 161 0, /* frag offset */ 162 ENCAP_TTL, ENCAP_PROTO, 163 0, /* checksum */ 164}; 165 166/* 167 * Bandwidth meter variables and constants 168 */ 169static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); 170/* 171 * Pending timeouts are stored in a hash table, the key being the 172 * expiration time. Periodically, the entries are analysed and processed. 
173 */ 174#define BW_METER_BUCKETS 1024 175static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 176static struct callout bw_meter_ch; 177#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 178 179/* 180 * Pending upcalls are stored in a vector which is flushed when 181 * full, or periodically 182 */ 183static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 184static u_int bw_upcalls_n; /* # of pending upcalls */ 185static struct callout bw_upcalls_ch; 186#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 187 188#ifdef PIM 189static struct pimstat pimstat; 190SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, 191 &pimstat, pimstat, 192 "PIM Statistics (struct pimstat, netinet/pim_var.h)"); 193 194/* 195 * Note: the PIM Register encapsulation adds the following in front of a 196 * data packet: 197 * 198 * struct pim_encap_hdr { 199 * struct ip ip; 200 * struct pim_encap_pimhdr pim; 201 * } 202 * 203 */ 204 205struct pim_encap_pimhdr { 206 struct pim pim; 207 uint32_t flags; 208}; 209 210static struct ip pim_encap_iphdr = { 211#if BYTE_ORDER == LITTLE_ENDIAN 212 sizeof(struct ip) >> 2, 213 IPVERSION, 214#else 215 IPVERSION, 216 sizeof(struct ip) >> 2, 217#endif 218 0, /* tos */ 219 sizeof(struct ip), /* total length */ 220 0, /* id */ 221 0, /* frag offset */ 222 ENCAP_TTL, 223 IPPROTO_PIM, 224 0, /* checksum */ 225}; 226 227static struct pim_encap_pimhdr pim_encap_pimhdr = { 228 { 229 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 230 0, /* reserved */ 231 0, /* checksum */ 232 }, 233 0 /* flags */ 234}; 235 236static struct ifnet multicast_register_if; 237static vifi_t reg_vif_num = VIFI_INVALID; 238#endif /* PIM */ 239 240/* 241 * Private variables. 242 */ 243static vifi_t numvifs; 244static const struct encaptab *encap_cookie; 245 246/* 247 * one-back cache used by mroute_encapcheck to locate a tunnel's vif 248 * given a datagram's src ip address. 
249 */ 250static u_long last_encap_src; 251static struct vif *last_encap_vif; 252 253/* 254 * Callout for queue processing. 255 */ 256static struct callout tbf_reprocess_ch; 257 258static u_long X_ip_mcast_src(int vifi); 259static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, 260 struct mbuf *m, struct ip_moptions *imo); 261static int X_ip_mrouter_done(void); 262static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); 263static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); 264static int X_legal_vif_num(int vif); 265static int X_mrt_ioctl(int cmd, caddr_t data); 266 267static int get_sg_cnt(struct sioc_sg_req *); 268static int get_vif_cnt(struct sioc_vif_req *); 269static int ip_mrouter_init(struct socket *, int); 270static int add_vif(struct vifctl *); 271static int del_vif(vifi_t); 272static int add_mfc(struct mfcctl2 *); 273static int del_mfc(struct mfcctl2 *); 274static int set_api_config(uint32_t *); /* chose API capabilities */ 275static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 276static int set_assert(int); 277static void expire_upcalls(void *); 278static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); 279static void phyint_send(struct ip *, struct vif *, struct mbuf *); 280static void encap_send(struct ip *, struct vif *, struct mbuf *); 281static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); 282static void tbf_queue(struct vif *, struct mbuf *); 283static void tbf_process_q(struct vif *); 284static void tbf_reprocess_q(void *); 285static int tbf_dq_sel(struct vif *, struct ip *); 286static void tbf_send_packet(struct vif *, struct mbuf *); 287static void tbf_update_tokens(struct vif *); 288static int priority(struct vif *, struct ip *); 289 290/* 291 * Bandwidth monitoring 292 */ 293static void free_bw_list(struct bw_meter *list); 294static int add_bw_upcall(struct bw_upcall *); 295static int del_bw_upcall(struct bw_upcall *); 296static void 
bw_meter_receive_packet(struct bw_meter *x, int plen, 297 struct timeval *nowp); 298static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); 299static void bw_upcalls_send(void); 300static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); 301static void unschedule_bw_meter(struct bw_meter *x); 302static void bw_meter_process(void); 303static void expire_bw_upcalls_send(void *); 304static void expire_bw_meter_process(void *); 305 306#ifdef PIM 307static int pim_register_send(struct ip *, struct vif *, 308 struct mbuf *, struct mfc *); 309static int pim_register_send_rp(struct ip *, struct vif *, 310 struct mbuf *, struct mfc *); 311static int pim_register_send_upcall(struct ip *, struct vif *, 312 struct mbuf *, struct mfc *); 313static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 314#endif 315 316/* 317 * whether or not special PIM assert processing is enabled. 318 */ 319static int pim_assert; 320/* 321 * Rate limit for assert notification messages, in usec 322 */ 323#define ASSERT_MSG_TIME 3000000 324 325/* 326 * Kernel multicast routing API capabilities and setup. 327 * If more API capabilities are added to the kernel, they should be 328 * recorded in `mrt_api_support'. 329 */ 330static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 331 MRT_MFC_FLAGS_BORDER_VIF | 332 MRT_MFC_RP | 333 MRT_MFC_BW_UPCALL); 334static uint32_t mrt_api_config = 0; 335 336/* 337 * Hash function for a source, group entry 338 */ 339#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 340 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 341 342/* 343 * Find a route for a given origin IP address and Multicast group address 344 * Type of service parameter to be added in the future!!! 
 * Statistics are updated by the caller if needed
 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
 */
static struct mfc *
mfc_find(in_addr_t o, in_addr_t g)
{
    struct mfc *rt;

    /* Caller must hold the MFC lock; we only walk the table. */
    MFC_LOCK_ASSERT();

    /*
     * Hash on (origin, group) and scan the chain.  Entries whose
     * mfc_stall is non-NULL still have a pending upcall (they are not
     * yet resolved by the user-level daemon) and are deliberately
     * treated as "not found".
     */
    for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next)
	if ((rt->mfc_origin.s_addr == o) &&
	    (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL))
	    break;
    return rt;
}

/*
 * Macros to compute elapsed time efficiently
 * Borrowed from Van Jacobson's scheduling code
 *
 * TV_DELTA(a, b, delta): delta = (a) - (b) in microseconds; the switch
 * special-cases second deltas of 1 and 2 to avoid a multiply in the
 * common case (assumes a >= b and a small seconds difference).
 * TV_LT(a, b): true iff timeval (a) is strictly earlier than (b).
 */
#define TV_DELTA(a, b, delta) { \
	    int xxs; \
	    delta = (a).tv_usec - (b).tv_usec; \
	    if ((xxs = (a).tv_sec - (b).tv_sec)) { \
		switch (xxs) { \
		case 2: \
		    delta += 1000000; \
		    /* FALLTHROUGH */ \
		case 1: \
		    delta += 1000000; \
		    break; \
		default: \
		    delta += (1000000 * xxs); \
		} \
	    } \
}

#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
	      (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)

/*
 * Handle MRT setsockopt commands to modify the multicast routing tables.
388 */ 389static int 390X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) 391{ 392 int error, optval; 393 vifi_t vifi; 394 struct vifctl vifc; 395 struct mfcctl2 mfc; 396 struct bw_upcall bw_upcall; 397 uint32_t i; 398 399 if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) 400 return EPERM; 401 402 error = 0; 403 switch (sopt->sopt_name) { 404 case MRT_INIT: 405 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 406 if (error) 407 break; 408 error = ip_mrouter_init(so, optval); 409 break; 410 411 case MRT_DONE: 412 error = ip_mrouter_done(); 413 break; 414 415 case MRT_ADD_VIF: 416 error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); 417 if (error) 418 break; 419 error = add_vif(&vifc); 420 break; 421 422 case MRT_DEL_VIF: 423 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 424 if (error) 425 break; 426 error = del_vif(vifi); 427 break; 428 429 case MRT_ADD_MFC: 430 case MRT_DEL_MFC: 431 /* 432 * select data size depending on API version. 433 */ 434 if (sopt->sopt_name == MRT_ADD_MFC && 435 mrt_api_config & MRT_API_FLAGS_ALL) { 436 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), 437 sizeof(struct mfcctl2)); 438 } else { 439 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), 440 sizeof(struct mfcctl)); 441 bzero((caddr_t)&mfc + sizeof(struct mfcctl), 442 sizeof(mfc) - sizeof(struct mfcctl)); 443 } 444 if (error) 445 break; 446 if (sopt->sopt_name == MRT_ADD_MFC) 447 error = add_mfc(&mfc); 448 else 449 error = del_mfc(&mfc); 450 break; 451 452 case MRT_ASSERT: 453 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 454 if (error) 455 break; 456 set_assert(optval); 457 break; 458 459 case MRT_API_CONFIG: 460 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 461 if (!error) 462 error = set_api_config(&i); 463 if (!error) 464 error = sooptcopyout(sopt, &i, sizeof i); 465 break; 466 467 case MRT_ADD_BW_UPCALL: 468 case MRT_DEL_BW_UPCALL: 469 error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, 470 
sizeof bw_upcall); 471 if (error) 472 break; 473 if (sopt->sopt_name == MRT_ADD_BW_UPCALL) 474 error = add_bw_upcall(&bw_upcall); 475 else 476 error = del_bw_upcall(&bw_upcall); 477 break; 478 479 default: 480 error = EOPNOTSUPP; 481 break; 482 } 483 return error; 484} 485 486/* 487 * Handle MRT getsockopt commands 488 */ 489static int 490X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) 491{ 492 int error; 493 static int version = 0x0305; /* !!! why is this here? XXX */ 494 495 switch (sopt->sopt_name) { 496 case MRT_VERSION: 497 error = sooptcopyout(sopt, &version, sizeof version); 498 break; 499 500 case MRT_ASSERT: 501 error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); 502 break; 503 504 case MRT_API_SUPPORT: 505 error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); 506 break; 507 508 case MRT_API_CONFIG: 509 error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); 510 break; 511 512 default: 513 error = EOPNOTSUPP; 514 break; 515 } 516 return error; 517} 518 519/* 520 * Handle ioctl commands to obtain information from the cache 521 */ 522static int 523X_mrt_ioctl(int cmd, caddr_t data) 524{ 525 int error = 0; 526 527 switch (cmd) { 528 case (SIOCGETVIFCNT): 529 error = get_vif_cnt((struct sioc_vif_req *)data); 530 break; 531 532 case (SIOCGETSGCNT): 533 error = get_sg_cnt((struct sioc_sg_req *)data); 534 break; 535 536 default: 537 error = EINVAL; 538 break; 539 } 540 return error; 541} 542 543/* 544 * returns the packet, byte, rpf-failure count for the source group provided 545 */ 546static int 547get_sg_cnt(struct sioc_sg_req *req) 548{ 549 struct mfc *rt; 550 551 MFC_LOCK(); 552 rt = mfc_find(req->src.s_addr, req->grp.s_addr); 553 if (rt == NULL) { 554 MFC_UNLOCK(); 555 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 556 return EADDRNOTAVAIL; 557 } 558 req->pktcnt = rt->mfc_pkt_cnt; 559 req->bytecnt = rt->mfc_byte_cnt; 560 req->wrong_if = rt->mfc_wrong_if; 561 MFC_UNLOCK(); 562 return 0; 563} 564 565/* 
 * returns the input and output packet and byte counts on the vif provided
 */
static int
get_vif_cnt(struct sioc_vif_req *req)
{
    vifi_t vifi = req->vifi;

    /* VIF lock protects both numvifs and the viftable counters. */
    VIF_LOCK();
    if (vifi >= numvifs) {
	VIF_UNLOCK();
	return EINVAL;
    }

    req->icount = viftable[vifi].v_pkt_in;
    req->ocount = viftable[vifi].v_pkt_out;
    req->ibytes = viftable[vifi].v_bytes_in;
    req->obytes = viftable[vifi].v_bytes_out;
    VIF_UNLOCK();

    return 0;
}

/*
 * Reset all module state to its post-load defaults: clear the MFC
 * table, expiry counters, bandwidth-meter state, and (re)initialize
 * the callout handles used by the periodic timers.
 */
static void
ip_mrouter_reset(void)
{
    bzero((caddr_t)mfctable, sizeof(mfctable));
    bzero((caddr_t)nexpire, sizeof(nexpire));

    pim_assert = 0;
    mrt_api_config = 0;

    callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);

    bw_upcalls_n = 0;
    bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers));
    callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE);
    callout_init(&bw_meter_ch, CALLOUT_MPSAFE);

    callout_init(&tbf_reprocess_ch, CALLOUT_MPSAFE);
}

static struct mtx mrouter_mtx;		/* used to synch init/done work */

/*
 * Enable multicast routing
 */
static int
ip_mrouter_init(struct socket *so, int version)
{
    if (mrtdebug)
	log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
	    so->so_type, so->so_proto->pr_protocol);

    /* mrouted must attach via a raw IGMP socket. */
    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
	return EOPNOTSUPP;

    /* Only version 1 of the kernel MRT API is accepted here. */
    if (version != 1)
	return ENOPROTOOPT;

    mtx_lock(&mrouter_mtx);

    /* Only one multicast routing daemon may be registered at a time. */
    if (ip_mrouter != NULL) {
	mtx_unlock(&mrouter_mtx);
	return EADDRINUSE;
    }

    /* Start the periodic upcall-expiry and bandwidth-meter timers. */
    callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);

    callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD,
		  expire_bw_upcalls_send, NULL);
    callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);

    ip_mrouter = so;

    mtx_unlock(&mrouter_mtx);

    if (mrtdebug)
	log(LOG_DEBUG, "ip_mrouter_init\n");

    return 0;
}

/*
649 * Disable multicast routing 650 */ 651static int 652X_ip_mrouter_done(void) 653{ 654 vifi_t vifi; 655 int i; 656 struct ifnet *ifp; 657 struct ifreq ifr; 658 struct mfc *rt; 659 struct rtdetq *rte; 660 661 mtx_lock(&mrouter_mtx); 662 663 if (ip_mrouter == NULL) { 664 mtx_unlock(&mrouter_mtx); 665 return EINVAL; 666 } 667 668 /* 669 * Detach/disable hooks to the reset of the system. 670 */ 671 ip_mrouter = NULL; 672 mrt_api_config = 0; 673 674 VIF_LOCK(); 675 if (encap_cookie) { 676 const struct encaptab *c = encap_cookie; 677 encap_cookie = NULL; 678 encap_detach(c); 679 } 680 VIF_UNLOCK(); 681 682 callout_stop(&tbf_reprocess_ch); 683 684 VIF_LOCK(); 685 /* 686 * For each phyint in use, disable promiscuous reception of all IP 687 * multicasts. 688 */ 689 for (vifi = 0; vifi < numvifs; vifi++) { 690 if (viftable[vifi].v_lcl_addr.s_addr != 0 && 691 !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 692 struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); 693 694 so->sin_len = sizeof(struct sockaddr_in); 695 so->sin_family = AF_INET; 696 so->sin_addr.s_addr = INADDR_ANY; 697 ifp = viftable[vifi].v_ifp; 698 if_allmulti(ifp, 0); 699 } 700 } 701 bzero((caddr_t)tbftable, sizeof(tbftable)); 702 bzero((caddr_t)viftable, sizeof(viftable)); 703 numvifs = 0; 704 pim_assert = 0; 705 VIF_UNLOCK(); 706 707 /* 708 * Free all multicast forwarding cache entries. 
709 */ 710 callout_stop(&expire_upcalls_ch); 711 callout_stop(&bw_upcalls_ch); 712 callout_stop(&bw_meter_ch); 713 714 MFC_LOCK(); 715 for (i = 0; i < MFCTBLSIZ; i++) { 716 for (rt = mfctable[i]; rt != NULL; ) { 717 struct mfc *nr = rt->mfc_next; 718 719 for (rte = rt->mfc_stall; rte != NULL; ) { 720 struct rtdetq *n = rte->next; 721 722 m_freem(rte->m); 723 free(rte, M_MRTABLE); 724 rte = n; 725 } 726 free_bw_list(rt->mfc_bw_meter); 727 free(rt, M_MRTABLE); 728 rt = nr; 729 } 730 } 731 bzero((caddr_t)mfctable, sizeof(mfctable)); 732 bzero((caddr_t)nexpire, sizeof(nexpire)); 733 bw_upcalls_n = 0; 734 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 735 MFC_UNLOCK(); 736 737 /* 738 * Reset de-encapsulation cache 739 */ 740 last_encap_src = INADDR_ANY; 741 last_encap_vif = NULL; 742#ifdef PIM 743 reg_vif_num = VIFI_INVALID; 744#endif 745 746 mtx_unlock(&mrouter_mtx); 747 748 if (mrtdebug) 749 log(LOG_DEBUG, "ip_mrouter_done\n"); 750 751 return 0; 752} 753 754/* 755 * Set PIM assert processing global 756 */ 757static int 758set_assert(int i) 759{ 760 if ((i != 1) && (i != 0)) 761 return EINVAL; 762 763 pim_assert = i; 764 765 return 0; 766} 767 768/* 769 * Configure API capabilities 770 */ 771int 772set_api_config(uint32_t *apival) 773{ 774 int i; 775 776 /* 777 * We can set the API capabilities only if it is the first operation 778 * after MRT_INIT. I.e.: 779 * - there are no vifs installed 780 * - pim_assert is not enabled 781 * - the MFC table is empty 782 */ 783 if (numvifs > 0) { 784 *apival = 0; 785 return EPERM; 786 } 787 if (pim_assert) { 788 *apival = 0; 789 return EPERM; 790 } 791 for (i = 0; i < MFCTBLSIZ; i++) { 792 if (mfctable[i] != NULL) { 793 *apival = 0; 794 return EPERM; 795 } 796 } 797 798 mrt_api_config = *apival & mrt_api_support; 799 *apival = mrt_api_config; 800 801 return 0; 802} 803 804/* 805 * Decide if a packet is from a tunnelled peer. 806 * Return 0 if not, 64 if so. XXX yuck.. 64 ??? 
 */
static int
mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
{
    struct ip *ip = mtod(m, struct ip *);
    int hlen = ip->ip_hl << 2;

    /*
     * don't claim the packet if it's not to a multicast destination or if
     * we don't have an encapsulating tunnel with the source.
     * Note:  This code assumes that the remote site IP address
     * uniquely identifies the tunnel (i.e., that this site has
     * at most one tunnel with the remote site).
     */
    if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr)))
	return 0;
    /*
     * One-back cache: refresh (last_encap_src, last_encap_vif) only when
     * the outer source address differs from the cached one.
     */
    if (ip->ip_src.s_addr != last_encap_src) {
	struct vif *vifp = viftable;
	struct vif *vife = vifp + numvifs;

	last_encap_src = ip->ip_src.s_addr;
	last_encap_vif = NULL;
	for ( ; vifp < vife; ++vifp)
	    if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) {
		/* Only plain (non source-routed) tunnels qualify. */
		if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL)
		    last_encap_vif = vifp;
		break;
	    }
    }
    if (last_encap_vif == NULL) {
	last_encap_src = INADDR_ANY;
	return 0;
    }
    return 64;
}

/*
 * De-encapsulate a packet and feed it back through ip input (this
 * routine is called whenever IP gets a packet that mroute_encap_func()
 * claimed).
 */
static void
mroute_encap_input(struct mbuf *m, int off)
{
    struct ip *ip = mtod(m, struct ip *);
    int hlen = ip->ip_hl << 2;

    /* Strip IP options, then the outer IP header itself. */
    if (hlen > sizeof(struct ip))
	ip_stripoptions(m, (struct mbuf *) 0);
    m->m_data += sizeof(struct ip);
    m->m_len -= sizeof(struct ip);
    m->m_pkthdr.len -= sizeof(struct ip);

    /* Attribute the inner packet to the decapsulator pseudo-interface. */
    m->m_pkthdr.rcvif = last_encap_vif->v_ifp;

    netisr_queue(NETISR_IP, m);
    /*
     * normally we would need a "schednetisr(NETISR_IP)"
     * here but we were called by ip_input and it is going
     * to loop back & try to dequeue the packet we just
     * queued as soon as we return so we avoid the
     * unnecessary software interrupt.
     *
     * XXX
     * This no longer holds - we may have direct-dispatched the packet,
     * or there may be a queue processing limit.
     */
}

extern struct domain inetdomain;
static struct protosw mroute_encap_protosw =
{ SOCK_RAW,	&inetdomain,	IPPROTO_IPV4,	PR_ATOMIC|PR_ADDR,
  mroute_encap_input,	0,	0,	rip_ctloutput,
  0,
  0,		0,		0,		0,
  &rip_usrreqs
};

/*
 * Add a vif to the vif table
 */
static int
add_vif(struct vifctl *vifcp)
{
    struct vif *vifp = viftable + vifcp->vifc_vifi;
    struct sockaddr_in sin = {sizeof sin, AF_INET};
    struct ifaddr *ifa;
    struct ifnet *ifp;
    int error;
    struct tbf *v_tbf = tbftable + vifcp->vifc_vifi;

    VIF_LOCK();
    /* Validate the slot: in range, unused, and with a local address. */
    if (vifcp->vifc_vifi >= MAXVIFS) {
	VIF_UNLOCK();
	return EINVAL;
    }
    if (vifp->v_lcl_addr.s_addr != INADDR_ANY) {
	VIF_UNLOCK();
	return EADDRINUSE;
    }
    if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) {
	VIF_UNLOCK();
	return EADDRNOTAVAIL;
    }

    /* Find the interface with an address in AF_INET family */
#ifdef PIM
    if (vifcp->vifc_flags & VIFF_REGISTER) {
	/*
	 * XXX: Because VIFF_REGISTER does not really need a valid
	 * local interface (e.g. it could be 127.0.0.2), we don't
	 * check its address.
	 */
	ifp = NULL;
    } else
#endif
    {
	sin.sin_addr = vifcp->vifc_lcl_addr;
	ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
	if (ifa == NULL) {
	    VIF_UNLOCK();
	    return EADDRNOTAVAIL;
	}
	ifp = ifa->ifa_ifp;
    }

    if (vifcp->vifc_flags & VIFF_TUNNEL) {
	if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) {
	    /*
	     * An encapsulating tunnel is wanted.  Tell
	     * mroute_encap_input() to start paying attention
	     * to encapsulated packets.
	     */
	    if (encap_cookie == NULL) {
		int i;

		encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
				mroute_encapcheck,
				(struct protosw *)&mroute_encap_protosw, NULL);

		if (encap_cookie == NULL) {
		    printf("ip_mroute: unable to attach encap\n");
		    VIF_UNLOCK();
		    return EIO;	/* XXX */
		}
		for (i = 0; i < MAXVIFS; ++i) {
		    if_initname(&multicast_decap_if[i], "mdecap", i);
		}
	    }
	    /*
	     * Set interface to fake encapsulator interface
	     */
	    ifp = &multicast_decap_if[vifcp->vifc_vifi];
	    /*
	     * Prepare cached route entry
	     */
	    bzero(&vifp->v_route, sizeof(vifp->v_route));
	} else {
	    log(LOG_ERR, "source routed tunnels not supported\n");
	    VIF_UNLOCK();
	    return EOPNOTSUPP;
	}
#ifdef PIM
    } else if (vifcp->vifc_flags & VIFF_REGISTER) {
	ifp = &multicast_register_if;
	if (mrtdebug)
	    log(LOG_DEBUG, "Adding a register vif, ifp: %p\n",
		    (void *)&multicast_register_if);
	/* Only one register vif exists; set it up on first use. */
	if (reg_vif_num == VIFI_INVALID) {
	    if_initname(&multicast_register_if, "register_vif", 0);
	    multicast_register_if.if_flags = IFF_LOOPBACK;
	    bzero(&vifp->v_route, sizeof(vifp->v_route));
	    reg_vif_num = vifcp->vifc_vifi;
	}
#endif
    } else {		/* Make sure the interface supports multicast */
	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
	    VIF_UNLOCK();
	    return EOPNOTSUPP;
	}

	/* Enable promiscuous reception of all IP multicasts from the if */
	error = if_allmulti(ifp, 1);
	if (error) {
	    VIF_UNLOCK();
	    return error;
	}
    }

    /* define parameters for the tbf structure */
    vifp->v_tbf = v_tbf;
    GET_TIME(vifp->v_tbf->tbf_last_pkt_t);
    vifp->v_tbf->tbf_n_tok = 0;
    vifp->v_tbf->tbf_q_len = 0;
    vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
    vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;

    vifp->v_flags     = vifcp->vifc_flags;
    vifp->v_threshold = vifcp->vifc_threshold;
    vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
    vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
    vifp->v_ifp       = ifp;
    /* scaling up here allows division by 1024 in critical code */
    vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000;
    vifp->v_rsvp_on   = 0;
    vifp->v_rsvpd     = NULL;
    /* initialize per vif pkt counters */
    vifp->v_pkt_in    = 0;
    vifp->v_pkt_out   = 0;
    vifp->v_bytes_in  = 0;
    vifp->v_bytes_out = 0;

    /* Adjust numvifs up if the vifi is higher than numvifs */
    if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;

    VIF_UNLOCK();

    if (mrtdebug)
	log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n",
	    vifcp->vifc_vifi,
	    (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
	    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
	    (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
	    vifcp->vifc_threshold,
	    vifcp->vifc_rate_limit);

    return 0;
}

/*
 * Delete a vif from the vif table
 */
static int
del_vif(vifi_t vifi)
{
    struct vif *vifp;

    VIF_LOCK();

    if (vifi >= numvifs) {
	VIF_UNLOCK();
	return EINVAL;
    }
    vifp = &viftable[vifi];
    if (vifp->v_lcl_addr.s_addr == INADDR_ANY) {
	VIF_UNLOCK();
	return EADDRNOTAVAIL;
    }

    /* Physical interfaces leave allmulti mode; tunnels/register don't. */
    if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
	if_allmulti(vifp->v_ifp, 0);

    /* Invalidate the one-back de-encapsulation cache if it points here. */
    if (vifp == last_encap_vif) {
	last_encap_vif = NULL;
	last_encap_src = INADDR_ANY;
    }

    /*
     * Free packets queued at the interface
     */
    while (vifp->v_tbf->tbf_q) {
	struct mbuf *m = vifp->v_tbf->tbf_q;

	vifp->v_tbf->tbf_q = m->m_act;
	m_freem(m);
    }

#ifdef PIM
    if (vifp->v_flags & VIFF_REGISTER)
	reg_vif_num = VIFI_INVALID;
#endif

    bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf)));
    bzero((caddr_t)vifp, sizeof (*vifp));

    if (mrtdebug)
	log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);

    /*
 Adjust numvifs down: numvifs is one past the highest in-use slot. */
    for (vifi = numvifs; vifi > 0; vifi--)
	if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY)
	    break;
    numvifs = vifi;

    VIF_UNLOCK();

    return 0;
}

/*
 * update an mfc entry without resetting counters and S,G addresses.
 */
static void
update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
    int i;

    rt->mfc_parent = mfccp->mfcc_parent;
    for (i = 0; i < numvifs; i++) {
	rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
	/* Only flags enabled in the configured API may be set. */
	rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config &
	    MRT_MFC_FLAGS_ALL;
    }
    /* set the RP address */
    if (mrt_api_config & MRT_MFC_RP)
	rt->mfc_rp = mfccp->mfcc_rp;
    else
	rt->mfc_rp.s_addr = INADDR_ANY;
}

/*
 * fully initialize an mfc entry from the parameter.
 */
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
    rt->mfc_origin     = mfccp->mfcc_origin;
    rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;

    update_mfc_params(rt, mfccp);

    /* initialize pkt counters per src-grp */
    rt->mfc_pkt_cnt    = 0;
    rt->mfc_byte_cnt   = 0;
    rt->mfc_wrong_if   = 0;
    rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
}


/*
 * Add an mfc entry
 * (locking order: VIF lock before MFC lock, per the file-wide rule)
 */
static int
add_mfc(struct mfcctl2 *mfccp)
{
    struct mfc *rt;
    u_long hash;
    struct rtdetq *rte;
    u_short nstl;

    VIF_LOCK();
    MFC_LOCK();

    rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);

    /* If an entry already exists, just update the fields */
    if (rt) {
	if (mrtdebug & DEBUG_MFC)
	    log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
		(u_long)ntohl(mfccp->mfcc_origin.s_addr),
		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		mfccp->mfcc_parent);

	update_mfc_params(rt, mfccp);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return 0;
    }

    /*
     * Find the entry for which the upcall was made and update
     * (stalled entries are skipped by mfc_find above, so scan by hand).
     */
    hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
    for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {

	if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		(rt->mfc_stall != NULL)) {

	    if (nstl++)
		log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
		    "multiple kernel entries",
		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent, (void *)rt->mfc_stall);

	    if (mrtdebug & DEBUG_MFC)
		log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent, (void *)rt->mfc_stall);

	    init_mfc_params(rt, mfccp);

	    rt->mfc_expire = 0;	/* Don't clean this guy up */
	    nexpire[hash]--;

	    /* free packets Qed at the end of this entry */
	    for (rte = rt->mfc_stall; rte != NULL; ) {
		struct rtdetq *n = rte->next;

		/* Forward the stalled packet now that the route exists. */
		ip_mdq(rte->m, rte->ifp, rt, -1);
		m_freem(rte->m);
		free(rte, M_MRTABLE);
		rte = n;
	    }
	    rt->mfc_stall = NULL;
	}
    }

    /*
     * It is possible that an entry is being inserted without an upcall
     */
    if (nstl == 0) {
	if (mrtdebug & DEBUG_MFC)
	    log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
		hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		mfccp->mfcc_parent);

	for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
	    if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
		init_mfc_params(rt, mfccp);
		if (rt->mfc_expire)
		    nexpire[hash]--;
		rt->mfc_expire = 0;
		break; /* XXX */
	    }
	}
	if (rt == NULL) {		/* no upcall, so make a new entry */
	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
	    if (rt == NULL) {
		MFC_UNLOCK();
		VIF_UNLOCK();
		return ENOBUFS;
	    }

	    init_mfc_params(rt, mfccp);
	    rt->mfc_expire     = 0;
	    rt->mfc_stall      = NULL;

	    rt->mfc_bw_meter = NULL;
	    /* insert new entry at head of hash chain */
	    rt->mfc_next = mfctable[hash];
	    mfctable[hash] = rt;
	}
    }
    MFC_UNLOCK();
    VIF_UNLOCK();
    return 0;
}

/*
 * Delete an mfc entry
 */
static int
del_mfc(struct mfcctl2 *mfccp)
{
    struct in_addr	origin;
    struct in_addr	mcastgrp;
    struct mfc		*rt;
    struct mfc		**nptr;
    u_long		hash;
    struct bw_meter	*list;

    origin = mfccp->mfcc_origin;
    mcastgrp = mfccp->mfcc_mcastgrp;

    if (mrtdebug & DEBUG_MFC)
	log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
	    (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));

    MFC_LOCK();

    /* Walk the chain via a pointer-to-link so unlinking is O(1). */
    hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
    for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
	if (origin.s_addr == rt->mfc_origin.s_addr &&
		mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
		rt->mfc_stall == NULL)
	    break;
    if (rt == NULL) {
	MFC_UNLOCK();
	return EADDRNOTAVAIL;
    }

    *nptr = rt->mfc_next;

    /*
     * free the bw_meter entries
     * (detach the list before freeing the entry itself)
     */
    list = rt->mfc_bw_meter;
    rt->mfc_bw_meter = NULL;

    free(rt, M_MRTABLE);

    free_bw_list(list);

    MFC_UNLOCK();

    return 0;
}

/*
 * Send a message to mrouted on the multicast routing socket
 */
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
{
    if (s) {
	mtx_lock(&Giant);	/* XXX until sockets are locked */
	if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) {
	    sorwakeup(s);
	    mtx_unlock(&Giant);
	    return 0;
	}
mtx_unlock(&Giant); 1314 } 1315 m_freem(mm); 1316 return -1; 1317} 1318 1319/* 1320 * IP multicast forwarding function. This function assumes that the packet 1321 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1322 * pointed to by "ifp", and the packet is to be relayed to other networks 1323 * that have members of the packet's destination IP multicast group. 1324 * 1325 * The packet is returned unscathed to the caller, unless it is 1326 * erroneous, in which case a non-zero return value tells the caller to 1327 * discard it. 1328 */ 1329 1330#define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1331 1332static int 1333X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, 1334 struct ip_moptions *imo) 1335{ 1336 struct mfc *rt; 1337 int error; 1338 vifi_t vifi; 1339 1340 if (mrtdebug & DEBUG_FORWARD) 1341 log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n", 1342 (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), 1343 (void *)ifp); 1344 1345 if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || 1346 ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { 1347 /* 1348 * Packet arrived via a physical interface or 1349 * an encapsulated tunnel or a register_vif. 1350 */ 1351 } else { 1352 /* 1353 * Packet arrived through a source-route tunnel. 1354 * Source-route tunnels are no longer supported. 
	 */
	static int last_log;
	/* Rate-limit the complaint to once per second. */
	if (last_log != time_second) {
	    last_log = time_second;
	    log(LOG_ERR,
		"ip_mforward: received source-routed packet from %lx\n",
		(u_long)ntohl(ip->ip_src.s_addr));
	}
	return 1;
    }

    VIF_LOCK();
    MFC_LOCK();
    /* If the caller pinned the packet to a specific vif, honor it. */
    if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
	if (ip->ip_ttl < 255)
	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
	    struct vif *vifp = viftable + vifi;

	    printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
		(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
		vifi,
		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
		vifp->v_ifp->if_xname);
	}
	error = ip_mdq(m, ifp, NULL, vifi);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return error;
    }
    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
	printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
	    (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
	if (!imo)
	    printf("In fact, no options were specified at all\n");
    }

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) {
	MFC_UNLOCK();
	VIF_UNLOCK();
	return 0;
    }

    /*
     * Determine forwarding vifs from the forwarding cache table
     */
    ++mrtstat.mrts_mfc_lookups;
    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
	error = ip_mdq(m, ifp, rt, -1);
	MFC_UNLOCK();
	VIF_UNLOCK();
	return error;
    } else {
	/*
	 * If we don't have a route for packet's origin,
	 * Make a copy of the packet & send message to routing daemon
	 */

	struct mbuf *mb0;
	struct rtdetq *rte;
	u_long hash;
	int hlen = ip->ip_hl << 2;

	++mrtstat.mrts_mfc_misses;

	mrtstat.mrts_no_route++;
	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
	    log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
		(u_long)ntohl(ip->ip_src.s_addr),
		(u_long)ntohl(ip->ip_dst.s_addr));

	/*
	 * Allocate mbufs early so that we don't do extra work if we are
	 * just going to fail anyway.  Make sure to pullup the header so
	 * that other people can't step on it.
	 */
	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
	if (rte == NULL) {
	    MFC_UNLOCK();
	    VIF_UNLOCK();
	    return ENOBUFS;
	}
	mb0 = m_copypacket(m, M_DONTWAIT);
	if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
	    mb0 = m_pullup(mb0, hlen);
	if (mb0 == NULL) {
	    free(rte, M_MRTABLE);
	    MFC_UNLOCK();
	    VIF_UNLOCK();
	    return ENOBUFS;
	}

	/* is there an upcall waiting for this flow ? */
	hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
	for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
	    if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
		(ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
		(rt->mfc_stall != NULL))
		break;
	}

	if (rt == NULL) {
	    int i;
	    struct igmpmsg *im;
	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
	    struct mbuf *mm;

	    /*
	     * Locate the vifi for the incoming interface for this packet.
	     * If none found, drop packet.
	     */
	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
		;
	    if (vifi >= numvifs)	/* vif not found, drop packet */
		goto non_fatal;

	    /* no upcall, so make a new entry */
	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
	    if (rt == NULL)
		goto fail;
	    /* Make a copy of the header to send to the user level process */
	    mm = m_copy(mb0, 0, hlen);
	    if (mm == NULL)
		goto fail1;

	    /*
	     * Send message to routing daemon to install
	     * a route into the kernel table
	     */

	    im = mtod(mm, struct igmpmsg *);
	    im->im_msgtype = IGMPMSG_NOCACHE;
	    im->im_mbz = 0;
	    im->im_vif = vifi;

	    mrtstat.mrts_upcalls++;

	    k_igmpsrc.sin_addr = ip->ip_src;
	    if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
		++mrtstat.mrts_upq_sockfull;
		/* Unwind: the labels below free everything allocated above. */
fail1:
		free(rt, M_MRTABLE);
fail:
		free(rte, M_MRTABLE);
		m_freem(mb0);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return ENOBUFS;
	    }

	    /* insert new entry at head of hash chain */
	    rt->mfc_origin.s_addr = ip->ip_src.s_addr;
	    rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
	    rt->mfc_expire = UPCALL_EXPIRE;
	    nexpire[hash]++;
	    for (i = 0; i < numvifs; i++) {
		rt->mfc_ttls[i] = 0;
		rt->mfc_flags[i] = 0;
	    }
	    rt->mfc_parent = -1;

	    rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */

	    rt->mfc_bw_meter = NULL;

	    /* link into table */
	    rt->mfc_next = mfctable[hash];
	    mfctable[hash] = rt;
	    rt->mfc_stall = rte;

	} else {
	    /* determine if q has overflowed */
	    int npkts = 0;
	    struct rtdetq **p;

	    /*
	     * XXX ouch! we need to append to the list, but we
	     * only have a pointer to the front, so we have to
	     * scan the entire list every time.
	     */
	    for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
		npkts++;

	    if (npkts > MAX_UPQ) {
		mrtstat.mrts_upq_ovflw++;
non_fatal:
		free(rte, M_MRTABLE);
		m_freem(mb0);
		MFC_UNLOCK();
		VIF_UNLOCK();
		return 0;
	    }

	    /* Add this entry to the end of the queue */
	    *p = rte;
	}

	/* rte now owns the packet copy until the upcall is serviced. */
	rte->m = mb0;
	rte->ifp = ifp;
	rte->next = NULL;

	MFC_UNLOCK();
	VIF_UNLOCK();

	return 0;
    }
}

/*
 * Clean up the cache entry if upcall is not serviced.
 * Runs periodically from a callout (re-armed at the bottom); decrements
 * mfc_expire on stalled entries and reaps those that reach zero.
 */
static void
expire_upcalls(void *unused)
{
    struct rtdetq *rte;
    struct mfc *mfc, **nptr;
    int i;

    MFC_LOCK();
    for (i = 0; i < MFCTBLSIZ; i++) {
	/* nexpire[i] counts pending-upcall entries in this bucket. */
	if (nexpire[i] == 0)
	    continue;
	nptr = &mfctable[i];
	for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
	    /*
	     * Skip real cache entries
	     * Make sure it wasn't marked to not expire (shouldn't happen)
	     * If it expires now
	     */
	    if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
		--mfc->mfc_expire == 0) {
		if (mrtdebug & DEBUG_EXPIRE)
		    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
			(u_long)ntohl(mfc->mfc_origin.s_addr),
			(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
		/*
		 * drop all the packets
		 * free the mbuf with the pkt, if, timing info
		 */
		for (rte = mfc->mfc_stall; rte; ) {
		    struct rtdetq *n = rte->next;

		    m_freem(rte->m);
		    free(rte, M_MRTABLE);
		    rte = n;
		}
		++mrtstat.mrts_cache_cleanups;
		nexpire[i]--;

		/*
		 * free the bw_meter entries
		 */
		while (mfc->mfc_bw_meter != NULL) {
		    struct bw_meter *x = mfc->mfc_bw_meter;

		    mfc->mfc_bw_meter = x->bm_mfc_next;
		    free(x, M_BWMETER);
		}

		/* Unlink from the hash chain and release the entry. */
		*nptr = mfc->mfc_next;
		free(mfc, M_MRTABLE);
	    } else {
		nptr = &mfc->mfc_next;
	    }
	}
    }
    MFC_UNLOCK();

    /* Re-arm ourselves for the next scan. */
    callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL);
}

/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * Returns 1 when the packet was sent on the explicitly requested vif,
 * 0 on the normal path, or ENOBUFS if a WRONGVIF upcall could not be
 * queued.  Caller holds the VIF lock (asserted below).
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
    struct ip *ip = mtod(m, struct ip *);
    vifi_t vifi;
    int plen = ip->ip_len;

    VIF_LOCK_ASSERT();
/*
 * Macro to send packet on vif.  Since RSVP packets don't get counted on
 * input, they shouldn't get counted on output, so statistics keeping is
 * separate.
 */
#define MC_SEND(ip,vifp,m) {				\
    if ((vifp)->v_flags & VIFF_TUNNEL)			\
	encap_send((ip), (vifp), (m));			\
    else						\
	phyint_send((ip), (vifp), (m));			\
}

    /*
     * If xmt_vif is not -1, send on only the requested vif.
     *
     * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
     */
    if (xmt_vif < numvifs) {
#ifdef PIM
	if (viftable[xmt_vif].v_flags & VIFF_REGISTER)
	    pim_register_send(ip, viftable + xmt_vif, m, rt);
	else
#endif
	MC_SEND(ip, viftable + xmt_vif, m);
	return 1;
    }

    /*
     * Don't forward if it didn't arrive from the parent vif for its origin.
     */
    vifi = rt->mfc_parent;
    if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
	/* came in the wrong interface */
	if (mrtdebug & DEBUG_FORWARD)
	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
		(void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
	++mrtstat.mrts_wrong_if;
	++rt->mfc_wrong_if;
	/*
	 * If we are doing PIM assert processing, send a message
	 * to the routing daemon.
	 *
	 * XXX: A PIM-SM router needs the WRONGVIF detection so it
	 * can complete the SPT switch, regardless of the type
	 * of the iif (broadcast media, GRE tunnel, etc).
	 */
	if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) {
	    struct timeval now;
	    u_long delta;

#ifdef PIM
	    if (ifp == &multicast_register_if)
		pimstat.pims_rcv_registers_wrongiif++;
#endif

	    /* Get vifi for the incoming packet */
	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
		;
	    if (vifi >= numvifs)
		return 0;	/* The iif is not found: ignore the packet. */

	    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
		return 0;	/* WRONGVIF disabled: ignore the packet */

	    GET_TIME(now);

	    TV_DELTA(rt->mfc_last_assert, now, delta);

	    /* Rate-limit WRONGVIF upcalls to one per ASSERT_MSG_TIME. */
	    if (delta > ASSERT_MSG_TIME) {
		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
		struct igmpmsg *im;
		int hlen = ip->ip_hl << 2;
		struct mbuf *mm = m_copy(m, 0, hlen);

		if (mm && (M_HASCL(mm) || mm->m_len < hlen))
		    mm = m_pullup(mm, hlen);
		if (mm == NULL)
		    return ENOBUFS;

		rt->mfc_last_assert = now;

		im = mtod(mm, struct igmpmsg *);
		im->im_msgtype = IGMPMSG_WRONGVIF;
		im->im_mbz = 0;
		im->im_vif = vifi;

		mrtstat.mrts_upcalls++;

		k_igmpsrc.sin_addr = im->im_src;
		if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
		    log(LOG_WARNING,
			"ip_mforward: ip_mrouter socket queue full\n");
		    ++mrtstat.mrts_upq_sockfull;
		    return ENOBUFS;
		}
	    }
	}
	return 0;
    }

    /* If I sourced this packet, it counts as output, else it was input. */
    if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
	viftable[vifi].v_pkt_out++;
	viftable[vifi].v_bytes_out += plen;
    } else {
	viftable[vifi].v_pkt_in++;
	viftable[vifi].v_bytes_in += plen;
    }
    rt->mfc_pkt_cnt++;
    rt->mfc_byte_cnt += plen;

    /*
     * For each vif, decide if a copy of the packet should be forwarded.
     * Forward if:
     *		- the ttl exceeds the vif's threshold
     *		- there are group members downstream on interface
     */
    for (vifi = 0; vifi < numvifs; vifi++)
	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
	    viftable[vifi].v_pkt_out++;
	    viftable[vifi].v_bytes_out += plen;
#ifdef PIM
	    if (viftable[vifi].v_flags & VIFF_REGISTER)
		pim_register_send(ip, viftable + vifi, m, rt);
	    else
#endif
	    MC_SEND(ip, viftable+vifi, m);
	}

    /*
     * Perform upcall-related bw measuring.
     */
    if (rt->mfc_bw_meter != NULL) {
	struct bw_meter *x;
	struct timeval now;

	GET_TIME(now);
	MFC_LOCK_ASSERT();
	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
	    bw_meter_receive_packet(x, plen, &now);
    }

    return 0;
}

/*
 * check if a vif number is legal/ok. This is used by ip_output.
 */
static int
X_legal_vif_num(int vif)
{
    /* XXX unlocked, matter? */
    return (vif >= 0 && vif < numvifs);
}

/*
 * Return the local address used by this vif
 */
static u_long
X_ip_mcast_src(int vifi)
{
    /* XXX unlocked, matter? */
    if (vifi >= 0 && vifi < numvifs)
	return viftable[vifi].v_lcl_addr.s_addr;
    else
	return INADDR_ANY;
}

/*
 * Transmit a packet on a physical (non-tunnel) vif, subject to the
 * vif's token-bucket rate limit.  Caller holds the VIF lock.
 */
static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
    struct mbuf *mb_copy;
    int hlen = ip->ip_hl << 2;

    VIF_LOCK_ASSERT();

    /*
     * Make a new reference to the packet; make sure that
     * the IP header is actually copied, not just referenced,
     * so that ip_output() only scribbles on the copy.
     */
    mb_copy = m_copypacket(m, M_DONTWAIT);
    if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
	mb_copy = m_pullup(mb_copy, hlen);
    if (mb_copy == NULL)
	return;

    if (vifp->v_rate_limit == 0)
	tbf_send_packet(vifp, mb_copy);
    else
	tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
}

/*
 * Transmit a packet on a tunnel vif: prepend an encapsulating IP header
 * (multicast_encap_iphdr) and hand the result to the token-bucket filter.
 * Caller holds the VIF lock.
 */
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
    struct mbuf *mb_copy;
    struct ip *ip_copy;
    int i, len = ip->ip_len;

    VIF_LOCK_ASSERT();

    /* Take care of delayed checksums */
    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
	in_delayed_cksum(m);
	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
    }

    /*
     * copy the old packet & pullup its IP header into the
     * new mbuf so we can modify it.  Try to fill the new
     * mbuf since if we don't the ethernet driver will.
     */
    MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER);
    if (mb_copy == NULL)
	return;
#ifdef MAC
    mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy);
#endif
    mb_copy->m_data += max_linkhdr;
    mb_copy->m_len = sizeof(multicast_encap_iphdr);

    if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
	m_freem(mb_copy);
	return;
    }
    i = MHLEN - M_LEADINGSPACE(mb_copy);
    if (i > len)
	i = len;
    mb_copy = m_pullup(mb_copy, i);
    if (mb_copy == NULL)
	return;
    mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr);

    /*
     * fill in the encapsulating IP header.
     */
    ip_copy = mtod(mb_copy, struct ip *);
    *ip_copy = multicast_encap_iphdr;
#ifdef RANDOM_IP_ID
    ip_copy->ip_id = ip_randomid();
#else
    ip_copy->ip_id = htons(ip_id++);
#endif
    ip_copy->ip_len += len;
    ip_copy->ip_src = vifp->v_lcl_addr;
    ip_copy->ip_dst = vifp->v_rmt_addr;

    /*
     * turn the encapsulated IP header back into a valid one.
     * The inner header's length/offset go back to network order and its
     * checksum is recomputed over just the inner header bytes (m_data is
     * temporarily advanced past the outer header for in_cksum).
     */
    ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
    --ip->ip_ttl;
    ip->ip_len = htons(ip->ip_len);
    ip->ip_off = htons(ip->ip_off);
    ip->ip_sum = 0;
    mb_copy->m_data += sizeof(multicast_encap_iphdr);
    ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
    mb_copy->m_data -= sizeof(multicast_encap_iphdr);

    if (vifp->v_rate_limit == 0)
	tbf_send_packet(vifp, mb_copy);
    else
	tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
}

/*
 * Token bucket filter module
 */

/*
 * Entry point of the token-bucket filter: either send the packet now
 * (enough tokens), queue it, or — when the queue is full — evict a
 * lower-priority packet to make room.  Caller holds the VIF lock.
 */
static void
tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len)
{
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    if (p_len > MAX_BKT_SIZE) {		/* drop if packet is too large */
	mrtstat.mrts_pkt2large++;
	m_freem(m);
	return;
    }

    tbf_update_tokens(vifp);

    if (t->tbf_q_len == 0) {		/* queue empty...	*/
	if (p_len <= t->tbf_n_tok) {	/* send packet if enough tokens */
	    t->tbf_n_tok -= p_len;
	    tbf_send_packet(vifp, m);
	} else {			/* no, queue packet and try later */
	    tbf_queue(vifp, m);
	    callout_reset(&tbf_reprocess_ch, TBF_REPROCESS,
		tbf_reprocess_q, vifp);
	}
    } else if (t->tbf_q_len < t->tbf_max_q_len) {
	/* finite queue length, so queue pkts and process queue */
	tbf_queue(vifp, m);
	tbf_process_q(vifp);
    } else {
	/* queue full, try to dq and queue and process */
	if (!tbf_dq_sel(vifp, ip)) {
	    mrtstat.mrts_q_overflow++;
	    m_freem(m);
	} else {
	    tbf_queue(vifp, m);
	    tbf_process_q(vifp);
	}
    }
}

/*
 * adds a packet to the queue at the interface.
 * The queue is a singly-linked list through m_act, with tbf_q the head
 * and tbf_t the tail.
 */
static void
tbf_queue(struct vif *vifp, struct mbuf *m)
{
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    if (t->tbf_t == NULL)	/* Queue was empty */
	t->tbf_q = m;
    else			/* Insert at tail */
	t->tbf_t->m_act = m;

    t->tbf_t = m;		/* Set new tail pointer */

#ifdef DIAGNOSTIC
    /* Make sure we didn't get fed a bogus mbuf */
    if (m->m_act)
	panic("tbf_queue: m_act");
#endif
    m->m_act = NULL;

    t->tbf_q_len++;
}

/*
 * processes the queue at the interface
 */
static void
tbf_process_q(struct vif *vifp)
{
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    /* loop through the queue at the interface and send as many packets
     * as possible
     */
    while (t->tbf_q_len > 0) {
	struct mbuf *m = t->tbf_q;
	int len = mtod(m, struct ip *)->ip_len;

	/* determine if the packet can be sent */
	if (len > t->tbf_n_tok)	/* not enough tokens, we are done */
	    break;
	/* ok, reduce no of tokens, dequeue and send the packet. */
	t->tbf_n_tok -= len;

	t->tbf_q = m->m_act;
	if (--t->tbf_q_len == 0)
	    t->tbf_t = NULL;

	m->m_act = NULL;
	tbf_send_packet(vifp, m);
    }
}

/*
 * Callout handler: replenish tokens and drain the queue, re-arming
 * itself while packets remain queued.
 */
static void
tbf_reprocess_q(void *xvifp)
{
    struct vif *vifp = xvifp;

    /* The router may have been shut down since the callout was armed. */
    if (ip_mrouter == NULL)
	return;
    VIF_LOCK();
    tbf_update_tokens(vifp);
    tbf_process_q(vifp);
    if (vifp->v_tbf->tbf_q_len)
	callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp);
    VIF_UNLOCK();
}

/* function that will selectively discard a member of the queue
 * based on the precedence value and the priority
 */
static int
tbf_dq_sel(struct vif *vifp, struct ip *ip)
{
    u_int p;
    struct mbuf *m, *last;
    struct mbuf **np;
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    p = priority(vifp, ip);

    np = &t->tbf_q;
    last = NULL;
    while ((m = *np) != NULL) {
	/* Drop the first queued packet with lower priority than ours. */
	if (p > priority(vifp, mtod(m, struct ip *))) {
	    *np = m->m_act;
	    /* If we're removing the last packet, fix the tail pointer */
	    if (m == t->tbf_t)
		t->tbf_t = last;
	    m_freem(m);
	    /* It's impossible for the queue to be empty, but check anyways. */
	    if (--t->tbf_q_len == 0)
		t->tbf_t = NULL;
	    mrtstat.mrts_drop_sel++;
	    return 1;
	}
	np = &m->m_act;
	last = m;
    }
    return 0;
}

/*
 * Hand a packet to ip_output(), either via the tunnel route or as a
 * plain multicast transmission on the vif's interface.
 */
static void
tbf_send_packet(struct vif *vifp, struct mbuf *m)
{
    VIF_LOCK_ASSERT();

    if (vifp->v_flags & VIFF_TUNNEL)	/* If tunnel options */
	ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
    else {
	struct ip_moptions imo;
	int error;
	static struct route ro; /* XXX check this */

	imo.imo_multicast_ifp  = vifp->v_ifp;
	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
	imo.imo_multicast_loop = 1;
	imo.imo_multicast_vif  = -1;

	/*
	 * Re-entrancy should not be a problem here, because
	 * the packets that we send out and are looped back at us
	 * should get rejected because they appear to come from
	 * the loopback interface, thus preventing looping.
	 */
	error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL);

	if (mrtdebug & DEBUG_XMIT)
	    log(LOG_DEBUG, "phyint_send on vif %d err %d\n",
		(int)(vifp - viftable), error);
    }
}

/* determine the current time and then
 * the elapsed time (between the last time and time now)
 * in milliseconds & update the no. of tokens in the bucket
 */
static void
tbf_update_tokens(struct vif *vifp)
{
    struct timeval tp;
    u_long tm;
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    GET_TIME(tp);

    TV_DELTA(tp, t->tbf_last_pkt_t, tm);

    /*
     * This formula is actually
     * "time in seconds" * "bytes/second".
     *
     * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
     *
     * The (1000/1024) was introduced in add_vif to optimize
     * this divide into a shift.
     */
    t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8;
    t->tbf_last_pkt_t = tp;

    /* Cap the bucket so long idle periods don't accumulate a huge burst. */
    if (t->tbf_n_tok > MAX_BKT_SIZE)
	t->tbf_n_tok = MAX_BKT_SIZE;
}

/*
 * Classify a packet for tbf_dq_sel(): higher value == higher priority.
 */
static int
priority(struct vif *vifp, struct ip *ip)
{
    int prio = 50; /* the lowest priority -- default case */

    /* temporary hack; may add general packet classifier some day */

    /*
     * The UDP port space is divided up into four priority ranges:
     * [0, 16384)     : unclassified - lowest priority
     * [16384, 32768) : audio - highest priority
     * [32768, 49152) : whiteboard - medium priority
     * [49152, 65536) : video - low priority
     *
     * Everything else gets lowest priority.
     */
    if (ip->ip_p == IPPROTO_UDP) {
	struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
	switch (ntohs(udp->uh_dport) & 0xc000) {
	case 0x4000:
	    prio = 70;
	    break;
	case 0x8000:
	    prio = 60;
	    break;
	case 0xc000:
	    prio = 55;
	    break;
	}
    }
    return prio;
}

/*
 * End of token bucket filter modifications
 */

/*
 * Attach (IP_RSVP_VIF_ON) or detach (otherwise) an RSVP socket to/from
 * a vif, keeping the global rsvp_on counter consistent.
 */
static int
X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
{
    int error, vifi;

    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
	return EOPNOTSUPP;

    error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
    if (error)
	return error;

    VIF_LOCK();

    if (vifi < 0 || vifi >= numvifs) {	/* Error if vif is invalid */
	VIF_UNLOCK();
	return EADDRNOTAVAIL;
    }

    if (sopt->sopt_name == IP_RSVP_VIF_ON) {
	/* Check if socket is available. */
	if (viftable[vifi].v_rsvpd != NULL) {
	    VIF_UNLOCK();
	    return EADDRINUSE;
	}

	viftable[vifi].v_rsvpd = so;
	/* This may seem silly, but we need to be sure we don't over-increment
	 * the RSVP counter, in case something slips up.
	 */
	if (!viftable[vifi].v_rsvp_on) {
	    viftable[vifi].v_rsvp_on = 1;
	    rsvp_on++;
	}
    } else { /* must be VIF_OFF */
	/*
	 * XXX as an additional consistency check, one could make sure
	 * that viftable[vifi].v_rsvpd == so, otherwise passing so as
	 * first parameter is pretty useless.
	 */
	viftable[vifi].v_rsvpd = NULL;
	/*
	 * This may seem silly, but we need to be sure we don't over-decrement
	 * the RSVP counter, in case something slips up.
	 */
	if (viftable[vifi].v_rsvp_on) {
	    viftable[vifi].v_rsvp_on = 0;
	    rsvp_on--;
	}
    }
    VIF_UNLOCK();
    return 0;
}

/*
 * Detach a closing RSVP socket from every vif it is attached to.
 */
static void
X_ip_rsvp_force_done(struct socket *so)
{
    int vifi;

    /* Don't bother if it is not the right type of socket. */
    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
	return;

    VIF_LOCK();

    /* The socket may be attached to more than one vif...this
     * is perfectly legal.
     */
    for (vifi = 0; vifi < numvifs; vifi++) {
	if (viftable[vifi].v_rsvpd == so) {
	    viftable[vifi].v_rsvpd = NULL;
	    /* This may seem silly, but we need to be sure we don't
	     * over-decrement the RSVP counter, in case something slips up.
	     */
	    if (viftable[vifi].v_rsvp_on) {
		viftable[vifi].v_rsvp_on = 0;
		rsvp_on--;
	    }
	}
    }

    VIF_UNLOCK();
}

/*
 * Deliver an incoming RSVP packet to the socket registered for the
 * arrival vif, falling back to the old-style global RSVP socket.
 * Consumes the mbuf on every path (directly or via socket_send/rip_input).
 */
static void
X_rsvp_input(struct mbuf *m, int off)
{
    int vifi;
    struct ip *ip = mtod(m, struct ip *);
    struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
    struct ifnet *ifp;

    if (rsvpdebug)
	printf("rsvp_input: rsvp_on %d\n",rsvp_on);

    /* Can still get packets with rsvp_on = 0 if there is a local member
     * of the group to which the RSVP packet is addressed.  But in this
     * case we want to throw the packet away.
     */
    if (!rsvp_on) {
	m_freem(m);
	return;
    }

    if (rsvpdebug)
	printf("rsvp_input: check vifs\n");

#ifdef DIAGNOSTIC
    M_ASSERTPKTHDR(m);
#endif

    ifp = m->m_pkthdr.rcvif;

    VIF_LOCK();
    /* Find which vif the packet arrived on. */
    for (vifi = 0; vifi < numvifs; vifi++)
	if (viftable[vifi].v_ifp == ifp)
	    break;

    if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
	/*
	 * Drop the lock here to avoid holding it across rip_input.
	 * This could make rsvpdebug printfs wrong.  If you care,
	 * record the state of stuff before dropping the lock.
	 */
	VIF_UNLOCK();
	/*
	 * If the old-style non-vif-associated socket is set,
	 * then use it.  Otherwise, drop packet since there
	 * is no specific socket for this vif.
	 */
	if (ip_rsvpd != NULL) {
	    if (rsvpdebug)
		printf("rsvp_input: Sending packet up old-style socket\n");
	    rip_input(m, off);  /* xxx */
	} else {
	    if (rsvpdebug && vifi == numvifs)
		printf("rsvp_input: Can't find vif for packet.\n");
	    else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
		printf("rsvp_input: No socket defined for vif %d\n",vifi);
	    m_freem(m);
	}
	return;
    }
    rsvp_src.sin_addr = ip->ip_src;

    if (rsvpdebug && m)
	printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
	       m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));

    if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
	if (rsvpdebug)
	    printf("rsvp_input: Failed to append to socket\n");
    } else {
	if (rsvpdebug)
	    printf("rsvp_input: send packet up\n");
    }
    VIF_UNLOCK();
}

/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
2335#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) 2336 2337static uint32_t 2338compute_bw_meter_flags(struct bw_upcall *req) 2339{ 2340 uint32_t flags = 0; 2341 2342 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2343 flags |= BW_METER_UNIT_PACKETS; 2344 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2345 flags |= BW_METER_UNIT_BYTES; 2346 if (req->bu_flags & BW_UPCALL_GEQ) 2347 flags |= BW_METER_GEQ; 2348 if (req->bu_flags & BW_UPCALL_LEQ) 2349 flags |= BW_METER_LEQ; 2350 2351 return flags; 2352} 2353 2354/* 2355 * Add a bw_meter entry 2356 */ 2357static int 2358add_bw_upcall(struct bw_upcall *req) 2359{ 2360 struct mfc *mfc; 2361 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2362 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2363 struct timeval now; 2364 struct bw_meter *x; 2365 uint32_t flags; 2366 2367 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2368 return EOPNOTSUPP; 2369 2370 /* Test if the flags are valid */ 2371 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2372 return EINVAL; 2373 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2374 return EINVAL; 2375 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2376 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2377 return EINVAL; 2378 2379 /* Test if the threshold time interval is valid */ 2380 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2381 return EINVAL; 2382 2383 flags = compute_bw_meter_flags(req); 2384 2385 /* 2386 * Find if we have already same bw_meter entry 2387 */ 2388 MFC_LOCK(); 2389 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2390 if (mfc == NULL) { 2391 MFC_UNLOCK(); 2392 return EADDRNOTAVAIL; 2393 } 2394 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2395 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2396 &req->bu_threshold.b_time, ==)) && 2397 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2398 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2399 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 
2400 MFC_UNLOCK(); 2401 return 0; /* XXX Already installed */ 2402 } 2403 } 2404 2405 /* Allocate the new bw_meter entry */ 2406 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 2407 if (x == NULL) { 2408 MFC_UNLOCK(); 2409 return ENOBUFS; 2410 } 2411 2412 /* Set the new bw_meter entry */ 2413 x->bm_threshold.b_time = req->bu_threshold.b_time; 2414 GET_TIME(now); 2415 x->bm_start_time = now; 2416 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2417 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2418 x->bm_measured.b_packets = 0; 2419 x->bm_measured.b_bytes = 0; 2420 x->bm_flags = flags; 2421 x->bm_time_next = NULL; 2422 x->bm_time_hash = BW_METER_BUCKETS; 2423 2424 /* Add the new bw_meter entry to the front of entries for this MFC */ 2425 x->bm_mfc = mfc; 2426 x->bm_mfc_next = mfc->mfc_bw_meter; 2427 mfc->mfc_bw_meter = x; 2428 schedule_bw_meter(x, &now); 2429 MFC_UNLOCK(); 2430 2431 return 0; 2432} 2433 2434static void 2435free_bw_list(struct bw_meter *list) 2436{ 2437 while (list != NULL) { 2438 struct bw_meter *x = list; 2439 2440 list = list->bm_mfc_next; 2441 unschedule_bw_meter(x); 2442 free(x, M_BWMETER); 2443 } 2444} 2445 2446/* 2447 * Delete one or multiple bw_meter entries 2448 */ 2449static int 2450del_bw_upcall(struct bw_upcall *req) 2451{ 2452 struct mfc *mfc; 2453 struct bw_meter *x; 2454 2455 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2456 return EOPNOTSUPP; 2457 2458 MFC_LOCK(); 2459 /* Find the corresponding MFC entry */ 2460 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2461 if (mfc == NULL) { 2462 MFC_UNLOCK(); 2463 return EADDRNOTAVAIL; 2464 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2465 /* 2466 * Delete all bw_meter entries for this mfc 2467 */ 2468 struct bw_meter *list; 2469 2470 list = mfc->mfc_bw_meter; 2471 mfc->mfc_bw_meter = NULL; 2472 free_bw_list(list); 2473 MFC_UNLOCK(); 2474 return 0; 2475 } else { /* Delete a single bw_meter entry */ 2476 struct bw_meter *prev; 2477 uint32_t flags = 
0; 2478 2479 flags = compute_bw_meter_flags(req); 2480 2481 /* Find the bw_meter entry to delete */ 2482 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2483 x = x->bm_mfc_next) { 2484 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2485 &req->bu_threshold.b_time, ==)) && 2486 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2487 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2488 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2489 break; 2490 } 2491 if (x != NULL) { /* Delete entry from the list for this MFC */ 2492 if (prev != NULL) 2493 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 2494 else 2495 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 2496 2497 unschedule_bw_meter(x); 2498 MFC_UNLOCK(); 2499 /* Free the bw_meter entry */ 2500 free(x, M_BWMETER); 2501 return 0; 2502 } else { 2503 MFC_UNLOCK(); 2504 return EINVAL; 2505 } 2506 } 2507 /* NOTREACHED */ 2508} 2509 2510/* 2511 * Perform bandwidth measurement processing that may result in an upcall 2512 */ 2513static void 2514bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2515{ 2516 struct timeval delta; 2517 2518 MFC_LOCK_ASSERT(); 2519 2520 delta = *nowp; 2521 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2522 2523 if (x->bm_flags & BW_METER_GEQ) { 2524 /* 2525 * Processing for ">=" type of bw_meter entry 2526 */ 2527 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2528 /* Reset the bw_meter entry */ 2529 x->bm_start_time = *nowp; 2530 x->bm_measured.b_packets = 0; 2531 x->bm_measured.b_bytes = 0; 2532 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2533 } 2534 2535 /* Record that a packet is received */ 2536 x->bm_measured.b_packets++; 2537 x->bm_measured.b_bytes += plen; 2538 2539 /* 2540 * Test if we should deliver an upcall 2541 */ 2542 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2543 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2544 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 2545 ((x->bm_flags & 
BW_METER_UNIT_BYTES) && 2546 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 2547 /* Prepare an upcall for delivery */ 2548 bw_meter_prepare_upcall(x, nowp); 2549 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2550 } 2551 } 2552 } else if (x->bm_flags & BW_METER_LEQ) { 2553 /* 2554 * Processing for "<=" type of bw_meter entry 2555 */ 2556 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2557 /* 2558 * We are behind time with the multicast forwarding table 2559 * scanning for "<=" type of bw_meter entries, so test now 2560 * if we should deliver an upcall. 2561 */ 2562 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2563 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2564 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2565 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2566 /* Prepare an upcall for delivery */ 2567 bw_meter_prepare_upcall(x, nowp); 2568 } 2569 /* Reschedule the bw_meter entry */ 2570 unschedule_bw_meter(x); 2571 schedule_bw_meter(x, nowp); 2572 } 2573 2574 /* Record that a packet is received */ 2575 x->bm_measured.b_packets++; 2576 x->bm_measured.b_bytes += plen; 2577 2578 /* 2579 * Test if we should restart the measuring interval 2580 */ 2581 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2582 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2583 (x->bm_flags & BW_METER_UNIT_BYTES && 2584 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2585 /* Don't restart the measuring interval */ 2586 } else { 2587 /* Do restart the measuring interval */ 2588 /* 2589 * XXX: note that we don't unschedule and schedule, because this 2590 * might be too much overhead per packet. Instead, when we process 2591 * all entries for a given timer hash bin, we check whether it is 2592 * really a timeout. If not, we reschedule at that time. 
2593 */ 2594 x->bm_start_time = *nowp; 2595 x->bm_measured.b_packets = 0; 2596 x->bm_measured.b_bytes = 0; 2597 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2598 } 2599 } 2600} 2601 2602/* 2603 * Prepare a bandwidth-related upcall 2604 */ 2605static void 2606bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2607{ 2608 struct timeval delta; 2609 struct bw_upcall *u; 2610 2611 MFC_LOCK_ASSERT(); 2612 2613 /* 2614 * Compute the measured time interval 2615 */ 2616 delta = *nowp; 2617 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2618 2619 /* 2620 * If there are too many pending upcalls, deliver them now 2621 */ 2622 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2623 bw_upcalls_send(); 2624 2625 /* 2626 * Set the bw_upcall entry 2627 */ 2628 u = &bw_upcalls[bw_upcalls_n++]; 2629 u->bu_src = x->bm_mfc->mfc_origin; 2630 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2631 u->bu_threshold.b_time = x->bm_threshold.b_time; 2632 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2633 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2634 u->bu_measured.b_time = delta; 2635 u->bu_measured.b_packets = x->bm_measured.b_packets; 2636 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2637 u->bu_flags = 0; 2638 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2639 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2640 if (x->bm_flags & BW_METER_UNIT_BYTES) 2641 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2642 if (x->bm_flags & BW_METER_GEQ) 2643 u->bu_flags |= BW_UPCALL_GEQ; 2644 if (x->bm_flags & BW_METER_LEQ) 2645 u->bu_flags |= BW_UPCALL_LEQ; 2646} 2647 2648/* 2649 * Send the pending bandwidth-related upcalls 2650 */ 2651static void 2652bw_upcalls_send(void) 2653{ 2654 struct mbuf *m; 2655 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2656 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2657 static struct igmpmsg igmpmsg = { 0, /* unused1 */ 2658 0, /* unused2 */ 2659 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2660 0, /* im_mbz */ 2661 0, /* im_vif */ 2662 0, /* unused3 */ 2663 { 0 }, /* im_src */ 2664 
{ 0 } }; /* im_dst */ 2665 2666 MFC_LOCK_ASSERT(); 2667 2668 if (bw_upcalls_n == 0) 2669 return; /* No pending upcalls */ 2670 2671 bw_upcalls_n = 0; 2672 2673 /* 2674 * Allocate a new mbuf, initialize it with the header and 2675 * the payload for the pending calls. 2676 */ 2677 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2678 if (m == NULL) { 2679 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2680 return; 2681 } 2682 2683 m->m_len = m->m_pkthdr.len = 0; 2684 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2685 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2686 2687 /* 2688 * Send the upcalls 2689 * XXX do we need to set the address in k_igmpsrc ? 2690 */ 2691 mrtstat.mrts_upcalls++; 2692 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2693 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2694 ++mrtstat.mrts_upq_sockfull; 2695 } 2696} 2697 2698/* 2699 * Compute the timeout hash value for the bw_meter entries 2700 */ 2701#define BW_METER_TIMEHASH(bw_meter, hash) \ 2702 do { \ 2703 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2704 \ 2705 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2706 (hash) = next_timeval.tv_sec; \ 2707 if (next_timeval.tv_usec) \ 2708 (hash)++; /* XXX: make sure we don't timeout early */ \ 2709 (hash) %= BW_METER_BUCKETS; \ 2710 } while (0) 2711 2712/* 2713 * Schedule a timer to process periodically bw_meter entry of type "<=" 2714 * by linking the entry in the proper hash bucket. 
2715 */ 2716static void 2717schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2718{ 2719 int time_hash; 2720 2721 MFC_LOCK_ASSERT(); 2722 2723 if (!(x->bm_flags & BW_METER_LEQ)) 2724 return; /* XXX: we schedule timers only for "<=" entries */ 2725 2726 /* 2727 * Reset the bw_meter entry 2728 */ 2729 x->bm_start_time = *nowp; 2730 x->bm_measured.b_packets = 0; 2731 x->bm_measured.b_bytes = 0; 2732 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2733 2734 /* 2735 * Compute the timeout hash value and insert the entry 2736 */ 2737 BW_METER_TIMEHASH(x, time_hash); 2738 x->bm_time_next = bw_meter_timers[time_hash]; 2739 bw_meter_timers[time_hash] = x; 2740 x->bm_time_hash = time_hash; 2741} 2742 2743/* 2744 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2745 * by removing the entry from the proper hash bucket. 2746 */ 2747static void 2748unschedule_bw_meter(struct bw_meter *x) 2749{ 2750 int time_hash; 2751 struct bw_meter *prev, *tmp; 2752 2753 MFC_LOCK_ASSERT(); 2754 2755 if (!(x->bm_flags & BW_METER_LEQ)) 2756 return; /* XXX: we schedule timers only for "<=" entries */ 2757 2758 /* 2759 * Compute the timeout hash value and delete the entry 2760 */ 2761 time_hash = x->bm_time_hash; 2762 if (time_hash >= BW_METER_BUCKETS) 2763 return; /* Entry was not scheduled */ 2764 2765 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2766 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2767 if (tmp == x) 2768 break; 2769 2770 if (tmp == NULL) 2771 panic("unschedule_bw_meter: bw_meter entry not found"); 2772 2773 if (prev != NULL) 2774 prev->bm_time_next = x->bm_time_next; 2775 else 2776 bw_meter_timers[time_hash] = x->bm_time_next; 2777 2778 x->bm_time_next = NULL; 2779 x->bm_time_hash = BW_METER_BUCKETS; 2780} 2781 2782 2783/* 2784 * Process all "<=" type of bw_meter that should be processed now, 2785 * and for each entry prepare an upcall if necessary. Each processed 2786 * entry is rescheduled again for the (periodic) processing. 
2787 * 2788 * This is run periodically (once per second normally). On each round, 2789 * all the potentially matching entries are in the hash slot that we are 2790 * looking at. 2791 */ 2792static void 2793bw_meter_process() 2794{ 2795 static uint32_t last_tv_sec; /* last time we processed this */ 2796 2797 uint32_t loops; 2798 int i; 2799 struct timeval now, process_endtime; 2800 2801 GET_TIME(now); 2802 if (last_tv_sec == now.tv_sec) 2803 return; /* nothing to do */ 2804 2805 loops = now.tv_sec - last_tv_sec; 2806 last_tv_sec = now.tv_sec; 2807 if (loops > BW_METER_BUCKETS) 2808 loops = BW_METER_BUCKETS; 2809 2810 MFC_LOCK(); 2811 /* 2812 * Process all bins of bw_meter entries from the one after the last 2813 * processed to the current one. On entry, i points to the last bucket 2814 * visited, so we need to increment i at the beginning of the loop. 2815 */ 2816 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2817 struct bw_meter *x, *tmp_list; 2818 2819 if (++i >= BW_METER_BUCKETS) 2820 i = 0; 2821 2822 /* Disconnect the list of bw_meter entries from the bin */ 2823 tmp_list = bw_meter_timers[i]; 2824 bw_meter_timers[i] = NULL; 2825 2826 /* Process the list of bw_meter entries */ 2827 while (tmp_list != NULL) { 2828 x = tmp_list; 2829 tmp_list = tmp_list->bm_time_next; 2830 2831 /* Test if the time interval is over */ 2832 process_endtime = x->bm_start_time; 2833 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2834 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2835 /* Not yet: reschedule, but don't reset */ 2836 int time_hash; 2837 2838 BW_METER_TIMEHASH(x, time_hash); 2839 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2840 /* 2841 * XXX: somehow the bin processing is a bit ahead of time. 2842 * Put the entry in the next bin. 
2843 */ 2844 if (++time_hash >= BW_METER_BUCKETS) 2845 time_hash = 0; 2846 } 2847 x->bm_time_next = bw_meter_timers[time_hash]; 2848 bw_meter_timers[time_hash] = x; 2849 x->bm_time_hash = time_hash; 2850 2851 continue; 2852 } 2853 2854 /* 2855 * Test if we should deliver an upcall 2856 */ 2857 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2858 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2859 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2860 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2861 /* Prepare an upcall for delivery */ 2862 bw_meter_prepare_upcall(x, &now); 2863 } 2864 2865 /* 2866 * Reschedule for next processing 2867 */ 2868 schedule_bw_meter(x, &now); 2869 } 2870 } 2871 2872 /* Send all upcalls that are pending delivery */ 2873 bw_upcalls_send(); 2874 2875 MFC_UNLOCK(); 2876} 2877 2878/* 2879 * A periodic function for sending all upcalls that are pending delivery 2880 */ 2881static void 2882expire_bw_upcalls_send(void *unused) 2883{ 2884 MFC_LOCK(); 2885 bw_upcalls_send(); 2886 MFC_UNLOCK(); 2887 2888 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2889 expire_bw_upcalls_send, NULL); 2890} 2891 2892/* 2893 * A periodic function for periodic scanning of the multicast forwarding 2894 * table for processing all "<=" bw_meter entries. 
 */
static void
expire_bw_meter_process(void *unused)
{
    /* Only scan when the advanced bw-upcall API has been enabled. */
    if (mrt_api_config & MRT_MFC_BW_UPCALL)
	bw_meter_process();

    /* Self-rearming callout, fires every BW_METER_PERIOD ticks. */
    callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL);
}

/*
 * End of bandwidth monitoring code
 */

#ifdef PIM
/*
 * Send the packet up to the user daemon, or eventually do kernel encapsulation
 *
 * Prepares (and if needed fragments) a copy of the data packet, then for
 * each fragment either encapsulates it in a PIM Register destined to the
 * RP (advanced API with a configured RP) or delivers it to the daemon as
 * an IGMPMSG_WHOLEPKT upcall.  Returns 0 or ENOBUFS.
 */
static int
pim_register_send(struct ip *ip, struct vif *vifp,
	struct mbuf *m, struct mfc *rt)
{
    struct mbuf *mb_copy, *mm;

    if (mrtdebug & DEBUG_PIM)
	log(LOG_DEBUG, "pim_register_send: ");

    mb_copy = pim_register_prepare(ip, m);
    if (mb_copy == NULL)
	return ENOBUFS;

    /*
     * Send all the fragments. Note that the mbuf for each fragment
     * is freed by the sending machinery.
     */
    for (mm = mb_copy; mm; mm = mb_copy) {
	/* detach the fragment from the m_nextpkt chain before sending */
	mb_copy = mm->m_nextpkt;
	mm->m_nextpkt = 0;
	mm = m_pullup(mm, sizeof(struct ip));
	if (mm != NULL) {
	    ip = mtod(mm, struct ip *);
	    if ((mrt_api_config & MRT_MFC_RP) &&
		(rt->mfc_rp.s_addr != INADDR_ANY)) {
		pim_register_send_rp(ip, vifp, mm, rt);
	    } else {
		pim_register_send_upcall(ip, vifp, mm, rt);
	    }
	}
	/* NOTE(review): a failed m_pullup() frees mm, so nothing leaks here */
    }

    return 0;
}

/*
 * Return a copy of the data packet that is ready for PIM Register
 * encapsulation.
 * XXX: Note that in the returned copy the IP header is a valid one.
 */
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
    struct mbuf *mb_copy = NULL;
    int mtu;

    /* Take care of delayed checksums */
    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
	in_delayed_cksum(m);
	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
    }

    /*
     * Copy the old packet & pullup its IP header into the
     * new mbuf so we can modify it.
     */
    mb_copy = m_copypacket(m, M_DONTWAIT);
    if (mb_copy == NULL)
	return NULL;
    mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
    if (mb_copy == NULL)
	return NULL;

    /* take care of the TTL */
    ip = mtod(mb_copy, struct ip *);
    --ip->ip_ttl;

    /* Compute the MTU after the PIM Register encapsulation */
    mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);

    /* NOTE(review): ip_len is assumed to be in host order here (as set by
     * ip_input) -- confirm before changing the comparisons below. */
    if (ip->ip_len <= mtu) {
	/* Turn the IP header into a valid one */
	ip->ip_len = htons(ip->ip_len);
	ip->ip_off = htons(ip->ip_off);
	ip->ip_sum = 0;
	ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
    } else {
	/* Fragment the packet */
	if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) {
	    m_freem(mb_copy);
	    return NULL;
	}
    }
    return mb_copy;
}

/*
 * Send an upcall with the data packet to the user-level process.
 *
 * Prepends an igmpmsg header (IGMPMSG_WHOLEPKT) to mb_copy and queues the
 * result on the mrouter socket.  Consumes mb_copy on both success and
 * failure paths.  Returns 0 or ENOBUFS.
 */
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
	struct mbuf *mb_copy, struct mfc *rt)
{
    struct mbuf *mb_first;
    int len = ntohs(ip->ip_len);
    struct igmpmsg *im;
    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };

    VIF_LOCK_ASSERT();

    /*
     * Add a new mbuf with an upcall header
     */
    MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
    if (mb_first == NULL) {
	m_freem(mb_copy);
	return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
    mb_first->m_len = sizeof(struct igmpmsg);
    mb_first->m_next = mb_copy;

    /* Send message to routing daemon */
    im = mtod(mb_first, struct igmpmsg *);
    im->im_msgtype = IGMPMSG_WHOLEPKT;
    im->im_mbz = 0;
    im->im_vif = vifp - viftable;	/* vif index from table position */
    im->im_src = ip->ip_src;
    im->im_dst = ip->ip_dst;

    k_igmpsrc.sin_addr = ip->ip_src;

    mrtstat.mrts_upcalls++;

    if (socket_send(ip_mrouter, mb_first, &k_igmpsrc) < 0) {
	if (mrtdebug & DEBUG_PIM)
	    log(LOG_WARNING,
		"mcast: pim_register_send_upcall: ip_mrouter socket queue full");
	++mrtstat.mrts_upq_sockfull;
	return ENOBUFS;
    }

    /* Keep statistics */
    pimstat.pims_snd_registers_msgs++;
    pimstat.pims_snd_registers_bytes += len;

    return 0;
}

/*
 * Encapsulate the data packet in PIM Register message and send it to the RP.
 *
 * Builds an outer IP header + PIM Register header in a new mbuf chained
 * in front of mb_copy and transmits it (possibly rate-limited via the
 * token-bucket filter) toward rt->mfc_rp.  Consumes mb_copy on failure.
 * Returns 0, EADDRNOTAVAIL or ENOBUFS.
 */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp,
	struct mbuf *mb_copy, struct mfc *rt)
{
    struct mbuf *mb_first;
    struct ip *ip_outer;
    struct pim_encap_pimhdr *pimhdr;
    int len = ntohs(ip->ip_len);
    vifi_t vifi = rt->mfc_parent;

    VIF_LOCK_ASSERT();

    if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) {
	m_freem(mb_copy);
	return EADDRNOTAVAIL;		/* The iif vif is invalid */
    }

    /*
     * Add a new mbuf with the encapsulating header
     */
    MGETHDR(mb_first, M_DONTWAIT, MT_HEADER);
    if (mb_first == NULL) {
	m_freem(mb_copy);
	return ENOBUFS;
    }
    mb_first->m_data += max_linkhdr;
    mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    mb_first->m_next = mb_copy;

    mb_first->m_pkthdr.len = len + mb_first->m_len;

    /*
     * Fill in the encapsulating IP and PIM header
     */
    ip_outer = mtod(mb_first, struct ip *);
    *ip_outer = pim_encap_iphdr;
#ifdef RANDOM_IP_ID
    ip_outer->ip_id = ip_randomid();
#else
    ip_outer->ip_id = htons(ip_id++);
#endif
    /* NOTE(review): outer ip_len left in host order for ip_output -- confirm */
    ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
    ip_outer->ip_src = viftable[vifi].v_lcl_addr;
    ip_outer->ip_dst = rt->mfc_rp;
    /*
     * Copy the inner header TOS to the outer header, and take care of the
     * IP_DF bit.
     */
    ip_outer->ip_tos = ip->ip_tos;
    if (ntohs(ip->ip_off) & IP_DF)
	ip_outer->ip_off |= IP_DF;
    pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer
					 + sizeof(pim_encap_iphdr));
    *pimhdr = pim_encap_pimhdr;
    /* If the iif crosses a border, set the Border-bit */
    if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config)
	pimhdr->flags |= htonl(PIM_BORDER_REGISTER);

    /* Checksum only the PIM header: temporarily skip past the outer IP. */
    mb_first->m_data += sizeof(pim_encap_iphdr);
    pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
    mb_first->m_data -= sizeof(pim_encap_iphdr);

    if (vifp->v_rate_limit == 0)
	tbf_send_packet(vifp, mb_first);
    else
	tbf_control(vifp, mb_first, ip, ip_outer->ip_len);

    /* Keep statistics */
    pimstat.pims_snd_registers_msgs++;
    pimstat.pims_snd_registers_bytes += len;

    return 0;
}

/*
 * PIM-SMv2 and PIM-DM messages processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 */
void
pim_input(struct mbuf *m, int off)
{
    struct ip *ip = mtod(m, struct ip *);
    struct pim *pim;
    int minlen;
    /* NOTE(review): ip_len assumed already host order here (ip_input) */
    int datalen = ip->ip_len;
    int ip_tos;
    int iphlen = off;		/* 'off' is the outer IP header length */

    /* Keep statistics */
    pimstat.pims_rcv_total_msgs++;
    pimstat.pims_rcv_total_bytes += datalen;

    /*
     * Validate lengths
     */
    if (datalen < PIM_MINLEN) {
	pimstat.pims_rcv_tooshort++;
	log(LOG_ERR, "pim_input: packet size too small %d from %lx\n",
	    datalen, (u_long)ip->ip_src.s_addr);
	m_freem(m);
	return;
    }

    /*
     * If the packet is at least as big as a REGISTER, go agead
     * and grab the PIM REGISTER header size, to avoid another
     * possible m_pullup() later.
     *
     * PIM_MINLEN       == pimhdr + u_int32_t == 4 + 4 = 8
     * PIM_REG_MINLEN   == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
     */
    minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
    /*
     * Get the IP and PIM headers in contiguous memory, and
     * possibly the PIM REGISTER header.
     */
    if ((m->m_flags & M_EXT || m->m_len < minlen) &&
	(m = m_pullup(m, minlen)) == 0) {
	log(LOG_ERR, "pim_input: m_pullup failure\n");
	return;
    }
    /* m_pullup() may have given us a new mbuf so reset ip. */
    ip = mtod(m, struct ip *);
    ip_tos = ip->ip_tos;

    /* adjust mbuf to point to the PIM header */
    m->m_data += iphlen;
    m->m_len  -= iphlen;
    pim = mtod(m, struct pim *);

    /*
     * Validate checksum. If PIM REGISTER, exclude the data packet.
     *
     * XXX: some older PIMv2 implementations don't make this distinction,
     * so for compatibility reason perform the checksum over part of the
     * message, and if error, then over the whole message.
     */
    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) {
	/* do nothing, checksum okay */
    } else if (in_cksum(m, datalen)) {
	pimstat.pims_rcv_badsum++;
	if (mrtdebug & DEBUG_PIM)
	    log(LOG_DEBUG, "pim_input: invalid checksum");
	m_freem(m);
	return;
    }

    /* PIM version check */
    if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
	pimstat.pims_rcv_badversion++;
	log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n",
	    PIM_VT_V(pim->pim_vt), PIM_VERSION);
	m_freem(m);
	return;
    }

    /* restore mbuf back to the outer IP */
    m->m_data -= iphlen;
    m->m_len  += iphlen;

    if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
	/*
	 * Since this is a REGISTER, we'll make a copy of the register
	 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
	 * routing daemon.
	 */
	struct sockaddr_in dst = { sizeof(dst), AF_INET };
	struct mbuf *mcp;
	struct ip *encap_ip;
	u_int32_t *reghdr;
	struct ifnet *vifp;

	VIF_LOCK();
	if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) {
	    VIF_UNLOCK();
	    if (mrtdebug & DEBUG_PIM)
		log(LOG_DEBUG,
		    "pim_input: register vif not set: %d\n", reg_vif_num);
	    m_freem(m);
	    return;
	}
	/* XXX need refcnt? */
	vifp = viftable[reg_vif_num].v_ifp;
	VIF_UNLOCK();

	/*
	 * Validate length
	 */
	if (datalen < PIM_REG_MINLEN) {
	    pimstat.pims_rcv_tooshort++;
	    pimstat.pims_rcv_badregisters++;
	    log(LOG_ERR,
		"pim_input: register packet size too small %d from %lx\n",
		datalen, (u_long)ip->ip_src.s_addr);
	    m_freem(m);
	    return;
	}

	reghdr = (u_int32_t *)(pim + 1);	/* 32-bit Register flags word */
	encap_ip = (struct ip *)(reghdr + 1);	/* inner (encapsulated) IP hdr */

	if (mrtdebug & DEBUG_PIM) {
	    log(LOG_DEBUG,
		"pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n",
		(u_long)ntohl(encap_ip->ip_src.s_addr),
		(u_long)ntohl(encap_ip->ip_dst.s_addr),
		ntohs(encap_ip->ip_len));
	}

	/* verify the version number of the inner packet */
	if (encap_ip->ip_v != IPVERSION) {
	    pimstat.pims_rcv_badregisters++;
	    if (mrtdebug & DEBUG_PIM) {
		log(LOG_DEBUG, "pim_input: invalid IP version (%d) "
		    "of the inner packet\n", encap_ip->ip_v);
	    }
	    m_freem(m);
	    return;
	}

	/* verify the inner packet is destined to a mcast group */
	if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
	    pimstat.pims_rcv_badregisters++;
	    if (mrtdebug & DEBUG_PIM)
		log(LOG_DEBUG,
		    "pim_input: inner packet of register is not "
		    "multicast %lx\n",
		    (u_long)ntohl(encap_ip->ip_dst.s_addr));
	    m_freem(m);
	    return;
	}

	/* If a NULL_REGISTER, pass it to the daemon */
	if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
	    goto pim_input_to_daemon;

	/*
	 * Copy the TOS from the outer IP header to the inner IP header.
	 */
	if (encap_ip->ip_tos != ip_tos) {
	    /* Outer TOS -> inner TOS */
	    encap_ip->ip_tos = ip_tos;
	    /* Recompute the inner header checksum. Sigh... */

	    /* adjust mbuf to point to the inner IP header */
	    m->m_data += (iphlen + PIM_MINLEN);
	    m->m_len  -= (iphlen + PIM_MINLEN);

	    encap_ip->ip_sum = 0;
	    encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);

	    /* restore mbuf to point back to the outer IP header */
	    m->m_data -= (iphlen + PIM_MINLEN);
	    m->m_len  += (iphlen + PIM_MINLEN);
	}

	/*
	 * Decapsulate the inner IP packet and loopback to forward it
	 * as a normal multicast packet. Also, make a copy of the
	 *     outer_iphdr + pimhdr + reghdr + encap_iphdr
	 * to pass to the daemon later, so it can take the appropriate
	 * actions (e.g., send back PIM_REGISTER_STOP).
	 * XXX: here m->m_data points to the outer IP header.
	 */
	mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN);
	if (mcp == NULL) {
	    log(LOG_ERR,
		"pim_input: pim register: could not copy register head\n");
	    m_freem(m);
	    return;
	}

	/* Keep statistics */
	/* XXX: registers_bytes include only the encap. mcast pkt */
	pimstat.pims_rcv_registers_msgs++;
	pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len);

	/*
	 * forward the inner ip packet; point m_data at the inner ip.
	 */
	m_adj(m, iphlen + PIM_MINLEN);

	if (mrtdebug & DEBUG_PIM) {
	    log(LOG_DEBUG,
		"pim_input: forwarding decapsulated register: "
		"src %lx, dst %lx, vif %d\n",
		(u_long)ntohl(encap_ip->ip_src.s_addr),
		(u_long)ntohl(encap_ip->ip_dst.s_addr),
		reg_vif_num);
	}
	/* NB: vifp was collected above; can it change on us? */
	if_simloop(vifp, m, dst.sin_family, 0);

	/* prepare the register head to send to the mrouting daemon */
	m = mcp;
    }

pim_input_to_daemon:
    /*
     * Pass the PIM message up to the daemon; if it is a Register message,
     * pass the 'head' only up to the daemon. This includes the
     * outer IP header, PIM header, PIM-Register header and the
     * inner IP header.
     * XXX: the outer IP header pkt size of a Register is not adjust to
     * reflect the fact that the inner multicast data is truncated.
     */
    rip_input(m, iphlen);

    return;
}
#endif /* PIM */

/*
 * Module event handler: wires up (MOD_LOAD) or tears down (MOD_UNLOAD)
 * the multicast-routing function hooks and locks.  Unload is refused
 * with EINVAL while a user-level mrouting daemon is still attached.
 * Unknown event types fall through and return 0.
 */
static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{
    switch (type) {
    case MOD_LOAD:
	mtx_init(&mrouter_mtx, "mrouter initialization", NULL, MTX_DEF);
	MFC_LOCK_INIT();
	VIF_LOCK_INIT();
	ip_mrouter_reset();
	/* Publish this module's entry points via the global hooks. */
	ip_mcast_src = X_ip_mcast_src;
	ip_mforward = X_ip_mforward;
	ip_mrouter_done = X_ip_mrouter_done;
	ip_mrouter_get = X_ip_mrouter_get;
	ip_mrouter_set = X_ip_mrouter_set;
	ip_rsvp_force_done = X_ip_rsvp_force_done;
	ip_rsvp_vif = X_ip_rsvp_vif;
	legal_vif_num = X_legal_vif_num;
	mrt_ioctl = X_mrt_ioctl;
	rsvp_input_p = X_rsvp_input;
	break;

    case MOD_UNLOAD:
	/*
	 * Typically module unload happens after the user-level
	 * process has shutdown the kernel services (the check
	 * below insures someone can't just yank the module out
	 * from under a running process).  But if the module is
	 * just loaded and then unloaded w/o starting up a user
	 * process we still need to cleanup.
	 */
	if (ip_mrouter)
	    return EINVAL;

	X_ip_mrouter_done();
	ip_mcast_src = NULL;
	ip_mforward = NULL;
	ip_mrouter_done = NULL;
	ip_mrouter_get = NULL;
	ip_mrouter_set = NULL;
	ip_rsvp_force_done = NULL;
	ip_rsvp_vif = NULL;
	legal_vif_num = NULL;
	mrt_ioctl = NULL;
	rsvp_input_p = NULL;
	VIF_LOCK_DESTROY();
	MFC_LOCK_DESTROY();
	mtx_destroy(&mrouter_mtx);
	break;
    }
    return 0;
}

/* Module descriptor and registration for the ip_mroute loadable module. */
static moduledata_t ip_mroutemod = {
    "ip_mroute",
    ip_mroute_modevent,
    0
};
DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);