ip_mroute.c revision 133874
1/* 2 * IP multicast forwarding procedures 3 * 4 * Written by David Waitzman, BBN Labs, August 1988. 5 * Modified by Steve Deering, Stanford, February 1989. 6 * Modified by Mark J. Steiglitz, Stanford, May, 1991 7 * Modified by Van Jacobson, LBL, January 1993 8 * Modified by Ajit Thyagarajan, PARC, August 1993 9 * Modified by Bill Fenner, PARC, April 1995 10 * Modified by Ahmed Helmy, SGI, June 1996 11 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 12 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 13 * Modified by Hitoshi Asaeda, WIDE, August 2000 14 * Modified by Pavlin Radoslavov, ICSI, October 2002 15 * 16 * MROUTING Revision: 3.5 17 * and PIM-SMv2 and PIM-DM support, advanced API support, 18 * bandwidth metering and signaling 19 * 20 * $FreeBSD: head/sys/netinet/ip_mroute.c 133874 2004-08-16 18:32:07Z rwatson $ 21 */ 22 23#include "opt_mac.h" 24#include "opt_mrouting.h" 25 26#ifdef PIM 27#define _PIM_VT 1 28#endif 29 30#include <sys/param.h> 31#include <sys/kernel.h> 32#include <sys/lock.h> 33#include <sys/mac.h> 34#include <sys/malloc.h> 35#include <sys/mbuf.h> 36#include <sys/module.h> 37#include <sys/protosw.h> 38#include <sys/signalvar.h> 39#include <sys/socket.h> 40#include <sys/socketvar.h> 41#include <sys/sockio.h> 42#include <sys/sx.h> 43#include <sys/sysctl.h> 44#include <sys/syslog.h> 45#include <sys/systm.h> 46#include <sys/time.h> 47#include <net/if.h> 48#include <net/netisr.h> 49#include <net/route.h> 50#include <netinet/in.h> 51#include <netinet/igmp.h> 52#include <netinet/in_systm.h> 53#include <netinet/in_var.h> 54#include <netinet/ip.h> 55#include <netinet/ip_encap.h> 56#include <netinet/ip_mroute.h> 57#include <netinet/ip_var.h> 58#ifdef PIM 59#include <netinet/pim.h> 60#include <netinet/pim_var.h> 61#endif 62#include <netinet/udp.h> 63#include <machine/in_cksum.h> 64 65/* 66 * Control debugging code for rsvp and multicast routing code. 67 * Can only set them with the debugger. 
 */
static u_int    rsvpdebug;      /* non-zero enables debugging */

static u_int    mrtdebug;       /* bitmask: any set of the DEBUG_* flags below */
#define         DEBUG_MFC       0x02
#define         DEBUG_FORWARD   0x04
#define         DEBUG_EXPIRE    0x08
#define         DEBUG_XMIT      0x10
#define         DEBUG_PIM       0x20

/* Sentinel vif index meaning "no vif". */
#define         VIFI_INVALID    ((vifi_t) -1)

/* Non-zero if the mbuf has external storage attached (M_EXT set). */
#define M_HASCL(m)      ((m)->m_flags & M_EXT)

static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");

/*
 * Locking.  We use two locks: one for the virtual interface table and
 * one for the forwarding table.  These locks may be nested, in which case
 * the VIF lock must always be taken first.  Note that each lock is used
 * to cover not only the specific data structure but also related data
 * structures.  It may be better to add more fine-grained locking later;
 * it's not clear how performance-critical this code is.
 */

static struct mrtstat   mrtstat;
SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
    &mrtstat, mrtstat,
    "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");

/* Multicast forwarding cache: hash table of chained struct mfc entries. */
static struct mfc       *mfctable[MFCTBLSIZ];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
    &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]",
    "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)");

/* Forwarding-table lock and its accessor macros. */
static struct mtx mfc_mtx;
#define MFC_LOCK()      mtx_lock(&mfc_mtx)
#define MFC_UNLOCK()    mtx_unlock(&mfc_mtx)
#define MFC_LOCK_ASSERT()       do {                                    \
        mtx_assert(&mfc_mtx, MA_OWNED);                                 \
        NET_ASSERT_GIANT();                                             \
} while (0)
#define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF)
#define MFC_LOCK_DESTROY()      mtx_destroy(&mfc_mtx)

/* Virtual interface table, indexed by vifi_t. */
static struct vif       viftable[MAXVIFS];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
    &viftable, sizeof(viftable), "S,vif[MAXVIFS]",
    "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");

/* Vif-table lock and its accessor macros. */
static struct mtx vif_mtx;
#define VIF_LOCK()      mtx_lock(&vif_mtx)
#define VIF_UNLOCK()    mtx_unlock(&vif_mtx)
#define VIF_LOCK_ASSERT()       mtx_assert(&vif_mtx, MA_OWNED)
#define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF)
#define VIF_LOCK_DESTROY()      mtx_destroy(&vif_mtx)

/*
 * One counter per mfctable hash bucket.
 * NOTE(review): appears to count the entries in that bucket that still have
 * a pending expiry (mfc_expire != 0) -- confirm against expire_upcalls().
 */
static u_char           nexpire[MFCTBLSIZ];

static struct callout expire_upcalls_ch;

#define         EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second          */
#define         UPCALL_EXPIRE   6               /* number of timeouts   */

/*
 * Define the token bucket filter structures
 * tbftable -> each vif has one of these for storing info
 */

static struct tbf tbftable[MAXVIFS];
#define         TBF_REPROCESS   (hz / 100)      /* 100x / second */

/*
 * 'Interfaces' associated with decapsulator (so we can tell
 * packets that went through it from ones that get reflected
 * by a broken gateway).  These interfaces are never linked into
 * the system ifnet list & no routes point to them.  I.e., packets
 * can't be sent this way.  They only exist as a placeholder for
 * multicast source verification.
 */
static struct ifnet multicast_decap_if[MAXVIFS];

#define ENCAP_TTL 64
#define ENCAP_PROTO IPPROTO_IPIP        /* 4 */

/*
 * Prototype IP header for encapsulated packets.
 * The first two initializers are the header-length / version bit-fields,
 * whose declaration order depends on the byte order.
 */
static struct ip multicast_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
        sizeof(struct ip) >> 2, IPVERSION,
#else
        IPVERSION, sizeof(struct ip) >> 2,
#endif
        0,                              /* tos */
        sizeof(struct ip),              /* total length */
        0,                              /* id */
        0,                              /* frag offset */
        ENCAP_TTL, ENCAP_PROTO,
        0,                              /* checksum */
};

/*
 * Bandwidth meter variables and constants
 */
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
/*
 * Pending timeouts are stored in a hash table, the key being the
 * expiration time.  Periodically, the entries are analysed and processed.
 */
#define BW_METER_BUCKETS        1024
static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
static struct callout bw_meter_ch;
#define BW_METER_PERIOD (hz)            /* periodic handling of bw meters */

/*
 * Pending upcalls are stored in a vector which is flushed when
 * full, or periodically
 */
static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
static u_int    bw_upcalls_n;   /* # of pending upcalls */
static struct callout bw_upcalls_ch;
#define BW_UPCALLS_PERIOD (hz)          /* periodic flush of bw upcalls */

#ifdef PIM
static struct pimstat pimstat;
SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
    &pimstat, pimstat,
    "PIM Statistics (struct pimstat, netinet/pim_var.h)");

/*
 * Note: the PIM Register encapsulation adds the following in front of a
 * data packet:
 *
 * struct pim_encap_hdr {
 *    struct ip ip;
 *    struct pim_encap_pimhdr  pim;
 * }
 *
 */

struct pim_encap_pimhdr {
        struct pim pim;
        uint32_t   flags;
};

/* Prototype outer IP header for PIM Register packets. */
static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
        sizeof(struct ip) >> 2,
        IPVERSION,
#else
        IPVERSION,
        sizeof(struct ip) >> 2,
#endif
        0,                      /* tos */
        sizeof(struct ip),      /* total length */
        0,                      /* id */
        0,                      /* frag offset */
        ENCAP_TTL,
        IPPROTO_PIM,
        0,                      /* checksum */
};

/* Prototype PIM Register header that follows pim_encap_iphdr. */
static struct pim_encap_pimhdr pim_encap_pimhdr = {
    {
        PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
        0,                      /* reserved */
        0,                      /* checksum */
    },
    0                           /* flags */
};

static struct ifnet multicast_register_if;
static vifi_t reg_vif_num = VIFI_INVALID;       /* index of the register vif, if any */
#endif /* PIM */

/*
 * Private variables.
 */
static vifi_t      numvifs;             /* one more than the highest vif index in use */
static const struct encaptab *encap_cookie;     /* non-NULL while the IPIP decapsulator is attached */

/*
 * one-back cache used by mroute_encapcheck to locate a tunnel's vif
 * given a datagram's src ip address.
 */
static u_long last_encap_src;
static struct vif *last_encap_vif;

/*
 * Callout for queue processing.
 */
static struct callout tbf_reprocess_ch;

/* Entry points; the X_ prefix marks functions exported via hooks. */
static u_long   X_ip_mcast_src(int vifi);
static int      X_ip_mforward(struct ip *ip, struct ifnet *ifp,
                        struct mbuf *m, struct ip_moptions *imo);
static int      X_ip_mrouter_done(void);
static int      X_ip_mrouter_get(struct socket *so, struct sockopt *m);
static int      X_ip_mrouter_set(struct socket *so, struct sockopt *m);
static int      X_legal_vif_num(int vif);
static int      X_mrt_ioctl(int cmd, caddr_t data);

/* File-local helpers. */
static int get_sg_cnt(struct sioc_sg_req *);
static int get_vif_cnt(struct sioc_vif_req *);
static int ip_mrouter_init(struct socket *, int);
static int add_vif(struct vifctl *);
static int del_vif(vifi_t);
static int add_mfc(struct mfcctl2 *);
static int del_mfc(struct mfcctl2 *);
static int set_api_config(uint32_t *); /* choose API capabilities */
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
static int set_assert(int);
static void expire_upcalls(void *);
static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static void phyint_send(struct ip *, struct vif *, struct mbuf *);
static void encap_send(struct ip *, struct vif *, struct mbuf *);
static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long);
static void tbf_queue(struct vif *, struct mbuf *);
static void tbf_process_q(struct vif *);
static void tbf_reprocess_q(void *);
static int tbf_dq_sel(struct vif *, struct ip *);
static void tbf_send_packet(struct vif *, struct mbuf *);
static void tbf_update_tokens(struct vif *);
static int priority(struct vif *, struct ip *);

/*
 * Bandwidth monitoring
 */
static void free_bw_list(struct bw_meter *list);
static int add_bw_upcall(struct bw_upcall *);
static int del_bw_upcall(struct bw_upcall *);
static void bw_meter_receive_packet(struct bw_meter *x, int plen,
                struct timeval *nowp);
static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp);
static void bw_upcalls_send(void);
static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp);
static void unschedule_bw_meter(struct bw_meter *x);
static void bw_meter_process(void);
static void expire_bw_upcalls_send(void *);
static void expire_bw_meter_process(void *);

#ifdef PIM
static int pim_register_send(struct ip *, struct vif *,
                struct mbuf *, struct mfc *);
static int pim_register_send_rp(struct ip *, struct vif *,
                struct mbuf *, struct mfc *);
static int pim_register_send_upcall(struct ip *, struct vif *,
                struct mbuf *, struct mfc *);
static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *);
#endif

/*
 * whether or not special PIM assert processing is enabled.
 */
static int pim_assert;
/*
 * Rate limit for assert notification messages, in usec
 */
#define ASSERT_MSG_TIME         3000000

/*
 * Kernel multicast routing API capabilities and setup.
 * If more API capabilities are added to the kernel, they should be
 * recorded in `mrt_api_support'.
 */
static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
                                         MRT_MFC_FLAGS_BORDER_VIF |
                                         MRT_MFC_RP |
                                         MRT_MFC_BW_UPCALL);
static uint32_t mrt_api_config = 0;     /* subset of mrt_api_support currently enabled */

/*
 * Hash function for a source, group entry
 */
#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
                        ((g) >> 20) ^ ((g) >> 10) ^ (g))

/*
 * Find a route for a given origin IP address and Multicast group address
 * Type of service parameter to be added in the future!!!
347 * Statistics are updated by the caller if needed 348 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 349 */ 350static struct mfc * 351mfc_find(in_addr_t o, in_addr_t g) 352{ 353 struct mfc *rt; 354 355 MFC_LOCK_ASSERT(); 356 357 for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) 358 if ((rt->mfc_origin.s_addr == o) && 359 (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) 360 break; 361 return rt; 362} 363 364/* 365 * Macros to compute elapsed time efficiently 366 * Borrowed from Van Jacobson's scheduling code 367 */ 368#define TV_DELTA(a, b, delta) { \ 369 int xxs; \ 370 delta = (a).tv_usec - (b).tv_usec; \ 371 if ((xxs = (a).tv_sec - (b).tv_sec)) { \ 372 switch (xxs) { \ 373 case 2: \ 374 delta += 1000000; \ 375 /* FALLTHROUGH */ \ 376 case 1: \ 377 delta += 1000000; \ 378 break; \ 379 default: \ 380 delta += (1000000 * xxs); \ 381 } \ 382 } \ 383} 384 385#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ 386 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 387 388/* 389 * Handle MRT setsockopt commands to modify the multicast routing tables. 
390 */ 391static int 392X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) 393{ 394 int error, optval; 395 vifi_t vifi; 396 struct vifctl vifc; 397 struct mfcctl2 mfc; 398 struct bw_upcall bw_upcall; 399 uint32_t i; 400 401 if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) 402 return EPERM; 403 404 error = 0; 405 switch (sopt->sopt_name) { 406 case MRT_INIT: 407 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 408 if (error) 409 break; 410 error = ip_mrouter_init(so, optval); 411 break; 412 413 case MRT_DONE: 414 error = ip_mrouter_done(); 415 break; 416 417 case MRT_ADD_VIF: 418 error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); 419 if (error) 420 break; 421 error = add_vif(&vifc); 422 break; 423 424 case MRT_DEL_VIF: 425 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 426 if (error) 427 break; 428 error = del_vif(vifi); 429 break; 430 431 case MRT_ADD_MFC: 432 case MRT_DEL_MFC: 433 /* 434 * select data size depending on API version. 435 */ 436 if (sopt->sopt_name == MRT_ADD_MFC && 437 mrt_api_config & MRT_API_FLAGS_ALL) { 438 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), 439 sizeof(struct mfcctl2)); 440 } else { 441 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), 442 sizeof(struct mfcctl)); 443 bzero((caddr_t)&mfc + sizeof(struct mfcctl), 444 sizeof(mfc) - sizeof(struct mfcctl)); 445 } 446 if (error) 447 break; 448 if (sopt->sopt_name == MRT_ADD_MFC) 449 error = add_mfc(&mfc); 450 else 451 error = del_mfc(&mfc); 452 break; 453 454 case MRT_ASSERT: 455 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 456 if (error) 457 break; 458 set_assert(optval); 459 break; 460 461 case MRT_API_CONFIG: 462 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 463 if (!error) 464 error = set_api_config(&i); 465 if (!error) 466 error = sooptcopyout(sopt, &i, sizeof i); 467 break; 468 469 case MRT_ADD_BW_UPCALL: 470 case MRT_DEL_BW_UPCALL: 471 error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, 472 
sizeof bw_upcall); 473 if (error) 474 break; 475 if (sopt->sopt_name == MRT_ADD_BW_UPCALL) 476 error = add_bw_upcall(&bw_upcall); 477 else 478 error = del_bw_upcall(&bw_upcall); 479 break; 480 481 default: 482 error = EOPNOTSUPP; 483 break; 484 } 485 return error; 486} 487 488/* 489 * Handle MRT getsockopt commands 490 */ 491static int 492X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) 493{ 494 int error; 495 static int version = 0x0305; /* !!! why is this here? XXX */ 496 497 switch (sopt->sopt_name) { 498 case MRT_VERSION: 499 error = sooptcopyout(sopt, &version, sizeof version); 500 break; 501 502 case MRT_ASSERT: 503 error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); 504 break; 505 506 case MRT_API_SUPPORT: 507 error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); 508 break; 509 510 case MRT_API_CONFIG: 511 error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); 512 break; 513 514 default: 515 error = EOPNOTSUPP; 516 break; 517 } 518 return error; 519} 520 521/* 522 * Handle ioctl commands to obtain information from the cache 523 */ 524static int 525X_mrt_ioctl(int cmd, caddr_t data) 526{ 527 int error = 0; 528 529 switch (cmd) { 530 case (SIOCGETVIFCNT): 531 error = get_vif_cnt((struct sioc_vif_req *)data); 532 break; 533 534 case (SIOCGETSGCNT): 535 error = get_sg_cnt((struct sioc_sg_req *)data); 536 break; 537 538 default: 539 error = EINVAL; 540 break; 541 } 542 return error; 543} 544 545/* 546 * returns the packet, byte, rpf-failure count for the source group provided 547 */ 548static int 549get_sg_cnt(struct sioc_sg_req *req) 550{ 551 struct mfc *rt; 552 553 MFC_LOCK(); 554 rt = mfc_find(req->src.s_addr, req->grp.s_addr); 555 if (rt == NULL) { 556 MFC_UNLOCK(); 557 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 558 return EADDRNOTAVAIL; 559 } 560 req->pktcnt = rt->mfc_pkt_cnt; 561 req->bytecnt = rt->mfc_byte_cnt; 562 req->wrong_if = rt->mfc_wrong_if; 563 MFC_UNLOCK(); 564 return 0; 565} 566 567/* 
568 * returns the input and output packet and byte counts on the vif provided 569 */ 570static int 571get_vif_cnt(struct sioc_vif_req *req) 572{ 573 vifi_t vifi = req->vifi; 574 575 VIF_LOCK(); 576 if (vifi >= numvifs) { 577 VIF_UNLOCK(); 578 return EINVAL; 579 } 580 581 req->icount = viftable[vifi].v_pkt_in; 582 req->ocount = viftable[vifi].v_pkt_out; 583 req->ibytes = viftable[vifi].v_bytes_in; 584 req->obytes = viftable[vifi].v_bytes_out; 585 VIF_UNLOCK(); 586 587 return 0; 588} 589 590static void 591ip_mrouter_reset(void) 592{ 593 bzero((caddr_t)mfctable, sizeof(mfctable)); 594 bzero((caddr_t)nexpire, sizeof(nexpire)); 595 596 pim_assert = 0; 597 mrt_api_config = 0; 598 599 callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); 600 601 bw_upcalls_n = 0; 602 bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); 603 callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE); 604 callout_init(&bw_meter_ch, CALLOUT_MPSAFE); 605 606 callout_init(&tbf_reprocess_ch, CALLOUT_MPSAFE); 607} 608 609static struct mtx mrouter_mtx; /* used to synch init/done work */ 610 611/* 612 * Enable multicast routing 613 */ 614static int 615ip_mrouter_init(struct socket *so, int version) 616{ 617 if (mrtdebug) 618 log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 619 so->so_type, so->so_proto->pr_protocol); 620 621 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) 622 return EOPNOTSUPP; 623 624 if (version != 1) 625 return ENOPROTOOPT; 626 627 mtx_lock(&mrouter_mtx); 628 629 if (ip_mrouter != NULL) { 630 mtx_unlock(&mrouter_mtx); 631 return EADDRINUSE; 632 } 633 634 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 635 636 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 637 expire_bw_upcalls_send, NULL); 638 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 639 640 ip_mrouter = so; 641 642 mtx_unlock(&mrouter_mtx); 643 644 if (mrtdebug) 645 log(LOG_DEBUG, "ip_mrouter_init\n"); 646 647 return 0; 648} 649 650/* 
651 * Disable multicast routing 652 */ 653static int 654X_ip_mrouter_done(void) 655{ 656 vifi_t vifi; 657 int i; 658 struct ifnet *ifp; 659 struct ifreq ifr; 660 struct mfc *rt; 661 struct rtdetq *rte; 662 663 mtx_lock(&mrouter_mtx); 664 665 if (ip_mrouter == NULL) { 666 mtx_unlock(&mrouter_mtx); 667 return EINVAL; 668 } 669 670 /* 671 * Detach/disable hooks to the reset of the system. 672 */ 673 ip_mrouter = NULL; 674 mrt_api_config = 0; 675 676 VIF_LOCK(); 677 if (encap_cookie) { 678 const struct encaptab *c = encap_cookie; 679 encap_cookie = NULL; 680 encap_detach(c); 681 } 682 VIF_UNLOCK(); 683 684 callout_stop(&tbf_reprocess_ch); 685 686 VIF_LOCK(); 687 /* 688 * For each phyint in use, disable promiscuous reception of all IP 689 * multicasts. 690 */ 691 for (vifi = 0; vifi < numvifs; vifi++) { 692 if (viftable[vifi].v_lcl_addr.s_addr != 0 && 693 !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 694 struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); 695 696 so->sin_len = sizeof(struct sockaddr_in); 697 so->sin_family = AF_INET; 698 so->sin_addr.s_addr = INADDR_ANY; 699 ifp = viftable[vifi].v_ifp; 700 if_allmulti(ifp, 0); 701 } 702 } 703 bzero((caddr_t)tbftable, sizeof(tbftable)); 704 bzero((caddr_t)viftable, sizeof(viftable)); 705 numvifs = 0; 706 pim_assert = 0; 707 VIF_UNLOCK(); 708 709 /* 710 * Free all multicast forwarding cache entries. 
711 */ 712 callout_stop(&expire_upcalls_ch); 713 callout_stop(&bw_upcalls_ch); 714 callout_stop(&bw_meter_ch); 715 716 MFC_LOCK(); 717 for (i = 0; i < MFCTBLSIZ; i++) { 718 for (rt = mfctable[i]; rt != NULL; ) { 719 struct mfc *nr = rt->mfc_next; 720 721 for (rte = rt->mfc_stall; rte != NULL; ) { 722 struct rtdetq *n = rte->next; 723 724 m_freem(rte->m); 725 free(rte, M_MRTABLE); 726 rte = n; 727 } 728 free_bw_list(rt->mfc_bw_meter); 729 free(rt, M_MRTABLE); 730 rt = nr; 731 } 732 } 733 bzero((caddr_t)mfctable, sizeof(mfctable)); 734 bzero((caddr_t)nexpire, sizeof(nexpire)); 735 bw_upcalls_n = 0; 736 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 737 MFC_UNLOCK(); 738 739 /* 740 * Reset de-encapsulation cache 741 */ 742 last_encap_src = INADDR_ANY; 743 last_encap_vif = NULL; 744#ifdef PIM 745 reg_vif_num = VIFI_INVALID; 746#endif 747 748 mtx_unlock(&mrouter_mtx); 749 750 if (mrtdebug) 751 log(LOG_DEBUG, "ip_mrouter_done\n"); 752 753 return 0; 754} 755 756/* 757 * Set PIM assert processing global 758 */ 759static int 760set_assert(int i) 761{ 762 if ((i != 1) && (i != 0)) 763 return EINVAL; 764 765 pim_assert = i; 766 767 return 0; 768} 769 770/* 771 * Configure API capabilities 772 */ 773int 774set_api_config(uint32_t *apival) 775{ 776 int i; 777 778 /* 779 * We can set the API capabilities only if it is the first operation 780 * after MRT_INIT. I.e.: 781 * - there are no vifs installed 782 * - pim_assert is not enabled 783 * - the MFC table is empty 784 */ 785 if (numvifs > 0) { 786 *apival = 0; 787 return EPERM; 788 } 789 if (pim_assert) { 790 *apival = 0; 791 return EPERM; 792 } 793 for (i = 0; i < MFCTBLSIZ; i++) { 794 if (mfctable[i] != NULL) { 795 *apival = 0; 796 return EPERM; 797 } 798 } 799 800 mrt_api_config = *apival & mrt_api_support; 801 *apival = mrt_api_config; 802 803 return 0; 804} 805 806/* 807 * Decide if a packet is from a tunnelled peer. 808 * Return 0 if not, 64 if so. XXX yuck.. 64 ??? 
809 */ 810static int 811mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) 812{ 813 struct ip *ip = mtod(m, struct ip *); 814 int hlen = ip->ip_hl << 2; 815 816 /* 817 * don't claim the packet if it's not to a multicast destination or if 818 * we don't have an encapsulating tunnel with the source. 819 * Note: This code assumes that the remote site IP address 820 * uniquely identifies the tunnel (i.e., that this site has 821 * at most one tunnel with the remote site). 822 */ 823 if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr))) 824 return 0; 825 if (ip->ip_src.s_addr != last_encap_src) { 826 struct vif *vifp = viftable; 827 struct vif *vife = vifp + numvifs; 828 829 last_encap_src = ip->ip_src.s_addr; 830 last_encap_vif = NULL; 831 for ( ; vifp < vife; ++vifp) 832 if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { 833 if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) 834 last_encap_vif = vifp; 835 break; 836 } 837 } 838 if (last_encap_vif == NULL) { 839 last_encap_src = INADDR_ANY; 840 return 0; 841 } 842 return 64; 843} 844 845/* 846 * De-encapsulate a packet and feed it back through ip input (this 847 * routine is called whenever IP gets a packet that mroute_encap_func() 848 * claimed). 849 */ 850static void 851mroute_encap_input(struct mbuf *m, int off) 852{ 853 struct ip *ip = mtod(m, struct ip *); 854 int hlen = ip->ip_hl << 2; 855 856 if (hlen > sizeof(struct ip)) 857 ip_stripoptions(m, (struct mbuf *) 0); 858 m->m_data += sizeof(struct ip); 859 m->m_len -= sizeof(struct ip); 860 m->m_pkthdr.len -= sizeof(struct ip); 861 862 m->m_pkthdr.rcvif = last_encap_vif->v_ifp; 863 864 netisr_queue(NETISR_IP, m); 865 /* 866 * normally we would need a "schednetisr(NETISR_IP)" 867 * here but we were called by ip_input and it is going 868 * to loop back & try to dequeue the packet we just 869 * queued as soon as we return so we avoid the 870 * unnecessary software interrrupt. 
871 * 872 * XXX 873 * This no longer holds - we may have direct-dispatched the packet, 874 * or there may be a queue processing limit. 875 */ 876} 877 878extern struct domain inetdomain; 879static struct protosw mroute_encap_protosw = 880{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, 881 mroute_encap_input, 0, 0, rip_ctloutput, 882 0, 883 0, 0, 0, 0, 884 &rip_usrreqs 885}; 886 887/* 888 * Add a vif to the vif table 889 */ 890static int 891add_vif(struct vifctl *vifcp) 892{ 893 struct vif *vifp = viftable + vifcp->vifc_vifi; 894 struct sockaddr_in sin = {sizeof sin, AF_INET}; 895 struct ifaddr *ifa; 896 struct ifnet *ifp; 897 int error; 898 struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; 899 900 VIF_LOCK(); 901 if (vifcp->vifc_vifi >= MAXVIFS) { 902 VIF_UNLOCK(); 903 return EINVAL; 904 } 905 if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { 906 VIF_UNLOCK(); 907 return EADDRINUSE; 908 } 909 if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { 910 VIF_UNLOCK(); 911 return EADDRNOTAVAIL; 912 } 913 914 /* Find the interface with an address in AF_INET family */ 915#ifdef PIM 916 if (vifcp->vifc_flags & VIFF_REGISTER) { 917 /* 918 * XXX: Because VIFF_REGISTER does not really need a valid 919 * local interface (e.g. it could be 127.0.0.2), we don't 920 * check its address. 921 */ 922 ifp = NULL; 923 } else 924#endif 925 { 926 sin.sin_addr = vifcp->vifc_lcl_addr; 927 ifa = ifa_ifwithaddr((struct sockaddr *)&sin); 928 if (ifa == NULL) { 929 VIF_UNLOCK(); 930 return EADDRNOTAVAIL; 931 } 932 ifp = ifa->ifa_ifp; 933 } 934 935 if (vifcp->vifc_flags & VIFF_TUNNEL) { 936 if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { 937 /* 938 * An encapsulating tunnel is wanted. Tell 939 * mroute_encap_input() to start paying attention 940 * to encapsulated packets. 
941 */ 942 if (encap_cookie == NULL) { 943 int i; 944 945 encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, 946 mroute_encapcheck, 947 (struct protosw *)&mroute_encap_protosw, NULL); 948 949 if (encap_cookie == NULL) { 950 printf("ip_mroute: unable to attach encap\n"); 951 VIF_UNLOCK(); 952 return EIO; /* XXX */ 953 } 954 for (i = 0; i < MAXVIFS; ++i) { 955 if_initname(&multicast_decap_if[i], "mdecap", i); 956 } 957 } 958 /* 959 * Set interface to fake encapsulator interface 960 */ 961 ifp = &multicast_decap_if[vifcp->vifc_vifi]; 962 /* 963 * Prepare cached route entry 964 */ 965 bzero(&vifp->v_route, sizeof(vifp->v_route)); 966 } else { 967 log(LOG_ERR, "source routed tunnels not supported\n"); 968 VIF_UNLOCK(); 969 return EOPNOTSUPP; 970 } 971#ifdef PIM 972 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 973 ifp = &multicast_register_if; 974 if (mrtdebug) 975 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 976 (void *)&multicast_register_if); 977 if (reg_vif_num == VIFI_INVALID) { 978 if_initname(&multicast_register_if, "register_vif", 0); 979 multicast_register_if.if_flags = IFF_LOOPBACK; 980 bzero(&vifp->v_route, sizeof(vifp->v_route)); 981 reg_vif_num = vifcp->vifc_vifi; 982 } 983#endif 984 } else { /* Make sure the interface supports multicast */ 985 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 986 VIF_UNLOCK(); 987 return EOPNOTSUPP; 988 } 989 990 /* Enable promiscuous reception of all IP multicasts from the if */ 991 error = if_allmulti(ifp, 1); 992 if (error) { 993 VIF_UNLOCK(); 994 return error; 995 } 996 } 997 998 /* define parameters for the tbf structure */ 999 vifp->v_tbf = v_tbf; 1000 GET_TIME(vifp->v_tbf->tbf_last_pkt_t); 1001 vifp->v_tbf->tbf_n_tok = 0; 1002 vifp->v_tbf->tbf_q_len = 0; 1003 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 1004 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 1005 1006 vifp->v_flags = vifcp->vifc_flags; 1007 vifp->v_threshold = vifcp->vifc_threshold; 1008 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 1009 vifp->v_rmt_addr = 
vifcp->vifc_rmt_addr; 1010 vifp->v_ifp = ifp; 1011 /* scaling up here allows division by 1024 in critical code */ 1012 vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; 1013 vifp->v_rsvp_on = 0; 1014 vifp->v_rsvpd = NULL; 1015 /* initialize per vif pkt counters */ 1016 vifp->v_pkt_in = 0; 1017 vifp->v_pkt_out = 0; 1018 vifp->v_bytes_in = 0; 1019 vifp->v_bytes_out = 0; 1020 1021 /* Adjust numvifs up if the vifi is higher than numvifs */ 1022 if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; 1023 1024 VIF_UNLOCK(); 1025 1026 if (mrtdebug) 1027 log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", 1028 vifcp->vifc_vifi, 1029 (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), 1030 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 1031 (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), 1032 vifcp->vifc_threshold, 1033 vifcp->vifc_rate_limit); 1034 1035 return 0; 1036} 1037 1038/* 1039 * Delete a vif from the vif table 1040 */ 1041static int 1042del_vif(vifi_t vifi) 1043{ 1044 struct vif *vifp; 1045 1046 VIF_LOCK(); 1047 1048 if (vifi >= numvifs) { 1049 VIF_UNLOCK(); 1050 return EINVAL; 1051 } 1052 vifp = &viftable[vifi]; 1053 if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { 1054 VIF_UNLOCK(); 1055 return EADDRNOTAVAIL; 1056 } 1057 1058 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) 1059 if_allmulti(vifp->v_ifp, 0); 1060 1061 if (vifp == last_encap_vif) { 1062 last_encap_vif = NULL; 1063 last_encap_src = INADDR_ANY; 1064 } 1065 1066 /* 1067 * Free packets queued at the interface 1068 */ 1069 while (vifp->v_tbf->tbf_q) { 1070 struct mbuf *m = vifp->v_tbf->tbf_q; 1071 1072 vifp->v_tbf->tbf_q = m->m_act; 1073 m_freem(m); 1074 } 1075 1076#ifdef PIM 1077 if (vifp->v_flags & VIFF_REGISTER) 1078 reg_vif_num = VIFI_INVALID; 1079#endif 1080 1081 bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); 1082 bzero((caddr_t)vifp, sizeof (*vifp)); 1083 1084 if (mrtdebug) 1085 log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs); 1086 1087 /* 
Adjust numvifs down */ 1088 for (vifi = numvifs; vifi > 0; vifi--) 1089 if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) 1090 break; 1091 numvifs = vifi; 1092 1093 VIF_UNLOCK(); 1094 1095 return 0; 1096} 1097 1098/* 1099 * update an mfc entry without resetting counters and S,G addresses. 1100 */ 1101static void 1102update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1103{ 1104 int i; 1105 1106 rt->mfc_parent = mfccp->mfcc_parent; 1107 for (i = 0; i < numvifs; i++) { 1108 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1109 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 1110 MRT_MFC_FLAGS_ALL; 1111 } 1112 /* set the RP address */ 1113 if (mrt_api_config & MRT_MFC_RP) 1114 rt->mfc_rp = mfccp->mfcc_rp; 1115 else 1116 rt->mfc_rp.s_addr = INADDR_ANY; 1117} 1118 1119/* 1120 * fully initialize an mfc entry from the parameter. 1121 */ 1122static void 1123init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1124{ 1125 rt->mfc_origin = mfccp->mfcc_origin; 1126 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1127 1128 update_mfc_params(rt, mfccp); 1129 1130 /* initialize pkt counters per src-grp */ 1131 rt->mfc_pkt_cnt = 0; 1132 rt->mfc_byte_cnt = 0; 1133 rt->mfc_wrong_if = 0; 1134 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; 1135} 1136 1137 1138/* 1139 * Add an mfc entry 1140 */ 1141static int 1142add_mfc(struct mfcctl2 *mfccp) 1143{ 1144 struct mfc *rt; 1145 u_long hash; 1146 struct rtdetq *rte; 1147 u_short nstl; 1148 1149 VIF_LOCK(); 1150 MFC_LOCK(); 1151 1152 rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); 1153 1154 /* If an entry already exists, just update the fields */ 1155 if (rt) { 1156 if (mrtdebug & DEBUG_MFC) 1157 log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", 1158 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1159 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1160 mfccp->mfcc_parent); 1161 1162 update_mfc_params(rt, mfccp); 1163 MFC_UNLOCK(); 1164 VIF_UNLOCK(); 1165 return 0; 1166 } 1167 1168 /* 1169 * Find the entry 
for which the upcall was made and update 1170 */ 1171 hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); 1172 for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { 1173 1174 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1175 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1176 (rt->mfc_stall != NULL)) { 1177 1178 if (nstl++) 1179 log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", 1180 "multiple kernel entries", 1181 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1182 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1183 mfccp->mfcc_parent, (void *)rt->mfc_stall); 1184 1185 if (mrtdebug & DEBUG_MFC) 1186 log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", 1187 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1188 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1189 mfccp->mfcc_parent, (void *)rt->mfc_stall); 1190 1191 init_mfc_params(rt, mfccp); 1192 1193 rt->mfc_expire = 0; /* Don't clean this guy up */ 1194 nexpire[hash]--; 1195 1196 /* free packets Qed at the end of this entry */ 1197 for (rte = rt->mfc_stall; rte != NULL; ) { 1198 struct rtdetq *n = rte->next; 1199 1200 ip_mdq(rte->m, rte->ifp, rt, -1); 1201 m_freem(rte->m); 1202 free(rte, M_MRTABLE); 1203 rte = n; 1204 } 1205 rt->mfc_stall = NULL; 1206 } 1207 } 1208 1209 /* 1210 * It is possible that an entry is being inserted without an upcall 1211 */ 1212 if (nstl == 0) { 1213 if (mrtdebug & DEBUG_MFC) 1214 log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", 1215 hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1216 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1217 mfccp->mfcc_parent); 1218 1219 for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { 1220 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1221 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { 1222 init_mfc_params(rt, mfccp); 1223 if (rt->mfc_expire) 1224 nexpire[hash]--; 1225 rt->mfc_expire = 0; 1226 break; /* XXX */ 1227 } 1228 } 1229 if (rt == NULL) { /* no upcall, so make a new 
entry */ 1230 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1231 if (rt == NULL) { 1232 MFC_UNLOCK(); 1233 VIF_UNLOCK(); 1234 return ENOBUFS; 1235 } 1236 1237 init_mfc_params(rt, mfccp); 1238 rt->mfc_expire = 0; 1239 rt->mfc_stall = NULL; 1240 1241 rt->mfc_bw_meter = NULL; 1242 /* insert new entry at head of hash chain */ 1243 rt->mfc_next = mfctable[hash]; 1244 mfctable[hash] = rt; 1245 } 1246 } 1247 MFC_UNLOCK(); 1248 VIF_UNLOCK(); 1249 return 0; 1250} 1251 1252/* 1253 * Delete an mfc entry 1254 */ 1255static int 1256del_mfc(struct mfcctl2 *mfccp) 1257{ 1258 struct in_addr origin; 1259 struct in_addr mcastgrp; 1260 struct mfc *rt; 1261 struct mfc **nptr; 1262 u_long hash; 1263 struct bw_meter *list; 1264 1265 origin = mfccp->mfcc_origin; 1266 mcastgrp = mfccp->mfcc_mcastgrp; 1267 1268 if (mrtdebug & DEBUG_MFC) 1269 log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", 1270 (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); 1271 1272 MFC_LOCK(); 1273 1274 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1275 for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next) 1276 if (origin.s_addr == rt->mfc_origin.s_addr && 1277 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1278 rt->mfc_stall == NULL) 1279 break; 1280 if (rt == NULL) { 1281 MFC_UNLOCK(); 1282 return EADDRNOTAVAIL; 1283 } 1284 1285 *nptr = rt->mfc_next; 1286 1287 /* 1288 * free the bw_meter entries 1289 */ 1290 list = rt->mfc_bw_meter; 1291 rt->mfc_bw_meter = NULL; 1292 1293 free(rt, M_MRTABLE); 1294 1295 free_bw_list(list); 1296 1297 MFC_UNLOCK(); 1298 1299 return 0; 1300} 1301 1302/* 1303 * Send a message to mrouted on the multicast routing socket 1304 */ 1305static int 1306socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1307{ 1308 if (s) { 1309 SOCKBUF_LOCK(&s->so_rcv); 1310 if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, 1311 NULL) != 0) { 1312 sorwakeup_locked(s); 1313 return 0; 1314 } 1315 SOCKBUF_UNLOCK(&s->so_rcv); 1316 } 
/* Reached when s is NULL or the append to the socket buffer failed. */
    m_freem(mm);
    return -1;
}

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 *
 * Return values visible in this function:
 *   1        - the packet carried an LSRR option (source-route tunnel)
 *   ENOBUFS  - an allocation failed or the upcall could not be queued
 *   0        - packet was not forwardable, was dropped quietly, or was
 *              queued on a pending-upcall entry
 *   otherwise whatever ip_mdq() returned.
 *
 * VIF_LOCK and MFC_LOCK are taken (in that order) after the source-route
 * check and are released on every exit path below that point.
 */

#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
    struct mfc *rt;
    int error;
    vifi_t vifi;

    if (mrtdebug & DEBUG_FORWARD)
        log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
            (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
            (void *)ifp);

    /*
     * A header too short to hold TUNNEL_LEN bytes of options, or whose
     * second option byte is not IPOPT_LSRR, is treated as a normal packet.
     */
    if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
        ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
        /*
         * Packet arrived via a physical interface or
         * an encapsulated tunnel or a register_vif.
         */
    } else {
        /*
         * Packet arrived through a source-route tunnel.
         * Source-route tunnels are no longer supported.
         */
        static int last_log;    /* limits the message to once per second */
        if (last_log != time_second) {
            last_log = time_second;
            log(LOG_ERR,
                "ip_mforward: received source-routed packet from %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr));
        }
        return 1;
    }

    VIF_LOCK();
    MFC_LOCK();
    /* Caller supplied an explicit, valid vif: send on that vif only. */
    if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
        if (ip->ip_ttl < 255)
            ip->ip_ttl++;   /* compensate for -1 in *_send routines */
        if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
            struct vif *vifp = viftable + vifi;

            printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s)\n",
                (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
                vifi,
                (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
                vifp->v_ifp->if_xname);
        }
        error = ip_mdq(m, ifp, NULL, vifi);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    }
    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
        printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
            (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
        if (!imo)
            printf("In fact, no options were specified at all\n");
    }

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) {
        MFC_UNLOCK();
        VIF_UNLOCK();
        return 0;
    }

    /*
     * Determine forwarding vifs from the forwarding cache table
     */
    ++mrtstat.mrts_mfc_lookups;
    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
        error = ip_mdq(m, ifp, rt, -1);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    } else {
        /*
         * If we don't have a route for packet's origin,
         * Make a copy of the packet & send message to routing daemon
         */

        struct mbuf *mb0;
        struct rtdetq *rte;
        u_long hash;
        int hlen = ip->ip_hl << 2;

        ++mrtstat.mrts_mfc_misses;

        mrtstat.mrts_no_route++;
        if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
            log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr),
                (u_long)ntohl(ip->ip_dst.s_addr));

        /*
         * Allocate mbufs early so that we don't do extra work if we are
         * just going to fail anyway.  Make sure to pullup the header so
         * that other people can't step on it.
         */
        rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
        if (rte == NULL) {
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }
        mb0 = m_copypacket(m, M_DONTWAIT);
        if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
            mb0 = m_pullup(mb0, hlen);
        if (mb0 == NULL) {
            free(rte, M_MRTABLE);
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }

        /* is there an upcall waiting for this flow ? */
        hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
        for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
            if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
                (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
                (rt->mfc_stall != NULL))
                break;
        }

        if (rt == NULL) {
            int i;
            struct igmpmsg *im;
            struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
            struct mbuf *mm;

            /*
             * Locate the vifi for the incoming interface for this packet.
             * If none found, drop packet.
             */
            for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
                ;
            if (vifi >= numvifs)    /* vif not found, drop packet */
                goto non_fatal;     /* label lives in the else-branch below */

            /* no upcall, so make a new entry */
            rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
            if (rt == NULL)
                goto fail;
            /* Make a copy of the header to send to the user level process */
            mm = m_copy(mb0, 0, hlen);
            if (mm == NULL)
                goto fail1;

            /*
             * Send message to routing daemon to install
             * a route into the kernel table
             */

            im = mtod(mm, struct igmpmsg *);
            im->im_msgtype = IGMPMSG_NOCACHE;
            im->im_mbz = 0;
            im->im_vif = vifi;

            mrtstat.mrts_upcalls++;

            k_igmpsrc.sin_addr = ip->ip_src;
            /* socket_send() frees mm itself when it returns failure */
            if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
                log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
                ++mrtstat.mrts_upq_sockfull;
fail1:
                free(rt, M_MRTABLE);
fail:
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return ENOBUFS;
            }

            /* insert new entry at head of hash chain */
            rt->mfc_origin.s_addr = ip->ip_src.s_addr;
            rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
            rt->mfc_expire = UPCALL_EXPIRE;
            nexpire[hash]++;
            for (i = 0; i < numvifs; i++) {
                rt->mfc_ttls[i] = 0;
                rt->mfc_flags[i] = 0;
            }
            rt->mfc_parent = -1;

            rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */

            rt->mfc_bw_meter = NULL;

            /* link into table */
            rt->mfc_next = mfctable[hash];
            mfctable[hash] = rt;
            rt->mfc_stall = rte;

        } else {
            /* determine if q has overflowed */
            int npkts = 0;
            struct rtdetq **p;

            /*
             * XXX ouch! we need to append to the list, but we
             * only have a pointer to the front, so we have to
             * scan the entire list every time.
             */
            for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
                npkts++;

            if (npkts > MAX_UPQ) {
                mrtstat.mrts_upq_ovflw++;
non_fatal:
                /* Quiet drop: release our copies and report success. */
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return 0;
            }

            /* Add this entry to the end of the queue */
            *p = rte;
        }

        /* rte is now linked on the entry's stall queue; fill it in. */
        rte->m = mb0;
        rte->ifp = ifp;
        rte->next = NULL;

        MFC_UNLOCK();
        VIF_UNLOCK();

        return 0;
    }
}

/*
 * Clean up the cache entry if upcall is not serviced
 */
static void
expire_upcalls(void *unused)
{
    struct rtdetq *rte;
    struct mfc *mfc, **nptr;
    int i;

    MFC_LOCK();
    for (i = 0; i < MFCTBLSIZ; i++) {
        /* nexpire[i] counts entries in bucket i that are awaiting expiry */
        if (nexpire[i] == 0)
            continue;
        nptr = &mfctable[i];
        for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
            /*
             * Skip real cache entries
             * Make sure it wasn't marked to not expire (shouldn't happen)
             * If it expires now
             */
            if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
                --mfc->mfc_expire == 0) {
                if (mrtdebug & DEBUG_EXPIRE)
                    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
                        (u_long)ntohl(mfc->mfc_origin.s_addr),
                        (u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
                /*
                 * drop all the packets
                 * free the mbuf with the pkt, if, timing info
                 */
                for (rte = mfc->mfc_stall; rte; ) {
                    struct rtdetq *n = rte->next;

                    m_freem(rte->m);
                    free(rte, M_MRTABLE);
                    rte = n;
                }
++mrtstat.mrts_cache_cleanups; 1612 nexpire[i]--; 1613 1614 /* 1615 * free the bw_meter entries 1616 */ 1617 while (mfc->mfc_bw_meter != NULL) { 1618 struct bw_meter *x = mfc->mfc_bw_meter; 1619 1620 mfc->mfc_bw_meter = x->bm_mfc_next; 1621 free(x, M_BWMETER); 1622 } 1623 1624 *nptr = mfc->mfc_next; 1625 free(mfc, M_MRTABLE); 1626 } else { 1627 nptr = &mfc->mfc_next; 1628 } 1629 } 1630 } 1631 MFC_UNLOCK(); 1632 1633 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 1634} 1635 1636/* 1637 * Packet forwarding routine once entry in the cache is made 1638 */ 1639static int 1640ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) 1641{ 1642 struct ip *ip = mtod(m, struct ip *); 1643 vifi_t vifi; 1644 int plen = ip->ip_len; 1645 1646 VIF_LOCK_ASSERT(); 1647/* 1648 * Macro to send packet on vif. Since RSVP packets don't get counted on 1649 * input, they shouldn't get counted on output, so statistics keeping is 1650 * separate. 1651 */ 1652#define MC_SEND(ip,vifp,m) { \ 1653 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1654 encap_send((ip), (vifp), (m)); \ 1655 else \ 1656 phyint_send((ip), (vifp), (m)); \ 1657} 1658 1659 /* 1660 * If xmt_vif is not -1, send on only the requested vif. 1661 * 1662 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) 1663 */ 1664 if (xmt_vif < numvifs) { 1665#ifdef PIM 1666 if (viftable[xmt_vif].v_flags & VIFF_REGISTER) 1667 pim_register_send(ip, viftable + xmt_vif, m, rt); 1668 else 1669#endif 1670 MC_SEND(ip, viftable + xmt_vif, m); 1671 return 1; 1672 } 1673 1674 /* 1675 * Don't forward if it didn't arrive from the parent vif for its origin. 
1676 */ 1677 vifi = rt->mfc_parent; 1678 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1679 /* came in the wrong interface */ 1680 if (mrtdebug & DEBUG_FORWARD) 1681 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", 1682 (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); 1683 ++mrtstat.mrts_wrong_if; 1684 ++rt->mfc_wrong_if; 1685 /* 1686 * If we are doing PIM assert processing, send a message 1687 * to the routing daemon. 1688 * 1689 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1690 * can complete the SPT switch, regardless of the type 1691 * of the iif (broadcast media, GRE tunnel, etc). 1692 */ 1693 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { 1694 struct timeval now; 1695 u_long delta; 1696 1697#ifdef PIM 1698 if (ifp == &multicast_register_if) 1699 pimstat.pims_rcv_registers_wrongiif++; 1700#endif 1701 1702 /* Get vifi for the incoming packet */ 1703 for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) 1704 ; 1705 if (vifi >= numvifs) 1706 return 0; /* The iif is not found: ignore the packet. 
*/ 1707 1708 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) 1709 return 0; /* WRONGVIF disabled: ignore the packet */ 1710 1711 GET_TIME(now); 1712 1713 TV_DELTA(rt->mfc_last_assert, now, delta); 1714 1715 if (delta > ASSERT_MSG_TIME) { 1716 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 1717 struct igmpmsg *im; 1718 int hlen = ip->ip_hl << 2; 1719 struct mbuf *mm = m_copy(m, 0, hlen); 1720 1721 if (mm && (M_HASCL(mm) || mm->m_len < hlen)) 1722 mm = m_pullup(mm, hlen); 1723 if (mm == NULL) 1724 return ENOBUFS; 1725 1726 rt->mfc_last_assert = now; 1727 1728 im = mtod(mm, struct igmpmsg *); 1729 im->im_msgtype = IGMPMSG_WRONGVIF; 1730 im->im_mbz = 0; 1731 im->im_vif = vifi; 1732 1733 mrtstat.mrts_upcalls++; 1734 1735 k_igmpsrc.sin_addr = im->im_src; 1736 if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { 1737 log(LOG_WARNING, 1738 "ip_mforward: ip_mrouter socket queue full\n"); 1739 ++mrtstat.mrts_upq_sockfull; 1740 return ENOBUFS; 1741 } 1742 } 1743 } 1744 return 0; 1745 } 1746 1747 /* If I sourced this packet, it counts as output, else it was input. */ 1748 if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { 1749 viftable[vifi].v_pkt_out++; 1750 viftable[vifi].v_bytes_out += plen; 1751 } else { 1752 viftable[vifi].v_pkt_in++; 1753 viftable[vifi].v_bytes_in += plen; 1754 } 1755 rt->mfc_pkt_cnt++; 1756 rt->mfc_byte_cnt += plen; 1757 1758 /* 1759 * For each vif, decide if a copy of the packet should be forwarded. 
1760 * Forward if: 1761 * - the ttl exceeds the vif's threshold 1762 * - there are group members downstream on interface 1763 */ 1764 for (vifi = 0; vifi < numvifs; vifi++) 1765 if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1766 viftable[vifi].v_pkt_out++; 1767 viftable[vifi].v_bytes_out += plen; 1768#ifdef PIM 1769 if (viftable[vifi].v_flags & VIFF_REGISTER) 1770 pim_register_send(ip, viftable + vifi, m, rt); 1771 else 1772#endif 1773 MC_SEND(ip, viftable+vifi, m); 1774 } 1775 1776 /* 1777 * Perform upcall-related bw measuring. 1778 */ 1779 if (rt->mfc_bw_meter != NULL) { 1780 struct bw_meter *x; 1781 struct timeval now; 1782 1783 GET_TIME(now); 1784 MFC_LOCK_ASSERT(); 1785 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1786 bw_meter_receive_packet(x, plen, &now); 1787 } 1788 1789 return 0; 1790} 1791 1792/* 1793 * check if a vif number is legal/ok. This is used by ip_output. 1794 */ 1795static int 1796X_legal_vif_num(int vif) 1797{ 1798 /* XXX unlocked, matter? */ 1799 return (vif >= 0 && vif < numvifs); 1800} 1801 1802/* 1803 * Return the local address used by this vif 1804 */ 1805static u_long 1806X_ip_mcast_src(int vifi) 1807{ 1808 /* XXX unlocked, matter? */ 1809 if (vifi >= 0 && vifi < numvifs) 1810 return viftable[vifi].v_lcl_addr.s_addr; 1811 else 1812 return INADDR_ANY; 1813} 1814 1815static void 1816phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1817{ 1818 struct mbuf *mb_copy; 1819 int hlen = ip->ip_hl << 2; 1820 1821 VIF_LOCK_ASSERT(); 1822 1823 /* 1824 * Make a new reference to the packet; make sure that 1825 * the IP header is actually copied, not just referenced, 1826 * so that ip_output() only scribbles on the copy. 
*/
    mb_copy = m_copypacket(m, M_DONTWAIT);
    if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
        mb_copy = m_pullup(mb_copy, hlen);
    if (mb_copy == NULL)
        return;

    /* A rate limit of zero bypasses the token bucket filter */
    if (vifp->v_rate_limit == 0)
        tbf_send_packet(vifp, mb_copy);
    else
        tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
}

/*
 * Send a copy of packet m out of tunnel vif vifp, prefixed with a new
 * outer IP header built from multicast_encap_iphdr.  The copy is silently
 * abandoned if any mbuf allocation fails.
 */
static void
encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
    struct mbuf *mb_copy;
    struct ip *ip_copy;
    int i, len = ip->ip_len;

    VIF_LOCK_ASSERT();

    /* Take care of delayed checksums */
    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
        in_delayed_cksum(m);
        m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
    }

    /*
     * copy the old packet & pullup its IP header into the
     * new mbuf so we can modify it.  Try to fill the new
     * mbuf since if we don't the ethernet driver will.
     */
    MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER);
    if (mb_copy == NULL)
        return;
#ifdef MAC
    mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy);
#endif
    /* Leave room for a link-level header in front of the outer IP header */
    mb_copy->m_data += max_linkhdr;
    mb_copy->m_len = sizeof(multicast_encap_iphdr);

    if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
        m_freem(mb_copy);
        return;
    }
    /* Pull up as much as fits in the header mbuf, at most the whole packet */
    i = MHLEN - M_LEADINGSPACE(mb_copy);
    if (i > len)
        i = len;
    mb_copy = m_pullup(mb_copy, i);
    if (mb_copy == NULL)
        return;
    mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr);

    /*
     * fill in the encapsulating IP header.
     */
    ip_copy = mtod(mb_copy, struct ip *);
    *ip_copy = multicast_encap_iphdr;
    ip_copy->ip_id = ip_newid();
    ip_copy->ip_len += len;
    ip_copy->ip_src = vifp->v_lcl_addr;
    ip_copy->ip_dst = vifp->v_rmt_addr;

    /*
     * turn the encapsulated IP header back into a valid one.
     */
    ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
    --ip->ip_ttl;
    ip->ip_len = htons(ip->ip_len);
    ip->ip_off = htons(ip->ip_off);
    ip->ip_sum = 0;
    /* Temporarily step past the outer header so in_cksum() sees the inner one */
    mb_copy->m_data += sizeof(multicast_encap_iphdr);
    ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
    mb_copy->m_data -= sizeof(multicast_encap_iphdr);

    if (vifp->v_rate_limit == 0)
        tbf_send_packet(vifp, mb_copy);
    else
        tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
}

/*
 * Token bucket filter module
 */

/*
 * Pass packet m (p_len bytes, classified by header ip) through the token
 * bucket of vifp: send it now, queue it, or drop it.
 */
static void
tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len)
{
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    if (p_len > MAX_BKT_SIZE) {         /* drop if packet is too large */
        mrtstat.mrts_pkt2large++;
        m_freem(m);
        return;
    }

    tbf_update_tokens(vifp);

    if (t->tbf_q_len == 0) {            /* queue empty...               */
        if (p_len <= t->tbf_n_tok) {    /* send packet if enough tokens */
            t->tbf_n_tok -= p_len;
            tbf_send_packet(vifp, m);
        } else {                        /* no, queue packet and try later */
            tbf_queue(vifp, m);
            callout_reset(&tbf_reprocess_ch, TBF_REPROCESS,
                tbf_reprocess_q, vifp);
        }
    } else if (t->tbf_q_len < t->tbf_max_q_len) {
        /* finite queue length, so queue pkts and process queue */
        tbf_queue(vifp, m);
        tbf_process_q(vifp);
    } else {
        /* queue full, try to dq and queue and process */
        if (!tbf_dq_sel(vifp, ip)) {
            mrtstat.mrts_q_overflow++;
            m_freem(m);
        } else {
            tbf_queue(vifp, m);
            tbf_process_q(vifp);
        }
    }
}

/*
 * adds a packet to the queue at the interface
 */
static void
tbf_queue(struct vif *vifp, struct mbuf *m)
{
    struct tbf *t = vifp->v_tbf;

    VIF_LOCK_ASSERT();

    if (t->tbf_t == NULL)       /* Queue was empty */
        t->tbf_q = m;
    else                        /* Insert at tail */
        t->tbf_t->m_act = m;

t->tbf_t = m; /* Set new tail pointer */ 1969 1970#ifdef DIAGNOSTIC 1971 /* Make sure we didn't get fed a bogus mbuf */ 1972 if (m->m_act) 1973 panic("tbf_queue: m_act"); 1974#endif 1975 m->m_act = NULL; 1976 1977 t->tbf_q_len++; 1978} 1979 1980/* 1981 * processes the queue at the interface 1982 */ 1983static void 1984tbf_process_q(struct vif *vifp) 1985{ 1986 struct tbf *t = vifp->v_tbf; 1987 1988 VIF_LOCK_ASSERT(); 1989 1990 /* loop through the queue at the interface and send as many packets 1991 * as possible 1992 */ 1993 while (t->tbf_q_len > 0) { 1994 struct mbuf *m = t->tbf_q; 1995 int len = mtod(m, struct ip *)->ip_len; 1996 1997 /* determine if the packet can be sent */ 1998 if (len > t->tbf_n_tok) /* not enough tokens, we are done */ 1999 break; 2000 /* ok, reduce no of tokens, dequeue and send the packet. */ 2001 t->tbf_n_tok -= len; 2002 2003 t->tbf_q = m->m_act; 2004 if (--t->tbf_q_len == 0) 2005 t->tbf_t = NULL; 2006 2007 m->m_act = NULL; 2008 tbf_send_packet(vifp, m); 2009 } 2010} 2011 2012static void 2013tbf_reprocess_q(void *xvifp) 2014{ 2015 struct vif *vifp = xvifp; 2016 2017 if (ip_mrouter == NULL) 2018 return; 2019 VIF_LOCK(); 2020 tbf_update_tokens(vifp); 2021 tbf_process_q(vifp); 2022 if (vifp->v_tbf->tbf_q_len) 2023 callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); 2024 VIF_UNLOCK(); 2025} 2026 2027/* function that will selectively discard a member of the queue 2028 * based on the precedence value and the priority 2029 */ 2030static int 2031tbf_dq_sel(struct vif *vifp, struct ip *ip) 2032{ 2033 u_int p; 2034 struct mbuf *m, *last; 2035 struct mbuf **np; 2036 struct tbf *t = vifp->v_tbf; 2037 2038 VIF_LOCK_ASSERT(); 2039 2040 p = priority(vifp, ip); 2041 2042 np = &t->tbf_q; 2043 last = NULL; 2044 while ((m = *np) != NULL) { 2045 if (p > priority(vifp, mtod(m, struct ip *))) { 2046 *np = m->m_act; 2047 /* If we're removing the last packet, fix the tail pointer */ 2048 if (m == t->tbf_t) 2049 t->tbf_t = last; 2050 
m_freem(m); 2051 /* It's impossible for the queue to be empty, but check anyways. */ 2052 if (--t->tbf_q_len == 0) 2053 t->tbf_t = NULL; 2054 mrtstat.mrts_drop_sel++; 2055 return 1; 2056 } 2057 np = &m->m_act; 2058 last = m; 2059 } 2060 return 0; 2061} 2062 2063static void 2064tbf_send_packet(struct vif *vifp, struct mbuf *m) 2065{ 2066 VIF_LOCK_ASSERT(); 2067 2068 if (vifp->v_flags & VIFF_TUNNEL) /* If tunnel options */ 2069 ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL); 2070 else { 2071 struct ip_moptions imo; 2072 int error; 2073 static struct route ro; /* XXX check this */ 2074 2075 imo.imo_multicast_ifp = vifp->v_ifp; 2076 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 2077 imo.imo_multicast_loop = 1; 2078 imo.imo_multicast_vif = -1; 2079 2080 /* 2081 * Re-entrancy should not be a problem here, because 2082 * the packets that we send out and are looped back at us 2083 * should get rejected because they appear to come from 2084 * the loopback interface, thus preventing looping. 2085 */ 2086 error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL); 2087 2088 if (mrtdebug & DEBUG_XMIT) 2089 log(LOG_DEBUG, "phyint_send on vif %d err %d\n", 2090 (int)(vifp - viftable), error); 2091 } 2092} 2093 2094/* determine the current time and then 2095 * the elapsed time (between the last time and time now) 2096 * in milliseconds & update the no. of tokens in the bucket 2097 */ 2098static void 2099tbf_update_tokens(struct vif *vifp) 2100{ 2101 struct timeval tp; 2102 u_long tm; 2103 struct tbf *t = vifp->v_tbf; 2104 2105 VIF_LOCK_ASSERT(); 2106 2107 GET_TIME(tp); 2108 2109 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 2110 2111 /* 2112 * This formula is actually 2113 * "time in seconds" * "bytes/second". 2114 * 2115 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) 2116 * 2117 * The (1000/1024) was introduced in add_vif to optimize 2118 * this divide into a shift. 
2119 */ 2120 t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; 2121 t->tbf_last_pkt_t = tp; 2122 2123 if (t->tbf_n_tok > MAX_BKT_SIZE) 2124 t->tbf_n_tok = MAX_BKT_SIZE; 2125} 2126 2127static int 2128priority(struct vif *vifp, struct ip *ip) 2129{ 2130 int prio = 50; /* the lowest priority -- default case */ 2131 2132 /* temporary hack; may add general packet classifier some day */ 2133 2134 /* 2135 * The UDP port space is divided up into four priority ranges: 2136 * [0, 16384) : unclassified - lowest priority 2137 * [16384, 32768) : audio - highest priority 2138 * [32768, 49152) : whiteboard - medium priority 2139 * [49152, 65536) : video - low priority 2140 * 2141 * Everything else gets lowest priority. 2142 */ 2143 if (ip->ip_p == IPPROTO_UDP) { 2144 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); 2145 switch (ntohs(udp->uh_dport) & 0xc000) { 2146 case 0x4000: 2147 prio = 70; 2148 break; 2149 case 0x8000: 2150 prio = 60; 2151 break; 2152 case 0xc000: 2153 prio = 55; 2154 break; 2155 } 2156 } 2157 return prio; 2158} 2159 2160/* 2161 * End of token bucket filter modifications 2162 */ 2163 2164static int 2165X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) 2166{ 2167 int error, vifi; 2168 2169 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) 2170 return EOPNOTSUPP; 2171 2172 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 2173 if (error) 2174 return error; 2175 2176 VIF_LOCK(); 2177 2178 if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */ 2179 VIF_UNLOCK(); 2180 return EADDRNOTAVAIL; 2181 } 2182 2183 if (sopt->sopt_name == IP_RSVP_VIF_ON) { 2184 /* Check if socket is available. */ 2185 if (viftable[vifi].v_rsvpd != NULL) { 2186 VIF_UNLOCK(); 2187 return EADDRINUSE; 2188 } 2189 2190 viftable[vifi].v_rsvpd = so; 2191 /* This may seem silly, but we need to be sure we don't over-increment 2192 * the RSVP counter, in case something slips up. 
*/
        if (!viftable[vifi].v_rsvp_on) {
            viftable[vifi].v_rsvp_on = 1;
            rsvp_on++;
        }
    } else { /* must be VIF_OFF */
        /*
         * XXX as an additional consistency check, one could make sure
         * that viftable[vifi].v_rsvpd == so, otherwise passing so as
         * first parameter is pretty useless.
         */
        viftable[vifi].v_rsvpd = NULL;
        /*
         * This may seem silly, but we need to be sure we don't over-decrement
         * the RSVP counter, in case something slips up.
         */
        if (viftable[vifi].v_rsvp_on) {
            viftable[vifi].v_rsvp_on = 0;
            rsvp_on--;
        }
    }
    VIF_UNLOCK();
    return 0;
}

/*
 * Detach socket so from every vif it is registered on as the RSVP socket.
 * Sockets that are not raw IPPROTO_RSVP sockets are ignored.
 */
static void
X_ip_rsvp_force_done(struct socket *so)
{
    int vifi;

    /* Don't bother if it is not the right type of socket. */
    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
        return;

    VIF_LOCK();

    /* The socket may be attached to more than one vif...this
     * is perfectly legal.
     */
    for (vifi = 0; vifi < numvifs; vifi++) {
        if (viftable[vifi].v_rsvpd == so) {
            viftable[vifi].v_rsvpd = NULL;
            /* This may seem silly, but we need to be sure we don't
             * over-decrement the RSVP counter, in case something slips up.
             */
            if (viftable[vifi].v_rsvp_on) {
                viftable[vifi].v_rsvp_on = 0;
                rsvp_on--;
            }
        }
    }

    VIF_UNLOCK();
}

/*
 * Input routine for RSVP packets.  The packet is delivered to the RSVP
 * socket registered on the vif it arrived on; if there is none it goes to
 * the old-style global socket via rip_input(), or is dropped.
 * The mbuf is consumed on every path.
 */
static void
X_rsvp_input(struct mbuf *m, int off)
{
    int vifi;
    struct ip *ip = mtod(m, struct ip *);
    struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
    struct ifnet *ifp;

    if (rsvpdebug)
        printf("rsvp_input: rsvp_on %d\n",rsvp_on);

    /* Can still get packets with rsvp_on = 0 if there is a local member
     * of the group to which the RSVP packet is addressed.  But in this
     * case we want to throw the packet away.
     */
    if (!rsvp_on) {
        m_freem(m);
        return;
    }

    if (rsvpdebug)
        printf("rsvp_input: check vifs\n");

#ifdef DIAGNOSTIC
    M_ASSERTPKTHDR(m);
#endif

    ifp = m->m_pkthdr.rcvif;

    VIF_LOCK();
    /* Find which vif the packet arrived on. */
    for (vifi = 0; vifi < numvifs; vifi++)
        if (viftable[vifi].v_ifp == ifp)
            break;

    if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
        /*
         * Drop the lock here to avoid holding it across rip_input.
         * This could make rsvpdebug printfs wrong.  If you care,
         * record the state of stuff before dropping the lock.
         */
        VIF_UNLOCK();
        /*
         * If the old-style non-vif-associated socket is set,
         * then use it.  Otherwise, drop packet since there
         * is no specific socket for this vif.
         */
        if (ip_rsvpd != NULL) {
            if (rsvpdebug)
                printf("rsvp_input: Sending packet up old-style socket\n");
            rip_input(m, off);  /* xxx */
        } else {
            if (rsvpdebug && vifi == numvifs)
                printf("rsvp_input: Can't find vif for packet.\n");
            else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
                printf("rsvp_input: No socket defined for vif %d\n",vifi);
            m_freem(m);
        }
        return;
    }
    rsvp_src.sin_addr = ip->ip_src;

    if (rsvpdebug && m)
        printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
               m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));

    /* socket_send() frees m itself when it fails */
    if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
        if (rsvpdebug)
            printf("rsvp_input: Failed to append to socket\n");
    } else {
        if (rsvpdebug)
            printf("rsvp_input: send packet up\n");
    }
    VIF_UNLOCK();
}

/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
2333#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) 2334 2335static uint32_t 2336compute_bw_meter_flags(struct bw_upcall *req) 2337{ 2338 uint32_t flags = 0; 2339 2340 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2341 flags |= BW_METER_UNIT_PACKETS; 2342 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2343 flags |= BW_METER_UNIT_BYTES; 2344 if (req->bu_flags & BW_UPCALL_GEQ) 2345 flags |= BW_METER_GEQ; 2346 if (req->bu_flags & BW_UPCALL_LEQ) 2347 flags |= BW_METER_LEQ; 2348 2349 return flags; 2350} 2351 2352/* 2353 * Add a bw_meter entry 2354 */ 2355static int 2356add_bw_upcall(struct bw_upcall *req) 2357{ 2358 struct mfc *mfc; 2359 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2360 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2361 struct timeval now; 2362 struct bw_meter *x; 2363 uint32_t flags; 2364 2365 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2366 return EOPNOTSUPP; 2367 2368 /* Test if the flags are valid */ 2369 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2370 return EINVAL; 2371 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2372 return EINVAL; 2373 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2374 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2375 return EINVAL; 2376 2377 /* Test if the threshold time interval is valid */ 2378 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2379 return EINVAL; 2380 2381 flags = compute_bw_meter_flags(req); 2382 2383 /* 2384 * Find if we have already same bw_meter entry 2385 */ 2386 MFC_LOCK(); 2387 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2388 if (mfc == NULL) { 2389 MFC_UNLOCK(); 2390 return EADDRNOTAVAIL; 2391 } 2392 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2393 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2394 &req->bu_threshold.b_time, ==)) && 2395 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2396 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2397 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 
2398 MFC_UNLOCK(); 2399 return 0; /* XXX Already installed */ 2400 } 2401 } 2402 2403 /* Allocate the new bw_meter entry */ 2404 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 2405 if (x == NULL) { 2406 MFC_UNLOCK(); 2407 return ENOBUFS; 2408 } 2409 2410 /* Set the new bw_meter entry */ 2411 x->bm_threshold.b_time = req->bu_threshold.b_time; 2412 GET_TIME(now); 2413 x->bm_start_time = now; 2414 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2415 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2416 x->bm_measured.b_packets = 0; 2417 x->bm_measured.b_bytes = 0; 2418 x->bm_flags = flags; 2419 x->bm_time_next = NULL; 2420 x->bm_time_hash = BW_METER_BUCKETS; 2421 2422 /* Add the new bw_meter entry to the front of entries for this MFC */ 2423 x->bm_mfc = mfc; 2424 x->bm_mfc_next = mfc->mfc_bw_meter; 2425 mfc->mfc_bw_meter = x; 2426 schedule_bw_meter(x, &now); 2427 MFC_UNLOCK(); 2428 2429 return 0; 2430} 2431 2432static void 2433free_bw_list(struct bw_meter *list) 2434{ 2435 while (list != NULL) { 2436 struct bw_meter *x = list; 2437 2438 list = list->bm_mfc_next; 2439 unschedule_bw_meter(x); 2440 free(x, M_BWMETER); 2441 } 2442} 2443 2444/* 2445 * Delete one or multiple bw_meter entries 2446 */ 2447static int 2448del_bw_upcall(struct bw_upcall *req) 2449{ 2450 struct mfc *mfc; 2451 struct bw_meter *x; 2452 2453 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2454 return EOPNOTSUPP; 2455 2456 MFC_LOCK(); 2457 /* Find the corresponding MFC entry */ 2458 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2459 if (mfc == NULL) { 2460 MFC_UNLOCK(); 2461 return EADDRNOTAVAIL; 2462 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2463 /* 2464 * Delete all bw_meter entries for this mfc 2465 */ 2466 struct bw_meter *list; 2467 2468 list = mfc->mfc_bw_meter; 2469 mfc->mfc_bw_meter = NULL; 2470 free_bw_list(list); 2471 MFC_UNLOCK(); 2472 return 0; 2473 } else { /* Delete a single bw_meter entry */ 2474 struct bw_meter *prev; 2475 uint32_t flags = 
0; 2476 2477 flags = compute_bw_meter_flags(req); 2478 2479 /* Find the bw_meter entry to delete */ 2480 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2481 prev = x, x = x->bm_mfc_next) { 2482 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2483 &req->bu_threshold.b_time, ==)) && 2484 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2485 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2486 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2487 break; 2488 } 2489 if (x != NULL) { /* Delete entry from the list for this MFC */ 2490 if (prev != NULL) 2491 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 2492 else 2493 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 2494 2495 unschedule_bw_meter(x); 2496 MFC_UNLOCK(); 2497 /* Free the bw_meter entry */ 2498 free(x, M_BWMETER); 2499 return 0; 2500 } else { 2501 MFC_UNLOCK(); 2502 return EINVAL; 2503 } 2504 } 2505 /* NOTREACHED */ 2506} 2507 2508/* 2509 * Perform bandwidth measurement processing that may result in an upcall 2510 */ 2511static void 2512bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2513{ 2514 struct timeval delta; 2515 2516 MFC_LOCK_ASSERT(); 2517 2518 delta = *nowp; 2519 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2520 2521 if (x->bm_flags & BW_METER_GEQ) { 2522 /* 2523 * Processing for ">=" type of bw_meter entry 2524 */ 2525 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2526 /* Reset the bw_meter entry */ 2527 x->bm_start_time = *nowp; 2528 x->bm_measured.b_packets = 0; 2529 x->bm_measured.b_bytes = 0; 2530 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2531 } 2532 2533 /* Record that a packet is received */ 2534 x->bm_measured.b_packets++; 2535 x->bm_measured.b_bytes += plen; 2536 2537 /* 2538 * Test if we should deliver an upcall 2539 */ 2540 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2541 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2542 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 2543 
((x->bm_flags & BW_METER_UNIT_BYTES) && 2544 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 2545 /* Prepare an upcall for delivery */ 2546 bw_meter_prepare_upcall(x, nowp); 2547 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2548 } 2549 } 2550 } else if (x->bm_flags & BW_METER_LEQ) { 2551 /* 2552 * Processing for "<=" type of bw_meter entry 2553 */ 2554 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2555 /* 2556 * We are behind time with the multicast forwarding table 2557 * scanning for "<=" type of bw_meter entries, so test now 2558 * if we should deliver an upcall. 2559 */ 2560 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2561 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2562 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2563 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2564 /* Prepare an upcall for delivery */ 2565 bw_meter_prepare_upcall(x, nowp); 2566 } 2567 /* Reschedule the bw_meter entry */ 2568 unschedule_bw_meter(x); 2569 schedule_bw_meter(x, nowp); 2570 } 2571 2572 /* Record that a packet is received */ 2573 x->bm_measured.b_packets++; 2574 x->bm_measured.b_bytes += plen; 2575 2576 /* 2577 * Test if we should restart the measuring interval 2578 */ 2579 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2580 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2581 (x->bm_flags & BW_METER_UNIT_BYTES && 2582 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2583 /* Don't restart the measuring interval */ 2584 } else { 2585 /* Do restart the measuring interval */ 2586 /* 2587 * XXX: note that we don't unschedule and schedule, because this 2588 * might be too much overhead per packet. Instead, when we process 2589 * all entries for a given timer hash bin, we check whether it is 2590 * really a timeout. If not, we reschedule at that time. 
2591 */ 2592 x->bm_start_time = *nowp; 2593 x->bm_measured.b_packets = 0; 2594 x->bm_measured.b_bytes = 0; 2595 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2596 } 2597 } 2598} 2599 2600/* 2601 * Prepare a bandwidth-related upcall 2602 */ 2603static void 2604bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2605{ 2606 struct timeval delta; 2607 struct bw_upcall *u; 2608 2609 MFC_LOCK_ASSERT(); 2610 2611 /* 2612 * Compute the measured time interval 2613 */ 2614 delta = *nowp; 2615 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2616 2617 /* 2618 * If there are too many pending upcalls, deliver them now 2619 */ 2620 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2621 bw_upcalls_send(); 2622 2623 /* 2624 * Set the bw_upcall entry 2625 */ 2626 u = &bw_upcalls[bw_upcalls_n++]; 2627 u->bu_src = x->bm_mfc->mfc_origin; 2628 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2629 u->bu_threshold.b_time = x->bm_threshold.b_time; 2630 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2631 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2632 u->bu_measured.b_time = delta; 2633 u->bu_measured.b_packets = x->bm_measured.b_packets; 2634 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2635 u->bu_flags = 0; 2636 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2637 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2638 if (x->bm_flags & BW_METER_UNIT_BYTES) 2639 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2640 if (x->bm_flags & BW_METER_GEQ) 2641 u->bu_flags |= BW_UPCALL_GEQ; 2642 if (x->bm_flags & BW_METER_LEQ) 2643 u->bu_flags |= BW_UPCALL_LEQ; 2644} 2645 2646/* 2647 * Send the pending bandwidth-related upcalls 2648 */ 2649static void 2650bw_upcalls_send(void) 2651{ 2652 struct mbuf *m; 2653 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2654 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2655 static struct igmpmsg igmpmsg = { 0, /* unused1 */ 2656 0, /* unused2 */ 2657 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2658 0, /* im_mbz */ 2659 0, /* im_vif */ 2660 0, /* unused3 */ 2661 { 0 }, /* im_src */ 2662 
{ 0 } }; /* im_dst */ 2663 2664 MFC_LOCK_ASSERT(); 2665 2666 if (bw_upcalls_n == 0) 2667 return; /* No pending upcalls */ 2668 2669 bw_upcalls_n = 0; 2670 2671 /* 2672 * Allocate a new mbuf, initialize it with the header and 2673 * the payload for the pending calls. 2674 */ 2675 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2676 if (m == NULL) { 2677 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2678 return; 2679 } 2680 2681 m->m_len = m->m_pkthdr.len = 0; 2682 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2683 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2684 2685 /* 2686 * Send the upcalls 2687 * XXX do we need to set the address in k_igmpsrc ? 2688 */ 2689 mrtstat.mrts_upcalls++; 2690 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2691 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2692 ++mrtstat.mrts_upq_sockfull; 2693 } 2694} 2695 2696/* 2697 * Compute the timeout hash value for the bw_meter entries 2698 */ 2699#define BW_METER_TIMEHASH(bw_meter, hash) \ 2700 do { \ 2701 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2702 \ 2703 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2704 (hash) = next_timeval.tv_sec; \ 2705 if (next_timeval.tv_usec) \ 2706 (hash)++; /* XXX: make sure we don't timeout early */ \ 2707 (hash) %= BW_METER_BUCKETS; \ 2708 } while (0) 2709 2710/* 2711 * Schedule a timer to process periodically bw_meter entry of type "<=" 2712 * by linking the entry in the proper hash bucket. 
2713 */ 2714static void 2715schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2716{ 2717 int time_hash; 2718 2719 MFC_LOCK_ASSERT(); 2720 2721 if (!(x->bm_flags & BW_METER_LEQ)) 2722 return; /* XXX: we schedule timers only for "<=" entries */ 2723 2724 /* 2725 * Reset the bw_meter entry 2726 */ 2727 x->bm_start_time = *nowp; 2728 x->bm_measured.b_packets = 0; 2729 x->bm_measured.b_bytes = 0; 2730 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2731 2732 /* 2733 * Compute the timeout hash value and insert the entry 2734 */ 2735 BW_METER_TIMEHASH(x, time_hash); 2736 x->bm_time_next = bw_meter_timers[time_hash]; 2737 bw_meter_timers[time_hash] = x; 2738 x->bm_time_hash = time_hash; 2739} 2740 2741/* 2742 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2743 * by removing the entry from the proper hash bucket. 2744 */ 2745static void 2746unschedule_bw_meter(struct bw_meter *x) 2747{ 2748 int time_hash; 2749 struct bw_meter *prev, *tmp; 2750 2751 MFC_LOCK_ASSERT(); 2752 2753 if (!(x->bm_flags & BW_METER_LEQ)) 2754 return; /* XXX: we schedule timers only for "<=" entries */ 2755 2756 /* 2757 * Compute the timeout hash value and delete the entry 2758 */ 2759 time_hash = x->bm_time_hash; 2760 if (time_hash >= BW_METER_BUCKETS) 2761 return; /* Entry was not scheduled */ 2762 2763 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2764 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2765 if (tmp == x) 2766 break; 2767 2768 if (tmp == NULL) 2769 panic("unschedule_bw_meter: bw_meter entry not found"); 2770 2771 if (prev != NULL) 2772 prev->bm_time_next = x->bm_time_next; 2773 else 2774 bw_meter_timers[time_hash] = x->bm_time_next; 2775 2776 x->bm_time_next = NULL; 2777 x->bm_time_hash = BW_METER_BUCKETS; 2778} 2779 2780 2781/* 2782 * Process all "<=" type of bw_meter that should be processed now, 2783 * and for each entry prepare an upcall if necessary. Each processed 2784 * entry is rescheduled again for the (periodic) processing. 
2785 * 2786 * This is run periodically (once per second normally). On each round, 2787 * all the potentially matching entries are in the hash slot that we are 2788 * looking at. 2789 */ 2790static void 2791bw_meter_process() 2792{ 2793 static uint32_t last_tv_sec; /* last time we processed this */ 2794 2795 uint32_t loops; 2796 int i; 2797 struct timeval now, process_endtime; 2798 2799 GET_TIME(now); 2800 if (last_tv_sec == now.tv_sec) 2801 return; /* nothing to do */ 2802 2803 loops = now.tv_sec - last_tv_sec; 2804 last_tv_sec = now.tv_sec; 2805 if (loops > BW_METER_BUCKETS) 2806 loops = BW_METER_BUCKETS; 2807 2808 MFC_LOCK(); 2809 /* 2810 * Process all bins of bw_meter entries from the one after the last 2811 * processed to the current one. On entry, i points to the last bucket 2812 * visited, so we need to increment i at the beginning of the loop. 2813 */ 2814 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2815 struct bw_meter *x, *tmp_list; 2816 2817 if (++i >= BW_METER_BUCKETS) 2818 i = 0; 2819 2820 /* Disconnect the list of bw_meter entries from the bin */ 2821 tmp_list = bw_meter_timers[i]; 2822 bw_meter_timers[i] = NULL; 2823 2824 /* Process the list of bw_meter entries */ 2825 while (tmp_list != NULL) { 2826 x = tmp_list; 2827 tmp_list = tmp_list->bm_time_next; 2828 2829 /* Test if the time interval is over */ 2830 process_endtime = x->bm_start_time; 2831 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2832 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2833 /* Not yet: reschedule, but don't reset */ 2834 int time_hash; 2835 2836 BW_METER_TIMEHASH(x, time_hash); 2837 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2838 /* 2839 * XXX: somehow the bin processing is a bit ahead of time. 2840 * Put the entry in the next bin. 
2841 */ 2842 if (++time_hash >= BW_METER_BUCKETS) 2843 time_hash = 0; 2844 } 2845 x->bm_time_next = bw_meter_timers[time_hash]; 2846 bw_meter_timers[time_hash] = x; 2847 x->bm_time_hash = time_hash; 2848 2849 continue; 2850 } 2851 2852 /* 2853 * Test if we should deliver an upcall 2854 */ 2855 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2856 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2857 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2858 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2859 /* Prepare an upcall for delivery */ 2860 bw_meter_prepare_upcall(x, &now); 2861 } 2862 2863 /* 2864 * Reschedule for next processing 2865 */ 2866 schedule_bw_meter(x, &now); 2867 } 2868 } 2869 2870 /* Send all upcalls that are pending delivery */ 2871 bw_upcalls_send(); 2872 2873 MFC_UNLOCK(); 2874} 2875 2876/* 2877 * A periodic function for sending all upcalls that are pending delivery 2878 */ 2879static void 2880expire_bw_upcalls_send(void *unused) 2881{ 2882 MFC_LOCK(); 2883 bw_upcalls_send(); 2884 MFC_UNLOCK(); 2885 2886 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2887 expire_bw_upcalls_send, NULL); 2888} 2889 2890/* 2891 * A periodic function for periodic scanning of the multicast forwarding 2892 * table for processing all "<=" bw_meter entries. 
2893 */ 2894static void 2895expire_bw_meter_process(void *unused) 2896{ 2897 if (mrt_api_config & MRT_MFC_BW_UPCALL) 2898 bw_meter_process(); 2899 2900 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 2901} 2902 2903/* 2904 * End of bandwidth monitoring code 2905 */ 2906 2907#ifdef PIM 2908/* 2909 * Send the packet up to the user daemon, or eventually do kernel encapsulation 2910 * 2911 */ 2912static int 2913pim_register_send(struct ip *ip, struct vif *vifp, 2914 struct mbuf *m, struct mfc *rt) 2915{ 2916 struct mbuf *mb_copy, *mm; 2917 2918 if (mrtdebug & DEBUG_PIM) 2919 log(LOG_DEBUG, "pim_register_send: "); 2920 2921 mb_copy = pim_register_prepare(ip, m); 2922 if (mb_copy == NULL) 2923 return ENOBUFS; 2924 2925 /* 2926 * Send all the fragments. Note that the mbuf for each fragment 2927 * is freed by the sending machinery. 2928 */ 2929 for (mm = mb_copy; mm; mm = mb_copy) { 2930 mb_copy = mm->m_nextpkt; 2931 mm->m_nextpkt = 0; 2932 mm = m_pullup(mm, sizeof(struct ip)); 2933 if (mm != NULL) { 2934 ip = mtod(mm, struct ip *); 2935 if ((mrt_api_config & MRT_MFC_RP) && 2936 (rt->mfc_rp.s_addr != INADDR_ANY)) { 2937 pim_register_send_rp(ip, vifp, mm, rt); 2938 } else { 2939 pim_register_send_upcall(ip, vifp, mm, rt); 2940 } 2941 } 2942 } 2943 2944 return 0; 2945} 2946 2947/* 2948 * Return a copy of the data packet that is ready for PIM Register 2949 * encapsulation. 2950 * XXX: Note that in the returned copy the IP header is a valid one. 2951 */ 2952static struct mbuf * 2953pim_register_prepare(struct ip *ip, struct mbuf *m) 2954{ 2955 struct mbuf *mb_copy = NULL; 2956 int mtu; 2957 2958 /* Take care of delayed checksums */ 2959 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 2960 in_delayed_cksum(m); 2961 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 2962 } 2963 2964 /* 2965 * Copy the old packet & pullup its IP header into the 2966 * new mbuf so we can modify it. 
2967 */ 2968 mb_copy = m_copypacket(m, M_DONTWAIT); 2969 if (mb_copy == NULL) 2970 return NULL; 2971 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 2972 if (mb_copy == NULL) 2973 return NULL; 2974 2975 /* take care of the TTL */ 2976 ip = mtod(mb_copy, struct ip *); 2977 --ip->ip_ttl; 2978 2979 /* Compute the MTU after the PIM Register encapsulation */ 2980 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 2981 2982 if (ip->ip_len <= mtu) { 2983 /* Turn the IP header into a valid one */ 2984 ip->ip_len = htons(ip->ip_len); 2985 ip->ip_off = htons(ip->ip_off); 2986 ip->ip_sum = 0; 2987 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 2988 } else { 2989 /* Fragment the packet */ 2990 if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { 2991 m_freem(mb_copy); 2992 return NULL; 2993 } 2994 } 2995 return mb_copy; 2996} 2997 2998/* 2999 * Send an upcall with the data packet to the user-level process. 3000 */ 3001static int 3002pim_register_send_upcall(struct ip *ip, struct vif *vifp, 3003 struct mbuf *mb_copy, struct mfc *rt) 3004{ 3005 struct mbuf *mb_first; 3006 int len = ntohs(ip->ip_len); 3007 struct igmpmsg *im; 3008 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 3009 3010 VIF_LOCK_ASSERT(); 3011 3012 /* 3013 * Add a new mbuf with an upcall header 3014 */ 3015 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3016 if (mb_first == NULL) { 3017 m_freem(mb_copy); 3018 return ENOBUFS; 3019 } 3020 mb_first->m_data += max_linkhdr; 3021 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 3022 mb_first->m_len = sizeof(struct igmpmsg); 3023 mb_first->m_next = mb_copy; 3024 3025 /* Send message to routing daemon */ 3026 im = mtod(mb_first, struct igmpmsg *); 3027 im->im_msgtype = IGMPMSG_WHOLEPKT; 3028 im->im_mbz = 0; 3029 im->im_vif = vifp - viftable; 3030 im->im_src = ip->ip_src; 3031 im->im_dst = ip->ip_dst; 3032 3033 k_igmpsrc.sin_addr = ip->ip_src; 3034 3035 mrtstat.mrts_upcalls++; 3036 3037 if (socket_send(ip_mrouter, mb_first, 
&k_igmpsrc) < 0) { 3038 if (mrtdebug & DEBUG_PIM) 3039 log(LOG_WARNING, 3040 "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); 3041 ++mrtstat.mrts_upq_sockfull; 3042 return ENOBUFS; 3043 } 3044 3045 /* Keep statistics */ 3046 pimstat.pims_snd_registers_msgs++; 3047 pimstat.pims_snd_registers_bytes += len; 3048 3049 return 0; 3050} 3051 3052/* 3053 * Encapsulate the data packet in PIM Register message and send it to the RP. 3054 */ 3055static int 3056pim_register_send_rp(struct ip *ip, struct vif *vifp, 3057 struct mbuf *mb_copy, struct mfc *rt) 3058{ 3059 struct mbuf *mb_first; 3060 struct ip *ip_outer; 3061 struct pim_encap_pimhdr *pimhdr; 3062 int len = ntohs(ip->ip_len); 3063 vifi_t vifi = rt->mfc_parent; 3064 3065 VIF_LOCK_ASSERT(); 3066 3067 if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) { 3068 m_freem(mb_copy); 3069 return EADDRNOTAVAIL; /* The iif vif is invalid */ 3070 } 3071 3072 /* 3073 * Add a new mbuf with the encapsulating header 3074 */ 3075 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3076 if (mb_first == NULL) { 3077 m_freem(mb_copy); 3078 return ENOBUFS; 3079 } 3080 mb_first->m_data += max_linkhdr; 3081 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3082 mb_first->m_next = mb_copy; 3083 3084 mb_first->m_pkthdr.len = len + mb_first->m_len; 3085 3086 /* 3087 * Fill in the encapsulating IP and PIM header 3088 */ 3089 ip_outer = mtod(mb_first, struct ip *); 3090 *ip_outer = pim_encap_iphdr; 3091 ip_outer->ip_id = ip_newid(); 3092 ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3093 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 3094 ip_outer->ip_dst = rt->mfc_rp; 3095 /* 3096 * Copy the inner header TOS to the outer header, and take care of the 3097 * IP_DF bit. 
3098 */ 3099 ip_outer->ip_tos = ip->ip_tos; 3100 if (ntohs(ip->ip_off) & IP_DF) 3101 ip_outer->ip_off |= IP_DF; 3102 pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer 3103 + sizeof(pim_encap_iphdr)); 3104 *pimhdr = pim_encap_pimhdr; 3105 /* If the iif crosses a border, set the Border-bit */ 3106 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 3107 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 3108 3109 mb_first->m_data += sizeof(pim_encap_iphdr); 3110 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 3111 mb_first->m_data -= sizeof(pim_encap_iphdr); 3112 3113 if (vifp->v_rate_limit == 0) 3114 tbf_send_packet(vifp, mb_first); 3115 else 3116 tbf_control(vifp, mb_first, ip, ip_outer->ip_len); 3117 3118 /* Keep statistics */ 3119 pimstat.pims_snd_registers_msgs++; 3120 pimstat.pims_snd_registers_bytes += len; 3121 3122 return 0; 3123} 3124 3125/* 3126 * PIM-SMv2 and PIM-DM messages processing. 3127 * Receives and verifies the PIM control messages, and passes them 3128 * up to the listening socket, using rip_input(). 3129 * The only message with special processing is the PIM_REGISTER message 3130 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 3131 * is passed to if_simloop(). 
3132 */ 3133void 3134pim_input(struct mbuf *m, int off) 3135{ 3136 struct ip *ip = mtod(m, struct ip *); 3137 struct pim *pim; 3138 int minlen; 3139 int datalen = ip->ip_len; 3140 int ip_tos; 3141 int iphlen = off; 3142 3143 /* Keep statistics */ 3144 pimstat.pims_rcv_total_msgs++; 3145 pimstat.pims_rcv_total_bytes += datalen; 3146 3147 /* 3148 * Validate lengths 3149 */ 3150 if (datalen < PIM_MINLEN) { 3151 pimstat.pims_rcv_tooshort++; 3152 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", 3153 datalen, (u_long)ip->ip_src.s_addr); 3154 m_freem(m); 3155 return; 3156 } 3157 3158 /* 3159 * If the packet is at least as big as a REGISTER, go agead 3160 * and grab the PIM REGISTER header size, to avoid another 3161 * possible m_pullup() later. 3162 * 3163 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 3164 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 3165 */ 3166 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); 3167 /* 3168 * Get the IP and PIM headers in contiguous memory, and 3169 * possibly the PIM REGISTER header. 3170 */ 3171 if ((m->m_flags & M_EXT || m->m_len < minlen) && 3172 (m = m_pullup(m, minlen)) == 0) { 3173 log(LOG_ERR, "pim_input: m_pullup failure\n"); 3174 return; 3175 } 3176 /* m_pullup() may have given us a new mbuf so reset ip. */ 3177 ip = mtod(m, struct ip *); 3178 ip_tos = ip->ip_tos; 3179 3180 /* adjust mbuf to point to the PIM header */ 3181 m->m_data += iphlen; 3182 m->m_len -= iphlen; 3183 pim = mtod(m, struct pim *); 3184 3185 /* 3186 * Validate checksum. If PIM REGISTER, exclude the data packet. 3187 * 3188 * XXX: some older PIMv2 implementations don't make this distinction, 3189 * so for compatibility reason perform the checksum over part of the 3190 * message, and if error, then over the whole message. 
3191 */ 3192 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { 3193 /* do nothing, checksum okay */ 3194 } else if (in_cksum(m, datalen)) { 3195 pimstat.pims_rcv_badsum++; 3196 if (mrtdebug & DEBUG_PIM) 3197 log(LOG_DEBUG, "pim_input: invalid checksum"); 3198 m_freem(m); 3199 return; 3200 } 3201 3202 /* PIM version check */ 3203 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 3204 pimstat.pims_rcv_badversion++; 3205 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", 3206 PIM_VT_V(pim->pim_vt), PIM_VERSION); 3207 m_freem(m); 3208 return; 3209 } 3210 3211 /* restore mbuf back to the outer IP */ 3212 m->m_data -= iphlen; 3213 m->m_len += iphlen; 3214 3215 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 3216 /* 3217 * Since this is a REGISTER, we'll make a copy of the register 3218 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 3219 * routing daemon. 3220 */ 3221 struct sockaddr_in dst = { sizeof(dst), AF_INET }; 3222 struct mbuf *mcp; 3223 struct ip *encap_ip; 3224 u_int32_t *reghdr; 3225 struct ifnet *vifp; 3226 3227 VIF_LOCK(); 3228 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 3229 VIF_UNLOCK(); 3230 if (mrtdebug & DEBUG_PIM) 3231 log(LOG_DEBUG, 3232 "pim_input: register vif not set: %d\n", reg_vif_num); 3233 m_freem(m); 3234 return; 3235 } 3236 /* XXX need refcnt? 
*/ 3237 vifp = viftable[reg_vif_num].v_ifp; 3238 VIF_UNLOCK(); 3239 3240 /* 3241 * Validate length 3242 */ 3243 if (datalen < PIM_REG_MINLEN) { 3244 pimstat.pims_rcv_tooshort++; 3245 pimstat.pims_rcv_badregisters++; 3246 log(LOG_ERR, 3247 "pim_input: register packet size too small %d from %lx\n", 3248 datalen, (u_long)ip->ip_src.s_addr); 3249 m_freem(m); 3250 return; 3251 } 3252 3253 reghdr = (u_int32_t *)(pim + 1); 3254 encap_ip = (struct ip *)(reghdr + 1); 3255 3256 if (mrtdebug & DEBUG_PIM) { 3257 log(LOG_DEBUG, 3258 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", 3259 (u_long)ntohl(encap_ip->ip_src.s_addr), 3260 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3261 ntohs(encap_ip->ip_len)); 3262 } 3263 3264 /* verify the version number of the inner packet */ 3265 if (encap_ip->ip_v != IPVERSION) { 3266 pimstat.pims_rcv_badregisters++; 3267 if (mrtdebug & DEBUG_PIM) { 3268 log(LOG_DEBUG, "pim_input: invalid IP version (%d) " 3269 "of the inner packet\n", encap_ip->ip_v); 3270 } 3271 m_freem(m); 3272 return; 3273 } 3274 3275 /* verify the inner packet is destined to a mcast group */ 3276 if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { 3277 pimstat.pims_rcv_badregisters++; 3278 if (mrtdebug & DEBUG_PIM) 3279 log(LOG_DEBUG, 3280 "pim_input: inner packet of register is not " 3281 "multicast %lx\n", 3282 (u_long)ntohl(encap_ip->ip_dst.s_addr)); 3283 m_freem(m); 3284 return; 3285 } 3286 3287 /* If a NULL_REGISTER, pass it to the daemon */ 3288 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 3289 goto pim_input_to_daemon; 3290 3291 /* 3292 * Copy the TOS from the outer IP header to the inner IP header. 3293 */ 3294 if (encap_ip->ip_tos != ip_tos) { 3295 /* Outer TOS -> inner TOS */ 3296 encap_ip->ip_tos = ip_tos; 3297 /* Recompute the inner header checksum. Sigh... 
*/ 3298 3299 /* adjust mbuf to point to the inner IP header */ 3300 m->m_data += (iphlen + PIM_MINLEN); 3301 m->m_len -= (iphlen + PIM_MINLEN); 3302 3303 encap_ip->ip_sum = 0; 3304 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 3305 3306 /* restore mbuf to point back to the outer IP header */ 3307 m->m_data -= (iphlen + PIM_MINLEN); 3308 m->m_len += (iphlen + PIM_MINLEN); 3309 } 3310 3311 /* 3312 * Decapsulate the inner IP packet and loopback to forward it 3313 * as a normal multicast packet. Also, make a copy of the 3314 * outer_iphdr + pimhdr + reghdr + encap_iphdr 3315 * to pass to the daemon later, so it can take the appropriate 3316 * actions (e.g., send back PIM_REGISTER_STOP). 3317 * XXX: here m->m_data points to the outer IP header. 3318 */ 3319 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); 3320 if (mcp == NULL) { 3321 log(LOG_ERR, 3322 "pim_input: pim register: could not copy register head\n"); 3323 m_freem(m); 3324 return; 3325 } 3326 3327 /* Keep statistics */ 3328 /* XXX: registers_bytes include only the encap. mcast pkt */ 3329 pimstat.pims_rcv_registers_msgs++; 3330 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); 3331 3332 /* 3333 * forward the inner ip packet; point m_data at the inner ip. 3334 */ 3335 m_adj(m, iphlen + PIM_MINLEN); 3336 3337 if (mrtdebug & DEBUG_PIM) { 3338 log(LOG_DEBUG, 3339 "pim_input: forwarding decapsulated register: " 3340 "src %lx, dst %lx, vif %d\n", 3341 (u_long)ntohl(encap_ip->ip_src.s_addr), 3342 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3343 reg_vif_num); 3344 } 3345 /* NB: vifp was collected above; can it change on us? */ 3346 if_simloop(vifp, m, dst.sin_family, 0); 3347 3348 /* prepare the register head to send to the mrouting daemon */ 3349 m = mcp; 3350 } 3351 3352pim_input_to_daemon: 3353 /* 3354 * Pass the PIM message up to the daemon; if it is a Register message, 3355 * pass the 'head' only up to the daemon. 
This includes the 3356 * outer IP header, PIM header, PIM-Register header and the 3357 * inner IP header. 3358 * XXX: the outer IP header pkt size of a Register is not adjust to 3359 * reflect the fact that the inner multicast data is truncated. 3360 */ 3361 rip_input(m, iphlen); 3362 3363 return; 3364} 3365#endif /* PIM */ 3366 3367static int 3368ip_mroute_modevent(module_t mod, int type, void *unused) 3369{ 3370 switch (type) { 3371 case MOD_LOAD: 3372 mtx_init(&mrouter_mtx, "mrouter initialization", NULL, MTX_DEF); 3373 MFC_LOCK_INIT(); 3374 VIF_LOCK_INIT(); 3375 ip_mrouter_reset(); 3376 ip_mcast_src = X_ip_mcast_src; 3377 ip_mforward = X_ip_mforward; 3378 ip_mrouter_done = X_ip_mrouter_done; 3379 ip_mrouter_get = X_ip_mrouter_get; 3380 ip_mrouter_set = X_ip_mrouter_set; 3381 ip_rsvp_force_done = X_ip_rsvp_force_done; 3382 ip_rsvp_vif = X_ip_rsvp_vif; 3383 legal_vif_num = X_legal_vif_num; 3384 mrt_ioctl = X_mrt_ioctl; 3385 rsvp_input_p = X_rsvp_input; 3386 break; 3387 3388 case MOD_UNLOAD: 3389 /* 3390 * Typically module unload happens after the user-level 3391 * process has shutdown the kernel services (the check 3392 * below insures someone can't just yank the module out 3393 * from under a running process). But if the module is 3394 * just loaded and then unloaded w/o starting up a user 3395 * process we still need to cleanup. 
3396 */ 3397 if (ip_mrouter) 3398 return EINVAL; 3399 3400 X_ip_mrouter_done(); 3401 ip_mcast_src = NULL; 3402 ip_mforward = NULL; 3403 ip_mrouter_done = NULL; 3404 ip_mrouter_get = NULL; 3405 ip_mrouter_set = NULL; 3406 ip_rsvp_force_done = NULL; 3407 ip_rsvp_vif = NULL; 3408 legal_vif_num = NULL; 3409 mrt_ioctl = NULL; 3410 rsvp_input_p = NULL; 3411 VIF_LOCK_DESTROY(); 3412 MFC_LOCK_DESTROY(); 3413 mtx_destroy(&mrouter_mtx); 3414 break; 3415 default: 3416 return EOPNOTSUPP; 3417 } 3418 return 0; 3419} 3420 3421static moduledata_t ip_mroutemod = { 3422 "ip_mroute", 3423 ip_mroute_modevent, 3424 0 3425}; 3426DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); 3427