ip_mroute.c revision 121700
1/* 2 * IP multicast forwarding procedures 3 * 4 * Written by David Waitzman, BBN Labs, August 1988. 5 * Modified by Steve Deering, Stanford, February 1989. 6 * Modified by Mark J. Steiglitz, Stanford, May, 1991 7 * Modified by Van Jacobson, LBL, January 1993 8 * Modified by Ajit Thyagarajan, PARC, August 1993 9 * Modified by Bill Fenner, PARC, April 1995 10 * Modified by Ahmed Helmy, SGI, June 1996 11 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 12 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 13 * Modified by Hitoshi Asaeda, WIDE, August 2000 14 * Modified by Pavlin Radoslavov, ICSI, October 2002 15 * 16 * MROUTING Revision: 3.5 17 * and PIM-SMv2 and PIM-DM support, advanced API support, 18 * bandwidth metering and signaling 19 * 20 * $FreeBSD: head/sys/netinet/ip_mroute.c 121700 2003-10-29 19:15:00Z sam $ 21 */ 22 23#include "opt_mac.h" 24#include "opt_mrouting.h" 25#include "opt_random_ip_id.h" 26 27#ifdef PIM 28#define _PIM_VT 1 29#endif 30 31#include <sys/param.h> 32#include <sys/kernel.h> 33#include <sys/lock.h> 34#include <sys/mac.h> 35#include <sys/malloc.h> 36#include <sys/mbuf.h> 37#include <sys/protosw.h> 38#include <sys/signalvar.h> 39#include <sys/socket.h> 40#include <sys/socketvar.h> 41#include <sys/sockio.h> 42#include <sys/sx.h> 43#include <sys/sysctl.h> 44#include <sys/syslog.h> 45#include <sys/systm.h> 46#include <sys/time.h> 47#include <net/if.h> 48#include <net/netisr.h> 49#include <net/route.h> 50#include <netinet/in.h> 51#include <netinet/igmp.h> 52#include <netinet/in_systm.h> 53#include <netinet/in_var.h> 54#include <netinet/ip.h> 55#include <netinet/ip_encap.h> 56#include <netinet/ip_mroute.h> 57#include <netinet/ip_var.h> 58#ifdef PIM 59#include <netinet/pim.h> 60#include <netinet/pim_var.h> 61#endif 62#include <netinet/udp.h> 63#include <machine/in_cksum.h> 64 65/* 66 * Control debugging code for rsvp and multicast routing code. 67 * Can only set them with the debugger. 
68 */ 69static u_int rsvpdebug; /* non-zero enables debugging */ 70 71static u_int mrtdebug; /* any set of the flags below */ 72#define DEBUG_MFC 0x02 73#define DEBUG_FORWARD 0x04 74#define DEBUG_EXPIRE 0x08 75#define DEBUG_XMIT 0x10 76#define DEBUG_PIM 0x20 77 78#define VIFI_INVALID ((vifi_t) -1) 79 80#define M_HASCL(m) ((m)->m_flags & M_EXT) 81 82static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables"); 83 84/* 85 * Locking. We use two locks: one for the virtual interface table and 86 * one for the forwarding table. These locks may be nested in which case 87 * the VIF lock must always be taken first. Note that each lock is used 88 * to cover not only the specific data structure but also related data 89 * structures. It may be better to add more fine-grained locking later; 90 * it's not clear how performance-critical this code is. 91 */ 92 93static struct mrtstat mrtstat; 94SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW, 95 &mrtstat, mrtstat, 96 "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)"); 97 98static struct mfc *mfctable[MFCTBLSIZ]; 99SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, 100 &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]", 101 "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)"); 102 103static struct mtx mfc_mtx; 104#define MFC_LOCK() mtx_lock(&mfc_mtx) 105#define MFC_UNLOCK() mtx_unlock(&mfc_mtx) 106#define MFC_LOCK_ASSERT() mtx_assert(&mfc_mtx, MA_OWNED) 107#define MFC_LOCK_INIT() mtx_init(&mfc_mtx, "mroute mfc table", NULL, MTX_DEF) 108#define MFC_LOCK_DESTROY() mtx_destroy(&mfc_mtx) 109 110static struct vif viftable[MAXVIFS]; 111SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD, 112 &viftable, sizeof(viftable), "S,vif[MAXVIFS]", 113 "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)"); 114 115static struct mtx vif_mtx; 116#define VIF_LOCK() mtx_lock(&vif_mtx) 117#define VIF_UNLOCK() mtx_unlock(&vif_mtx) 118#define 
VIF_LOCK_ASSERT() mtx_assert(&vif_mtx, MA_OWNED) 119#define VIF_LOCK_INIT() mtx_init(&vif_mtx, "mroute vif table", NULL, MTX_DEF) 120#define VIF_LOCK_DESTROY() mtx_destroy(&vif_mtx) 121 122static u_char nexpire[MFCTBLSIZ]; 123 124static struct callout expire_upcalls_ch; 125 126#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */ 127#define UPCALL_EXPIRE 6 /* number of timeouts */ 128 129/* 130 * Define the token bucket filter structures 131 * tbftable -> each vif has one of these for storing info 132 */ 133 134static struct tbf tbftable[MAXVIFS]; 135#define TBF_REPROCESS (hz / 100) /* 100x / second */ 136 137/* 138 * 'Interfaces' associated with decapsulator (so we can tell 139 * packets that went through it from ones that get reflected 140 * by a broken gateway). These interfaces are never linked into 141 * the system ifnet list & no routes point to them. I.e., packets 142 * can't be sent this way. They only exist as a placeholder for 143 * multicast source verification. 144 */ 145static struct ifnet multicast_decap_if[MAXVIFS]; 146 147#define ENCAP_TTL 64 148#define ENCAP_PROTO IPPROTO_IPIP /* 4 */ 149 150/* prototype IP hdr for encapsulated packets */ 151static struct ip multicast_encap_iphdr = { 152#if BYTE_ORDER == LITTLE_ENDIAN 153 sizeof(struct ip) >> 2, IPVERSION, 154#else 155 IPVERSION, sizeof(struct ip) >> 2, 156#endif 157 0, /* tos */ 158 sizeof(struct ip), /* total length */ 159 0, /* id */ 160 0, /* frag offset */ 161 ENCAP_TTL, ENCAP_PROTO, 162 0, /* checksum */ 163}; 164 165/* 166 * Bandwidth meter variables and constants 167 */ 168static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters"); 169/* 170 * Pending timeouts are stored in a hash table, the key being the 171 * expiration time. Periodically, the entries are analysed and processed. 
172 */ 173#define BW_METER_BUCKETS 1024 174static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS]; 175static struct callout bw_meter_ch; 176#define BW_METER_PERIOD (hz) /* periodical handling of bw meters */ 177 178/* 179 * Pending upcalls are stored in a vector which is flushed when 180 * full, or periodically 181 */ 182static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX]; 183static u_int bw_upcalls_n; /* # of pending upcalls */ 184static struct callout bw_upcalls_ch; 185#define BW_UPCALLS_PERIOD (hz) /* periodical flush of bw upcalls */ 186 187#ifdef PIM 188static struct pimstat pimstat; 189SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD, 190 &pimstat, pimstat, 191 "PIM Statistics (struct pimstat, netinet/pim_var.h)"); 192 193/* 194 * Note: the PIM Register encapsulation adds the following in front of a 195 * data packet: 196 * 197 * struct pim_encap_hdr { 198 * struct ip ip; 199 * struct pim_encap_pimhdr pim; 200 * } 201 * 202 */ 203 204struct pim_encap_pimhdr { 205 struct pim pim; 206 uint32_t flags; 207}; 208 209static struct ip pim_encap_iphdr = { 210#if BYTE_ORDER == LITTLE_ENDIAN 211 sizeof(struct ip) >> 2, 212 IPVERSION, 213#else 214 IPVERSION, 215 sizeof(struct ip) >> 2, 216#endif 217 0, /* tos */ 218 sizeof(struct ip), /* total length */ 219 0, /* id */ 220 0, /* frag offset */ 221 ENCAP_TTL, 222 IPPROTO_PIM, 223 0, /* checksum */ 224}; 225 226static struct pim_encap_pimhdr pim_encap_pimhdr = { 227 { 228 PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */ 229 0, /* reserved */ 230 0, /* checksum */ 231 }, 232 0 /* flags */ 233}; 234 235static struct ifnet multicast_register_if; 236static vifi_t reg_vif_num = VIFI_INVALID; 237#endif /* PIM */ 238 239/* 240 * Private variables. 241 */ 242static vifi_t numvifs; 243static const struct encaptab *encap_cookie; 244 245/* 246 * one-back cache used by mroute_encapcheck to locate a tunnel's vif 247 * given a datagram's src ip address. 
248 */ 249static u_long last_encap_src; 250static struct vif *last_encap_vif; 251 252/* 253 * Callout for queue processing. 254 */ 255static struct callout tbf_reprocess_ch; 256 257static u_long X_ip_mcast_src(int vifi); 258static int X_ip_mforward(struct ip *ip, struct ifnet *ifp, 259 struct mbuf *m, struct ip_moptions *imo); 260static int X_ip_mrouter_done(void); 261static int X_ip_mrouter_get(struct socket *so, struct sockopt *m); 262static int X_ip_mrouter_set(struct socket *so, struct sockopt *m); 263static int X_legal_vif_num(int vif); 264static int X_mrt_ioctl(int cmd, caddr_t data); 265 266static int get_sg_cnt(struct sioc_sg_req *); 267static int get_vif_cnt(struct sioc_vif_req *); 268static int ip_mrouter_init(struct socket *, int); 269static int add_vif(struct vifctl *); 270static int del_vif(vifi_t); 271static int add_mfc(struct mfcctl2 *); 272static int del_mfc(struct mfcctl2 *); 273static int set_api_config(uint32_t *); /* chose API capabilities */ 274static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *); 275static int set_assert(int); 276static void expire_upcalls(void *); 277static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t); 278static void phyint_send(struct ip *, struct vif *, struct mbuf *); 279static void encap_send(struct ip *, struct vif *, struct mbuf *); 280static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long); 281static void tbf_queue(struct vif *, struct mbuf *); 282static void tbf_process_q(struct vif *); 283static void tbf_reprocess_q(void *); 284static int tbf_dq_sel(struct vif *, struct ip *); 285static void tbf_send_packet(struct vif *, struct mbuf *); 286static void tbf_update_tokens(struct vif *); 287static int priority(struct vif *, struct ip *); 288 289/* 290 * Bandwidth monitoring 291 */ 292static void free_bw_list(struct bw_meter *list); 293static int add_bw_upcall(struct bw_upcall *); 294static int del_bw_upcall(struct bw_upcall *); 295static void 
bw_meter_receive_packet(struct bw_meter *x, int plen, 296 struct timeval *nowp); 297static void bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp); 298static void bw_upcalls_send(void); 299static void schedule_bw_meter(struct bw_meter *x, struct timeval *nowp); 300static void unschedule_bw_meter(struct bw_meter *x); 301static void bw_meter_process(void); 302static void expire_bw_upcalls_send(void *); 303static void expire_bw_meter_process(void *); 304 305#ifdef PIM 306static int pim_register_send(struct ip *, struct vif *, 307 struct mbuf *, struct mfc *); 308static int pim_register_send_rp(struct ip *, struct vif *, 309 struct mbuf *, struct mfc *); 310static int pim_register_send_upcall(struct ip *, struct vif *, 311 struct mbuf *, struct mfc *); 312static struct mbuf *pim_register_prepare(struct ip *, struct mbuf *); 313#endif 314 315/* 316 * whether or not special PIM assert processing is enabled. 317 */ 318static int pim_assert; 319/* 320 * Rate limit for assert notification messages, in usec 321 */ 322#define ASSERT_MSG_TIME 3000000 323 324/* 325 * Kernel multicast routing API capabilities and setup. 326 * If more API capabilities are added to the kernel, they should be 327 * recorded in `mrt_api_support'. 328 */ 329static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 330 MRT_MFC_FLAGS_BORDER_VIF | 331 MRT_MFC_RP | 332 MRT_MFC_BW_UPCALL); 333static uint32_t mrt_api_config = 0; 334 335/* 336 * Hash function for a source, group entry 337 */ 338#define MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \ 339 ((g) >> 20) ^ ((g) >> 10) ^ (g)) 340 341/* 342 * Find a route for a given origin IP address and Multicast group address 343 * Type of service parameter to be added in the future!!! 
344 * Statistics are updated by the caller if needed 345 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses) 346 */ 347static struct mfc * 348mfc_find(in_addr_t o, in_addr_t g) 349{ 350 struct mfc *rt; 351 352 MFC_LOCK_ASSERT(); 353 354 for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next) 355 if ((rt->mfc_origin.s_addr == o) && 356 (rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL)) 357 break; 358 return rt; 359} 360 361/* 362 * Macros to compute elapsed time efficiently 363 * Borrowed from Van Jacobson's scheduling code 364 */ 365#define TV_DELTA(a, b, delta) { \ 366 int xxs; \ 367 delta = (a).tv_usec - (b).tv_usec; \ 368 if ((xxs = (a).tv_sec - (b).tv_sec)) { \ 369 switch (xxs) { \ 370 case 2: \ 371 delta += 1000000; \ 372 /* FALLTHROUGH */ \ 373 case 1: \ 374 delta += 1000000; \ 375 break; \ 376 default: \ 377 delta += (1000000 * xxs); \ 378 } \ 379 } \ 380} 381 382#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \ 383 (a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec) 384 385/* 386 * Handle MRT setsockopt commands to modify the multicast routing tables. 
387 */ 388static int 389X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) 390{ 391 int error, optval; 392 vifi_t vifi; 393 struct vifctl vifc; 394 struct mfcctl2 mfc; 395 struct bw_upcall bw_upcall; 396 uint32_t i; 397 398 if (so != ip_mrouter && sopt->sopt_name != MRT_INIT) 399 return EPERM; 400 401 error = 0; 402 switch (sopt->sopt_name) { 403 case MRT_INIT: 404 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 405 if (error) 406 break; 407 error = ip_mrouter_init(so, optval); 408 break; 409 410 case MRT_DONE: 411 error = ip_mrouter_done(); 412 break; 413 414 case MRT_ADD_VIF: 415 error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); 416 if (error) 417 break; 418 error = add_vif(&vifc); 419 break; 420 421 case MRT_DEL_VIF: 422 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 423 if (error) 424 break; 425 error = del_vif(vifi); 426 break; 427 428 case MRT_ADD_MFC: 429 case MRT_DEL_MFC: 430 /* 431 * select data size depending on API version. 432 */ 433 if (sopt->sopt_name == MRT_ADD_MFC && 434 mrt_api_config & MRT_API_FLAGS_ALL) { 435 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), 436 sizeof(struct mfcctl2)); 437 } else { 438 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), 439 sizeof(struct mfcctl)); 440 bzero((caddr_t)&mfc + sizeof(struct mfcctl), 441 sizeof(mfc) - sizeof(struct mfcctl)); 442 } 443 if (error) 444 break; 445 if (sopt->sopt_name == MRT_ADD_MFC) 446 error = add_mfc(&mfc); 447 else 448 error = del_mfc(&mfc); 449 break; 450 451 case MRT_ASSERT: 452 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 453 if (error) 454 break; 455 set_assert(optval); 456 break; 457 458 case MRT_API_CONFIG: 459 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 460 if (!error) 461 error = set_api_config(&i); 462 if (!error) 463 error = sooptcopyout(sopt, &i, sizeof i); 464 break; 465 466 case MRT_ADD_BW_UPCALL: 467 case MRT_DEL_BW_UPCALL: 468 error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, 469 
sizeof bw_upcall); 470 if (error) 471 break; 472 if (sopt->sopt_name == MRT_ADD_BW_UPCALL) 473 error = add_bw_upcall(&bw_upcall); 474 else 475 error = del_bw_upcall(&bw_upcall); 476 break; 477 478 default: 479 error = EOPNOTSUPP; 480 break; 481 } 482 return error; 483} 484 485/* 486 * Handle MRT getsockopt commands 487 */ 488static int 489X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) 490{ 491 int error; 492 static int version = 0x0305; /* !!! why is this here? XXX */ 493 494 switch (sopt->sopt_name) { 495 case MRT_VERSION: 496 error = sooptcopyout(sopt, &version, sizeof version); 497 break; 498 499 case MRT_ASSERT: 500 error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert); 501 break; 502 503 case MRT_API_SUPPORT: 504 error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); 505 break; 506 507 case MRT_API_CONFIG: 508 error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); 509 break; 510 511 default: 512 error = EOPNOTSUPP; 513 break; 514 } 515 return error; 516} 517 518/* 519 * Handle ioctl commands to obtain information from the cache 520 */ 521static int 522X_mrt_ioctl(int cmd, caddr_t data) 523{ 524 int error = 0; 525 526 switch (cmd) { 527 case (SIOCGETVIFCNT): 528 error = get_vif_cnt((struct sioc_vif_req *)data); 529 break; 530 531 case (SIOCGETSGCNT): 532 error = get_sg_cnt((struct sioc_sg_req *)data); 533 break; 534 535 default: 536 error = EINVAL; 537 break; 538 } 539 return error; 540} 541 542/* 543 * returns the packet, byte, rpf-failure count for the source group provided 544 */ 545static int 546get_sg_cnt(struct sioc_sg_req *req) 547{ 548 struct mfc *rt; 549 550 MFC_LOCK(); 551 rt = mfc_find(req->src.s_addr, req->grp.s_addr); 552 if (rt == NULL) { 553 MFC_UNLOCK(); 554 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 555 return EADDRNOTAVAIL; 556 } 557 req->pktcnt = rt->mfc_pkt_cnt; 558 req->bytecnt = rt->mfc_byte_cnt; 559 req->wrong_if = rt->mfc_wrong_if; 560 MFC_UNLOCK(); 561 return 0; 562} 563 564/* 
565 * returns the input and output packet and byte counts on the vif provided 566 */ 567static int 568get_vif_cnt(struct sioc_vif_req *req) 569{ 570 vifi_t vifi = req->vifi; 571 572 VIF_LOCK(); 573 if (vifi >= numvifs) { 574 VIF_UNLOCK(); 575 return EINVAL; 576 } 577 578 req->icount = viftable[vifi].v_pkt_in; 579 req->ocount = viftable[vifi].v_pkt_out; 580 req->ibytes = viftable[vifi].v_bytes_in; 581 req->obytes = viftable[vifi].v_bytes_out; 582 VIF_UNLOCK(); 583 584 return 0; 585} 586 587static void 588ip_mrouter_reset(void) 589{ 590 bzero((caddr_t)mfctable, sizeof(mfctable)); 591 MFC_LOCK_INIT(); 592 VIF_LOCK_INIT(); 593 bzero((caddr_t)nexpire, sizeof(nexpire)); 594 595 pim_assert = 0; 596 mrt_api_config = 0; 597 598 callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); 599 600 bw_upcalls_n = 0; 601 bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); 602 callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE); 603 callout_init(&bw_meter_ch, CALLOUT_MPSAFE); 604 605 callout_init(&tbf_reprocess_ch, CALLOUT_MPSAFE); 606} 607 608/* 609 * Enable multicast routing 610 */ 611static int 612ip_mrouter_init(struct socket *so, int version) 613{ 614 if (mrtdebug) 615 log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n", 616 so->so_type, so->so_proto->pr_protocol); 617 618 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) 619 return EOPNOTSUPP; 620 621 if (version != 1) 622 return ENOPROTOOPT; 623 624 if (ip_mrouter != NULL) 625 return EADDRINUSE; 626 627 ip_mrouter_reset(); 628 629 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 630 631 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 632 expire_bw_upcalls_send, NULL); 633 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 634 635 ip_mrouter = so; 636 637 if (mrtdebug) 638 log(LOG_DEBUG, "ip_mrouter_init\n"); 639 640 return 0; 641} 642 643/* 644 * Disable multicast routing 645 */ 646static int 647X_ip_mrouter_done(void) 648{ 649 vifi_t vifi; 650 
int i; 651 struct ifnet *ifp; 652 struct ifreq ifr; 653 struct mfc *rt; 654 struct rtdetq *rte; 655 656 /* 657 * Detach/disable hooks to the reset of the system. 658 */ 659 ip_mrouter = NULL; 660 mrt_api_config = 0; 661 662 VIF_LOCK(); 663 if (encap_cookie) { 664 const struct encaptab *c = encap_cookie; 665 encap_cookie = NULL; 666 encap_detach(c); 667 } 668 VIF_UNLOCK(); 669 670 callout_stop(&tbf_reprocess_ch); 671 672 VIF_LOCK(); 673 /* 674 * For each phyint in use, disable promiscuous reception of all IP 675 * multicasts. 676 */ 677 for (vifi = 0; vifi < numvifs; vifi++) { 678 if (viftable[vifi].v_lcl_addr.s_addr != 0 && 679 !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 680 struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); 681 682 so->sin_len = sizeof(struct sockaddr_in); 683 so->sin_family = AF_INET; 684 so->sin_addr.s_addr = INADDR_ANY; 685 ifp = viftable[vifi].v_ifp; 686 if_allmulti(ifp, 0); 687 } 688 } 689 bzero((caddr_t)tbftable, sizeof(tbftable)); 690 bzero((caddr_t)viftable, sizeof(viftable)); 691 numvifs = 0; 692 pim_assert = 0; 693 VIF_LOCK_DESTROY(); 694 695 /* 696 * Free all multicast forwarding cache entries. 
697 */ 698 callout_stop(&expire_upcalls_ch); 699 callout_stop(&bw_upcalls_ch); 700 callout_stop(&bw_meter_ch); 701 702 MFC_LOCK(); 703 for (i = 0; i < MFCTBLSIZ; i++) { 704 for (rt = mfctable[i]; rt != NULL; ) { 705 struct mfc *nr = rt->mfc_next; 706 707 for (rte = rt->mfc_stall; rte != NULL; ) { 708 struct rtdetq *n = rte->next; 709 710 m_freem(rte->m); 711 free(rte, M_MRTABLE); 712 rte = n; 713 } 714 free_bw_list(rt->mfc_bw_meter); 715 free(rt, M_MRTABLE); 716 rt = nr; 717 } 718 } 719 bzero((caddr_t)mfctable, sizeof(mfctable)); 720 bw_upcalls_n = 0; 721 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 722 MFC_LOCK_DESTROY(); 723 724 /* 725 * Reset de-encapsulation cache 726 */ 727 last_encap_src = INADDR_ANY; 728 last_encap_vif = NULL; 729#ifdef PIM 730 reg_vif_num = VIFI_INVALID; 731#endif 732 733 if (mrtdebug) 734 log(LOG_DEBUG, "ip_mrouter_done\n"); 735 736 return 0; 737} 738 739/* 740 * Set PIM assert processing global 741 */ 742static int 743set_assert(int i) 744{ 745 if ((i != 1) && (i != 0)) 746 return EINVAL; 747 748 pim_assert = i; 749 750 return 0; 751} 752 753/* 754 * Configure API capabilities 755 */ 756int 757set_api_config(uint32_t *apival) 758{ 759 int i; 760 761 /* 762 * We can set the API capabilities only if it is the first operation 763 * after MRT_INIT. I.e.: 764 * - there are no vifs installed 765 * - pim_assert is not enabled 766 * - the MFC table is empty 767 */ 768 if (numvifs > 0) { 769 *apival = 0; 770 return EPERM; 771 } 772 if (pim_assert) { 773 *apival = 0; 774 return EPERM; 775 } 776 for (i = 0; i < MFCTBLSIZ; i++) { 777 if (mfctable[i] != NULL) { 778 *apival = 0; 779 return EPERM; 780 } 781 } 782 783 mrt_api_config = *apival & mrt_api_support; 784 *apival = mrt_api_config; 785 786 return 0; 787} 788 789/* 790 * Decide if a packet is from a tunnelled peer. 791 * Return 0 if not, 64 if so. XXX yuck.. 64 ??? 
792 */ 793static int 794mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg) 795{ 796 struct ip *ip = mtod(m, struct ip *); 797 int hlen = ip->ip_hl << 2; 798 799 /* 800 * don't claim the packet if it's not to a multicast destination or if 801 * we don't have an encapsulating tunnel with the source. 802 * Note: This code assumes that the remote site IP address 803 * uniquely identifies the tunnel (i.e., that this site has 804 * at most one tunnel with the remote site). 805 */ 806 if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr))) 807 return 0; 808 if (ip->ip_src.s_addr != last_encap_src) { 809 struct vif *vifp = viftable; 810 struct vif *vife = vifp + numvifs; 811 812 last_encap_src = ip->ip_src.s_addr; 813 last_encap_vif = NULL; 814 for ( ; vifp < vife; ++vifp) 815 if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) { 816 if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL) 817 last_encap_vif = vifp; 818 break; 819 } 820 } 821 if (last_encap_vif == NULL) { 822 last_encap_src = INADDR_ANY; 823 return 0; 824 } 825 return 64; 826} 827 828/* 829 * De-encapsulate a packet and feed it back through ip input (this 830 * routine is called whenever IP gets a packet that mroute_encap_func() 831 * claimed). 832 */ 833static void 834mroute_encap_input(struct mbuf *m, int off) 835{ 836 struct ip *ip = mtod(m, struct ip *); 837 int hlen = ip->ip_hl << 2; 838 839 if (hlen > sizeof(struct ip)) 840 ip_stripoptions(m, (struct mbuf *) 0); 841 m->m_data += sizeof(struct ip); 842 m->m_len -= sizeof(struct ip); 843 m->m_pkthdr.len -= sizeof(struct ip); 844 845 m->m_pkthdr.rcvif = last_encap_vif->v_ifp; 846 847 netisr_queue(NETISR_IP, m); 848 /* 849 * normally we would need a "schednetisr(NETISR_IP)" 850 * here but we were called by ip_input and it is going 851 * to loop back & try to dequeue the packet we just 852 * queued as soon as we return so we avoid the 853 * unnecessary software interrrupt. 
854 * 855 * XXX 856 * This no longer holds - we may have direct-dispatched the packet, 857 * or there may be a queue processing limit. 858 */ 859} 860 861extern struct domain inetdomain; 862static struct protosw mroute_encap_protosw = 863{ SOCK_RAW, &inetdomain, IPPROTO_IPV4, PR_ATOMIC|PR_ADDR, 864 mroute_encap_input, 0, 0, rip_ctloutput, 865 0, 866 0, 0, 0, 0, 867 &rip_usrreqs 868}; 869 870/* 871 * Add a vif to the vif table 872 */ 873static int 874add_vif(struct vifctl *vifcp) 875{ 876 struct vif *vifp = viftable + vifcp->vifc_vifi; 877 struct sockaddr_in sin = {sizeof sin, AF_INET}; 878 struct ifaddr *ifa; 879 struct ifnet *ifp; 880 int error; 881 struct tbf *v_tbf = tbftable + vifcp->vifc_vifi; 882 883 VIF_LOCK(); 884 if (vifcp->vifc_vifi >= MAXVIFS) { 885 VIF_UNLOCK(); 886 return EINVAL; 887 } 888 if (vifp->v_lcl_addr.s_addr != INADDR_ANY) { 889 VIF_UNLOCK(); 890 return EADDRINUSE; 891 } 892 if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY) { 893 VIF_UNLOCK(); 894 return EADDRNOTAVAIL; 895 } 896 897 /* Find the interface with an address in AF_INET family */ 898#ifdef PIM 899 if (vifcp->vifc_flags & VIFF_REGISTER) { 900 /* 901 * XXX: Because VIFF_REGISTER does not really need a valid 902 * local interface (e.g. it could be 127.0.0.2), we don't 903 * check its address. 904 */ 905 ifp = NULL; 906 } else 907#endif 908 { 909 sin.sin_addr = vifcp->vifc_lcl_addr; 910 ifa = ifa_ifwithaddr((struct sockaddr *)&sin); 911 if (ifa == NULL) { 912 VIF_UNLOCK(); 913 return EADDRNOTAVAIL; 914 } 915 ifp = ifa->ifa_ifp; 916 } 917 918 if (vifcp->vifc_flags & VIFF_TUNNEL) { 919 if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) { 920 /* 921 * An encapsulating tunnel is wanted. Tell 922 * mroute_encap_input() to start paying attention 923 * to encapsulated packets. 
924 */ 925 if (encap_cookie == NULL) { 926 int i; 927 928 encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4, 929 mroute_encapcheck, 930 (struct protosw *)&mroute_encap_protosw, NULL); 931 932 if (encap_cookie == NULL) { 933 printf("ip_mroute: unable to attach encap\n"); 934 VIF_UNLOCK(); 935 return EIO; /* XXX */ 936 } 937 for (i = 0; i < MAXVIFS; ++i) { 938 multicast_decap_if[i].if_name = "mdecap"; 939 multicast_decap_if[i].if_unit = i; 940 } 941 } 942 /* 943 * Set interface to fake encapsulator interface 944 */ 945 ifp = &multicast_decap_if[vifcp->vifc_vifi]; 946 /* 947 * Prepare cached route entry 948 */ 949 bzero(&vifp->v_route, sizeof(vifp->v_route)); 950 } else { 951 log(LOG_ERR, "source routed tunnels not supported\n"); 952 VIF_UNLOCK(); 953 return EOPNOTSUPP; 954 } 955#ifdef PIM 956 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 957 ifp = &multicast_register_if; 958 if (mrtdebug) 959 log(LOG_DEBUG, "Adding a register vif, ifp: %p\n", 960 (void *)&multicast_register_if); 961 if (reg_vif_num == VIFI_INVALID) { 962 multicast_register_if.if_name = "register_vif"; 963 multicast_register_if.if_unit = 0; 964 multicast_register_if.if_flags = IFF_LOOPBACK; 965 bzero(&vifp->v_route, sizeof(vifp->v_route)); 966 reg_vif_num = vifcp->vifc_vifi; 967 } 968#endif 969 } else { /* Make sure the interface supports multicast */ 970 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 971 VIF_UNLOCK(); 972 return EOPNOTSUPP; 973 } 974 975 /* Enable promiscuous reception of all IP multicasts from the if */ 976 error = if_allmulti(ifp, 1); 977 if (error) { 978 VIF_UNLOCK(); 979 return error; 980 } 981 } 982 983 /* define parameters for the tbf structure */ 984 vifp->v_tbf = v_tbf; 985 GET_TIME(vifp->v_tbf->tbf_last_pkt_t); 986 vifp->v_tbf->tbf_n_tok = 0; 987 vifp->v_tbf->tbf_q_len = 0; 988 vifp->v_tbf->tbf_max_q_len = MAXQSIZE; 989 vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL; 990 991 vifp->v_flags = vifcp->vifc_flags; 992 vifp->v_threshold = vifcp->vifc_threshold; 993 
vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 994 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 995 vifp->v_ifp = ifp; 996 /* scaling up here allows division by 1024 in critical code */ 997 vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000; 998 vifp->v_rsvp_on = 0; 999 vifp->v_rsvpd = NULL; 1000 /* initialize per vif pkt counters */ 1001 vifp->v_pkt_in = 0; 1002 vifp->v_pkt_out = 0; 1003 vifp->v_bytes_in = 0; 1004 vifp->v_bytes_out = 0; 1005 1006 /* Adjust numvifs up if the vifi is higher than numvifs */ 1007 if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1; 1008 1009 VIF_UNLOCK(); 1010 1011 if (mrtdebug) 1012 log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n", 1013 vifcp->vifc_vifi, 1014 (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr), 1015 (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask", 1016 (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr), 1017 vifcp->vifc_threshold, 1018 vifcp->vifc_rate_limit); 1019 1020 return 0; 1021} 1022 1023/* 1024 * Delete a vif from the vif table 1025 */ 1026static int 1027del_vif(vifi_t vifi) 1028{ 1029 struct vif *vifp; 1030 1031 VIF_LOCK(); 1032 1033 if (vifi >= numvifs) { 1034 VIF_UNLOCK(); 1035 return EINVAL; 1036 } 1037 vifp = &viftable[vifi]; 1038 if (vifp->v_lcl_addr.s_addr == INADDR_ANY) { 1039 VIF_UNLOCK(); 1040 return EADDRNOTAVAIL; 1041 } 1042 1043 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) 1044 if_allmulti(vifp->v_ifp, 0); 1045 1046 if (vifp == last_encap_vif) { 1047 last_encap_vif = NULL; 1048 last_encap_src = INADDR_ANY; 1049 } 1050 1051 /* 1052 * Free packets queued at the interface 1053 */ 1054 while (vifp->v_tbf->tbf_q) { 1055 struct mbuf *m = vifp->v_tbf->tbf_q; 1056 1057 vifp->v_tbf->tbf_q = m->m_act; 1058 m_freem(m); 1059 } 1060 1061#ifdef PIM 1062 if (vifp->v_flags & VIFF_REGISTER) 1063 reg_vif_num = VIFI_INVALID; 1064#endif 1065 1066 bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf))); 1067 bzero((caddr_t)vifp, sizeof (*vifp)); 1068 1069 if (mrtdebug) 1070 log(LOG_DEBUG, 
"del_vif %d, numvifs %d\n", vifi, numvifs); 1071 1072 /* Adjust numvifs down */ 1073 for (vifi = numvifs; vifi > 0; vifi--) 1074 if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY) 1075 break; 1076 numvifs = vifi; 1077 1078 VIF_UNLOCK(); 1079 1080 return 0; 1081} 1082 1083/* 1084 * update an mfc entry without resetting counters and S,G addresses. 1085 */ 1086static void 1087update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1088{ 1089 int i; 1090 1091 rt->mfc_parent = mfccp->mfcc_parent; 1092 for (i = 0; i < numvifs; i++) { 1093 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 1094 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 1095 MRT_MFC_FLAGS_ALL; 1096 } 1097 /* set the RP address */ 1098 if (mrt_api_config & MRT_MFC_RP) 1099 rt->mfc_rp = mfccp->mfcc_rp; 1100 else 1101 rt->mfc_rp.s_addr = INADDR_ANY; 1102} 1103 1104/* 1105 * fully initialize an mfc entry from the parameter. 1106 */ 1107static void 1108init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1109{ 1110 rt->mfc_origin = mfccp->mfcc_origin; 1111 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1112 1113 update_mfc_params(rt, mfccp); 1114 1115 /* initialize pkt counters per src-grp */ 1116 rt->mfc_pkt_cnt = 0; 1117 rt->mfc_byte_cnt = 0; 1118 rt->mfc_wrong_if = 0; 1119 rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0; 1120} 1121 1122 1123/* 1124 * Add an mfc entry 1125 */ 1126static int 1127add_mfc(struct mfcctl2 *mfccp) 1128{ 1129 struct mfc *rt; 1130 u_long hash; 1131 struct rtdetq *rte; 1132 u_short nstl; 1133 1134 VIF_LOCK(); 1135 MFC_LOCK(); 1136 1137 rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); 1138 1139 /* If an entry already exists, just update the fields */ 1140 if (rt) { 1141 if (mrtdebug & DEBUG_MFC) 1142 log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n", 1143 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1144 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1145 mfccp->mfcc_parent); 1146 1147 update_mfc_params(rt, mfccp); 1148 MFC_UNLOCK(); 1149 VIF_UNLOCK(); 
1150 return 0; 1151 } 1152 1153 /* 1154 * Find the entry for which the upcall was made and update 1155 */ 1156 hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr); 1157 for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) { 1158 1159 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1160 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) && 1161 (rt->mfc_stall != NULL)) { 1162 1163 if (nstl++) 1164 log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n", 1165 "multiple kernel entries", 1166 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1167 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1168 mfccp->mfcc_parent, (void *)rt->mfc_stall); 1169 1170 if (mrtdebug & DEBUG_MFC) 1171 log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n", 1172 (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1173 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1174 mfccp->mfcc_parent, (void *)rt->mfc_stall); 1175 1176 init_mfc_params(rt, mfccp); 1177 1178 rt->mfc_expire = 0; /* Don't clean this guy up */ 1179 nexpire[hash]--; 1180 1181 /* free packets Qed at the end of this entry */ 1182 for (rte = rt->mfc_stall; rte != NULL; ) { 1183 struct rtdetq *n = rte->next; 1184 1185 ip_mdq(rte->m, rte->ifp, rt, -1); 1186 m_freem(rte->m); 1187 free(rte, M_MRTABLE); 1188 rte = n; 1189 } 1190 rt->mfc_stall = NULL; 1191 } 1192 } 1193 1194 /* 1195 * It is possible that an entry is being inserted without an upcall 1196 */ 1197 if (nstl == 0) { 1198 if (mrtdebug & DEBUG_MFC) 1199 log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n", 1200 hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr), 1201 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1202 mfccp->mfcc_parent); 1203 1204 for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) { 1205 if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) && 1206 (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) { 1207 init_mfc_params(rt, mfccp); 1208 if (rt->mfc_expire) 1209 nexpire[hash]--; 1210 rt->mfc_expire = 0; 1211 break; /* XXX */ 1212 } 
1213 } 1214 if (rt == NULL) { /* no upcall, so make a new entry */ 1215 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1216 if (rt == NULL) { 1217 MFC_UNLOCK(); 1218 VIF_UNLOCK(); 1219 return ENOBUFS; 1220 } 1221 1222 init_mfc_params(rt, mfccp); 1223 rt->mfc_expire = 0; 1224 rt->mfc_stall = NULL; 1225 1226 rt->mfc_bw_meter = NULL; 1227 /* insert new entry at head of hash chain */ 1228 rt->mfc_next = mfctable[hash]; 1229 mfctable[hash] = rt; 1230 } 1231 } 1232 MFC_UNLOCK(); 1233 VIF_UNLOCK(); 1234 return 0; 1235} 1236 1237/* 1238 * Delete an mfc entry 1239 */ 1240static int 1241del_mfc(struct mfcctl2 *mfccp) 1242{ 1243 struct in_addr origin; 1244 struct in_addr mcastgrp; 1245 struct mfc *rt; 1246 struct mfc **nptr; 1247 u_long hash; 1248 struct bw_meter *list; 1249 1250 origin = mfccp->mfcc_origin; 1251 mcastgrp = mfccp->mfcc_mcastgrp; 1252 1253 if (mrtdebug & DEBUG_MFC) 1254 log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n", 1255 (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr)); 1256 1257 MFC_LOCK(); 1258 1259 hash = MFCHASH(origin.s_addr, mcastgrp.s_addr); 1260 for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next) 1261 if (origin.s_addr == rt->mfc_origin.s_addr && 1262 mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr && 1263 rt->mfc_stall == NULL) 1264 break; 1265 if (rt == NULL) { 1266 MFC_UNLOCK(); 1267 return EADDRNOTAVAIL; 1268 } 1269 1270 *nptr = rt->mfc_next; 1271 1272 /* 1273 * free the bw_meter entries 1274 */ 1275 list = rt->mfc_bw_meter; 1276 rt->mfc_bw_meter = NULL; 1277 1278 free(rt, M_MRTABLE); 1279 1280 free_bw_list(list); 1281 1282 MFC_UNLOCK(); 1283 1284 return 0; 1285} 1286 1287/* 1288 * Send a message to mrouted on the multicast routing socket 1289 */ 1290static int 1291socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1292{ 1293 if (s) { 1294 if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) { 1295 sorwakeup(s); 1296 return 0; 1297 } 1298 } 1299 m_freem(mm); 1300 
/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 *
 * Parameters:
 *   ip  - IP header of the packet
 *   ifp - interface the packet is associated with
 *   m   - mbuf chain holding the packet; it is copied, never consumed here
 *   imo - multicast options; when non-NULL and imo_multicast_vif names a
 *         configured vif, the packet is sent on that vif only
 *
 * Returns:
 *   1        for a source-routed (LSRR) tunnel packet, which is rejected
 *   ip_mdq() result when the packet is handed straight to ip_mdq()
 *   0        when the packet is not forwarded (ttl <= 1, local-only group,
 *            unknown incoming vif, full upcall queue) or has been queued
 *            behind a pending upcall
 *   ENOBUFS  on allocation failure or when the upcall cannot be queued on
 *            the routing socket
 *
 * VIF_LOCK and MFC_LOCK are taken here and released on every exit path
 * past the source-route check.
 */

#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
    struct mfc *rt;
    int error;
    vifi_t vifi;

    if (mrtdebug & DEBUG_FORWARD)
        log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
            (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
            (void *)ifp);

    /*
     * The header is long enough to carry the tunnel option and the second
     * option byte is IPOPT_LSRR only for old source-route tunnel packets.
     */
    if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
                ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
        /*
         * Packet arrived via a physical interface or
         * an encapsulated tunnel or a register_vif.
         */
    } else {
        /*
         * Packet arrived through a source-route tunnel.
         * Source-route tunnels are no longer supported.
         */
        static int last_log;
        /* log only when time_second has changed since the last message */
        if (last_log != time_second) {
            last_log = time_second;
            log(LOG_ERR,
                "ip_mforward: received source-routed packet from %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr));
        }
        return 1;
    }

    VIF_LOCK();
    MFC_LOCK();
    /* Caller selected an explicit outgoing vif: bypass the cache lookup. */
    if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) {
        if (ip->ip_ttl < 255)
            ip->ip_ttl++;       /* compensate for -1 in *_send routines */
        if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
            struct vif *vifp = viftable + vifi;

            printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n",
                (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
                vifi,
                (vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
                vifp->v_ifp->if_name, vifp->v_ifp->if_unit);
        }
        error = ip_mdq(m, ifp, NULL, vifi);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    }
    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
        printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
            (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
        if (!imo)
            printf("In fact, no options were specified at all\n");
    }

    /*
     * Don't forward a packet with time-to-live of zero or one,
     * or a packet destined to a local-only group.
     */
    if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP) {
        MFC_UNLOCK();
        VIF_UNLOCK();
        return 0;
    }

    /*
     * Determine forwarding vifs from the forwarding cache table
     */
    ++mrtstat.mrts_mfc_lookups;
    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);

    /* Entry exists, so forward if necessary */
    if (rt != NULL) {
        error = ip_mdq(m, ifp, rt, -1);
        MFC_UNLOCK();
        VIF_UNLOCK();
        return error;
    } else {
        /*
         * If we don't have a route for packet's origin,
         * Make a copy of the packet & send message to routing daemon
         */

        struct mbuf *mb0;
        struct rtdetq *rte;
        u_long hash;
        int hlen = ip->ip_hl << 2;      /* IP header length in bytes */

        ++mrtstat.mrts_mfc_misses;

        mrtstat.mrts_no_route++;
        if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
            log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
                (u_long)ntohl(ip->ip_src.s_addr),
                (u_long)ntohl(ip->ip_dst.s_addr));

        /*
         * Allocate mbufs early so that we don't do extra work if we are
         * just going to fail anyway.  Make sure to pullup the header so
         * that other people can't step on it.
         */
        rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
        if (rte == NULL) {
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }
        mb0 = m_copypacket(m, M_DONTWAIT);
        if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
            mb0 = m_pullup(mb0, hlen);
        if (mb0 == NULL) {
            free(rte, M_MRTABLE);
            MFC_UNLOCK();
            VIF_UNLOCK();
            return ENOBUFS;
        }

        /* is there an upcall waiting for this flow ? */
        hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
        for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
            if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
                    (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
                    (rt->mfc_stall != NULL))
                break;
        }

        if (rt == NULL) {
            /* First packet of this flow: create a stalled entry and upcall. */
            int i;
            struct igmpmsg *im;
            struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
            struct mbuf *mm;

            /*
             * Locate the vifi for the incoming interface for this packet.
             * If none found, drop packet.
             */
            for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
                ;
            if (vifi >= numvifs)        /* vif not found, drop packet */
                goto non_fatal;         /* label lives in the else branch below */

            /* no upcall, so make a new entry */
            rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
            if (rt == NULL)
                goto fail;
            /* Make a copy of the header to send to the user level process */
            mm = m_copy(mb0, 0, hlen);
            if (mm == NULL)
                goto fail1;

            /*
             * Send message to routing daemon to install
             * a route into the kernel table
             */

            im = mtod(mm, struct igmpmsg *);
            im->im_msgtype = IGMPMSG_NOCACHE;
            im->im_mbz = 0;
            im->im_vif = vifi;

            mrtstat.mrts_upcalls++;

            k_igmpsrc.sin_addr = ip->ip_src;
            if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
                log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
                ++mrtstat.mrts_upq_sockfull;
                /*
                 * Shared error exit: fail1 also releases the new mfc entry,
                 * fail releases only the queue element and the packet copy.
                 */
fail1:
                free(rt, M_MRTABLE);
fail:
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return ENOBUFS;
            }

            /* insert new entry at head of hash chain */
            rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
            rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
            rt->mfc_expire            = UPCALL_EXPIRE;
            nexpire[hash]++;
            for (i = 0; i < numvifs; i++) {
                rt->mfc_ttls[i] = 0;
                rt->mfc_flags[i] = 0;
            }
            rt->mfc_parent = -1;

            rt->mfc_rp.s_addr = INADDR_ANY; /* clear the RP address */

            rt->mfc_bw_meter = NULL;

            /* link into table */
            rt->mfc_next   = mfctable[hash];
            mfctable[hash] = rt;
            rt->mfc_stall = rte;        /* rte's fields are filled in below */

        } else {
            /* determine if q has overflowed */
            int npkts = 0;
            struct rtdetq **p;

            /*
             * XXX ouch! we need to append to the list, but we
             * only have a pointer to the front, so we have to
             * scan the entire list every time.
             */
            for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
                npkts++;

            if (npkts > MAX_UPQ) {
                mrtstat.mrts_upq_ovflw++;
                /* Quiet drop: release our copies and report success. */
non_fatal:
                free(rte, M_MRTABLE);
                m_freem(mb0);
                MFC_UNLOCK();
                VIF_UNLOCK();
                return 0;
            }

            /* Add this entry to the end of the queue */
            *p = rte;
        }

        /* Common tail: the queue element now owns the packet copy. */
        rte->m                  = mb0;
        rte->ifp                = ifp;
        rte->next               = NULL;

        MFC_UNLOCK();
        VIF_UNLOCK();

        return 0;
    }
}
++mrtstat.mrts_cache_cleanups; 1594 nexpire[i]--; 1595 1596 /* 1597 * free the bw_meter entries 1598 */ 1599 while (mfc->mfc_bw_meter != NULL) { 1600 struct bw_meter *x = mfc->mfc_bw_meter; 1601 1602 mfc->mfc_bw_meter = x->bm_mfc_next; 1603 free(x, M_BWMETER); 1604 } 1605 1606 *nptr = mfc->mfc_next; 1607 free(mfc, M_MRTABLE); 1608 } else { 1609 nptr = &mfc->mfc_next; 1610 } 1611 } 1612 } 1613 MFC_UNLOCK(); 1614 1615 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 1616} 1617 1618/* 1619 * Packet forwarding routine once entry in the cache is made 1620 */ 1621static int 1622ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) 1623{ 1624 struct ip *ip = mtod(m, struct ip *); 1625 vifi_t vifi; 1626 int plen = ip->ip_len; 1627 1628 VIF_LOCK_ASSERT(); 1629/* 1630 * Macro to send packet on vif. Since RSVP packets don't get counted on 1631 * input, they shouldn't get counted on output, so statistics keeping is 1632 * separate. 1633 */ 1634#define MC_SEND(ip,vifp,m) { \ 1635 if ((vifp)->v_flags & VIFF_TUNNEL) \ 1636 encap_send((ip), (vifp), (m)); \ 1637 else \ 1638 phyint_send((ip), (vifp), (m)); \ 1639} 1640 1641 /* 1642 * If xmt_vif is not -1, send on only the requested vif. 1643 * 1644 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) 1645 */ 1646 if (xmt_vif < numvifs) { 1647#ifdef PIM 1648 if (viftable[xmt_vif].v_flags & VIFF_REGISTER) 1649 pim_register_send(ip, viftable + xmt_vif, m, rt); 1650 else 1651#endif 1652 MC_SEND(ip, viftable + xmt_vif, m); 1653 return 1; 1654 } 1655 1656 /* 1657 * Don't forward if it didn't arrive from the parent vif for its origin. 
1658 */ 1659 vifi = rt->mfc_parent; 1660 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1661 /* came in the wrong interface */ 1662 if (mrtdebug & DEBUG_FORWARD) 1663 log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n", 1664 (void *)ifp, vifi, (void *)viftable[vifi].v_ifp); 1665 ++mrtstat.mrts_wrong_if; 1666 ++rt->mfc_wrong_if; 1667 /* 1668 * If we are doing PIM assert processing, send a message 1669 * to the routing daemon. 1670 * 1671 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1672 * can complete the SPT switch, regardless of the type 1673 * of the iif (broadcast media, GRE tunnel, etc). 1674 */ 1675 if (pim_assert && (vifi < numvifs) && viftable[vifi].v_ifp) { 1676 struct timeval now; 1677 u_long delta; 1678 1679#ifdef PIM 1680 if (ifp == &multicast_register_if) 1681 pimstat.pims_rcv_registers_wrongiif++; 1682#endif 1683 1684 /* Get vifi for the incoming packet */ 1685 for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) 1686 ; 1687 if (vifi >= numvifs) 1688 return 0; /* The iif is not found: ignore the packet. 
*/ 1689 1690 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) 1691 return 0; /* WRONGVIF disabled: ignore the packet */ 1692 1693 GET_TIME(now); 1694 1695 TV_DELTA(rt->mfc_last_assert, now, delta); 1696 1697 if (delta > ASSERT_MSG_TIME) { 1698 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 1699 struct igmpmsg *im; 1700 int hlen = ip->ip_hl << 2; 1701 struct mbuf *mm = m_copy(m, 0, hlen); 1702 1703 if (mm && (M_HASCL(mm) || mm->m_len < hlen)) 1704 mm = m_pullup(mm, hlen); 1705 if (mm == NULL) 1706 return ENOBUFS; 1707 1708 rt->mfc_last_assert = now; 1709 1710 im = mtod(mm, struct igmpmsg *); 1711 im->im_msgtype = IGMPMSG_WRONGVIF; 1712 im->im_mbz = 0; 1713 im->im_vif = vifi; 1714 1715 mrtstat.mrts_upcalls++; 1716 1717 k_igmpsrc.sin_addr = im->im_src; 1718 if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) { 1719 log(LOG_WARNING, 1720 "ip_mforward: ip_mrouter socket queue full\n"); 1721 ++mrtstat.mrts_upq_sockfull; 1722 return ENOBUFS; 1723 } 1724 } 1725 } 1726 return 0; 1727 } 1728 1729 /* If I sourced this packet, it counts as output, else it was input. */ 1730 if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) { 1731 viftable[vifi].v_pkt_out++; 1732 viftable[vifi].v_bytes_out += plen; 1733 } else { 1734 viftable[vifi].v_pkt_in++; 1735 viftable[vifi].v_bytes_in += plen; 1736 } 1737 rt->mfc_pkt_cnt++; 1738 rt->mfc_byte_cnt += plen; 1739 1740 /* 1741 * For each vif, decide if a copy of the packet should be forwarded. 
1742 * Forward if: 1743 * - the ttl exceeds the vif's threshold 1744 * - there are group members downstream on interface 1745 */ 1746 for (vifi = 0; vifi < numvifs; vifi++) 1747 if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1748 viftable[vifi].v_pkt_out++; 1749 viftable[vifi].v_bytes_out += plen; 1750#ifdef PIM 1751 if (viftable[vifi].v_flags & VIFF_REGISTER) 1752 pim_register_send(ip, viftable + vifi, m, rt); 1753 else 1754#endif 1755 MC_SEND(ip, viftable+vifi, m); 1756 } 1757 1758 /* 1759 * Perform upcall-related bw measuring. 1760 */ 1761 if (rt->mfc_bw_meter != NULL) { 1762 struct bw_meter *x; 1763 struct timeval now; 1764 1765 GET_TIME(now); 1766 MFC_LOCK_ASSERT(); 1767 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1768 bw_meter_receive_packet(x, plen, &now); 1769 } 1770 1771 return 0; 1772} 1773 1774/* 1775 * check if a vif number is legal/ok. This is used by ip_output. 1776 */ 1777static int 1778X_legal_vif_num(int vif) 1779{ 1780 /* XXX unlocked, matter? */ 1781 return (vif >= 0 && vif < numvifs); 1782} 1783 1784/* 1785 * Return the local address used by this vif 1786 */ 1787static u_long 1788X_ip_mcast_src(int vifi) 1789{ 1790 /* XXX unlocked, matter? */ 1791 if (vifi >= 0 && vifi < numvifs) 1792 return viftable[vifi].v_lcl_addr.s_addr; 1793 else 1794 return INADDR_ANY; 1795} 1796 1797static void 1798phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1799{ 1800 struct mbuf *mb_copy; 1801 int hlen = ip->ip_hl << 2; 1802 1803 VIF_LOCK_ASSERT(); 1804 1805 /* 1806 * Make a new reference to the packet; make sure that 1807 * the IP header is actually copied, not just referenced, 1808 * so that ip_output() only scribbles on the copy. 
1809 */ 1810 mb_copy = m_copypacket(m, M_DONTWAIT); 1811 if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) 1812 mb_copy = m_pullup(mb_copy, hlen); 1813 if (mb_copy == NULL) 1814 return; 1815 1816 if (vifp->v_rate_limit == 0) 1817 tbf_send_packet(vifp, mb_copy); 1818 else 1819 tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len); 1820} 1821 1822static void 1823encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1824{ 1825 struct mbuf *mb_copy; 1826 struct ip *ip_copy; 1827 int i, len = ip->ip_len; 1828 1829 VIF_LOCK_ASSERT(); 1830 1831 /* Take care of delayed checksums */ 1832 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 1833 in_delayed_cksum(m); 1834 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 1835 } 1836 1837 /* 1838 * copy the old packet & pullup its IP header into the 1839 * new mbuf so we can modify it. Try to fill the new 1840 * mbuf since if we don't the ethernet driver will. 1841 */ 1842 MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER); 1843 if (mb_copy == NULL) 1844 return; 1845#ifdef MAC 1846 mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy); 1847#endif 1848 mb_copy->m_data += max_linkhdr; 1849 mb_copy->m_len = sizeof(multicast_encap_iphdr); 1850 1851 if ((mb_copy->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) { 1852 m_freem(mb_copy); 1853 return; 1854 } 1855 i = MHLEN - M_LEADINGSPACE(mb_copy); 1856 if (i > len) 1857 i = len; 1858 mb_copy = m_pullup(mb_copy, i); 1859 if (mb_copy == NULL) 1860 return; 1861 mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr); 1862 1863 /* 1864 * fill in the encapsulating IP header. 1865 */ 1866 ip_copy = mtod(mb_copy, struct ip *); 1867 *ip_copy = multicast_encap_iphdr; 1868#ifdef RANDOM_IP_ID 1869 ip_copy->ip_id = ip_randomid(); 1870#else 1871 ip_copy->ip_id = htons(ip_id++); 1872#endif 1873 ip_copy->ip_len += len; 1874 ip_copy->ip_src = vifp->v_lcl_addr; 1875 ip_copy->ip_dst = vifp->v_rmt_addr; 1876 1877 /* 1878 * turn the encapsulated IP header back into a valid one. 
1879 */ 1880 ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr)); 1881 --ip->ip_ttl; 1882 ip->ip_len = htons(ip->ip_len); 1883 ip->ip_off = htons(ip->ip_off); 1884 ip->ip_sum = 0; 1885 mb_copy->m_data += sizeof(multicast_encap_iphdr); 1886 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 1887 mb_copy->m_data -= sizeof(multicast_encap_iphdr); 1888 1889 if (vifp->v_rate_limit == 0) 1890 tbf_send_packet(vifp, mb_copy); 1891 else 1892 tbf_control(vifp, mb_copy, ip, ip_copy->ip_len); 1893} 1894 1895/* 1896 * Token bucket filter module 1897 */ 1898 1899static void 1900tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len) 1901{ 1902 struct tbf *t = vifp->v_tbf; 1903 1904 VIF_LOCK_ASSERT(); 1905 1906 if (p_len > MAX_BKT_SIZE) { /* drop if packet is too large */ 1907 mrtstat.mrts_pkt2large++; 1908 m_freem(m); 1909 return; 1910 } 1911 1912 tbf_update_tokens(vifp); 1913 1914 if (t->tbf_q_len == 0) { /* queue empty... */ 1915 if (p_len <= t->tbf_n_tok) { /* send packet if enough tokens */ 1916 t->tbf_n_tok -= p_len; 1917 tbf_send_packet(vifp, m); 1918 } else { /* no, queue packet and try later */ 1919 tbf_queue(vifp, m); 1920 callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, 1921 tbf_reprocess_q, vifp); 1922 } 1923 } else if (t->tbf_q_len < t->tbf_max_q_len) { 1924 /* finite queue length, so queue pkts and process queue */ 1925 tbf_queue(vifp, m); 1926 tbf_process_q(vifp); 1927 } else { 1928 /* queue full, try to dq and queue and process */ 1929 if (!tbf_dq_sel(vifp, ip)) { 1930 mrtstat.mrts_q_overflow++; 1931 m_freem(m); 1932 } else { 1933 tbf_queue(vifp, m); 1934 tbf_process_q(vifp); 1935 } 1936 } 1937} 1938 1939/* 1940 * adds a packet to the queue at the interface 1941 */ 1942static void 1943tbf_queue(struct vif *vifp, struct mbuf *m) 1944{ 1945 struct tbf *t = vifp->v_tbf; 1946 1947 VIF_LOCK_ASSERT(); 1948 1949 if (t->tbf_t == NULL) /* Queue was empty */ 1950 t->tbf_q = m; 1951 else /* Insert at tail */ 1952 t->tbf_t->m_act = m; 1953 1954 
t->tbf_t = m; /* Set new tail pointer */ 1955 1956#ifdef DIAGNOSTIC 1957 /* Make sure we didn't get fed a bogus mbuf */ 1958 if (m->m_act) 1959 panic("tbf_queue: m_act"); 1960#endif 1961 m->m_act = NULL; 1962 1963 t->tbf_q_len++; 1964} 1965 1966/* 1967 * processes the queue at the interface 1968 */ 1969static void 1970tbf_process_q(struct vif *vifp) 1971{ 1972 struct tbf *t = vifp->v_tbf; 1973 1974 VIF_LOCK_ASSERT(); 1975 1976 /* loop through the queue at the interface and send as many packets 1977 * as possible 1978 */ 1979 while (t->tbf_q_len > 0) { 1980 struct mbuf *m = t->tbf_q; 1981 int len = mtod(m, struct ip *)->ip_len; 1982 1983 /* determine if the packet can be sent */ 1984 if (len > t->tbf_n_tok) /* not enough tokens, we are done */ 1985 break; 1986 /* ok, reduce no of tokens, dequeue and send the packet. */ 1987 t->tbf_n_tok -= len; 1988 1989 t->tbf_q = m->m_act; 1990 if (--t->tbf_q_len == 0) 1991 t->tbf_t = NULL; 1992 1993 m->m_act = NULL; 1994 tbf_send_packet(vifp, m); 1995 } 1996} 1997 1998static void 1999tbf_reprocess_q(void *xvifp) 2000{ 2001 struct vif *vifp = xvifp; 2002 2003 if (ip_mrouter == NULL) 2004 return; 2005 VIF_LOCK(); 2006 tbf_update_tokens(vifp); 2007 tbf_process_q(vifp); 2008 if (vifp->v_tbf->tbf_q_len) 2009 callout_reset(&tbf_reprocess_ch, TBF_REPROCESS, tbf_reprocess_q, vifp); 2010 VIF_UNLOCK(); 2011} 2012 2013/* function that will selectively discard a member of the queue 2014 * based on the precedence value and the priority 2015 */ 2016static int 2017tbf_dq_sel(struct vif *vifp, struct ip *ip) 2018{ 2019 u_int p; 2020 struct mbuf *m, *last; 2021 struct mbuf **np; 2022 struct tbf *t = vifp->v_tbf; 2023 2024 VIF_LOCK_ASSERT(); 2025 2026 p = priority(vifp, ip); 2027 2028 np = &t->tbf_q; 2029 last = NULL; 2030 while ((m = *np) != NULL) { 2031 if (p > priority(vifp, mtod(m, struct ip *))) { 2032 *np = m->m_act; 2033 /* If we're removing the last packet, fix the tail pointer */ 2034 if (m == t->tbf_t) 2035 t->tbf_t = last; 2036 
m_freem(m); 2037 /* It's impossible for the queue to be empty, but check anyways. */ 2038 if (--t->tbf_q_len == 0) 2039 t->tbf_t = NULL; 2040 mrtstat.mrts_drop_sel++; 2041 return 1; 2042 } 2043 np = &m->m_act; 2044 last = m; 2045 } 2046 return 0; 2047} 2048 2049static void 2050tbf_send_packet(struct vif *vifp, struct mbuf *m) 2051{ 2052 VIF_LOCK_ASSERT(); 2053 2054 if (vifp->v_flags & VIFF_TUNNEL) /* If tunnel options */ 2055 ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL); 2056 else { 2057 struct ip_moptions imo; 2058 int error; 2059 static struct route ro; /* XXX check this */ 2060 2061 imo.imo_multicast_ifp = vifp->v_ifp; 2062 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 2063 imo.imo_multicast_loop = 1; 2064 imo.imo_multicast_vif = -1; 2065 2066 /* 2067 * Re-entrancy should not be a problem here, because 2068 * the packets that we send out and are looped back at us 2069 * should get rejected because they appear to come from 2070 * the loopback interface, thus preventing looping. 2071 */ 2072 error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL); 2073 2074 if (mrtdebug & DEBUG_XMIT) 2075 log(LOG_DEBUG, "phyint_send on vif %d err %d\n", 2076 (int)(vifp - viftable), error); 2077 } 2078} 2079 2080/* determine the current time and then 2081 * the elapsed time (between the last time and time now) 2082 * in milliseconds & update the no. of tokens in the bucket 2083 */ 2084static void 2085tbf_update_tokens(struct vif *vifp) 2086{ 2087 struct timeval tp; 2088 u_long tm; 2089 struct tbf *t = vifp->v_tbf; 2090 2091 VIF_LOCK_ASSERT(); 2092 2093 GET_TIME(tp); 2094 2095 TV_DELTA(tp, t->tbf_last_pkt_t, tm); 2096 2097 /* 2098 * This formula is actually 2099 * "time in seconds" * "bytes/second". 2100 * 2101 * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8) 2102 * 2103 * The (1000/1024) was introduced in add_vif to optimize 2104 * this divide into a shift. 
2105 */ 2106 t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8; 2107 t->tbf_last_pkt_t = tp; 2108 2109 if (t->tbf_n_tok > MAX_BKT_SIZE) 2110 t->tbf_n_tok = MAX_BKT_SIZE; 2111} 2112 2113static int 2114priority(struct vif *vifp, struct ip *ip) 2115{ 2116 int prio = 50; /* the lowest priority -- default case */ 2117 2118 /* temporary hack; may add general packet classifier some day */ 2119 2120 /* 2121 * The UDP port space is divided up into four priority ranges: 2122 * [0, 16384) : unclassified - lowest priority 2123 * [16384, 32768) : audio - highest priority 2124 * [32768, 49152) : whiteboard - medium priority 2125 * [49152, 65536) : video - low priority 2126 * 2127 * Everything else gets lowest priority. 2128 */ 2129 if (ip->ip_p == IPPROTO_UDP) { 2130 struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2)); 2131 switch (ntohs(udp->uh_dport) & 0xc000) { 2132 case 0x4000: 2133 prio = 70; 2134 break; 2135 case 0x8000: 2136 prio = 60; 2137 break; 2138 case 0xc000: 2139 prio = 55; 2140 break; 2141 } 2142 } 2143 return prio; 2144} 2145 2146/* 2147 * End of token bucket filter modifications 2148 */ 2149 2150static int 2151X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt) 2152{ 2153 int error, vifi; 2154 2155 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) 2156 return EOPNOTSUPP; 2157 2158 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 2159 if (error) 2160 return error; 2161 2162 VIF_LOCK(); 2163 2164 if (vifi < 0 || vifi >= numvifs) { /* Error if vif is invalid */ 2165 VIF_UNLOCK(); 2166 return EADDRNOTAVAIL; 2167 } 2168 2169 if (sopt->sopt_name == IP_RSVP_VIF_ON) { 2170 /* Check if socket is available. */ 2171 if (viftable[vifi].v_rsvpd != NULL) { 2172 VIF_UNLOCK(); 2173 return EADDRINUSE; 2174 } 2175 2176 viftable[vifi].v_rsvpd = so; 2177 /* This may seem silly, but we need to be sure we don't over-increment 2178 * the RSVP counter, in case something slips up. 
2179 */ 2180 if (!viftable[vifi].v_rsvp_on) { 2181 viftable[vifi].v_rsvp_on = 1; 2182 rsvp_on++; 2183 } 2184 } else { /* must be VIF_OFF */ 2185 /* 2186 * XXX as an additional consistency check, one could make sure 2187 * that viftable[vifi].v_rsvpd == so, otherwise passing so as 2188 * first parameter is pretty useless. 2189 */ 2190 viftable[vifi].v_rsvpd = NULL; 2191 /* 2192 * This may seem silly, but we need to be sure we don't over-decrement 2193 * the RSVP counter, in case something slips up. 2194 */ 2195 if (viftable[vifi].v_rsvp_on) { 2196 viftable[vifi].v_rsvp_on = 0; 2197 rsvp_on--; 2198 } 2199 } 2200 VIF_UNLOCK(); 2201 return 0; 2202} 2203 2204static void 2205X_ip_rsvp_force_done(struct socket *so) 2206{ 2207 int vifi; 2208 2209 /* Don't bother if it is not the right type of socket. */ 2210 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) 2211 return; 2212 2213 VIF_LOCK(); 2214 2215 /* The socket may be attached to more than one vif...this 2216 * is perfectly legal. 2217 */ 2218 for (vifi = 0; vifi < numvifs; vifi++) { 2219 if (viftable[vifi].v_rsvpd == so) { 2220 viftable[vifi].v_rsvpd = NULL; 2221 /* This may seem silly, but we need to be sure we don't 2222 * over-decrement the RSVP counter, in case something slips up. 2223 */ 2224 if (viftable[vifi].v_rsvp_on) { 2225 viftable[vifi].v_rsvp_on = 0; 2226 rsvp_on--; 2227 } 2228 } 2229 } 2230 2231 VIF_UNLOCK(); 2232} 2233 2234static void 2235X_rsvp_input(struct mbuf *m, int off) 2236{ 2237 int vifi; 2238 struct ip *ip = mtod(m, struct ip *); 2239 struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET }; 2240 struct ifnet *ifp; 2241 2242 if (rsvpdebug) 2243 printf("rsvp_input: rsvp_on %d\n",rsvp_on); 2244 2245 /* Can still get packets with rsvp_on = 0 if there is a local member 2246 * of the group to which the RSVP packet is addressed. But in this 2247 * case we want to throw the packet away. 
2248 */ 2249 if (!rsvp_on) { 2250 m_freem(m); 2251 return; 2252 } 2253 2254 if (rsvpdebug) 2255 printf("rsvp_input: check vifs\n"); 2256 2257#ifdef DIAGNOSTIC 2258 M_ASSERTPKTHDR(m); 2259#endif 2260 2261 ifp = m->m_pkthdr.rcvif; 2262 2263 VIF_LOCK(); 2264 /* Find which vif the packet arrived on. */ 2265 for (vifi = 0; vifi < numvifs; vifi++) 2266 if (viftable[vifi].v_ifp == ifp) 2267 break; 2268 2269 if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) { 2270 /* 2271 * Drop the lock here to avoid holding it across rip_input. 2272 * This could make rsvpdebug printfs wrong. If you care, 2273 * record the state of stuff before dropping the lock. 2274 */ 2275 VIF_UNLOCK(); 2276 /* 2277 * If the old-style non-vif-associated socket is set, 2278 * then use it. Otherwise, drop packet since there 2279 * is no specific socket for this vif. 2280 */ 2281 if (ip_rsvpd != NULL) { 2282 if (rsvpdebug) 2283 printf("rsvp_input: Sending packet up old-style socket\n"); 2284 rip_input(m, off); /* xxx */ 2285 } else { 2286 if (rsvpdebug && vifi == numvifs) 2287 printf("rsvp_input: Can't find vif for packet.\n"); 2288 else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL) 2289 printf("rsvp_input: No socket defined for vif %d\n",vifi); 2290 m_freem(m); 2291 } 2292 return; 2293 } 2294 rsvp_src.sin_addr = ip->ip_src; 2295 2296 if (rsvpdebug && m) 2297 printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n", 2298 m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv))); 2299 2300 if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) { 2301 if (rsvpdebug) 2302 printf("rsvp_input: Failed to append to socket\n"); 2303 } else { 2304 if (rsvpdebug) 2305 printf("rsvp_input: send packet up\n"); 2306 } 2307 VIF_UNLOCK(); 2308} 2309 2310/* 2311 * Code for bandwidth monitors 2312 */ 2313 2314/* 2315 * Define common interface for timeval-related methods 2316 */ 2317#define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) 2318#define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) 
2319#define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) 2320 2321static uint32_t 2322compute_bw_meter_flags(struct bw_upcall *req) 2323{ 2324 uint32_t flags = 0; 2325 2326 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 2327 flags |= BW_METER_UNIT_PACKETS; 2328 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 2329 flags |= BW_METER_UNIT_BYTES; 2330 if (req->bu_flags & BW_UPCALL_GEQ) 2331 flags |= BW_METER_GEQ; 2332 if (req->bu_flags & BW_UPCALL_LEQ) 2333 flags |= BW_METER_LEQ; 2334 2335 return flags; 2336} 2337 2338/* 2339 * Add a bw_meter entry 2340 */ 2341static int 2342add_bw_upcall(struct bw_upcall *req) 2343{ 2344 struct mfc *mfc; 2345 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 2346 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 2347 struct timeval now; 2348 struct bw_meter *x; 2349 uint32_t flags; 2350 2351 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2352 return EOPNOTSUPP; 2353 2354 /* Test if the flags are valid */ 2355 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 2356 return EINVAL; 2357 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 2358 return EINVAL; 2359 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2360 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 2361 return EINVAL; 2362 2363 /* Test if the threshold time interval is valid */ 2364 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 2365 return EINVAL; 2366 2367 flags = compute_bw_meter_flags(req); 2368 2369 /* 2370 * Find if we have already same bw_meter entry 2371 */ 2372 MFC_LOCK(); 2373 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2374 if (mfc == NULL) { 2375 MFC_UNLOCK(); 2376 return EADDRNOTAVAIL; 2377 } 2378 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 2379 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2380 &req->bu_threshold.b_time, ==)) && 2381 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2382 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2383 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 
2384 MFC_UNLOCK(); 2385 return 0; /* XXX Already installed */ 2386 } 2387 } 2388 2389 /* Allocate the new bw_meter entry */ 2390 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 2391 if (x == NULL) { 2392 MFC_UNLOCK(); 2393 return ENOBUFS; 2394 } 2395 2396 /* Set the new bw_meter entry */ 2397 x->bm_threshold.b_time = req->bu_threshold.b_time; 2398 GET_TIME(now); 2399 x->bm_start_time = now; 2400 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 2401 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 2402 x->bm_measured.b_packets = 0; 2403 x->bm_measured.b_bytes = 0; 2404 x->bm_flags = flags; 2405 x->bm_time_next = NULL; 2406 x->bm_time_hash = BW_METER_BUCKETS; 2407 2408 /* Add the new bw_meter entry to the front of entries for this MFC */ 2409 x->bm_mfc = mfc; 2410 x->bm_mfc_next = mfc->mfc_bw_meter; 2411 mfc->mfc_bw_meter = x; 2412 schedule_bw_meter(x, &now); 2413 MFC_UNLOCK(); 2414 2415 return 0; 2416} 2417 2418static void 2419free_bw_list(struct bw_meter *list) 2420{ 2421 while (list != NULL) { 2422 struct bw_meter *x = list; 2423 2424 list = list->bm_mfc_next; 2425 unschedule_bw_meter(x); 2426 free(x, M_BWMETER); 2427 } 2428} 2429 2430/* 2431 * Delete one or multiple bw_meter entries 2432 */ 2433static int 2434del_bw_upcall(struct bw_upcall *req) 2435{ 2436 struct mfc *mfc; 2437 struct bw_meter *x; 2438 2439 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 2440 return EOPNOTSUPP; 2441 2442 MFC_LOCK(); 2443 /* Find the corresponding MFC entry */ 2444 mfc = mfc_find(req->bu_src.s_addr, req->bu_dst.s_addr); 2445 if (mfc == NULL) { 2446 MFC_UNLOCK(); 2447 return EADDRNOTAVAIL; 2448 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 2449 /* 2450 * Delete all bw_meter entries for this mfc 2451 */ 2452 struct bw_meter *list; 2453 2454 list = mfc->mfc_bw_meter; 2455 mfc->mfc_bw_meter = NULL; 2456 free_bw_list(list); 2457 MFC_UNLOCK(); 2458 return 0; 2459 } else { /* Delete a single bw_meter entry */ 2460 struct bw_meter *prev; 2461 uint32_t flags = 
0; 2462 2463 flags = compute_bw_meter_flags(req); 2464 2465 /* Find the bw_meter entry to delete */ 2466 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 2467 x = x->bm_mfc_next) { 2468 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 2469 &req->bu_threshold.b_time, ==)) && 2470 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 2471 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 2472 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 2473 break; 2474 } 2475 if (x != NULL) { /* Delete entry from the list for this MFC */ 2476 if (prev != NULL) 2477 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 2478 else 2479 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 2480 2481 unschedule_bw_meter(x); 2482 MFC_UNLOCK(); 2483 /* Free the bw_meter entry */ 2484 free(x, M_BWMETER); 2485 return 0; 2486 } else { 2487 MFC_UNLOCK(); 2488 return EINVAL; 2489 } 2490 } 2491 /* NOTREACHED */ 2492} 2493 2494/* 2495 * Perform bandwidth measurement processing that may result in an upcall 2496 */ 2497static void 2498bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 2499{ 2500 struct timeval delta; 2501 2502 MFC_LOCK_ASSERT(); 2503 2504 delta = *nowp; 2505 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2506 2507 if (x->bm_flags & BW_METER_GEQ) { 2508 /* 2509 * Processing for ">=" type of bw_meter entry 2510 */ 2511 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2512 /* Reset the bw_meter entry */ 2513 x->bm_start_time = *nowp; 2514 x->bm_measured.b_packets = 0; 2515 x->bm_measured.b_bytes = 0; 2516 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2517 } 2518 2519 /* Record that a packet is received */ 2520 x->bm_measured.b_packets++; 2521 x->bm_measured.b_bytes += plen; 2522 2523 /* 2524 * Test if we should deliver an upcall 2525 */ 2526 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 2527 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2528 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 2529 ((x->bm_flags & 
BW_METER_UNIT_BYTES) && 2530 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 2531 /* Prepare an upcall for delivery */ 2532 bw_meter_prepare_upcall(x, nowp); 2533 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 2534 } 2535 } 2536 } else if (x->bm_flags & BW_METER_LEQ) { 2537 /* 2538 * Processing for "<=" type of bw_meter entry 2539 */ 2540 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 2541 /* 2542 * We are behind time with the multicast forwarding table 2543 * scanning for "<=" type of bw_meter entries, so test now 2544 * if we should deliver an upcall. 2545 */ 2546 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2547 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2548 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2549 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2550 /* Prepare an upcall for delivery */ 2551 bw_meter_prepare_upcall(x, nowp); 2552 } 2553 /* Reschedule the bw_meter entry */ 2554 unschedule_bw_meter(x); 2555 schedule_bw_meter(x, nowp); 2556 } 2557 2558 /* Record that a packet is received */ 2559 x->bm_measured.b_packets++; 2560 x->bm_measured.b_bytes += plen; 2561 2562 /* 2563 * Test if we should restart the measuring interval 2564 */ 2565 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 2566 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 2567 (x->bm_flags & BW_METER_UNIT_BYTES && 2568 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 2569 /* Don't restart the measuring interval */ 2570 } else { 2571 /* Do restart the measuring interval */ 2572 /* 2573 * XXX: note that we don't unschedule and schedule, because this 2574 * might be too much overhead per packet. Instead, when we process 2575 * all entries for a given timer hash bin, we check whether it is 2576 * really a timeout. If not, we reschedule at that time. 
2577 */ 2578 x->bm_start_time = *nowp; 2579 x->bm_measured.b_packets = 0; 2580 x->bm_measured.b_bytes = 0; 2581 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2582 } 2583 } 2584} 2585 2586/* 2587 * Prepare a bandwidth-related upcall 2588 */ 2589static void 2590bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2591{ 2592 struct timeval delta; 2593 struct bw_upcall *u; 2594 2595 MFC_LOCK_ASSERT(); 2596 2597 /* 2598 * Compute the measured time interval 2599 */ 2600 delta = *nowp; 2601 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2602 2603 /* 2604 * If there are too many pending upcalls, deliver them now 2605 */ 2606 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2607 bw_upcalls_send(); 2608 2609 /* 2610 * Set the bw_upcall entry 2611 */ 2612 u = &bw_upcalls[bw_upcalls_n++]; 2613 u->bu_src = x->bm_mfc->mfc_origin; 2614 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2615 u->bu_threshold.b_time = x->bm_threshold.b_time; 2616 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2617 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2618 u->bu_measured.b_time = delta; 2619 u->bu_measured.b_packets = x->bm_measured.b_packets; 2620 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2621 u->bu_flags = 0; 2622 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2623 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2624 if (x->bm_flags & BW_METER_UNIT_BYTES) 2625 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2626 if (x->bm_flags & BW_METER_GEQ) 2627 u->bu_flags |= BW_UPCALL_GEQ; 2628 if (x->bm_flags & BW_METER_LEQ) 2629 u->bu_flags |= BW_UPCALL_LEQ; 2630} 2631 2632/* 2633 * Send the pending bandwidth-related upcalls 2634 */ 2635static void 2636bw_upcalls_send(void) 2637{ 2638 struct mbuf *m; 2639 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2640 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2641 static struct igmpmsg igmpmsg = { 0, /* unused1 */ 2642 0, /* unused2 */ 2643 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2644 0, /* im_mbz */ 2645 0, /* im_vif */ 2646 0, /* unused3 */ 2647 { 0 }, /* im_src */ 2648 
{ 0 } }; /* im_dst */ 2649 2650 MFC_LOCK_ASSERT(); 2651 2652 if (bw_upcalls_n == 0) 2653 return; /* No pending upcalls */ 2654 2655 bw_upcalls_n = 0; 2656 2657 /* 2658 * Allocate a new mbuf, initialize it with the header and 2659 * the payload for the pending calls. 2660 */ 2661 MGETHDR(m, M_DONTWAIT, MT_HEADER); 2662 if (m == NULL) { 2663 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2664 return; 2665 } 2666 2667 m->m_len = m->m_pkthdr.len = 0; 2668 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2669 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2670 2671 /* 2672 * Send the upcalls 2673 * XXX do we need to set the address in k_igmpsrc ? 2674 */ 2675 mrtstat.mrts_upcalls++; 2676 if (socket_send(ip_mrouter, m, &k_igmpsrc) < 0) { 2677 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2678 ++mrtstat.mrts_upq_sockfull; 2679 } 2680} 2681 2682/* 2683 * Compute the timeout hash value for the bw_meter entries 2684 */ 2685#define BW_METER_TIMEHASH(bw_meter, hash) \ 2686 do { \ 2687 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2688 \ 2689 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2690 (hash) = next_timeval.tv_sec; \ 2691 if (next_timeval.tv_usec) \ 2692 (hash)++; /* XXX: make sure we don't timeout early */ \ 2693 (hash) %= BW_METER_BUCKETS; \ 2694 } while (0) 2695 2696/* 2697 * Schedule a timer to process periodically bw_meter entry of type "<=" 2698 * by linking the entry in the proper hash bucket. 
2699 */ 2700static void 2701schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2702{ 2703 int time_hash; 2704 2705 MFC_LOCK_ASSERT(); 2706 2707 if (!(x->bm_flags & BW_METER_LEQ)) 2708 return; /* XXX: we schedule timers only for "<=" entries */ 2709 2710 /* 2711 * Reset the bw_meter entry 2712 */ 2713 x->bm_start_time = *nowp; 2714 x->bm_measured.b_packets = 0; 2715 x->bm_measured.b_bytes = 0; 2716 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2717 2718 /* 2719 * Compute the timeout hash value and insert the entry 2720 */ 2721 BW_METER_TIMEHASH(x, time_hash); 2722 x->bm_time_next = bw_meter_timers[time_hash]; 2723 bw_meter_timers[time_hash] = x; 2724 x->bm_time_hash = time_hash; 2725} 2726 2727/* 2728 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2729 * by removing the entry from the proper hash bucket. 2730 */ 2731static void 2732unschedule_bw_meter(struct bw_meter *x) 2733{ 2734 int time_hash; 2735 struct bw_meter *prev, *tmp; 2736 2737 MFC_LOCK_ASSERT(); 2738 2739 if (!(x->bm_flags & BW_METER_LEQ)) 2740 return; /* XXX: we schedule timers only for "<=" entries */ 2741 2742 /* 2743 * Compute the timeout hash value and delete the entry 2744 */ 2745 time_hash = x->bm_time_hash; 2746 if (time_hash >= BW_METER_BUCKETS) 2747 return; /* Entry was not scheduled */ 2748 2749 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2750 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2751 if (tmp == x) 2752 break; 2753 2754 if (tmp == NULL) 2755 panic("unschedule_bw_meter: bw_meter entry not found"); 2756 2757 if (prev != NULL) 2758 prev->bm_time_next = x->bm_time_next; 2759 else 2760 bw_meter_timers[time_hash] = x->bm_time_next; 2761 2762 x->bm_time_next = NULL; 2763 x->bm_time_hash = BW_METER_BUCKETS; 2764} 2765 2766 2767/* 2768 * Process all "<=" type of bw_meter that should be processed now, 2769 * and for each entry prepare an upcall if necessary. Each processed 2770 * entry is rescheduled again for the (periodic) processing. 
2771 * 2772 * This is run periodically (once per second normally). On each round, 2773 * all the potentially matching entries are in the hash slot that we are 2774 * looking at. 2775 */ 2776static void 2777bw_meter_process() 2778{ 2779 static uint32_t last_tv_sec; /* last time we processed this */ 2780 2781 uint32_t loops; 2782 int i; 2783 struct timeval now, process_endtime; 2784 2785 GET_TIME(now); 2786 if (last_tv_sec == now.tv_sec) 2787 return; /* nothing to do */ 2788 2789 loops = now.tv_sec - last_tv_sec; 2790 last_tv_sec = now.tv_sec; 2791 if (loops > BW_METER_BUCKETS) 2792 loops = BW_METER_BUCKETS; 2793 2794 MFC_LOCK(); 2795 /* 2796 * Process all bins of bw_meter entries from the one after the last 2797 * processed to the current one. On entry, i points to the last bucket 2798 * visited, so we need to increment i at the beginning of the loop. 2799 */ 2800 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2801 struct bw_meter *x, *tmp_list; 2802 2803 if (++i >= BW_METER_BUCKETS) 2804 i = 0; 2805 2806 /* Disconnect the list of bw_meter entries from the bin */ 2807 tmp_list = bw_meter_timers[i]; 2808 bw_meter_timers[i] = NULL; 2809 2810 /* Process the list of bw_meter entries */ 2811 while (tmp_list != NULL) { 2812 x = tmp_list; 2813 tmp_list = tmp_list->bm_time_next; 2814 2815 /* Test if the time interval is over */ 2816 process_endtime = x->bm_start_time; 2817 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2818 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2819 /* Not yet: reschedule, but don't reset */ 2820 int time_hash; 2821 2822 BW_METER_TIMEHASH(x, time_hash); 2823 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2824 /* 2825 * XXX: somehow the bin processing is a bit ahead of time. 2826 * Put the entry in the next bin. 
2827 */ 2828 if (++time_hash >= BW_METER_BUCKETS) 2829 time_hash = 0; 2830 } 2831 x->bm_time_next = bw_meter_timers[time_hash]; 2832 bw_meter_timers[time_hash] = x; 2833 x->bm_time_hash = time_hash; 2834 2835 continue; 2836 } 2837 2838 /* 2839 * Test if we should deliver an upcall 2840 */ 2841 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2842 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2843 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2844 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2845 /* Prepare an upcall for delivery */ 2846 bw_meter_prepare_upcall(x, &now); 2847 } 2848 2849 /* 2850 * Reschedule for next processing 2851 */ 2852 schedule_bw_meter(x, &now); 2853 } 2854 } 2855 2856 /* Send all upcalls that are pending delivery */ 2857 bw_upcalls_send(); 2858 2859 MFC_UNLOCK(); 2860} 2861 2862/* 2863 * A periodic function for sending all upcalls that are pending delivery 2864 */ 2865static void 2866expire_bw_upcalls_send(void *unused) 2867{ 2868 MFC_LOCK(); 2869 bw_upcalls_send(); 2870 MFC_UNLOCK(); 2871 2872 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2873 expire_bw_upcalls_send, NULL); 2874} 2875 2876/* 2877 * A periodic function for periodic scanning of the multicast forwarding 2878 * table for processing all "<=" bw_meter entries. 
2879 */ 2880static void 2881expire_bw_meter_process(void *unused) 2882{ 2883 if (mrt_api_config & MRT_MFC_BW_UPCALL) 2884 bw_meter_process(); 2885 2886 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 2887} 2888 2889/* 2890 * End of bandwidth monitoring code 2891 */ 2892 2893#ifdef PIM 2894/* 2895 * Send the packet up to the user daemon, or eventually do kernel encapsulation 2896 * 2897 */ 2898static int 2899pim_register_send(struct ip *ip, struct vif *vifp, 2900 struct mbuf *m, struct mfc *rt) 2901{ 2902 struct mbuf *mb_copy, *mm; 2903 2904 if (mrtdebug & DEBUG_PIM) 2905 log(LOG_DEBUG, "pim_register_send: "); 2906 2907 mb_copy = pim_register_prepare(ip, m); 2908 if (mb_copy == NULL) 2909 return ENOBUFS; 2910 2911 /* 2912 * Send all the fragments. Note that the mbuf for each fragment 2913 * is freed by the sending machinery. 2914 */ 2915 for (mm = mb_copy; mm; mm = mb_copy) { 2916 mb_copy = mm->m_nextpkt; 2917 mm->m_nextpkt = 0; 2918 mm = m_pullup(mm, sizeof(struct ip)); 2919 if (mm != NULL) { 2920 ip = mtod(mm, struct ip *); 2921 if ((mrt_api_config & MRT_MFC_RP) && 2922 (rt->mfc_rp.s_addr != INADDR_ANY)) { 2923 pim_register_send_rp(ip, vifp, mm, rt); 2924 } else { 2925 pim_register_send_upcall(ip, vifp, mm, rt); 2926 } 2927 } 2928 } 2929 2930 return 0; 2931} 2932 2933/* 2934 * Return a copy of the data packet that is ready for PIM Register 2935 * encapsulation. 2936 * XXX: Note that in the returned copy the IP header is a valid one. 2937 */ 2938static struct mbuf * 2939pim_register_prepare(struct ip *ip, struct mbuf *m) 2940{ 2941 struct mbuf *mb_copy = NULL; 2942 int mtu; 2943 2944 /* Take care of delayed checksums */ 2945 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 2946 in_delayed_cksum(m); 2947 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 2948 } 2949 2950 /* 2951 * Copy the old packet & pullup its IP header into the 2952 * new mbuf so we can modify it. 
2953 */ 2954 mb_copy = m_copypacket(m, M_DONTWAIT); 2955 if (mb_copy == NULL) 2956 return NULL; 2957 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 2958 if (mb_copy == NULL) 2959 return NULL; 2960 2961 /* take care of the TTL */ 2962 ip = mtod(mb_copy, struct ip *); 2963 --ip->ip_ttl; 2964 2965 /* Compute the MTU after the PIM Register encapsulation */ 2966 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 2967 2968 if (ip->ip_len <= mtu) { 2969 /* Turn the IP header into a valid one */ 2970 ip->ip_len = htons(ip->ip_len); 2971 ip->ip_off = htons(ip->ip_off); 2972 ip->ip_sum = 0; 2973 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 2974 } else { 2975 /* Fragment the packet */ 2976 if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { 2977 m_freem(mb_copy); 2978 return NULL; 2979 } 2980 } 2981 return mb_copy; 2982} 2983 2984/* 2985 * Send an upcall with the data packet to the user-level process. 2986 */ 2987static int 2988pim_register_send_upcall(struct ip *ip, struct vif *vifp, 2989 struct mbuf *mb_copy, struct mfc *rt) 2990{ 2991 struct mbuf *mb_first; 2992 int len = ntohs(ip->ip_len); 2993 struct igmpmsg *im; 2994 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2995 2996 VIF_LOCK_ASSERT(); 2997 2998 /* 2999 * Add a new mbuf with an upcall header 3000 */ 3001 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3002 if (mb_first == NULL) { 3003 m_freem(mb_copy); 3004 return ENOBUFS; 3005 } 3006 mb_first->m_data += max_linkhdr; 3007 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 3008 mb_first->m_len = sizeof(struct igmpmsg); 3009 mb_first->m_next = mb_copy; 3010 3011 /* Send message to routing daemon */ 3012 im = mtod(mb_first, struct igmpmsg *); 3013 im->im_msgtype = IGMPMSG_WHOLEPKT; 3014 im->im_mbz = 0; 3015 im->im_vif = vifp - viftable; 3016 im->im_src = ip->ip_src; 3017 im->im_dst = ip->ip_dst; 3018 3019 k_igmpsrc.sin_addr = ip->ip_src; 3020 3021 mrtstat.mrts_upcalls++; 3022 3023 if (socket_send(ip_mrouter, mb_first, 
&k_igmpsrc) < 0) { 3024 if (mrtdebug & DEBUG_PIM) 3025 log(LOG_WARNING, 3026 "mcast: pim_register_send_upcall: ip_mrouter socket queue full"); 3027 ++mrtstat.mrts_upq_sockfull; 3028 return ENOBUFS; 3029 } 3030 3031 /* Keep statistics */ 3032 pimstat.pims_snd_registers_msgs++; 3033 pimstat.pims_snd_registers_bytes += len; 3034 3035 return 0; 3036} 3037 3038/* 3039 * Encapsulate the data packet in PIM Register message and send it to the RP. 3040 */ 3041static int 3042pim_register_send_rp(struct ip *ip, struct vif *vifp, 3043 struct mbuf *mb_copy, struct mfc *rt) 3044{ 3045 struct mbuf *mb_first; 3046 struct ip *ip_outer; 3047 struct pim_encap_pimhdr *pimhdr; 3048 int len = ntohs(ip->ip_len); 3049 vifi_t vifi = rt->mfc_parent; 3050 3051 VIF_LOCK_ASSERT(); 3052 3053 if ((vifi >= numvifs) || (viftable[vifi].v_lcl_addr.s_addr == 0)) { 3054 m_freem(mb_copy); 3055 return EADDRNOTAVAIL; /* The iif vif is invalid */ 3056 } 3057 3058 /* 3059 * Add a new mbuf with the encapsulating header 3060 */ 3061 MGETHDR(mb_first, M_DONTWAIT, MT_HEADER); 3062 if (mb_first == NULL) { 3063 m_freem(mb_copy); 3064 return ENOBUFS; 3065 } 3066 mb_first->m_data += max_linkhdr; 3067 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3068 mb_first->m_next = mb_copy; 3069 3070 mb_first->m_pkthdr.len = len + mb_first->m_len; 3071 3072 /* 3073 * Fill in the encapsulating IP and PIM header 3074 */ 3075 ip_outer = mtod(mb_first, struct ip *); 3076 *ip_outer = pim_encap_iphdr; 3077#ifdef RANDOM_IP_ID 3078 ip_outer->ip_id = ip_randomid(); 3079#else 3080 ip_outer->ip_id = htons(ip_id++); 3081#endif 3082 ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 3083 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 3084 ip_outer->ip_dst = rt->mfc_rp; 3085 /* 3086 * Copy the inner header TOS to the outer header, and take care of the 3087 * IP_DF bit. 
3088 */ 3089 ip_outer->ip_tos = ip->ip_tos; 3090 if (ntohs(ip->ip_off) & IP_DF) 3091 ip_outer->ip_off |= IP_DF; 3092 pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer 3093 + sizeof(pim_encap_iphdr)); 3094 *pimhdr = pim_encap_pimhdr; 3095 /* If the iif crosses a border, set the Border-bit */ 3096 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 3097 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 3098 3099 mb_first->m_data += sizeof(pim_encap_iphdr); 3100 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 3101 mb_first->m_data -= sizeof(pim_encap_iphdr); 3102 3103 if (vifp->v_rate_limit == 0) 3104 tbf_send_packet(vifp, mb_first); 3105 else 3106 tbf_control(vifp, mb_first, ip, ip_outer->ip_len); 3107 3108 /* Keep statistics */ 3109 pimstat.pims_snd_registers_msgs++; 3110 pimstat.pims_snd_registers_bytes += len; 3111 3112 return 0; 3113} 3114 3115/* 3116 * PIM-SMv2 and PIM-DM messages processing. 3117 * Receives and verifies the PIM control messages, and passes them 3118 * up to the listening socket, using rip_input(). 3119 * The only message with special processing is the PIM_REGISTER message 3120 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 3121 * is passed to if_simloop(). 
3122 */ 3123void 3124pim_input(struct mbuf *m, int off) 3125{ 3126 struct ip *ip = mtod(m, struct ip *); 3127 struct pim *pim; 3128 int minlen; 3129 int datalen = ip->ip_len; 3130 int ip_tos; 3131 int iphlen = off; 3132 3133 /* Keep statistics */ 3134 pimstat.pims_rcv_total_msgs++; 3135 pimstat.pims_rcv_total_bytes += datalen; 3136 3137 /* 3138 * Validate lengths 3139 */ 3140 if (datalen < PIM_MINLEN) { 3141 pimstat.pims_rcv_tooshort++; 3142 log(LOG_ERR, "pim_input: packet size too small %d from %lx\n", 3143 datalen, (u_long)ip->ip_src.s_addr); 3144 m_freem(m); 3145 return; 3146 } 3147 3148 /* 3149 * If the packet is at least as big as a REGISTER, go agead 3150 * and grab the PIM REGISTER header size, to avoid another 3151 * possible m_pullup() later. 3152 * 3153 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 3154 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 3155 */ 3156 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); 3157 /* 3158 * Get the IP and PIM headers in contiguous memory, and 3159 * possibly the PIM REGISTER header. 3160 */ 3161 if ((m->m_flags & M_EXT || m->m_len < minlen) && 3162 (m = m_pullup(m, minlen)) == 0) { 3163 log(LOG_ERR, "pim_input: m_pullup failure\n"); 3164 return; 3165 } 3166 /* m_pullup() may have given us a new mbuf so reset ip. */ 3167 ip = mtod(m, struct ip *); 3168 ip_tos = ip->ip_tos; 3169 3170 /* adjust mbuf to point to the PIM header */ 3171 m->m_data += iphlen; 3172 m->m_len -= iphlen; 3173 pim = mtod(m, struct pim *); 3174 3175 /* 3176 * Validate checksum. If PIM REGISTER, exclude the data packet. 3177 * 3178 * XXX: some older PIMv2 implementations don't make this distinction, 3179 * so for compatibility reason perform the checksum over part of the 3180 * message, and if error, then over the whole message. 
3181 */ 3182 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { 3183 /* do nothing, checksum okay */ 3184 } else if (in_cksum(m, datalen)) { 3185 pimstat.pims_rcv_badsum++; 3186 if (mrtdebug & DEBUG_PIM) 3187 log(LOG_DEBUG, "pim_input: invalid checksum"); 3188 m_freem(m); 3189 return; 3190 } 3191 3192 /* PIM version check */ 3193 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 3194 pimstat.pims_rcv_badversion++; 3195 log(LOG_ERR, "pim_input: incorrect version %d, expecting %d\n", 3196 PIM_VT_V(pim->pim_vt), PIM_VERSION); 3197 m_freem(m); 3198 return; 3199 } 3200 3201 /* restore mbuf back to the outer IP */ 3202 m->m_data -= iphlen; 3203 m->m_len += iphlen; 3204 3205 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 3206 /* 3207 * Since this is a REGISTER, we'll make a copy of the register 3208 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 3209 * routing daemon. 3210 */ 3211 struct sockaddr_in dst = { sizeof(dst), AF_INET }; 3212 struct mbuf *mcp; 3213 struct ip *encap_ip; 3214 u_int32_t *reghdr; 3215 struct ifnet *vifp; 3216 3217 VIF_LOCK(); 3218 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 3219 VIF_UNLOCK(); 3220 if (mrtdebug & DEBUG_PIM) 3221 log(LOG_DEBUG, 3222 "pim_input: register vif not set: %d\n", reg_vif_num); 3223 m_freem(m); 3224 return; 3225 } 3226 /* XXX need refcnt? 
*/ 3227 vifp = viftable[reg_vif_num].v_ifp; 3228 VIF_UNLOCK(); 3229 3230 /* 3231 * Validate length 3232 */ 3233 if (datalen < PIM_REG_MINLEN) { 3234 pimstat.pims_rcv_tooshort++; 3235 pimstat.pims_rcv_badregisters++; 3236 log(LOG_ERR, 3237 "pim_input: register packet size too small %d from %lx\n", 3238 datalen, (u_long)ip->ip_src.s_addr); 3239 m_freem(m); 3240 return; 3241 } 3242 3243 reghdr = (u_int32_t *)(pim + 1); 3244 encap_ip = (struct ip *)(reghdr + 1); 3245 3246 if (mrtdebug & DEBUG_PIM) { 3247 log(LOG_DEBUG, 3248 "pim_input[register], encap_ip: %lx -> %lx, encap_ip len %d\n", 3249 (u_long)ntohl(encap_ip->ip_src.s_addr), 3250 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3251 ntohs(encap_ip->ip_len)); 3252 } 3253 3254 /* verify the version number of the inner packet */ 3255 if (encap_ip->ip_v != IPVERSION) { 3256 pimstat.pims_rcv_badregisters++; 3257 if (mrtdebug & DEBUG_PIM) { 3258 log(LOG_DEBUG, "pim_input: invalid IP version (%d) " 3259 "of the inner packet\n", encap_ip->ip_v); 3260 } 3261 m_freem(m); 3262 return; 3263 } 3264 3265 /* verify the inner packet is destined to a mcast group */ 3266 if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { 3267 pimstat.pims_rcv_badregisters++; 3268 if (mrtdebug & DEBUG_PIM) 3269 log(LOG_DEBUG, 3270 "pim_input: inner packet of register is not " 3271 "multicast %lx\n", 3272 (u_long)ntohl(encap_ip->ip_dst.s_addr)); 3273 m_freem(m); 3274 return; 3275 } 3276 3277 /* 3278 * Copy the TOS from the outer IP header to the inner IP header. 3279 */ 3280 if (encap_ip->ip_tos != ip_tos) { 3281 /* Outer TOS -> inner TOS */ 3282 encap_ip->ip_tos = ip_tos; 3283 /* Recompute the inner header checksum. Sigh... 
*/ 3284 3285 /* adjust mbuf to point to the inner IP header */ 3286 m->m_data += (iphlen + PIM_MINLEN); 3287 m->m_len -= (iphlen + PIM_MINLEN); 3288 3289 encap_ip->ip_sum = 0; 3290 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 3291 3292 /* restore mbuf to point back to the outer IP header */ 3293 m->m_data -= (iphlen + PIM_MINLEN); 3294 m->m_len += (iphlen + PIM_MINLEN); 3295 } 3296 3297 /* If a NULL_REGISTER, pass it to the daemon */ 3298 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 3299 goto pim_input_to_daemon; 3300 3301 /* 3302 * Decapsulate the inner IP packet and loopback to forward it 3303 * as a normal multicast packet. Also, make a copy of the 3304 * outer_iphdr + pimhdr + reghdr + encap_iphdr 3305 * to pass to the daemon later, so it can take the appropriate 3306 * actions (e.g., send back PIM_REGISTER_STOP). 3307 * XXX: here m->m_data points to the outer IP header. 3308 */ 3309 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); 3310 if (mcp == NULL) { 3311 log(LOG_ERR, 3312 "pim_input: pim register: could not copy register head\n"); 3313 m_freem(m); 3314 return; 3315 } 3316 3317 /* Keep statistics */ 3318 /* XXX: registers_bytes include only the encap. mcast pkt */ 3319 pimstat.pims_rcv_registers_msgs++; 3320 pimstat.pims_rcv_registers_bytes += ntohs(encap_ip->ip_len); 3321 3322 /* 3323 * forward the inner ip packet; point m_data at the inner ip. 3324 */ 3325 m_adj(m, iphlen + PIM_MINLEN); 3326 3327 if (mrtdebug & DEBUG_PIM) { 3328 log(LOG_DEBUG, 3329 "pim_input: forwarding decapsulated register: " 3330 "src %lx, dst %lx, vif %d\n", 3331 (u_long)ntohl(encap_ip->ip_src.s_addr), 3332 (u_long)ntohl(encap_ip->ip_dst.s_addr), 3333 reg_vif_num); 3334 } 3335 /* NB: vifp was collected above; can it change on us? 
*/ 3336 if_simloop(vifp, m, dst.sin_family, 0); 3337 3338 /* prepare the register head to send to the mrouting daemon */ 3339 m = mcp; 3340 } 3341 3342pim_input_to_daemon: 3343 /* 3344 * Pass the PIM message up to the daemon; if it is a Register message, 3345 * pass the 'head' only up to the daemon. This includes the 3346 * outer IP header, PIM header, PIM-Register header and the 3347 * inner IP header. 3348 * XXX: the outer IP header pkt size of a Register is not adjust to 3349 * reflect the fact that the inner multicast data is truncated. 3350 */ 3351 rip_input(m, iphlen); 3352 3353 return; 3354} 3355#endif /* PIM */ 3356 3357static int 3358ip_mroute_modevent(module_t mod, int type, void *unused) 3359{ 3360 int s; 3361 3362 switch (type) { 3363 case MOD_LOAD: 3364 s = splnet(); 3365 ip_mrouter_reset(); 3366 /* XXX synchronize setup */ 3367 ip_mcast_src = X_ip_mcast_src; 3368 ip_mforward = X_ip_mforward; 3369 ip_mrouter_done = X_ip_mrouter_done; 3370 ip_mrouter_get = X_ip_mrouter_get; 3371 ip_mrouter_set = X_ip_mrouter_set; 3372 ip_rsvp_force_done = X_ip_rsvp_force_done; 3373 ip_rsvp_vif = X_ip_rsvp_vif; 3374 legal_vif_num = X_legal_vif_num; 3375 mrt_ioctl = X_mrt_ioctl; 3376 rsvp_input_p = X_rsvp_input; 3377 break; 3378 3379 case MOD_UNLOAD: 3380 /* 3381 * Typically module unload happens after the user-level 3382 * process has shutdown the kernel services (the check 3383 * below insures someone can't just yank the module out 3384 * from under a running process). But if the module is 3385 * just loaded and then unloaded w/o starting up a user 3386 * process we still need to cleanup. 
3387 */ 3388 if (ip_mrouter) 3389 return EINVAL; 3390 3391 X_ip_mrouter_done(); 3392 ip_mcast_src = NULL; 3393 ip_mforward = NULL; 3394 ip_mrouter_done = NULL; 3395 ip_mrouter_get = NULL; 3396 ip_mrouter_set = NULL; 3397 ip_rsvp_force_done = NULL; 3398 ip_rsvp_vif = NULL; 3399 legal_vif_num = NULL; 3400 mrt_ioctl = NULL; 3401 rsvp_input_p = NULL; 3402 break; 3403 } 3404 return 0; 3405} 3406 3407static moduledata_t ip_mroutemod = { 3408 "ip_mroute", 3409 ip_mroute_modevent, 3410 0 3411}; 3412DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); 3413