ip_mroute.c revision 118501
1/*
2 * IP multicast forwarding procedures
3 *
4 * Written by David Waitzman, BBN Labs, August 1988.
5 * Modified by Steve Deering, Stanford, February 1989.
6 * Modified by Mark J. Steiglitz, Stanford, May, 1991
7 * Modified by Van Jacobson, LBL, January 1993
8 * Modified by Ajit Thyagarajan, PARC, August 1993
9 * Modified by Bill Fenner, PARC, April 1995
10 *
11 * MROUTING Revision: 3.5
12 * $FreeBSD: head/sys/netinet/ip_mroute.c 118501 2003-08-05 17:01:33Z hsu $
13 */
14
15#include "opt_mac.h"
16#include "opt_mrouting.h"
17#include "opt_random_ip_id.h"
18
19#include <sys/param.h>
20#include <sys/kernel.h>
21#include <sys/lock.h>
22#include <sys/mac.h>
23#include <sys/malloc.h>
24#include <sys/mbuf.h>
25#include <sys/protosw.h>
26#include <sys/signalvar.h>
27#include <sys/socket.h>
28#include <sys/socketvar.h>
29#include <sys/sockio.h>
30#include <sys/sx.h>
31#include <sys/sysctl.h>
32#include <sys/syslog.h>
33#include <sys/systm.h>
34#include <sys/time.h>
35#include <net/if.h>
36#include <net/netisr.h>
37#include <net/route.h>
38#include <netinet/in.h>
39#include <netinet/igmp.h>
40#include <netinet/in_systm.h>
41#include <netinet/in_var.h>
42#include <netinet/ip.h>
43#include <netinet/ip_encap.h>
44#include <netinet/ip_mroute.h>
45#include <netinet/ip_var.h>
46#include <netinet/udp.h>
47#include <machine/in_cksum.h>
48
49/*
50 * Control debugging code for rsvp and multicast routing code.
51 * Can only set them with the debugger.
52 */
53static u_int    rsvpdebug;		/* non-zero enables debugging	*/
54
55static u_int	mrtdebug;		/* any set of the flags below	*/
56#define		DEBUG_MFC	0x02
57#define		DEBUG_FORWARD	0x04
58#define		DEBUG_EXPIRE	0x08
59#define		DEBUG_XMIT	0x10
60
61#define M_HASCL(m)	((m)->m_flags & M_EXT)
62
63static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast routing tables");
64
65static struct mrtstat	mrtstat;
66SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
67    &mrtstat, mrtstat,
68    "Multicast Routing Statistics (struct mrtstat, netinet/ip_mroute.h)");
69
70static struct mfc	*mfctable[MFCTBLSIZ];
71SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD,
72    &mfctable, sizeof(mfctable), "S,*mfc[MFCTBLSIZ]",
73    "Multicast Forwarding Table (struct *mfc[MFCTBLSIZ], netinet/ip_mroute.h)");
74
75static struct vif	viftable[MAXVIFS];
76SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
77    &viftable, sizeof(viftable), "S,vif[MAXVIFS]",
78    "Multicast Virtual Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");
79
80static u_char		nexpire[MFCTBLSIZ];
81
82static struct callout_handle expire_upcalls_ch;
83
84#define		EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second		*/
85#define		UPCALL_EXPIRE	6		/* number of timeouts	*/
86
87/*
88 * Define the token bucket filter structures
89 * tbftable -> each vif has one of these for storing info
90 */
91
92static struct tbf tbftable[MAXVIFS];
93#define		TBF_REPROCESS	(hz / 100)	/* 100x / second */
94
95/*
96 * 'Interfaces' associated with decapsulator (so we can tell
97 * packets that went through it from ones that get reflected
98 * by a broken gateway).  These interfaces are never linked into
99 * the system ifnet list & no routes point to them.  I.e., packets
100 * can't be sent this way.  They only exist as a placeholder for
101 * multicast source verification.
102 */
103static struct ifnet multicast_decap_if[MAXVIFS];
104
105#define ENCAP_TTL 64
106#define ENCAP_PROTO IPPROTO_IPIP	/* 4 */
107
108/* prototype IP hdr for encapsulated packets */
109static struct ip multicast_encap_iphdr = {
110#if BYTE_ORDER == LITTLE_ENDIAN
111	sizeof(struct ip) >> 2, IPVERSION,
112#else
113	IPVERSION, sizeof(struct ip) >> 2,
114#endif
115	0,				/* tos */
116	sizeof(struct ip),		/* total length */
117	0,				/* id */
118	0,				/* frag offset */
119	ENCAP_TTL, ENCAP_PROTO,
120	0,				/* checksum */
121};
122
123/*
124 * Private variables.
125 */
126static vifi_t	   numvifs;
127static const struct encaptab *encap_cookie;
128
129/*
130 * one-back cache used by mroute_encapcheck to locate a tunnel's vif
131 * given a datagram's src ip address.
132 */
133static u_long last_encap_src;
134static struct vif *last_encap_vif;
135
136static u_long	X_ip_mcast_src(int vifi);
137static int	X_ip_mforward(struct ip *ip, struct ifnet *ifp,
138			struct mbuf *m, struct ip_moptions *imo);
139static int	X_ip_mrouter_done(void);
140static int	X_ip_mrouter_get(struct socket *so, struct sockopt *m);
141static int	X_ip_mrouter_set(struct socket *so, struct sockopt *m);
142static int	X_legal_vif_num(int vif);
143static int	X_mrt_ioctl(int cmd, caddr_t data);
144
145static int get_sg_cnt(struct sioc_sg_req *);
146static int get_vif_cnt(struct sioc_vif_req *);
147static int ip_mrouter_init(struct socket *, int);
148static int add_vif(struct vifctl *);
149static int del_vif(vifi_t);
150static int add_mfc(struct mfcctl *);
151static int del_mfc(struct mfcctl *);
152static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in *);
153static int set_assert(int);
154static void expire_upcalls(void *);
155static int ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
156static void phyint_send(struct ip *, struct vif *, struct mbuf *);
157static void encap_send(struct ip *, struct vif *, struct mbuf *);
158static void tbf_control(struct vif *, struct mbuf *, struct ip *, u_long);
159static void tbf_queue(struct vif *, struct mbuf *);
160static void tbf_process_q(struct vif *);
161static void tbf_reprocess_q(void *);
162static int tbf_dq_sel(struct vif *, struct ip *);
163static void tbf_send_packet(struct vif *, struct mbuf *);
164static void tbf_update_tokens(struct vif *);
165static int priority(struct vif *, struct ip *);
166
/*
 * Non-zero while special PIM assert processing is enabled
 * (set through set_assert(), cleared on init and shutdown).
 */
static int pim_assert;

/*
 * Rate limit for assert notification messages, in usec
 */
#define ASSERT_MSG_TIME		3000000
175
/*
 * Hash function for a (source, group) entry.  Each address is XOR-folded
 * with itself shifted right by 10 and by 20 bits, the two folds are
 * XOR'ed together and the result is reduced by MFCHASHMOD().
 */
#define MFC_ADDRFOLD(x)	(((x) >> 20) ^ ((x) >> 10) ^ (x))
#define MFCHASH(a, g)	MFCHASHMOD(MFC_ADDRFOLD(a) ^ MFC_ADDRFOLD(g))
181
182/*
183 * Find a route for a given origin IP address and Multicast group address
184 * Type of service parameter to be added in the future!!!
185 * Statistics are updated by the caller if needed
186 * (mrtstat.mrts_mfc_lookups and mrtstat.mrts_mfc_misses)
187 */
188static struct mfc *
189mfc_find(in_addr_t o, in_addr_t g)
190{
191    struct mfc *rt;
192
193    for (rt = mfctable[MFCHASH(o,g)]; rt; rt = rt->mfc_next)
194	if ((rt->mfc_origin.s_addr == o) &&
195		(rt->mfc_mcastgrp.s_addr == g) && (rt->mfc_stall == NULL))
196	    break;
197    return rt;
198}
199
/*
 * Macros to compute elapsed time efficiently
 * Borrowed from Van Jacobson's scheduling code
 *
 * TV_DELTA stores (a - b) in microseconds into 'delta'; second
 * differences of 1 and 2 are handled by plain additions, anything
 * else by a multiplication.
 */
#define TV_DELTA(a, b, delta) {					\
	int xxsecs;						\
								\
	delta = (a).tv_usec - (b).tv_usec;			\
	xxsecs = (a).tv_sec - (b).tv_sec;			\
	switch (xxsecs) {					\
	case 0:							\
		break;						\
	case 2:							\
		delta += 1000000;				\
		/* FALLTHROUGH */				\
	case 1:							\
		delta += 1000000;				\
		break;						\
	default:						\
		delta += (1000000 * xxsecs);			\
	}							\
}

/* True when time value a is strictly earlier than b. */
#define TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	      ((a).tv_sec <= (b).tv_sec && (a).tv_usec < (b).tv_usec))
223
224/*
225 * Handle MRT setsockopt commands to modify the multicast routing tables.
226 */
227static int
228X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
229{
230    int	error, optval;
231    vifi_t	vifi;
232    struct	vifctl vifc;
233    struct	mfcctl mfc;
234
235    if (so != ip_mrouter && sopt->sopt_name != MRT_INIT)
236	return EPERM;
237
238    error = 0;
239    switch (sopt->sopt_name) {
240    case MRT_INIT:
241	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
242	if (error)
243	    break;
244	error = ip_mrouter_init(so, optval);
245	break;
246
247    case MRT_DONE:
248	error = ip_mrouter_done();
249	break;
250
251    case MRT_ADD_VIF:
252	error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
253	if (error)
254	    break;
255	error = add_vif(&vifc);
256	break;
257
258    case MRT_DEL_VIF:
259	error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
260	if (error)
261	    break;
262	error = del_vif(vifi);
263	break;
264
265    case MRT_ADD_MFC:
266    case MRT_DEL_MFC:
267	error = sooptcopyin(sopt, &mfc, sizeof mfc, sizeof mfc);
268	if (error)
269	    break;
270	if (sopt->sopt_name == MRT_ADD_MFC)
271	    error = add_mfc(&mfc);
272	else
273	    error = del_mfc(&mfc);
274	break;
275
276    case MRT_ASSERT:
277	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
278	if (error)
279	    break;
280	set_assert(optval);
281	break;
282
283    default:
284	error = EOPNOTSUPP;
285	break;
286    }
287    return error;
288}
289
290/*
291 * Handle MRT getsockopt commands
292 */
293static int
294X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
295{
296    int error;
297    static int version = 0x0305; /* !!! why is this here? XXX */
298
299    switch (sopt->sopt_name) {
300    case MRT_VERSION:
301	error = sooptcopyout(sopt, &version, sizeof version);
302	break;
303
304    case MRT_ASSERT:
305	error = sooptcopyout(sopt, &pim_assert, sizeof pim_assert);
306	break;
307
308    default:
309	error = EOPNOTSUPP;
310	break;
311    }
312    return error;
313}
314
315/*
316 * Handle ioctl commands to obtain information from the cache
317 */
318static int
319X_mrt_ioctl(int cmd, caddr_t data)
320{
321    int error = 0;
322
323    switch (cmd) {
324    case (SIOCGETVIFCNT):
325	error = get_vif_cnt((struct sioc_vif_req *)data);
326	break;
327
328    case (SIOCGETSGCNT):
329	error = get_sg_cnt((struct sioc_sg_req *)data);
330	break;
331
332    default:
333	error = EINVAL;
334	break;
335    }
336    return error;
337}
338
339/*
340 * returns the packet, byte, rpf-failure count for the source group provided
341 */
342static int
343get_sg_cnt(struct sioc_sg_req *req)
344{
345    int s;
346    struct mfc *rt;
347
348    s = splnet();
349    rt = mfc_find(req->src.s_addr, req->grp.s_addr);
350    splx(s);
351    if (rt == NULL) {
352	req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
353	return EADDRNOTAVAIL;
354    }
355    req->pktcnt = rt->mfc_pkt_cnt;
356    req->bytecnt = rt->mfc_byte_cnt;
357    req->wrong_if = rt->mfc_wrong_if;
358    return 0;
359}
360
361/*
362 * returns the input and output packet and byte counts on the vif provided
363 */
364static int
365get_vif_cnt(struct sioc_vif_req *req)
366{
367    vifi_t vifi = req->vifi;
368
369    if (vifi >= numvifs)
370	return EINVAL;
371
372    req->icount = viftable[vifi].v_pkt_in;
373    req->ocount = viftable[vifi].v_pkt_out;
374    req->ibytes = viftable[vifi].v_bytes_in;
375    req->obytes = viftable[vifi].v_bytes_out;
376
377    return 0;
378}
379
380/*
381 * Enable multicast routing
382 */
383static int
384ip_mrouter_init(struct socket *so, int version)
385{
386    if (mrtdebug)
387	log(LOG_DEBUG, "ip_mrouter_init: so_type = %d, pr_protocol = %d\n",
388	    so->so_type, so->so_proto->pr_protocol);
389
390    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP)
391	return EOPNOTSUPP;
392
393    if (version != 1)
394	return ENOPROTOOPT;
395
396    if (ip_mrouter != NULL)
397	return EADDRINUSE;
398
399    ip_mrouter = so;
400
401    bzero((caddr_t)mfctable, sizeof(mfctable));
402    bzero((caddr_t)nexpire, sizeof(nexpire));
403
404    pim_assert = 0;
405
406    expire_upcalls_ch = timeout(expire_upcalls, NULL, EXPIRE_TIMEOUT);
407
408    if (mrtdebug)
409	log(LOG_DEBUG, "ip_mrouter_init\n");
410
411    return 0;
412}
413
414/*
415 * Disable multicast routing
416 */
417static int
418X_ip_mrouter_done(void)
419{
420    vifi_t vifi;
421    int i;
422    struct ifnet *ifp;
423    struct ifreq ifr;
424    struct mfc *rt;
425    struct rtdetq *rte;
426    int s;
427
428    s = splnet();
429
430    /*
431     * For each phyint in use, disable promiscuous reception of all IP
432     * multicasts.
433     */
434    for (vifi = 0; vifi < numvifs; vifi++) {
435	if (viftable[vifi].v_lcl_addr.s_addr != 0 &&
436		!(viftable[vifi].v_flags & VIFF_TUNNEL)) {
437	    struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr);
438
439	    so->sin_len = sizeof(struct sockaddr_in);
440	    so->sin_family = AF_INET;
441	    so->sin_addr.s_addr = INADDR_ANY;
442	    ifp = viftable[vifi].v_ifp;
443	    if_allmulti(ifp, 0);
444	}
445    }
446    bzero((caddr_t)tbftable, sizeof(tbftable));
447    bzero((caddr_t)viftable, sizeof(viftable));
448    numvifs = 0;
449    pim_assert = 0;
450
451    untimeout(expire_upcalls, NULL, expire_upcalls_ch);
452
453    /*
454     * Free all multicast forwarding cache entries.
455     */
456    for (i = 0; i < MFCTBLSIZ; i++) {
457	for (rt = mfctable[i]; rt != NULL; ) {
458	    struct mfc *nr = rt->mfc_next;
459
460	    for (rte = rt->mfc_stall; rte != NULL; ) {
461		struct rtdetq *n = rte->next;
462
463		m_freem(rte->m);
464		free(rte, M_MRTABLE);
465		rte = n;
466	    }
467	    free(rt, M_MRTABLE);
468	    rt = nr;
469	}
470    }
471
472    bzero((caddr_t)mfctable, sizeof(mfctable));
473
474    /*
475     * Reset de-encapsulation cache
476     */
477    last_encap_src = INADDR_ANY;
478    last_encap_vif = NULL;
479    if (encap_cookie) {
480	encap_detach(encap_cookie);
481	encap_cookie = NULL;
482    }
483
484    ip_mrouter = NULL;
485
486    splx(s);
487
488    if (mrtdebug)
489	log(LOG_DEBUG, "ip_mrouter_done\n");
490
491    return 0;
492}
493
494/*
495 * Set PIM assert processing global
496 */
497static int
498set_assert(int i)
499{
500    if ((i != 1) && (i != 0))
501	return EINVAL;
502
503    pim_assert = i;
504
505    return 0;
506}
507
508/*
509 * Decide if a packet is from a tunnelled peer.
510 * Return 0 if not, 64 if so.  XXX yuck.. 64 ???
511 */
512static int
513mroute_encapcheck(const struct mbuf *m, int off, int proto, void *arg)
514{
515    struct ip *ip = mtod(m, struct ip *);
516    int hlen = ip->ip_hl << 2;
517
518    /*
519     * don't claim the packet if it's not to a multicast destination or if
520     * we don't have an encapsulating tunnel with the source.
521     * Note:  This code assumes that the remote site IP address
522     * uniquely identifies the tunnel (i.e., that this site has
523     * at most one tunnel with the remote site).
524     */
525    if (!IN_MULTICAST(ntohl(((struct ip *)((char *)ip+hlen))->ip_dst.s_addr)))
526	return 0;
527    if (ip->ip_src.s_addr != last_encap_src) {
528	struct vif *vifp = viftable;
529	struct vif *vife = vifp + numvifs;
530
531	last_encap_src = ip->ip_src.s_addr;
532	last_encap_vif = NULL;
533	for ( ; vifp < vife; ++vifp)
534	    if (vifp->v_rmt_addr.s_addr == ip->ip_src.s_addr) {
535		if ((vifp->v_flags & (VIFF_TUNNEL|VIFF_SRCRT)) == VIFF_TUNNEL)
536		    last_encap_vif = vifp;
537		break;
538	    }
539    }
540    if (last_encap_vif == NULL) {
541	last_encap_src = INADDR_ANY;
542	return 0;
543    }
544    return 64;
545}
546
547/*
548 * De-encapsulate a packet and feed it back through ip input (this
549 * routine is called whenever IP gets a packet that mroute_encap_func()
550 * claimed).
551 */
552static void
553mroute_encap_input(struct mbuf *m, int off)
554{
555    struct ip *ip = mtod(m, struct ip *);
556    int hlen = ip->ip_hl << 2;
557
558    if (hlen > sizeof(struct ip))
559	ip_stripoptions(m, (struct mbuf *) 0);
560    m->m_data += sizeof(struct ip);
561    m->m_len -= sizeof(struct ip);
562    m->m_pkthdr.len -= sizeof(struct ip);
563
564    m->m_pkthdr.rcvif = last_encap_vif->v_ifp;
565
566    netisr_queue(NETISR_IP, m);
567    /*
568     * normally we would need a "schednetisr(NETISR_IP)"
569     * here but we were called by ip_input and it is going
570     * to loop back & try to dequeue the packet we just
571     * queued as soon as we return so we avoid the
572     * unnecessary software interrrupt.
573     *
574     * XXX
575     * This no longer holds - we may have direct-dispatched the packet,
576     * or there may be a queue processing limit.
577     */
578}
579
580extern struct domain inetdomain;
581static struct protosw mroute_encap_protosw =
582{ SOCK_RAW,	&inetdomain,	IPPROTO_IPV4,	PR_ATOMIC|PR_ADDR,
583  mroute_encap_input,	0,	0,		rip_ctloutput,
584  0,
585  0,		0,		0,		0,
586  &rip_usrreqs
587};
588
589/*
590 * Add a vif to the vif table
591 */
592static int
593add_vif(struct vifctl *vifcp)
594{
595    struct vif *vifp = viftable + vifcp->vifc_vifi;
596    struct sockaddr_in sin = {sizeof sin, AF_INET};
597    struct ifaddr *ifa;
598    struct ifnet *ifp;
599    int error, s;
600    struct tbf *v_tbf = tbftable + vifcp->vifc_vifi;
601
602    if (vifcp->vifc_vifi >= MAXVIFS)
603	return EINVAL;
604    if (vifp->v_lcl_addr.s_addr != INADDR_ANY)
605	return EADDRINUSE;
606    if (vifcp->vifc_lcl_addr.s_addr == INADDR_ANY)
607	return EADDRNOTAVAIL;
608
609    /* Find the interface with an address in AF_INET family */
610    sin.sin_addr = vifcp->vifc_lcl_addr;
611    ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
612    if (ifa == NULL)
613	return EADDRNOTAVAIL;
614    ifp = ifa->ifa_ifp;
615
616    if (vifcp->vifc_flags & VIFF_TUNNEL) {
617	if ((vifcp->vifc_flags & VIFF_SRCRT) == 0) {
618	    /*
619	     * An encapsulating tunnel is wanted.  Tell
620	     * mroute_encap_input() to start paying attention
621	     * to encapsulated packets.
622	     */
623	    if (encap_cookie == NULL) {
624		encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV4,
625				mroute_encapcheck,
626				(struct protosw *)&mroute_encap_protosw, NULL);
627
628		if (encap_cookie == NULL) {
629		    printf("ip_mroute: unable to attach encap\n");
630		    return EIO;	/* XXX */
631		}
632		for (s = 0; s < MAXVIFS; ++s) {
633		    multicast_decap_if[s].if_name = "mdecap";
634		    multicast_decap_if[s].if_unit = s;
635		}
636	    }
637	    /*
638	     * Set interface to fake encapsulator interface
639	     */
640	    ifp = &multicast_decap_if[vifcp->vifc_vifi];
641	    /*
642	     * Prepare cached route entry
643	     */
644	    bzero(&vifp->v_route, sizeof(vifp->v_route));
645	} else {
646	    log(LOG_ERR, "source routed tunnels not supported\n");
647	    return EOPNOTSUPP;
648	}
649    } else {		/* Make sure the interface supports multicast */
650	if ((ifp->if_flags & IFF_MULTICAST) == 0)
651	    return EOPNOTSUPP;
652
653	/* Enable promiscuous reception of all IP multicasts from the if */
654	s = splnet();
655	error = if_allmulti(ifp, 1);
656	splx(s);
657	if (error)
658	    return error;
659    }
660
661    s = splnet();
662    /* define parameters for the tbf structure */
663    vifp->v_tbf = v_tbf;
664    GET_TIME(vifp->v_tbf->tbf_last_pkt_t);
665    vifp->v_tbf->tbf_n_tok = 0;
666    vifp->v_tbf->tbf_q_len = 0;
667    vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
668    vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
669
670    vifp->v_flags     = vifcp->vifc_flags;
671    vifp->v_threshold = vifcp->vifc_threshold;
672    vifp->v_lcl_addr  = vifcp->vifc_lcl_addr;
673    vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
674    vifp->v_ifp       = ifp;
675    /* scaling up here allows division by 1024 in critical code */
676    vifp->v_rate_limit= vifcp->vifc_rate_limit * 1024 / 1000;
677    vifp->v_rsvp_on   = 0;
678    vifp->v_rsvpd     = NULL;
679    /* initialize per vif pkt counters */
680    vifp->v_pkt_in    = 0;
681    vifp->v_pkt_out   = 0;
682    vifp->v_bytes_in  = 0;
683    vifp->v_bytes_out = 0;
684    splx(s);
685
686    /* Adjust numvifs up if the vifi is higher than numvifs */
687    if (numvifs <= vifcp->vifc_vifi) numvifs = vifcp->vifc_vifi + 1;
688
689    if (mrtdebug)
690	log(LOG_DEBUG, "add_vif #%d, lcladdr %lx, %s %lx, thresh %x, rate %d\n",
691	    vifcp->vifc_vifi,
692	    (u_long)ntohl(vifcp->vifc_lcl_addr.s_addr),
693	    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
694	    (u_long)ntohl(vifcp->vifc_rmt_addr.s_addr),
695	    vifcp->vifc_threshold,
696	    vifcp->vifc_rate_limit);
697
698    return 0;
699}
700
701/*
702 * Delete a vif from the vif table
703 */
704static int
705del_vif(vifi_t vifi)
706{
707    struct vif *vifp;
708    int s;
709
710    if (vifi >= numvifs)
711	return EINVAL;
712    vifp = &viftable[vifi];
713    if (vifp->v_lcl_addr.s_addr == INADDR_ANY)
714	return EADDRNOTAVAIL;
715
716    s = splnet();
717
718    if (!(vifp->v_flags & VIFF_TUNNEL))
719	if_allmulti(vifp->v_ifp, 0);
720
721    if (vifp == last_encap_vif) {
722	last_encap_vif = NULL;
723	last_encap_src = INADDR_ANY;
724    }
725
726    /*
727     * Free packets queued at the interface
728     */
729    while (vifp->v_tbf->tbf_q) {
730	struct mbuf *m = vifp->v_tbf->tbf_q;
731
732	vifp->v_tbf->tbf_q = m->m_act;
733	m_freem(m);
734    }
735
736    bzero((caddr_t)vifp->v_tbf, sizeof(*(vifp->v_tbf)));
737    bzero((caddr_t)vifp, sizeof (*vifp));
738
739    if (mrtdebug)
740	log(LOG_DEBUG, "del_vif %d, numvifs %d\n", vifi, numvifs);
741
742    /* Adjust numvifs down */
743    for (vifi = numvifs; vifi > 0; vifi--)
744	if (viftable[vifi-1].v_lcl_addr.s_addr != INADDR_ANY)
745	    break;
746    numvifs = vifi;
747
748    splx(s);
749
750    return 0;
751}
752
753/*
754 * update an mfc entry without resetting counters and S,G addresses.
755 */
756static void
757update_mfc_params(struct mfc *rt, struct mfcctl *mfccp)
758{
759    int i;
760
761    rt->mfc_parent = mfccp->mfcc_parent;
762    for (i = 0; i < numvifs; i++)
763	rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
764}
765
766/*
767 * fully initialize an mfc entry from the parameter.
768 */
769static void
770init_mfc_params(struct mfc *rt, struct mfcctl *mfccp)
771{
772    rt->mfc_origin     = mfccp->mfcc_origin;
773    rt->mfc_mcastgrp   = mfccp->mfcc_mcastgrp;
774
775    update_mfc_params(rt, mfccp);
776
777    /* initialize pkt counters per src-grp */
778    rt->mfc_pkt_cnt    = 0;
779    rt->mfc_byte_cnt   = 0;
780    rt->mfc_wrong_if   = 0;
781    rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_usec = 0;
782}
783
784
785/*
786 * Add an mfc entry
787 */
788static int
789add_mfc(struct mfcctl *mfccp)
790{
791    struct mfc *rt;
792    u_long hash;
793    struct rtdetq *rte;
794    u_short nstl;
795    int s;
796
797    rt = mfc_find(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
798
799    /* If an entry already exists, just update the fields */
800    if (rt) {
801	if (mrtdebug & DEBUG_MFC)
802	    log(LOG_DEBUG,"add_mfc update o %lx g %lx p %x\n",
803		(u_long)ntohl(mfccp->mfcc_origin.s_addr),
804		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
805		mfccp->mfcc_parent);
806
807	s = splnet();
808	update_mfc_params(rt, mfccp);
809	splx(s);
810	return 0;
811    }
812
813    /*
814     * Find the entry for which the upcall was made and update
815     */
816    s = splnet();
817    hash = MFCHASH(mfccp->mfcc_origin.s_addr, mfccp->mfcc_mcastgrp.s_addr);
818    for (rt = mfctable[hash], nstl = 0; rt; rt = rt->mfc_next) {
819
820	if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
821		(rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
822		(rt->mfc_stall != NULL)) {
823
824	    if (nstl++)
825		log(LOG_ERR, "add_mfc %s o %lx g %lx p %x dbx %p\n",
826		    "multiple kernel entries",
827		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
828		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
829		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
830
831	    if (mrtdebug & DEBUG_MFC)
832		log(LOG_DEBUG,"add_mfc o %lx g %lx p %x dbg %p\n",
833		    (u_long)ntohl(mfccp->mfcc_origin.s_addr),
834		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
835		    mfccp->mfcc_parent, (void *)rt->mfc_stall);
836
837	    init_mfc_params(rt, mfccp);
838
839	    rt->mfc_expire = 0;	/* Don't clean this guy up */
840	    nexpire[hash]--;
841
842	    /* free packets Qed at the end of this entry */
843	    for (rte = rt->mfc_stall; rte != NULL; ) {
844		struct rtdetq *n = rte->next;
845
846		ip_mdq(rte->m, rte->ifp, rt, -1);
847		m_freem(rte->m);
848		free(rte, M_MRTABLE);
849		rte = n;
850	    }
851	    rt->mfc_stall = NULL;
852	}
853    }
854
855    /*
856     * It is possible that an entry is being inserted without an upcall
857     */
858    if (nstl == 0) {
859	if (mrtdebug & DEBUG_MFC)
860	    log(LOG_DEBUG,"add_mfc no upcall h %lu o %lx g %lx p %x\n",
861		hash, (u_long)ntohl(mfccp->mfcc_origin.s_addr),
862		(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
863		mfccp->mfcc_parent);
864
865	for (rt = mfctable[hash]; rt != NULL; rt = rt->mfc_next) {
866	    if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
867		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr)) {
868		init_mfc_params(rt, mfccp);
869		if (rt->mfc_expire)
870		    nexpire[hash]--;
871		rt->mfc_expire = 0;
872		break; /* XXX */
873	    }
874	}
875	if (rt == NULL) {		/* no upcall, so make a new entry */
876	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
877	    if (rt == NULL) {
878		splx(s);
879		return ENOBUFS;
880	    }
881
882	    init_mfc_params(rt, mfccp);
883	    rt->mfc_expire     = 0;
884	    rt->mfc_stall      = NULL;
885
886	    /* insert new entry at head of hash chain */
887	    rt->mfc_next = mfctable[hash];
888	    mfctable[hash] = rt;
889	}
890    }
891    splx(s);
892    return 0;
893}
894
895/*
896 * Delete an mfc entry
897 */
898static int
899del_mfc(struct mfcctl *mfccp)
900{
901    struct in_addr 	origin;
902    struct in_addr 	mcastgrp;
903    struct mfc 		*rt;
904    struct mfc	 	**nptr;
905    u_long 		hash;
906    int s;
907
908    origin = mfccp->mfcc_origin;
909    mcastgrp = mfccp->mfcc_mcastgrp;
910
911    if (mrtdebug & DEBUG_MFC)
912	log(LOG_DEBUG,"del_mfc orig %lx mcastgrp %lx\n",
913	    (u_long)ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));
914
915    s = splnet();
916
917    hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
918    for (nptr = &mfctable[hash]; (rt = *nptr) != NULL; nptr = &rt->mfc_next)
919	if (origin.s_addr == rt->mfc_origin.s_addr &&
920		mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
921		rt->mfc_stall == NULL)
922	    break;
923    if (rt == NULL) {
924	splx(s);
925	return EADDRNOTAVAIL;
926    }
927
928    *nptr = rt->mfc_next;
929    free(rt, M_MRTABLE);
930
931    splx(s);
932
933    return 0;
934}
935
936/*
937 * Send a message to mrouted on the multicast routing socket
938 */
939static int
940socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
941{
942    if (s) {
943	if (sbappendaddr(&s->so_rcv, (struct sockaddr *)src, mm, NULL) != 0) {
944	    sorwakeup(s);
945	    return 0;
946	}
947    }
948    m_freem(mm);
949    return -1;
950}
951
952/*
953 * IP multicast forwarding function. This function assumes that the packet
954 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
955 * pointed to by "ifp", and the packet is to be relayed to other networks
956 * that have members of the packet's destination IP multicast group.
957 *
958 * The packet is returned unscathed to the caller, unless it is
959 * erroneous, in which case a non-zero return value tells the caller to
960 * discard it.
961 */
962
963#define TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
964
965static int
966X_ip_mforward(struct ip *ip, struct ifnet *ifp,
967	struct mbuf *m, struct ip_moptions *imo)
968{
969    struct mfc *rt;
970    int s;
971    vifi_t vifi;
972
973    if (mrtdebug & DEBUG_FORWARD)
974	log(LOG_DEBUG, "ip_mforward: src %lx, dst %lx, ifp %p\n",
975	    (u_long)ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr),
976	    (void *)ifp);
977
978    if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
979		((u_char *)(ip + 1))[1] != IPOPT_LSRR ) {
980	/*
981	 * Packet arrived via a physical interface or
982	 * an encapsulated tunnel.
983	 */
984    } else {
985	/*
986	 * Packet arrived through a source-route tunnel.
987	 * Source-route tunnels are no longer supported.
988	 */
989	static int last_log;
990	if (last_log != time_second) {
991	    last_log = time_second;
992	    log(LOG_ERR,
993		"ip_mforward: received source-routed packet from %lx\n",
994		(u_long)ntohl(ip->ip_src.s_addr));
995	}
996	return 1;
997    }
998
999    if ((imo) && ((vifi = imo->imo_multicast_vif) < numvifs)) {
1000	if (ip->ip_ttl < 255)
1001	    ip->ip_ttl++;	/* compensate for -1 in *_send routines */
1002	if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1003	    struct vif *vifp = viftable + vifi;
1004
1005	    printf("Sending IPPROTO_RSVP from %lx to %lx on vif %d (%s%s%d)\n",
1006		(long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr),
1007		vifi,
1008		(vifp->v_flags & VIFF_TUNNEL) ? "tunnel on " : "",
1009		vifp->v_ifp->if_name, vifp->v_ifp->if_unit);
1010	}
1011	return ip_mdq(m, ifp, NULL, vifi);
1012    }
1013    if (rsvpdebug && ip->ip_p == IPPROTO_RSVP) {
1014	printf("Warning: IPPROTO_RSVP from %lx to %lx without vif option\n",
1015	    (long)ntohl(ip->ip_src.s_addr), (long)ntohl(ip->ip_dst.s_addr));
1016	if (!imo)
1017	    printf("In fact, no options were specified at all\n");
1018    }
1019
1020    /*
1021     * Don't forward a packet with time-to-live of zero or one,
1022     * or a packet destined to a local-only group.
1023     */
1024    if (ip->ip_ttl <= 1 || ntohl(ip->ip_dst.s_addr) <= INADDR_MAX_LOCAL_GROUP)
1025	return 0;
1026
1027    /*
1028     * Determine forwarding vifs from the forwarding cache table
1029     */
1030    s = splnet();
1031    ++mrtstat.mrts_mfc_lookups;
1032    rt = mfc_find(ip->ip_src.s_addr, ip->ip_dst.s_addr);
1033
1034    /* Entry exists, so forward if necessary */
1035    if (rt != NULL) {
1036	splx(s);
1037	return ip_mdq(m, ifp, rt, -1);
1038    } else {
1039	/*
1040	 * If we don't have a route for packet's origin,
1041	 * Make a copy of the packet & send message to routing daemon
1042	 */
1043
1044	struct mbuf *mb0;
1045	struct rtdetq *rte;
1046	u_long hash;
1047	int hlen = ip->ip_hl << 2;
1048
1049	++mrtstat.mrts_mfc_misses;
1050
1051	mrtstat.mrts_no_route++;
1052	if (mrtdebug & (DEBUG_FORWARD | DEBUG_MFC))
1053	    log(LOG_DEBUG, "ip_mforward: no rte s %lx g %lx\n",
1054		(u_long)ntohl(ip->ip_src.s_addr),
1055		(u_long)ntohl(ip->ip_dst.s_addr));
1056
1057	/*
1058	 * Allocate mbufs early so that we don't do extra work if we are
1059	 * just going to fail anyway.  Make sure to pullup the header so
1060	 * that other people can't step on it.
1061	 */
1062	rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, M_NOWAIT);
1063	if (rte == NULL) {
1064	    splx(s);
1065	    return ENOBUFS;
1066	}
1067	mb0 = m_copy(m, 0, M_COPYALL);
1068	if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen))
1069	    mb0 = m_pullup(mb0, hlen);
1070	if (mb0 == NULL) {
1071	    free(rte, M_MRTABLE);
1072	    splx(s);
1073	    return ENOBUFS;
1074	}
1075
1076	/* is there an upcall waiting for this flow ? */
1077	hash = MFCHASH(ip->ip_src.s_addr, ip->ip_dst.s_addr);
1078	for (rt = mfctable[hash]; rt; rt = rt->mfc_next) {
1079	    if ((ip->ip_src.s_addr == rt->mfc_origin.s_addr) &&
1080		    (ip->ip_dst.s_addr == rt->mfc_mcastgrp.s_addr) &&
1081		    (rt->mfc_stall != NULL))
1082		break;
1083	}
1084
1085	if (rt == NULL) {
1086	    int i;
1087	    struct igmpmsg *im;
1088	    struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
1089	    struct mbuf *mm;
1090
1091	    /*
1092	     * Locate the vifi for the incoming interface for this packet.
1093	     * If none found, drop packet.
1094	     */
1095	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
1096		;
1097	    if (vifi >= numvifs)	/* vif not found, drop packet */
1098		goto non_fatal;
1099
1100	    /* no upcall, so make a new entry */
1101	    rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
1102	    if (rt == NULL)
1103		goto fail;
1104	    /* Make a copy of the header to send to the user level process */
1105	    mm = m_copy(mb0, 0, hlen);
1106	    if (mm == NULL)
1107		goto fail1;
1108
1109	    /*
1110	     * Send message to routing daemon to install
1111	     * a route into the kernel table
1112	     */
1113
1114	    im = mtod(mm, struct igmpmsg *);
1115	    im->im_msgtype = IGMPMSG_NOCACHE;
1116	    im->im_mbz = 0;
1117	    im->im_vif = vifi;
1118
1119	    mrtstat.mrts_upcalls++;
1120
1121	    k_igmpsrc.sin_addr = ip->ip_src;
1122	    if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
1123		log(LOG_WARNING, "ip_mforward: ip_mrouter socket queue full\n");
1124		++mrtstat.mrts_upq_sockfull;
1125fail1:
1126		free(rt, M_MRTABLE);
1127fail:
1128		free(rte, M_MRTABLE);
1129		m_freem(mb0);
1130		splx(s);
1131		return ENOBUFS;
1132	    }
1133
1134	    /* insert new entry at head of hash chain */
1135	    rt->mfc_origin.s_addr     = ip->ip_src.s_addr;
1136	    rt->mfc_mcastgrp.s_addr   = ip->ip_dst.s_addr;
1137	    rt->mfc_expire	      = UPCALL_EXPIRE;
1138	    nexpire[hash]++;
1139	    for (i = 0; i < numvifs; i++)
1140		rt->mfc_ttls[i] = 0;
1141	    rt->mfc_parent = -1;
1142
1143	    /* link into table */
1144	    rt->mfc_next   = mfctable[hash];
1145	    mfctable[hash] = rt;
1146	    rt->mfc_stall = rte;
1147
1148	} else {
1149	    /* determine if q has overflowed */
1150	    int npkts = 0;
1151	    struct rtdetq **p;
1152
1153	    /*
1154	     * XXX ouch! we need to append to the list, but we
1155	     * only have a pointer to the front, so we have to
1156	     * scan the entire list every time.
1157	     */
1158	    for (p = &rt->mfc_stall; *p != NULL; p = &(*p)->next)
1159		npkts++;
1160
1161	    if (npkts > MAX_UPQ) {
1162		mrtstat.mrts_upq_ovflw++;
1163non_fatal:
1164		free(rte, M_MRTABLE);
1165		m_freem(mb0);
1166		splx(s);
1167		return 0;
1168	    }
1169
1170	    /* Add this entry to the end of the queue */
1171	    *p = rte;
1172	}
1173
1174	rte->m 			= mb0;
1175	rte->ifp 		= ifp;
1176	rte->next		= NULL;
1177
1178	splx(s);
1179
1180	return 0;
1181    }
1182}
1183
1184/*
1185 * Clean up the cache entry if upcall is not serviced
1186 */
1187static void
1188expire_upcalls(void *unused)
1189{
1190    struct rtdetq *rte;
1191    struct mfc *mfc, **nptr;
1192    int i;
1193    int s;
1194
1195    s = splnet();
1196    for (i = 0; i < MFCTBLSIZ; i++) {
1197	if (nexpire[i] == 0)
1198	    continue;
1199	nptr = &mfctable[i];
1200	for (mfc = *nptr; mfc != NULL; mfc = *nptr) {
1201	    /*
1202	     * Skip real cache entries
1203	     * Make sure it wasn't marked to not expire (shouldn't happen)
1204	     * If it expires now
1205	     */
1206	    if (mfc->mfc_stall != NULL && mfc->mfc_expire != 0 &&
1207		    --mfc->mfc_expire == 0) {
1208		if (mrtdebug & DEBUG_EXPIRE)
1209		    log(LOG_DEBUG, "expire_upcalls: expiring (%lx %lx)\n",
1210			(u_long)ntohl(mfc->mfc_origin.s_addr),
1211			(u_long)ntohl(mfc->mfc_mcastgrp.s_addr));
1212		/*
1213		 * drop all the packets
1214		 * free the mbuf with the pkt, if, timing info
1215		 */
1216		for (rte = mfc->mfc_stall; rte; ) {
1217		    struct rtdetq *n = rte->next;
1218
1219		    m_freem(rte->m);
1220		    free(rte, M_MRTABLE);
1221		    rte = n;
1222		}
1223		++mrtstat.mrts_cache_cleanups;
1224		nexpire[i]--;
1225
1226		*nptr = mfc->mfc_next;
1227		free(mfc, M_MRTABLE);
1228	    } else {
1229		nptr = &mfc->mfc_next;
1230	    }
1231	}
1232    }
1233    splx(s);
1234    expire_upcalls_ch = timeout(expire_upcalls, NULL, EXPIRE_TIMEOUT);
1235}
1236
1237/*
1238 * Packet forwarding routine once entry in the cache is made
1239 */
1240static int
1241ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
1242{
1243    struct ip  *ip = mtod(m, struct ip *);
1244    vifi_t vifi;
1245    int plen = ip->ip_len;
1246
1247/*
1248 * Macro to send packet on vif.  Since RSVP packets don't get counted on
1249 * input, they shouldn't get counted on output, so statistics keeping is
1250 * separate.
1251 */
1252#define MC_SEND(ip,vifp,m) {				\
1253		if ((vifp)->v_flags & VIFF_TUNNEL)	\
1254		    encap_send((ip), (vifp), (m));	\
1255		else					\
1256		    phyint_send((ip), (vifp), (m));	\
1257}
1258
1259    /*
1260     * If xmt_vif is not -1, send on only the requested vif.
1261     *
1262     * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.)
1263     */
1264    if (xmt_vif < numvifs) {
1265	MC_SEND(ip, viftable + xmt_vif, m);
1266	return 1;
1267    }
1268
1269    /*
1270     * Don't forward if it didn't arrive from the parent vif for its origin.
1271     */
1272    vifi = rt->mfc_parent;
1273    if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) {
1274	/* came in the wrong interface */
1275	if (mrtdebug & DEBUG_FORWARD)
1276	    log(LOG_DEBUG, "wrong if: ifp %p vifi %d vififp %p\n",
1277		(void *)ifp, vifi, (void *)viftable[vifi].v_ifp);
1278	++mrtstat.mrts_wrong_if;
1279	++rt->mfc_wrong_if;
1280	/*
1281	 * If we are doing PIM assert processing, and we are forwarding
1282	 * packets on this interface, and it is a broadcast medium
1283	 * interface (and not a tunnel), send a message to the routing daemon.
1284	 */
1285	if (pim_assert && rt->mfc_ttls[vifi] &&
1286		(ifp->if_flags & IFF_BROADCAST) &&
1287		!(viftable[vifi].v_flags & VIFF_TUNNEL)) {
1288	    struct timeval now;
1289	    u_long delta;
1290
1291	    /* Get vifi for the incoming packet */
1292	    for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++)
1293		;
1294	    if (vifi >= numvifs)
1295		return 0;	/* if not found: ignore the packet */
1296
1297	    GET_TIME(now);
1298
1299	    TV_DELTA(rt->mfc_last_assert, now, delta);
1300
1301	    if (delta > ASSERT_MSG_TIME) {
1302		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
1303		struct igmpmsg *im;
1304		int hlen = ip->ip_hl << 2;
1305		struct mbuf *mm = m_copy(m, 0, hlen);
1306
1307		if (mm && (M_HASCL(mm) || mm->m_len < hlen))
1308		    mm = m_pullup(mm, hlen);
1309		if (mm == NULL)
1310		    return ENOBUFS;
1311
1312		rt->mfc_last_assert = now;
1313
1314		im = mtod(mm, struct igmpmsg *);
1315		im->im_msgtype	= IGMPMSG_WRONGVIF;
1316		im->im_mbz		= 0;
1317		im->im_vif		= vifi;
1318
1319		mrtstat.mrts_upcalls++;
1320
1321		k_igmpsrc.sin_addr = im->im_src;
1322		if (socket_send(ip_mrouter, mm, &k_igmpsrc) < 0) {
1323		    log(LOG_WARNING,
1324			"ip_mforward: ip_mrouter socket queue full\n");
1325		    ++mrtstat.mrts_upq_sockfull;
1326		    return ENOBUFS;
1327		}
1328	    }
1329	}
1330	return 0;
1331    }
1332
1333    /* If I sourced this packet, it counts as output, else it was input. */
1334    if (ip->ip_src.s_addr == viftable[vifi].v_lcl_addr.s_addr) {
1335	viftable[vifi].v_pkt_out++;
1336	viftable[vifi].v_bytes_out += plen;
1337    } else {
1338	viftable[vifi].v_pkt_in++;
1339	viftable[vifi].v_bytes_in += plen;
1340    }
1341    rt->mfc_pkt_cnt++;
1342    rt->mfc_byte_cnt += plen;
1343
1344    /*
1345     * For each vif, decide if a copy of the packet should be forwarded.
1346     * Forward if:
1347     *		- the ttl exceeds the vif's threshold
1348     *		- there are group members downstream on interface
1349     */
1350    for (vifi = 0; vifi < numvifs; vifi++)
1351	if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
1352	    viftable[vifi].v_pkt_out++;
1353	    viftable[vifi].v_bytes_out += plen;
1354	    MC_SEND(ip, viftable+vifi, m);
1355	}
1356
1357    return 0;
1358}
1359
1360/*
1361 * check if a vif number is legal/ok. This is used by ip_output.
1362 */
1363static int
1364X_legal_vif_num(int vif)
1365{
1366    return (vif >= 0 && vif < numvifs);
1367}
1368
1369/*
1370 * Return the local address used by this vif
1371 */
1372static u_long
1373X_ip_mcast_src(int vifi)
1374{
1375    if (vifi >= 0 && vifi < numvifs)
1376	return viftable[vifi].v_lcl_addr.s_addr;
1377    else
1378	return INADDR_ANY;
1379}
1380
1381static void
1382phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1383{
1384    struct mbuf *mb_copy;
1385    int hlen = ip->ip_hl << 2;
1386
1387    /*
1388     * Make a new reference to the packet; make sure that
1389     * the IP header is actually copied, not just referenced,
1390     * so that ip_output() only scribbles on the copy.
1391     */
1392    mb_copy = m_copy(m, 0, M_COPYALL);
1393    if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen))
1394	mb_copy = m_pullup(mb_copy, hlen);
1395    if (mb_copy == NULL)
1396	return;
1397
1398    if (vifp->v_rate_limit == 0)
1399	tbf_send_packet(vifp, mb_copy);
1400    else
1401	tbf_control(vifp, mb_copy, mtod(mb_copy, struct ip *), ip->ip_len);
1402}
1403
1404static void
1405encap_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
1406{
1407    struct mbuf *mb_copy;
1408    struct ip *ip_copy;
1409    int i, len = ip->ip_len;
1410
1411    /*
1412     * XXX: take care of delayed checksums.
1413     * XXX: if network interfaces are capable of computing checksum for
1414     * encapsulated multicast data packets, we need to reconsider this.
1415     */
1416    if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1417	in_delayed_cksum(m);
1418	m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
1419    }
1420
1421    /*
1422     * copy the old packet & pullup its IP header into the
1423     * new mbuf so we can modify it.  Try to fill the new
1424     * mbuf since if we don't the ethernet driver will.
1425     */
1426    MGETHDR(mb_copy, M_DONTWAIT, MT_HEADER);
1427    if (mb_copy == NULL)
1428	return;
1429#ifdef MAC
1430    mac_create_mbuf_multicast_encap(m, vifp->v_ifp, mb_copy);
1431#endif
1432    mb_copy->m_data += max_linkhdr;
1433    mb_copy->m_len = sizeof(multicast_encap_iphdr);
1434
1435    if ((mb_copy->m_next = m_copy(m, 0, M_COPYALL)) == NULL) {
1436	m_freem(mb_copy);
1437	return;
1438    }
1439    i = MHLEN - M_LEADINGSPACE(mb_copy);
1440    if (i > len)
1441	i = len;
1442    mb_copy = m_pullup(mb_copy, i);
1443    if (mb_copy == NULL)
1444	return;
1445    mb_copy->m_pkthdr.len = len + sizeof(multicast_encap_iphdr);
1446
1447    /*
1448     * fill in the encapsulating IP header.
1449     */
1450    ip_copy = mtod(mb_copy, struct ip *);
1451    *ip_copy = multicast_encap_iphdr;
1452#ifdef RANDOM_IP_ID
1453    ip_copy->ip_id = ip_randomid();
1454#else
1455    ip_copy->ip_id = htons(ip_id++);
1456#endif
1457    ip_copy->ip_len += len;
1458    ip_copy->ip_src = vifp->v_lcl_addr;
1459    ip_copy->ip_dst = vifp->v_rmt_addr;
1460
1461    /*
1462     * turn the encapsulated IP header back into a valid one.
1463     */
1464    ip = (struct ip *)((caddr_t)ip_copy + sizeof(multicast_encap_iphdr));
1465    --ip->ip_ttl;
1466    ip->ip_len = htons(ip->ip_len);
1467    ip->ip_off = htons(ip->ip_off);
1468    ip->ip_sum = 0;
1469    mb_copy->m_data += sizeof(multicast_encap_iphdr);
1470    ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
1471    mb_copy->m_data -= sizeof(multicast_encap_iphdr);
1472
1473    if (vifp->v_rate_limit == 0)
1474	tbf_send_packet(vifp, mb_copy);
1475    else
1476	tbf_control(vifp, mb_copy, ip, ip_copy->ip_len);
1477}
1478
1479/*
1480 * Token bucket filter module
1481 */
1482
1483static void
1484tbf_control(struct vif *vifp, struct mbuf *m, struct ip *ip, u_long p_len)
1485{
1486    struct tbf *t = vifp->v_tbf;
1487
1488    if (p_len > MAX_BKT_SIZE) {		/* drop if packet is too large */
1489	mrtstat.mrts_pkt2large++;
1490	m_freem(m);
1491	return;
1492    }
1493
1494    tbf_update_tokens(vifp);
1495
1496    if (t->tbf_q_len == 0) {		/* queue empty...		*/
1497	if (p_len <= t->tbf_n_tok) {	/* send packet if enough tokens */
1498	    t->tbf_n_tok -= p_len;
1499	    tbf_send_packet(vifp, m);
1500	} else {			/* no, queue packet and try later */
1501	    tbf_queue(vifp, m);
1502	    timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
1503	}
1504    } else if (t->tbf_q_len < t->tbf_max_q_len) {
1505	/* finite queue length, so queue pkts and process queue */
1506	tbf_queue(vifp, m);
1507	tbf_process_q(vifp);
1508    } else {
1509	/* queue full, try to dq and queue and process */
1510	if (!tbf_dq_sel(vifp, ip)) {
1511	    mrtstat.mrts_q_overflow++;
1512	    m_freem(m);
1513	} else {
1514	    tbf_queue(vifp, m);
1515	    tbf_process_q(vifp);
1516	}
1517    }
1518}
1519
1520/*
1521 * adds a packet to the queue at the interface
1522 */
1523static void
1524tbf_queue(struct vif *vifp, struct mbuf *m)
1525{
1526    int s = splnet();
1527    struct tbf *t = vifp->v_tbf;
1528
1529    if (t->tbf_t == NULL)	/* Queue was empty */
1530	t->tbf_q = m;
1531    else			/* Insert at tail */
1532	t->tbf_t->m_act = m;
1533
1534    t->tbf_t = m;		/* Set new tail pointer */
1535
1536#ifdef DIAGNOSTIC
1537    /* Make sure we didn't get fed a bogus mbuf */
1538    if (m->m_act)
1539	panic("tbf_queue: m_act");
1540#endif
1541    m->m_act = NULL;
1542
1543    t->tbf_q_len++;
1544
1545    splx(s);
1546}
1547
1548/*
1549 * processes the queue at the interface
1550 */
1551static void
1552tbf_process_q(struct vif *vifp)
1553{
1554    int s = splnet();
1555    struct tbf *t = vifp->v_tbf;
1556
1557    /* loop through the queue at the interface and send as many packets
1558     * as possible
1559     */
1560    while (t->tbf_q_len > 0) {
1561	struct mbuf *m = t->tbf_q;
1562	int len = mtod(m, struct ip *)->ip_len;
1563
1564	/* determine if the packet can be sent */
1565	if (len > t->tbf_n_tok)	/* not enough tokens, we are done */
1566	    break;
1567	/* ok, reduce no of tokens, dequeue and send the packet. */
1568	t->tbf_n_tok -= len;
1569
1570	t->tbf_q = m->m_act;
1571	if (--t->tbf_q_len == 0)
1572	    t->tbf_t = NULL;
1573
1574	m->m_act = NULL;
1575	tbf_send_packet(vifp, m);
1576    }
1577    splx(s);
1578}
1579
1580static void
1581tbf_reprocess_q(void *xvifp)
1582{
1583    struct vif *vifp = xvifp;
1584
1585    if (ip_mrouter == NULL)
1586	return;
1587    tbf_update_tokens(vifp);
1588    tbf_process_q(vifp);
1589    if (vifp->v_tbf->tbf_q_len)
1590	timeout(tbf_reprocess_q, (caddr_t)vifp, TBF_REPROCESS);
1591}
1592
1593/* function that will selectively discard a member of the queue
1594 * based on the precedence value and the priority
1595 */
1596static int
1597tbf_dq_sel(struct vif *vifp, struct ip *ip)
1598{
1599    int s = splnet();
1600    u_int p;
1601    struct mbuf *m, *last;
1602    struct mbuf **np;
1603    struct tbf *t = vifp->v_tbf;
1604
1605    p = priority(vifp, ip);
1606
1607    np = &t->tbf_q;
1608    last = NULL;
1609    while ((m = *np) != NULL) {
1610	if (p > priority(vifp, mtod(m, struct ip *))) {
1611	    *np = m->m_act;
1612	    /* If we're removing the last packet, fix the tail pointer */
1613	    if (m == t->tbf_t)
1614		t->tbf_t = last;
1615	    m_freem(m);
1616	    /* It's impossible for the queue to be empty, but check anyways. */
1617	    if (--t->tbf_q_len == 0)
1618		t->tbf_t = NULL;
1619	    splx(s);
1620	    mrtstat.mrts_drop_sel++;
1621	    return 1;
1622	}
1623	np = &m->m_act;
1624	last = m;
1625    }
1626    splx(s);
1627    return 0;
1628}
1629
1630static void
1631tbf_send_packet(struct vif *vifp, struct mbuf *m)
1632{
1633    int s = splnet();
1634
1635    if (vifp->v_flags & VIFF_TUNNEL)	/* If tunnel options */
1636	ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, NULL, NULL);
1637    else {
1638	struct ip_moptions imo;
1639	int error;
1640	static struct route ro; /* XXX check this */
1641
1642	imo.imo_multicast_ifp  = vifp->v_ifp;
1643	imo.imo_multicast_ttl  = mtod(m, struct ip *)->ip_ttl - 1;
1644	imo.imo_multicast_loop = 1;
1645	imo.imo_multicast_vif  = -1;
1646
1647	/*
1648	 * Re-entrancy should not be a problem here, because
1649	 * the packets that we send out and are looped back at us
1650	 * should get rejected because they appear to come from
1651	 * the loopback interface, thus preventing looping.
1652	 */
1653	error = ip_output(m, NULL, &ro, IP_FORWARDING, &imo, NULL);
1654
1655	if (mrtdebug & DEBUG_XMIT)
1656	    log(LOG_DEBUG, "phyint_send on vif %d err %d\n",
1657		(int)(vifp - viftable), error);
1658    }
1659    splx(s);
1660}
1661
1662/* determine the current time and then
1663 * the elapsed time (between the last time and time now)
1664 * in milliseconds & update the no. of tokens in the bucket
1665 */
1666static void
1667tbf_update_tokens(struct vif *vifp)
1668{
1669    struct timeval tp;
1670    u_long tm;
1671    int s = splnet();
1672    struct tbf *t = vifp->v_tbf;
1673
1674    GET_TIME(tp);
1675
1676    TV_DELTA(tp, t->tbf_last_pkt_t, tm);
1677
1678    /*
1679     * This formula is actually
1680     * "time in seconds" * "bytes/second".
1681     *
1682     * (tm / 1000000) * (v_rate_limit * 1000 * (1000/1024) / 8)
1683     *
1684     * The (1000/1024) was introduced in add_vif to optimize
1685     * this divide into a shift.
1686     */
1687    t->tbf_n_tok += tm * vifp->v_rate_limit / 1024 / 8;
1688    t->tbf_last_pkt_t = tp;
1689
1690    if (t->tbf_n_tok > MAX_BKT_SIZE)
1691	t->tbf_n_tok = MAX_BKT_SIZE;
1692
1693    splx(s);
1694}
1695
1696static int
1697priority(struct vif *vifp, struct ip *ip)
1698{
1699    int prio = 50; /* the lowest priority -- default case */
1700
1701    /* temporary hack; may add general packet classifier some day */
1702
1703    /*
1704     * The UDP port space is divided up into four priority ranges:
1705     * [0, 16384)     : unclassified - lowest priority
1706     * [16384, 32768) : audio - highest priority
1707     * [32768, 49152) : whiteboard - medium priority
1708     * [49152, 65536) : video - low priority
1709     *
1710     * Everything else gets lowest priority.
1711     */
1712    if (ip->ip_p == IPPROTO_UDP) {
1713	struct udphdr *udp = (struct udphdr *)(((char *)ip) + (ip->ip_hl << 2));
1714	switch (ntohs(udp->uh_dport) & 0xc000) {
1715	case 0x4000:
1716	    prio = 70;
1717	    break;
1718	case 0x8000:
1719	    prio = 60;
1720	    break;
1721	case 0xc000:
1722	    prio = 55;
1723	    break;
1724	}
1725    }
1726    return prio;
1727}
1728
1729/*
1730 * End of token bucket filter modifications
1731 */
1732
1733static int
1734X_ip_rsvp_vif(struct socket *so, struct sockopt *sopt)
1735{
1736    int error, vifi, s;
1737
1738    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1739	return EOPNOTSUPP;
1740
1741    error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
1742    if (error)
1743	return error;
1744
1745    s = splnet();
1746
1747    if (vifi < 0 || vifi >= numvifs) {	/* Error if vif is invalid */
1748	splx(s);
1749	return EADDRNOTAVAIL;
1750    }
1751
1752    if (sopt->sopt_name == IP_RSVP_VIF_ON) {
1753	/* Check if socket is available. */
1754	if (viftable[vifi].v_rsvpd != NULL) {
1755	    splx(s);
1756	    return EADDRINUSE;
1757	}
1758
1759	viftable[vifi].v_rsvpd = so;
1760	/* This may seem silly, but we need to be sure we don't over-increment
1761	 * the RSVP counter, in case something slips up.
1762	 */
1763	if (!viftable[vifi].v_rsvp_on) {
1764	    viftable[vifi].v_rsvp_on = 1;
1765	    rsvp_on++;
1766	}
1767    } else { /* must be VIF_OFF */
1768	/*
1769	 * XXX as an additional consistency check, one could make sure
1770	 * that viftable[vifi].v_rsvpd == so, otherwise passing so as
1771	 * first parameter is pretty useless.
1772	 */
1773	viftable[vifi].v_rsvpd = NULL;
1774	/*
1775	 * This may seem silly, but we need to be sure we don't over-decrement
1776	 * the RSVP counter, in case something slips up.
1777	 */
1778	if (viftable[vifi].v_rsvp_on) {
1779	    viftable[vifi].v_rsvp_on = 0;
1780	    rsvp_on--;
1781	}
1782    }
1783    splx(s);
1784    return 0;
1785}
1786
1787static void
1788X_ip_rsvp_force_done(struct socket *so)
1789{
1790    int vifi;
1791    int s;
1792
1793    /* Don't bother if it is not the right type of socket. */
1794    if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP)
1795	return;
1796
1797    s = splnet();
1798
1799    /* The socket may be attached to more than one vif...this
1800     * is perfectly legal.
1801     */
1802    for (vifi = 0; vifi < numvifs; vifi++) {
1803	if (viftable[vifi].v_rsvpd == so) {
1804	    viftable[vifi].v_rsvpd = NULL;
1805	    /* This may seem silly, but we need to be sure we don't
1806	     * over-decrement the RSVP counter, in case something slips up.
1807	     */
1808	    if (viftable[vifi].v_rsvp_on) {
1809		viftable[vifi].v_rsvp_on = 0;
1810		rsvp_on--;
1811	    }
1812	}
1813    }
1814
1815    splx(s);
1816}
1817
1818static void
1819X_rsvp_input(struct mbuf *m, int off)
1820{
1821    int vifi;
1822    struct ip *ip = mtod(m, struct ip *);
1823    struct sockaddr_in rsvp_src = { sizeof rsvp_src, AF_INET };
1824    int s;
1825    struct ifnet *ifp;
1826
1827    if (rsvpdebug)
1828	printf("rsvp_input: rsvp_on %d\n",rsvp_on);
1829
1830    /* Can still get packets with rsvp_on = 0 if there is a local member
1831     * of the group to which the RSVP packet is addressed.  But in this
1832     * case we want to throw the packet away.
1833     */
1834    if (!rsvp_on) {
1835	m_freem(m);
1836	return;
1837    }
1838
1839    s = splnet();
1840
1841    if (rsvpdebug)
1842	printf("rsvp_input: check vifs\n");
1843
1844#ifdef DIAGNOSTIC
1845    M_ASSERTPKTHDR(m);
1846#endif
1847
1848    ifp = m->m_pkthdr.rcvif;
1849    /* Find which vif the packet arrived on. */
1850    for (vifi = 0; vifi < numvifs; vifi++)
1851	if (viftable[vifi].v_ifp == ifp)
1852	    break;
1853
1854    if (vifi == numvifs || viftable[vifi].v_rsvpd == NULL) {
1855	/*
1856	 * If the old-style non-vif-associated socket is set,
1857	 * then use it.  Otherwise, drop packet since there
1858	 * is no specific socket for this vif.
1859	 */
1860	if (ip_rsvpd != NULL) {
1861	    if (rsvpdebug)
1862		printf("rsvp_input: Sending packet up old-style socket\n");
1863	    rip_input(m, off);  /* xxx */
1864	} else {
1865	    if (rsvpdebug && vifi == numvifs)
1866		printf("rsvp_input: Can't find vif for packet.\n");
1867	    else if (rsvpdebug && viftable[vifi].v_rsvpd == NULL)
1868		printf("rsvp_input: No socket defined for vif %d\n",vifi);
1869	    m_freem(m);
1870	}
1871	splx(s);
1872	return;
1873    }
1874    rsvp_src.sin_addr = ip->ip_src;
1875
1876    if (rsvpdebug && m)
1877	printf("rsvp_input: m->m_len = %d, sbspace() = %ld\n",
1878	       m->m_len,sbspace(&(viftable[vifi].v_rsvpd->so_rcv)));
1879
1880    if (socket_send(viftable[vifi].v_rsvpd, m, &rsvp_src) < 0) {
1881	if (rsvpdebug)
1882	    printf("rsvp_input: Failed to append to socket\n");
1883    } else {
1884	if (rsvpdebug)
1885	    printf("rsvp_input: send packet up\n");
1886    }
1887
1888    splx(s);
1889}
1890
1891static int
1892ip_mroute_modevent(module_t mod, int type, void *unused)
1893{
1894    int s;
1895
1896    switch (type) {
1897    case MOD_LOAD:
1898	s = splnet();
1899	/* XXX Protect against multiple loading */
1900	ip_mcast_src = X_ip_mcast_src;
1901	ip_mforward = X_ip_mforward;
1902	ip_mrouter_done = X_ip_mrouter_done;
1903	ip_mrouter_get = X_ip_mrouter_get;
1904	ip_mrouter_set = X_ip_mrouter_set;
1905	ip_rsvp_force_done = X_ip_rsvp_force_done;
1906	ip_rsvp_vif = X_ip_rsvp_vif;
1907	legal_vif_num = X_legal_vif_num;
1908	mrt_ioctl = X_mrt_ioctl;
1909	rsvp_input_p = X_rsvp_input;
1910	splx(s);
1911	break;
1912
1913    case MOD_UNLOAD:
1914	if (ip_mrouter)
1915	    return EINVAL;
1916
1917	s = splnet();
1918	ip_mcast_src = NULL;
1919	ip_mforward = NULL;
1920	ip_mrouter_done = NULL;
1921	ip_mrouter_get = NULL;
1922	ip_mrouter_set = NULL;
1923	ip_rsvp_force_done = NULL;
1924	ip_rsvp_vif = NULL;
1925	legal_vif_num = NULL;
1926	mrt_ioctl = NULL;
1927	rsvp_input_p = NULL;
1928	splx(s);
1929	break;
1930    }
1931    return 0;
1932}
1933
/*
 * Module description handed to DECLARE_MODULE below.  The initializers
 * are, in order: the string "ip_mroute", the event handler
 * ip_mroute_modevent() defined above, and 0 (presumably the value the
 * handler receives as its ignored third argument -- confirm against
 * moduledata_t).
 */
static moduledata_t ip_mroutemod = {
    "ip_mroute",
    ip_mroute_modevent,
    0
};
DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY);
1940