/*-
 * Copyright (c) 2007-2009 Bruce Simpson.
 * Copyright (c) 1988 Stephen Deering.
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Stephen Deering of Stanford University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
 */

/*
 * Internet Group Management Protocol (IGMP) routines.
 * [RFC1112, RFC2236, RFC3376]
 *
 * Written by Steve Deering, Stanford, May 1988.
 * Modified by Rosen Sharma, Stanford, Aug 1994.
 * Modified by Bill Fenner, Xerox PARC, Feb 1995.
 * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
 * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
 *
 * MULTICAST Revision: 3.5.1.4
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 190692 2009-04-04 15:32:23Z bms $");

#include "opt_mac.h"
#include "opt_route.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/vimage.h>
#include <sys/ktr.h>
#include <sys/condvar.h>

#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
#include <netinet/vinet.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

#ifndef KTR_IGMPV3
#define KTR_IGMPV3 KTR_SUBSYS
#endif

static struct igmp_ifinfo *
		igi_alloc_locked(struct ifnet *);
static void	igi_delete_locked(const struct ifnet *);
static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
static void	igmp_fasttimo_vnet(void);
static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
static int	igmp_handle_state_change(struct in_multi *,
		    struct igmp_ifinfo *);
static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
static int	igmp_input_v1_query(struct ifnet *, const struct ip *);
static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
		    const struct igmp *);
static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
		    /*const*/ struct igmpv3 *);
static int	igmp_input_v3_group_query(struct in_multi *,
		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
		    /*const*/ struct igmp *);
static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
		    /*const*/ struct igmp *);
static void	igmp_intr(struct mbuf *);
static int	igmp_isgroupreported(const struct in_addr);
static struct mbuf *
		igmp_ra_alloc(void);
#ifdef KTR
static char *	igmp_rec_type_to_str(const int);
#endif
static void	igmp_set_version(struct igmp_ifinfo *, const int);
static void	igmp_slowtimo_vnet(void);
static void	igmp_sysinit(void);
static int	igmp_v1v2_queue_report(struct in_multi *, const int);
static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
static void	igmp_v2_update_group(struct in_multi *, const int);
static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
static struct mbuf *
		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
static int	igmp_v3_enqueue_group_record(struct ifqueue *,
		    struct in_multi *, const int, const int, const int);
static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
		    struct in_multi *);
static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
		    struct ifqueue *, struct ifqueue *, struct in_multi *,
		    const int);
static int	igmp_v3_merge_state_changes(struct in_multi *,
		    struct ifqueue *);
static void	igmp_v3_suppress_group_record(struct in_multi *);
static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);

#ifdef VIMAGE
static vnet_attach_fn	vnet_igmp_iattach;
static vnet_detach_fn	vnet_igmp_idetach;
#else
static int	vnet_igmp_iattach(const void *);
static int	vnet_igmp_idetach(const void *);
#endif /* VIMAGE */

/*
 * System-wide globals.
 *
 * Unlocked access to these is OK, except for the global IGMP output
 * queue. The IGMP subsystem lock ends up being system-wide for the moment,
 * because all VIMAGEs have to share a global output queue, as netisrs
 * themselves are not virtualized.
 *
 * Locking:
 *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
 *    Any may be taken independently; if any are held at the same
 *    time, the above lock order must be followed.
 *  * All output is delegated to the netisr to handle IFF_NEEDSGIANT.
 *    Most of the time, direct dispatch will be fine.
 *  * IN_MULTI_LOCK covers in_multi.
 *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
 *    including the output queue.
 *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
 *    per-link state iterators.
 *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
 *    therefore it is not refcounted.
 *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
 *
 * Reference counting
 *  * IGMP acquires its own reference every time an in_multi is passed to
 *    it and the group is being joined for the first time.
 *  * IGMP releases its reference(s) on in_multi in a deferred way,
 *    because the operations which process the release run as part of
 *    a loop whose control variables are directly affected by the release
 *    (that, and not recursing on the IF_ADDR_LOCK).
 *
 * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
 * to a vnet in ifp->if_vnet.
 *
 * SMPng: XXX We may potentially race operations on ifma_protospec.
 * The problem is that we currently lack a clean way of taking the
 * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
 * as anything which modifies ifma needs to be covered by that lock.
 * So check for ifma_protospec being NULL before proceeding.
 */
struct mtx		 igmp_mtx;
int			 mpsafe_igmp = 0;
SYSCTL_INT(_debug, OID_AUTO, mpsafe_igmp, CTLFLAG_RDTUN, &mpsafe_igmp, 0,
    "Enable SMP-safe IGMPv3");

struct mbuf		*m_raopt;		 /* Router Alert option */
MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");

/*
 * Global netisr output queue.
 * This is only used as a last resort if we cannot directly dispatch.
 * As IN_MULTI_LOCK is no longer in the bottom half of IP, we can do
 * this, providing mpsafe_igmp is set. If it is not, we take Giant,
 * and queueing is forced.
 */
struct ifqueue		 igmpoq;

/*
 * VIMAGE-wide globals.
 *
 * The IGMPv3 timers themselves need to run per-image, however,
 * protosw timers run globally (see tcp).
 * An ifnet can only be in one vimage at a time, and the loopback
 * ifnet, loif, is itself virtualized.
 * It would otherwise be possible to seriously hose IGMP state,
 * and create inconsistencies in upstream multicast routing, if you have
 * multiple VIMAGEs running on the same link joining different multicast
 * groups, UNLESS the "primary IP address" is different. This is because
 * IGMP for IPv4 does not force link-local addresses to be used for each
 * node, unlike MLD for IPv6.
 * Obviously the IGMPv3 per-interface state has per-vimage granularity
 * also as a result.
 *
 * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
 * policy to control the address used by IGMP on the link.
 */
#ifdef VIMAGE_GLOBALS
int	 interface_timers_running;	 /* IGMPv3 general query response */
int	 state_change_timers_running;	 /* IGMPv3 state-change retransmit */
int	 current_state_timers_running;	 /* IGMPv1/v2 host report;
					  * IGMPv3 g/sg query response */

LIST_HEAD(, igmp_ifinfo)	 igi_head;
struct igmpstat			 igmpstat;
struct timeval			 igmp_gsrdelay;

int	 igmp_recvifkludge;
int	 igmp_sendra;
int	 igmp_sendlocal;
int	 igmp_v1enable;
int	 igmp_v2enable;
int	 igmp_legacysupp;
int	 igmp_default_version;
#endif /* VIMAGE_GLOBALS */

/*
 * Virtualized sysctls.
 */
SYSCTL_V_STRUCT(V_NET, vnet_inet, _net_inet_igmp, IGMPCTL_STATS, stats,
    CTLFLAG_RW, igmpstat, igmpstat, "");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, recvifkludge,
    CTLFLAG_RW, igmp_recvifkludge, 0,
    "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, sendra,
    CTLFLAG_RW, igmp_sendra, 0,
    "Send IP Router Alert option in IGMPv2/v3 messages");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, sendlocal,
    CTLFLAG_RW, igmp_sendlocal, 0,
    "Send IGMP membership reports for 224.0.0.0/24 groups");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, v1enable,
    CTLFLAG_RW, igmp_v1enable, 0,
    "Enable backwards compatibility with IGMPv1");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, v2enable,
    CTLFLAG_RW, igmp_v2enable, 0,
    "Enable backwards compatibility with IGMPv2");
SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, legacysupp,
    CTLFLAG_RW, igmp_legacysupp, 0,
    "Allow v1/v2 reports to suppress v3 group responses");
SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, default_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, igmp_default_version, 0,
    sysctl_igmp_default_version, "I",
    "Default version of IGMP to run on each interface");
SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_igmp, OID_AUTO, gsrdelay,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, igmp_gsrdelay.tv_sec, 0,
    sysctl_igmp_gsr, "I",
    "Rate limit for IGMPv3 Group-and-Source queries in seconds");

/*
 * Non-virtualized sysctls.
 */
SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE,
    sysctl_igmp_ifinfo, "Per-interface IGMPv3 state");

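/*
 * Stash the transmitting ifp's vnet pointer and ifindex in the mbuf
 * packet header, so that igmp_intr() can recover them later if the
 * chain is queued to the netisr rather than dispatched directly.
 */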
static __inline void
igmp_save_context(struct mbuf *m, struct ifnet *ifp)
{

#ifdef VIMAGE
	m->m_pkthdr.header = ifp->if_vnet;
#endif /* VIMAGE */
	m->m_pkthdr.flowid = ifp->if_index;
}

static __inline void
igmp_scrub_context(struct mbuf *m)
{

	m->m_pkthdr.header = NULL;
	m->m_pkthdr.flowid = 0;
}

#ifdef KTR
static __inline char *
inet_ntoa_haddr(in_addr_t haddr)
{
	struct in_addr ia;

	ia.s_addr = htonl(haddr);
	return (inet_ntoa(ia));
}
#endif

/*
 * Restore context from a queued IGMP output chain.
 * Return saved ifindex.
 *
 * VIMAGE: The assertion is there to make sure that we
 * actually called CURVNET_SET() with what's in the mbuf chain.
 */
static __inline uint32_t
igmp_restore_context(struct mbuf *m)
{

#ifdef notyet
#if defined(VIMAGE) && defined(INVARIANTS)
	KASSERT(curvnet == (m->m_pkthdr.header),
	    ("%s: called when curvnet was not restored", __func__));
#endif
#endif
	return (m->m_pkthdr.flowid);
}

/*
 * Retrieve or set default IGMP version.
 *
 * VIMAGE: Assume curvnet set by caller.
 * SMPng: NOTE: Serialized by IGMP lock.
 */
static int
sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
{
	int	 error;
	int	 new;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error)
		return (error);

	IGMP_LOCK();

	new = V_igmp_default_version;

	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error || !req->newptr)
		goto out_locked;

	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
		error = EINVAL;
		goto out_locked;
	}

	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
	     V_igmp_default_version, new);

	V_igmp_default_version = new;

out_locked:
	IGMP_UNLOCK();
	return (error);
}

/*
 * Retrieve or set threshold between group-source queries in seconds.
 *
 * VIMAGE: Assume curvnet set by caller.
 * SMPng: NOTE: Serialized by IGMP lock.
 */
static int
sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
{
	int error;
	int i;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error)
		return (error);

	IGMP_LOCK();

	i = V_igmp_gsrdelay.tv_sec;

	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error || !req->newptr)
		goto out_locked;

	if (i < -1 || i >= 60) {
		error = EINVAL;
		goto out_locked;
	}

	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
	     V_igmp_gsrdelay.tv_sec, i);
	V_igmp_gsrdelay.tv_sec = i;

out_locked:
	IGMP_UNLOCK();
	return (error);
}

/*
 * Expose struct igmp_ifinfo to userland, keyed by ifindex.
 * For use by ifmcstat(8).
 *
 * SMPng: NOTE: Does an unlocked ifindex space read.
 * VIMAGE: Assume curvnet set by caller. The node handler itself
 * is not directly virtualized.
 */
static int
sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
{
	INIT_VNET_NET(curvnet);
	int			*name;
	int			 error;
	u_int			 namelen;
	struct ifnet		*ifp;
	struct igmp_ifinfo	*igi;

	name = (int *)arg1;
	namelen = arg2;

	if (req->newptr != NULL)
		return (EPERM);

	if (namelen != 1)
		return (EINVAL);

	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
	if (error)
		return (error);

	IN_MULTI_LOCK();
	IGMP_LOCK();

	if (name[0] <= 0 || name[0] > V_if_index) {
		error = ENOENT;
		goto out_locked;
	}

	error = ENOENT;

	ifp = ifnet_byindex(name[0]);
	if (ifp == NULL)
		goto out_locked;

	LIST_FOREACH(igi, &V_igi_head, igi_link) {
		if (ifp == igi->igi_ifp) {
			error = SYSCTL_OUT(req, igi,
			    sizeof(struct igmp_ifinfo));
			break;
		}
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();
	return (error);
}

/*
 * Dispatch an entire queue of pending packet chains
 * using the netisr.
 * VIMAGE: Assumes the vnet pointer has been set.
 */
static void
igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
{
	struct mbuf *m;

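	/*
	 * Note: a limit of 0 is effectively unbounded, as the counter
	 * is only tested for equality with zero after being decremented.
	 */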
	for (;;) {
		_IF_DEQUEUE(ifq, m);
		if (m == NULL)
			break;
		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, m, ifq);
		if (loop)
			m->m_flags |= M_IGMP_LOOP;
		netisr_dispatch(NETISR_IGMP, m);
		if (--limit == 0)
			break;
	}
}

/*
 * Filter outgoing IGMP report state by group.
 *
 * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
 * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
 * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
 * this may break certain IGMP snooping switches which rely on the old
 * report behaviour.
 *
 * Return zero if the given group is one for which IGMP reports
 * should be suppressed, or non-zero if reports should be issued.
 */
static __inline int
igmp_isgroupreported(const struct in_addr addr)
{

	if (in_allhosts(addr) ||
	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
		return (0);

	return (1);
}

/*
 * Construct a Router Alert option to use in outgoing packets.
 */
static struct mbuf *
igmp_ra_alloc(void)
{
	struct mbuf	*m;
	struct ipoption	*p;

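	/*
	 * Per RFC 2113, the Router Alert option is 4 octets long and an
	 * option value of zero means "router shall examine this packet".
	 */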
	MGET(m, M_DONTWAIT, MT_DATA);
	p = mtod(m, struct ipoption *);
	p->ipopt_dst.s_addr = INADDR_ANY;
	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
	p->ipopt_list[3] = 0x00;	/* pad byte */
	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];

	return (m);
}

/*
 * Attach IGMP when PF_INET is attached to an interface.
 *
 * VIMAGE: Currently we set the vnet pointer, although it is
 * likely that it was already set by our caller.
 */
struct igmp_ifinfo *
igmp_domifattach(struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	CURVNET_SET(ifp->if_vnet);
	IGMP_LOCK();

	igi = igi_alloc_locked(ifp);
	if (!(ifp->if_flags & IFF_MULTICAST))
		igi->igi_flags |= IGIF_SILENT;

	IGMP_UNLOCK();
	CURVNET_RESTORE();

	return (igi);
}

/*
 * VIMAGE: assume curvnet set by caller.
 */
static struct igmp_ifinfo *
igi_alloc_locked(/*const*/ struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	IGMP_LOCK_ASSERT();

	igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
	if (igi == NULL)
		goto out;

	igi->igi_ifp = ifp;
	igi->igi_version = V_igmp_default_version;
	igi->igi_flags = 0;
	igi->igi_rv = IGMP_RV_INIT;
	igi->igi_qi = IGMP_QI_INIT;
	igi->igi_qri = IGMP_QRI_INIT;
	igi->igi_uri = IGMP_URI_INIT;

	SLIST_INIT(&igi->igi_relinmhead);

	/*
	 * Responses to general queries are subject to bounds.
	 */
	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);

	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);

	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
	     ifp, ifp->if_xname);

out:
	return (igi);
}

/*
 * Hook for ifdetach.
 *
 * NOTE: Some finalization tasks need to run before the protocol domain
 * is detached, but also before the link layer does its cleanup.
 *
 * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
 * XXX This is also bitten by unlocked ifma_protospec access.
 *
 * VIMAGE: curvnet should have been set by caller, but let's not assume
 * that for now.
 */
void
igmp_ifdetach(struct ifnet *ifp)
{
	struct igmp_ifinfo	*igi;
	struct ifmultiaddr	*ifma;
	struct in_multi		*inm, *tinm;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
	    ifp->if_xname);

	CURVNET_SET(ifp->if_vnet);

	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	if (igi->igi_version == IGMP_VERSION_3) {
		IF_ADDR_LOCK(ifp);
		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_INET ||
			    ifma->ifma_protospec == NULL)
				continue;
#if 0
			KASSERT(ifma->ifma_protospec != NULL,
			    ("%s: ifma_protospec is NULL", __func__));
#endif
			inm = (struct in_multi *)ifma->ifma_protospec;
			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
				    inm, inm_nrele);
			}
			inm_clear_recorded(inm);
		}
		IF_ADDR_UNLOCK(ifp);
		/*
		 * Free the in_multi reference(s) for this IGMP lifecycle.
		 */
		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
		    tinm) {
			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
			inm_release_locked(inm);
		}
	}

	IGMP_UNLOCK();

#ifdef VIMAGE
	/*
	 * Plug the potential race which may occur when a VIMAGE
	 * is detached and we are forced to queue pending IGMP output for
	 * output netisr processing due to !mpsafe_igmp. In this case it
	 * is possible that igmp_intr() is about to see mbuf chains with
	 * invalid cached curvnet pointers.
	 * This is a rare condition, so just blow them all away.
	 * FUTURE: This may in fact not be needed, because IFF_NEEDSGIANT
	 * is being removed in 8.x and the netisr may then be eliminated;
	 * it is needed only if VIMAGE and IFF_NEEDSGIANT need to co-exist
	 */
	if (!mpsafe_igmp) {
		int drops;

		IF_LOCK(&igmpoq);
		drops = igmpoq.ifq_len;
		_IF_DRAIN(&igmpoq);
		IF_UNLOCK(&igmpoq);
		if (bootverbose && drops) {
			printf("%s: dropped %d pending IGMP output packets\n",
			    __func__, drops);
		}
	}
#endif /* VIMAGE */

	CURVNET_RESTORE();
}

/*
 * Hook for domifdetach.
 *
 * VIMAGE: curvnet should have been set by caller, but let's not assume
 * that for now.
 */
void
igmp_domifdetach(struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	CURVNET_SET(ifp->if_vnet);
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	igi_delete_locked(ifp);

	IGMP_UNLOCK();
	CURVNET_RESTORE();
}

static void
igi_delete_locked(const struct ifnet *ifp)
{
	struct igmp_ifinfo *igi, *tigi;

	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	IGMP_LOCK_ASSERT();

	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
		if (igi->igi_ifp == ifp) {
			/*
			 * Free deferred General Query responses.
			 */
			_IF_DRAIN(&igi->igi_gq);

			LIST_REMOVE(igi, igi_link);

			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
			    ("%s: there are dangling in_multi references",
			    __func__));

			free(igi, M_IGMP);
			return;
		}
	}

#ifdef INVARIANTS
	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__, ifp);
#endif
}

/*
 * Process a received IGMPv1 query.
 * Return non-zero if the message should be dropped.
 *
 * VIMAGE: The curvnet pointer is derived from the input ifp.
 */
static int
igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip)
{
	INIT_VNET_INET(ifp->if_vnet);
	struct ifmultiaddr	*ifma;
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;

	/*
	 * IGMPv1 General Queries SHOULD always be addressed to 224.0.0.1.
	 * igmp_group is always ignored. Do not drop it as a userland
	 * daemon may wish to see it.
	 */
	if (!in_allhosts(ip->ip_dst)) {
		++V_igmpstat.igps_rcv_badqueries;
		return (0);
	}

	++V_igmpstat.igps_rcv_gen_queries;

	/*
	 * Switch to IGMPv1 host compatibility mode.
	 */
	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	igmp_set_version(igi, IGMP_VERSION_1);

	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);

	/*
	 * Start the timers in all of our group records
	 * for the interface on which the query arrived,
	 * except those which are already running.
	 */
	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_INET ||
		    ifma->ifma_protospec == NULL)
			continue;
		inm = (struct in_multi *)ifma->ifma_protospec;
		if (inm->inm_timer != 0)
			continue;
		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_REPORTING_MEMBER:
		case IGMP_IDLE_MEMBER:
		case IGMP_LAZY_MEMBER:
		case IGMP_SLEEPING_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			inm->inm_state = IGMP_REPORTING_MEMBER;
			inm->inm_timer = IGMP_RANDOM_DELAY(
			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
			V_current_state_timers_running = 1;
			break;
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}
	IF_ADDR_UNLOCK(ifp);

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv2 general or group-specific query.
 */
static int
igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
    const struct igmp *igmp)
{
	struct ifmultiaddr	*ifma;
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;
	uint16_t		 timer;

	/*
	 * Perform lazy allocation of IGMP link info if required,
	 * and switch to IGMPv2 host compatibility mode.
	 */
	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	igmp_set_version(igi, IGMP_VERSION_2);

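	/*
	 * igmp_code holds the Max Resp Time in tenths of a second;
	 * convert it to fast-timeout ticks for the report timer.
	 */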
	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
	if (timer == 0)
		timer = 1;

	if (!in_nullhost(igmp->igmp_group)) {
		/*
		 * IGMPv2 Group-Specific Query.
		 * If this is a group-specific IGMPv2 query, we need only
		 * look up the single group to process it.
		 */
		inm = inm_lookup(ifp, igmp->igmp_group);
		if (inm != NULL) {
			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			igmp_v2_update_group(inm, timer);
		}
		++V_igmpstat.igps_rcv_group_queries;
	} else {
		/*
		 * IGMPv2 General Query.
		 * If this was not sent to the all-hosts group, ignore it.
		 */
		if (in_allhosts(ip->ip_dst)) {
			/*
			 * For each reporting group joined on this
			 * interface, kick the report timer.
			 */
			CTR2(KTR_IGMPV3,
			    "process v2 general query on ifp %p(%s)",
			    ifp, ifp->if_xname);

			IF_ADDR_LOCK(ifp);
			TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
				if (ifma->ifma_addr->sa_family != AF_INET ||
				    ifma->ifma_protospec == NULL)
					continue;
				inm = (struct in_multi *)ifma->ifma_protospec;
				igmp_v2_update_group(inm, timer);
			}
			IF_ADDR_UNLOCK(ifp);
		}
		++V_igmpstat.igps_rcv_gen_queries;
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Update the report timer on a group in response to an IGMPv2 query.
 *
 * If we are becoming the reporting member for this group, start the timer.
 * If we already are the reporting member for this group, and timer is
 * below the threshold, reset it.
 *
 * We may be updating the group for the first time since we switched
 * to IGMPv3. If we are, then we must clear any recorded source lists,
 * and transition to REPORTING state; the group timer is overloaded
 * for group and group-source query responses.
 *
 * Unlike IGMPv3, the delay per group should be jittered
 * to avoid bursts of IGMPv2 reports.
 */
static void
igmp_v2_update_group(struct in_multi *inm, const int timer)
{

	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);

	IN_MULTI_LOCK_ASSERT();

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
		break;
	case IGMP_REPORTING_MEMBER:
		if (inm->inm_timer != 0 &&
		    inm->inm_timer <= timer) {
			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
			    "skipping.", __func__);
			break;
		}
		/* FALLTHROUGH */
	case IGMP_SG_QUERY_PENDING_MEMBER:
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
		inm->inm_state = IGMP_REPORTING_MEMBER;
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		break;
	case IGMP_SLEEPING_MEMBER:
		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
		inm->inm_state = IGMP_AWAKENING_MEMBER;
		break;
	case IGMP_LEAVING_MEMBER:
		break;
	}
}

/*
 * Process a received IGMPv3 general, group-specific or
 * group-and-source-specific query.
 * Assumes m has already been pulled up to the full IGMP message length.
 * Return 0 if successful, otherwise an appropriate error code is returned.
 */
static int
igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
    /*const*/ struct igmpv3 *igmpv3)
{
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;
	uint32_t		 maxresp, nsrc, qqi;
	uint16_t		 timer;
	uint8_t			 qrv;

	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
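	/*
	 * Max Resp Code and QQIC values of 128 or more are a
	 * floating-point encoding (exponent and mantissa), as per
	 * RFC 3376, Section 4.1.1.
	 */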
	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
	if (maxresp >= 128) {
		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
			  (IGMP_EXP(igmpv3->igmp_code) + 3);
	}

	/*
	 * Robustness must never be less than 2 for on-wire IGMPv3.
	 * FIXME: Check if ifp has IGIF_LOOPBACK set, as we make
	 * an exception for interfaces whose IGMPv3 state changes
	 * are redirected to loopback (e.g. MANET).
	 */
	qrv = IGMP_QRV(igmpv3->igmp_misc);
	if (qrv < 2) {
		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
		    qrv, IGMP_RV_INIT);
		qrv = IGMP_RV_INIT;
	}

	qqi = igmpv3->igmp_qqi;
	if (qqi >= 128) {
		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
	}

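	/* Convert the Max Resp Time from tenths of a second to fast ticks. */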
	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
	if (timer == 0)
		timer = 1;

	nsrc = ntohs(igmpv3->igmp_numsrc);

	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	igmp_set_version(igi, IGMP_VERSION_3);

	igi->igi_rv = qrv;
	igi->igi_qi = qqi;
	igi->igi_qri = maxresp;

	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
	    maxresp);

	if (in_nullhost(igmpv3->igmp_group)) {
		/*
		 * IGMPv3 General Query.
		 * Schedule a current-state report on this ifp for
		 * all groups, possibly containing source lists.
		 */
		++V_igmpstat.igps_rcv_gen_queries;

		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
			/*
			 * General Queries SHOULD be directed to 224.0.0.1.
			 * A general query with a source list has undefined
			 * behaviour; discard it.
			 */
			++V_igmpstat.igps_rcv_badqueries;
			goto out_locked;
		}

		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
		    ifp, ifp->if_xname);

		/*
		 * If there is a pending General Query response
		 * scheduled earlier than the selected delay, do
		 * not schedule any other reports.
		 * Otherwise, reset the interface timer.
		 */
		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
			V_interface_timers_running = 1;
		}
	} else {
		/*
		 * IGMPv3 Group-specific or Group-and-source-specific Query.
		 *
		 * Group-source-specific queries are throttled on
		 * a per-group basis to defeat denial-of-service attempts.
		 * Queries for groups we are not a member of on this
		 * link are simply ignored.
		 */
		inm = inm_lookup(ifp, igmpv3->igmp_group);
		if (inm == NULL)
			goto out_locked;
		if (nsrc > 0) {
			++V_igmpstat.igps_rcv_gsr_queries;
			if (!ratecheck(&inm->inm_lastgsrtv,
			    &V_igmp_gsrdelay)) {
				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
				    __func__);
				++V_igmpstat.igps_drop_gsr_queries;
				goto out_locked;
			}
		} else {
			++V_igmpstat.igps_rcv_group_queries;
		}
		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
		/*
		 * If there is a pending General Query response
		 * scheduled sooner than the selected delay, no
		 * further report need be scheduled.
		 * Otherwise, prepare to respond to the
		 * group-specific or group-and-source query.
		 */
		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv3 group-specific or group-and-source-specific
 * query.
 * Return <0 if any error occurred. Currently this is ignored.
 */
static int
igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
    int timer, /*const*/ struct igmpv3 *igmpv3)
{
	int			 retval;
	uint16_t		 nsrc;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

	retval = 0;

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LEAVING_MEMBER:
		return (retval);
		break;
	case IGMP_REPORTING_MEMBER:
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
		break;
	}

	nsrc = ntohs(igmpv3->igmp_numsrc);

	/*
	 * Deal with group-specific queries upfront.
	 * If any group query is already pending, purge any recorded
	 * source-list state if it exists, and schedule a query response
	 * for this group-specific query.
	 */
	if (nsrc == 0) {
		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
			inm_clear_recorded(inm);
			timer = min(inm->inm_timer, timer);
		}
		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		return (retval);
	}

	/*
	 * Deal with the case where a group-and-source-specific query has
	 * been received but a group-specific query is already pending.
	 */
	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
		timer = min(inm->inm_timer, timer);
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		return (retval);
	}

	/*
	 * Finally, deal with the case where a group-and-source-specific
	 * query has been received, where a response to a previous g-s-r
	 * query exists, or none exists.
	 * In this case, we need to parse the source-list which the Querier
	 * has provided us with and check if we have any source list filter
	 * entries at T1 for these sources. If we do not, there is no need
	 * to schedule a report and the query may be dropped.
	 * If we do, we must record them and schedule a current-state
	 * report for those sources.
	 * FIXME: Handling source lists larger than 1 mbuf requires that
	 * we pass the mbuf chain pointer down to this function, and use
	 * m_getptr() to walk the chain.
	 */
	if (inm->inm_nsrc > 0) {
		const struct in_addr	*ap;
		int			 i, nrecorded;

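		/*
		 * The source addresses immediately follow the fixed-size
		 * IGMPv3 query header; walk them as a flat array.
		 */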
		ap = (const struct in_addr *)(igmpv3 + 1);
		nrecorded = 0;
		for (i = 0; i < nsrc; i++, ap++) {
			retval = inm_record_source(inm, ap->s_addr);
			if (retval < 0)
				break;
			nrecorded += retval;
		}
		if (nrecorded > 0) {
			CTR1(KTR_IGMPV3,
			    "%s: schedule response to SG query", __func__);
			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
			V_current_state_timers_running = 1;
		}
	}

	return (retval);
}

/*
 * Process a received IGMPv1 host membership report.
 *
 * NOTE: 0.0.0.0 workaround breaks const correctness.
 */
static int
igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
    /*const*/ struct igmp *igmp)
{
	struct in_ifaddr *ia;
	struct in_multi *inm;

	++V_igmpstat.igps_rcv_reports;

	if (ifp->if_flags & IFF_LOOPBACK)
		return (0);

	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
		++V_igmpstat.igps_rcv_badreports;
		return (EINVAL);
	}

	/*
	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
	 * Booting clients may use the source address 0.0.0.0. Some
	 * IGMP daemons may not know how to use IP_RECVIF to determine
	 * the interface upon which this message was received.
	 * Replace 0.0.0.0 with the subnet address if told to do so.
	 */
	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
		IFP_TO_IA(ifp, ia);
		if (ia != NULL)
			ip->ip_src.s_addr = htonl(ia->ia_subnet);
	}

	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);

	/*
	 * IGMPv1 report suppression.
	 * If we are a member of this group, and our membership should be
	 * reported, stop our group timer and transition to the 'lazy' state.
	 */
	IN_MULTI_LOCK();
	inm = inm_lookup(ifp, igmp->igmp_group);
	if (inm != NULL) {
		struct igmp_ifinfo *igi;

		igi = inm->inm_igi;
		if (igi == NULL) {
			KASSERT(igi != NULL,
			    ("%s: no igi for ifp %p", __func__, ifp));
			goto out_locked;
		}

		++V_igmpstat.igps_rcv_ourreports;

		/*
		 * If we are in IGMPv3 host mode, do not allow the
		 * other host's IGMPv1 report to suppress our reports
		 * unless explicitly configured to do so.
		 */
		if (igi->igi_version == IGMP_VERSION_3) {
			if (V_igmp_legacysupp)
				igmp_v3_suppress_group_record(inm);
			goto out_locked;
		}

		inm->inm_timer = 0;

		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
			break;
		case IGMP_IDLE_MEMBER:
		case IGMP_LAZY_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			/* FALLTHROUGH */
		case IGMP_SLEEPING_MEMBER:
			inm->inm_state = IGMP_SLEEPING_MEMBER;
			break;
		case IGMP_REPORTING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			if (igi->igi_version == IGMP_VERSION_1)
				inm->inm_state = IGMP_LAZY_MEMBER;
			else if (igi->igi_version == IGMP_VERSION_2)
				inm->inm_state = IGMP_SLEEPING_MEMBER;
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}

out_locked:
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv2 host membership report.
 *
 * NOTE: 0.0.0.0 workaround breaks const correctness.
 */
static int
igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
    /*const*/ struct igmp *igmp)
{
	struct in_ifaddr *ia;
	struct in_multi *inm;

	/*
	 * Make sure we don't hear our own membership report.  Fast
	 * leave requires knowing that we are the only member of a
	 * group.
	 */
	IFP_TO_IA(ifp, ia);
	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr))
		return (0);

	++V_igmpstat.igps_rcv_reports;

	if (ifp->if_flags & IFF_LOOPBACK)
		return (0);

	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
		++V_igmpstat.igps_rcv_badreports;
		return (EINVAL);
	}

	/*
	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
	 * Booting clients may use the source address 0.0.0.0. Some
	 * IGMP daemons may not know how to use IP_RECVIF to determine
	 * the interface upon which this message was received.
	 * Replace 0.0.0.0 with the subnet address if told to do so.
	 */
	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
		if (ia != NULL)
			ip->ip_src.s_addr = htonl(ia->ia_subnet);
	}

	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);

	/*
	 * IGMPv2 report suppression.
	 * If we are a member of this group, and our membership should be
	 * reported, and our group timer is pending or about to be reset,
	 * stop our group timer by transitioning to the 'lazy' state.
	 */
	IN_MULTI_LOCK();
	inm = inm_lookup(ifp, igmp->igmp_group);
	if (inm != NULL) {
		struct igmp_ifinfo *igi;

		igi = inm->inm_igi;
		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));

		++V_igmpstat.igps_rcv_ourreports;

		/*
		 * If we are in IGMPv3 host mode, do not allow the
		 * other host's IGMPv2 report to suppress our reports
		 * unless explicitly configured to do so.
		 */
		if (igi->igi_version == IGMP_VERSION_3) {
			if (V_igmp_legacysupp)
				igmp_v3_suppress_group_record(inm);
			goto out_locked;
		}

		inm->inm_timer = 0;

		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
		case IGMP_SLEEPING_MEMBER:
			break;
		case IGMP_REPORTING_MEMBER:
		case IGMP_IDLE_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			/* FALLTHROUGH */
		case IGMP_LAZY_MEMBER:
			inm->inm_state = IGMP_LAZY_MEMBER;
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}

out_locked:
	IN_MULTI_UNLOCK();

	return (0);
}

void
igmp_input(struct mbuf *m, int off)
{
	int iphlen;
	struct ifnet *ifp;
	struct igmp *igmp;
	struct ip *ip;
	int igmplen;
	int minlen;
	int queryver;

	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);

	ifp = m->m_pkthdr.rcvif;
	INIT_VNET_INET(ifp->if_vnet);

	++V_igmpstat.igps_rcv_total;

	ip = mtod(m, struct ip *);
	iphlen = off;
	igmplen = ip->ip_len;

	/*
	 * Validate lengths.
	 */
	if (igmplen < IGMP_MINLEN) {
		++V_igmpstat.igps_rcv_tooshort;
		m_freem(m);
		return;
	}

	/*
	 * Always pullup to the minimum size for v1/v2 or v3
	 * to amortize calls to m_pullup().
	 */
	minlen = iphlen;
	if (igmplen >= IGMP_V3_QUERY_MINLEN)
		minlen += IGMP_V3_QUERY_MINLEN;
	else
		minlen += IGMP_MINLEN;
	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
	    (m = m_pullup(m, minlen)) == 0) {
		++V_igmpstat.igps_rcv_tooshort;
		return;
	}
	ip = mtod(m, struct ip *);

	if (ip->ip_ttl != 1) {
		++V_igmpstat.igps_rcv_badttl;
		m_freem(m);
		return;
	}

	/*
	 * Validate checksum.
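	 * The mbuf data pointer is temporarily advanced to the IGMP
	 * header so that in_cksum() covers only the IGMP message, and
	 * is restored immediately afterwards.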
	 */
	m->m_data += iphlen;
	m->m_len -= iphlen;
	igmp = mtod(m, struct igmp *);
	if (in_cksum(m, igmplen)) {
		++V_igmpstat.igps_rcv_badsum;
		m_freem(m);
		return;
	}
	m->m_data -= iphlen;
	m->m_len += iphlen;

	switch (igmp->igmp_type) {
	case IGMP_HOST_MEMBERSHIP_QUERY:
		if (igmplen == IGMP_MINLEN) {
			if (igmp->igmp_code == 0)
				queryver = IGMP_VERSION_1;
			else
				queryver = IGMP_VERSION_2;
		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
			queryver = IGMP_VERSION_3;
		} else {
			++V_igmpstat.igps_rcv_tooshort;
			m_freem(m);
			return;
		}

		switch (queryver) {
		case IGMP_VERSION_1:
			++V_igmpstat.igps_rcv_v1v2_queries;
			if (!V_igmp_v1enable)
				break;
			if (igmp_input_v1_query(ifp, ip) != 0) {
				m_freem(m);
				return;
			}
			break;

		case IGMP_VERSION_2:
			++V_igmpstat.igps_rcv_v1v2_queries;
			if (!V_igmp_v2enable)
				break;
			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
				m_freem(m);
				return;
			}
			break;

		case IGMP_VERSION_3: {
				struct igmpv3 *igmpv3;
				uint16_t igmpv3len;
				uint16_t srclen;
				int nsrc;

				++V_igmpstat.igps_rcv_v3_queries;
				igmpv3 = (struct igmpv3 *)igmp;
				/*
				 * Validate length based on source count.
				 */
				nsrc = ntohs(igmpv3->igmp_numsrc);
				srclen = sizeof(struct in_addr) * nsrc;
				if (nsrc * sizeof(in_addr_t) > srclen) {
					++V_igmpstat.igps_rcv_tooshort;
					m_freem(m);
					return;
				}
				/*
				 * m_pullup() may modify m, so pullup in
				 * this scope.
				 */
				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
				    srclen;
				if ((m->m_flags & M_EXT ||
				     m->m_len < igmpv3len) &&
				    (m = m_pullup(m, igmpv3len)) == NULL) {
					++V_igmpstat.igps_rcv_tooshort;
					return;
				}
				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
				    + iphlen);
				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
					m_freem(m);
					return;
				}
			}
			break;
		}
		break;

	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
		if (!V_igmp_v1enable)
			break;
		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
			m_freem(m);
			return;
		}
		break;

	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
		if (!V_igmp_v2enable)
			break;
		if (!ip_checkrouteralert(m))
			++V_igmpstat.igps_rcv_nora;
		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
			m_freem(m);
			return;
		}
		break;

	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
		/*
		 * Hosts do not need to process IGMPv3 membership reports,
		 * as report suppression is no longer required.
		 */
		if (!ip_checkrouteralert(m))
			++V_igmpstat.igps_rcv_nora;
		break;

	default:
		break;
	}

	/*
	 * Pass all valid IGMP packets up to any process(es) listening on a
	 * raw IGMP socket.
	 */
	rip_input(m, off);
}


/*
 * Fast timeout handler (global).
 * VIMAGE: Timeout handlers are expected to service all vimages.
 */
void
igmp_fasttimo(void)
{
#ifdef VIMAGE
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		INIT_VNET_INET(vnet_iter);
		igmp_fasttimo_vnet();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK();
#else /* !VIMAGE */

	igmp_fasttimo_vnet();
#endif /* VIMAGE */
}

/*
 * Fast timeout handler (per-vnet).
 * Sends are shuffled off to a netisr to deal with Giant.
 *
 * VIMAGE: Assume caller has set up our curvnet.
 */
static void
igmp_fasttimo_vnet(void)
{
	struct ifqueue		 scq;	/* State-change packets */
	struct ifqueue		 qrq;	/* Query response packets */
	struct ifnet		*ifp;
	struct igmp_ifinfo	*igi;
	struct ifmultiaddr	*ifma, *tifma;
	struct in_multi		*inm;
	int			 loop, uri_fasthz;

	loop = 0;
	uri_fasthz = 0;

	/*
	 * Quick check to see if any work needs to be done, in order to
	 * minimize the overhead of fasttimo processing.
	 * SMPng: XXX Unlocked reads.
	 */
	if (!V_current_state_timers_running &&
	    !V_interface_timers_running &&
	    !V_state_change_timers_running)
		return;

	if (!mpsafe_igmp)
		mtx_lock(&Giant);

	IN_MULTI_LOCK();
	IGMP_LOCK();

	/*
	 * IGMPv3 General Query response timer processing.
	 */
	if (V_interface_timers_running) {
		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);

		V_interface_timers_running = 0;
		LIST_FOREACH(igi, &V_igi_head, igi_link) {
			if (igi->igi_v3_timer == 0) {
				/* Do nothing. */
			} else if (--igi->igi_v3_timer == 0) {
				igmp_v3_dispatch_general_query(igi);
			} else {
				V_interface_timers_running = 1;
			}
		}
	}

	if (!V_current_state_timers_running &&
	    !V_state_change_timers_running)
		goto out_locked;

	V_current_state_timers_running = 0;
	V_state_change_timers_running = 0;

	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);

	/*
	 * IGMPv1/v2/v3 host report and state-change timer processing.
	 * Note: Processing a v3 group timer may remove a node.
	 */
	LIST_FOREACH(igi, &V_igi_head, igi_link) {
		ifp = igi->igi_ifp;

		if (igi->igi_version == IGMP_VERSION_3) {
			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
			    PR_FASTHZ);

			memset(&qrq, 0, sizeof(struct ifqueue));
			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);

			memset(&scq, 0, sizeof(struct ifqueue));
			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
		}

		IF_ADDR_LOCK(ifp);
		TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link,
		    tifma) {
			if (ifma->ifma_addr->sa_family != AF_INET ||
			    ifma->ifma_protospec == NULL)
				continue;
			inm = (struct in_multi *)ifma->ifma_protospec;
			switch (igi->igi_version) {
			case IGMP_VERSION_1:
			case IGMP_VERSION_2:
				igmp_v1v2_process_group_timer(inm,
				    igi->igi_version);
				break;
			case IGMP_VERSION_3:
				igmp_v3_process_group_timers(igi, &qrq,
				    &scq, inm, uri_fasthz);
				break;
			}
		}
		IF_ADDR_UNLOCK(ifp);

		if (igi->igi_version == IGMP_VERSION_3) {
			struct in_multi		*tinm;

			igmp_dispatch_queue(&qrq, 0, loop);
			igmp_dispatch_queue(&scq, 0, loop);

			/*
			 * Free the in_multi reference(s) for this
			 * IGMP lifecycle.
			 */
			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
			    inm_nrele, tinm) {
				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
				    inm_nrele);
				inm_release_locked(inm);
			}
		}
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();
	if (!mpsafe_igmp)
		mtx_unlock(&Giant);
}

/*
 * Update host report group timer for IGMPv1/v2.
 * Will update the global pending timer flags.
 */
static void
igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
{
	int report_timer_expired;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

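	/*
	 * The group timer counts fast-timeout ticks; zero means it is
	 * disarmed. A timer which is still counting down re-arms the
	 * global pending flag for the next fasttimo run.
	 */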
	if (inm->inm_timer == 0) {
		report_timer_expired = 0;
	} else if (--inm->inm_timer == 0) {
		report_timer_expired = 1;
	} else {
		V_current_state_timers_running = 1;
		return;
	}

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_AWAKENING_MEMBER:
		break;
	case IGMP_REPORTING_MEMBER:
		if (report_timer_expired) {
			inm->inm_state = IGMP_IDLE_MEMBER;
			(void)igmp_v1v2_queue_report(inm,
			    (version == IGMP_VERSION_2) ?
			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
		}
		break;
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
	case IGMP_LEAVING_MEMBER:
		break;
	}
}

/*
 * Update a group's timers for IGMPv3.
 * Will update the global pending timer flags.
 * Note: Unlocked read from igi.
 */
static void
igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
    struct ifqueue *qrq, struct ifqueue *scq,
    struct in_multi *inm, const int uri_fasthz)
{
	int query_response_timer_expired;
	int state_change_retransmit_timer_expired;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

	query_response_timer_expired = 0;
	state_change_retransmit_timer_expired = 0;

	/*
	 * During a transition from v1/v2 compatibility mode back to v3,
	 * a group record in REPORTING state may still have its group
	 * timer active. This is a no-op in this function; it is easier
	 * to deal with it here than to complicate the slow-timeout path.
	 */
	if (inm->inm_timer == 0) {
		query_response_timer_expired = 0;
	} else if (--inm->inm_timer == 0) {
		query_response_timer_expired = 1;
	} else {
		V_current_state_timers_running = 1;
	}

	if (inm->inm_sctimer == 0) {
		state_change_retransmit_timer_expired = 0;
	} else if (--inm->inm_sctimer == 0) {
		state_change_retransmit_timer_expired = 1;
	} else {
		V_state_change_timers_running = 1;
	}

	/* We are in fasttimo, so be quick about it. */
	if (!state_change_retransmit_timer_expired &&
	    !query_response_timer_expired)
		return;

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
	case IGMP_IDLE_MEMBER:
		break;
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
		/*
		 * Respond to a previously pending Group-Specific
		 * or Group-and-Source-Specific query by enqueueing
		 * the appropriate Current-State report for
		 * immediate transmission.
		 */
		if (query_response_timer_expired) {
			int retval;

			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
			    __func__, retval);
			inm->inm_state = IGMP_REPORTING_MEMBER;
			/* XXX Clear recorded sources for next time. */
			inm_clear_recorded(inm);
		}
		/* FALLTHROUGH */
	case IGMP_REPORTING_MEMBER:
	case IGMP_LEAVING_MEMBER:
		if (state_change_retransmit_timer_expired) {
			/*
			 * State-change retransmission timer fired.
			 * If there are any further pending retransmissions,
			 * set the global pending state-change flag, and
			 * reset the timer.
			 */
			if (--inm->inm_scrv > 0) {
				inm->inm_sctimer = uri_fasthz;
				V_state_change_timers_running = 1;
			}
			/*
			 * Retransmit the previously computed state-change
			 * report. If there are no further pending
			 * retransmissions, the mbuf queue will be consumed.
			 * Update T0 state to T1 as we have now sent
			 * a state-change.
			 */
			(void)igmp_v3_merge_state_changes(inm, scq);

			inm_commit(inm);
			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);

			/*
			 * If we are leaving the group for good, make sure
			 * we release IGMP's reference to it.
			 * This release must be deferred using a SLIST,
			 * as we are called from a loop which traverses
			 * the in_ifmultiaddr TAILQ.
			 */
			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
			    inm->inm_scrv == 0) {
				inm->inm_state = IGMP_NOT_MEMBER;
				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
				    inm, inm_nrele);
			}
		}
		break;
	}
}


/*
 * Suppress a group's pending response to a group or source/group query.
 *
 * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
 * Do NOT update ST1/ST0 as this operation merely suppresses
 * the currently pending group record.
 * Do NOT suppress the response to a general query. It is possible but
 * it would require adding another state or flag.
 */
static void
igmp_v3_suppress_group_record(struct in_multi *inm)
{

	IN_MULTI_LOCK_ASSERT();

	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
		("%s: not IGMPv3 mode on link", __func__));

1943	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
1944	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
1945		return;
1946
1947	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
1948		inm_clear_recorded(inm);
1949
1950	inm->inm_timer = 0;
1951	inm->inm_state = IGMP_REPORTING_MEMBER;
1952}
1953
1954/*
1955 * Switch to a different IGMP version on the given interface,
1956 * as per Section 7.2.1.
1957 */
1958static void
1959igmp_set_version(struct igmp_ifinfo *igi, const int version)
1960{
1961
1962	IGMP_LOCK_ASSERT();
1963
1964	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
1965	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
1966
1967	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
1968		int old_version_timer;
1969		/*
1970		 * Compute the "Older Version Querier Present" timer as per
1971		 * Section 8.12.
1972		 */
1973		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
1974		old_version_timer *= PR_SLOWHZ;
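		/*
		 * Illustrative arithmetic (assuming the RFC 3376 defaults
		 * RV = 2, QI = 125s, QRI = 10s): the timer computes to
		 * (2 * 125 + 10) = 260 seconds, i.e. 520 slow-timeout
		 * ticks at the canonical PR_SLOWHZ of 2.
		 */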
1975
1976		if (version == IGMP_VERSION_1) {
1977			igi->igi_v1_timer = old_version_timer;
1978			igi->igi_v2_timer = 0;
1979		} else if (version == IGMP_VERSION_2) {
1980			igi->igi_v1_timer = 0;
1981			igi->igi_v2_timer = old_version_timer;
1982		}
1983	}
1984
1985	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
1986		if (igi->igi_version != IGMP_VERSION_2) {
1987			igi->igi_version = IGMP_VERSION_2;
1988			igmp_v3_cancel_link_timers(igi);
1989		}
1990	} else if (igi->igi_v1_timer > 0) {
1991		if (igi->igi_version != IGMP_VERSION_1) {
1992			igi->igi_version = IGMP_VERSION_1;
1993			igmp_v3_cancel_link_timers(igi);
1994		}
1995	}
1996}
1997
1998/*
1999 * Cancel pending IGMPv3 timers for the given link and all groups
2000 * joined on it; state-change, general-query, and group-query timers.
2001 */
2002static void
2003igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
2004{
2005	struct ifmultiaddr	*ifma;
2006	struct ifnet		*ifp;
2007	struct in_multi		*inm;
2008
2009	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
2010	    igi->igi_ifp, igi->igi_ifp->if_xname);
2011
2012	IN_MULTI_LOCK_ASSERT();
2013	IGMP_LOCK_ASSERT();
2014
2015	/*
2016	 * Fast-track this potentially expensive operation
2017	 * by checking all the global 'timer pending' flags.
2018	 */
2019	if (!V_interface_timers_running &&
2020	    !V_state_change_timers_running &&
2021	    !V_current_state_timers_running)
2022		return;
2023
2024	igi->igi_v3_timer = 0;
2025
2026	ifp = igi->igi_ifp;
2027
2028	IF_ADDR_LOCK(ifp);
2029	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2030		if (ifma->ifma_addr->sa_family != AF_INET)
2031			continue;
2032		inm = (struct in_multi *)ifma->ifma_protospec;
2033		switch (inm->inm_state) {
2034		case IGMP_NOT_MEMBER:
2035		case IGMP_SILENT_MEMBER:
2036		case IGMP_IDLE_MEMBER:
2037		case IGMP_LAZY_MEMBER:
2038		case IGMP_SLEEPING_MEMBER:
2039		case IGMP_AWAKENING_MEMBER:
2040			break;
2041		case IGMP_LEAVING_MEMBER:
2042			/*
2043			 * If we are leaving the group and switching
2044			 * IGMP version, we need to release the final
2045			 * reference held for issuing the INCLUDE {}.
2046			 *
2047			 * SMPNG: Must drop and re-acquire IF_ADDR_LOCK
2048			 * around inm_release_locked(), as it is not
2049			 * a recursive mutex.
2050			 */
2051			IF_ADDR_UNLOCK(ifp);
2052			inm_release_locked(inm);
2053			IF_ADDR_LOCK(ifp);
2054			/* FALLTHROUGH */
2055		case IGMP_G_QUERY_PENDING_MEMBER:
2056		case IGMP_SG_QUERY_PENDING_MEMBER:
2057			inm_clear_recorded(inm);
2058			/* FALLTHROUGH */
2059		case IGMP_REPORTING_MEMBER:
2060			inm->inm_sctimer = 0;
2061			inm->inm_timer = 0;
2062			inm->inm_state = IGMP_REPORTING_MEMBER;
2063			/*
2064			 * Free any pending IGMPv3 state-change records.
2065			 */
2066			_IF_DRAIN(&inm->inm_scq);
2067			break;
2068		}
2069	}
2070	IF_ADDR_UNLOCK(ifp);
2071}
2072
2073/*
2074 * Update the Older Version Querier Present timers for a link.
2075 * See Section 7.2.1 of RFC 3376.
2076 */
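/*
 * Illustrative precedence: a running IGMPv1 timer pins the link at v1
 * and cancels any IGMPv2 timer; an IGMPv2 timer running alone pins the
 * link at v2; once both timers have expired, the link reverts to v3.
 */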
2077static void
2078igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
2079{
2080
2081	IGMP_LOCK_ASSERT();
2082
2083	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
2084		/*
2085		 * IGMPv1 and IGMPv2 Querier Present timers expired.
2086		 *
2087		 * Revert to IGMPv3.
2088		 */
2089		if (igi->igi_version != IGMP_VERSION_3) {
2090			CTR5(KTR_IGMPV3,
2091			    "%s: transition from v%d -> v%d on %p(%s)",
2092			    __func__, igi->igi_version, IGMP_VERSION_3,
2093			    igi->igi_ifp, igi->igi_ifp->if_xname);
2094			igi->igi_version = IGMP_VERSION_3;
2095		}
2096	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
2097		/*
2098		 * IGMPv1 Querier Present timer expired,
2099		 * IGMPv2 Querier Present timer running.
2100		 * If IGMPv2 was disabled since last timeout,
2101		 * revert to IGMPv3.
2102		 * If IGMPv2 is enabled, revert to IGMPv2.
2103		 */
2104		if (!V_igmp_v2enable) {
2105			CTR5(KTR_IGMPV3,
2106			    "%s: transition from v%d -> v%d on %p(%s)",
2107			    __func__, igi->igi_version, IGMP_VERSION_3,
2108			    igi->igi_ifp, igi->igi_ifp->if_xname);
2109			igi->igi_v2_timer = 0;
2110			igi->igi_version = IGMP_VERSION_3;
2111		} else {
2112			--igi->igi_v2_timer;
2113			if (igi->igi_version != IGMP_VERSION_2) {
2114				CTR5(KTR_IGMPV3,
2115				    "%s: transition from v%d -> v%d on %p(%s)",
2116				    __func__, igi->igi_version, IGMP_VERSION_2,
2117				    igi->igi_ifp, igi->igi_ifp->if_xname);
2118				igi->igi_version = IGMP_VERSION_2;
2119			}
2120		}
2121	} else if (igi->igi_v1_timer > 0) {
2122		/*
2123		 * IGMPv1 Querier Present timer running.
2124		 * Stop IGMPv2 timer if running.
2125		 *
2126		 * If IGMPv1 was disabled since last timeout,
2127		 * revert to IGMPv3.
2128		 * If IGMPv1 is enabled, tick down the IGMPv1 timer.
2129		 */
2130		if (!V_igmp_v1enable) {
2131			CTR5(KTR_IGMPV3,
2132			    "%s: transition from v%d -> v%d on %p(%s)",
2133			    __func__, igi->igi_version, IGMP_VERSION_3,
2134			    igi->igi_ifp, igi->igi_ifp->if_xname);
2135			igi->igi_v1_timer = 0;
2136			igi->igi_version = IGMP_VERSION_3;
2137		} else {
2138			--igi->igi_v1_timer;
2139		}
2140		if (igi->igi_v2_timer > 0) {
2141			CTR3(KTR_IGMPV3,
2142			    "%s: cancel v2 timer on %p(%s)",
2143			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
2144			igi->igi_v2_timer = 0;
2145		}
2146	}
2147}
2148
2149/*
2150 * Global slowtimo handler.
2151 * VIMAGE: Timeout handlers are expected to service all vimages.
2152 */
2153void
2154igmp_slowtimo(void)
2155{
2156#ifdef VIMAGE
2157	VNET_ITERATOR_DECL(vnet_iter);
2158
2159	VNET_LIST_RLOCK();
2160	VNET_FOREACH(vnet_iter) {
2161		CURVNET_SET(vnet_iter);
2162		INIT_VNET_INET(vnet_iter);
2163		igmp_slowtimo_vnet();
2164		CURVNET_RESTORE();
2165	}
2166	VNET_LIST_RUNLOCK();
2167#else /* !VIMAGE */
2168	igmp_slowtimo_vnet();
2169#endif /* VIMAGE */
2170}
2171
2172/*
2173 * Per-vnet slowtimo handler.
2174 */
2175static void
2176igmp_slowtimo_vnet(void)
2177{
2178	struct igmp_ifinfo *igi;
2179
2180	IGMP_LOCK();
2181
2182	LIST_FOREACH(igi, &V_igi_head, igi_link) {
2183		igmp_v1v2_process_querier_timers(igi);
2184	}
2185
2186	IGMP_UNLOCK();
2187}
2188
2189/*
2190 * Dispatch an IGMPv1/v2 host report or leave message.
2191 * These are always small enough to fit inside a single mbuf.
2192 */
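/*
 * Illustrative sizing: the largest such message is sizeof(struct ip) +
 * sizeof(struct igmp) = 20 + 8 = 28 bytes, comfortably within a single
 * mbuf's packet header data area.
 */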
2193static int
2194igmp_v1v2_queue_report(struct in_multi *inm, const int type)
2195{
2196	struct ifnet		*ifp;
2197	struct igmp		*igmp;
2198	struct ip		*ip;
2199	struct mbuf		*m;
2200
2201	IN_MULTI_LOCK_ASSERT();
2202	IGMP_LOCK_ASSERT();
2203
2204	ifp = inm->inm_ifp;
2205	/* XXX are these needed? */
2206	INIT_VNET_NET(ifp->if_vnet);
2207	INIT_VNET_INET(ifp->if_vnet);
2208
2209	MGETHDR(m, M_DONTWAIT, MT_DATA);
2210	if (m == NULL)
2211		return (ENOMEM);
2212	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
2213
2214	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
2215
2216	m->m_data += sizeof(struct ip);
2217	m->m_len = sizeof(struct igmp);
2218
2219	igmp = mtod(m, struct igmp *);
2220	igmp->igmp_type = type;
2221	igmp->igmp_code = 0;
2222	igmp->igmp_group = inm->inm_addr;
2223	igmp->igmp_cksum = 0;
2224	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
2225
2226	m->m_data -= sizeof(struct ip);
2227	m->m_len += sizeof(struct ip);
2228
2229	ip = mtod(m, struct ip *);
2230	ip->ip_tos = 0;
2231	ip->ip_len = sizeof(struct ip) + sizeof(struct igmp);
2232	ip->ip_off = 0;
2233	ip->ip_p = IPPROTO_IGMP;
2234	ip->ip_src.s_addr = INADDR_ANY;
2235
2236	if (type == IGMP_HOST_LEAVE_MESSAGE)
2237		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
2238	else
2239		ip->ip_dst = inm->inm_addr;
2240
2241	igmp_save_context(m, ifp);
2242
2243	m->m_flags |= M_IGMPV2;
2244	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
2245		m->m_flags |= M_IGMP_LOOP;
2246
2247	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
2248	netisr_dispatch(NETISR_IGMP, m);
2249
2250	return (0);
2251}
2252
2253/*
2254 * Process a state change from the upper layer for the given IPv4 group.
2255 *
2256 * Each socket holds a reference on the in_multi in its own ip_moptions.
2257 * The socket layer will have made the necessary updates to the group
2258 * state; it is now up to IGMP to issue a state-change report if there
2259 * has been any change between T0 (when the last state-change was issued)
2260 * and T1 (now).
2261 *
2262 * We use the IGMPv3 state machine at group level. The IGMP module
2263 * however makes the decision as to which IGMP protocol version to speak.
2264 * A state change *from* INCLUDE {} always means an initial join.
2265 * A state change *to* INCLUDE {} always means a final leave.
2266 *
2267 * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
2268 * save ourselves a bunch of work; any exclusive mode groups need not
2269 * compute source filter lists.
2270 *
2271 * VIMAGE: curvnet should have been set by caller, as this routine
2272 * is called from the socket option handlers.
2273 */
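/*
 * Illustrative life cycle: a socket joining an ASM group moves the
 * group's filter mode from MCAST_UNDEFINED at T0 to MCAST_EXCLUDE at
 * T1, which is handled below as an initial join; the reverse
 * transition back to MCAST_UNDEFINED is handled as the final leave.
 */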
2274int
2275igmp_change_state(struct in_multi *inm)
2276{
2277	struct igmp_ifinfo *igi;
2278	struct ifnet *ifp;
2279	int error;
2280
2281	IN_MULTI_LOCK_ASSERT();
2282
2283	error = 0;
2284
2285	/*
2286	 * Try to detect if the upper layer just asked us to change state
2287	 * for an interface which has now gone away.
2288	 */
2289	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
2290	ifp = inm->inm_ifma->ifma_ifp;
2291	if (ifp != NULL) {
2292		/*
2293		 * Sanity check that netinet's notion of ifp is the
2294		 * same as net's.
2295		 */
2296		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
2297	}
2298
2299	IGMP_LOCK();
2300
2301	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
2302	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
2303
2304	/*
2305	 * If we detect a state transition to or from MCAST_UNDEFINED
2306	 * for this group, then we are starting or finishing an IGMP
2307	 * life cycle for this group.
2308	 */
2309	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
2310		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
2311		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
2312		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
2313			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
2314			error = igmp_initial_join(inm, igi);
2315			goto out_locked;
2316		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
2317			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
2318			igmp_final_leave(inm, igi);
2319			goto out_locked;
2320		}
2321	} else {
2322		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
2323	}
2324
2325	error = igmp_handle_state_change(inm, igi);
2326
2327out_locked:
2328	IGMP_UNLOCK();
2329	return (error);
2330}
2331
2332/*
2333 * Perform the initial join for an IGMP group.
2334 *
2335 * When joining a group:
2336 *  If the group should have its IGMP traffic suppressed, do nothing.
2337 *  IGMPv1 starts sending IGMPv1 host membership reports.
2338 *  IGMPv2 starts sending IGMPv2 host membership reports.
2339 *  IGMPv3 will schedule an IGMPv3 state-change report containing the
2340 *  initial state of the membership.
2341 */
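/*
 * Illustrative IGMPv3 case: an ASM join enqueues a single
 * CHANGE_TO_EXCLUDE_MODE record with an empty source list and
 * schedules up to igi_rv retransmissions of it.
 */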
2342static int
2343igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
2344{
2345	struct ifnet		*ifp;
2346	struct ifqueue		*ifq;
2347	int			 error, retval, syncstates;
2348
2349	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
2350	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2351	    inm->inm_ifp->if_xname);
2352
2353	error = 0;
2354	syncstates = 1;
2355
2356	ifp = inm->inm_ifp;
2357
2358	IN_MULTI_LOCK_ASSERT();
2359	IGMP_LOCK_ASSERT();
2360
2361	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2362
2363	/*
2364	 * Groups joined on loopback or marked as 'not reported',
2365	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
2366	 * are never reported in any IGMP protocol exchanges.
2367	 * All other groups enter the appropriate IGMP state machine
2368	 * for the version in use on this link.
2369	 * A link marked as IGIF_SILENT causes IGMP to be completely
2370	 * disabled for the link.
2371	 */
2372	if ((ifp->if_flags & IFF_LOOPBACK) ||
2373	    (igi->igi_flags & IGIF_SILENT) ||
2374	    !igmp_isgroupreported(inm->inm_addr)) {
2375		CTR1(KTR_IGMPV3,
2376"%s: not kicking state machine for silent group", __func__);
2377		inm->inm_state = IGMP_SILENT_MEMBER;
2378		inm->inm_timer = 0;
2379	} else {
2380		/*
2381		 * Deal with overlapping in_multi lifecycle.
2382		 * If this group was LEAVING, then make sure
2383		 * we drop the reference we picked up to keep the
2384		 * group around for the final INCLUDE {} enqueue.
2385		 */
2386		if (igi->igi_version == IGMP_VERSION_3 &&
2387		    inm->inm_state == IGMP_LEAVING_MEMBER)
2388			inm_release_locked(inm);
2389
2390		inm->inm_state = IGMP_REPORTING_MEMBER;
2391
2392		switch (igi->igi_version) {
2393		case IGMP_VERSION_1:
2394		case IGMP_VERSION_2:
2395			inm->inm_state = IGMP_IDLE_MEMBER;
2396			error = igmp_v1v2_queue_report(inm,
2397			    (igi->igi_version == IGMP_VERSION_2) ?
2398			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
2399			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
2400			if (error == 0) {
2401				inm->inm_timer = IGMP_RANDOM_DELAY(
2402				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
2403				V_current_state_timers_running = 1;
2404			}
2405			break;
2406
2407		case IGMP_VERSION_3:
2408			/*
2409			 * Defer update of T0 to T1, until the first copy
2410			 * of the state change has been transmitted.
2411			 */
2412			syncstates = 0;
2413
2414			/*
2415			 * Immediately enqueue a State-Change Report for
2416			 * this interface, freeing any previous reports.
2417			 * Don't kick the timers if there is nothing to do,
2418			 * or if an error occurred.
2419			 */
2420			ifq = &inm->inm_scq;
2421			_IF_DRAIN(ifq);
2422			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
2423			    0, 0);
2424			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
2425			    __func__, retval);
2426			if (retval <= 0) {
2427				error = -retval;
2428				break;
2429			}
2430
2431			/*
2432			 * Schedule transmission of pending state-change
2433			 * report up to RV times for this link. The timer
2434			 * will fire at the next igmp_fasttimo (~200ms),
2435			 * giving us an opportunity to merge the reports.
2436			 */
2437			if (igi->igi_flags & IGIF_LOOPBACK) {
2438				inm->inm_scrv = 1;
2439			} else {
2440				KASSERT(igi->igi_rv > 1,
2441				   ("%s: invalid robustness %d", __func__,
2442				    igi->igi_rv));
2443				inm->inm_scrv = igi->igi_rv;
2444			}
2445			inm->inm_sctimer = 1;
2446			V_state_change_timers_running = 1;
2447
2448			error = 0;
2449			break;
2450		}
2451	}
2452
2453	/*
2454	 * Only update the T0 state if state change is atomic,
2455	 * i.e. we don't need to wait for a timer to fire before we
2456	 * can consider the state change to have been communicated.
2457	 */
2458	if (syncstates) {
2459		inm_commit(inm);
2460		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2461		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2462	}
2463
2464	return (error);
2465}
2466
2467/*
2468 * Issue an intermediate state change during the IGMP life-cycle.
2469 */
2470static int
2471igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
2472{
2473	struct ifnet		*ifp;
2474	int			 retval;
2475
2476	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
2477	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2478	    inm->inm_ifp->if_xname);
2479
2480	ifp = inm->inm_ifp;
2481
2482	IN_MULTI_LOCK_ASSERT();
2483	IGMP_LOCK_ASSERT();
2484
2485	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2486
2487	if ((ifp->if_flags & IFF_LOOPBACK) ||
2488	    (igi->igi_flags & IGIF_SILENT) ||
2489	    !igmp_isgroupreported(inm->inm_addr) ||
2490	    (igi->igi_version != IGMP_VERSION_3)) {
2491		if (!igmp_isgroupreported(inm->inm_addr)) {
2492			CTR1(KTR_IGMPV3,
2493"%s: not kicking state machine for silent group", __func__);
2494		}
2495		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
2496		inm_commit(inm);
2497		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2498		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2499		return (0);
2500	}
2501
2502	_IF_DRAIN(&inm->inm_scq);
2503
2504	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
2505	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
2506	if (retval <= 0)
2507		return (-retval);
2508
2509	/*
2510	 * If record(s) were enqueued, start the state-change
2511	 * report timer for this group.
2512	 */
2513	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
2514	inm->inm_sctimer = 1;
2515	V_state_change_timers_running = 1;
2516
2517	return (0);
2518}
2519
2520/*
2521 * Perform the final leave for an IGMP group.
2522 *
2523 * When leaving a group:
2524 *  IGMPv1 does nothing.
2525 *  IGMPv2 sends a host leave message, if and only if we are the reporter.
2526 *  IGMPv3 enqueues a state-change report containing a transition
2527 *  to INCLUDE {} for immediate transmission.
2528 */
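/*
 * Illustrative IGMPv3 case: the leave enqueues CHANGE_TO_INCLUDE_MODE
 * with an empty source list (TO_IN {}); the group then lingers in
 * IGMP_LEAVING_MEMBER until igi_rv retransmissions have been sent.
 */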
2529static void
2530igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
2531{
2532	int syncstates;
2533
2534	syncstates = 1;
2535
2536	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
2537	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2538	    inm->inm_ifp->if_xname);
2539
2540	IN_MULTI_LOCK_ASSERT();
2541	IGMP_LOCK_ASSERT();
2542
2543	switch (inm->inm_state) {
2544	case IGMP_NOT_MEMBER:
2545	case IGMP_SILENT_MEMBER:
2546	case IGMP_LEAVING_MEMBER:
2547		/* Already leaving or left; do nothing. */
2548		CTR1(KTR_IGMPV3,
2549"%s: not kicking state machine for silent group", __func__);
2550		break;
2551	case IGMP_REPORTING_MEMBER:
2552	case IGMP_IDLE_MEMBER:
2553	case IGMP_G_QUERY_PENDING_MEMBER:
2554	case IGMP_SG_QUERY_PENDING_MEMBER:
2555		if (igi->igi_version == IGMP_VERSION_2) {
2556#ifdef INVARIANTS
2557			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
2558			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
2559				panic("%s: IGMPv3 state reached, not IGMPv3 mode",
2560				    __func__);
2561#endif
2562			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
2563			inm->inm_state = IGMP_NOT_MEMBER;
2564		} else if (igi->igi_version == IGMP_VERSION_3) {
2565			/*
2566			 * Stop group timer and all pending reports.
2567			 * Immediately enqueue a state-change report
2568			 * TO_IN {} to be sent on the next fast timeout,
2569			 * giving us an opportunity to merge reports.
2570			 */
2571			_IF_DRAIN(&inm->inm_scq);
2572			inm->inm_timer = 0;
2573			if (igi->igi_flags & IGIF_LOOPBACK) {
2574				inm->inm_scrv = 1;
2575			} else {
2576				inm->inm_scrv = igi->igi_rv;
2577			}
2578			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
2579			    "pending retransmissions.", __func__,
2580			    inet_ntoa(inm->inm_addr),
2581			    inm->inm_ifp->if_xname, inm->inm_scrv);
2582			if (inm->inm_scrv == 0) {
2583				inm->inm_state = IGMP_NOT_MEMBER;
2584				inm->inm_sctimer = 0;
2585			} else {
2586				int retval;
2587
2588				inm_acquire_locked(inm);
2589
2590				retval = igmp_v3_enqueue_group_record(
2591				    &inm->inm_scq, inm, 1, 0, 0);
2592				KASSERT(retval != 0,
2593				    ("%s: enqueue record = %d", __func__,
2594				     retval));
2595
2596				inm->inm_state = IGMP_LEAVING_MEMBER;
2597				inm->inm_sctimer = 1;
2598				V_state_change_timers_running = 1;
2599				syncstates = 0;
2600			}
2601			break;
2602		}
2603		break;
2604	case IGMP_LAZY_MEMBER:
2605	case IGMP_SLEEPING_MEMBER:
2606	case IGMP_AWAKENING_MEMBER:
2607		/* Our reports are suppressed; do nothing. */
2608		break;
2609	}
2610
2611	if (syncstates) {
2612		inm_commit(inm);
2613		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2614		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2615		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
2616		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
2617		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2618	}
2619}
2620
2621/*
2622 * Enqueue an IGMPv3 group record to the given output queue.
2623 *
2624 * XXX This function could do with having the allocation code
2625 * split out, and the multiple-tree-walks coalesced into a single
2626 * routine as has been done in igmp_v3_enqueue_filter_change().
2627 *
2628 * If is_state_change is zero, a current-state record is appended.
2629 * If is_state_change is non-zero, a state-change report is appended.
2630 *
2631 * If is_group_query is non-zero, an mbuf packet chain is allocated.
2632 * If is_group_query is zero, and if there is a packet with free space
2633 * at the tail of the queue, it will be appended to, provided there
2634 * is enough free space.
2635 * Otherwise a new mbuf packet chain is allocated.
2636 *
2637 * If is_source_query is non-zero, each source is checked to see if
2638 * it was recorded for a Group-Source query, and will be omitted if
2639 * it is not both in-mode and recorded.
2640 *
2641 * The function will attempt to allocate leading space in the packet
2642 * for the IP/IGMP header to be prepended without fragmenting the chain.
2643 *
2644 * If successful the size of all data appended to the queue is returned,
2645 * otherwise an error code less than zero is returned, or zero if
2646 * no record(s) were appended.
2647 */
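/*
 * Illustrative return value: a MODE_IS_EXCLUDE current-state record
 * carrying two sources occupies sizeof(struct igmp_grouprec) +
 * 2 * sizeof(in_addr_t) = 8 + 8 = 16 bytes, so a successful call
 * returns 16.
 */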
2648static int
2649igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
2650    const int is_state_change, const int is_group_query,
2651    const int is_source_query)
2652{
2653	struct igmp_grouprec	 ig;
2654	struct igmp_grouprec	*pig;
2655	struct ifnet		*ifp;
2656	struct ip_msource	*ims, *nims;
2657	struct mbuf		*m0, *m, *md;
2658	int			 error, is_filter_list_change;
2659	int			 minrec0len, m0srcs, msrcs, nbytes, off;
2660	int			 record_has_sources;
2661	int			 now;
2662	int			 type;
2663	in_addr_t		 naddr;
2664	uint8_t			 mode;
2665
2666	IN_MULTI_LOCK_ASSERT();
2667
2668	error = 0;
2669	ifp = inm->inm_ifp;
2670	is_filter_list_change = 0;
2671	m = NULL;
2672	m0 = NULL;
2673	m0srcs = 0;
2674	msrcs = 0;
2675	nbytes = 0;
2676	nims = NULL;
2677	record_has_sources = 1;
2678	pig = NULL;
2679	type = IGMP_DO_NOTHING;
2680	mode = inm->inm_st[1].iss_fmode;
2681
2682	/*
2683	 * If we did not transition out of ASM mode during t0->t1,
2684	 * and there are no source nodes to process, we can skip
2685	 * the generation of source records.
2686	 */
2687	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
2688	    inm->inm_nsrc == 0)
2689		record_has_sources = 0;
2690
2691	if (is_state_change) {
2692		/*
2693		 * Queue a state change record.
2694		 * If the mode did not change, and there are non-ASM
2695		 * listeners or source filters present,
2696		 * we potentially need to issue two records for the group.
2697		 * If we are transitioning to MCAST_UNDEFINED, we need
2698		 * not send any sources.
2699		 * If there are ASM listeners, and there was no filter
2700		 * mode transition of any kind, do nothing.
2701		 */
2702		if (mode != inm->inm_st[0].iss_fmode) {
2703			if (mode == MCAST_EXCLUDE) {
2704				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
2705				    __func__);
2706				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
2707			} else {
2708				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
2709				    __func__);
2710				type = IGMP_CHANGE_TO_INCLUDE_MODE;
2711				if (mode == MCAST_UNDEFINED)
2712					record_has_sources = 0;
2713			}
2714		} else {
2715			if (record_has_sources) {
2716				is_filter_list_change = 1;
2717			} else {
2718				type = IGMP_DO_NOTHING;
2719			}
2720		}
2721	} else {
2722		/*
2723		 * Queue a current state record.
2724		 */
2725		if (mode == MCAST_EXCLUDE) {
2726			type = IGMP_MODE_IS_EXCLUDE;
2727		} else if (mode == MCAST_INCLUDE) {
2728			type = IGMP_MODE_IS_INCLUDE;
2729			KASSERT(inm->inm_st[1].iss_asm == 0,
2730			    ("%s: inm %p is INCLUDE but ASM count is %d",
2731			     __func__, inm, inm->inm_st[1].iss_asm));
2732		}
2733	}
2734
2735	/*
2736	 * Generate the filter list changes using a separate function.
2737	 */
2738	if (is_filter_list_change)
2739		return (igmp_v3_enqueue_filter_change(ifq, inm));
2740
2741	if (type == IGMP_DO_NOTHING) {
2742		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
2743		    __func__, inet_ntoa(inm->inm_addr),
2744		    inm->inm_ifp->if_xname);
2745		return (0);
2746	}
2747
2748	/*
2749	 * If any sources are present, we must be able to fit at least
2750	 * one in the trailing space of the tail packet's mbuf,
2751	 * ideally more.
2752	 */
2753	minrec0len = sizeof(struct igmp_grouprec);
2754	if (record_has_sources)
2755		minrec0len += sizeof(in_addr_t);
2756
2757	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
2758	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
2759	    inm->inm_ifp->if_xname);
2760
2761	/*
2762	 * Check if we have a packet in the tail of the queue for this
2763	 * group into which the first group record for this group will fit.
2764	 * Otherwise allocate a new packet.
2765	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
2766	 * Note: Group records for G/GSR query responses MUST be sent
2767	 * in their own packet.
2768	 */
2769	m0 = ifq->ifq_tail;
2770	if (!is_group_query &&
2771	    m0 != NULL &&
2772	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
2773	    (m0->m_pkthdr.len + minrec0len) <
2774	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
2775		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
2776			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2777		m = m0;
2778		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
2779	} else {
2780		if (_IF_QFULL(ifq)) {
2781			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2782			return (-ENOMEM);
2783		}
2784		m = NULL;
2785		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2786		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2787		if (!is_state_change && !is_group_query) {
2788			m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2789			if (m)
2790				m->m_data += IGMP_LEADINGSPACE;
2791		}
2792		if (m == NULL) {
2793			m = m_gethdr(M_DONTWAIT, MT_DATA);
2794			if (m)
2795				MH_ALIGN(m, IGMP_LEADINGSPACE);
2796		}
2797		if (m == NULL)
2798			return (-ENOMEM);
2799
2800		igmp_save_context(m, ifp);
2801
2802		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
2803	}
2804
2805	/*
2806	 * Append group record.
2807	 * If we have sources, we don't know how many yet.
2808	 */
2809	ig.ig_type = type;
2810	ig.ig_datalen = 0;
2811	ig.ig_numsrc = 0;
2812	ig.ig_group = inm->inm_addr;
2813	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2814		if (m != m0)
2815			m_freem(m);
2816		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2817		return (-ENOMEM);
2818	}
2819	nbytes += sizeof(struct igmp_grouprec);
2820
2821	/*
2822	 * Append as many sources as will fit in the first packet.
2823	 * If we are appending to a new packet, the chain allocation
2824	 * may potentially use clusters; use m_getptr() in this case.
2825	 * If we are appending to an existing packet, we need to obtain
2826	 * a pointer to the group record after m_append(), in case a new
2827	 * mbuf was allocated.
2828	 * Only append sources which are in-mode at t1. If we are
2829	 * transitioning to MCAST_UNDEFINED state on the group, do not
2830	 * include source entries.
2831	 * Only report recorded sources in our filter set when responding
2832	 * to a group-source query.
2833	 */
2834	if (record_has_sources) {
2835		if (m == m0) {
2836			md = m_last(m);
2837			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2838			    md->m_len - nbytes);
2839		} else {
2840			md = m_getptr(m, 0, &off);
2841			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2842			    off);
2843		}
2844		msrcs = 0;
2845		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
2846			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2847			    inet_ntoa_haddr(ims->ims_haddr));
2848			now = ims_get_mode(inm, ims, 1);
2849			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
2850			if ((now != mode) ||
2851			    (now == mode && mode == MCAST_UNDEFINED)) {
2852				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2853				continue;
2854			}
2855			if (is_source_query && ims->ims_stp == 0) {
2856				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2857				    __func__);
2858				continue;
2859			}
2860			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2861			naddr = htonl(ims->ims_haddr);
2862			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2863				if (m != m0)
2864					m_freem(m);
2865				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2866				    __func__);
2867				return (-ENOMEM);
2868			}
2870			++msrcs;
2871			if (msrcs == m0srcs)
2872				break;
2873		}
2874		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
2875		    msrcs);
2876		pig->ig_numsrc = htons(msrcs);
2877		nbytes += (msrcs * sizeof(in_addr_t));
2878	}
2879
2880	if (is_source_query && msrcs == 0) {
2881		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
2882		if (m != m0)
2883			m_freem(m);
2884		return (0);
2885	}
2886
2887	/*
2888	 * We are good to go with first packet.
2889	 */
2890	if (m != m0) {
2891		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
2892		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2893		_IF_ENQUEUE(ifq, m);
2894	} else
2895		m->m_pkthdr.PH_vt.vt_nrecs++;
2896
2897	/*
2898	 * No further work needed if no source list in packet(s).
2899	 */
2900	if (!record_has_sources)
2901		return (nbytes);
2902
2903	/*
2904	 * Whilst sources remain to be announced, we need to allocate
2905	 * a new packet and fill out as many sources as will fit.
2906	 * Always try for a cluster first.
2907	 */
2908	while (nims != NULL) {
2909		if (_IF_QFULL(ifq)) {
2910			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2911			return (-ENOMEM);
2912		}
2913		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2914		if (m)
2915			m->m_data += IGMP_LEADINGSPACE;
2916		if (m == NULL) {
2917			m = m_gethdr(M_DONTWAIT, MT_DATA);
2918			if (m)
2919				MH_ALIGN(m, IGMP_LEADINGSPACE);
2920		}
2921		if (m == NULL)
2922			return (-ENOMEM);
2923		igmp_save_context(m, ifp);
2924		md = m_getptr(m, 0, &off);
2925		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
2926		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
2927
2928		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2929			if (m != m0)
2930				m_freem(m);
2931			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2932			return (-ENOMEM);
2933		}
2934		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2935		nbytes += sizeof(struct igmp_grouprec);
2936
2937		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2938		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2939
2940		msrcs = 0;
2941		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
2942			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2943			    inet_ntoa_haddr(ims->ims_haddr));
2944			now = ims_get_mode(inm, ims, 1);
2945			if ((now != mode) ||
2946			    (now == mode && mode == MCAST_UNDEFINED)) {
2947				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2948				continue;
2949			}
2950			if (is_source_query && ims->ims_stp == 0) {
2951				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2952				    __func__);
2953				continue;
2954			}
2955			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2956			naddr = htonl(ims->ims_haddr);
2957			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2958				if (m != m0)
2959					m_freem(m);
2960				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2961				    __func__);
2962				return (-ENOMEM);
2963			}
2964			++msrcs;
2965			if (msrcs == m0srcs)
2966				break;
2967		}
2968		pig->ig_numsrc = htons(msrcs);
2969		nbytes += (msrcs * sizeof(in_addr_t));
2970
2971		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
2972		_IF_ENQUEUE(ifq, m);
2973	}
2974
2975	return (nbytes);
2976}
2977
2978/*
2979 * Type used to mark record pass completion.
2980 * We exploit the fact we can cast to this easily from the
2981 * current filter modes on each ip_msource node.
2982 */
2983typedef enum {
2984	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
2985	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
2986	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
2987	REC_FULL = REC_ALLOW | REC_BLOCK
2988} rectype_t;
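/*
 * Illustrative mapping: MCAST_INCLUDE (1) casts to REC_ALLOW and
 * MCAST_EXCLUDE (2) casts to REC_BLOCK, so a node's filter mode at t1
 * selects the record type which must carry it.
 */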
2989
2990/*
2991 * Enqueue an IGMPv3 filter list change to the given output queue.
2992 *
2993 * Source list filter state is held in an RB-tree. When the filter list
2994 * for a group is changed without changing its mode, we need to compute
2995 * the deltas between T0 and T1 for each source in the filter set,
2996 * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
2997 *
2998 * As we may potentially queue two record types, and the entire R-B tree
2999 * needs to be walked at once, we break this out into its own function
3000 * so we can generate a tightly packed queue of packets.
3001 *
3002 * XXX This could be written to only use one tree walk, although that makes
3003 * serializing into the mbuf chains a bit harder. For now we do two walks
3004 * which makes things easier on us, and it may or may not be harder on
3005 * the L2 cache.
3006 *
3007 * If successful the size of all data appended to the queue is returned,
3008 * otherwise an error code less than zero is returned, or zero if
3009 * no record(s) were appended.
3010 */
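/*
 * Illustrative delta: for a group in EXCLUDE mode whose filter moves
 * from {a, b} at t0 to {a, c} at t1, this function queues
 * ALLOW_NEW {b}, as b is no longer blocked, and BLOCK_OLD {c}, as c
 * is newly blocked.
 */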
3011static int
3012igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
3013{
3014	static const int MINRECLEN =
3015	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
3016	struct ifnet		*ifp;
3017	struct igmp_grouprec	 ig;
3018	struct igmp_grouprec	*pig;
3019	struct ip_msource	*ims, *nims;
3020	struct mbuf		*m, *m0, *md;
3021	in_addr_t		 naddr;
3022	int			 m0srcs, nbytes, off, rsrcs, schanged;
3023	int			 nallow, nblock;
3024	uint8_t			 mode, now, then;
3025	rectype_t		 crt, drt, nrt;
3026
3027	IN_MULTI_LOCK_ASSERT();
3028
3029	if (inm->inm_nsrc == 0 ||
3030	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
3031		return (0);
3032
3033	ifp = inm->inm_ifp;			/* interface */
3034	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
3035	crt = REC_NONE;	/* current group record type */
3036	drt = REC_NONE;	/* mask of completed group record types */
3037	nrt = REC_NONE;	/* record type for current node */
3038	m0srcs = 0;	/* # sources which will fit in current mbuf chain */
3039	nbytes = 0;	/* # of bytes appended to group's state-change queue */
3040	rsrcs = 0;	/* # sources encoded in current record */
3041	schanged = 0;	/* # nodes encoded in overall filter change */
3042	nallow = 0;	/* # of source entries in ALLOW_NEW */
3043	nblock = 0;	/* # of source entries in BLOCK_OLD */
3044	nims = NULL;	/* next tree node pointer */
3045
3046	/*
3047	 * Loop once for each possible filter record mode.
3048	 * The first kind of source we encounter tells us which
3049	 * is the first kind of record we start appending.
3050	 * If a node transitioned to UNDEFINED at t1, its mode is treated
3051	 * as the inverse of the group's filter mode.
3052	 */
3053	while (drt != REC_FULL) {
3054		do {
3055			m0 = ifq->ifq_tail;
3056			if (m0 != NULL &&
3057			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
3058			     IGMP_V3_REPORT_MAXRECS) &&
3059			    (m0->m_pkthdr.len + MINRECLEN) <
3060			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
3061				m = m0;
3062				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
3063					    sizeof(struct igmp_grouprec)) /
3064				    sizeof(in_addr_t);
3065				CTR1(KTR_IGMPV3,
3066				    "%s: use previous packet", __func__);
3067			} else {
3068				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
3069				if (m)
3070					m->m_data += IGMP_LEADINGSPACE;
3071				if (m == NULL) {
3072					m = m_gethdr(M_DONTWAIT, MT_DATA);
3073					if (m)
3074						MH_ALIGN(m, IGMP_LEADINGSPACE);
3075				}
3076				if (m == NULL) {
3077					CTR1(KTR_IGMPV3,
3078					    "%s: m_get*() failed", __func__);
3079					return (-ENOMEM);
3080				}
3081				m->m_pkthdr.PH_vt.vt_nrecs = 0;
3082				igmp_save_context(m, ifp);
3083				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
3084				    sizeof(struct igmp_grouprec)) /
3085				    sizeof(in_addr_t);
3086				CTR1(KTR_IGMPV3,
3087				    "%s: allocated new packet", __func__);
3088			}
3089			/*
3090			 * Append the IGMP group record header to the
3091			 * current packet's data area.
3092			 * Recalculate pointer to free space for next
3093			 * group record, in case m_append() allocated
3094			 * a new mbuf or cluster.
3095			 */
3096			memset(&ig, 0, sizeof(ig));
3097			ig.ig_group = inm->inm_addr;
3098			if (!m_append(m, sizeof(ig), (void *)&ig)) {
3099				if (m != m0)
3100					m_freem(m);
3101				CTR1(KTR_IGMPV3,
3102				    "%s: m_append() failed", __func__);
3103				return (-ENOMEM);
3104			}
3105			nbytes += sizeof(struct igmp_grouprec);
3106			if (m == m0) {
3107				md = m_last(m);
3108				pig = (struct igmp_grouprec *)(mtod(md,
3109				    uint8_t *) + md->m_len - nbytes);
3110			} else {
3111				md = m_getptr(m, 0, &off);
3112				pig = (struct igmp_grouprec *)(mtod(md,
3113				    uint8_t *) + off);
3114			}
3115			/*
3116			 * Begin walking the tree for this record type
3117			 * pass, or continue from where we left off
3118			 * previously if we had to allocate a new packet.
3119			 * Only report deltas in-mode at t1.
3120			 * We need not report included sources as allowed
3121			 * if we are in inclusive mode on the group;
3122			 * however, the converse is not true.
3123			 */
3124			rsrcs = 0;
3125			if (nims == NULL)
3126				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
3127			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
3128				CTR2(KTR_IGMPV3, "%s: visit node %s",
3129				    __func__, inet_ntoa_haddr(ims->ims_haddr));
3130				now = ims_get_mode(inm, ims, 1);
3131				then = ims_get_mode(inm, ims, 0);
3132				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
3133				    __func__, then, now);
3134				if (now == then) {
3135					CTR1(KTR_IGMPV3,
3136					    "%s: skip unchanged", __func__);
3137					continue;
3138				}
3139				if (mode == MCAST_EXCLUDE &&
3140				    now == MCAST_INCLUDE) {
3141					CTR1(KTR_IGMPV3,
3142					    "%s: skip IN src on EX group",
3143					    __func__);
3144					continue;
3145				}
3146				nrt = (rectype_t)now;
3147				if (nrt == REC_NONE)
3148					nrt = (rectype_t)(~mode & REC_FULL);
3149				if (schanged++ == 0) {
3150					crt = nrt;
3151				} else if (crt != nrt)
3152					continue;
3153				naddr = htonl(ims->ims_haddr);
3154				if (!m_append(m, sizeof(in_addr_t),
3155				    (void *)&naddr)) {
3156					if (m != m0)
3157						m_freem(m);
3158					CTR1(KTR_IGMPV3,
3159					    "%s: m_append() failed", __func__);
3160					return (-ENOMEM);
3161				}
3162				nallow += !!(crt == REC_ALLOW);
3163				nblock += !!(crt == REC_BLOCK);
3164				if (++rsrcs == m0srcs)
3165					break;
3166			}
3167			/*
3168			 * If we did not append any tree nodes on this
3169			 * pass, back out of allocations.
3170			 */
3171			if (rsrcs == 0) {
3172				nbytes -= sizeof(struct igmp_grouprec);
3173				if (m != m0) {
3174					CTR1(KTR_IGMPV3,
3175					    "%s: m_free(m)", __func__);
3176					m_freem(m);
3177				} else {
3178					CTR1(KTR_IGMPV3,
3179					    "%s: m_adj(m, -ig)", __func__);
3180					m_adj(m, -((int)sizeof(
3181					    struct igmp_grouprec)));
3182				}
3183				continue;
3184			}
3185			nbytes += (rsrcs * sizeof(in_addr_t));
3186			if (crt == REC_ALLOW)
3187				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
3188			else if (crt == REC_BLOCK)
3189				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
3190			pig->ig_numsrc = htons(rsrcs);
3191			/*
3192			 * Count the new group record, and enqueue this
3193			 * packet if it wasn't already queued.
3194			 */
3195			m->m_pkthdr.PH_vt.vt_nrecs++;
3196			if (m != m0)
3197				_IF_ENQUEUE(ifq, m);
3198		} while (nims != NULL);
3199		drt |= crt;
3200		crt = (~crt & REC_FULL);
3201	}
3202
3203	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
3204	    nallow, nblock);
3205
3206	return (nbytes);
3207}
3208
3209static int
3210igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
3211{
3212	struct ifqueue	*gq;
3213	struct mbuf	*m;		/* pending state-change */
3214	struct mbuf	*m0;		/* copy of pending state-change */
3215	struct mbuf	*mt;		/* last state-change in packet */
3216	int		 docopy, domerge;
3217	u_int		 recslen;
3218
3219	docopy = 0;
3220	domerge = 0;
3221	recslen = 0;
3222
3223	IN_MULTI_LOCK_ASSERT();
3224	IGMP_LOCK_ASSERT();
3225
3226	/*
3227	 * If there are further pending retransmissions, make a writable
3228	 * copy of each queued state-change message before merging.
3229	 */
3230	if (inm->inm_scrv > 0)
3231		docopy = 1;
3232
3233	gq = &inm->inm_scq;
3234#ifdef KTR
3235	if (gq->ifq_head == NULL) {
3236		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
3237		    __func__, inm);
3238	}
3239#endif
3240
3241	m = gq->ifq_head;
3242	while (m != NULL) {
3243		/*
3244		 * Only merge the report into the current packet if
3245		 * there is sufficient space to do so; an IGMPv3 report
3246		 * packet may only contain 65,535 group records.
3247		 * Always use a simple mbuf chain concatenation to do this,
3248		 * as large state changes for single groups may have
3249		 * allocated clusters.
3250		 */
3251		domerge = 0;
3252		mt = ifscq->ifq_tail;
3253		if (mt != NULL) {
3254			recslen = m_length(m, NULL);
3255
3256			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
3257			    m->m_pkthdr.PH_vt.vt_nrecs <=
3258			    IGMP_V3_REPORT_MAXRECS) &&
3259			    (mt->m_pkthdr.len + recslen <=
3260			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
3261				domerge = 1;
3262		}
3263
3264		if (!domerge && _IF_QFULL(gq)) {
3265			CTR2(KTR_IGMPV3,
3266			    "%s: outbound queue full, skipping whole packet %p",
3267			    __func__, m);
3268			mt = m->m_nextpkt;
3269			if (!docopy)
3270				m_freem(m);
3271			m = mt;
3272			continue;
3273		}
3274
3275		if (!docopy) {
3276			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
3277			m = m->m_nextpkt;	/* _IF_DEQUEUE() clears m_nextpkt */
3278			_IF_DEQUEUE(gq, m0);
3279		} else {
3280			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
3281			m0 = m_dup(m, M_NOWAIT);
3282			if (m0 == NULL)
3283				return (ENOMEM);
3284			m0->m_nextpkt = NULL;
3285			m = m->m_nextpkt;
3286		}
3287
3288		if (!domerge) {
3289			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p",
3290			    __func__, m0, ifscq);
3291			_IF_ENQUEUE(ifscq, m0);
3292		} else {
3293			struct mbuf *mtl;	/* last mbuf of packet mt */
3294
3295			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p",
3296			    __func__, m0, mt);
3297
3298			mtl = m_last(mt);
3299			m0->m_flags &= ~M_PKTHDR;
3300			mt->m_pkthdr.len += recslen;
3301			mt->m_pkthdr.PH_vt.vt_nrecs +=
3302			    m0->m_pkthdr.PH_vt.vt_nrecs;
3303
3304			mtl->m_next = m0;
3305		}
3306	}
3307
3308	return (0);
3309}
3310
3311/*
3312 * Respond to a pending IGMPv3 General Query.
3313 */
3314static void
3315igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
3316{
3317	struct ifmultiaddr	*ifma, *tifma;
3318	struct ifnet		*ifp;
3319	struct in_multi		*inm;
3320	int			 retval, loop;
3321
3322	IN_MULTI_LOCK_ASSERT();
3323	IGMP_LOCK_ASSERT();
3324
3325	KASSERT(igi->igi_version == IGMP_VERSION_3,
3326	    ("%s: called when version %d", __func__, igi->igi_version));
3327
3328	ifp = igi->igi_ifp;
3329
3330	IF_ADDR_LOCK(ifp);
3331	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) {
3332		if (ifma->ifma_addr->sa_family != AF_INET ||
3333		    ifma->ifma_protospec == NULL)
3334			continue;
3335
3336		inm = (struct in_multi *)ifma->ifma_protospec;
3337		KASSERT(ifp == inm->inm_ifp,
3338		    ("%s: inconsistent ifp", __func__));
3339
3340		switch (inm->inm_state) {
3341		case IGMP_NOT_MEMBER:
3342		case IGMP_SILENT_MEMBER:
3343			break;
3344		case IGMP_REPORTING_MEMBER:
3345		case IGMP_IDLE_MEMBER:
3346		case IGMP_LAZY_MEMBER:
3347		case IGMP_SLEEPING_MEMBER:
3348		case IGMP_AWAKENING_MEMBER:
3349			inm->inm_state = IGMP_REPORTING_MEMBER;
3350			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
3351			    inm, 0, 0, 0);
3352			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
3353			    __func__, retval);
3354			break;
3355		case IGMP_G_QUERY_PENDING_MEMBER:
3356		case IGMP_SG_QUERY_PENDING_MEMBER:
3357		case IGMP_LEAVING_MEMBER:
3358			break;
3359		}
3360	}
3361	IF_ADDR_UNLOCK(ifp);
3362
3363	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
3364	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
3365
3366	/*
3367	 * Slew transmission of bursts over 500ms intervals.
3368	 */
3369	if (igi->igi_gq.ifq_head != NULL) {
3370		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
3371		    IGMP_RESPONSE_BURST_INTERVAL);
3372		V_interface_timers_running = 1;
3373	}
3374}
3375
3376/*
3377 * Transmit the next pending IGMP message in the output queue.
3378 *
3379 * We get called from netisr_processqueue(). A mutex private to igmpoq
3380 * will be acquired and released around this routine.
3381 *
3382 * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
3383 * MRT: Nothing needs to be done, as IGMP traffic is always local to
3384 * a link and uses a link-scope multicast address.
3385 */
3386static void
3387igmp_intr(struct mbuf *m)
3388{
3389	struct ip_moptions	 imo;
3390	struct ifnet		*ifp;
3391	struct mbuf		*ipopts, *m0;
3392	int			 error;
3393	uint32_t		 ifindex;
3394
3395	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
3396
3397	/*
3398	 * Restore VNET image pointer from enqueued mbuf chain
3399	 * before doing anything else. Whilst we use interface
3400	 * indexes to guard against interface detach, they are
3401	 * unique to each VIMAGE and must be retrieved.
3402	 */
3403	CURVNET_SET(m->m_pkthdr.header);
3404	ifindex = igmp_restore_context(m);
3405
3406	/*
3407	 * Check if the ifnet still exists. This limits the scope of
3408	 * any race in the absence of a global ifp lock for low cost
3409	 * (an array lookup).
3410	 */
3411	ifp = ifnet_byindex(ifindex);
3412	if (ifp == NULL) {
3413		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
3414		    __func__, m, ifindex);
3415		m_freem(m);
3416		V_ipstat.ips_noroute++;
3417		goto out;
3418	}
3419
3420	ipopts = V_igmp_sendra ? m_raopt : NULL;
3421
3422	imo.imo_multicast_ttl  = 1;
3423	imo.imo_multicast_vif  = -1;
3424	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
3425
3426	/*
3427	 * If the user requested that IGMP traffic be explicitly
3428	 * redirected to the loopback interface (e.g. they are running a
3429	 * MANET interface and the routing protocol needs to see the
3430	 * updates), handle this now.
3431	 */
3432	if (m->m_flags & M_IGMP_LOOP)
3433		imo.imo_multicast_ifp = V_loif;
3434	else
3435		imo.imo_multicast_ifp = ifp;
3436
3437	if (m->m_flags & M_IGMPV2) {
3438		m0 = m;
3439	} else {
3440		m0 = igmp_v3_encap_report(ifp, m);
3441		if (m0 == NULL) {
3442			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
3443			m_freem(m);
3444			V_ipstat.ips_odropped++;
3445			goto out;
3446		}
3447	}
3448
3449	igmp_scrub_context(m0);
3450	m->m_flags &= ~(M_PROTOFLAGS);
3451	m0->m_pkthdr.rcvif = V_loif;
3452#ifdef MAC
3453	mac_netinet_igmp_send(ifp, m0);
3454#endif
3455	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
3456	if (error) {
3457		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
3458		goto out;
3459	}
3460
3461	++V_igmpstat.igps_snd_reports;
3462
3463out:
3464	/*
3465	 * We must restore the existing vnet pointer before
3466	 * continuing as we are run from netisr context.
3467	 */
3468	CURVNET_RESTORE();
3469}
3470
3471/*
3472 * Encapsulate an IGMPv3 report.
3473 *
3474 * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
3475 * chain has already had its IP/IGMPv3 header prepended. In this case
3476 * the function will not attempt to prepend; the lengths and checksums
3477 * will however be re-computed.
3478 *
3479 * Returns a pointer to the new mbuf chain head, or NULL if the
3480 * allocation failed.
3481 */
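/*
 * Illustrative sizing: the prepended header is sizeof(struct ip) +
 * sizeof(struct igmp_report) = 20 + 8 = 28 bytes; the recomputed
 * checksum covers the report header and all group records, but not
 * the IP header.
 */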
3482static struct mbuf *
3483igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
3484{
3485	INIT_VNET_NET(curvnet);
3486	INIT_VNET_INET(curvnet);
3487	struct igmp_report	*igmp;
3488	struct ip		*ip;
3489	int			 hdrlen, igmpreclen;
3490
3491	KASSERT((m->m_flags & M_PKTHDR),
3492	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
3493
3494	igmpreclen = m_length(m, NULL);
3495	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
3496
3497	if (m->m_flags & M_IGMPV3_HDR) {
3498		igmpreclen -= hdrlen;
3499	} else {
3500		M_PREPEND(m, hdrlen, M_DONTWAIT);
3501		if (m == NULL)
3502			return (NULL);
3503		m->m_flags |= M_IGMPV3_HDR;
3504	}
3505
3506	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
3507
3508	m->m_data += sizeof(struct ip);
3509	m->m_len -= sizeof(struct ip);
3510
3511	igmp = mtod(m, struct igmp_report *);
3512	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
3513	igmp->ir_rsv1 = 0;
3514	igmp->ir_rsv2 = 0;
3515	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
3516	igmp->ir_cksum = 0;
3517	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
3518	m->m_pkthdr.PH_vt.vt_nrecs = 0;
3519
3520	m->m_data -= sizeof(struct ip);
3521	m->m_len += sizeof(struct ip);
3522
3523	ip = mtod(m, struct ip *);
3524	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
3525	ip->ip_len = hdrlen + igmpreclen;
3526	ip->ip_off = IP_DF;
3527	ip->ip_p = IPPROTO_IGMP;
3528	ip->ip_sum = 0;
3529
3530	ip->ip_src.s_addr = INADDR_ANY;
3531
3532	if (m->m_flags & M_IGMP_LOOP) {
3533		struct in_ifaddr *ia;
3534
3535		IFP_TO_IA(ifp, ia);
3536		if (ia != NULL)
3537			ip->ip_src = ia->ia_addr.sin_addr;
3538	}
3539
3540	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
3541
3542	return (m);
3543}
3544
3545#ifdef KTR
3546static char *
3547igmp_rec_type_to_str(const int type)
3548{
3549
3550	switch (type) {
3551	case IGMP_CHANGE_TO_EXCLUDE_MODE:
3552		return "TO_EX";
3554	case IGMP_CHANGE_TO_INCLUDE_MODE:
3555		return "TO_IN";
3557	case IGMP_MODE_IS_EXCLUDE:
3558		return "MODE_EX";
3560	case IGMP_MODE_IS_INCLUDE:
3561		return "MODE_IN";
3563	case IGMP_ALLOW_NEW_SOURCES:
3564		return "ALLOW_NEW";
3566	case IGMP_BLOCK_OLD_SOURCES:
3567		return "BLOCK_OLD";
3569	default:
3570		break;
3571	}
3572	return "unknown";
3573}
3574#endif
3575
3576static void
3577igmp_sysinit(void)
3578{
3579
3580	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3581
3582	IGMP_LOCK_INIT();
3583	TUNABLE_INT_FETCH("debug.mpsafeigmp", &mpsafe_igmp);
3584
3585	mtx_init(&igmpoq.ifq_mtx, "igmpoq_mtx", NULL, MTX_DEF);
3586	IFQ_SET_MAXLEN(&igmpoq, IFQ_MAXLEN);
3587
3588	m_raopt = igmp_ra_alloc();
3589
3590#if __FreeBSD_version < 800000
3591	netisr_register(NETISR_IGMP, igmp_intr, &igmpoq,
3592	    mpsafe_igmp ? NETISR_MPSAFE : 0);
3593#else
3594	netisr_register(NETISR_IGMP, igmp_intr, &igmpoq,
3595	    mpsafe_igmp ? 0 : NETISR_FORCEQUEUE);
3596#endif
3597}
3598
3599static void
3600igmp_sysuninit(void)
3601{
3602
3603	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3604
3605	netisr_unregister(NETISR_IGMP);
3606	mtx_destroy(&igmpoq.ifq_mtx);
3607
3608	m_free(m_raopt);
3609	m_raopt = NULL;
3610
3611	IGMP_LOCK_DESTROY();
3612}
3613
3614/*
3615 * Initialize an IGMPv3 instance.
3616 * VIMAGE: Assumes curvnet set by caller and called per vimage.
3617 */
3618static int
3619vnet_igmp_iattach(const void *unused __unused)
3620{
3621	INIT_VNET_INET(curvnet);
3622
3623	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3624
3625	LIST_INIT(&V_igi_head);
3626
3627	V_current_state_timers_running = 0;
3628	V_state_change_timers_running = 0;
3629	V_interface_timers_running = 0;
3630
3631	/*
3632	 * Initialize sysctls to default values.
3633	 */
3634	V_igmp_recvifkludge = 1;
3635	V_igmp_sendra = 1;
3636	V_igmp_sendlocal = 1;
3637	V_igmp_v1enable = 1;
3638	V_igmp_v2enable = 1;
3639	V_igmp_legacysupp = 0;
3640	V_igmp_default_version = IGMP_VERSION_3;
3641	V_igmp_gsrdelay.tv_sec = 10;
3642	V_igmp_gsrdelay.tv_usec = 0;
3643
3644	memset(&V_igmpstat, 0, sizeof(struct igmpstat));
3645	V_igmpstat.igps_version = IGPS_VERSION_3;
3646	V_igmpstat.igps_len = sizeof(struct igmpstat);
3647
3648	return (0);
3649}
3650
3651static int
3652vnet_igmp_idetach(const void *unused __unused)
3653{
3654	INIT_VNET_INET(curvnet);
3655
3656	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3657
3658	KASSERT(LIST_EMPTY(&V_igi_head),
3659	    ("%s: igi list not empty; ifnets not detached?", __func__));
3660
3661	return (0);
3662}
3663
3664#ifdef VIMAGE
3665static struct vnet_symmap vnet_igmp_symmap[] = {
3666	VNET_SYMMAP(igmp, igi_head),
3667	VNET_SYMMAP(igmp, igmpstat),
3668	VNET_SYMMAP_END
3669};
3670VNET_MOD_DECLARE(IGMP, igmp, vnet_igmp_iattach, vnet_igmp_idetach,
3671    vnet_igmp_symmap);
3672#endif /* VIMAGE */
3673
3674static int
3675igmp_modevent(module_t mod, int type, void *unused __unused)
3676{
3677
3678    switch (type) {
3679    case MOD_LOAD:
3680	igmp_sysinit();
3681#ifdef VIMAGE
3682	vnet_mod_register(&vnet_igmp_modinfo);
3683#else
3684	(void)vnet_igmp_iattach(NULL);
3685#endif /* VIMAGE */
3686	break;
3687    case MOD_UNLOAD:
3688#ifdef VIMAGE
3689	/*
3690	 * TODO: Allow module unload if any VIMAGE instances
3691	 * are using this module.
3692	 */
3693	return (EBUSY);
3694#else
3695	(void)vnet_igmp_idetach(NULL);
3696#endif /* VIMAGE */
3697	igmp_sysuninit();
3698	break;
3699    default:
3700	return (EOPNOTSUPP);
3701    }
3702    return (0);
3703}
3704
3705static moduledata_t igmp_mod = {
3706    "igmp",
3707    igmp_modevent,
3708    0
3709};
3710DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
3711