igmp.c revision 227293
/*-
 * Copyright (c) 2007-2009 Bruce Simpson.
 * Copyright (c) 1988 Stephen Deering.
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Stephen Deering of Stanford University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
 */

/*
 * Internet Group Management Protocol (IGMP) routines.
 * [RFC1112, RFC2236, RFC3376]
 *
 * Written by Steve Deering, Stanford, May 1988.
 * Modified by Rosen Sharma, Stanford, Aug 1994.
 * Modified by Bill Fenner, Xerox PARC, Feb 1995.
 * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
 * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
 *
 * MULTICAST Revision: 3.5.1.4
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 227293 2011-11-07 06:44:47Z ed $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/ktr.h>
#include <sys/condvar.h>

#include <net/if.h>
#include <net/netisr.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/igmp.h>
#include <netinet/igmp_var.h>

#include <machine/in_cksum.h>

#include <security/mac/mac_framework.h>

#ifndef KTR_IGMPV3
#define KTR_IGMPV3 KTR_INET
#endif

static struct igmp_ifinfo *
		igi_alloc_locked(struct ifnet *);
static void	igi_delete_locked(const struct ifnet *);
static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
static void	igmp_fasttimo_vnet(void);
static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
static int	igmp_handle_state_change(struct in_multi *,
		    struct igmp_ifinfo *);
static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
		    const struct igmp *);
static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
		    const struct igmp *);
static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
		    /*const*/ struct igmpv3 *);
static int	igmp_input_v3_group_query(struct in_multi *,
		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
		    /*const*/ struct igmp *);
static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
		    /*const*/ struct igmp *);
static void	igmp_intr(struct mbuf *);
static int	igmp_isgroupreported(const struct in_addr);
static struct mbuf *
		igmp_ra_alloc(void);
#ifdef KTR
static char *	igmp_rec_type_to_str(const int);
#endif
static void	igmp_set_version(struct igmp_ifinfo *, const int);
static void	igmp_slowtimo_vnet(void);
static int	igmp_v1v2_queue_report(struct in_multi *, const int);
static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
static void	igmp_v2_update_group(struct in_multi *, const int);
static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
static struct mbuf *
		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
static int	igmp_v3_enqueue_group_record(struct ifqueue *,
		    struct in_multi *, const int, const int, const int);
static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
		    struct in_multi *);
static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
		    struct ifqueue *, struct ifqueue *, struct in_multi *,
		    const int);
static int	igmp_v3_merge_state_changes(struct in_multi *,
		    struct ifqueue *);
static void	igmp_v3_suppress_group_record(struct in_multi *);
static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);

static const struct netisr_handler igmp_nh = {
	.nh_name = "igmp",
	.nh_handler = igmp_intr,
	.nh_proto = NETISR_IGMP,
	.nh_policy = NETISR_POLICY_SOURCE,
};

/*
 * System-wide globals.
 *
 * Unlocked access to these is OK, except for the global IGMP output
 * queue. The IGMP subsystem lock ends up being system-wide for the moment,
 * because all VIMAGEs have to share a global output queue, as netisrs
 * themselves are not virtualized.
 *
 * Locking:
 *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
 *    Any may be taken independently; if any are held at the same
 *    time, the above lock order must be followed.
 *  * All output is delegated to the netisr.
 *    Now that Giant has been eliminated, the netisr may be inlined.
 *  * IN_MULTI_LOCK covers in_multi.
 *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
 *    including the output queue.
 *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
 *    per-link state iterators.
 *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
 *    therefore it is not refcounted.
 *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
 *
 * Reference counting
 *  * IGMP acquires its own reference every time an in_multi is passed to
 *    it and the group is being joined for the first time.
 *  * IGMP releases its reference(s) on in_multi in a deferred way,
 *    because the operations which process the release run as part of
 *    a loop whose control variables are directly affected by the release
 *    (that, and not recursing on the IF_ADDR_LOCK).
 *
 * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
 * to a vnet in ifp->if_vnet.
 *
 * SMPng: XXX We may potentially race operations on ifma_protospec.
 * The problem is that we currently lack a clean way of taking the
 * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
 * as anything which modifies ifma needs to be covered by that lock.
 * So check for ifma_protospec being NULL before proceeding.
 */
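/*
 * Illustrative sketch only of the lock order described above; a code
 * path which needs all three locks nests them the same way the query
 * input routines below do:
 *
 *	IN_MULTI_LOCK();
 *	IGMP_LOCK();
 *	IF_ADDR_LOCK(ifp);
 *	...
 *	IF_ADDR_UNLOCK(ifp);
 *	IGMP_UNLOCK();
 *	IN_MULTI_UNLOCK();
 */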
struct mtx		 igmp_mtx;

struct mbuf		*m_raopt;		 /* Router Alert option */
static MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");

/*
 * VIMAGE-wide globals.
 *
 * The IGMPv3 timers themselves need to run per-image, however,
 * protosw timers run globally (see tcp).
 * An ifnet can only be in one vimage at a time, and the loopback
 * ifnet, loif, is itself virtualized.
 * It would otherwise be possible to seriously hose IGMP state,
 * and create inconsistencies in upstream multicast routing, if you have
 * multiple VIMAGEs running on the same link joining different multicast
 * groups, UNLESS the "primary IP address" is different. This is because
 * IGMP for IPv4 does not force link-local addresses to be used for each
 * node, unlike MLD for IPv6.
 * Obviously the IGMPv3 per-interface state has per-vimage granularity
 * also as a result.
 *
 * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
 * policy to control the address used by IGMP on the link.
 */
static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
							 * query response */
static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
							 * retransmit */
static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
							 * report; IGMPv3 g/sg
							 * query response */

#define	V_interface_timers_running	VNET(interface_timers_running)
#define	V_state_change_timers_running	VNET(state_change_timers_running)
#define	V_current_state_timers_running	VNET(current_state_timers_running)

static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
static VNET_DEFINE(struct igmpstat, igmpstat) = {
	.igps_version = IGPS_VERSION_3,
	.igps_len = sizeof(struct igmpstat),
};
static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};

#define	V_igi_head			VNET(igi_head)
#define	V_igmpstat			VNET(igmpstat)
#define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)

static VNET_DEFINE(int, igmp_recvifkludge) = 1;
static VNET_DEFINE(int, igmp_sendra) = 1;
static VNET_DEFINE(int, igmp_sendlocal) = 1;
static VNET_DEFINE(int, igmp_v1enable) = 1;
static VNET_DEFINE(int, igmp_v2enable) = 1;
static VNET_DEFINE(int, igmp_legacysupp);
static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;

#define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
#define	V_igmp_sendra			VNET(igmp_sendra)
#define	V_igmp_sendlocal		VNET(igmp_sendlocal)
#define	V_igmp_v1enable			VNET(igmp_v1enable)
#define	V_igmp_v2enable			VNET(igmp_v2enable)
#define	V_igmp_legacysupp		VNET(igmp_legacysupp)
#define	V_igmp_default_version		VNET(igmp_default_version)

/*
 * Virtualized sysctls.
 */
SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
    &VNET_NAME(igmpstat), igmpstat, "");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
    &VNET_NAME(igmp_recvifkludge), 0,
    "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
    &VNET_NAME(igmp_sendra), 0,
    "Send IP Router Alert option in IGMPv2/v3 messages");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
    &VNET_NAME(igmp_sendlocal), 0,
    "Send IGMP membership reports for 224.0.0.0/24 groups");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
    &VNET_NAME(igmp_v1enable), 0,
    "Enable backwards compatibility with IGMPv1");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
    &VNET_NAME(igmp_v2enable), 0,
    "Enable backwards compatibility with IGMPv2");
SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
    &VNET_NAME(igmp_legacysupp), 0,
    "Allow v1/v2 reports to suppress v3 group responses");
SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
    "Default version of IGMP to run on each interface");
SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
    "Rate limit for IGMPv3 Group-and-Source queries in seconds");

/*
 * Non-virtualized sysctls.
 */
SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE,
    sysctl_igmp_ifinfo, "Per-interface IGMPv3 state");

static __inline void
igmp_save_context(struct mbuf *m, struct ifnet *ifp)
{

#ifdef VIMAGE
	m->m_pkthdr.header = ifp->if_vnet;
#endif /* VIMAGE */
	m->m_pkthdr.flowid = ifp->if_index;
}

static __inline void
igmp_scrub_context(struct mbuf *m)
{

	m->m_pkthdr.header = NULL;
	m->m_pkthdr.flowid = 0;
}

#ifdef KTR
static __inline char *
inet_ntoa_haddr(in_addr_t haddr)
{
	struct in_addr ia;

	ia.s_addr = htonl(haddr);
	return (inet_ntoa(ia));
}
#endif

/*
 * Restore context from a queued IGMP output chain.
 * Return saved ifindex.
 *
 * VIMAGE: The assertion is there to make sure that we
 * actually called CURVNET_SET() with what's in the mbuf chain.
 */
static __inline uint32_t
igmp_restore_context(struct mbuf *m)
{

#ifdef notyet
#if defined(VIMAGE) && defined(INVARIANTS)
	KASSERT(curvnet == (m->m_pkthdr.header),
	    ("%s: called when curvnet was not restored", __func__));
#endif
#endif
	return (m->m_pkthdr.flowid);
}

/*
 * Retrieve or set default IGMP version.
 *
 * VIMAGE: Assume curvnet set by caller.
 * SMPng: NOTE: Serialized by IGMP lock.
 */
static int
sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
{
	int	 error;
	int	 new;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error)
		return (error);

	IGMP_LOCK();

	new = V_igmp_default_version;

	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error || !req->newptr)
		goto out_locked;

	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
		error = EINVAL;
		goto out_locked;
	}

	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
	     V_igmp_default_version, new);

	V_igmp_default_version = new;

out_locked:
	IGMP_UNLOCK();
	return (error);
}

/*
 * Retrieve or set threshold between group-source queries in seconds.
 *
 * VIMAGE: Assume curvnet set by caller.
 * SMPng: NOTE: Serialized by IGMP lock.
 */
static int
sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
{
	int error;
	int i;

	error = sysctl_wire_old_buffer(req, sizeof(int));
	if (error)
		return (error);

	IGMP_LOCK();

	i = V_igmp_gsrdelay.tv_sec;

	error = sysctl_handle_int(oidp, &i, 0, req);
	if (error || !req->newptr)
		goto out_locked;

	if (i < -1 || i >= 60) {
		error = EINVAL;
		goto out_locked;
	}

	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
	     V_igmp_gsrdelay.tv_sec, i);
	V_igmp_gsrdelay.tv_sec = i;

out_locked:
	IGMP_UNLOCK();
	return (error);
}

/*
 * Expose struct igmp_ifinfo to userland, keyed by ifindex.
 * For use by ifmcstat(8).
 *
 * SMPng: NOTE: Does an unlocked ifindex space read.
 * VIMAGE: Assume curvnet set by caller. The node handler itself
 * is not directly virtualized.
 */
static int
sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
{
	int			*name;
	int			 error;
	u_int			 namelen;
	struct ifnet		*ifp;
	struct igmp_ifinfo	*igi;

	name = (int *)arg1;
	namelen = arg2;

	if (req->newptr != NULL)
		return (EPERM);

	if (namelen != 1)
		return (EINVAL);

	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
	if (error)
		return (error);

	IN_MULTI_LOCK();
	IGMP_LOCK();

	if (name[0] <= 0 || name[0] > V_if_index) {
		error = ENOENT;
		goto out_locked;
	}

	error = ENOENT;

	ifp = ifnet_byindex(name[0]);
	if (ifp == NULL)
		goto out_locked;

	LIST_FOREACH(igi, &V_igi_head, igi_link) {
		if (ifp == igi->igi_ifp) {
			error = SYSCTL_OUT(req, igi,
			    sizeof(struct igmp_ifinfo));
			break;
		}
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();
	return (error);
}

/*
 * Dispatch an entire queue of pending packet chains
 * using the netisr.
 * VIMAGE: Assumes the vnet pointer has been set.
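 * A limit of 0 effectively means no limit: starting from zero, the
 * post-decrement test below cannot stop the loop early, so the queue
 * is drained completely (this is how the fasttimo path calls it).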
 */
static void
igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
{
	struct mbuf *m;

	for (;;) {
		_IF_DEQUEUE(ifq, m);
		if (m == NULL)
			break;
		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, m, ifq);
		if (loop)
			m->m_flags |= M_IGMP_LOOP;
		netisr_dispatch(NETISR_IGMP, m);
		if (--limit == 0)
			break;
	}
}

/*
 * Filter outgoing IGMP report state by group.
 *
 * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
 * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
 * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
 * this may break certain IGMP snooping switches which rely on the old
 * report behaviour.
 *
 * Return zero if the given group is one for which IGMP reports
 * should be suppressed, or non-zero if reports should be issued.
 */
static __inline int
igmp_isgroupreported(const struct in_addr addr)
{

	if (in_allhosts(addr) ||
	    ((!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr)))))
		return (0);

	return (1);
}

/*
 * Construct a Router Alert option to use in outgoing packets.
 */
static struct mbuf *
igmp_ra_alloc(void)
{
	struct mbuf	*m;
	struct ipoption	*p;

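	/*
	 * Note: per RFC 2113 the Router Alert option is four octets:
	 * type 0x94 (IPOPT_RA), length 4, and a two-octet value of
	 * zero. The EOL and pad bytes written below double as that
	 * zero-valued field.
	 */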
	MGET(m, M_DONTWAIT, MT_DATA);
	p = mtod(m, struct ipoption *);
	p->ipopt_dst.s_addr = INADDR_ANY;
	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
	p->ipopt_list[3] = 0x00;	/* pad byte */
	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];

	return (m);
}

/*
 * Attach IGMP when PF_INET is attached to an interface.
 */
struct igmp_ifinfo *
igmp_domifattach(struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	IGMP_LOCK();

	igi = igi_alloc_locked(ifp);
	if (!(ifp->if_flags & IFF_MULTICAST))
		igi->igi_flags |= IGIF_SILENT;

	IGMP_UNLOCK();

	return (igi);
}

/*
 * VIMAGE: assume curvnet set by caller.
 */
static struct igmp_ifinfo *
igi_alloc_locked(/*const*/ struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	IGMP_LOCK_ASSERT();

	igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
	if (igi == NULL)
		goto out;

	igi->igi_ifp = ifp;
	igi->igi_version = V_igmp_default_version;
	igi->igi_flags = 0;
	igi->igi_rv = IGMP_RV_INIT;
	igi->igi_qi = IGMP_QI_INIT;
	igi->igi_qri = IGMP_QRI_INIT;
	igi->igi_uri = IGMP_URI_INIT;

	SLIST_INIT(&igi->igi_relinmhead);

	/*
	 * Responses to general queries are subject to bounds.
	 */
	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);

	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);

	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
	     ifp, ifp->if_xname);

out:
	return (igi);
}

/*
 * Hook for ifdetach.
 *
 * NOTE: Some finalization tasks need to run before the protocol domain
 * is detached, but also before the link layer does its cleanup.
 *
 * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
 * XXX This is also bitten by unlocked ifma_protospec access.
 */
void
igmp_ifdetach(struct ifnet *ifp)
{
	struct igmp_ifinfo	*igi;
	struct ifmultiaddr	*ifma;
	struct in_multi		*inm, *tinm;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
	    ifp->if_xname);

	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	if (igi->igi_version == IGMP_VERSION_3) {
		IF_ADDR_LOCK(ifp);
		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_INET ||
			    ifma->ifma_protospec == NULL)
				continue;
#if 0
			KASSERT(ifma->ifma_protospec != NULL,
			    ("%s: ifma_protospec is NULL", __func__));
#endif
			inm = (struct in_multi *)ifma->ifma_protospec;
			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
				    inm, inm_nrele);
			}
			inm_clear_recorded(inm);
		}
		IF_ADDR_UNLOCK(ifp);
		/*
		 * Free the in_multi reference(s) for this IGMP lifecycle.
		 */
		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
		    tinm) {
			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
			inm_release_locked(inm);
		}
	}

	IGMP_UNLOCK();
}

/*
 * Hook for domifdetach.
 */
void
igmp_domifdetach(struct ifnet *ifp)
{
	struct igmp_ifinfo *igi;

	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	igi_delete_locked(ifp);

	IGMP_UNLOCK();
}

static void
igi_delete_locked(const struct ifnet *ifp)
{
	struct igmp_ifinfo *igi, *tigi;

	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
	    __func__, ifp, ifp->if_xname);

	IGMP_LOCK_ASSERT();

	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
		if (igi->igi_ifp == ifp) {
			/*
			 * Free deferred General Query responses.
			 */
			_IF_DRAIN(&igi->igi_gq);

			LIST_REMOVE(igi, igi_link);

			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
			    ("%s: there are dangling in_multi references",
			    __func__));

			free(igi, M_IGMP);
			return;
		}
	}

#ifdef INVARIANTS
	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__,  ifp);
#endif
}

/*
 * Process a received IGMPv1 query.
 * Return non-zero if the message should be dropped.
 *
 * VIMAGE: The curvnet pointer is derived from the input ifp.
 */
static int
igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
    const struct igmp *igmp)
{
	struct ifmultiaddr	*ifma;
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;

	/*
	 * IGMPv1 Host Membership Queries SHOULD always be addressed to
	 * 224.0.0.1. They are always treated as General Queries.
	 * igmp_group is always ignored. Do not drop it as a userland
	 * daemon may wish to see it.
	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
	 */
	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
		IGMPSTAT_INC(igps_rcv_badqueries);
		return (0);
	}
	IGMPSTAT_INC(igps_rcv_gen_queries);

	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	/*
	 * Switch to IGMPv1 host compatibility mode.
	 */
	igmp_set_version(igi, IGMP_VERSION_1);

	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);

	/*
	 * Start the timers in all of our group records
	 * for the interface on which the query arrived,
	 * except those which are already running.
	 */
	IF_ADDR_LOCK(ifp);
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (ifma->ifma_addr->sa_family != AF_INET ||
		    ifma->ifma_protospec == NULL)
			continue;
		inm = (struct in_multi *)ifma->ifma_protospec;
		if (inm->inm_timer != 0)
			continue;
		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_REPORTING_MEMBER:
		case IGMP_IDLE_MEMBER:
		case IGMP_LAZY_MEMBER:
		case IGMP_SLEEPING_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			inm->inm_state = IGMP_REPORTING_MEMBER;
			inm->inm_timer = IGMP_RANDOM_DELAY(
			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
			V_current_state_timers_running = 1;
			break;
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}
	IF_ADDR_UNLOCK(ifp);

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv2 general or group-specific query.
 */
static int
igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
    const struct igmp *igmp)
{
	struct ifmultiaddr	*ifma;
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;
	int			 is_general_query;
	uint16_t		 timer;

	is_general_query = 0;

	/*
	 * Validate address fields upfront.
	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
	 */
	if (in_nullhost(igmp->igmp_group)) {
		/*
		 * IGMPv2 General Query.
		 * If this was not sent to the all-hosts group, ignore it.
		 */
		if (!in_allhosts(ip->ip_dst))
			return (0);
		IGMPSTAT_INC(igps_rcv_gen_queries);
		is_general_query = 1;
	} else {
		/* IGMPv2 Group-Specific Query. */
		IGMPSTAT_INC(igps_rcv_group_queries);
	}

	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	/*
	 * Ignore v2 query if in v1 Compatibility Mode.
	 */
	if (igi->igi_version == IGMP_VERSION_1)
		goto out_locked;

	igmp_set_version(igi, IGMP_VERSION_2);

	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
	if (timer == 0)
		timer = 1;

	if (is_general_query) {
		/*
		 * For each reporting group joined on this
		 * interface, kick the report timer.
		 */
		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
		    ifp, ifp->if_xname);
		IF_ADDR_LOCK(ifp);
		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_INET ||
			    ifma->ifma_protospec == NULL)
				continue;
			inm = (struct in_multi *)ifma->ifma_protospec;
			igmp_v2_update_group(inm, timer);
		}
		IF_ADDR_UNLOCK(ifp);
	} else {
		/*
		 * Group-specific IGMPv2 query, we need only
		 * look up the single group to process it.
		 */
		inm = inm_lookup(ifp, igmp->igmp_group);
		if (inm != NULL) {
			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			igmp_v2_update_group(inm, timer);
		}
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Update the report timer on a group in response to an IGMPv2 query.
 *
 * If we are becoming the reporting member for this group, start the timer.
 * If we already are the reporting member for this group, and timer is
 * below the threshold, reset it.
 *
 * We may be updating the group for the first time since we switched
 * to IGMPv3. If we are, then we must clear any recorded source lists,
 * and transition to REPORTING state; the group timer is overloaded
 * for group and group-source query responses.
 *
 * Unlike IGMPv3, the delay per group should be jittered
 * to avoid bursts of IGMPv2 reports.
 */
static void
igmp_v2_update_group(struct in_multi *inm, const int timer)
{

	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);

	IN_MULTI_LOCK_ASSERT();

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
		break;
	case IGMP_REPORTING_MEMBER:
		if (inm->inm_timer != 0 &&
		    inm->inm_timer <= timer) {
			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
			    "skipping.", __func__);
			break;
		}
		/* FALLTHROUGH */
	case IGMP_SG_QUERY_PENDING_MEMBER:
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
		inm->inm_state = IGMP_REPORTING_MEMBER;
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		break;
	case IGMP_SLEEPING_MEMBER:
		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
		inm->inm_state = IGMP_AWAKENING_MEMBER;
		break;
	case IGMP_LEAVING_MEMBER:
		break;
	}
}

/*
 * Process a received IGMPv3 general, group-specific or
 * group-and-source-specific query.
 * Assumes m has already been pulled up to the full IGMP message length.
 * Return 0 if successful, otherwise an appropriate error code is returned.
 */
static int
igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
    /*const*/ struct igmpv3 *igmpv3)
{
	struct igmp_ifinfo	*igi;
	struct in_multi		*inm;
	int			 is_general_query;
	uint32_t		 maxresp, nsrc, qqi;
	uint16_t		 timer;
	uint8_t			 qrv;

	is_general_query = 0;

	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);

	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
	if (maxresp >= 128) {
		maxresp = (IGMP_MANT(igmpv3->igmp_code) | 0x10) <<
			  (IGMP_EXP(igmpv3->igmp_code) + 3);
	}
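	/*
	 * The "| 0x10" restores the implicit high bit of the mantissa
	 * in the RFC 3376 Section 4.1.1 exponential encoding; the QQIC
	 * field below uses the same encoding. Worked example: a Max
	 * Resp Code of 0x80 (exp 0, mant 0) decodes to
	 * (0x0 | 0x10) << 3 = 128 tenths of a second, and 0xff decodes
	 * to 0x1f << 10 = 31744 tenths.
	 */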

	/*
	 * Robustness must never be less than 2 for on-wire IGMPv3.
	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
	 * an exception for interfaces whose IGMPv3 state changes
	 * are redirected to loopback (e.g. MANET).
	 */
	qrv = IGMP_QRV(igmpv3->igmp_misc);
	if (qrv < 2) {
		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
		    qrv, IGMP_RV_INIT);
		qrv = IGMP_RV_INIT;
	}

	qqi = igmpv3->igmp_qqi;
	if (qqi >= 128) {
		qqi = (IGMP_MANT(igmpv3->igmp_qqi) | 0x10) <<
		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
	}

	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
	if (timer == 0)
		timer = 1;

	nsrc = ntohs(igmpv3->igmp_numsrc);

	/*
	 * Validate address fields and versions upfront before
	 * accepting v3 query.
	 * XXX SMPng: Unlocked access to igmpstat counters here.
	 */
	if (in_nullhost(igmpv3->igmp_group)) {
		/*
		 * IGMPv3 General Query.
		 *
		 * General Queries SHOULD be directed to 224.0.0.1.
		 * A general query with a source list has undefined
		 * behaviour; discard it.
		 */
		IGMPSTAT_INC(igps_rcv_gen_queries);
		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
			IGMPSTAT_INC(igps_rcv_badqueries);
			return (0);
		}
		is_general_query = 1;
	} else {
		/* Group or group-source specific query. */
		if (nsrc == 0)
			IGMPSTAT_INC(igps_rcv_group_queries);
		else
			IGMPSTAT_INC(igps_rcv_gsr_queries);
	}

	IN_MULTI_LOCK();
	IGMP_LOCK();

	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));

	if (igi->igi_flags & IGIF_LOOPBACK) {
		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
		    ifp, ifp->if_xname);
		goto out_locked;
	}

	/*
	 * Discard the v3 query if we're in Compatibility Mode.
	 * The RFC is not obviously worded that hosts need to stay in
	 * compatibility mode until the Old Version Querier Present
	 * timer expires.
	 */
	if (igi->igi_version != IGMP_VERSION_3) {
		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
		    igi->igi_version, ifp, ifp->if_xname);
		goto out_locked;
	}

	igmp_set_version(igi, IGMP_VERSION_3);
	igi->igi_rv = qrv;
	igi->igi_qi = qqi;
	igi->igi_qri = maxresp;

	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
	    maxresp);

	if (is_general_query) {
		/*
		 * Schedule a current-state report on this ifp for
		 * all groups, possibly containing source lists.
		 * If there is a pending General Query response
		 * scheduled earlier than the selected delay, do
		 * not schedule any other reports.
		 * Otherwise, reset the interface timer.
		 */
		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
		    ifp, ifp->if_xname);
		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
			V_interface_timers_running = 1;
		}
	} else {
		/*
		 * Group-source-specific queries are throttled on
		 * a per-group basis to defeat denial-of-service attempts.
		 * Queries for groups we are not a member of on this
		 * link are simply ignored.
		 */
		inm = inm_lookup(ifp, igmpv3->igmp_group);
		if (inm == NULL)
			goto out_locked;
		if (nsrc > 0) {
			if (!ratecheck(&inm->inm_lastgsrtv,
			    &V_igmp_gsrdelay)) {
				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
				    __func__);
				IGMPSTAT_INC(igps_drop_gsr_queries);
				goto out_locked;
			}
		}
		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
		/*
		 * If there is a pending General Query response
		 * scheduled sooner than the selected delay, no
		 * further report need be scheduled.
		 * Otherwise, prepare to respond to the
		 * group-specific or group-and-source query.
		 */
		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv3 group-specific or group-and-source-specific
 * query.
 * Return <0 if any error occurred. Currently this is ignored.
 */
static int
igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
    int timer, /*const*/ struct igmpv3 *igmpv3)
{
	int			 retval;
	uint16_t		 nsrc;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

	retval = 0;

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LEAVING_MEMBER:
		return (retval);
		break;
	case IGMP_REPORTING_MEMBER:
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
		break;
	}

	nsrc = ntohs(igmpv3->igmp_numsrc);

	/*
	 * Deal with group-specific queries upfront.
	 * If any group query is already pending, purge any recorded
	 * source-list state if it exists, and schedule a query response
	 * for this group-specific query.
	 */
	if (nsrc == 0) {
		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
			inm_clear_recorded(inm);
			timer = min(inm->inm_timer, timer);
		}
		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		return (retval);
	}

	/*
	 * Deal with the case where a group-and-source-specific query has
	 * been received but a group-specific query is already pending.
	 */
	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
		timer = min(inm->inm_timer, timer);
		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
		V_current_state_timers_running = 1;
		return (retval);
	}

	/*
	 * Finally, deal with the case where a group-and-source-specific
	 * query has been received, where a response to a previous g-s-r
	 * query exists, or none exists.
	 * In this case, we need to parse the source-list which the Querier
	 * has provided us with and check if we have any source list filter
	 * entries at T1 for these sources. If we do not, there is no need
	 * to schedule a report and the query may be dropped.
	 * If we do, we must record them and schedule a current-state
	 * report for those sources.
	 * FIXME: Handling source lists larger than 1 mbuf requires that
	 * we pass the mbuf chain pointer down to this function, and use
	 * m_getptr() to walk the chain.
	 */
	if (inm->inm_nsrc > 0) {
		const struct in_addr	*ap;
		int			 i, nrecorded;

		ap = (const struct in_addr *)(igmpv3 + 1);
		nrecorded = 0;
		for (i = 0; i < nsrc; i++, ap++) {
			retval = inm_record_source(inm, ap->s_addr);
			if (retval < 0)
				break;
			nrecorded += retval;
		}
		if (nrecorded > 0) {
			CTR1(KTR_IGMPV3,
			    "%s: schedule response to SG query", __func__);
			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
			V_current_state_timers_running = 1;
		}
	}

	return (retval);
}

/*
 * Process a received IGMPv1 host membership report.
 *
 * NOTE: 0.0.0.0 workaround breaks const correctness.
 */
static int
igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
    /*const*/ struct igmp *igmp)
{
	struct in_ifaddr *ia;
	struct in_multi *inm;

	IGMPSTAT_INC(igps_rcv_reports);

	if (ifp->if_flags & IFF_LOOPBACK)
		return (0);

	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
		IGMPSTAT_INC(igps_rcv_badreports);
		return (EINVAL);
	}

	/*
	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
	 * Booting clients may use the source address 0.0.0.0. Some
	 * IGMP daemons may not know how to use IP_RECVIF to determine
	 * the interface upon which this message was received.
	 * Replace 0.0.0.0 with the subnet address if told to do so.
	 */
	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
		IFP_TO_IA(ifp, ia);
		if (ia != NULL) {
			ip->ip_src.s_addr = htonl(ia->ia_subnet);
			ifa_free(&ia->ia_ifa);
		}
	}

	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);

	/*
	 * IGMPv1 report suppression.
	 * If we are a member of this group, and our membership should be
	 * reported, stop our group timer and transition to the 'lazy' state.
	 */
	IN_MULTI_LOCK();
	inm = inm_lookup(ifp, igmp->igmp_group);
	if (inm != NULL) {
		struct igmp_ifinfo *igi;

		igi = inm->inm_igi;
		if (igi == NULL) {
			KASSERT(igi != NULL,
			    ("%s: no igi for ifp %p", __func__, ifp));
			goto out_locked;
		}

		IGMPSTAT_INC(igps_rcv_ourreports);

		/*
		 * If we are in IGMPv3 host mode, do not allow the
		 * other host's IGMPv1 report to suppress our reports
		 * unless explicitly configured to do so.
		 */
		if (igi->igi_version == IGMP_VERSION_3) {
			if (V_igmp_legacysupp)
				igmp_v3_suppress_group_record(inm);
			goto out_locked;
		}

		inm->inm_timer = 0;

		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
			break;
		case IGMP_IDLE_MEMBER:
		case IGMP_LAZY_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			/* FALLTHROUGH */
		case IGMP_SLEEPING_MEMBER:
			inm->inm_state = IGMP_SLEEPING_MEMBER;
			break;
		case IGMP_REPORTING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			if (igi->igi_version == IGMP_VERSION_1)
				inm->inm_state = IGMP_LAZY_MEMBER;
			else if (igi->igi_version == IGMP_VERSION_2)
				inm->inm_state = IGMP_SLEEPING_MEMBER;
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}

out_locked:
	IN_MULTI_UNLOCK();

	return (0);
}

/*
 * Process a received IGMPv2 host membership report.
 *
 * NOTE: 0.0.0.0 workaround breaks const correctness.
 */
static int
igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
    /*const*/ struct igmp *igmp)
{
	struct in_ifaddr *ia;
	struct in_multi *inm;

	/*
	 * Make sure we don't hear our own membership report.  Fast
	 * leave requires knowing that we are the only member of a
	 * group.
	 */
	IFP_TO_IA(ifp, ia);
	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
		ifa_free(&ia->ia_ifa);
		return (0);
	}

	IGMPSTAT_INC(igps_rcv_reports);

	if (ifp->if_flags & IFF_LOOPBACK) {
		if (ia != NULL)
			ifa_free(&ia->ia_ifa);
		return (0);
	}

	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
		if (ia != NULL)
			ifa_free(&ia->ia_ifa);
		IGMPSTAT_INC(igps_rcv_badreports);
		return (EINVAL);
	}

	/*
	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
	 * Booting clients may use the source address 0.0.0.0. Some
	 * IGMP daemons may not know how to use IP_RECVIF to determine
	 * the interface upon which this message was received.
	 * Replace 0.0.0.0 with the subnet address if told to do so.
	 */
	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
		if (ia != NULL)
			ip->ip_src.s_addr = htonl(ia->ia_subnet);
	}
	if (ia != NULL)
		ifa_free(&ia->ia_ifa);

	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);

	/*
	 * IGMPv2 report suppression.
	 * If we are a member of this group, and our membership should be
	 * reported, and our group timer is pending or about to be reset,
	 * stop our group timer by transitioning to the 'lazy' state.
	 */
	IN_MULTI_LOCK();
	inm = inm_lookup(ifp, igmp->igmp_group);
	if (inm != NULL) {
		struct igmp_ifinfo *igi;

		igi = inm->inm_igi;
		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));

		IGMPSTAT_INC(igps_rcv_ourreports);

		/*
		 * If we are in IGMPv3 host mode, do not allow the
		 * other host's IGMPv2 report to suppress our reports
		 * unless explicitly configured to do so.
		 */
		if (igi->igi_version == IGMP_VERSION_3) {
			if (V_igmp_legacysupp)
				igmp_v3_suppress_group_record(inm);
			goto out_locked;
		}

		inm->inm_timer = 0;

		switch (inm->inm_state) {
		case IGMP_NOT_MEMBER:
		case IGMP_SILENT_MEMBER:
		case IGMP_SLEEPING_MEMBER:
			break;
		case IGMP_REPORTING_MEMBER:
		case IGMP_IDLE_MEMBER:
		case IGMP_AWAKENING_MEMBER:
			CTR3(KTR_IGMPV3,
			    "report suppressed for %s on ifp %p(%s)",
			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
			/* FALLTHROUGH */
		case IGMP_LAZY_MEMBER:
			inm->inm_state = IGMP_LAZY_MEMBER;
			break;
		case IGMP_G_QUERY_PENDING_MEMBER:
		case IGMP_SG_QUERY_PENDING_MEMBER:
		case IGMP_LEAVING_MEMBER:
			break;
		}
	}

out_locked:
	IN_MULTI_UNLOCK();

	return (0);
}

void
igmp_input(struct mbuf *m, int off)
{
	int iphlen;
	struct ifnet *ifp;
	struct igmp *igmp;
	struct ip *ip;
	int igmplen;
	int minlen;
	int queryver;

	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);

	ifp = m->m_pkthdr.rcvif;

	IGMPSTAT_INC(igps_rcv_total);

	ip = mtod(m, struct ip *);
	iphlen = off;
	igmplen = ip->ip_len;

	/*
	 * Validate lengths.
	 */
	if (igmplen < IGMP_MINLEN) {
		IGMPSTAT_INC(igps_rcv_tooshort);
		m_freem(m);
		return;
	}

	/*
	 * Always pullup to the minimum size for v1/v2 or v3
	 * to amortize calls to m_pullup().
	 */
	minlen = iphlen;
	if (igmplen >= IGMP_V3_QUERY_MINLEN)
		minlen += IGMP_V3_QUERY_MINLEN;
	else
		minlen += IGMP_MINLEN;
	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
	    (m = m_pullup(m, minlen)) == NULL) {
		IGMPSTAT_INC(igps_rcv_tooshort);
		return;
	}
	ip = mtod(m, struct ip *);

	/*
	 * Validate checksum.
	 */
	m->m_data += iphlen;
	m->m_len -= iphlen;
	igmp = mtod(m, struct igmp *);
	if (in_cksum(m, igmplen)) {
		IGMPSTAT_INC(igps_rcv_badsum);
		m_freem(m);
		return;
	}
	m->m_data -= iphlen;
	m->m_len += iphlen;

	/*
	 * IGMP control traffic is link-scope, and must have a TTL of 1.
	 * DVMRP traffic (e.g. mrinfo, mtrace) is an exception;
	 * probe packets may come from beyond the LAN.
	 */
	if (igmp->igmp_type != IGMP_DVMRP && ip->ip_ttl != 1) {
		IGMPSTAT_INC(igps_rcv_badttl);
		m_freem(m);
		return;
	}

	switch (igmp->igmp_type) {
	case IGMP_HOST_MEMBERSHIP_QUERY:
		if (igmplen == IGMP_MINLEN) {
			if (igmp->igmp_code == 0)
				queryver = IGMP_VERSION_1;
			else
				queryver = IGMP_VERSION_2;
		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
			queryver = IGMP_VERSION_3;
		} else {
			IGMPSTAT_INC(igps_rcv_tooshort);
			m_freem(m);
			return;
		}

		switch (queryver) {
		case IGMP_VERSION_1:
			IGMPSTAT_INC(igps_rcv_v1v2_queries);
			if (!V_igmp_v1enable)
				break;
			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
				m_freem(m);
				return;
			}
			break;

		case IGMP_VERSION_2:
			IGMPSTAT_INC(igps_rcv_v1v2_queries);
			if (!V_igmp_v2enable)
				break;
			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
				m_freem(m);
				return;
			}
			break;

		case IGMP_VERSION_3: {
				struct igmpv3 *igmpv3;
				uint16_t igmpv3len;
				uint16_t srclen;
				int nsrc;

				IGMPSTAT_INC(igps_rcv_v3_queries);
				igmpv3 = (struct igmpv3 *)igmp;
				/*
				 * Validate length based on source count.
				 */
				nsrc = ntohs(igmpv3->igmp_numsrc);
				srclen = sizeof(struct in_addr) * nsrc;
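				/*
				 * srclen is only 16 bits wide, so the
				 * size_t product on the left of the test
				 * below exceeds it exactly when the
				 * 16-bit multiplication above truncated;
				 * this rejects source counts whose
				 * encoded length cannot be represented.
				 */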
				if (nsrc * sizeof(in_addr_t) > srclen) {
					IGMPSTAT_INC(igps_rcv_tooshort);
					m_freem(m);
					return;
				}
				/*
				 * m_pullup() may modify m, so pullup in
				 * this scope.
				 */
				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
				    srclen;
				if ((m->m_flags & M_EXT ||
				     m->m_len < igmpv3len) &&
				    (m = m_pullup(m, igmpv3len)) == NULL) {
					IGMPSTAT_INC(igps_rcv_tooshort);
					return;
				}
				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
				    + iphlen);
				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
					m_freem(m);
					return;
				}
			}
			break;
		}
		break;

	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
		if (!V_igmp_v1enable)
			break;
		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
			m_freem(m);
			return;
		}
		break;

	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
		if (!V_igmp_v2enable)
			break;
		if (!ip_checkrouteralert(m))
			IGMPSTAT_INC(igps_rcv_nora);
		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
			m_freem(m);
			return;
		}
		break;

	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
		/*
		 * Hosts do not need to process IGMPv3 membership reports,
		 * as report suppression is no longer required.
		 */
		if (!ip_checkrouteralert(m))
			IGMPSTAT_INC(igps_rcv_nora);
		break;

	default:
		break;
	}

	/*
	 * Pass all valid IGMP packets up to any process(es) listening on a
	 * raw IGMP socket.
	 */
	rip_input(m, off);
}


/*
 * Fast timeout handler (global).
 * VIMAGE: Timeout handlers are expected to service all vimages.
 */
void
igmp_fasttimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		igmp_fasttimo_vnet();
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}

/*
 * Fast timeout handler (per-vnet).
 * Sends are shuffled off to a netisr to deal with Giant.
 *
 * VIMAGE: Assume caller has set up our curvnet.
 */
static void
igmp_fasttimo_vnet(void)
{
	struct ifqueue		 scq;	/* State-change packets */
	struct ifqueue		 qrq;	/* Query response packets */
	struct ifnet		*ifp;
	struct igmp_ifinfo	*igi;
	struct ifmultiaddr	*ifma, *tifma;
	struct in_multi		*inm;
	int			 loop, uri_fasthz;

	loop = 0;
	uri_fasthz = 0;

	/*
	 * Quick check to see if any work needs to be done, in order to
	 * minimize the overhead of fasttimo processing.
	 * SMPng: XXX Unlocked reads.
	 */
	if (!V_current_state_timers_running &&
	    !V_interface_timers_running &&
	    !V_state_change_timers_running)
		return;

	IN_MULTI_LOCK();
	IGMP_LOCK();

	/*
	 * IGMPv3 General Query response timer processing.
	 */
	if (V_interface_timers_running) {
		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);

		V_interface_timers_running = 0;
		LIST_FOREACH(igi, &V_igi_head, igi_link) {
			if (igi->igi_v3_timer == 0) {
				/* Do nothing. */
			} else if (--igi->igi_v3_timer == 0) {
				igmp_v3_dispatch_general_query(igi);
			} else {
				V_interface_timers_running = 1;
			}
		}
	}

	if (!V_current_state_timers_running &&
	    !V_state_change_timers_running)
		goto out_locked;

	V_current_state_timers_running = 0;
	V_state_change_timers_running = 0;

	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);

	/*
	 * IGMPv1/v2/v3 host report and state-change timer processing.
	 * Note: Processing a v3 group timer may remove a node.
	 */
	LIST_FOREACH(igi, &V_igi_head, igi_link) {
		ifp = igi->igi_ifp;

		if (igi->igi_version == IGMP_VERSION_3) {
			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
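			/*
			 * igi_uri is the Unsolicited Report Interval in
			 * seconds; scale it to fast-timeout ticks and
			 * jitter it (IGMP_RANDOM_DELAY) so state-change
			 * retransmissions are not synchronized.
			 */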
			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
			    PR_FASTHZ);

			memset(&qrq, 0, sizeof(struct ifqueue));
			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);

			memset(&scq, 0, sizeof(struct ifqueue));
			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
		}

		IF_ADDR_LOCK(ifp);
		TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link,
		    tifma) {
			if (ifma->ifma_addr->sa_family != AF_INET ||
			    ifma->ifma_protospec == NULL)
				continue;
			inm = (struct in_multi *)ifma->ifma_protospec;
			switch (igi->igi_version) {
			case IGMP_VERSION_1:
			case IGMP_VERSION_2:
				igmp_v1v2_process_group_timer(inm,
				    igi->igi_version);
				break;
			case IGMP_VERSION_3:
				igmp_v3_process_group_timers(igi, &qrq,
				    &scq, inm, uri_fasthz);
				break;
			}
		}
		IF_ADDR_UNLOCK(ifp);

		if (igi->igi_version == IGMP_VERSION_3) {
			struct in_multi		*tinm;

			igmp_dispatch_queue(&qrq, 0, loop);
			igmp_dispatch_queue(&scq, 0, loop);

			/*
			 * Free the in_multi reference(s) for this
			 * IGMP lifecycle.
			 */
			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
			    inm_nrele, tinm) {
				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
				    inm_nrele);
				inm_release_locked(inm);
			}
		}
	}

out_locked:
	IGMP_UNLOCK();
	IN_MULTI_UNLOCK();
}

/*
 * Update host report group timer for IGMPv1/v2.
 * Will update the global pending timer flags.
 */
static void
igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
{
	int report_timer_expired;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

	if (inm->inm_timer == 0) {
		report_timer_expired = 0;
	} else if (--inm->inm_timer == 0) {
		report_timer_expired = 1;
	} else {
		V_current_state_timers_running = 1;
		return;
	}

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_IDLE_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_AWAKENING_MEMBER:
		break;
	case IGMP_REPORTING_MEMBER:
		if (report_timer_expired) {
			inm->inm_state = IGMP_IDLE_MEMBER;
			(void)igmp_v1v2_queue_report(inm,
			    (version == IGMP_VERSION_2) ?
			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
		}
		break;
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
	case IGMP_LEAVING_MEMBER:
		break;
	}
}

/*
 * Update a group's timers for IGMPv3.
 * Will update the global pending timer flags.
 * Note: Unlocked read from igi.
 */
static void
igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
    struct ifqueue *qrq, struct ifqueue *scq,
    struct in_multi *inm, const int uri_fasthz)
{
	int query_response_timer_expired;
	int state_change_retransmit_timer_expired;

	IN_MULTI_LOCK_ASSERT();
	IGMP_LOCK_ASSERT();

	query_response_timer_expired = 0;
	state_change_retransmit_timer_expired = 0;

	/*
	 * During a transition from v1/v2 compatibility mode back to v3,
	 * a group record in REPORTING state may still have its group
	 * timer active. This is a no-op in this function; it is easier
	 * to deal with it here than to complicate the slow-timeout path.
	 */
	if (inm->inm_timer == 0) {
		query_response_timer_expired = 0;
	} else if (--inm->inm_timer == 0) {
		query_response_timer_expired = 1;
	} else {
		V_current_state_timers_running = 1;
	}

	if (inm->inm_sctimer == 0) {
		state_change_retransmit_timer_expired = 0;
	} else if (--inm->inm_sctimer == 0) {
		state_change_retransmit_timer_expired = 1;
	} else {
		V_state_change_timers_running = 1;
	}

	/* We are in fasttimo, so be quick about it. */
	if (!state_change_retransmit_timer_expired &&
	    !query_response_timer_expired)
		return;

	switch (inm->inm_state) {
	case IGMP_NOT_MEMBER:
	case IGMP_SILENT_MEMBER:
	case IGMP_SLEEPING_MEMBER:
	case IGMP_LAZY_MEMBER:
	case IGMP_AWAKENING_MEMBER:
	case IGMP_IDLE_MEMBER:
		break;
	case IGMP_G_QUERY_PENDING_MEMBER:
	case IGMP_SG_QUERY_PENDING_MEMBER:
		/*
		 * Respond to a previously pending Group-Specific
		 * or Group-and-Source-Specific query by enqueueing
		 * the appropriate Current-State report for
		 * immediate transmission.
		 */
		if (query_response_timer_expired) {
			int retval;

			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
			    __func__, retval);
			inm->inm_state = IGMP_REPORTING_MEMBER;
			/* XXX Clear recorded sources for next time. */
			inm_clear_recorded(inm);
		}
		/* FALLTHROUGH */
	case IGMP_REPORTING_MEMBER:
	case IGMP_LEAVING_MEMBER:
		if (state_change_retransmit_timer_expired) {
			/*
			 * State-change retransmission timer fired.
			 * If there are any further pending retransmissions,
			 * set the global pending state-change flag, and
			 * reset the timer.
			 */
			if (--inm->inm_scrv > 0) {
				inm->inm_sctimer = uri_fasthz;
				V_state_change_timers_running = 1;
			}
			/*
			 * Retransmit the previously computed state-change
			 * report. If there are no further pending
			 * retransmissions, the mbuf queue will be consumed.
			 * Update T0 state to T1 as we have now sent
			 * a state-change.
			 */
			(void)igmp_v3_merge_state_changes(inm, scq);

			inm_commit(inm);
			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);

			/*
			 * If we are leaving the group for good, make sure
			 * we release IGMP's reference to it.
			 * This release must be deferred using a SLIST,
			 * as we are called from a loop which traverses
			 * the in_ifmultiaddr TAILQ.
			 */
			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
			    inm->inm_scrv == 0) {
				inm->inm_state = IGMP_NOT_MEMBER;
				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
				    inm, inm_nrele);
			}
		}
		break;
	}
}


/*
 * Suppress a group's pending response to a group or source/group query.
 *
 * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
 * Do NOT update ST1/ST0 as this operation merely suppresses
 * the currently pending group record.
 * Do NOT suppress the response to a general query. It is possible but
 * it would require adding another state or flag.
 */
static void
igmp_v3_suppress_group_record(struct in_multi *inm)
{

	IN_MULTI_LOCK_ASSERT();

	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
		("%s: not IGMPv3 mode on link", __func__));

1937	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
1938	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
1939		return;
1940
1941	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
1942		inm_clear_recorded(inm);
1943
1944	inm->inm_timer = 0;
1945	inm->inm_state = IGMP_REPORTING_MEMBER;
1946}
1947
1948/*
1949 * Switch to a different IGMP version on the given interface,
1950 * as per Section 7.2.1.
1951 */
1952static void
1953igmp_set_version(struct igmp_ifinfo *igi, const int version)
1954{
1955	int old_version_timer;
1956
1957	IGMP_LOCK_ASSERT();
1958
1959	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
1960	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
1961
1962	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
1963		/*
1964		 * Compute the "Older Version Querier Present" timer as per
1965		 * Section 8.12.
1966		 */
1967		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
1968		old_version_timer *= PR_SLOWHZ;
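		/*
		 * Worked example (added commentary, not in the original
		 * source): with the RFC 3376 defaults -- Robustness
		 * Variable 2, Query Interval 125s, Query Response
		 * Interval 10s -- this is 2 * 125 + 10 = 260 seconds,
		 * or 520 slowtimo ticks at PR_SLOWHZ = 2.
		 */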
1969
1970		if (version == IGMP_VERSION_1) {
1971			igi->igi_v1_timer = old_version_timer;
1972			igi->igi_v2_timer = 0;
1973		} else if (version == IGMP_VERSION_2) {
1974			igi->igi_v1_timer = 0;
1975			igi->igi_v2_timer = old_version_timer;
1976		}
1977	}
1978
1979	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
1980		if (igi->igi_version != IGMP_VERSION_2) {
1981			igi->igi_version = IGMP_VERSION_2;
1982			igmp_v3_cancel_link_timers(igi);
1983		}
1984	} else if (igi->igi_v1_timer > 0) {
1985		if (igi->igi_version != IGMP_VERSION_1) {
1986			igi->igi_version = IGMP_VERSION_1;
1987			igmp_v3_cancel_link_timers(igi);
1988		}
1989	}
1990}
1991
1992/*
1993 * Cancel pending IGMPv3 timers for the given link and all groups
1994 * joined on it; state-change, general-query, and group-query timers.
1995 *
1996 * Only ever called on a transition from v3 to Compatibility mode. Kill
1997 * the timers stone dead (this may be expensive for large N groups), they
1998 * will be restarted if Compatibility Mode deems that they must be due to
1999 * query processing.
2000 */
2001static void
2002igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
2003{
2004	struct ifmultiaddr	*ifma;
2005	struct ifnet		*ifp;
2006	struct in_multi		*inm;
2007
2008	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
2009	    igi->igi_ifp, igi->igi_ifp->if_xname);
2010
2011	IN_MULTI_LOCK_ASSERT();
2012	IGMP_LOCK_ASSERT();
2013
2014	/*
2015	 * Stop the v3 General Query Response on this link stone dead.
2016	 * If fasttimo is woken up due to V_interface_timers_running,
2017	 * the flag will be cleared if there are no pending link timers.
2018	 */
2019	igi->igi_v3_timer = 0;
2020
2021	/*
2022	 * Now clear the current-state and state-change report timers
2023	 * for all memberships scoped to this link.
2024	 */
2025	ifp = igi->igi_ifp;
2026	IF_ADDR_LOCK(ifp);
2027	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2028		if (ifma->ifma_addr->sa_family != AF_INET ||
2029		    ifma->ifma_protospec == NULL)
2030			continue;
2031		inm = (struct in_multi *)ifma->ifma_protospec;
2032		switch (inm->inm_state) {
2033		case IGMP_NOT_MEMBER:
2034		case IGMP_SILENT_MEMBER:
2035		case IGMP_IDLE_MEMBER:
2036		case IGMP_LAZY_MEMBER:
2037		case IGMP_SLEEPING_MEMBER:
2038		case IGMP_AWAKENING_MEMBER:
2039			/*
2040			 * These states are either not relevant in v3 mode,
2041			 * or are unreported. Do nothing.
2042			 */
2043			break;
2044		case IGMP_LEAVING_MEMBER:
2045			/*
2046			 * If we are leaving the group and switching to
2047			 * compatibility mode, we need to release the final
2048			 * reference held for issuing the INCLUDE {}, and
2049			 * transition to REPORTING to ensure the host leave
2050			 * message is sent upstream to the old querier --
2051			 * transition to NOT would lose the leave and race.
2052			 *
2053			 * SMPNG: Must drop and re-acquire IF_ADDR_LOCK
2054			 * around inm_release_locked(), as it is not
2055			 * a recursive mutex.
2056			 */
2057			IF_ADDR_UNLOCK(ifp);
2058			inm_release_locked(inm);
2059			IF_ADDR_LOCK(ifp);
2060			/* FALLTHROUGH */
2061		case IGMP_G_QUERY_PENDING_MEMBER:
2062		case IGMP_SG_QUERY_PENDING_MEMBER:
2063			inm_clear_recorded(inm);
2064			/* FALLTHROUGH */
2065		case IGMP_REPORTING_MEMBER:
2066			inm->inm_state = IGMP_REPORTING_MEMBER;
2067			break;
2068		}
2069		/*
2070		 * Always clear state-change and group report timers.
2071		 * Free any pending IGMPv3 state-change records.
2072		 */
2073		inm->inm_sctimer = 0;
2074		inm->inm_timer = 0;
2075		_IF_DRAIN(&inm->inm_scq);
2076	}
2077	IF_ADDR_UNLOCK(ifp);
2078}
2079
2080/*
2081 * Update the Older Version Querier Present timers for a link.
2082 * See Section 7.2.1 of RFC 3376.
2083 */
2084static void
2085igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
2086{
2087
2088	IGMP_LOCK_ASSERT();
2089
2090	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
2091		/*
2092		 * IGMPv1 and IGMPv2 Querier Present timers expired.
2093		 *
2094		 * Revert to IGMPv3.
2095		 */
2096		if (igi->igi_version != IGMP_VERSION_3) {
2097			CTR5(KTR_IGMPV3,
2098			    "%s: transition from v%d -> v%d on %p(%s)",
2099			    __func__, igi->igi_version, IGMP_VERSION_3,
2100			    igi->igi_ifp, igi->igi_ifp->if_xname);
2101			igi->igi_version = IGMP_VERSION_3;
2102		}
2103	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
2104		/*
2105		 * IGMPv1 Querier Present timer expired,
2106		 * IGMPv2 Querier Present timer running.
2107		 * If IGMPv2 was disabled since last timeout,
2108		 * revert to IGMPv3.
2109		 * If IGMPv2 is enabled, revert to IGMPv2.
2110		 */
2111		if (!V_igmp_v2enable) {
2112			CTR5(KTR_IGMPV3,
2113			    "%s: transition from v%d -> v%d on %p(%s)",
2114			    __func__, igi->igi_version, IGMP_VERSION_3,
2115			    igi->igi_ifp, igi->igi_ifp->if_xname);
2116			igi->igi_v2_timer = 0;
2117			igi->igi_version = IGMP_VERSION_3;
2118		} else {
2119			--igi->igi_v2_timer;
2120			if (igi->igi_version != IGMP_VERSION_2) {
2121				CTR5(KTR_IGMPV3,
2122				    "%s: transition from v%d -> v%d on %p(%s)",
2123				    __func__, igi->igi_version, IGMP_VERSION_2,
2124				    igi->igi_ifp, igi->igi_ifp->if_xname);
2125				igi->igi_version = IGMP_VERSION_2;
2126			}
2127		}
2128	} else if (igi->igi_v1_timer > 0) {
2129		/*
2130		 * IGMPv1 Querier Present timer running.
2131		 * Stop IGMPv2 timer if running.
2132		 *
2133		 * If IGMPv1 was disabled since last timeout,
2134		 * revert to IGMPv3.
2135		 * If IGMPv1 is enabled, continue its countdown.
2136		 */
2137		if (!V_igmp_v1enable) {
2138			CTR5(KTR_IGMPV3,
2139			    "%s: transition from v%d -> v%d on %p(%s)",
2140			    __func__, igi->igi_version, IGMP_VERSION_3,
2141			    igi->igi_ifp, igi->igi_ifp->if_xname);
2142			igi->igi_v1_timer = 0;
2143			igi->igi_version = IGMP_VERSION_3;
2144		} else {
2145			--igi->igi_v1_timer;
2146		}
2147		if (igi->igi_v2_timer > 0) {
2148			CTR3(KTR_IGMPV3,
2149			    "%s: cancel v2 timer on %p(%s)",
2150			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
2151			igi->igi_v2_timer = 0;
2152		}
2153	}
2154}
2155
2156/*
2157 * Global slowtimo handler.
2158 * VIMAGE: Timeout handlers are expected to service all vimages.
2159 */
2160void
2161igmp_slowtimo(void)
2162{
2163	VNET_ITERATOR_DECL(vnet_iter);
2164
2165	VNET_LIST_RLOCK_NOSLEEP();
2166	VNET_FOREACH(vnet_iter) {
2167		CURVNET_SET(vnet_iter);
2168		igmp_slowtimo_vnet();
2169		CURVNET_RESTORE();
2170	}
2171	VNET_LIST_RUNLOCK_NOSLEEP();
2172}
2173
2174/*
2175 * Per-vnet slowtimo handler.
2176 */
2177static void
2178igmp_slowtimo_vnet(void)
2179{
2180	struct igmp_ifinfo *igi;
2181
2182	IGMP_LOCK();
2183
2184	LIST_FOREACH(igi, &V_igi_head, igi_link) {
2185		igmp_v1v2_process_querier_timers(igi);
2186	}
2187
2188	IGMP_UNLOCK();
2189}
2190
2191/*
2192 * Dispatch an IGMPv1/v2 host report or leave message.
2193 * These are always small enough to fit inside a single mbuf.
2194 */
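/*
 * Layout sketch (added commentary, not in the original source): the
 * packet built below is a fixed 28 bytes -- a 20-byte IPv4 header
 * followed by the 8-byte fixed IGMP header. The destination is the
 * group itself for reports, or 224.0.0.2 (all-routers) for IGMPv2
 * leave messages.
 */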
2195static int
2196igmp_v1v2_queue_report(struct in_multi *inm, const int type)
2197{
2198	struct ifnet		*ifp;
2199	struct igmp		*igmp;
2200	struct ip		*ip;
2201	struct mbuf		*m;
2202
2203	IN_MULTI_LOCK_ASSERT();
2204	IGMP_LOCK_ASSERT();
2205
2206	ifp = inm->inm_ifp;
2207
2208	MGETHDR(m, M_DONTWAIT, MT_DATA);
2209	if (m == NULL)
2210		return (ENOMEM);
2211	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
2212
2213	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
2214
2215	m->m_data += sizeof(struct ip);
2216	m->m_len = sizeof(struct igmp);
2217
2218	igmp = mtod(m, struct igmp *);
2219	igmp->igmp_type = type;
2220	igmp->igmp_code = 0;
2221	igmp->igmp_group = inm->inm_addr;
2222	igmp->igmp_cksum = 0;
2223	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
2224
2225	m->m_data -= sizeof(struct ip);
2226	m->m_len += sizeof(struct ip);
2227
2228	ip = mtod(m, struct ip *);
2229	ip->ip_tos = 0;
2230	ip->ip_len = sizeof(struct ip) + sizeof(struct igmp);
2231	ip->ip_off = 0;
2232	ip->ip_p = IPPROTO_IGMP;
2233	ip->ip_src.s_addr = INADDR_ANY;
2234
2235	if (type == IGMP_HOST_LEAVE_MESSAGE)
2236		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
2237	else
2238		ip->ip_dst = inm->inm_addr;
2239
2240	igmp_save_context(m, ifp);
2241
2242	m->m_flags |= M_IGMPV2;
2243	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
2244		m->m_flags |= M_IGMP_LOOP;
2245
2246	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
2247	netisr_dispatch(NETISR_IGMP, m);
2248
2249	return (0);
2250}
2251
2252/*
2253 * Process a state change from the upper layer for the given IPv4 group.
2254 *
2255 * Each socket holds a reference on the in_multi in its own ip_moptions.
2256 * The socket layer will have made the necessary updates to the group
2257 * state; it is now up to IGMP to issue a state-change report if there
2258 * has been any change between T0 (when the last state-change was issued)
2259 * and T1 (now).
2260 *
2261 * We use the IGMPv3 state machine at group level. The IGMP module
2262 * however makes the decision as to which IGMP protocol version to speak.
2263 * A state change *from* INCLUDE {} always means an initial join.
2264 * A state change *to* INCLUDE {} always means a final leave.
2265 *
2266 * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
2267 * save ourselves a bunch of work; any exclusive mode groups need not
2268 * compute source filter lists.
2269 *
2270 * VIMAGE: curvnet should have been set by caller, as this routine
2271 * is called from the socket option handlers.
2272 */
2273int
2274igmp_change_state(struct in_multi *inm)
2275{
2276	struct igmp_ifinfo *igi;
2277	struct ifnet *ifp;
2278	int error;
2279
2280	IN_MULTI_LOCK_ASSERT();
2281
2282	error = 0;
2283
2284	/*
2285	 * Try to detect if the upper layer just asked us to change state
2286	 * for an interface which has now gone away.
2287	 */
2288	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
2289	ifp = inm->inm_ifma->ifma_ifp;
2290	if (ifp != NULL) {
2291		/*
2292		 * Sanity check that netinet's notion of ifp is the
2293		 * same as net's.
2294		 */
2295		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
2296	}
2297
2298	IGMP_LOCK();
2299
2300	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
2301	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
2302
2303	/*
2304	 * If we detect a state transition to or from MCAST_UNDEFINED
2305	 * for this group, then we are starting or finishing an IGMP
2306	 * life cycle for this group.
2307	 */
2308	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
2309		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
2310		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
2311		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
2312			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
2313			error = igmp_initial_join(inm, igi);
2314			goto out_locked;
2315		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
2316			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
2317			igmp_final_leave(inm, igi);
2318			goto out_locked;
2319		}
2320	} else {
2321		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
2322	}
2323
2324	error = igmp_handle_state_change(inm, igi);
2325
2326out_locked:
2327	IGMP_UNLOCK();
2328	return (error);
2329}
2330
2331/*
2332 * Perform the initial join for an IGMP group.
2333 *
2334 * When joining a group:
2335 *  If the group should have its IGMP traffic suppressed, do nothing.
2336 *  IGMPv1 starts sending IGMPv1 host membership reports.
2337 *  IGMPv2 starts sending IGMPv2 host membership reports.
2338 *  IGMPv3 will schedule an IGMPv3 state-change report containing the
2339 *  initial state of the membership.
2340 */
2341static int
2342igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
2343{
2344	struct ifnet		*ifp;
2345	struct ifqueue		*ifq;
2346	int			 error, retval, syncstates;
2347
2348	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
2349	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2350	    inm->inm_ifp->if_xname);
2351
2352	error = 0;
2353	syncstates = 1;
2354
2355	ifp = inm->inm_ifp;
2356
2357	IN_MULTI_LOCK_ASSERT();
2358	IGMP_LOCK_ASSERT();
2359
2360	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2361
2362	/*
2363	 * Groups joined on loopback or marked as 'not reported',
2364	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
2365	 * are never reported in any IGMP protocol exchanges.
2366	 * All other groups enter the appropriate IGMP state machine
2367	 * for the version in use on this link.
2368	 * A link marked as IGIF_SILENT causes IGMP to be completely
2369	 * disabled for the link.
2370	 */
2371	if ((ifp->if_flags & IFF_LOOPBACK) ||
2372	    (igi->igi_flags & IGIF_SILENT) ||
2373	    !igmp_isgroupreported(inm->inm_addr)) {
2374		CTR1(KTR_IGMPV3,
2375"%s: not kicking state machine for silent group", __func__);
2376		inm->inm_state = IGMP_SILENT_MEMBER;
2377		inm->inm_timer = 0;
2378	} else {
2379		/*
2380		 * Deal with overlapping in_multi lifecycle.
2381		 * If this group was LEAVING, then make sure
2382		 * we drop the reference we picked up to keep the
2383		 * group around for the final INCLUDE {} enqueue.
2384		 */
2385		if (igi->igi_version == IGMP_VERSION_3 &&
2386		    inm->inm_state == IGMP_LEAVING_MEMBER)
2387			inm_release_locked(inm);
2388
2389		inm->inm_state = IGMP_REPORTING_MEMBER;
2390
2391		switch (igi->igi_version) {
2392		case IGMP_VERSION_1:
2393		case IGMP_VERSION_2:
2394			inm->inm_state = IGMP_IDLE_MEMBER;
2395			error = igmp_v1v2_queue_report(inm,
2396			    (igi->igi_version == IGMP_VERSION_2) ?
2397			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
2398			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
2399			if (error == 0) {
2400				inm->inm_timer = IGMP_RANDOM_DELAY(
2401				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
2402				V_current_state_timers_running = 1;
2403			}
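			/*
			 * Worked example (added commentary, not in the
			 * original source): with IGMP_V1V2_MAX_RI at 10
			 * seconds and PR_FASTHZ at 5, the report is
			 * delayed by a random 1..50 fast ticks, i.e. up
			 * to roughly 10 seconds.
			 */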
2404			break;
2405
2406		case IGMP_VERSION_3:
2407			/*
2408			 * Defer update of T0 to T1, until the first copy
2409			 * of the state change has been transmitted.
2410			 */
2411			syncstates = 0;
2412
2413			/*
2414			 * Immediately enqueue a State-Change Report for
2415			 * this interface, freeing any previous reports.
2416			 * Don't kick the timers if there is nothing to do,
2417			 * or if an error occurred.
2418			 */
2419			ifq = &inm->inm_scq;
2420			_IF_DRAIN(ifq);
2421			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
2422			    0, 0);
2423			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
2424			    __func__, retval);
2425			if (retval <= 0) {
2426				error = -retval;
2427				break;
2428			}
2429
2430			/*
2431			 * Schedule transmission of pending state-change
2432			 * report up to RV times for this link. The timer
2433			 * will fire at the next igmp_fasttimo (~200ms),
2434			 * giving us an opportunity to merge the reports.
2435			 */
2436			if (igi->igi_flags & IGIF_LOOPBACK) {
2437				inm->inm_scrv = 1;
2438			} else {
2439				KASSERT(igi->igi_rv > 1,
2440				   ("%s: invalid robustness %d", __func__,
2441				    igi->igi_rv));
2442				inm->inm_scrv = igi->igi_rv;
2443			}
2444			inm->inm_sctimer = 1;
2445			V_state_change_timers_running = 1;
2446
2447			error = 0;
2448			break;
2449		}
2450	}
2451
2452	/*
2453	 * Only update the T0 state if state change is atomic,
2454	 * i.e. we don't need to wait for a timer to fire before we
2455	 * can consider the state change to have been communicated.
2456	 */
2457	if (syncstates) {
2458		inm_commit(inm);
2459		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2460		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2461	}
2462
2463	return (error);
2464}
2465
2466/*
2467 * Issue an intermediate state change during the IGMP life-cycle.
2468 */
2469static int
2470igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
2471{
2472	struct ifnet		*ifp;
2473	int			 retval;
2474
2475	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
2476	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2477	    inm->inm_ifp->if_xname);
2478
2479	ifp = inm->inm_ifp;
2480
2481	IN_MULTI_LOCK_ASSERT();
2482	IGMP_LOCK_ASSERT();
2483
2484	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2485
2486	if ((ifp->if_flags & IFF_LOOPBACK) ||
2487	    (igi->igi_flags & IGIF_SILENT) ||
2488	    !igmp_isgroupreported(inm->inm_addr) ||
2489	    (igi->igi_version != IGMP_VERSION_3)) {
2490		if (!igmp_isgroupreported(inm->inm_addr)) {
2491			CTR1(KTR_IGMPV3,
2492"%s: not kicking state machine for silent group", __func__);
2493		}
2494		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
2495		inm_commit(inm);
2496		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2497		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2498		return (0);
2499	}
2500
2501	_IF_DRAIN(&inm->inm_scq);
2502
2503	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
2504	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
2505	if (retval <= 0)
2506		return (-retval);
2507
2508	/*
2509	 * If record(s) were enqueued, start the state-change
2510	 * report timer for this group.
2511	 */
2512	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
2513	inm->inm_sctimer = 1;
2514	V_state_change_timers_running = 1;
2515
2516	return (0);
2517}
2518
2519/*
2520 * Perform the final leave for an IGMP group.
2521 *
2522 * When leaving a group:
2523 *  IGMPv1 does nothing.
2524 *  IGMPv2 sends a host leave message, if and only if we are the reporter.
2525 *  IGMPv3 enqueues a state-change report containing a transition
2526 *  to INCLUDE {} for immediate transmission.
2527 */
2528static void
2529igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
2530{
2531	int syncstates;
2532
2533	syncstates = 1;
2534
2535	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
2536	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2537	    inm->inm_ifp->if_xname);
2538
2539	IN_MULTI_LOCK_ASSERT();
2540	IGMP_LOCK_ASSERT();
2541
2542	switch (inm->inm_state) {
2543	case IGMP_NOT_MEMBER:
2544	case IGMP_SILENT_MEMBER:
2545	case IGMP_LEAVING_MEMBER:
2546		/* Already leaving or left; do nothing. */
2547		CTR1(KTR_IGMPV3,
2548"%s: not kicking state machine for silent group", __func__);
2549		break;
2550	case IGMP_REPORTING_MEMBER:
2551	case IGMP_IDLE_MEMBER:
2552	case IGMP_G_QUERY_PENDING_MEMBER:
2553	case IGMP_SG_QUERY_PENDING_MEMBER:
2554		if (igi->igi_version == IGMP_VERSION_2) {
2555#ifdef INVARIANTS
2556			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
2557			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
2558				panic("%s: IGMPv3 state reached, not IGMPv3 mode",
2559				    __func__);
2560#endif
2561			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
2562			inm->inm_state = IGMP_NOT_MEMBER;
2563		} else if (igi->igi_version == IGMP_VERSION_3) {
2564			/*
2565			 * Stop group timer and all pending reports.
2566			 * Immediately enqueue a state-change report
2567			 * TO_IN {} to be sent on the next fast timeout,
2568			 * giving us an opportunity to merge reports.
2569			 */
2570			_IF_DRAIN(&inm->inm_scq);
2571			inm->inm_timer = 0;
2572			if (igi->igi_flags & IGIF_LOOPBACK) {
2573				inm->inm_scrv = 1;
2574			} else {
2575				inm->inm_scrv = igi->igi_rv;
2576			}
2577			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
2578			    "pending retransmissions.", __func__,
2579			    inet_ntoa(inm->inm_addr),
2580			    inm->inm_ifp->if_xname, inm->inm_scrv);
2581			if (inm->inm_scrv == 0) {
2582				inm->inm_state = IGMP_NOT_MEMBER;
2583				inm->inm_sctimer = 0;
2584			} else {
2585				int retval;
2586
2587				inm_acquire_locked(inm);
2588
2589				retval = igmp_v3_enqueue_group_record(
2590				    &inm->inm_scq, inm, 1, 0, 0);
2591				KASSERT(retval != 0,
2592				    ("%s: enqueue record = %d", __func__,
2593				     retval));
2594
2595				inm->inm_state = IGMP_LEAVING_MEMBER;
2596				inm->inm_sctimer = 1;
2597				V_state_change_timers_running = 1;
2598				syncstates = 0;
2599			}
2600			break;
2601		}
2602		break;
2603	case IGMP_LAZY_MEMBER:
2604	case IGMP_SLEEPING_MEMBER:
2605	case IGMP_AWAKENING_MEMBER:
2606		/* Our reports are suppressed; do nothing. */
2607		break;
2608	}
2609
2610	if (syncstates) {
2611		inm_commit(inm);
2612		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2613		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2614		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
2615		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
2616		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2617	}
2618}
2619
2620/*
2621 * Enqueue an IGMPv3 group record to the given output queue.
2622 *
2623 * XXX This function could do with having the allocation code
2624 * split out, and the multiple-tree-walks coalesced into a single
2625 * routine as has been done in igmp_v3_enqueue_filter_change().
2626 *
2627 * If is_state_change is zero, a current-state record is appended.
2628 * If is_state_change is non-zero, a state-change report is appended.
2629 *
2630 * If is_group_query is non-zero, an mbuf packet chain is allocated.
2631 * If is_group_query is zero, and if there is a packet with free space
2632 * at the tail of the queue, it will be appended to providing there
2633 * is enough free space.
2634 * Otherwise a new mbuf packet chain is allocated.
2635 *
2636 * If is_source_query is non-zero, each source is checked to see if
2637 * it was recorded for a Group-Source query, and will be omitted if
2638 * it is not both in-mode and recorded.
2639 *
2640 * The function will attempt to allocate leading space in the packet
2641 * for the IP/IGMP header to be prepended without fragmenting the chain.
2642 *
2643 * If successful the size of all data appended to the queue is returned,
2644 * otherwise an error code less than zero is returned, or zero if
2645 * no record(s) were appended.
2646 */
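/*
 * Call patterns used elsewhere in this file (added summary, not in
 * the original source):
 *	general query response:		(&igi->igi_gq, inm, 0, 0, 0)
 *	group/source query response:	(qrq, inm, 0, 1, is_sg_query)
 *	state-change report:		(&inm->inm_scq, inm, 1, 0, 0)
 */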
2647static int
2648igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
2649    const int is_state_change, const int is_group_query,
2650    const int is_source_query)
2651{
2652	struct igmp_grouprec	 ig;
2653	struct igmp_grouprec	*pig;
2654	struct ifnet		*ifp;
2655	struct ip_msource	*ims, *nims;
2656	struct mbuf		*m0, *m, *md;
2657	int			 error, is_filter_list_change;
2658	int			 minrec0len, m0srcs, msrcs, nbytes, off;
2659	int			 record_has_sources;
2660	int			 now;
2661	int			 type;
2662	in_addr_t		 naddr;
2663	uint8_t			 mode;
2664
2665	IN_MULTI_LOCK_ASSERT();
2666
2667	error = 0;
2668	ifp = inm->inm_ifp;
2669	is_filter_list_change = 0;
2670	m = NULL;
2671	m0 = NULL;
2672	m0srcs = 0;
2673	msrcs = 0;
2674	nbytes = 0;
2675	nims = NULL;
2676	record_has_sources = 1;
2677	pig = NULL;
2678	type = IGMP_DO_NOTHING;
2679	mode = inm->inm_st[1].iss_fmode;
2680
2681	/*
2682	 * If we did not transition out of ASM mode during t0->t1,
2683	 * and there are no source nodes to process, we can skip
2684	 * the generation of source records.
2685	 */
2686	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
2687	    inm->inm_nsrc == 0)
2688		record_has_sources = 0;
2689
2690	if (is_state_change) {
2691		/*
2692		 * Queue a state change record.
2693		 * If the mode did not change, and there are non-ASM
2694		 * listeners or source filters present,
2695		 * we potentially need to issue two records for the group.
2696		 * If we are transitioning to MCAST_UNDEFINED, we need
2697		 * not send any sources.
2698		 * If there are ASM listeners, and there was no filter
2699		 * mode transition of any kind, do nothing.
2700		 */
2701		if (mode != inm->inm_st[0].iss_fmode) {
2702			if (mode == MCAST_EXCLUDE) {
2703				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
2704				    __func__);
2705				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
2706			} else {
2707				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
2708				    __func__);
2709				type = IGMP_CHANGE_TO_INCLUDE_MODE;
2710				if (mode == MCAST_UNDEFINED)
2711					record_has_sources = 0;
2712			}
2713		} else {
2714			if (record_has_sources) {
2715				is_filter_list_change = 1;
2716			} else {
2717				type = IGMP_DO_NOTHING;
2718			}
2719		}
2720	} else {
2721		/*
2722		 * Queue a current state record.
2723		 */
2724		if (mode == MCAST_EXCLUDE) {
2725			type = IGMP_MODE_IS_EXCLUDE;
2726		} else if (mode == MCAST_INCLUDE) {
2727			type = IGMP_MODE_IS_INCLUDE;
2728			KASSERT(inm->inm_st[1].iss_asm == 0,
2729			    ("%s: inm %p is INCLUDE but ASM count is %d",
2730			     __func__, inm, inm->inm_st[1].iss_asm));
2731		}
2732	}
2733
2734	/*
2735	 * Generate the filter list changes using a separate function.
2736	 */
2737	if (is_filter_list_change)
2738		return (igmp_v3_enqueue_filter_change(ifq, inm));
2739
2740	if (type == IGMP_DO_NOTHING) {
2741		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
2742		    __func__, inet_ntoa(inm->inm_addr),
2743		    inm->inm_ifp->if_xname);
2744		return (0);
2745	}
2746
2747	/*
2748	 * If any sources are present, we must be able to fit at least
2749	 * one in the trailing space of the tail packet's mbuf,
2750	 * ideally more.
2751	 */
2752	minrec0len = sizeof(struct igmp_grouprec);
2753	if (record_has_sources)
2754		minrec0len += sizeof(in_addr_t);
2755
2756	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
2757	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
2758	    inm->inm_ifp->if_xname);
2759
2760	/*
2761	 * Check if we have a packet in the tail of the queue for this
2762	 * group into which the first group record for this group will fit.
2763	 * Otherwise allocate a new packet.
2764	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
2765	 * Note: Group records for G/GSR query responses MUST be sent
2766	 * in their own packet.
2767	 */
2768	m0 = ifq->ifq_tail;
2769	if (!is_group_query &&
2770	    m0 != NULL &&
2771	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
2772	    (m0->m_pkthdr.len + minrec0len) <
2773	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
2774		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
2775			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2776		m = m0;
2777		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
2778	} else {
2779		if (_IF_QFULL(ifq)) {
2780			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2781			return (-ENOMEM);
2782		}
2783		m = NULL;
2784		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2785		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2786		if (!is_state_change && !is_group_query) {
2787			m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2788			if (m)
2789				m->m_data += IGMP_LEADINGSPACE;
2790		}
2791		if (m == NULL) {
2792			m = m_gethdr(M_DONTWAIT, MT_DATA);
2793			if (m)
2794				MH_ALIGN(m, IGMP_LEADINGSPACE);
2795		}
2796		if (m == NULL)
2797			return (-ENOMEM);
2798
2799		igmp_save_context(m, ifp);
2800
2801		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
2802	}
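	/*
	 * Sizing example (added commentary, not in the original source):
	 * on a 1500-byte MTU link, with IGMP_LEADINGSPACE covering the
	 * IP header, router alert option and IGMPv3 report header
	 * (20 + 4 + 8 = 32 bytes), a fresh packet has room for about
	 * (1500 - 32 - 8) / 4 = 365 source addresses after the 8-byte
	 * group record header.
	 */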
2803
2804	/*
2805	 * Append group record.
2806	 * If we have sources, we don't know how many yet.
2807	 */
2808	ig.ig_type = type;
2809	ig.ig_datalen = 0;
2810	ig.ig_numsrc = 0;
2811	ig.ig_group = inm->inm_addr;
2812	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2813		if (m != m0)
2814			m_freem(m);
2815		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2816		return (-ENOMEM);
2817	}
2818	nbytes += sizeof(struct igmp_grouprec);
2819
2820	/*
2821	 * Append as many sources as will fit in the first packet.
2822	 * If we are appending to a new packet, the chain allocation
2823	 * may potentially use clusters; use m_getptr() in this case.
2824	 * If we are appending to an existing packet, we need to obtain
2825	 * a pointer to the group record after m_append(), in case a new
2826	 * mbuf was allocated.
2827	 * Only append sources which are in-mode at t1. If we are
2828	 * transitioning to MCAST_UNDEFINED state on the group, do not
2829	 * include source entries.
2830	 * Only report recorded sources in our filter set when responding
2831	 * to a group-source query.
2832	 */
2833	if (record_has_sources) {
2834		if (m == m0) {
2835			md = m_last(m);
2836			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2837			    md->m_len - nbytes);
2838		} else {
2839			md = m_getptr(m, 0, &off);
2840			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2841			    off);
2842		}
2843		msrcs = 0;
2844		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
2845			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2846			    inet_ntoa_haddr(ims->ims_haddr));
2847			now = ims_get_mode(inm, ims, 1);
2848			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
2849			if ((now != mode) ||
2850			    (now == mode && mode == MCAST_UNDEFINED)) {
2851				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2852				continue;
2853			}
2854			if (is_source_query && ims->ims_stp == 0) {
2855				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2856				    __func__);
2857				continue;
2858			}
2859			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2860			naddr = htonl(ims->ims_haddr);
2861			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2862				if (m != m0)
2863					m_freem(m);
2864				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2865				    __func__);
2866				return (-ENOMEM);
2867			}
2868			nbytes += sizeof(in_addr_t);
2869			++msrcs;
2870			if (msrcs == m0srcs)
2871				break;
2872		}
2873		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
2874		    msrcs);
2875		pig->ig_numsrc = htons(msrcs);
2876		nbytes += (msrcs * sizeof(in_addr_t));
2877	}
2878
2879	if (is_source_query && msrcs == 0) {
2880		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
2881		if (m != m0)
2882			m_freem(m);
2883		return (0);
2884	}
2885
2886	/*
2887	 * We are good to go with first packet.
2888	 */
2889	if (m != m0) {
2890		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
2891		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2892		_IF_ENQUEUE(ifq, m);
2893	} else
2894		m->m_pkthdr.PH_vt.vt_nrecs++;
2895
2896	/*
2897	 * No further work needed if no source list in packet(s).
2898	 */
2899	if (!record_has_sources)
2900		return (nbytes);
2901
2902	/*
2903	 * Whilst sources remain to be announced, we need to allocate
2904	 * a new packet and fill out as many sources as will fit.
2905	 * Always try for a cluster first.
2906	 */
2907	while (nims != NULL) {
2908		if (_IF_QFULL(ifq)) {
2909			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2910			return (-ENOMEM);
2911		}
2912		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2913		if (m)
2914			m->m_data += IGMP_LEADINGSPACE;
2915		if (m == NULL) {
2916			m = m_gethdr(M_DONTWAIT, MT_DATA);
2917			if (m)
2918				MH_ALIGN(m, IGMP_LEADINGSPACE);
2919		}
2920		if (m == NULL)
2921			return (-ENOMEM);
2922		igmp_save_context(m, ifp);
2923		md = m_getptr(m, 0, &off);
2924		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
2925		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
2926
2927		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2928			if (m != m0)
2929				m_freem(m);
2930			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2931			return (-ENOMEM);
2932		}
2933		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2934		nbytes += sizeof(struct igmp_grouprec);
2935
2936		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2937		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2938
2939		msrcs = 0;
2940		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
2941			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2942			    inet_ntoa_haddr(ims->ims_haddr));
2943			now = ims_get_mode(inm, ims, 1);
2944			if ((now != mode) ||
2945			    (now == mode && mode == MCAST_UNDEFINED)) {
2946				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2947				continue;
2948			}
2949			if (is_source_query && ims->ims_stp == 0) {
2950				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2951				    __func__);
2952				continue;
2953			}
2954			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2955			naddr = htonl(ims->ims_haddr);
2956			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2957				if (m != m0)
2958					m_freem(m);
2959				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2960				    __func__);
2961				return (-ENOMEM);
2962			}
2963			++msrcs;
2964			if (msrcs == m0srcs)
2965				break;
2966		}
2967		pig->ig_numsrc = htons(msrcs);
2968		nbytes += (msrcs * sizeof(in_addr_t));
2969
2970		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
2971		_IF_ENQUEUE(ifq, m);
2972	}
2973
2974	return (nbytes);
2975}
2976
2977/*
2978 * Type used to mark record pass completion.
2979 * We exploit the fact we can cast to this easily from the
2980 * current filter modes on each ip_msource node.
2981 */
2982typedef enum {
2983	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
2984	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
2985	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
2986	REC_FULL = REC_ALLOW | REC_BLOCK
2987} rectype_t;
2988
2989/*
2990 * Enqueue an IGMPv3 filter list change to the given output queue.
2991 *
2992 * Source list filter state is held in an RB-tree. When the filter list
2993 * for a group is changed without changing its mode, we need to compute
2994 * the deltas between T0 and T1 for each source in the filter set,
2995 * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
2996 *
2997 * As we may potentially queue two record types, and the entire RB-tree
2998 * needs to be walked at once, we break this out into its own function
2999 * so we can generate a tightly packed queue of packets.
3000 *
3001 * XXX This could be written to only use one tree walk, although that makes
3002 * serializing into the mbuf chains a bit harder. For now we do two walks
3003 * which makes things easier on us, and it may or may not be harder on
3004 * the L2 cache.
3005 *
3006 * If successful the size of all data appended to the queue is returned,
3007 * otherwise an error code less than zero is returned, or zero if
3008 * no record(s) were appended.
3009 */
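/*
 * Delta example (added commentary, not in the original source): for a
 * group in MCAST_INCLUDE mode whose source set changes from {A, B} at
 * t0 to {B, C} at t1, this emits an ALLOW_NEW_SOURCES record for {C}
 * and a BLOCK_OLD_SOURCES record for {A}; B is unchanged and skipped.
 */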
3010static int
3011igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
3012{
3013	static const int MINRECLEN =
3014	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
3015	struct ifnet		*ifp;
3016	struct igmp_grouprec	 ig;
3017	struct igmp_grouprec	*pig;
3018	struct ip_msource	*ims, *nims;
3019	struct mbuf		*m, *m0, *md;
3020	in_addr_t		 naddr;
3021	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
3022	int			 nallow, nblock;
3023	uint8_t			 mode, now, then;
3024	rectype_t		 crt, drt, nrt;
3025
3026	IN_MULTI_LOCK_ASSERT();
3027
3028	if (inm->inm_nsrc == 0 ||
3029	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
3030		return (0);
3031
3032	ifp = inm->inm_ifp;			/* interface */
3033	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
3034	crt = REC_NONE;	/* current group record type */
3035	drt = REC_NONE;	/* mask of completed group record types */
3036	nrt = REC_NONE;	/* record type for current node */
3037	m0srcs = 0;	/* # source which will fit in current mbuf chain */
3038	nbytes = 0;	/* # of bytes appended to group's state-change queue */
3039	npbytes = 0;	/* # of bytes appended this packet */
3040	rsrcs = 0;	/* # sources encoded in current record */
3041	schanged = 0;	/* # nodes encoded in overall filter change */
3042	nallow = 0;	/* # of source entries in ALLOW_NEW */
3043	nblock = 0;	/* # of source entries in BLOCK_OLD */
3044	nims = NULL;	/* next tree node pointer */
3045
3046	/*
3047	 * For each possible filter record mode.
3048	 * The first kind of source we encounter tells us which
3049	 * is the first kind of record we start appending.
3050	 * If a node transitioned to UNDEFINED at t1, its mode is treated
3051	 * as the inverse of the group's filter mode.
3052	 */
3053	while (drt != REC_FULL) {
3054		do {
3055			m0 = ifq->ifq_tail;
3056			if (m0 != NULL &&
3057			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
3058			     IGMP_V3_REPORT_MAXRECS) &&
3059			    (m0->m_pkthdr.len + MINRECLEN) <
3060			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
3061				m = m0;
3062				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
3063					    sizeof(struct igmp_grouprec)) /
3064				    sizeof(in_addr_t);
3065				CTR1(KTR_IGMPV3,
3066				    "%s: use previous packet", __func__);
3067			} else {
3068				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
3069				if (m)
3070					m->m_data += IGMP_LEADINGSPACE;
3071				if (m == NULL) {
3072					m = m_gethdr(M_DONTWAIT, MT_DATA);
3073					if (m)
3074						MH_ALIGN(m, IGMP_LEADINGSPACE);
3075				}
3076				if (m == NULL) {
3077					CTR1(KTR_IGMPV3,
3078					    "%s: m_get*() failed", __func__);
3079					return (-ENOMEM);
3080				}
3081				m->m_pkthdr.PH_vt.vt_nrecs = 0;
3082				igmp_save_context(m, ifp);
3083				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
3084				    sizeof(struct igmp_grouprec)) /
3085				    sizeof(in_addr_t);
3086				npbytes = 0;
3087				CTR1(KTR_IGMPV3,
3088				    "%s: allocated new packet", __func__);
3089			}
3090			/*
3091			 * Append the IGMP group record header to the
3092			 * current packet's data area.
3093			 * Recalculate pointer to free space for next
3094			 * group record, in case m_append() allocated
3095			 * a new mbuf or cluster.
3096			 */
3097			memset(&ig, 0, sizeof(ig));
3098			ig.ig_group = inm->inm_addr;
3099			if (!m_append(m, sizeof(ig), (void *)&ig)) {
3100				if (m != m0)
3101					m_freem(m);
3102				CTR1(KTR_IGMPV3,
3103				    "%s: m_append() failed", __func__);
3104				return (-ENOMEM);
3105			}
3106			npbytes += sizeof(struct igmp_grouprec);
3107			if (m != m0) {
3108				/* new packet; offset in chain */
3109				md = m_getptr(m, npbytes -
3110				    sizeof(struct igmp_grouprec), &off);
3111				pig = (struct igmp_grouprec *)(mtod(md,
3112				    uint8_t *) + off);
3113			} else {
3114				/* current packet; offset from last append */
3115				md = m_last(m);
3116				pig = (struct igmp_grouprec *)(mtod(md,
3117				    uint8_t *) + md->m_len -
3118				    sizeof(struct igmp_grouprec));
3119			}
3120			/*
3121			 * Begin walking the tree for this record type
3122			 * pass, or continue from where we left off
3123			 * previously if we had to allocate a new packet.
3124			 * Only report deltas in-mode at t1.
3125			 * We need not report included sources as allowed
3126			 * if we are in inclusive mode on the group,
3127			 * however the converse is not true.
3128			 */
3129			rsrcs = 0;
3130			if (nims == NULL)
3131				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
3132			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
3133				CTR2(KTR_IGMPV3, "%s: visit node %s",
3134				    __func__, inet_ntoa_haddr(ims->ims_haddr));
3135				now = ims_get_mode(inm, ims, 1);
3136				then = ims_get_mode(inm, ims, 0);
3137				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
3138				    __func__, then, now);
3139				if (now == then) {
3140					CTR1(KTR_IGMPV3,
3141					    "%s: skip unchanged", __func__);
3142					continue;
3143				}
3144				if (mode == MCAST_EXCLUDE &&
3145				    now == MCAST_INCLUDE) {
3146					CTR1(KTR_IGMPV3,
3147					    "%s: skip IN src on EX group",
3148					    __func__);
3149					continue;
3150				}
3151				nrt = (rectype_t)now;
3152				if (nrt == REC_NONE)
3153					nrt = (rectype_t)(~mode & REC_FULL);
3154				if (schanged++ == 0) {
3155					crt = nrt;
3156				} else if (crt != nrt)
3157					continue;
3158				naddr = htonl(ims->ims_haddr);
3159				if (!m_append(m, sizeof(in_addr_t),
3160				    (void *)&naddr)) {
3161					if (m != m0)
3162						m_freem(m);
3163					CTR1(KTR_IGMPV3,
3164					    "%s: m_append() failed", __func__);
3165					return (-ENOMEM);
3166				}
3167				nallow += !!(crt == REC_ALLOW);
3168				nblock += !!(crt == REC_BLOCK);
3169				if (++rsrcs == m0srcs)
3170					break;
3171			}
3172			/*
3173			 * If we did not append any tree nodes on this
3174			 * pass, back out of allocations.
3175			 */
3176			if (rsrcs == 0) {
3177				npbytes -= sizeof(struct igmp_grouprec);
3178				if (m != m0) {
3179					CTR1(KTR_IGMPV3,
3180					    "%s: m_free(m)", __func__);
3181					m_freem(m);
3182				} else {
3183					CTR1(KTR_IGMPV3,
3184					    "%s: m_adj(m, -ig)", __func__);
3185					m_adj(m, -((int)sizeof(
3186					    struct igmp_grouprec)));
3187				}
3188				continue;
3189			}
3190			npbytes += (rsrcs * sizeof(in_addr_t));
3191			if (crt == REC_ALLOW)
3192				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
3193			else if (crt == REC_BLOCK)
3194				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
3195			pig->ig_numsrc = htons(rsrcs);
3196			/*
3197			 * Count the new group record, and enqueue this
3198			 * packet if it wasn't already queued.
3199			 */
3200			m->m_pkthdr.PH_vt.vt_nrecs++;
3201			if (m != m0)
3202				_IF_ENQUEUE(ifq, m);
3203			nbytes += npbytes;
3204		} while (nims != NULL);
3205		drt |= crt;
3206		crt = (~crt & REC_FULL);
3207	}
3208
3209	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
3210	    nallow, nblock);
3211
3212	return (nbytes);
3213}
3214
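/*
 * Merge a group's pending IGMPv3 state-change records from inm_scq
 * into the interface's state-change queue, coalescing records into
 * the tail packet where the record count and MTU allow.
 * (Descriptive header added; summarized from the function body.)
 * If further retransmissions remain (inm_scrv > 0), writable copies
 * of the queued messages are merged so the originals stay queued.
 */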
3215static int
3216igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
3217{
3218	struct ifqueue	*gq;
3219	struct mbuf	*m;		/* pending state-change */
3220	struct mbuf	*m0;		/* copy of pending state-change */
3221	struct mbuf	*mt;		/* last state-change in packet */
3222	int		 docopy, domerge;
3223	u_int		 recslen;
3224
3225	docopy = 0;
3226	domerge = 0;
3227	recslen = 0;
3228
3229	IN_MULTI_LOCK_ASSERT();
3230	IGMP_LOCK_ASSERT();
3231
3232	/*
3233	 * If there are further pending retransmissions, make a writable
3234	 * copy of each queued state-change message before merging.
3235	 */
3236	if (inm->inm_scrv > 0)
3237		docopy = 1;
3238
3239	gq = &inm->inm_scq;
3240#ifdef KTR
3241	if (gq->ifq_head == NULL) {
3242		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
3243		    __func__, inm);
3244	}
3245#endif
3246
3247	m = gq->ifq_head;
3248	while (m != NULL) {
3249		/*
3250		 * Only merge the report into the current packet if
3251		 * there is sufficient space to do so; an IGMPv3 report
3252		 * packet may only contain 65,535 group records.
3253		 * Always use a simple mbuf chain concatenation to do this,
3254		 * as large state changes for single groups may have
3255		 * allocated clusters.
3256		 */
3257		domerge = 0;
3258		mt = ifscq->ifq_tail;
3259		if (mt != NULL) {
3260			recslen = m_length(m, NULL);
3261
3262			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
3263			    m->m_pkthdr.PH_vt.vt_nrecs <=
3264			    IGMP_V3_REPORT_MAXRECS) &&
3265			    (mt->m_pkthdr.len + recslen <=
3266			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
3267				domerge = 1;
3268		}
3269
3270		if (!domerge && _IF_QFULL(gq)) {
3271			CTR2(KTR_IGMPV3,
3272			    "%s: outbound queue full, skipping whole packet %p",
3273			    __func__, m);
3274			mt = m->m_nextpkt;
3275			if (!docopy)
3276				m_freem(m);
3277			m = mt;
3278			continue;
3279		}
3280
3281		if (!docopy) {
3282			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
3283			_IF_DEQUEUE(gq, m0);
3284			m = m0->m_nextpkt;
3285		} else {
3286			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
3287			m0 = m_dup(m, M_NOWAIT);
3288			if (m0 == NULL)
3289				return (ENOMEM);
3290			m0->m_nextpkt = NULL;
3291			m = m->m_nextpkt;
3292		}
3293
3294		if (!domerge) {
3295			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p",
3296			    __func__, m0, ifscq);
3297			_IF_ENQUEUE(ifscq, m0);
3298		} else {
3299			struct mbuf *mtl;	/* last mbuf of packet mt */
3300
3301			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p",
3302			    __func__, m0, mt);
3303
3304			mtl = m_last(mt);
3305			m0->m_flags &= ~M_PKTHDR;
3306			mt->m_pkthdr.len += recslen;
3307			mt->m_pkthdr.PH_vt.vt_nrecs +=
3308			    m0->m_pkthdr.PH_vt.vt_nrecs;
3309
3310			mtl->m_next = m0;
3311		}
3312	}
3313
3314	return (0);
3315}
3316
3317/*
3318 * Respond to a pending IGMPv3 General Query.
3319 */
3320static void
3321igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
3322{
3323	struct ifmultiaddr	*ifma, *tifma;
3324	struct ifnet		*ifp;
3325	struct in_multi		*inm;
3326	int			 retval, loop;
3327
3328	IN_MULTI_LOCK_ASSERT();
3329	IGMP_LOCK_ASSERT();
3330
3331	KASSERT(igi->igi_version == IGMP_VERSION_3,
3332	    ("%s: called when version %d", __func__, igi->igi_version));
3333
3334	ifp = igi->igi_ifp;
3335
3336	IF_ADDR_LOCK(ifp);
3337	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) {
3338		if (ifma->ifma_addr->sa_family != AF_INET ||
3339		    ifma->ifma_protospec == NULL)
3340			continue;
3341
3342		inm = (struct in_multi *)ifma->ifma_protospec;
3343		KASSERT(ifp == inm->inm_ifp,
3344		    ("%s: inconsistent ifp", __func__));
3345
3346		switch (inm->inm_state) {
3347		case IGMP_NOT_MEMBER:
3348		case IGMP_SILENT_MEMBER:
3349			break;
3350		case IGMP_REPORTING_MEMBER:
3351		case IGMP_IDLE_MEMBER:
3352		case IGMP_LAZY_MEMBER:
3353		case IGMP_SLEEPING_MEMBER:
3354		case IGMP_AWAKENING_MEMBER:
3355			inm->inm_state = IGMP_REPORTING_MEMBER;
3356			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
3357			    inm, 0, 0, 0);
3358			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
3359			    __func__, retval);
3360			break;
3361		case IGMP_G_QUERY_PENDING_MEMBER:
3362		case IGMP_SG_QUERY_PENDING_MEMBER:
3363		case IGMP_LEAVING_MEMBER:
3364			break;
3365		}
3366	}
3367	IF_ADDR_UNLOCK(ifp);
3368
3369	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
3370	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
3371
3372	/*
3373	 * Slew transmission of bursts over 500ms intervals.
3374	 */
3375	if (igi->igi_gq.ifq_head != NULL) {
3376		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
3377		    IGMP_RESPONSE_BURST_INTERVAL);
3378		V_interface_timers_running = 1;
3379	}
3380}
3381
3382/*
3383 * Transmit the next pending IGMP message in the output queue.
3384 *
3385 * We get called from netisr_processqueue(). A mutex private to igmpoq
3386 * will be acquired and released around this routine.
3387 *
3388 * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
3389 * MRT: Nothing needs to be done, as IGMP traffic is always local to
3390 * a link and uses a link-scope multicast address.
3391 */
3392static void
3393igmp_intr(struct mbuf *m)
3394{
3395	struct ip_moptions	 imo;
3396	struct ifnet		*ifp;
3397	struct mbuf		*ipopts, *m0;
3398	int			 error;
3399	uint32_t		 ifindex;
3400
3401	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
3402
3403	/*
3404	 * Set VNET image pointer from enqueued mbuf chain
3405	 * before doing anything else. Whilst we use interface
3406	 * indexes to guard against interface detach, they are
3407	 * unique to each VIMAGE and must be retrieved.
3408	 */
3409	CURVNET_SET((struct vnet *)(m->m_pkthdr.header));
3410	ifindex = igmp_restore_context(m);
3411
3412	/*
3413	 * Check if the ifnet still exists. This limits the scope of
3414	 * any race in the absence of a global ifp lock for low cost
3415	 * (an array lookup).
3416	 */
3417	ifp = ifnet_byindex(ifindex);
3418	if (ifp == NULL) {
3419		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
3420		    __func__, m, ifindex);
3421		m_freem(m);
3422		IPSTAT_INC(ips_noroute);
3423		goto out;
3424	}
3425
3426	ipopts = V_igmp_sendra ? m_raopt : NULL;
3427
3428	imo.imo_multicast_ttl  = 1;
3429	imo.imo_multicast_vif  = -1;
3430	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
3431
3432	/*
3433	 * If the user requested that IGMP traffic be explicitly
3434	 * redirected to the loopback interface (e.g. they are running a
3435	 * MANET interface and the routing protocol needs to see the
3436	 * updates), handle this now.
3437	 */
3438	if (m->m_flags & M_IGMP_LOOP)
3439		imo.imo_multicast_ifp = V_loif;
3440	else
3441		imo.imo_multicast_ifp = ifp;
3442
3443	if (m->m_flags & M_IGMPV2) {
3444		m0 = m;
3445	} else {
3446		m0 = igmp_v3_encap_report(ifp, m);
3447		if (m0 == NULL) {
3448			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
3449			m_freem(m);
3450			IPSTAT_INC(ips_odropped);
3451			goto out;
3452		}
3453	}
3454
3455	igmp_scrub_context(m0);
3456	m->m_flags &= ~(M_PROTOFLAGS);
3457	m0->m_pkthdr.rcvif = V_loif;
3458#ifdef MAC
3459	mac_netinet_igmp_send(ifp, m0);
3460#endif
3461	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
3462	if (error) {
3463		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
3464		goto out;
3465	}
3466
3467	IGMPSTAT_INC(igps_snd_reports);
3468
3469out:
3470	/*
3471	 * We must restore the existing vnet pointer before
3472	 * continuing as we are run from netisr context.
3473	 */
3474	CURVNET_RESTORE();
3475}
3476
3477/*
3478 * Encapsulate an IGMPv3 report.
3479 *
3480 * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
3481 * chain has already had its IP/IGMPv3 header prepended. In this case
3482 * the function will not attempt to prepend; the lengths and checksums
3483 * will however be re-computed.
3484 *
3485 * Returns a pointer to the new mbuf chain head, or NULL if the
3486 * allocation failed.
3487 */
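/*
 * Layout note (added commentary, not in the original source): the
 * prepended header is sizeof(struct ip) + sizeof(struct igmp_report)
 * = 28 bytes. ir_numgrps is taken from the record count accumulated
 * in m_pkthdr.PH_vt.vt_nrecs, and the IGMP checksum covers the 8-byte
 * report header plus all appended group records.
 */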
3488static struct mbuf *
3489igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
3490{
3491	struct igmp_report	*igmp;
3492	struct ip		*ip;
3493	int			 hdrlen, igmpreclen;
3494
3495	KASSERT((m->m_flags & M_PKTHDR),
3496	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
3497
3498	igmpreclen = m_length(m, NULL);
3499	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
3500
3501	if (m->m_flags & M_IGMPV3_HDR) {
3502		igmpreclen -= hdrlen;
3503	} else {
3504		M_PREPEND(m, hdrlen, M_DONTWAIT);
3505		if (m == NULL)
3506			return (NULL);
3507		m->m_flags |= M_IGMPV3_HDR;
3508	}
3509
3510	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
3511
3512	m->m_data += sizeof(struct ip);
3513	m->m_len -= sizeof(struct ip);
3514
3515	igmp = mtod(m, struct igmp_report *);
3516	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
3517	igmp->ir_rsv1 = 0;
3518	igmp->ir_rsv2 = 0;
3519	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
3520	igmp->ir_cksum = 0;
3521	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
3522	m->m_pkthdr.PH_vt.vt_nrecs = 0;
3523
3524	m->m_data -= sizeof(struct ip);
3525	m->m_len += sizeof(struct ip);
3526
3527	ip = mtod(m, struct ip *);
3528	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
3529	ip->ip_len = hdrlen + igmpreclen;
3530	ip->ip_off = IP_DF;
3531	ip->ip_p = IPPROTO_IGMP;
3532	ip->ip_sum = 0;
3533
3534	ip->ip_src.s_addr = INADDR_ANY;
3535
3536	if (m->m_flags & M_IGMP_LOOP) {
3537		struct in_ifaddr *ia;
3538
3539		IFP_TO_IA(ifp, ia);
3540		if (ia != NULL) {
3541			ip->ip_src = ia->ia_addr.sin_addr;
3542			ifa_free(&ia->ia_ifa);
3543		}
3544	}
3545
3546	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
3547
3548	return (m);
3549}
3550
3551#ifdef KTR
3552static char *
3553igmp_rec_type_to_str(const int type)
3554{
3555
3556	switch (type) {
3557		case IGMP_CHANGE_TO_EXCLUDE_MODE:
3558			return "TO_EX";
3559			break;
3560		case IGMP_CHANGE_TO_INCLUDE_MODE:
3561			return "TO_IN";
3562			break;
3563		case IGMP_MODE_IS_EXCLUDE:
3564			return "MODE_EX";
3565			break;
3566		case IGMP_MODE_IS_INCLUDE:
3567			return "MODE_IN";
3568			break;
3569		case IGMP_ALLOW_NEW_SOURCES:
3570			return "ALLOW_NEW";
3571			break;
3572		case IGMP_BLOCK_OLD_SOURCES:
3573			return "BLOCK_OLD";
3574			break;
3575		default:
3576			break;
3577	}
3578	return "unknown";
3579}
3580#endif
3581
3582static void
3583igmp_init(void *unused __unused)
3584{
3585
3586	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3587
3588	IGMP_LOCK_INIT();
3589
3590	m_raopt = igmp_ra_alloc();
3591
3592	netisr_register(&igmp_nh);
3593}
3594SYSINIT(igmp_init, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_init, NULL);
3595
3596static void
3597igmp_uninit(void *unused __unused)
3598{
3599
3600	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3601
3602	netisr_unregister(&igmp_nh);
3603
3604	m_free(m_raopt);
3605	m_raopt = NULL;
3606
3607	IGMP_LOCK_DESTROY();
3608}
3609SYSUNINIT(igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_MIDDLE, igmp_uninit, NULL);
3610
3611static void
3612vnet_igmp_init(const void *unused __unused)
3613{
3614
3615	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3616
3617	LIST_INIT(&V_igi_head);
3618}
3619VNET_SYSINIT(vnet_igmp_init, SI_SUB_PSEUDO, SI_ORDER_ANY, vnet_igmp_init,
3620    NULL);
3621
3622static void
3623vnet_igmp_uninit(const void *unused __unused)
3624{
3625
3626	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3627
3628	KASSERT(LIST_EMPTY(&V_igi_head),
3629	    ("%s: igi list not empty; ifnets not detached?", __func__));
3630}
3631VNET_SYSUNINIT(vnet_igmp_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
3632    vnet_igmp_uninit, NULL);
3633
3634static int
3635igmp_modevent(module_t mod, int type, void *unused __unused)
3636{
3637
3638    switch (type) {
3639    case MOD_LOAD:
3640    case MOD_UNLOAD:
3641	break;
3642    default:
3643	return (EOPNOTSUPP);
3644    }
3645    return (0);
3646}
3647
3648static moduledata_t igmp_mod = {
3649    "igmp",
3650    igmp_modevent,
3651    0
3652};
3653DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
3654