igmp.c revision 195727
1/*-
2 * Copyright (c) 2007-2009 Bruce Simpson.
3 * Copyright (c) 1988 Stephen Deering.
4 * Copyright (c) 1992, 1993
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Stephen Deering of Stanford University.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)igmp.c	8.1 (Berkeley) 7/19/93
35 */
36
37/*
38 * Internet Group Management Protocol (IGMP) routines.
39 * [RFC1112, RFC2236, RFC3376]
40 *
41 * Written by Steve Deering, Stanford, May 1988.
42 * Modified by Rosen Sharma, Stanford, Aug 1994.
43 * Modified by Bill Fenner, Xerox PARC, Feb 1995.
44 * Modified to fully comply to IGMPv2 by Bill Fenner, Oct 1995.
45 * Significantly rewritten for IGMPv3, VIMAGE, and SMP by Bruce Simpson.
46 *
47 * MULTICAST Revision: 3.5.1.4
48 */
49
50#include <sys/cdefs.h>
51__FBSDID("$FreeBSD: head/sys/netinet/igmp.c 195727 2009-07-16 21:13:04Z rwatson $");
52
53#include <sys/param.h>
54#include <sys/systm.h>
55#include <sys/module.h>
56#include <sys/malloc.h>
57#include <sys/mbuf.h>
58#include <sys/socket.h>
59#include <sys/protosw.h>
60#include <sys/kernel.h>
61#include <sys/sysctl.h>
62#include <sys/vimage.h>
63#include <sys/ktr.h>
64#include <sys/condvar.h>
65
66#include <net/if.h>
67#include <net/netisr.h>
68#include <net/vnet.h>
69
70#include <netinet/in.h>
71#include <netinet/in_var.h>
72#include <netinet/in_systm.h>
73#include <netinet/ip.h>
74#include <netinet/ip_var.h>
75#include <netinet/ip_options.h>
76#include <netinet/igmp.h>
77#include <netinet/igmp_var.h>
78
79#include <machine/in_cksum.h>
80
81#include <security/mac/mac_framework.h>
82
83#ifndef KTR_IGMPV3
84#define KTR_IGMPV3 KTR_INET
85#endif
86
87static struct igmp_ifinfo *
88		igi_alloc_locked(struct ifnet *);
89static void	igi_delete_locked(const struct ifnet *);
90static void	igmp_dispatch_queue(struct ifqueue *, int, const int);
91static void	igmp_fasttimo_vnet(void);
92static void	igmp_final_leave(struct in_multi *, struct igmp_ifinfo *);
93static int	igmp_handle_state_change(struct in_multi *,
94		    struct igmp_ifinfo *);
95static int	igmp_initial_join(struct in_multi *, struct igmp_ifinfo *);
96static int	igmp_input_v1_query(struct ifnet *, const struct ip *,
97		    const struct igmp *);
98static int	igmp_input_v2_query(struct ifnet *, const struct ip *,
99		    const struct igmp *);
100static int	igmp_input_v3_query(struct ifnet *, const struct ip *,
101		    /*const*/ struct igmpv3 *);
102static int	igmp_input_v3_group_query(struct in_multi *,
103		    struct igmp_ifinfo *, int, /*const*/ struct igmpv3 *);
104static int	igmp_input_v1_report(struct ifnet *, /*const*/ struct ip *,
105		    /*const*/ struct igmp *);
106static int	igmp_input_v2_report(struct ifnet *, /*const*/ struct ip *,
107		    /*const*/ struct igmp *);
108static void	igmp_intr(struct mbuf *);
109static int	igmp_isgroupreported(const struct in_addr);
110static struct mbuf *
111		igmp_ra_alloc(void);
112#ifdef KTR
113static char *	igmp_rec_type_to_str(const int);
114#endif
115static void	igmp_set_version(struct igmp_ifinfo *, const int);
116static void	igmp_slowtimo_vnet(void);
117static void	igmp_sysinit(void);
118static int	igmp_v1v2_queue_report(struct in_multi *, const int);
119static void	igmp_v1v2_process_group_timer(struct in_multi *, const int);
120static void	igmp_v1v2_process_querier_timers(struct igmp_ifinfo *);
121static void	igmp_v2_update_group(struct in_multi *, const int);
122static void	igmp_v3_cancel_link_timers(struct igmp_ifinfo *);
123static void	igmp_v3_dispatch_general_query(struct igmp_ifinfo *);
124static struct mbuf *
125		igmp_v3_encap_report(struct ifnet *, struct mbuf *);
126static int	igmp_v3_enqueue_group_record(struct ifqueue *,
127		    struct in_multi *, const int, const int, const int);
128static int	igmp_v3_enqueue_filter_change(struct ifqueue *,
129		    struct in_multi *);
130static void	igmp_v3_process_group_timers(struct igmp_ifinfo *,
131		    struct ifqueue *, struct ifqueue *, struct in_multi *,
132		    const int);
133static int	igmp_v3_merge_state_changes(struct in_multi *,
134		    struct ifqueue *);
135static void	igmp_v3_suppress_group_record(struct in_multi *);
136static int	sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS);
137static int	sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS);
138static int	sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS);
139
140static vnet_attach_fn	vnet_igmp_iattach;
141static vnet_detach_fn	vnet_igmp_idetach;
142
143static const struct netisr_handler igmp_nh = {
144	.nh_name = "igmp",
145	.nh_handler = igmp_intr,
146	.nh_proto = NETISR_IGMP,
147	.nh_policy = NETISR_POLICY_SOURCE,
148};
149
150/*
151 * System-wide globals.
152 *
153 * Unlocked access to these is OK, except for the global IGMP output
154 * queue. The IGMP subsystem lock ends up being system-wide for the moment,
155 * because all VIMAGEs have to share a global output queue, as netisrs
156 * themselves are not virtualized.
157 *
158 * Locking:
159 *  * The permitted lock order is: IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
160 *    Any may be taken independently; if any are held at the same
161 *    time, the above lock order must be followed.
162 *  * All output is delegated to the netisr.
163 *    Now that Giant has been eliminated, the netisr may be inlined.
164 *  * IN_MULTI_LOCK covers in_multi.
165 *  * IGMP_LOCK covers igmp_ifinfo and any global variables in this file,
166 *    including the output queue.
167 *  * IF_ADDR_LOCK covers if_multiaddrs, which is used for a variety of
168 *    per-link state iterators.
169 *  * igmp_ifinfo is valid as long as PF_INET is attached to the interface,
170 *    therefore it is not refcounted.
171 *    We allow unlocked reads of igmp_ifinfo when accessed via in_multi.
172 *
173 * Reference counting
174 *  * IGMP acquires its own reference every time an in_multi is passed to
175 *    it and the group is being joined for the first time.
176 *  * IGMP releases its reference(s) on in_multi in a deferred way,
177 *    because the operations which process the release run as part of
178 *    a loop whose control variables are directly affected by the release
179 *    (that, and not recursing on the IF_ADDR_LOCK).
180 *
181 * VIMAGE: Each in_multi corresponds to an ifp, and each ifp corresponds
182 * to a vnet in ifp->if_vnet.
183 *
184 * SMPng: XXX We may potentially race operations on ifma_protospec.
185 * The problem is that we currently lack a clean way of taking the
186 * IF_ADDR_LOCK() between the ifnet and in layers w/o recursing,
187 * as anything which modifies ifma needs to be covered by that lock.
188 * So check for ifma_protospec being NULL before proceeding.
189 */
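/*
 * Example (a sketch mirroring igmp_input_v1_query() below): when all
 * three locks are needed, they are taken in the documented order.
 *
 *	IN_MULTI_LOCK();
 *	IGMP_LOCK();
 *	IF_ADDR_LOCK(ifp);
 *	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 *		...
 *	}
 *	IF_ADDR_UNLOCK(ifp);
 *	IGMP_UNLOCK();
 *	IN_MULTI_UNLOCK();
 */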
190struct mtx		 igmp_mtx;
191
192struct mbuf		*m_raopt;		 /* Router Alert option */
193MALLOC_DEFINE(M_IGMP, "igmp", "igmp state");
194
195/*
196 * VIMAGE-wide globals.
197 *
198 * The IGMPv3 timers themselves need to run per-image; however,
199 * protosw timers run globally (see tcp).
200 * An ifnet can only be in one vimage at a time, and the loopback
201 * ifnet, loif, is itself virtualized.
202 * It would otherwise be possible to seriously hose IGMP state,
203 * and create inconsistencies in upstream multicast routing, if you have
204 * multiple VIMAGEs running on the same link joining different multicast
205 * groups, UNLESS the "primary IP address" is different. This is because
206 * IGMP for IPv4 does not force link-local addresses to be used for each
207 * node, unlike MLD for IPv6.
208 * Obviously the IGMPv3 per-interface state has per-vimage granularity
209 * also as a result.
210 *
211 * FUTURE: Stop using IFP_TO_IA/INADDR_ANY, and use source address selection
212 * policy to control the address used by IGMP on the link.
213 */
214static VNET_DEFINE(int, interface_timers_running);	/* IGMPv3 general
215							 * query response */
216static VNET_DEFINE(int, state_change_timers_running);	/* IGMPv3 state-change
217							 * retransmit */
218static VNET_DEFINE(int, current_state_timers_running);	/* IGMPv1/v2 host
219							 * report; IGMPv3 g/sg
220							 * query response */
221
222#define	V_interface_timers_running	VNET(interface_timers_running)
223#define	V_state_change_timers_running	VNET(state_change_timers_running)
224#define	V_current_state_timers_running	VNET(current_state_timers_running)
225
226static VNET_DEFINE(LIST_HEAD(, igmp_ifinfo), igi_head);
227static VNET_DEFINE(struct igmpstat, igmpstat);
228static VNET_DEFINE(struct timeval, igmp_gsrdelay) = {10, 0};
229
230#define	V_igi_head			VNET(igi_head)
231#define	V_igmpstat			VNET(igmpstat)
232#define	V_igmp_gsrdelay			VNET(igmp_gsrdelay)
233
234static VNET_DEFINE(int, igmp_recvifkludge) = 1;
235static VNET_DEFINE(int, igmp_sendra) = 1;
236static VNET_DEFINE(int, igmp_sendlocal) = 1;
237static VNET_DEFINE(int, igmp_v1enable) = 1;
238static VNET_DEFINE(int, igmp_v2enable) = 1;
239static VNET_DEFINE(int, igmp_legacysupp);
240static VNET_DEFINE(int, igmp_default_version) = IGMP_VERSION_3;
241
242#define	V_igmp_recvifkludge		VNET(igmp_recvifkludge)
243#define	V_igmp_sendra			VNET(igmp_sendra)
244#define	V_igmp_sendlocal		VNET(igmp_sendlocal)
245#define	V_igmp_v1enable			VNET(igmp_v1enable)
246#define	V_igmp_v2enable			VNET(igmp_v2enable)
247#define	V_igmp_legacysupp		VNET(igmp_legacysupp)
248#define	V_igmp_default_version		VNET(igmp_default_version)
249
250/*
251 * Virtualized sysctls.
252 */
253SYSCTL_VNET_STRUCT(_net_inet_igmp, IGMPCTL_STATS, stats, CTLFLAG_RW,
254    &VNET_NAME(igmpstat), igmpstat, "");
255SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, recvifkludge, CTLFLAG_RW,
256    &VNET_NAME(igmp_recvifkludge), 0,
257    "Rewrite IGMPv1/v2 reports from 0.0.0.0 to contain subnet address");
258SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendra, CTLFLAG_RW,
259    &VNET_NAME(igmp_sendra), 0,
260    "Send IP Router Alert option in IGMPv2/v3 messages");
261SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, sendlocal, CTLFLAG_RW,
262    &VNET_NAME(igmp_sendlocal), 0,
263    "Send IGMP membership reports for 224.0.0.0/24 groups");
264SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v1enable, CTLFLAG_RW,
265    &VNET_NAME(igmp_v1enable), 0,
266    "Enable backwards compatibility with IGMPv1");
267SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, v2enable, CTLFLAG_RW,
268    &VNET_NAME(igmp_v2enable), 0,
269    "Enable backwards compatibility with IGMPv2");
270SYSCTL_VNET_INT(_net_inet_igmp, OID_AUTO, legacysupp, CTLFLAG_RW,
271    &VNET_NAME(igmp_legacysupp), 0,
272    "Allow v1/v2 reports to suppress v3 group responses");
273SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, default_version,
274    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
275    &VNET_NAME(igmp_default_version), 0, sysctl_igmp_default_version, "I",
276    "Default version of IGMP to run on each interface");
277SYSCTL_VNET_PROC(_net_inet_igmp, OID_AUTO, gsrdelay,
278    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
279    &VNET_NAME(igmp_gsrdelay.tv_sec), 0, sysctl_igmp_gsr, "I",
280    "Rate limit for IGMPv3 Group-and-Source queries in seconds");
281
282/*
283 * Non-virtualized sysctls.
284 */
285SYSCTL_NODE(_net_inet_igmp, OID_AUTO, ifinfo, CTLFLAG_RD | CTLFLAG_MPSAFE,
286    sysctl_igmp_ifinfo, "Per-interface IGMPv3 state");
287
288static __inline void
289igmp_save_context(struct mbuf *m, struct ifnet *ifp)
290{
291
292#ifdef VIMAGE
293	m->m_pkthdr.header = ifp->if_vnet;
294#endif /* VIMAGE */
295	m->m_pkthdr.flowid = ifp->if_index;
296}
297
298static __inline void
299igmp_scrub_context(struct mbuf *m)
300{
301
302	m->m_pkthdr.header = NULL;
303	m->m_pkthdr.flowid = 0;
304}
305
306#ifdef KTR
307static __inline char *
308inet_ntoa_haddr(in_addr_t haddr)
309{
310	struct in_addr ia;
311
312	ia.s_addr = htonl(haddr);
313	return (inet_ntoa(ia));
314}
315#endif
316
317/*
318 * Restore context from a queued IGMP output chain.
319 * Return saved ifindex.
320 *
321 * VIMAGE: The assertion is there to make sure that we
322 * actually called CURVNET_SET() with what's in the mbuf chain.
323 */
324static __inline uint32_t
325igmp_restore_context(struct mbuf *m)
326{
327
328#ifdef notyet
329#if defined(VIMAGE) && defined(INVARIANTS)
330	KASSERT(curvnet == (m->m_pkthdr.header),
331	    ("%s: called when curvnet was not restored", __func__));
332#endif
333#endif
334	return (m->m_pkthdr.flowid);
335}
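
/*
 * Illustrative pairing (a sketch only, not an additional code path):
 * the save/restore helpers bracket the netisr hand-off.  The transmit
 * side stashes the vnet and ifindex before enqueueing:
 *
 *	igmp_save_context(m, ifp);
 *	netisr_dispatch(NETISR_IGMP, m);
 *
 * and igmp_intr() later recovers the ifindex:
 *
 *	ifindex = igmp_restore_context(m);
 *	ifp = ifnet_byindex(ifindex);
 */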
336
337/*
338 * Retrieve or set default IGMP version.
339 *
340 * VIMAGE: Assume curvnet set by caller.
341 * SMPng: NOTE: Serialized by IGMP lock.
342 */
343static int
344sysctl_igmp_default_version(SYSCTL_HANDLER_ARGS)
345{
346	int	 error;
347	int	 new;
348
349	error = sysctl_wire_old_buffer(req, sizeof(int));
350	if (error)
351		return (error);
352
353	IGMP_LOCK();
354
355	new = V_igmp_default_version;
356
357	error = sysctl_handle_int(oidp, &new, 0, req);
358	if (error || !req->newptr)
359		goto out_locked;
360
361	if (new < IGMP_VERSION_1 || new > IGMP_VERSION_3) {
362		error = EINVAL;
363		goto out_locked;
364	}
365
366	CTR2(KTR_IGMPV3, "change igmp_default_version from %d to %d",
367	     V_igmp_default_version, new);
368
369	V_igmp_default_version = new;
370
371out_locked:
372	IGMP_UNLOCK();
373	return (error);
374}
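
/*
 * Example usage (userland, illustrative only): the handler above backs
 * the net.inet.igmp.default_version MIB entry, so an administrator can
 * force newly attached interfaces into IGMPv2 compatibility mode with:
 *
 *	# sysctl net.inet.igmp.default_version=2
 *
 * Values outside IGMP_VERSION_1..IGMP_VERSION_3 are rejected (EINVAL).
 */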
375
376/*
377 * Retrieve or set threshold between group-source queries in seconds.
378 *
379 * VIMAGE: Assume curvnet set by caller.
380 * SMPng: NOTE: Serialized by IGMP lock.
381 */
382static int
383sysctl_igmp_gsr(SYSCTL_HANDLER_ARGS)
384{
385	int error;
386	int i;
387
388	error = sysctl_wire_old_buffer(req, sizeof(int));
389	if (error)
390		return (error);
391
392	IGMP_LOCK();
393
394	i = V_igmp_gsrdelay.tv_sec;
395
396	error = sysctl_handle_int(oidp, &i, 0, req);
397	if (error || !req->newptr)
398		goto out_locked;
399
400	if (i < -1 || i >= 60) {
401		error = EINVAL;
402		goto out_locked;
403	}
404
405	CTR2(KTR_IGMPV3, "change igmp_gsrdelay from %d to %d",
406	     V_igmp_gsrdelay.tv_sec, i);
407	V_igmp_gsrdelay.tv_sec = i;
408
409out_locked:
410	IGMP_UNLOCK();
411	return (error);
412}
413
414/*
415 * Expose struct igmp_ifinfo to userland, keyed by ifindex.
416 * For use by ifmcstat(8).
417 *
418 * SMPng: NOTE: Does an unlocked ifindex space read.
419 * VIMAGE: Assume curvnet set by caller. The node handler itself
420 * is not directly virtualized.
421 */
422static int
423sysctl_igmp_ifinfo(SYSCTL_HANDLER_ARGS)
424{
425	int			*name;
426	int			 error;
427	u_int			 namelen;
428	struct ifnet		*ifp;
429	struct igmp_ifinfo	*igi;
430
431	name = (int *)arg1;
432	namelen = arg2;
433
434	if (req->newptr != NULL)
435		return (EPERM);
436
437	if (namelen != 1)
438		return (EINVAL);
439
440	error = sysctl_wire_old_buffer(req, sizeof(struct igmp_ifinfo));
441	if (error)
442		return (error);
443
444	IN_MULTI_LOCK();
445	IGMP_LOCK();
446
447	if (name[0] <= 0 || name[0] > V_if_index) {
448		error = ENOENT;
449		goto out_locked;
450	}
451
452	error = ENOENT;
453
454	ifp = ifnet_byindex(name[0]);
455	if (ifp == NULL)
456		goto out_locked;
457
458	LIST_FOREACH(igi, &V_igi_head, igi_link) {
459		if (ifp == igi->igi_ifp) {
460			error = SYSCTL_OUT(req, igi,
461			    sizeof(struct igmp_ifinfo));
462			break;
463		}
464	}
465
466out_locked:
467	IGMP_UNLOCK();
468	IN_MULTI_UNLOCK();
469	return (error);
470}
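
/*
 * Example (a hedged sketch of the userland side, roughly what
 * ifmcstat(8) might do): the node takes one extra name component, the
 * ifindex, and returns a raw struct igmp_ifinfo.  "em0" below is just
 * a placeholder interface name.
 *
 *	int mib[5];
 *	size_t miblen = 5, len;
 *	struct igmp_ifinfo igi;
 *
 *	if (sysctlnametomib("net.inet.igmp.ifinfo", mib, &miblen) == -1)
 *		err(1, "sysctlnametomib");	// miblen is now 4
 *	mib[miblen] = if_nametoindex("em0");
 *	len = sizeof(igi);
 *	if (sysctl(mib, miblen + 1, &igi, &len, NULL, 0) == -1)
 *		err(1, "sysctl");
 */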
471
472/*
473 * Dispatch an entire queue of pending packet chains
474 * using the netisr.
475 * VIMAGE: Assumes the vnet pointer has been set.
476 */
477static void
478igmp_dispatch_queue(struct ifqueue *ifq, int limit, const int loop)
479{
480	struct mbuf *m;
481
482	for (;;) {
483		_IF_DEQUEUE(ifq, m);
484		if (m == NULL)
485			break;
486		CTR3(KTR_IGMPV3, "%s: dispatch %p from %p", __func__, m, ifq);
487		if (loop)
488			m->m_flags |= M_IGMP_LOOP;
489		netisr_dispatch(NETISR_IGMP, m);
490		if (--limit == 0)
491			break;
492	}
493}
494
495/*
496 * Filter outgoing IGMP report state by group.
497 *
498 * Reports are ALWAYS suppressed for ALL-HOSTS (224.0.0.1).
499 * If the net.inet.igmp.sendlocal sysctl is 0, then IGMP reports are
500 * disabled for all groups in the 224.0.0.0/24 link-local scope. However,
501 * this may break certain IGMP snooping switches which rely on the old
502 * report behaviour.
503 *
504 * Return zero if the given group is one for which IGMP reports
505 * should be suppressed, or non-zero if reports should be issued.
506 */
507static __inline int
508igmp_isgroupreported(const struct in_addr addr)
509{
510
511	if (in_allhosts(addr) ||
512	    (!V_igmp_sendlocal && IN_LOCAL_GROUP(ntohl(addr.s_addr))))
513		return (0);
514
515	return (1);
516}
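
/*
 * Examples: reports are never issued for 224.0.0.1; with
 * net.inet.igmp.sendlocal set to 0, a 224.0.0.0/24 group such as
 * 224.0.0.22 is likewise suppressed, while an ordinary group such as
 * 239.1.1.1 is always reported.
 */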
517
518/*
519 * Construct a Router Alert option to use in outgoing packets.
520 */
521static struct mbuf *
522igmp_ra_alloc(void)
523{
524	struct mbuf	*m;
525	struct ipoption	*p;
526
527	MGET(m, M_WAIT, MT_DATA);	/* M_WAIT: sleep rather than fail */
528	p = mtod(m, struct ipoption *);
529	p->ipopt_dst.s_addr = INADDR_ANY;
530	p->ipopt_list[0] = IPOPT_RA;	/* Router Alert Option */
531	p->ipopt_list[1] = 0x04;	/* 4 bytes long */
532	p->ipopt_list[2] = IPOPT_EOL;	/* End of IP option list */
533	p->ipopt_list[3] = 0x00;	/* pad byte */
534	m->m_len = sizeof(p->ipopt_dst) + p->ipopt_list[1];
535
536	return (m);
537}
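
/*
 * The mbuf built above carries the RFC 2113 Router Alert option, whose
 * wire encoding is the four octets 0x94 0x04 0x00 0x00: type 148
 * (copied flag set, option class 0, option number 20), length 4, and a
 * two-octet value of zero.  Writing IPOPT_EOL and 0x00 as the value
 * octets yields exactly that required zero value.
 */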
538
539/*
540 * Attach IGMP when PF_INET is attached to an interface.
541 */
542struct igmp_ifinfo *
543igmp_domifattach(struct ifnet *ifp)
544{
545	struct igmp_ifinfo *igi;
546
547	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
548	    __func__, ifp, ifp->if_xname);
549
550	IGMP_LOCK();
551
552	igi = igi_alloc_locked(ifp);
553	if (!(ifp->if_flags & IFF_MULTICAST))
554		igi->igi_flags |= IGIF_SILENT;
555
556	IGMP_UNLOCK();
557
558	return (igi);
559}
560
561/*
562 * VIMAGE: assume curvnet set by caller.
563 */
564static struct igmp_ifinfo *
565igi_alloc_locked(/*const*/ struct ifnet *ifp)
566{
567	struct igmp_ifinfo *igi;
568
569	IGMP_LOCK_ASSERT();
570
571	igi = malloc(sizeof(struct igmp_ifinfo), M_IGMP, M_NOWAIT|M_ZERO);
572	if (igi == NULL)
573		goto out;
574
575	igi->igi_ifp = ifp;
576	igi->igi_version = V_igmp_default_version;
577	igi->igi_flags = 0;
578	igi->igi_rv = IGMP_RV_INIT;
579	igi->igi_qi = IGMP_QI_INIT;
580	igi->igi_qri = IGMP_QRI_INIT;
581	igi->igi_uri = IGMP_URI_INIT;
582
583	SLIST_INIT(&igi->igi_relinmhead);
584
585	/*
586	 * Responses to general queries are subject to bounds.
587	 */
588	IFQ_SET_MAXLEN(&igi->igi_gq, IGMP_MAX_RESPONSE_PACKETS);
589
590	LIST_INSERT_HEAD(&V_igi_head, igi, igi_link);
591
592	CTR2(KTR_IGMPV3, "allocate igmp_ifinfo for ifp %p(%s)",
593	     ifp, ifp->if_xname);
594
595out:
596	return (igi);
597}
598
599/*
600 * Hook for ifdetach.
601 *
602 * NOTE: Some finalization tasks need to run before the protocol domain
603 * is detached, but also before the link layer does its cleanup.
604 *
605 * SMPNG: igmp_ifdetach() needs to take IF_ADDR_LOCK().
606 * XXX This is also bitten by unlocked ifma_protospec access.
607 */
608void
609igmp_ifdetach(struct ifnet *ifp)
610{
611	struct igmp_ifinfo	*igi;
612	struct ifmultiaddr	*ifma;
613	struct in_multi		*inm, *tinm;
614
615	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)", __func__, ifp,
616	    ifp->if_xname);
617
618	IGMP_LOCK();
619
620	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
621	if (igi->igi_version == IGMP_VERSION_3) {
622		IF_ADDR_LOCK(ifp);
623		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
624			if (ifma->ifma_addr->sa_family != AF_INET ||
625			    ifma->ifma_protospec == NULL)
626				continue;
627#if 0
628			KASSERT(ifma->ifma_protospec != NULL,
629			    ("%s: ifma_protospec is NULL", __func__));
630#endif
631			inm = (struct in_multi *)ifma->ifma_protospec;
632			if (inm->inm_state == IGMP_LEAVING_MEMBER) {
633				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
634				    inm, inm_nrele);
635			}
636			inm_clear_recorded(inm);
637		}
638		IF_ADDR_UNLOCK(ifp);
639		/*
640		 * Free the in_multi reference(s) for this IGMP lifecycle.
641		 */
642		SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead, inm_nrele,
643		    tinm) {
644			SLIST_REMOVE_HEAD(&igi->igi_relinmhead, inm_nrele);
645			inm_release_locked(inm);
646		}
647	}
648
649	IGMP_UNLOCK();
650}
651
652/*
653 * Hook for domifdetach.
654 */
655void
656igmp_domifdetach(struct ifnet *ifp)
657{
658	struct igmp_ifinfo *igi;
659
660	CTR3(KTR_IGMPV3, "%s: called for ifp %p(%s)",
661	    __func__, ifp, ifp->if_xname);
662
663	IGMP_LOCK();
664
665	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
666	igi_delete_locked(ifp);
667
668	IGMP_UNLOCK();
669}
670
671static void
672igi_delete_locked(const struct ifnet *ifp)
673{
674	struct igmp_ifinfo *igi, *tigi;
675
676	CTR3(KTR_IGMPV3, "%s: freeing igmp_ifinfo for ifp %p(%s)",
677	    __func__, ifp, ifp->if_xname);
678
679	IGMP_LOCK_ASSERT();
680
681	LIST_FOREACH_SAFE(igi, &V_igi_head, igi_link, tigi) {
682		if (igi->igi_ifp == ifp) {
683			/*
684			 * Free deferred General Query responses.
685			 */
686			_IF_DRAIN(&igi->igi_gq);
687
688			LIST_REMOVE(igi, igi_link);
689
690			KASSERT(SLIST_EMPTY(&igi->igi_relinmhead),
691			    ("%s: there are dangling in_multi references",
692			    __func__));
693
694			free(igi, M_IGMP);
695			return;
696		}
697	}
698
699#ifdef INVARIANTS
700	panic("%s: igmp_ifinfo not found for ifp %p\n", __func__,  ifp);
701#endif
702}
703
704/*
705 * Process a received IGMPv1 query.
706 * Return non-zero if the message should be dropped.
707 *
708 * VIMAGE: The curvnet pointer is derived from the input ifp.
709 */
710static int
711igmp_input_v1_query(struct ifnet *ifp, const struct ip *ip,
712    const struct igmp *igmp)
713{
714	struct ifmultiaddr	*ifma;
715	struct igmp_ifinfo	*igi;
716	struct in_multi		*inm;
717
718	/*
719	 * IGMPv1 Host Membership Queries SHOULD always be addressed to
720	 * 224.0.0.1. They are always treated as General Queries.
721	 * igmp_group is always ignored. Do not drop it as a userland
722	 * daemon may wish to see it.
723	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
724	 */
725	if (!in_allhosts(ip->ip_dst) || !in_nullhost(igmp->igmp_group)) {
726		IGMPSTAT_INC(igps_rcv_badqueries);
727		return (0);
728	}
729	IGMPSTAT_INC(igps_rcv_gen_queries);
730
731	IN_MULTI_LOCK();
732	IGMP_LOCK();
733
734	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
735	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
736
737	if (igi->igi_flags & IGIF_LOOPBACK) {
738		CTR2(KTR_IGMPV3, "ignore v1 query on IGIF_LOOPBACK ifp %p(%s)",
739		    ifp, ifp->if_xname);
740		goto out_locked;
741	}
742
743	/*
744	 * Switch to IGMPv1 host compatibility mode.
745	 */
746	igmp_set_version(igi, IGMP_VERSION_1);
747
748	CTR2(KTR_IGMPV3, "process v1 query on ifp %p(%s)", ifp, ifp->if_xname);
749
750	/*
751	 * Start the timers in all of our group records
752	 * for the interface on which the query arrived,
753	 * except those which are already running.
754	 */
755	IF_ADDR_LOCK(ifp);
756	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
757		if (ifma->ifma_addr->sa_family != AF_INET ||
758		    ifma->ifma_protospec == NULL)
759			continue;
760		inm = (struct in_multi *)ifma->ifma_protospec;
761		if (inm->inm_timer != 0)
762			continue;
763		switch (inm->inm_state) {
764		case IGMP_NOT_MEMBER:
765		case IGMP_SILENT_MEMBER:
766			break;
767		case IGMP_G_QUERY_PENDING_MEMBER:
768		case IGMP_SG_QUERY_PENDING_MEMBER:
769		case IGMP_REPORTING_MEMBER:
770		case IGMP_IDLE_MEMBER:
771		case IGMP_LAZY_MEMBER:
772		case IGMP_SLEEPING_MEMBER:
773		case IGMP_AWAKENING_MEMBER:
774			inm->inm_state = IGMP_REPORTING_MEMBER;
775			inm->inm_timer = IGMP_RANDOM_DELAY(
776			    IGMP_V1V2_MAX_RI * PR_FASTHZ);
777			V_current_state_timers_running = 1;
778			break;
779		case IGMP_LEAVING_MEMBER:
780			break;
781		}
782	}
783	IF_ADDR_UNLOCK(ifp);
784
785out_locked:
786	IGMP_UNLOCK();
787	IN_MULTI_UNLOCK();
788
789	return (0);
790}
791
792/*
793 * Process a received IGMPv2 general or group-specific query.
794 */
795static int
796igmp_input_v2_query(struct ifnet *ifp, const struct ip *ip,
797    const struct igmp *igmp)
798{
799	struct ifmultiaddr	*ifma;
800	struct igmp_ifinfo	*igi;
801	struct in_multi		*inm;
802	int			 is_general_query;
803	uint16_t		 timer;
804
805	is_general_query = 0;
806
807	/*
808	 * Validate address fields upfront.
809	 * XXX SMPng: unlocked increments in igmpstat assumed atomic.
810	 */
811	if (in_nullhost(igmp->igmp_group)) {
812		/*
813		 * IGMPv2 General Query.
814		 * If this was not sent to the all-hosts group, ignore it.
815		 */
816		if (!in_allhosts(ip->ip_dst))
817			return (0);
818		IGMPSTAT_INC(igps_rcv_gen_queries);
819		is_general_query = 1;
820	} else {
821		/* IGMPv2 Group-Specific Query. */
822		IGMPSTAT_INC(igps_rcv_group_queries);
823	}
824
825	IN_MULTI_LOCK();
826	IGMP_LOCK();
827
828	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
829	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
830
831	if (igi->igi_flags & IGIF_LOOPBACK) {
832		CTR2(KTR_IGMPV3, "ignore v2 query on IGIF_LOOPBACK ifp %p(%s)",
833		    ifp, ifp->if_xname);
834		goto out_locked;
835	}
836
837	/*
838	 * Ignore v2 query if in v1 Compatibility Mode.
839	 */
840	if (igi->igi_version == IGMP_VERSION_1)
841		goto out_locked;
842
843	igmp_set_version(igi, IGMP_VERSION_2);
844
845	timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
846	if (timer == 0)
847		timer = 1;
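
	/*
	 * Worked example (assuming PR_FASTHZ == 5 ticks/s and
	 * IGMP_TIMER_SCALE == 10): an igmp_code of 100, i.e. a Max
	 * Response Time of 10.0 seconds, gives timer = 100 * 5 / 10 =
	 * 50 fast-timeout ticks, which elapses in 10 seconds.
	 */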
848
849	if (is_general_query) {
850		/*
851		 * For each reporting group joined on this
852		 * interface, kick the report timer.
853		 */
854		CTR2(KTR_IGMPV3, "process v2 general query on ifp %p(%s)",
855		    ifp, ifp->if_xname);
856		IF_ADDR_LOCK(ifp);
857		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
858			if (ifma->ifma_addr->sa_family != AF_INET ||
859			    ifma->ifma_protospec == NULL)
860				continue;
861			inm = (struct in_multi *)ifma->ifma_protospec;
862			igmp_v2_update_group(inm, timer);
863		}
864		IF_ADDR_UNLOCK(ifp);
865	} else {
866		/*
867		 * Group-specific IGMPv2 query, we need only
868		 * look up the single group to process it.
869		 */
870		inm = inm_lookup(ifp, igmp->igmp_group);
871		if (inm != NULL) {
872			CTR3(KTR_IGMPV3, "process v2 query %s on ifp %p(%s)",
873			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
874			igmp_v2_update_group(inm, timer);
875		}
876	}
877
878out_locked:
879	IGMP_UNLOCK();
880	IN_MULTI_UNLOCK();
881
882	return (0);
883}
884
885/*
886 * Update the report timer on a group in response to an IGMPv2 query.
887 *
888 * If we are becoming the reporting member for this group, start the timer.
889 * If we already are the reporting member for this group, and timer is
890 * below the threshold, reset it.
891 *
892 * We may be updating the group for the first time since we switched
893 * to IGMPv3. If we are, then we must clear any recorded source lists,
894 * and transition to REPORTING state; the group timer is overloaded
895 * for group and group-source query responses.
896 *
897 * Unlike IGMPv3, the delay per group should be jittered
898 * to avoid bursts of IGMPv2 reports.
899 */
900static void
901igmp_v2_update_group(struct in_multi *inm, const int timer)
902{
903
904	CTR4(KTR_IGMPV3, "%s: %s/%s timer=%d", __func__,
905	    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname, timer);
906
907	IN_MULTI_LOCK_ASSERT();
908
909	switch (inm->inm_state) {
910	case IGMP_NOT_MEMBER:
911	case IGMP_SILENT_MEMBER:
912		break;
913	case IGMP_REPORTING_MEMBER:
914		if (inm->inm_timer != 0 &&
915		    inm->inm_timer <= timer) {
916			CTR1(KTR_IGMPV3, "%s: REPORTING and timer running, "
917			    "skipping.", __func__);
918			break;
919		}
920		/* FALLTHROUGH */
921	case IGMP_SG_QUERY_PENDING_MEMBER:
922	case IGMP_G_QUERY_PENDING_MEMBER:
923	case IGMP_IDLE_MEMBER:
924	case IGMP_LAZY_MEMBER:
925	case IGMP_AWAKENING_MEMBER:
926		CTR1(KTR_IGMPV3, "%s: ->REPORTING", __func__);
927		inm->inm_state = IGMP_REPORTING_MEMBER;
928		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
929		V_current_state_timers_running = 1;
930		break;
931	case IGMP_SLEEPING_MEMBER:
932		CTR1(KTR_IGMPV3, "%s: ->AWAKENING", __func__);
933		inm->inm_state = IGMP_AWAKENING_MEMBER;
934		break;
935	case IGMP_LEAVING_MEMBER:
936		break;
937	}
938}
939
940/*
941 * Process a received IGMPv3 general, group-specific or
942 * group-and-source-specific query.
943 * Assumes m has already been pulled up to the full IGMP message length.
944 * Return 0 if successful, otherwise an appropriate error code is returned.
945 */
946static int
947igmp_input_v3_query(struct ifnet *ifp, const struct ip *ip,
948    /*const*/ struct igmpv3 *igmpv3)
949{
950	struct igmp_ifinfo	*igi;
951	struct in_multi		*inm;
952	int			 is_general_query;
953	uint32_t		 maxresp, nsrc, qqi;
954	uint16_t		 timer;
955	uint8_t			 qrv;
956
957	is_general_query = 0;
958
959	CTR2(KTR_IGMPV3, "process v3 query on ifp %p(%s)", ifp, ifp->if_xname);
960
961	maxresp = igmpv3->igmp_code;	/* in 1/10ths of a second */
962	if (maxresp >= 128) {
963		maxresp = IGMP_MANT(igmpv3->igmp_code) <<
964			  (IGMP_EXP(igmpv3->igmp_code) + 3);
965	}
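
	/*
	 * Per RFC 3376, Section 4.1.1, codes >= 128 are fixed-point
	 * floats: maxresp = (mant | 0x10) << (exp + 3), with exp in
	 * bits 4-6 and mant in bits 0-3 of the code; the IGMP_MANT()/
	 * IGMP_EXP() macros are assumed to supply this decode.  For
	 * example, code 0x80 (exp 0, mant 0) yields 0x10 << 3 = 128
	 * tenths of a second, continuous with the linear range below
	 * 128.
	 */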
966
967	/*
968	 * Robustness must never be less than 2 for on-wire IGMPv3.
969	 * FUTURE: Check if ifp has IGIF_LOOPBACK set, as we will make
970	 * an exception for interfaces whose IGMPv3 state changes
971	 * are redirected to loopback (e.g. MANET).
972	 */
973	qrv = IGMP_QRV(igmpv3->igmp_misc);
974	if (qrv < 2) {
975		CTR3(KTR_IGMPV3, "%s: clamping qrv %d to %d", __func__,
976		    qrv, IGMP_RV_INIT);
977		qrv = IGMP_RV_INIT;
978	}
979
980	qqi = igmpv3->igmp_qqi;
981	if (qqi >= 128) {
982		qqi = IGMP_MANT(igmpv3->igmp_qqi) <<
983		     (IGMP_EXP(igmpv3->igmp_qqi) + 3);
984	}
985
986	timer = maxresp * PR_FASTHZ / IGMP_TIMER_SCALE;
987	if (timer == 0)
988		timer = 1;
989
990	nsrc = ntohs(igmpv3->igmp_numsrc);
991
992	/*
993	 * Validate address fields and versions upfront before
994	 * accepting v3 query.
995	 * XXX SMPng: Unlocked access to igmpstat counters here.
996	 */
997	if (in_nullhost(igmpv3->igmp_group)) {
998		/*
999		 * IGMPv3 General Query.
1000		 *
1001		 * General Queries SHOULD be directed to 224.0.0.1.
1002		 * A general query with a source list has undefined
1003		 * behaviour; discard it.
1004		 */
1005		IGMPSTAT_INC(igps_rcv_gen_queries);
1006		if (!in_allhosts(ip->ip_dst) || nsrc > 0) {
1007			IGMPSTAT_INC(igps_rcv_badqueries);
1008			return (0);
1009		}
1010		is_general_query = 1;
1011	} else {
1012		/* Group or group-source specific query. */
1013		if (nsrc == 0)
1014			IGMPSTAT_INC(igps_rcv_group_queries);
1015		else
1016			IGMPSTAT_INC(igps_rcv_gsr_queries);
1017	}
1018
1019	IN_MULTI_LOCK();
1020	IGMP_LOCK();
1021
1022	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
1023	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
1024
1025	if (igi->igi_flags & IGIF_LOOPBACK) {
1026		CTR2(KTR_IGMPV3, "ignore v3 query on IGIF_LOOPBACK ifp %p(%s)",
1027		    ifp, ifp->if_xname);
1028		goto out_locked;
1029	}
1030
1031	/*
1032	 * Discard the v3 query if we're in Compatibility Mode.
1033	 * The RFC is not clearly worded on this point, but hosts need to
1034	 * stay in compatibility mode until the Older Version Querier
1035	 * Present timer expires.
1036	 */
1037	if (igi->igi_version != IGMP_VERSION_3) {
1038		CTR3(KTR_IGMPV3, "ignore v3 query in v%d mode on ifp %p(%s)",
1039		    igi->igi_version, ifp, ifp->if_xname);
1040		goto out_locked;
1041	}
1042
1043	igmp_set_version(igi, IGMP_VERSION_3);
1044	igi->igi_rv = qrv;
1045	igi->igi_qi = qqi;
1046	igi->igi_qri = maxresp;
1047
1048	CTR4(KTR_IGMPV3, "%s: qrv %d qi %d qri %d", __func__, qrv, qqi,
1049	    maxresp);
1050
1051	if (is_general_query) {
1052		/*
1053		 * Schedule a current-state report on this ifp for
1054		 * all groups, possibly containing source lists.
1055		 * If there is a pending General Query response
1056		 * scheduled earlier than the selected delay, do
1057		 * not schedule any other reports.
1058		 * Otherwise, reset the interface timer.
1059		 */
1060		CTR2(KTR_IGMPV3, "process v3 general query on ifp %p(%s)",
1061		    ifp, ifp->if_xname);
1062		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer) {
1063			igi->igi_v3_timer = IGMP_RANDOM_DELAY(timer);
1064			V_interface_timers_running = 1;
1065		}
1066	} else {
1067		/*
1068		 * Group-source-specific queries are throttled on
1069		 * a per-group basis to defeat denial-of-service attempts.
1070		 * Queries for groups we are not a member of on this
1071		 * link are simply ignored.
1072		 */
1073		inm = inm_lookup(ifp, igmpv3->igmp_group);
1074		if (inm == NULL)
1075			goto out_locked;
1076		if (nsrc > 0) {
1077			if (!ratecheck(&inm->inm_lastgsrtv,
1078			    &V_igmp_gsrdelay)) {
1079				CTR1(KTR_IGMPV3, "%s: GS query throttled.",
1080				    __func__);
1081				IGMPSTAT_INC(igps_drop_gsr_queries);
1082				goto out_locked;
1083			}
1084		}
1085		CTR3(KTR_IGMPV3, "process v3 %s query on ifp %p(%s)",
1086		     inet_ntoa(igmpv3->igmp_group), ifp, ifp->if_xname);
1087		/*
1088		 * If there is a pending General Query response
1089		 * scheduled sooner than the selected delay, no
1090		 * further report need be scheduled.
1091		 * Otherwise, prepare to respond to the
1092		 * group-specific or group-and-source query.
1093		 */
1094		if (igi->igi_v3_timer == 0 || igi->igi_v3_timer >= timer)
1095			igmp_input_v3_group_query(inm, igi, timer, igmpv3);
1096	}
1097
1098out_locked:
1099	IGMP_UNLOCK();
1100	IN_MULTI_UNLOCK();
1101
1102	return (0);
1103}
1104
1105/*
1106 * Process a received IGMPv3 group-specific or group-and-source-specific
1107 * query.
1108 * Return <0 if any error occurred. Currently this is ignored.
1109 */
1110static int
1111igmp_input_v3_group_query(struct in_multi *inm, struct igmp_ifinfo *igi,
1112    int timer, /*const*/ struct igmpv3 *igmpv3)
1113{
1114	int			 retval;
1115	uint16_t		 nsrc;
1116
1117	IN_MULTI_LOCK_ASSERT();
1118	IGMP_LOCK_ASSERT();
1119
1120	retval = 0;
1121
1122	switch (inm->inm_state) {
1123	case IGMP_NOT_MEMBER:
1124	case IGMP_SILENT_MEMBER:
1125	case IGMP_SLEEPING_MEMBER:
1126	case IGMP_LAZY_MEMBER:
1127	case IGMP_AWAKENING_MEMBER:
1128	case IGMP_IDLE_MEMBER:
1129	case IGMP_LEAVING_MEMBER:
1130		return (retval);
1131		break;
1132	case IGMP_REPORTING_MEMBER:
1133	case IGMP_G_QUERY_PENDING_MEMBER:
1134	case IGMP_SG_QUERY_PENDING_MEMBER:
1135		break;
1136	}
1137
1138	nsrc = ntohs(igmpv3->igmp_numsrc);
1139
1140	/*
1141	 * Deal with group-specific queries upfront.
1142	 * If any group query is already pending, purge any recorded
1143	 * source-list state if it exists, and schedule a query response
1144	 * for this group-specific query.
1145	 */
1146	if (nsrc == 0) {
1147		if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
1148		    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER) {
1149			inm_clear_recorded(inm);
1150			timer = min(inm->inm_timer, timer);
1151		}
1152		inm->inm_state = IGMP_G_QUERY_PENDING_MEMBER;
1153		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1154		V_current_state_timers_running = 1;
1155		return (retval);
1156	}
1157
1158	/*
1159	 * Deal with the case where a group-and-source-specific query has
1160	 * been received but a group-specific query is already pending.
1161	 */
1162	if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER) {
1163		timer = min(inm->inm_timer, timer);
1164		inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1165		V_current_state_timers_running = 1;
1166		return (retval);
1167	}
1168
1169	/*
1170	 * Finally, deal with the case where a group-and-source-specific
1171	 * query has been received, where a response to a previous g-s-r
1172	 * query exists, or none exists.
1173	 * In this case, we need to parse the source-list which the Querier
1174	 * has provided us with and check if we have any source list filter
1175	 * entries at T1 for these sources. If we do not, there is no need
1176	 * to schedule a report and the query may be dropped.
1177	 * If we do, we must record them and schedule a current-state
1178	 * report for those sources.
1179	 * FIXME: Handling source lists larger than 1 mbuf requires that
1180	 * we pass the mbuf chain pointer down to this function, and use
1181	 * m_getptr() to walk the chain.
1182	 */
1183	if (inm->inm_nsrc > 0) {
1184		const struct in_addr	*ap;
1185		int			 i, nrecorded;
1186
1187		ap = (const struct in_addr *)(igmpv3 + 1);
1188		nrecorded = 0;
1189		for (i = 0; i < nsrc; i++, ap++) {
1190			retval = inm_record_source(inm, ap->s_addr);
1191			if (retval < 0)
1192				break;
1193			nrecorded += retval;
1194		}
1195		if (nrecorded > 0) {
1196			CTR1(KTR_IGMPV3,
1197			    "%s: schedule response to SG query", __func__);
1198			inm->inm_state = IGMP_SG_QUERY_PENDING_MEMBER;
1199			inm->inm_timer = IGMP_RANDOM_DELAY(timer);
1200			V_current_state_timers_running = 1;
1201		}
1202	}
1203
1204	return (retval);
1205}
1206
1207/*
1208 * Process a received IGMPv1 host membership report.
1209 *
1210 * NOTE: 0.0.0.0 workaround breaks const correctness.
1211 */
1212static int
1213igmp_input_v1_report(struct ifnet *ifp, /*const*/ struct ip *ip,
1214    /*const*/ struct igmp *igmp)
1215{
1216	struct in_ifaddr *ia;
1217	struct in_multi *inm;
1218
1219	IGMPSTAT_INC(igps_rcv_reports);
1220
1221	if (ifp->if_flags & IFF_LOOPBACK)
1222		return (0);
1223
1224	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
1225	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
1226		IGMPSTAT_INC(igps_rcv_badreports);
1227		return (EINVAL);
1228	}
1229
1230	/*
1231	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
1232	 * Booting clients may use the source address 0.0.0.0. Some
1233	 * IGMP daemons may not know how to use IP_RECVIF to determine
1234	 * the interface upon which this message was received.
1235	 * Replace 0.0.0.0 with the subnet address if told to do so.
1236	 */
1237	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
1238		IFP_TO_IA(ifp, ia);
1239		if (ia != NULL) {
1240			ip->ip_src.s_addr = htonl(ia->ia_subnet);
1241			ifa_free(&ia->ia_ifa);
1242		}
1243	}
1244
1245	CTR3(KTR_IGMPV3, "process v1 report %s on ifp %p(%s)",
1246	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1247
1248	/*
1249	 * IGMPv1 report suppression.
1250	 * If we are a member of this group, and our membership should be
1251	 * reported, stop our group timer and transition to the 'lazy' state.
1252	 */
1253	IN_MULTI_LOCK();
1254	inm = inm_lookup(ifp, igmp->igmp_group);
1255	if (inm != NULL) {
1256		struct igmp_ifinfo *igi;
1257
1258		igi = inm->inm_igi;
1259		if (igi == NULL) {
1260			KASSERT(igi != NULL,
1261			    ("%s: no igi for ifp %p", __func__, ifp));
1262			goto out_locked;
1263		}
1264
1265		IGMPSTAT_INC(igps_rcv_ourreports);
1266
1267		/*
1268		 * If we are in IGMPv3 host mode, do not allow the
1269		 * other host's IGMPv1 report to suppress our reports
1270		 * unless explicitly configured to do so.
1271		 */
1272		if (igi->igi_version == IGMP_VERSION_3) {
1273			if (V_igmp_legacysupp)
1274				igmp_v3_suppress_group_record(inm);
1275			goto out_locked;
1276		}
1277
1278		inm->inm_timer = 0;
1279
1280		switch (inm->inm_state) {
1281		case IGMP_NOT_MEMBER:
1282		case IGMP_SILENT_MEMBER:
1283			break;
1284		case IGMP_IDLE_MEMBER:
1285		case IGMP_LAZY_MEMBER:
1286		case IGMP_AWAKENING_MEMBER:
1287			CTR3(KTR_IGMPV3,
1288			    "report suppressed for %s on ifp %p(%s)",
1289			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
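			/* FALLTHROUGH */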
1290		case IGMP_SLEEPING_MEMBER:
1291			inm->inm_state = IGMP_SLEEPING_MEMBER;
1292			break;
1293		case IGMP_REPORTING_MEMBER:
1294			CTR3(KTR_IGMPV3,
1295			    "report suppressed for %s on ifp %p(%s)",
1296			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1297			if (igi->igi_version == IGMP_VERSION_1)
1298				inm->inm_state = IGMP_LAZY_MEMBER;
1299			else if (igi->igi_version == IGMP_VERSION_2)
1300				inm->inm_state = IGMP_SLEEPING_MEMBER;
1301			break;
1302		case IGMP_G_QUERY_PENDING_MEMBER:
1303		case IGMP_SG_QUERY_PENDING_MEMBER:
1304		case IGMP_LEAVING_MEMBER:
1305			break;
1306		}
1307	}
1308
1309out_locked:
1310	IN_MULTI_UNLOCK();
1311
1312	return (0);
1313}
1314
1315/*
1316 * Process a received IGMPv2 host membership report.
1317 *
1318 * NOTE: 0.0.0.0 workaround breaks const correctness.
1319 */
1320static int
1321igmp_input_v2_report(struct ifnet *ifp, /*const*/ struct ip *ip,
1322    /*const*/ struct igmp *igmp)
1323{
1324	struct in_ifaddr *ia;
1325	struct in_multi *inm;
1326
1327	/*
1328	 * Make sure we don't hear our own membership report.  Fast
1329	 * leave requires knowing that we are the only member of a
1330	 * group.
1331	 */
1332	IFP_TO_IA(ifp, ia);
1333	if (ia != NULL && in_hosteq(ip->ip_src, IA_SIN(ia)->sin_addr)) {
1334		ifa_free(&ia->ia_ifa);
1335		return (0);
1336	}
1337
1338	IGMPSTAT_INC(igps_rcv_reports);
1339
1340	if (ifp->if_flags & IFF_LOOPBACK) {
1341		if (ia != NULL)
1342			ifa_free(&ia->ia_ifa);
1343		return (0);
1344	}
1345
1346	if (!IN_MULTICAST(ntohl(igmp->igmp_group.s_addr)) ||
1347	    !in_hosteq(igmp->igmp_group, ip->ip_dst)) {
1348		if (ia != NULL)
1349			ifa_free(&ia->ia_ifa);
1350		IGMPSTAT_INC(igps_rcv_badreports);
1351		return (EINVAL);
1352	}
1353
1354	/*
1355	 * RFC 3376, Section 4.2.13, 9.2, 9.3:
1356	 * Booting clients may use the source address 0.0.0.0. Some
1357	 * IGMP daemons may not know how to use IP_RECVIF to determine
1358	 * the interface upon which this message was received.
1359	 * Replace 0.0.0.0 with the subnet address if told to do so.
1360	 */
1361	if (V_igmp_recvifkludge && in_nullhost(ip->ip_src)) {
1362		if (ia != NULL)
1363			ip->ip_src.s_addr = htonl(ia->ia_subnet);
1364	}
1365	if (ia != NULL)
1366		ifa_free(&ia->ia_ifa);
1367
1368	CTR3(KTR_IGMPV3, "process v2 report %s on ifp %p(%s)",
1369	     inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
1370
1371	/*
1372	 * IGMPv2 report suppression.
1373	 * If we are a member of this group, and our membership should be
1374	 * reported, and our group timer is pending or about to be reset,
1375	 * stop our group timer by transitioning to the 'lazy' state.
1376	 */
1377	IN_MULTI_LOCK();
1378	inm = inm_lookup(ifp, igmp->igmp_group);
1379	if (inm != NULL) {
1380		struct igmp_ifinfo *igi;
1381
1382		igi = inm->inm_igi;
1383		KASSERT(igi != NULL, ("%s: no igi for ifp %p", __func__, ifp));
1384
1385		IGMPSTAT_INC(igps_rcv_ourreports);
1386
1387		/*
1388		 * If we are in IGMPv3 host mode, do not allow the
1389		 * other host's IGMPv2 report to suppress our reports
1390		 * unless explicitly configured to do so.
1391		 */
1392		if (igi->igi_version == IGMP_VERSION_3) {
1393			if (V_igmp_legacysupp)
1394				igmp_v3_suppress_group_record(inm);
1395			goto out_locked;
1396		}
1397
1398		inm->inm_timer = 0;
1399
1400		switch (inm->inm_state) {
1401		case IGMP_NOT_MEMBER:
1402		case IGMP_SILENT_MEMBER:
1403		case IGMP_SLEEPING_MEMBER:
1404			break;
1405		case IGMP_REPORTING_MEMBER:
1406		case IGMP_IDLE_MEMBER:
1407		case IGMP_AWAKENING_MEMBER:
1408			CTR3(KTR_IGMPV3,
1409			    "report suppressed for %s on ifp %p(%s)",
1410			    inet_ntoa(igmp->igmp_group), ifp, ifp->if_xname);
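			/* FALLTHROUGH */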
1411		case IGMP_LAZY_MEMBER:
1412			inm->inm_state = IGMP_LAZY_MEMBER;
1413			break;
1414		case IGMP_G_QUERY_PENDING_MEMBER:
1415		case IGMP_SG_QUERY_PENDING_MEMBER:
1416		case IGMP_LEAVING_MEMBER:
1417			break;
1418		}
1419	}
1420
1421out_locked:
1422	IN_MULTI_UNLOCK();
1423
1424	return (0);
1425}
1426
1427void
1428igmp_input(struct mbuf *m, int off)
1429{
1430	int iphlen;
1431	struct ifnet *ifp;
1432	struct igmp *igmp;
1433	struct ip *ip;
1434	int igmplen;
1435	int minlen;
1436	int queryver;
1437
1438	CTR3(KTR_IGMPV3, "%s: called w/mbuf (%p,%d)", __func__, m, off);
1439
1440	ifp = m->m_pkthdr.rcvif;
1441
1442	IGMPSTAT_INC(igps_rcv_total);
1443
1444	ip = mtod(m, struct ip *);
1445	iphlen = off;
1446	igmplen = ip->ip_len;
1447
1448	/*
1449	 * Validate lengths.
1450	 */
1451	if (igmplen < IGMP_MINLEN) {
1452		IGMPSTAT_INC(igps_rcv_tooshort);
1453		m_freem(m);
1454		return;
1455	}
1456
1457	/*
1458	 * Always pullup to the minimum size for v1/v2 or v3
1459	 * to amortize calls to m_pullup().
1460	 */
1461	minlen = iphlen;
1462	if (igmplen >= IGMP_V3_QUERY_MINLEN)
1463		minlen += IGMP_V3_QUERY_MINLEN;
1464	else
1465		minlen += IGMP_MINLEN;
1466	if ((m->m_flags & M_EXT || m->m_len < minlen) &&
1467	    (m = m_pullup(m, minlen)) == NULL) {
1468		IGMPSTAT_INC(igps_rcv_tooshort);
1469		return;
1470	}
1471	ip = mtod(m, struct ip *);
1472
1473	if (ip->ip_ttl != 1) {
1474		IGMPSTAT_INC(igps_rcv_badttl);
1475		m_freem(m);
1476		return;
1477	}
1478
1479	/*
1480	 * Validate checksum.
1481	 */
1482	m->m_data += iphlen;
1483	m->m_len -= iphlen;
1484	igmp = mtod(m, struct igmp *);
1485	if (in_cksum(m, igmplen)) {
1486		IGMPSTAT_INC(igps_rcv_badsum);
1487		m_freem(m);
1488		return;
1489	}
1490	m->m_data -= iphlen;
1491	m->m_len += iphlen;
1492
1493	switch (igmp->igmp_type) {
1494	case IGMP_HOST_MEMBERSHIP_QUERY:
1495		if (igmplen == IGMP_MINLEN) {
1496			if (igmp->igmp_code == 0)
1497				queryver = IGMP_VERSION_1;
1498			else
1499				queryver = IGMP_VERSION_2;
1500		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
1501			queryver = IGMP_VERSION_3;
1502		} else {
1503			IGMPSTAT_INC(igps_rcv_tooshort);
1504			m_freem(m);
1505			return;
1506		}
1507
1508		switch (queryver) {
1509		case IGMP_VERSION_1:
1510			IGMPSTAT_INC(igps_rcv_v1v2_queries);
1511			if (!V_igmp_v1enable)
1512				break;
1513			if (igmp_input_v1_query(ifp, ip, igmp) != 0) {
1514				m_freem(m);
1515				return;
1516			}
1517			break;
1518
1519		case IGMP_VERSION_2:
1520			IGMPSTAT_INC(igps_rcv_v1v2_queries);
1521			if (!V_igmp_v2enable)
1522				break;
1523			if (igmp_input_v2_query(ifp, ip, igmp) != 0) {
1524				m_freem(m);
1525				return;
1526			}
1527			break;
1528
1529		case IGMP_VERSION_3: {
1530				struct igmpv3 *igmpv3;
1531				uint16_t igmpv3len;
1532				uint16_t srclen;
1533				int nsrc;
1534
1535				IGMPSTAT_INC(igps_rcv_v3_queries);
1536				igmpv3 = (struct igmpv3 *)igmp;
1537				/*
1538				 * Validate length based on source count;
				 * srclen is uint16_t and truncates, so
				 * recomputing the product at full width
				 * catches source counts too large to fit.
1539				 */
1540				nsrc = ntohs(igmpv3->igmp_numsrc);
1541				srclen = sizeof(struct in_addr) * nsrc;
1542				if (nsrc * sizeof(in_addr_t) > srclen) {
1543					IGMPSTAT_INC(igps_rcv_tooshort);
					m_freem(m);
1544					return;
1545				}
1546				/*
1547				 * m_pullup() may modify m, so pullup in
1548				 * this scope.
1549				 */
1550				igmpv3len = iphlen + IGMP_V3_QUERY_MINLEN +
1551				    srclen;
1552				if ((m->m_flags & M_EXT ||
1553				     m->m_len < igmpv3len) &&
1554				    (m = m_pullup(m, igmpv3len)) == NULL) {
1555					IGMPSTAT_INC(igps_rcv_tooshort);
1556					return;
1557				}
1558				igmpv3 = (struct igmpv3 *)(mtod(m, uint8_t *)
1559				    + iphlen);
1560				if (igmp_input_v3_query(ifp, ip, igmpv3) != 0) {
1561					m_freem(m);
1562					return;
1563				}
1564			}
1565			break;
1566		}
1567		break;
1568
1569	case IGMP_v1_HOST_MEMBERSHIP_REPORT:
1570		if (!V_igmp_v1enable)
1571			break;
1572		if (igmp_input_v1_report(ifp, ip, igmp) != 0) {
1573			m_freem(m);
1574			return;
1575		}
1576		break;
1577
1578	case IGMP_v2_HOST_MEMBERSHIP_REPORT:
1579		if (!V_igmp_v2enable)
1580			break;
1581		if (!ip_checkrouteralert(m))
1582			IGMPSTAT_INC(igps_rcv_nora);
1583		if (igmp_input_v2_report(ifp, ip, igmp) != 0) {
1584			m_freem(m);
1585			return;
1586		}
1587		break;
1588
1589	case IGMP_v3_HOST_MEMBERSHIP_REPORT:
1590		/*
1591		 * Hosts do not need to process IGMPv3 membership reports,
1592		 * as report suppression is no longer required.
1593		 */
1594		if (!ip_checkrouteralert(m))
1595			IGMPSTAT_INC(igps_rcv_nora);
1596		break;
1597
1598	default:
1599		break;
1600	}
1601
1602	/*
1603	 * Pass all valid IGMP packets up to any process(es) listening on a
1604	 * raw IGMP socket.
1605	 */
1606	rip_input(m, off);
1607}
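
/*
 * Example (userland sketch, illustrative only): a routing daemon can
 * observe the packets passed to rip_input() above by opening a raw
 * IGMP socket:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	...
 *	n = recvfrom(s, buf, sizeof(buf), 0, NULL, NULL);
 */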
1608
1609
1610/*
1611 * Fast timeout handler (global).
1612 * VIMAGE: Timeout handlers are expected to service all vimages.
1613 */
1614void
1615igmp_fasttimo(void)
1616{
1617	VNET_ITERATOR_DECL(vnet_iter);
1618
1619	VNET_LIST_RLOCK();
1620	VNET_FOREACH(vnet_iter) {
1621		CURVNET_SET(vnet_iter);
1622		igmp_fasttimo_vnet();
1623		CURVNET_RESTORE();
1624	}
1625	VNET_LIST_RUNLOCK();
1626}
1627
1628/*
1629 * Fast timeout handler (per-vnet).
1630 * Sends are shuffled off to a netisr to deal with Giant.
1631 *
1632 * VIMAGE: Assume caller has set up our curvnet.
1633 */
1634static void
1635igmp_fasttimo_vnet(void)
1636{
1637	struct ifqueue		 scq;	/* State-change packets */
1638	struct ifqueue		 qrq;	/* Query response packets */
1639	struct ifnet		*ifp;
1640	struct igmp_ifinfo	*igi;
1641	struct ifmultiaddr	*ifma, *tifma;
1642	struct in_multi		*inm;
1643	int			 loop, uri_fasthz;
1644
1645	loop = 0;
1646	uri_fasthz = 0;
1647
1648	/*
1649	 * Quick check to see if any work needs to be done, in order to
1650	 * minimize the overhead of fasttimo processing.
1651	 * SMPng: XXX Unlocked reads.
1652	 */
1653	if (!V_current_state_timers_running &&
1654	    !V_interface_timers_running &&
1655	    !V_state_change_timers_running)
1656		return;
1657
1658	IN_MULTI_LOCK();
1659	IGMP_LOCK();
1660
1661	/*
1662	 * IGMPv3 General Query response timer processing.
1663	 */
1664	if (V_interface_timers_running) {
1665		CTR1(KTR_IGMPV3, "%s: interface timers running", __func__);
1666
1667		V_interface_timers_running = 0;
1668		LIST_FOREACH(igi, &V_igi_head, igi_link) {
1669			if (igi->igi_v3_timer == 0) {
1670				/* Do nothing. */
1671			} else if (--igi->igi_v3_timer == 0) {
1672				igmp_v3_dispatch_general_query(igi);
1673			} else {
1674				V_interface_timers_running = 1;
1675			}
1676		}
1677	}
1678
1679	if (!V_current_state_timers_running &&
1680	    !V_state_change_timers_running)
1681		goto out_locked;
1682
1683	V_current_state_timers_running = 0;
1684	V_state_change_timers_running = 0;
1685
1686	CTR1(KTR_IGMPV3, "%s: state change timers running", __func__);
1687
1688	/*
1689	 * IGMPv1/v2/v3 host report and state-change timer processing.
1690	 * Note: Processing a v3 group timer may remove a node.
1691	 */
1692	LIST_FOREACH(igi, &V_igi_head, igi_link) {
1693		ifp = igi->igi_ifp;
1694
1695		if (igi->igi_version == IGMP_VERSION_3) {
1696			loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
1697			uri_fasthz = IGMP_RANDOM_DELAY(igi->igi_uri *
1698			    PR_FASTHZ);
1699
1700			memset(&qrq, 0, sizeof(struct ifqueue));
1701			IFQ_SET_MAXLEN(&qrq, IGMP_MAX_G_GS_PACKETS);
1702
1703			memset(&scq, 0, sizeof(struct ifqueue));
1704			IFQ_SET_MAXLEN(&scq, IGMP_MAX_STATE_CHANGE_PACKETS);
1705		}
1706
1707		IF_ADDR_LOCK(ifp);
1708		TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link,
1709		    tifma) {
1710			if (ifma->ifma_addr->sa_family != AF_INET ||
1711			    ifma->ifma_protospec == NULL)
1712				continue;
1713			inm = (struct in_multi *)ifma->ifma_protospec;
1714			switch (igi->igi_version) {
1715			case IGMP_VERSION_1:
1716			case IGMP_VERSION_2:
1717				igmp_v1v2_process_group_timer(inm,
1718				    igi->igi_version);
1719				break;
1720			case IGMP_VERSION_3:
1721				igmp_v3_process_group_timers(igi, &qrq,
1722				    &scq, inm, uri_fasthz);
1723				break;
1724			}
1725		}
1726		IF_ADDR_UNLOCK(ifp);
1727
1728		if (igi->igi_version == IGMP_VERSION_3) {
1729			struct in_multi		*tinm;
1730
1731			igmp_dispatch_queue(&qrq, 0, loop);
1732			igmp_dispatch_queue(&scq, 0, loop);
1733
1734			/*
1735			 * Free the in_multi reference(s) for this
1736			 * IGMP lifecycle.
1737			 */
1738			SLIST_FOREACH_SAFE(inm, &igi->igi_relinmhead,
1739			    inm_nrele, tinm) {
1740				SLIST_REMOVE_HEAD(&igi->igi_relinmhead,
1741				    inm_nrele);
1742				inm_release_locked(inm);
1743			}
1744		}
1745	}
1746
1747out_locked:
1748	IGMP_UNLOCK();
1749	IN_MULTI_UNLOCK();
1750}
1751
1752/*
1753 * Update host report group timer for IGMPv1/v2.
1754 * Will update the global pending timer flags.
1755 */
1756static void
1757igmp_v1v2_process_group_timer(struct in_multi *inm, const int version)
1758{
1759	int report_timer_expired;
1760
1761	IN_MULTI_LOCK_ASSERT();
1762	IGMP_LOCK_ASSERT();
1763
1764	if (inm->inm_timer == 0) {
1765		report_timer_expired = 0;
1766	} else if (--inm->inm_timer == 0) {
1767		report_timer_expired = 1;
1768	} else {
1769		V_current_state_timers_running = 1;
1770		return;
1771	}
1772
1773	switch (inm->inm_state) {
1774	case IGMP_NOT_MEMBER:
1775	case IGMP_SILENT_MEMBER:
1776	case IGMP_IDLE_MEMBER:
1777	case IGMP_LAZY_MEMBER:
1778	case IGMP_SLEEPING_MEMBER:
1779	case IGMP_AWAKENING_MEMBER:
1780		break;
1781	case IGMP_REPORTING_MEMBER:
1782		if (report_timer_expired) {
1783			inm->inm_state = IGMP_IDLE_MEMBER;
1784			(void)igmp_v1v2_queue_report(inm,
1785			    (version == IGMP_VERSION_2) ?
1786			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
1787			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
1788		}
1789		break;
1790	case IGMP_G_QUERY_PENDING_MEMBER:
1791	case IGMP_SG_QUERY_PENDING_MEMBER:
1792	case IGMP_LEAVING_MEMBER:
1793		break;
1794	}
1795}
1796
1797/*
1798 * Update a group's timers for IGMPv3.
1799 * Will update the global pending timer flags.
1800 * Note: Unlocked read from igi.
1801 */
1802static void
1803igmp_v3_process_group_timers(struct igmp_ifinfo *igi,
1804    struct ifqueue *qrq, struct ifqueue *scq,
1805    struct in_multi *inm, const int uri_fasthz)
1806{
1807	int query_response_timer_expired;
1808	int state_change_retransmit_timer_expired;
1809
1810	IN_MULTI_LOCK_ASSERT();
1811	IGMP_LOCK_ASSERT();
1812
1813	query_response_timer_expired = 0;
1814	state_change_retransmit_timer_expired = 0;
1815
1816	/*
1817	 * During a transition from v1/v2 compatibility mode back to v3,
1818	 * a group record in REPORTING state may still have its group
1819	 * timer active. This is a no-op in this function; it is easier
1820	 * to deal with it here than to complicate the slow-timeout path.
1821	 */
1822	if (inm->inm_timer == 0) {
1823		query_response_timer_expired = 0;
1824	} else if (--inm->inm_timer == 0) {
1825		query_response_timer_expired = 1;
1826	} else {
1827		V_current_state_timers_running = 1;
1828	}
1829
1830	if (inm->inm_sctimer == 0) {
1831		state_change_retransmit_timer_expired = 0;
1832	} else if (--inm->inm_sctimer == 0) {
1833		state_change_retransmit_timer_expired = 1;
1834	} else {
1835		V_state_change_timers_running = 1;
1836	}
1837
1838	/* We are in fasttimo, so be quick about it. */
1839	if (!state_change_retransmit_timer_expired &&
1840	    !query_response_timer_expired)
1841		return;
1842
1843	switch (inm->inm_state) {
1844	case IGMP_NOT_MEMBER:
1845	case IGMP_SILENT_MEMBER:
1846	case IGMP_SLEEPING_MEMBER:
1847	case IGMP_LAZY_MEMBER:
1848	case IGMP_AWAKENING_MEMBER:
1849	case IGMP_IDLE_MEMBER:
1850		break;
1851	case IGMP_G_QUERY_PENDING_MEMBER:
1852	case IGMP_SG_QUERY_PENDING_MEMBER:
1853		/*
1854		 * Respond to a previously pending Group-Specific
1855		 * or Group-and-Source-Specific query by enqueueing
1856		 * the appropriate Current-State report for
1857		 * immediate transmission.
1858		 */
1859		if (query_response_timer_expired) {
1860			int retval;
1861
1862			retval = igmp_v3_enqueue_group_record(qrq, inm, 0, 1,
1863			    (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER));
1864			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
1865			    __func__, retval);
1866			inm->inm_state = IGMP_REPORTING_MEMBER;
1867			/* XXX Clear recorded sources for next time. */
1868			inm_clear_recorded(inm);
1869		}
1870		/* FALLTHROUGH */
1871	case IGMP_REPORTING_MEMBER:
1872	case IGMP_LEAVING_MEMBER:
1873		if (state_change_retransmit_timer_expired) {
1874			/*
1875			 * State-change retransmission timer fired.
1876			 * If there are any further pending retransmissions,
1877			 * set the global pending state-change flag, and
1878			 * reset the timer.
1879			 */
1880			if (--inm->inm_scrv > 0) {
1881				inm->inm_sctimer = uri_fasthz;
1882				V_state_change_timers_running = 1;
1883			}
1884			/*
1885			 * Retransmit the previously computed state-change
1886			 * report. If there are no further pending
1887			 * retransmissions, the mbuf queue will be consumed.
1888			 * Update T0 state to T1 as we have now sent
1889			 * a state-change.
1890			 */
1891			(void)igmp_v3_merge_state_changes(inm, scq);
1892
1893			inm_commit(inm);
1894			CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
1895			    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
1896
1897			/*
1898			 * If we are leaving the group for good, make sure
1899			 * we release IGMP's reference to it.
1900			 * This release must be deferred using a SLIST,
1901			 * as we are called from a loop which traverses
1902			 * the in_ifmultiaddr TAILQ.
1903			 */
1904			if (inm->inm_state == IGMP_LEAVING_MEMBER &&
1905			    inm->inm_scrv == 0) {
1906				inm->inm_state = IGMP_NOT_MEMBER;
1907				SLIST_INSERT_HEAD(&igi->igi_relinmhead,
1908				    inm, inm_nrele);
1909			}
1910		}
1911		break;
1912	}
1913}
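
/*
 * Illustrative arithmetic for the countdowns above (a sketch, assuming
 * the stock PR_FASTHZ of 5, i.e. one fast-timeout tick every 200 ms):
 * if the caller derived uri_fasthz from a 3 second Unsolicited Report
 * Interval, e.g. uri_fasthz = 3 * PR_FASTHZ = 15, then a group whose
 * inm_sctimer is rearmed above retransmits its pending state-change
 * report roughly every 3 seconds until inm_scrv reaches zero.
 */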
1914
1915
1916/*
1917 * Suppress a group's pending response to a group or source/group query.
1918 *
1919 * Do NOT suppress state changes. This leads to IGMPv3 inconsistency.
1920 * Do NOT update ST1/ST0 as this operation merely suppresses
1921 * the currently pending group record.
1922 * Do NOT suppress the response to a general query. It is possible but
1923 * it would require adding another state or flag.
1924 */
1925static void
1926igmp_v3_suppress_group_record(struct in_multi *inm)
1927{
1928
1929	IN_MULTI_LOCK_ASSERT();
1930
1931	KASSERT(inm->inm_igi->igi_version == IGMP_VERSION_3,
1932		("%s: not IGMPv3 mode on link", __func__));
1933
1934	if (inm->inm_state != IGMP_G_QUERY_PENDING_MEMBER &&
1935	    inm->inm_state != IGMP_SG_QUERY_PENDING_MEMBER)
1936		return;
1937
1938	if (inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
1939		inm_clear_recorded(inm);
1940
1941	inm->inm_timer = 0;
1942	inm->inm_state = IGMP_REPORTING_MEMBER;
1943}
1944
1945/*
1946 * Switch to a different IGMP version on the given interface,
1947 * as per Section 7.2.1.
1948 */
1949static void
1950igmp_set_version(struct igmp_ifinfo *igi, const int version)
1951{
1952	int old_version_timer;
1953
1954	IGMP_LOCK_ASSERT();
1955
1956	CTR4(KTR_IGMPV3, "%s: switching to v%d on ifp %p(%s)", __func__,
1957	    version, igi->igi_ifp, igi->igi_ifp->if_xname);
1958
1959	if (version == IGMP_VERSION_1 || version == IGMP_VERSION_2) {
1960		/*
1961		 * Compute the "Older Version Querier Present" timer as per
1962		 * Section 8.12.
1963		 */
1964		old_version_timer = igi->igi_rv * igi->igi_qi + igi->igi_qri;
1965		old_version_timer *= PR_SLOWHZ;
1966
1967		if (version == IGMP_VERSION_1) {
1968			igi->igi_v1_timer = old_version_timer;
1969			igi->igi_v2_timer = 0;
1970		} else if (version == IGMP_VERSION_2) {
1971			igi->igi_v1_timer = 0;
1972			igi->igi_v2_timer = old_version_timer;
1973		}
1974	}
1975
1976	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
1977		if (igi->igi_version != IGMP_VERSION_2) {
1978			igi->igi_version = IGMP_VERSION_2;
1979			igmp_v3_cancel_link_timers(igi);
1980		}
1981	} else if (igi->igi_v1_timer > 0) {
1982		if (igi->igi_version != IGMP_VERSION_1) {
1983			igi->igi_version = IGMP_VERSION_1;
1984			igmp_v3_cancel_link_timers(igi);
1985		}
1986	}
1987}
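
/*
 * Worked example for the timer computed above (illustrative): with the
 * RFC 3376 defaults of Robustness Variable 2, Query Interval 125 s and
 * Query Response Interval 10 s,
 *
 *	old_version_timer = (2 * 125 + 10) * PR_SLOWHZ = 260 * 2 = 520
 *
 * slow-timeout ticks (PR_SLOWHZ is 2), i.e. the 260 second "Older
 * Version Querier Present" timeout of Section 8.12.
 */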
1988
1989/*
1990 * Cancel pending IGMPv3 timers for the given link and all groups
1991 * joined on it; state-change, general-query, and group-query timers.
1992 *
1993 * Only ever called on a transition from v3 to Compatibility mode. Kill
1994 * the timers stone dead (this may be expensive for large N groups), they
1995 * will be restarted if Compatibility Mode deems that they must be due to
1996 * query processing.
1997 */
1998static void
1999igmp_v3_cancel_link_timers(struct igmp_ifinfo *igi)
2000{
2001	struct ifmultiaddr	*ifma;
2002	struct ifnet		*ifp;
2003	struct in_multi		*inm;
2004
2005	CTR3(KTR_IGMPV3, "%s: cancel v3 timers on ifp %p(%s)", __func__,
2006	    igi->igi_ifp, igi->igi_ifp->if_xname);
2007
2008	IN_MULTI_LOCK_ASSERT();
2009	IGMP_LOCK_ASSERT();
2010
2011	/*
2012	 * Stop the v3 General Query Response on this link stone dead.
2013	 * If fasttimo is woken up due to V_interface_timers_running,
2014	 * the flag will be cleared if there are no pending link timers.
2015	 */
2016	igi->igi_v3_timer = 0;
2017
2018	/*
2019	 * Now clear the current-state and state-change report timers
2020	 * for all memberships scoped to this link.
2021	 */
2022	ifp = igi->igi_ifp;
2023	IF_ADDR_LOCK(ifp);
2024	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2025		if (ifma->ifma_addr->sa_family != AF_INET ||
2026		    ifma->ifma_protospec == NULL)
2027			continue;
2028		inm = (struct in_multi *)ifma->ifma_protospec;
2029		switch (inm->inm_state) {
2030		case IGMP_NOT_MEMBER:
2031		case IGMP_SILENT_MEMBER:
2032		case IGMP_IDLE_MEMBER:
2033		case IGMP_LAZY_MEMBER:
2034		case IGMP_SLEEPING_MEMBER:
2035		case IGMP_AWAKENING_MEMBER:
2036			/*
2037			 * These states are either not relevant in v3 mode,
2038			 * or are unreported. Do nothing.
2039			 */
2040			break;
2041		case IGMP_LEAVING_MEMBER:
2042			/*
2043			 * If we are leaving the group and switching to
2044			 * compatibility mode, we need to release the final
2045			 * reference held for issuing the INCLUDE {}, and
2046			 * transition to REPORTING to ensure the host leave
2047			 * message is sent upstream to the old querier --
2048			 * a NOT_MEMBER transition would lose the leave and race.
2049			 *
2050			 * SMPNG: Must drop and re-acquire IF_ADDR_LOCK
2051			 * around inm_release_locked(), as it is not
2052			 * a recursive mutex.
2053			 */
2054			IF_ADDR_UNLOCK(ifp);
2055			inm_release_locked(inm);
2056			IF_ADDR_LOCK(ifp);
2057			/* FALLTHROUGH */
2058		case IGMP_G_QUERY_PENDING_MEMBER:
2059		case IGMP_SG_QUERY_PENDING_MEMBER:
2060			inm_clear_recorded(inm);
2061			/* FALLTHROUGH */
2062		case IGMP_REPORTING_MEMBER:
2063			inm->inm_state = IGMP_REPORTING_MEMBER;
2064			break;
2065		}
2066		/*
2067		 * Always clear state-change and group report timers.
2068		 * Free any pending IGMPv3 state-change records.
2069		 */
2070		inm->inm_sctimer = 0;
2071		inm->inm_timer = 0;
2072		_IF_DRAIN(&inm->inm_scq);
2073	}
2074	IF_ADDR_UNLOCK(ifp);
2075}
2076
2077/*
2078 * Update the Older Version Querier Present timers for a link.
2079 * See Section 7.2.1 of RFC 3376.
2080 */
2081static void
2082igmp_v1v2_process_querier_timers(struct igmp_ifinfo *igi)
2083{
2084
2085	IGMP_LOCK_ASSERT();
2086
2087	if (igi->igi_v1_timer == 0 && igi->igi_v2_timer == 0) {
2088		/*
2089		 * IGMPv1 and IGMPv2 Querier Present timers expired.
2090		 *
2091		 * Revert to IGMPv3.
2092		 */
2093		if (igi->igi_version != IGMP_VERSION_3) {
2094			CTR5(KTR_IGMPV3,
2095			    "%s: transition from v%d -> v%d on %p(%s)",
2096			    __func__, igi->igi_version, IGMP_VERSION_3,
2097			    igi->igi_ifp, igi->igi_ifp->if_xname);
2098			igi->igi_version = IGMP_VERSION_3;
2099		}
2100	} else if (igi->igi_v1_timer == 0 && igi->igi_v2_timer > 0) {
2101		/*
2102		 * IGMPv1 Querier Present timer expired,
2103		 * IGMPv2 Querier Present timer running.
2104		 * If IGMPv2 was disabled since last timeout,
2105		 * revert to IGMPv3.
2106		 * If IGMPv2 is enabled, revert to IGMPv2.
2107		 */
2108		if (!V_igmp_v2enable) {
2109			CTR5(KTR_IGMPV3,
2110			    "%s: transition from v%d -> v%d on %p(%s)",
2111			    __func__, igi->igi_version, IGMP_VERSION_3,
2112			    igi->igi_ifp, igi->igi_ifp->if_xname);
2113			igi->igi_v2_timer = 0;
2114			igi->igi_version = IGMP_VERSION_3;
2115		} else {
2116			--igi->igi_v2_timer;
2117			if (igi->igi_version != IGMP_VERSION_2) {
2118				CTR5(KTR_IGMPV3,
2119				    "%s: transition from v%d -> v%d on %p(%s)",
2120				    __func__, igi->igi_version, IGMP_VERSION_2,
2121				    igi->igi_ifp, igi->igi_ifp->if_xname);
2122				igi->igi_version = IGMP_VERSION_2;
2123			}
2124		}
2125	} else if (igi->igi_v1_timer > 0) {
2126		/*
2127		 * IGMPv1 Querier Present timer running.
2128		 * Stop IGMPv2 timer if running.
2129		 *
2130		 * If IGMPv1 was disabled since last timeout,
2131		 * revert to IGMPv3.
2132		 * If IGMPv1 is enabled, reset IGMPv2 timer if running.
2133		 */
2134		if (!V_igmp_v1enable) {
2135			CTR5(KTR_IGMPV3,
2136			    "%s: transition from v%d -> v%d on %p(%s)",
2137			    __func__, igi->igi_version, IGMP_VERSION_3,
2138			    igi->igi_ifp, igi->igi_ifp->if_xname);
2139			igi->igi_v1_timer = 0;
2140			igi->igi_version = IGMP_VERSION_3;
2141		} else {
2142			--igi->igi_v1_timer;
2143		}
2144		if (igi->igi_v2_timer > 0) {
2145			CTR3(KTR_IGMPV3,
2146			    "%s: cancel v2 timer on %p(%s)",
2147			    __func__, igi->igi_ifp, igi->igi_ifp->if_xname);
2148			igi->igi_v2_timer = 0;
2149		}
2150	}
2151}
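
/*
 * Summary of the decisions above, where v1/v2 are the per-link Older
 * Version Querier Present timers:
 *
 *	igi_v1_timer	igi_v2_timer	resulting igi_version
 *	     0		     0		IGMP_VERSION_3
 *	     0		    >0		IGMP_VERSION_2, or 3 if v2 was
 *					administratively disabled
 *	    >0		   any		IGMP_VERSION_1, or 3 if v1 was
 *					disabled; a running v2 timer is
 *					cancelled
 */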
2152
2153/*
2154 * Global slowtimo handler.
2155 * VIMAGE: Timeout handlers are expected to service all vimages.
2156 */
2157void
2158igmp_slowtimo(void)
2159{
2160	VNET_ITERATOR_DECL(vnet_iter);
2161
2162	VNET_LIST_RLOCK();
2163	VNET_FOREACH(vnet_iter) {
2164		CURVNET_SET(vnet_iter);
2165		igmp_slowtimo_vnet();
2166		CURVNET_RESTORE();
2167	}
2168	VNET_LIST_RUNLOCK();
2169}
2170
2171/*
2172 * Per-vnet slowtimo handler.
2173 */
2174static void
2175igmp_slowtimo_vnet(void)
2176{
2177	struct igmp_ifinfo *igi;
2178
2179	IGMP_LOCK();
2180
2181	LIST_FOREACH(igi, &V_igi_head, igi_link) {
2182		igmp_v1v2_process_querier_timers(igi);
2183	}
2184
2185	IGMP_UNLOCK();
2186}
2187
2188/*
2189 * Dispatch an IGMPv1/v2 host report or leave message.
2190 * These are always small enough to fit inside a single mbuf.
2191 */
2192static int
2193igmp_v1v2_queue_report(struct in_multi *inm, const int type)
2194{
2195	struct ifnet		*ifp;
2196	struct igmp		*igmp;
2197	struct ip		*ip;
2198	struct mbuf		*m;
2199
2200	IN_MULTI_LOCK_ASSERT();
2201	IGMP_LOCK_ASSERT();
2202
2203	ifp = inm->inm_ifp;
2204
2205	MGETHDR(m, M_DONTWAIT, MT_DATA);
2206	if (m == NULL)
2207		return (ENOMEM);
2208	MH_ALIGN(m, sizeof(struct ip) + sizeof(struct igmp));
2209
2210	m->m_pkthdr.len = sizeof(struct ip) + sizeof(struct igmp);
2211
2212	m->m_data += sizeof(struct ip);
2213	m->m_len = sizeof(struct igmp);
2214
2215	igmp = mtod(m, struct igmp *);
2216	igmp->igmp_type = type;
2217	igmp->igmp_code = 0;
2218	igmp->igmp_group = inm->inm_addr;
2219	igmp->igmp_cksum = 0;
2220	igmp->igmp_cksum = in_cksum(m, sizeof(struct igmp));
2221
2222	m->m_data -= sizeof(struct ip);
2223	m->m_len += sizeof(struct ip);
2224
2225	ip = mtod(m, struct ip *);
2226	ip->ip_tos = 0;
2227	ip->ip_len = sizeof(struct ip) + sizeof(struct igmp);
2228	ip->ip_off = 0;
2229	ip->ip_p = IPPROTO_IGMP;
2230	ip->ip_src.s_addr = INADDR_ANY;
2231
2232	if (type == IGMP_HOST_LEAVE_MESSAGE)
2233		ip->ip_dst.s_addr = htonl(INADDR_ALLRTRS_GROUP);
2234	else
2235		ip->ip_dst = inm->inm_addr;
2236
2237	igmp_save_context(m, ifp);
2238
2239	m->m_flags |= M_IGMPV2;
2240	if (inm->inm_igi->igi_flags & IGIF_LOOPBACK)
2241		m->m_flags |= M_IGMP_LOOP;
2242
2243	CTR2(KTR_IGMPV3, "%s: netisr_dispatch(NETISR_IGMP, %p)", __func__, m);
2244	netisr_dispatch(NETISR_IGMP, m);
2245
2246	return (0);
2247}
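
/*
 * Sketch of the 28-byte datagram built above, for an IGMPv2 report on
 * group 239.1.2.3 (illustrative; ip_output() fills in the remaining IP
 * header fields and adds the Router Alert option separately):
 *
 *	struct ip   (20 bytes)	ip_p = IPPROTO_IGMP, src = INADDR_ANY,
 *				dst = 239.1.2.3
 *	struct igmp ( 8 bytes)	igmp_type  = 0x16 (v2 report),
 *				igmp_code  = 0,
 *				igmp_group = 239.1.2.3,
 *				igmp_cksum = in_cksum() over the 8 bytes
 *
 * A leave for the same group is instead addressed to 224.0.0.2
 * (INADDR_ALLRTRS_GROUP), with igmp_type = IGMP_HOST_LEAVE_MESSAGE.
 */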
2248
2249/*
2250 * Process a state change from the upper layer for the given IPv4 group.
2251 *
2252 * Each socket holds a reference on the in_multi in its own ip_moptions.
2253 * The socket layer will have made the necessary updates to the group
2254 * state; it is now up to IGMP to issue a state change report if there
2255 * has been any change between T0 (when the last state-change was issued)
2256 * and T1 (now).
2257 *
2258 * We use the IGMPv3 state machine at group level. The IGMP module
2259 * however makes the decision as to which IGMP protocol version to speak.
2260 * A state change *from* INCLUDE {} always means an initial join.
2261 * A state change *to* INCLUDE {} always means a final leave.
2262 *
2263 * FUTURE: If IGIF_V3LITE is enabled for this interface, then we can
2264 * save ourselves a bunch of work; any exclusive mode groups need not
2265 * compute source filter lists.
2266 *
2267 * VIMAGE: curvnet should have been set by caller, as this routine
2268 * is called from the socket option handlers.
2269 */
2270int
2271igmp_change_state(struct in_multi *inm)
2272{
2273	struct igmp_ifinfo *igi;
2274	struct ifnet *ifp;
2275	int error;
2276
2277	IN_MULTI_LOCK_ASSERT();
2278
2279	error = 0;
2280
2281	/*
2282	 * Try to detect if the upper layer just asked us to change state
2283	 * for an interface which has now gone away.
2284	 */
2285	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
2286	ifp = inm->inm_ifma->ifma_ifp;
2287	if (ifp != NULL) {
2288		/*
2289		 * Sanity check that netinet's notion of ifp is the
2290		 * same as net's.
2291		 */
2292		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
2293	}
2294
2295	IGMP_LOCK();
2296
2297	igi = ((struct in_ifinfo *)ifp->if_afdata[AF_INET])->ii_igmp;
2298	KASSERT(igi != NULL, ("%s: no igmp_ifinfo for ifp %p", __func__, ifp));
2299
2300	/*
2301	 * If we detect a state transition to or from MCAST_UNDEFINED
2302	 * for this group, then we are starting or finishing an IGMP
2303	 * life cycle for this group.
2304	 */
2305	if (inm->inm_st[1].iss_fmode != inm->inm_st[0].iss_fmode) {
2306		CTR3(KTR_IGMPV3, "%s: inm transition %d -> %d", __func__,
2307		    inm->inm_st[0].iss_fmode, inm->inm_st[1].iss_fmode);
2308		if (inm->inm_st[0].iss_fmode == MCAST_UNDEFINED) {
2309			CTR1(KTR_IGMPV3, "%s: initial join", __func__);
2310			error = igmp_initial_join(inm, igi);
2311			goto out_locked;
2312		} else if (inm->inm_st[1].iss_fmode == MCAST_UNDEFINED) {
2313			CTR1(KTR_IGMPV3, "%s: final leave", __func__);
2314			igmp_final_leave(inm, igi);
2315			goto out_locked;
2316		}
2317	} else {
2318		CTR1(KTR_IGMPV3, "%s: filter set change", __func__);
2319	}
2320
2321	error = igmp_handle_state_change(inm, igi);
2322
2323out_locked:
2324	IGMP_UNLOCK();
2325	return (error);
2326}
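
/*
 * Example of the dispatch above (illustrative): a socket joining
 * 239.1.2.3 in exclusive mode moves the group from T0 mode
 * MCAST_UNDEFINED to T1 mode MCAST_EXCLUDE, so igmp_initial_join()
 * runs; dropping the last membership reverses the transition and takes
 * the igmp_final_leave() path; a source-filter update which leaves the
 * mode unchanged falls through to igmp_handle_state_change().
 */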
2327
2328/*
2329 * Perform the initial join for an IGMP group.
2330 *
2331 * When joining a group:
2332 *  If the group should have its IGMP traffic suppressed, do nothing.
2333 *  IGMPv1 starts sending IGMPv1 host membership reports.
2334 *  IGMPv2 starts sending IGMPv2 host membership reports.
2335 *  IGMPv3 will schedule an IGMPv3 state-change report containing the
2336 *  initial state of the membership.
2337 */
2338static int
2339igmp_initial_join(struct in_multi *inm, struct igmp_ifinfo *igi)
2340{
2341	struct ifnet		*ifp;
2342	struct ifqueue		*ifq;
2343	int			 error, retval, syncstates;
2344
2345	CTR4(KTR_IGMPV3, "%s: initial join %s on ifp %p(%s)",
2346	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2347	    inm->inm_ifp->if_xname);
2348
2349	error = 0;
2350	syncstates = 1;
2351
2352	ifp = inm->inm_ifp;
2353
2354	IN_MULTI_LOCK_ASSERT();
2355	IGMP_LOCK_ASSERT();
2356
2357	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2358
2359	/*
2360	 * Groups joined on loopback or marked as 'not reported',
2361	 * e.g. 224.0.0.1, enter the IGMP_SILENT_MEMBER state and
2362	 * are never reported in any IGMP protocol exchanges.
2363	 * All other groups enter the appropriate IGMP state machine
2364	 * for the version in use on this link.
2365	 * A link marked as IGIF_SILENT causes IGMP to be completely
2366	 * disabled for the link.
2367	 */
2368	if ((ifp->if_flags & IFF_LOOPBACK) ||
2369	    (igi->igi_flags & IGIF_SILENT) ||
2370	    !igmp_isgroupreported(inm->inm_addr)) {
2371		CTR1(KTR_IGMPV3,
2372"%s: not kicking state machine for silent group", __func__);
2373		inm->inm_state = IGMP_SILENT_MEMBER;
2374		inm->inm_timer = 0;
2375	} else {
2376		/*
2377		 * Deal with overlapping in_multi lifecycle.
2378		 * If this group was LEAVING, then make sure
2379		 * we drop the reference we picked up to keep the
2380		 * group around for the final INCLUDE {} enqueue.
2381		 */
2382		if (igi->igi_version == IGMP_VERSION_3 &&
2383		    inm->inm_state == IGMP_LEAVING_MEMBER)
2384			inm_release_locked(inm);
2385
2386		inm->inm_state = IGMP_REPORTING_MEMBER;
2387
2388		switch (igi->igi_version) {
2389		case IGMP_VERSION_1:
2390		case IGMP_VERSION_2:
2391			inm->inm_state = IGMP_IDLE_MEMBER;
2392			error = igmp_v1v2_queue_report(inm,
2393			    (igi->igi_version == IGMP_VERSION_2) ?
2394			     IGMP_v2_HOST_MEMBERSHIP_REPORT :
2395			     IGMP_v1_HOST_MEMBERSHIP_REPORT);
2396			if (error == 0) {
2397				inm->inm_timer = IGMP_RANDOM_DELAY(
2398				    IGMP_V1V2_MAX_RI * PR_FASTHZ);
2399				V_current_state_timers_running = 1;
2400			}
2401			break;
2402
2403		case IGMP_VERSION_3:
2404			/*
2405			 * Defer update of T0 to T1, until the first copy
2406			 * of the state change has been transmitted.
2407			 */
2408			syncstates = 0;
2409
2410			/*
2411			 * Immediately enqueue a State-Change Report for
2412			 * this interface, freeing any previous reports.
2413			 * Don't kick the timers if there is nothing to do,
2414			 * or if an error occurred.
2415			 */
2416			ifq = &inm->inm_scq;
2417			_IF_DRAIN(ifq);
2418			retval = igmp_v3_enqueue_group_record(ifq, inm, 1,
2419			    0, 0);
2420			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
2421			    __func__, retval);
2422			if (retval <= 0) {
2423				error = -retval;
2424				break;
2425			}
2426
2427			/*
2428			 * Schedule transmission of pending state-change
2429			 * report up to RV times for this link. The timer
2430			 * will fire at the next igmp_fasttimo (~200ms),
2431			 * giving us an opportunity to merge the reports.
2432			 */
2433			if (igi->igi_flags & IGIF_LOOPBACK) {
2434				inm->inm_scrv = 1;
2435			} else {
2436				KASSERT(igi->igi_rv > 1,
2437				   ("%s: invalid robustness %d", __func__,
2438				    igi->igi_rv));
2439				inm->inm_scrv = igi->igi_rv;
2440			}
2441			inm->inm_sctimer = 1;
2442			V_state_change_timers_running = 1;
2443
2444			error = 0;
2445			break;
2446		}
2447	}
2448
2449	/*
2450	 * Only update the T0 state if state change is atomic,
2451	 * i.e. we don't need to wait for a timer to fire before we
2452	 * can consider the state change to have been communicated.
2453	 */
2454	if (syncstates) {
2455		inm_commit(inm);
2456		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2457		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2458	}
2459
2460	return (error);
2461}
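
/*
 * Worked example for the v1/v2 arm above (a sketch, assuming the stock
 * IGMP_V1V2_MAX_RI of 10 seconds): IGMP_RANDOM_DELAY(10 * PR_FASTHZ)
 * picks a uniform value in [1, 50] fast-timeout ticks, so the
 * follow-up unsolicited report is sent between 0.2 and 10 seconds
 * after the first, matching the v1/v2 practice of repeating the
 * initial report.
 */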
2462
2463/*
2464 * Issue an intermediate state change during the IGMP life-cycle.
2465 */
2466static int
2467igmp_handle_state_change(struct in_multi *inm, struct igmp_ifinfo *igi)
2468{
2469	struct ifnet		*ifp;
2470	int			 retval;
2471
2472	CTR4(KTR_IGMPV3, "%s: state change for %s on ifp %p(%s)",
2473	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2474	    inm->inm_ifp->if_xname);
2475
2476	ifp = inm->inm_ifp;
2477
2478	IN_MULTI_LOCK_ASSERT();
2479	IGMP_LOCK_ASSERT();
2480
2481	KASSERT(igi && igi->igi_ifp == ifp, ("%s: inconsistent ifp", __func__));
2482
2483	if ((ifp->if_flags & IFF_LOOPBACK) ||
2484	    (igi->igi_flags & IGIF_SILENT) ||
2485	    !igmp_isgroupreported(inm->inm_addr) ||
2486	    (igi->igi_version != IGMP_VERSION_3)) {
2487		if (!igmp_isgroupreported(inm->inm_addr)) {
2488			CTR1(KTR_IGMPV3,
2489"%s: not kicking state machine for silent group", __func__);
2490		}
2491		CTR1(KTR_IGMPV3, "%s: nothing to do", __func__);
2492		inm_commit(inm);
2493		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2494		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2495		return (0);
2496	}
2497
2498	_IF_DRAIN(&inm->inm_scq);
2499
2500	retval = igmp_v3_enqueue_group_record(&inm->inm_scq, inm, 1, 0, 0);
2501	CTR2(KTR_IGMPV3, "%s: enqueue record = %d", __func__, retval);
2502	if (retval <= 0)
2503		return (-retval);
2504
2505	/*
2506	 * If record(s) were enqueued, start the state-change
2507	 * report timer for this group.
2508	 */
2509	inm->inm_scrv = ((igi->igi_flags & IGIF_LOOPBACK) ? 1 : igi->igi_rv);
2510	inm->inm_sctimer = 1;
2511	V_state_change_timers_running = 1;
2512
2513	return (0);
2514}
2515
2516/*
2517 * Perform the final leave for an IGMP group.
2518 *
2519 * When leaving a group:
2520 *  IGMPv1 does nothing.
2521 *  IGMPv2 sends a host leave message, if and only if we are the reporter.
2522 *  IGMPv3 enqueues a state-change report containing a transition
2523 *  to INCLUDE {} for immediate transmission.
2524 */
2525static void
2526igmp_final_leave(struct in_multi *inm, struct igmp_ifinfo *igi)
2527{
2528	int syncstates;
2529
2530	syncstates = 1;
2531
2532	CTR4(KTR_IGMPV3, "%s: final leave %s on ifp %p(%s)",
2533	    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp,
2534	    inm->inm_ifp->if_xname);
2535
2536	IN_MULTI_LOCK_ASSERT();
2537	IGMP_LOCK_ASSERT();
2538
2539	switch (inm->inm_state) {
2540	case IGMP_NOT_MEMBER:
2541	case IGMP_SILENT_MEMBER:
2542	case IGMP_LEAVING_MEMBER:
2543		/* Already leaving or left; do nothing. */
2544		CTR1(KTR_IGMPV3,
2545"%s: not kicking state machine for silent group", __func__);
2546		break;
2547	case IGMP_REPORTING_MEMBER:
2548	case IGMP_IDLE_MEMBER:
2549	case IGMP_G_QUERY_PENDING_MEMBER:
2550	case IGMP_SG_QUERY_PENDING_MEMBER:
2551		if (igi->igi_version == IGMP_VERSION_2) {
2552#ifdef INVARIANTS
2553			if (inm->inm_state == IGMP_G_QUERY_PENDING_MEMBER ||
2554			    inm->inm_state == IGMP_SG_QUERY_PENDING_MEMBER)
2555				panic("%s: IGMPv3 state reached, not IGMPv3 mode",
2556				    __func__);
2557#endif
2558			igmp_v1v2_queue_report(inm, IGMP_HOST_LEAVE_MESSAGE);
2559			inm->inm_state = IGMP_NOT_MEMBER;
2560		} else if (igi->igi_version == IGMP_VERSION_3) {
2561			/*
2562			 * Stop group timer and all pending reports.
2563			 * Immediately enqueue a state-change report
2564			 * TO_IN {} to be sent on the next fast timeout,
2565			 * giving us an opportunity to merge reports.
2566			 */
2567			_IF_DRAIN(&inm->inm_scq);
2568			inm->inm_timer = 0;
2569			if (igi->igi_flags & IGIF_LOOPBACK) {
2570				inm->inm_scrv = 1;
2571			} else {
2572				inm->inm_scrv = igi->igi_rv;
2573			}
2574			CTR4(KTR_IGMPV3, "%s: Leaving %s/%s with %d "
2575			    "pending retransmissions.", __func__,
2576			    inet_ntoa(inm->inm_addr),
2577			    inm->inm_ifp->if_xname, inm->inm_scrv);
2578			if (inm->inm_scrv == 0) {
2579				inm->inm_state = IGMP_NOT_MEMBER;
2580				inm->inm_sctimer = 0;
2581			} else {
2582				int retval;
2583
2584				inm_acquire_locked(inm);
2585
2586				retval = igmp_v3_enqueue_group_record(
2587				    &inm->inm_scq, inm, 1, 0, 0);
2588				KASSERT(retval != 0,
2589				    ("%s: enqueue record = %d", __func__,
2590				     retval));
2591
2592				inm->inm_state = IGMP_LEAVING_MEMBER;
2593				inm->inm_sctimer = 1;
2594				V_state_change_timers_running = 1;
2595				syncstates = 0;
2596			}
2597			break;
2598		}
2599		break;
2600	case IGMP_LAZY_MEMBER:
2601	case IGMP_SLEEPING_MEMBER:
2602	case IGMP_AWAKENING_MEMBER:
2603		/* Our reports are suppressed; do nothing. */
2604		break;
2605	}
2606
2607	if (syncstates) {
2608		inm_commit(inm);
2609		CTR3(KTR_IGMPV3, "%s: T1 -> T0 for %s/%s", __func__,
2610		    inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2611		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
2612		CTR3(KTR_IGMPV3, "%s: T1 now MCAST_UNDEFINED for %s/%s",
2613		    __func__, inet_ntoa(inm->inm_addr), inm->inm_ifp->if_xname);
2614	}
2615}
2616
2617/*
2618 * Enqueue an IGMPv3 group record to the given output queue.
2619 *
2620 * XXX This function could do with having the allocation code
2621 * split out, and the multiple-tree-walks coalesced into a single
2622 * routine as has been done in igmp_v3_enqueue_filter_change().
2623 *
2624 * If is_state_change is zero, a current-state record is appended.
2625 * If is_state_change is non-zero, a state-change report is appended.
2626 *
2627 * If is_group_query is non-zero, an mbuf packet chain is allocated.
2628 * If is_group_query is zero, and if there is a packet with free space
2629 * at the tail of the queue, the record will be appended to it, provided
2630 * there is enough free space.
2631 * Otherwise a new mbuf packet chain is allocated.
2632 *
2633 * If is_source_query is non-zero, each source is checked to see if
2634 * it was recorded for a Group-Source query, and will be omitted if
2635 * it is not both in-mode and recorded.
2636 *
2637 * The function will attempt to allocate leading space in the packet
2638 * for the IP/IGMP header to be prepended without fragmenting the chain.
2639 *
2640 * If successful the size of all data appended to the queue is returned,
2641 * otherwise an error code less than zero is returned, or zero if
2642 * no record(s) were appended.
2643 */
2644static int
2645igmp_v3_enqueue_group_record(struct ifqueue *ifq, struct in_multi *inm,
2646    const int is_state_change, const int is_group_query,
2647    const int is_source_query)
2648{
2649	struct igmp_grouprec	 ig;
2650	struct igmp_grouprec	*pig;
2651	struct ifnet		*ifp;
2652	struct ip_msource	*ims, *nims;
2653	struct mbuf		*m0, *m, *md;
2654	int			 error, is_filter_list_change;
2655	int			 minrec0len, m0srcs, msrcs, nbytes, off;
2656	int			 record_has_sources;
2657	int			 now;
2658	int			 type;
2659	in_addr_t		 naddr;
2660	uint8_t			 mode;
2661
2662	IN_MULTI_LOCK_ASSERT();
2663
2664	error = 0;
2665	ifp = inm->inm_ifp;
2666	is_filter_list_change = 0;
2667	m = NULL;
2668	m0 = NULL;
2669	m0srcs = 0;
2670	msrcs = 0;
2671	nbytes = 0;
2672	nims = NULL;
2673	record_has_sources = 1;
2674	pig = NULL;
2675	type = IGMP_DO_NOTHING;
2676	mode = inm->inm_st[1].iss_fmode;
2677
2678	/*
2679	 * If we did not transition out of ASM mode during t0->t1,
2680	 * and there are no source nodes to process, we can skip
2681	 * the generation of source records.
2682	 */
2683	if (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0 &&
2684	    inm->inm_nsrc == 0)
2685		record_has_sources = 0;
2686
2687	if (is_state_change) {
2688		/*
2689		 * Queue a state change record.
2690		 * If the mode did not change, and there are non-ASM
2691		 * listeners or source filters present,
2692		 * we potentially need to issue two records for the group.
2693		 * If we are transitioning to MCAST_UNDEFINED, we need
2694		 * not send any sources.
2695		 * If there are ASM listeners, and there was no filter
2696		 * mode transition of any kind, do nothing.
2697		 */
2698		if (mode != inm->inm_st[0].iss_fmode) {
2699			if (mode == MCAST_EXCLUDE) {
2700				CTR1(KTR_IGMPV3, "%s: change to EXCLUDE",
2701				    __func__);
2702				type = IGMP_CHANGE_TO_EXCLUDE_MODE;
2703			} else {
2704				CTR1(KTR_IGMPV3, "%s: change to INCLUDE",
2705				    __func__);
2706				type = IGMP_CHANGE_TO_INCLUDE_MODE;
2707				if (mode == MCAST_UNDEFINED)
2708					record_has_sources = 0;
2709			}
2710		} else {
2711			if (record_has_sources) {
2712				is_filter_list_change = 1;
2713			} else {
2714				type = IGMP_DO_NOTHING;
2715			}
2716		}
2717	} else {
2718		/*
2719		 * Queue a current state record.
2720		 */
2721		if (mode == MCAST_EXCLUDE) {
2722			type = IGMP_MODE_IS_EXCLUDE;
2723		} else if (mode == MCAST_INCLUDE) {
2724			type = IGMP_MODE_IS_INCLUDE;
2725			KASSERT(inm->inm_st[1].iss_asm == 0,
2726			    ("%s: inm %p is INCLUDE but ASM count is %d",
2727			     __func__, inm, inm->inm_st[1].iss_asm));
2728		}
2729	}
2730
2731	/*
2732	 * Generate the filter list changes using a separate function.
2733	 */
2734	if (is_filter_list_change)
2735		return (igmp_v3_enqueue_filter_change(ifq, inm));
2736
2737	if (type == IGMP_DO_NOTHING) {
2738		CTR3(KTR_IGMPV3, "%s: nothing to do for %s/%s",
2739		    __func__, inet_ntoa(inm->inm_addr),
2740		    inm->inm_ifp->if_xname);
2741		return (0);
2742	}
2743
2744	/*
2745	 * If any sources are present, we must be able to fit at least
2746	 * one in the trailing space of the tail packet's mbuf,
2747	 * ideally more.
2748	 */
2749	minrec0len = sizeof(struct igmp_grouprec);
2750	if (record_has_sources)
2751		minrec0len += sizeof(in_addr_t);
2752
2753	CTR4(KTR_IGMPV3, "%s: queueing %s for %s/%s", __func__,
2754	    igmp_rec_type_to_str(type), inet_ntoa(inm->inm_addr),
2755	    inm->inm_ifp->if_xname);
2756
2757	/*
2758	 * Check if we have a packet in the tail of the queue for this
2759	 * group into which the first group record for this group will fit.
2760	 * Otherwise allocate a new packet.
2761	 * Always allocate leading space for IP+RA_OPT+IGMP+REPORT.
2762	 * Note: Group records for G/GSR query responses MUST be sent
2763	 * in their own packet.
2764	 */
2765	m0 = ifq->ifq_tail;
2766	if (!is_group_query &&
2767	    m0 != NULL &&
2768	    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <= IGMP_V3_REPORT_MAXRECS) &&
2769	    (m0->m_pkthdr.len + minrec0len) <
2770	     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
2771		m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
2772			    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2773		m = m0;
2774		CTR1(KTR_IGMPV3, "%s: use existing packet", __func__);
2775	} else {
2776		if (_IF_QFULL(ifq)) {
2777			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2778			return (-ENOMEM);
2779		}
2780		m = NULL;
2781		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2782		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2783		if (!is_state_change && !is_group_query) {
2784			m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2785			if (m)
2786				m->m_data += IGMP_LEADINGSPACE;
2787		}
2788		if (m == NULL) {
2789			m = m_gethdr(M_DONTWAIT, MT_DATA);
2790			if (m)
2791				MH_ALIGN(m, IGMP_LEADINGSPACE);
2792		}
2793		if (m == NULL)
2794			return (-ENOMEM);
2795
2796		igmp_save_context(m, ifp);
2797
2798		CTR1(KTR_IGMPV3, "%s: allocated first packet", __func__);
2799	}
2800
2801	/*
2802	 * Append group record.
2803	 * If we have sources, we don't know how many yet.
2804	 */
2805	ig.ig_type = type;
2806	ig.ig_datalen = 0;
2807	ig.ig_numsrc = 0;
2808	ig.ig_group = inm->inm_addr;
2809	if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2810		if (m != m0)
2811			m_freem(m);
2812		CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2813		return (-ENOMEM);
2814	}
2815	nbytes += sizeof(struct igmp_grouprec);
2816
2817	/*
2818	 * Append as many sources as will fit in the first packet.
2819	 * If we are appending to a new packet, the chain allocation
2820	 * may potentially use clusters; use m_getptr() in this case.
2821	 * If we are appending to an existing packet, we need to obtain
2822	 * a pointer to the group record after m_append(), in case a new
2823	 * mbuf was allocated.
2824	 * Only append sources which are in-mode at t1. If we are
2825	 * transitioning to MCAST_UNDEFINED state on the group, do not
2826	 * include source entries.
2827	 * Only report recorded sources in our filter set when responding
2828	 * to a group-source query.
2829	 */
2830	if (record_has_sources) {
2831		if (m == m0) {
2832			md = m_last(m);
2833			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2834			    md->m_len - nbytes);
2835		} else {
2836			md = m_getptr(m, 0, &off);
2837			pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) +
2838			    off);
2839		}
2840		msrcs = 0;
2841		RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, nims) {
2842			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2843			    inet_ntoa_haddr(ims->ims_haddr));
2844			now = ims_get_mode(inm, ims, 1);
2845			CTR2(KTR_IGMPV3, "%s: node is %d", __func__, now);
2846			if ((now != mode) ||
2847			    (now == mode && mode == MCAST_UNDEFINED)) {
2848				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2849				continue;
2850			}
2851			if (is_source_query && ims->ims_stp == 0) {
2852				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2853				    __func__);
2854				continue;
2855			}
2856			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2857			naddr = htonl(ims->ims_haddr);
2858			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2859				if (m != m0)
2860					m_freem(m);
2861				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2862				    __func__);
2863				return (-ENOMEM);
2864			}
2865			nbytes += sizeof(in_addr_t);
2866			++msrcs;
2867			if (msrcs == m0srcs)
2868				break;
2869		}
2870		CTR2(KTR_IGMPV3, "%s: msrcs is %d this packet", __func__,
2871		    msrcs);
2872		pig->ig_numsrc = htons(msrcs);
2873		nbytes += (msrcs * sizeof(in_addr_t));
2874	}
2875
2876	if (is_source_query && msrcs == 0) {
2877		CTR1(KTR_IGMPV3, "%s: no recorded sources to report", __func__);
2878		if (m != m0)
2879			m_freem(m);
2880		return (0);
2881	}
2882
2883	/*
2884	 * We are good to go with first packet.
2885	 */
2886	if (m != m0) {
2887		CTR1(KTR_IGMPV3, "%s: enqueueing first packet", __func__);
2888		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2889		_IF_ENQUEUE(ifq, m);
2890	} else
2891		m->m_pkthdr.PH_vt.vt_nrecs++;
2892
2893	/*
2894	 * No further work needed if no source list in packet(s).
2895	 */
2896	if (!record_has_sources)
2897		return (nbytes);
2898
2899	/*
2900	 * Whilst sources remain to be announced, we need to allocate
2901	 * a new packet and fill out as many sources as will fit.
2902	 * Always try for a cluster first.
2903	 */
2904	while (nims != NULL) {
2905		if (_IF_QFULL(ifq)) {
2906			CTR1(KTR_IGMPV3, "%s: outbound queue full", __func__);
2907			return (-ENOMEM);
2908		}
2909		m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
2910		if (m)
2911			m->m_data += IGMP_LEADINGSPACE;
2912		if (m == NULL) {
2913			m = m_gethdr(M_DONTWAIT, MT_DATA);
2914			if (m)
2915				MH_ALIGN(m, IGMP_LEADINGSPACE);
2916		}
2917		if (m == NULL)
2918			return (-ENOMEM);
2919		igmp_save_context(m, ifp);
2920		md = m_getptr(m, 0, &off);
2921		pig = (struct igmp_grouprec *)(mtod(md, uint8_t *) + off);
2922		CTR1(KTR_IGMPV3, "%s: allocated next packet", __func__);
2923
2924		if (!m_append(m, sizeof(struct igmp_grouprec), (void *)&ig)) {
2925			if (m != m0)
2926				m_freem(m);
2927			CTR1(KTR_IGMPV3, "%s: m_append() failed.", __func__);
2928			return (-ENOMEM);
2929		}
2930		m->m_pkthdr.PH_vt.vt_nrecs = 1;
2931		nbytes += sizeof(struct igmp_grouprec);
2932
2933		m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
2934		    sizeof(struct igmp_grouprec)) / sizeof(in_addr_t);
2935
2936		msrcs = 0;
2937		RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
2938			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2939			    inet_ntoa_haddr(ims->ims_haddr));
2940			now = ims_get_mode(inm, ims, 1);
2941			if ((now != mode) ||
2942			    (now == mode && mode == MCAST_UNDEFINED)) {
2943				CTR1(KTR_IGMPV3, "%s: skip node", __func__);
2944				continue;
2945			}
2946			if (is_source_query && ims->ims_stp == 0) {
2947				CTR1(KTR_IGMPV3, "%s: skip unrecorded node",
2948				    __func__);
2949				continue;
2950			}
2951			CTR1(KTR_IGMPV3, "%s: append node", __func__);
2952			naddr = htonl(ims->ims_haddr);
2953			if (!m_append(m, sizeof(in_addr_t), (void *)&naddr)) {
2954				if (m != m0)
2955					m_freem(m);
2956				CTR1(KTR_IGMPV3, "%s: m_append() failed.",
2957				    __func__);
2958				return (-ENOMEM);
2959			}
2960			++msrcs;
2961			if (msrcs == m0srcs)
2962				break;
2963		}
2964		pig->ig_numsrc = htons(msrcs);
2965		nbytes += (msrcs * sizeof(in_addr_t));
2966
2967		CTR1(KTR_IGMPV3, "%s: enqueueing next packet", __func__);
2968		_IF_ENQUEUE(ifq, m);
2969	}
2970
2971	return (nbytes);
2972}
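
/*
 * Worked example for the m0srcs computations above (a sketch, assuming
 * IGMP_LEADINGSPACE covers the IP, Router Alert and report headers,
 * 20 + 4 + 8 = 32 bytes): on a 1500-byte MTU link a fresh packet holds
 *
 *	m0srcs = (1500 - 32 - 8) / 4 = 365
 *
 * source addresses in its first 8-byte group record; any remaining
 * sources spill into the additional packets allocated by the loop.
 */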
2973
2974/*
2975 * Type used to mark record pass completion.
2976 * We exploit the fact we can cast to this easily from the
2977 * current filter modes on each ip_msource node.
2978 */
2979typedef enum {
2980	REC_NONE = 0x00,	/* MCAST_UNDEFINED */
2981	REC_ALLOW = 0x01,	/* MCAST_INCLUDE */
2982	REC_BLOCK = 0x02,	/* MCAST_EXCLUDE */
2983	REC_FULL = REC_ALLOW | REC_BLOCK
2984} rectype_t;
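
/*
 * Illustrative mapping (assuming the usual MCAST_* values of 0, 1 and
 * 2 noted above): a source whose t1 mode is MCAST_INCLUDE casts to
 * REC_ALLOW and MCAST_EXCLUDE to REC_BLOCK, while a node that became
 * MCAST_UNDEFINED at t1 is folded onto the inverse of the group mode:
 *
 *	nrt = (rectype_t)(~mode & REC_FULL);
 *
 * e.g. on an EXCLUDE (2) group, ~2 & 3 == 1 == REC_ALLOW, so a source
 * removed from the exclusion list is reported in ALLOW_NEW_SOURCES.
 */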
2985
2986/*
2987 * Enqueue an IGMPv3 filter list change to the given output queue.
2988 *
2989 * Source list filter state is held in an RB-tree. When the filter list
2990 * for a group is changed without changing its mode, we need to compute
2991 * the deltas between T0 and T1 for each source in the filter set,
2992 * and enqueue the appropriate ALLOW_NEW/BLOCK_OLD records.
2993 *
2994 * As we may potentially queue two record types, and the entire R-B tree
2995 * needs to be walked at once, we break this out into its own function
2996 * so we can generate a tightly packed queue of packets.
2997 *
2998 * XXX This could be written to only use one tree walk, although that makes
2999 * serializing into the mbuf chains a bit harder. For now we do two walks
3000 * which makes things easier on us, and it may or may not be harder on
3001 * the L2 cache.
3002 *
3003 * If successful the size of all data appended to the queue is returned,
3004 * otherwise an error code less than zero is returned, or zero if
3005 * no record(s) were appended.
3006 */
3007static int
3008igmp_v3_enqueue_filter_change(struct ifqueue *ifq, struct in_multi *inm)
3009{
3010	static const int MINRECLEN =
3011	    sizeof(struct igmp_grouprec) + sizeof(in_addr_t);
3012	struct ifnet		*ifp;
3013	struct igmp_grouprec	 ig;
3014	struct igmp_grouprec	*pig;
3015	struct ip_msource	*ims, *nims;
3016	struct mbuf		*m, *m0, *md;
3017	in_addr_t		 naddr;
3018	int			 m0srcs, nbytes, npbytes, off, rsrcs, schanged;
3019	int			 nallow, nblock;
3020	uint8_t			 mode, now, then;
3021	rectype_t		 crt, drt, nrt;
3022
3023	IN_MULTI_LOCK_ASSERT();
3024
3025	if (inm->inm_nsrc == 0 ||
3026	    (inm->inm_st[0].iss_asm > 0 && inm->inm_st[1].iss_asm > 0))
3027		return (0);
3028
3029	ifp = inm->inm_ifp;			/* interface */
3030	mode = inm->inm_st[1].iss_fmode;	/* filter mode at t1 */
3031	crt = REC_NONE;	/* current group record type */
3032	drt = REC_NONE;	/* mask of completed group record types */
3033	nrt = REC_NONE;	/* record type for current node */
3034	m0srcs = 0;	/* # source which will fit in current mbuf chain */
3035	nbytes = 0;	/* # of bytes appended to group's state-change queue */
3036	npbytes = 0;	/* # of bytes appended this packet */
3037	rsrcs = 0;	/* # sources encoded in current record */
3038	schanged = 0;	/* # nodes encoded in overall filter change */
3039	nallow = 0;	/* # of source entries in ALLOW_NEW */
3040	nblock = 0;	/* # of source entries in BLOCK_OLD */
3041	nims = NULL;	/* next tree node pointer */
3042
3043	/*
3044	 * For each possible filter record mode.
3045	 * The first kind of source we encounter tells us which
3046	 * is the first kind of record we start appending.
3047	 * If a node transitioned to UNDEFINED at t1, its mode is treated
3048	 * as the inverse of the group's filter mode.
3049	 */
3050	while (drt != REC_FULL) {
3051		do {
3052			m0 = ifq->ifq_tail;
3053			if (m0 != NULL &&
3054			    (m0->m_pkthdr.PH_vt.vt_nrecs + 1 <=
3055			     IGMP_V3_REPORT_MAXRECS) &&
3056			    (m0->m_pkthdr.len + MINRECLEN) <
3057			     (ifp->if_mtu - IGMP_LEADINGSPACE)) {
3058				m = m0;
3059				m0srcs = (ifp->if_mtu - m0->m_pkthdr.len -
3060					    sizeof(struct igmp_grouprec)) /
3061				    sizeof(in_addr_t);
3062				CTR1(KTR_IGMPV3,
3063				    "%s: use previous packet", __func__);
3064			} else {
3065				m = m_getcl(M_DONTWAIT, MT_DATA, M_PKTHDR);
3066				if (m)
3067					m->m_data += IGMP_LEADINGSPACE;
3068				if (m == NULL) {
3069					m = m_gethdr(M_DONTWAIT, MT_DATA);
3070					if (m)
3071						MH_ALIGN(m, IGMP_LEADINGSPACE);
3072				}
3073				if (m == NULL) {
3074					CTR1(KTR_IGMPV3,
3075					    "%s: m_get*() failed", __func__);
3076					return (-ENOMEM);
3077				}
3078				m->m_pkthdr.PH_vt.vt_nrecs = 0;
3079				igmp_save_context(m, ifp);
3080				m0srcs = (ifp->if_mtu - IGMP_LEADINGSPACE -
3081				    sizeof(struct igmp_grouprec)) /
3082				    sizeof(in_addr_t);
3083				npbytes = 0;
3084				CTR1(KTR_IGMPV3,
3085				    "%s: allocated new packet", __func__);
3086			}
3087			/*
3088			 * Append the IGMP group record header to the
3089			 * current packet's data area.
3090			 * Recalculate pointer to free space for next
3091			 * group record, in case m_append() allocated
3092			 * a new mbuf or cluster.
3093			 */
3094			memset(&ig, 0, sizeof(ig));
3095			ig.ig_group = inm->inm_addr;
3096			if (!m_append(m, sizeof(ig), (void *)&ig)) {
3097				if (m != m0)
3098					m_freem(m);
3099				CTR1(KTR_IGMPV3,
3100				    "%s: m_append() failed", __func__);
3101				return (-ENOMEM);
3102			}
3103			npbytes += sizeof(struct igmp_grouprec);
3104			if (m != m0) {
3105				/* new packet; offset in chain */
3106				md = m_getptr(m, npbytes -
3107				    sizeof(struct igmp_grouprec), &off);
3108				pig = (struct igmp_grouprec *)(mtod(md,
3109				    uint8_t *) + off);
3110			} else {
3111				/* current packet; offset from last append */
3112				md = m_last(m);
3113				pig = (struct igmp_grouprec *)(mtod(md,
3114				    uint8_t *) + md->m_len -
3115				    sizeof(struct igmp_grouprec));
3116			}
3117			/*
3118			 * Begin walking the tree for this record type
3119			 * pass, or continue from where we left off
3120			 * previously if we had to allocate a new packet.
3121			 * Only report deltas in-mode at t1.
3122			 * We need not report included sources as allowed
3123			 * if we are in inclusive mode on the group,
3124			 * however the converse is not true.
3125			 */
3126			rsrcs = 0;
3127			if (nims == NULL)
3128				nims = RB_MIN(ip_msource_tree, &inm->inm_srcs);
3129			RB_FOREACH_FROM(ims, ip_msource_tree, nims) {
3130				CTR2(KTR_IGMPV3, "%s: visit node %s",
3131				    __func__, inet_ntoa_haddr(ims->ims_haddr));
3132				now = ims_get_mode(inm, ims, 1);
3133				then = ims_get_mode(inm, ims, 0);
3134				CTR3(KTR_IGMPV3, "%s: mode: t0 %d, t1 %d",
3135				    __func__, then, now);
3136				if (now == then) {
3137					CTR1(KTR_IGMPV3,
3138					    "%s: skip unchanged", __func__);
3139					continue;
3140				}
3141				if (mode == MCAST_EXCLUDE &&
3142				    now == MCAST_INCLUDE) {
3143					CTR1(KTR_IGMPV3,
3144					    "%s: skip IN src on EX group",
3145					    __func__);
3146					continue;
3147				}
3148				nrt = (rectype_t)now;
3149				if (nrt == REC_NONE)
3150					nrt = (rectype_t)(~mode & REC_FULL);
3151				if (schanged++ == 0) {
3152					crt = nrt;
3153				} else if (crt != nrt)
3154					continue;
3155				naddr = htonl(ims->ims_haddr);
3156				if (!m_append(m, sizeof(in_addr_t),
3157				    (void *)&naddr)) {
3158					if (m != m0)
3159						m_freem(m);
3160					CTR1(KTR_IGMPV3,
3161					    "%s: m_append() failed", __func__);
3162					return (-ENOMEM);
3163				}
3164				nallow += !!(crt == REC_ALLOW);
3165				nblock += !!(crt == REC_BLOCK);
3166				if (++rsrcs == m0srcs)
3167					break;
3168			}
3169			/*
3170			 * If we did not append any tree nodes on this
3171			 * pass, back out of allocations.
3172			 */
3173			if (rsrcs == 0) {
3174				npbytes -= sizeof(struct igmp_grouprec);
3175				if (m != m0) {
3176					CTR1(KTR_IGMPV3,
3177					    "%s: m_free(m)", __func__);
3178					m_freem(m);
3179				} else {
3180					CTR1(KTR_IGMPV3,
3181					    "%s: m_adj(m, -ig)", __func__);
3182					m_adj(m, -((int)sizeof(
3183					    struct igmp_grouprec)));
3184				}
3185				continue;
3186			}
3187			npbytes += (rsrcs * sizeof(in_addr_t));
3188			if (crt == REC_ALLOW)
3189				pig->ig_type = IGMP_ALLOW_NEW_SOURCES;
3190			else if (crt == REC_BLOCK)
3191				pig->ig_type = IGMP_BLOCK_OLD_SOURCES;
3192			pig->ig_numsrc = htons(rsrcs);
3193			/*
3194			 * Count the new group record, and enqueue this
3195			 * packet if it wasn't already queued.
3196			 */
3197			m->m_pkthdr.PH_vt.vt_nrecs++;
3198			if (m != m0)
3199				_IF_ENQUEUE(ifq, m);
3200			nbytes += npbytes;
3201		} while (nims != NULL);
3202		drt |= crt;
3203		crt = (~crt & REC_FULL);
3204	}
3205
3206	CTR3(KTR_IGMPV3, "%s: queued %d ALLOW_NEW, %d BLOCK_OLD", __func__,
3207	    nallow, nblock);
3208
3209	return (nbytes);
3210}
3211
3212static int
3213igmp_v3_merge_state_changes(struct in_multi *inm, struct ifqueue *ifscq)
3214{
3215	struct ifqueue	*gq;
3216	struct mbuf	*m;		/* pending state-change */
3217	struct mbuf	*m0;		/* copy of pending state-change */
3218	struct mbuf	*mt;		/* last state-change in packet */
3219	int		 docopy, domerge;
3220	u_int		 recslen;
3221
3222	docopy = 0;
3223	domerge = 0;
3224	recslen = 0;
3225
3226	IN_MULTI_LOCK_ASSERT();
3227	IGMP_LOCK_ASSERT();
3228
3229	/*
3230	 * If there are further pending retransmissions, make a writable
3231	 * copy of each queued state-change message before merging.
3232	 */
3233	if (inm->inm_scrv > 0)
3234		docopy = 1;
3235
3236	gq = &inm->inm_scq;
3237#ifdef KTR
3238	if (gq->ifq_head == NULL) {
3239		CTR2(KTR_IGMPV3, "%s: WARNING: queue for inm %p is empty",
3240		    __func__, inm);
3241	}
3242#endif
3243
3244	m = gq->ifq_head;
3245	while (m != NULL) {
3246		/*
3247		 * Only merge the report into the current packet if
3248		 * there is sufficient space to do so; an IGMPv3 report
3249		 * packet may only contain 65,535 group records.
3250		 * Always use a simple mbuf chain concatenation to do this,
3251		 * as large state changes for single groups may have
3252		 * allocated clusters.
3253		 */
3254		domerge = 0;
3255		mt = ifscq->ifq_tail;
3256		if (mt != NULL) {
3257			recslen = m_length(m, NULL);
3258
3259			if ((mt->m_pkthdr.PH_vt.vt_nrecs +
3260			    m->m_pkthdr.PH_vt.vt_nrecs <=
3261			    IGMP_V3_REPORT_MAXRECS) &&
3262			    (mt->m_pkthdr.len + recslen <=
3263			    (inm->inm_ifp->if_mtu - IGMP_LEADINGSPACE)))
3264				domerge = 1;
3265		}
3266
3267		if (!domerge && _IF_QFULL(gq)) {
3268			CTR2(KTR_IGMPV3,
3269			    "%s: outbound queue full, skipping whole packet %p",
3270			    __func__, m);
3271			mt = m->m_nextpkt;
3272			if (!docopy)
3273				m_freem(m);
3274			m = mt;
3275			continue;
3276		}
3277
3278		if (!docopy) {
3279			CTR2(KTR_IGMPV3, "%s: dequeueing %p", __func__, m);
3280			_IF_DEQUEUE(gq, m0);
3281			m = m0->m_nextpkt;
3282		} else {
3283			CTR2(KTR_IGMPV3, "%s: copying %p", __func__, m);
3284			m0 = m_dup(m, M_NOWAIT);
3285			if (m0 == NULL)
3286				return (ENOMEM);
3287			m0->m_nextpkt = NULL;
3288			m = m->m_nextpkt;
3289		}
3290
3291		if (!domerge) {
3292			CTR3(KTR_IGMPV3, "%s: queueing %p to ifscq %p",
3293			    __func__, m0, ifscq);
3294			_IF_ENQUEUE(ifscq, m0);
3295		} else {
3296			struct mbuf *mtl;	/* last mbuf of packet mt */
3297
3298			CTR3(KTR_IGMPV3, "%s: merging %p with ifscq tail %p",
3299			    __func__, m0, mt);
3300
3301			mtl = m_last(mt);
3302			m0->m_flags &= ~M_PKTHDR;
3303			mt->m_pkthdr.len += recslen;
3304			mt->m_pkthdr.PH_vt.vt_nrecs +=
3305			    m0->m_pkthdr.PH_vt.vt_nrecs;
3306
3307			mtl->m_next = m0;
3308		}
3309	}
3310
3311	return (0);
3312}
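
/*
 * Illustrative instance of the merge test above (assuming an
 * IGMP_LEADINGSPACE of 32 bytes): on a 1500-byte MTU link, a pending
 * chain of 3 records totalling 200 bytes may be concatenated onto an
 * ifscq tail holding 10 records and 1000 bytes, since 13 records is
 * within IGMP_V3_REPORT_MAXRECS and 1200 <= 1500 - 32; a chain that
 * would exceed either bound is enqueued as a packet of its own.
 */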
3313
3314/*
3315 * Respond to a pending IGMPv3 General Query.
3316 */
3317static void
3318igmp_v3_dispatch_general_query(struct igmp_ifinfo *igi)
3319{
3320	struct ifmultiaddr	*ifma, *tifma;
3321	struct ifnet		*ifp;
3322	struct in_multi		*inm;
3323	int			 retval, loop;
3324
3325	IN_MULTI_LOCK_ASSERT();
3326	IGMP_LOCK_ASSERT();
3327
3328	KASSERT(igi->igi_version == IGMP_VERSION_3,
3329	    ("%s: called when version %d", __func__, igi->igi_version));
3330
3331	ifp = igi->igi_ifp;
3332
3333	IF_ADDR_LOCK(ifp);
3334	TAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, tifma) {
3335		if (ifma->ifma_addr->sa_family != AF_INET ||
3336		    ifma->ifma_protospec == NULL)
3337			continue;
3338
3339		inm = (struct in_multi *)ifma->ifma_protospec;
3340		KASSERT(ifp == inm->inm_ifp,
3341		    ("%s: inconsistent ifp", __func__));
3342
3343		switch (inm->inm_state) {
3344		case IGMP_NOT_MEMBER:
3345		case IGMP_SILENT_MEMBER:
3346			break;
3347		case IGMP_REPORTING_MEMBER:
3348		case IGMP_IDLE_MEMBER:
3349		case IGMP_LAZY_MEMBER:
3350		case IGMP_SLEEPING_MEMBER:
3351		case IGMP_AWAKENING_MEMBER:
3352			inm->inm_state = IGMP_REPORTING_MEMBER;
3353			retval = igmp_v3_enqueue_group_record(&igi->igi_gq,
3354			    inm, 0, 0, 0);
3355			CTR2(KTR_IGMPV3, "%s: enqueue record = %d",
3356			    __func__, retval);
3357			break;
3358		case IGMP_G_QUERY_PENDING_MEMBER:
3359		case IGMP_SG_QUERY_PENDING_MEMBER:
3360		case IGMP_LEAVING_MEMBER:
3361			break;
3362		}
3363	}
3364	IF_ADDR_UNLOCK(ifp);
3365
3366	loop = (igi->igi_flags & IGIF_LOOPBACK) ? 1 : 0;
3367	igmp_dispatch_queue(&igi->igi_gq, IGMP_MAX_RESPONSE_BURST, loop);
3368
3369	/*
3370	 * Slew transmission of bursts over 500ms intervals.
3371	 */
3372	if (igi->igi_gq.ifq_head != NULL) {
3373		igi->igi_v3_timer = 1 + IGMP_RANDOM_DELAY(
3374		    IGMP_RESPONSE_BURST_INTERVAL);
3375		V_interface_timers_running = 1;
3376	}
3377}
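
/*
 * Illustrative timing (a sketch; the exact burst constants live in
 * igmp_var.h): if more packets remain queued than one burst may carry,
 * igi_v3_timer is rearmed above to 1 + IGMP_RANDOM_DELAY(...) fast
 * ticks, so with 200 ms ticks successive bursts drain the general
 * query response queue roughly every half second.
 */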
3378
3379/*
3380 * Transmit the next pending IGMP message in the output queue.
3381 *
3382 * We get called from netisr_processqueue(). A mutex private to igmpoq
3383 * will be acquired and released around this routine.
3384 *
3385 * VIMAGE: Needs to store/restore vnet pointer on a per-mbuf-chain basis.
3386 * MRT: Nothing needs to be done, as IGMP traffic is always local to
3387 * a link and uses a link-scope multicast address.
3388 */
3389static void
3390igmp_intr(struct mbuf *m)
3391{
3392	struct ip_moptions	 imo;
3393	struct ifnet		*ifp;
3394	struct mbuf		*ipopts, *m0;
3395	int			 error;
3396	uint32_t		 ifindex;
3397
3398	CTR2(KTR_IGMPV3, "%s: transmit %p", __func__, m);
3399
3400	/*
3401	 * Set VNET image pointer from enqueued mbuf chain
3402	 * before doing anything else. Whilst we use interface
3403	 * indexes to guard against interface detach, they are
3404	 * unique to each VIMAGE and must be retrieved.
3405	 */
3406	CURVNET_SET((struct vnet *)(m->m_pkthdr.header));
3407	ifindex = igmp_restore_context(m);
3408
3409	/*
3410	 * Check if the ifnet still exists. This limits the scope of
3411	 * any race in the absence of a global ifp lock for low cost
3412	 * (an array lookup).
3413	 */
3414	ifp = ifnet_byindex(ifindex);
3415	if (ifp == NULL) {
3416		CTR3(KTR_IGMPV3, "%s: dropped %p as ifindex %u went away.",
3417		    __func__, m, ifindex);
3418		m_freem(m);
3419		IPSTAT_INC(ips_noroute);
3420		goto out;
3421	}
3422
3423	ipopts = V_igmp_sendra ? m_raopt : NULL;
3424
3425	imo.imo_multicast_ttl  = 1;
3426	imo.imo_multicast_vif  = -1;
3427	imo.imo_multicast_loop = (V_ip_mrouter != NULL);
3428
3429	/*
3430	 * If the user requested that IGMP traffic be explicitly
3431	 * redirected to the loopback interface (e.g. they are running a
3432	 * MANET interface and the routing protocol needs to see the
3433	 * updates), handle this now.
3434	 */
3435	if (m->m_flags & M_IGMP_LOOP)
3436		imo.imo_multicast_ifp = V_loif;
3437	else
3438		imo.imo_multicast_ifp = ifp;
3439
3440	if (m->m_flags & M_IGMPV2) {
3441		m0 = m;
3442	} else {
3443		m0 = igmp_v3_encap_report(ifp, m);
3444		if (m0 == NULL) {
3445			CTR2(KTR_IGMPV3, "%s: dropped %p", __func__, m);
3446			m_freem(m);
3447			IPSTAT_INC(ips_odropped);
3448			goto out;
3449		}
3450	}
3451
3452	igmp_scrub_context(m0);
3453	m0->m_flags &= ~(M_PROTOFLAGS);	/* scrub flags on the chain head */
3454	m0->m_pkthdr.rcvif = V_loif;
3455#ifdef MAC
3456	mac_netinet_igmp_send(ifp, m0);
3457#endif
3458	error = ip_output(m0, ipopts, NULL, 0, &imo, NULL);
3459	if (error) {
3460		CTR3(KTR_IGMPV3, "%s: ip_output(%p) = %d", __func__, m0, error);
3461		goto out;
3462	}
3463
3464	IGMPSTAT_INC(igps_snd_reports);
3465
3466out:
3467	/*
3468	 * We must restore the existing vnet pointer before
3469	 * continuing as we are run from netisr context.
3470	 */
3471	CURVNET_RESTORE();
3472}
3473
3474/*
3475 * Encapsulate an IGMPv3 report.
3476 *
3477 * The internal mbuf flag M_IGMPV3_HDR is used to indicate that the mbuf
3478 * chain has already had its IP/IGMPv3 header prepended. In this case
3479 * the function will not attempt to prepend; the lengths and checksums
3480 * will however be re-computed.
3481 *
3482 * Returns a pointer to the new mbuf chain head, or NULL if the
3483 * allocation failed.
3484 */
3485static struct mbuf *
3486igmp_v3_encap_report(struct ifnet *ifp, struct mbuf *m)
3487{
3488	struct igmp_report	*igmp;
3489	struct ip		*ip;
3490	int			 hdrlen, igmpreclen;
3491
3492	KASSERT((m->m_flags & M_PKTHDR),
3493	    ("%s: mbuf chain %p is !M_PKTHDR", __func__, m));
3494
3495	igmpreclen = m_length(m, NULL);
3496	hdrlen = sizeof(struct ip) + sizeof(struct igmp_report);
3497
3498	if (m->m_flags & M_IGMPV3_HDR) {
3499		igmpreclen -= hdrlen;
3500	} else {
3501		M_PREPEND(m, hdrlen, M_DONTWAIT);
3502		if (m == NULL)
3503			return (NULL);
3504		m->m_flags |= M_IGMPV3_HDR;
3505	}
3506
3507	CTR2(KTR_IGMPV3, "%s: igmpreclen is %d", __func__, igmpreclen);
3508
3509	m->m_data += sizeof(struct ip);
3510	m->m_len -= sizeof(struct ip);
3511
3512	igmp = mtod(m, struct igmp_report *);
3513	igmp->ir_type = IGMP_v3_HOST_MEMBERSHIP_REPORT;
3514	igmp->ir_rsv1 = 0;
3515	igmp->ir_rsv2 = 0;
3516	igmp->ir_numgrps = htons(m->m_pkthdr.PH_vt.vt_nrecs);
3517	igmp->ir_cksum = 0;
3518	igmp->ir_cksum = in_cksum(m, sizeof(struct igmp_report) + igmpreclen);
3519	m->m_pkthdr.PH_vt.vt_nrecs = 0;
3520
3521	m->m_data -= sizeof(struct ip);
3522	m->m_len += sizeof(struct ip);
3523
3524	ip = mtod(m, struct ip *);
3525	ip->ip_tos = IPTOS_PREC_INTERNETCONTROL;
3526	ip->ip_len = hdrlen + igmpreclen;
3527	ip->ip_off = IP_DF;
3528	ip->ip_p = IPPROTO_IGMP;
3529	ip->ip_sum = 0;
3530
3531	ip->ip_src.s_addr = INADDR_ANY;
3532
3533	if (m->m_flags & M_IGMP_LOOP) {
3534		struct in_ifaddr *ia;
3535
3536		IFP_TO_IA(ifp, ia);
3537		if (ia != NULL) {
3538			ip->ip_src = ia->ia_addr.sin_addr;
3539			ifa_free(&ia->ia_ifa);
3540		}
3541	}
3542
3543	ip->ip_dst.s_addr = htonl(INADDR_ALLRPTS_GROUP);
3544
3545	return (m);
3546}
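
/*
 * Illustrative layout of a finished report (a sketch): for a chain
 * carrying two group records of 8 + 3 * 4 = 20 bytes each,
 *
 *	struct ip		20 bytes  tos 0xc0, DF set, dst 224.0.0.22
 *	struct igmp_report	 8 bytes  ir_type 0x22, ir_numgrps htons(2)
 *	group records		40 bytes
 *
 * so ip_len = 28 + 40 = 68, and ir_cksum covers the 48 bytes from the
 * report header to the end of the records.
 */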
3547
3548#ifdef KTR
3549static char *
3550igmp_rec_type_to_str(const int type)
3551{
3552
3553	switch (type) {
3554	case IGMP_CHANGE_TO_EXCLUDE_MODE:
3555		return "TO_EX";
3556	case IGMP_CHANGE_TO_INCLUDE_MODE:
3557		return "TO_IN";
3558	case IGMP_MODE_IS_EXCLUDE:
3559		return "MODE_EX";
3560	case IGMP_MODE_IS_INCLUDE:
3561		return "MODE_IN";
3562	case IGMP_ALLOW_NEW_SOURCES:
3563		return "ALLOW_NEW";
3564	case IGMP_BLOCK_OLD_SOURCES:
3565		return "BLOCK_OLD";
3566	default:
3567		break;
3568	}
3569	return "unknown";
3576}
3577#endif
3578
3579static void
3580igmp_sysinit(void)
3581{
3582
3583	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3584
3585	IGMP_LOCK_INIT();
3586
3587	m_raopt = igmp_ra_alloc();
3588
3589	netisr_register(&igmp_nh);
3590}
3591
3592static void
3593igmp_sysuninit(void)
3594{
3595
3596	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3597
3598	netisr_unregister(&igmp_nh);
3599
3600	m_free(m_raopt);
3601	m_raopt = NULL;
3602
3603	IGMP_LOCK_DESTROY();
3604}
3605
3606/*
3607 * Initialize an IGMPv3 instance.
3608 * VIMAGE: Assumes curvnet set by caller and called per vimage.
3609 */
3610static int
3611vnet_igmp_iattach(const void *unused __unused)
3612{
3613
3614	CTR1(KTR_IGMPV3, "%s: initializing", __func__);
3615
3616	LIST_INIT(&V_igi_head);
3617
3618	/*
3619	 * Initialize sysctls to default values.
3620	 */
3621	V_igmpstat.igps_version = IGPS_VERSION_3;
3622	V_igmpstat.igps_len = sizeof(struct igmpstat);
3623
3624	return (0);
3625}
3626
3627static int
3628vnet_igmp_idetach(const void *unused __unused)
3629{
3630
3631	CTR1(KTR_IGMPV3, "%s: tearing down", __func__);
3632
3633	KASSERT(LIST_EMPTY(&V_igi_head),
3634	    ("%s: igi list not empty; ifnets not detached?", __func__));
3635
3636	return (0);
3637}
3638
3639#ifdef VIMAGE
3640static vnet_modinfo_t vnet_igmp_modinfo = {
3641	.vmi_id		= VNET_MOD_IGMP,
3642	.vmi_name	= "igmp",
3643	.vmi_dependson	= VNET_MOD_INET,
3644	.vmi_iattach	= vnet_igmp_iattach,
3645	.vmi_idetach	= vnet_igmp_idetach
3646};
3647#endif
3648
3649static int
3650igmp_modevent(module_t mod, int type, void *unused __unused)
3651{
3652
3653	switch (type) {
3654	case MOD_LOAD:
3655		igmp_sysinit();
3656#ifdef VIMAGE
3657		vnet_mod_register(&vnet_igmp_modinfo);
3658#else
3659		vnet_igmp_iattach(NULL);
3660#endif
3661		break;
3662	case MOD_UNLOAD:
3663#ifdef VIMAGE
3664		vnet_mod_deregister(&vnet_igmp_modinfo);
3665#else
3666		vnet_igmp_idetach(NULL);
3667#endif
3668		igmp_sysuninit();
3669		break;
3670	default:
3671		return (EOPNOTSUPP);
3672	}
3673	return (0);
3674}
3675
3676static moduledata_t igmp_mod = {
3677	"igmp",
3678	igmp_modevent,
3679	0
3680};
3681DECLARE_MODULE(igmp, igmp_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
3682