1/*-
2 * Copyright (c) 2002 Michael Shalayeff.
3 * Copyright (c) 2003 Ryan McBride.
4 * Copyright (c) 2011 Gleb Smirnoff <glebius@FreeBSD.org>
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR OR HIS RELATIVES BE LIABLE FOR ANY DIRECT,
20 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 * SERVICES; LOSS OF MIND, USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
24 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
25 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
26 * THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/11/sys/netinet/ip_carp.c 356712 2020-01-14 00:51:35Z mav $");
31
32#include "opt_bpf.h"
33#include "opt_inet.h"
34#include "opt_inet6.h"
35
36#include <sys/param.h>
37#include <sys/systm.h>
38#include <sys/bus.h>
39#include <sys/jail.h>
40#include <sys/kernel.h>
41#include <sys/limits.h>
42#include <sys/malloc.h>
43#include <sys/mbuf.h>
44#include <sys/module.h>
45#include <sys/priv.h>
46#include <sys/proc.h>
47#include <sys/protosw.h>
48#include <sys/socket.h>
49#include <sys/sockio.h>
50#include <sys/sysctl.h>
51#include <sys/syslog.h>
52#include <sys/taskqueue.h>
53#include <sys/counter.h>
54
55#include <net/ethernet.h>
56#include <net/fddi.h>
57#include <net/if.h>
58#include <net/if_var.h>
59#include <net/if_dl.h>
60#include <net/if_llatbl.h>
61#include <net/if_types.h>
62#include <net/iso88025.h>
63#include <net/route.h>
64#include <net/vnet.h>
65
66#if defined(INET) || defined(INET6)
67#include <netinet/in.h>
68#include <netinet/in_var.h>
69#include <netinet/ip_carp.h>
70#include <netinet/ip.h>
71#include <machine/in_cksum.h>
72#endif
73#ifdef INET
74#include <netinet/ip_var.h>
75#include <netinet/if_ether.h>
76#endif
77
78#ifdef INET6
79#include <netinet/icmp6.h>
80#include <netinet/ip6.h>
81#include <netinet6/in6_var.h>
82#include <netinet6/ip6_var.h>
83#include <netinet6/scope6_var.h>
84#include <netinet6/nd6.h>
85#endif
86
87#include <crypto/sha1.h>
88
89static MALLOC_DEFINE(M_CARP, "CARP", "CARP addresses");
90
91struct carp_softc {
92	struct ifnet		*sc_carpdev;	/* Pointer to parent ifnet. */
93	struct ifaddr		**sc_ifas;	/* Our ifaddrs. */
94	struct sockaddr_dl	sc_addr;	/* Our link level address. */
95	struct callout		sc_ad_tmo;	/* Advertising timeout. */
96#ifdef INET
97	struct callout		sc_md_tmo;	/* Master down timeout. */
98#endif
99#ifdef INET6
100	struct callout 		sc_md6_tmo;	/* XXX: Master down timeout. */
101#endif
102	struct mtx		sc_mtx;
103
104	int			sc_vhid;
105	int			sc_advskew;
106	int			sc_advbase;
107
108	int			sc_naddrs;
109	int			sc_naddrs6;
110	int			sc_ifasiz;
111	enum { INIT = 0, BACKUP, MASTER }	sc_state;
112	int			sc_suppress;
113	int			sc_sendad_errors;
114#define	CARP_SENDAD_MAX_ERRORS	3
115	int			sc_sendad_success;
116#define	CARP_SENDAD_MIN_SUCCESS 3
117
118	int			sc_init_counter;
119	uint64_t		sc_counter;
120
121	/* authentication */
122#define	CARP_HMAC_PAD	64
123	unsigned char sc_key[CARP_KEY_LEN];
124	unsigned char sc_pad[CARP_HMAC_PAD];
125	SHA1_CTX sc_sha1;
126
127	TAILQ_ENTRY(carp_softc)	sc_list;	/* On the carp_if list. */
128	LIST_ENTRY(carp_softc)	sc_next;	/* On the global list. */
129};
130
131struct carp_if {
132#ifdef INET
133	int	cif_naddrs;
134#endif
135#ifdef INET6
136	int	cif_naddrs6;
137#endif
138	TAILQ_HEAD(, carp_softc) cif_vrs;
139#ifdef INET
140	struct ip_moptions 	 cif_imo;
141#endif
142#ifdef INET6
143	struct ip6_moptions 	 cif_im6o;
144#endif
145	struct ifnet	*cif_ifp;
146	struct mtx	cif_mtx;
147	uint32_t	cif_flags;
148#define	CIF_PROMISC	0x00000001
149};
150
/* Slots of proto_reg[], one per address family; both start out as -1. */
#define	CARP_INET	0
#define	CARP_INET6	1
static int proto_reg[] = {
	[CARP_INET] = -1,
	[CARP_INET6] = -1,
};
154
/*
 * Brief design of carp(4).
 *
 * Any carp-capable ifnet may have a list of carp softcs hanging off
 * its ifp->if_carp pointer. Each softc represents one unique virtual
 * host id, or vhid. The softc has a back pointer to the ifnet. All
 * softcs are also linked into a global list, which has quite limited use.
 *
 * Any interface address that takes part in CARP negotiation has a
 * pointer to the softc of its vhid, ifa->ifa_carp. The address may be
 * either an AF_INET or an AF_INET6 one.
 *
 * Although one could follow the softc's back pointer to the ifnet and
 * traverse its ifp->if_addrhead queue to find all interface addresses
 * involved in CARP, we keep a growable array of ifaddr pointers instead.
 * This allows us to avoid grabbing the IF_ADDR_LOCK() in many traversals
 * that call into the network stack, thus avoiding LORs.
 *
 * Locking:
 *
 * Each softc has a lock, sc_mtx. It is used to synchronise carp_input_c(),
 * callout-driven events and ioctl()s.
 *
 * To traverse the list of softcs on an ifnet we use CIF_LOCK() or carp_sx.
 * To traverse the global list we use the mutex carp_mtx.
 *
 * Known issues with locking:
 *
 * - When sending an ad, we put the pointer to the softc in an mtag, and no
 *   reference counting is done on the softc.
 * - On module unload we may race (?) with a packet processing thread
 *   dereferencing our function pointers.
 */
188
189/* Accept incoming CARP packets. */
190static VNET_DEFINE(int, carp_allow) = 1;
191#define	V_carp_allow	VNET(carp_allow)
192
193/* Set DSCP in outgoing CARP packets. */
194static VNET_DEFINE(int, carp_dscp) = 56;
195#define	V_carp_dscp	VNET(carp_dscp)
196
197/* Preempt slower nodes. */
198static VNET_DEFINE(int, carp_preempt) = 0;
199#define	V_carp_preempt	VNET(carp_preempt)
200
201/* Log level. */
202static VNET_DEFINE(int, carp_log) = 1;
203#define	V_carp_log	VNET(carp_log)
204
205/* Global advskew demotion. */
206static VNET_DEFINE(int, carp_demotion) = 0;
207#define	V_carp_demotion	VNET(carp_demotion)
208
209/* Send error demotion factor. */
210static VNET_DEFINE(int, carp_senderr_adj) = CARP_MAXSKEW;
211#define	V_carp_senderr_adj	VNET(carp_senderr_adj)
212
213/* Iface down demotion factor. */
214static VNET_DEFINE(int, carp_ifdown_adj) = CARP_MAXSKEW;
215#define	V_carp_ifdown_adj	VNET(carp_ifdown_adj)
216
217static int carp_allow_sysctl(SYSCTL_HANDLER_ARGS);
218static int carp_dscp_sysctl(SYSCTL_HANDLER_ARGS);
219static int carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS);
220
221SYSCTL_NODE(_net_inet, IPPROTO_CARP,	carp,	CTLFLAG_RW, 0,	"CARP");
222SYSCTL_PROC(_net_inet_carp, OID_AUTO, allow,
223    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_allow_sysctl, "I",
224    "Accept incoming CARP packets");
225SYSCTL_PROC(_net_inet_carp, OID_AUTO, dscp,
226    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, 0, 0, carp_dscp_sysctl, "I",
227    "DSCP value for carp packets");
228SYSCTL_INT(_net_inet_carp, OID_AUTO, preempt, CTLFLAG_VNET | CTLFLAG_RW,
229    &VNET_NAME(carp_preempt), 0, "High-priority backup preemption mode");
230SYSCTL_INT(_net_inet_carp, OID_AUTO, log, CTLFLAG_VNET | CTLFLAG_RW,
231    &VNET_NAME(carp_log), 0, "CARP log level");
232SYSCTL_PROC(_net_inet_carp, OID_AUTO, demotion,
233    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
234    0, 0, carp_demote_adj_sysctl, "I",
235    "Adjust demotion factor (skew of advskew)");
236SYSCTL_INT(_net_inet_carp, OID_AUTO, senderr_demotion_factor,
237    CTLFLAG_VNET | CTLFLAG_RW,
238    &VNET_NAME(carp_senderr_adj), 0, "Send error demotion factor adjustment");
239SYSCTL_INT(_net_inet_carp, OID_AUTO, ifdown_demotion_factor,
240    CTLFLAG_VNET | CTLFLAG_RW,
241    &VNET_NAME(carp_ifdown_adj), 0,
242    "Interface down demotion factor adjustment");
243
244VNET_PCPUSTAT_DEFINE(struct carpstats, carpstats);
245VNET_PCPUSTAT_SYSINIT(carpstats);
246VNET_PCPUSTAT_SYSUNINIT(carpstats);
247
248#define	CARPSTATS_ADD(name, val)	\
249    counter_u64_add(VNET(carpstats)[offsetof(struct carpstats, name) / \
250	sizeof(uint64_t)], (val))
251#define	CARPSTATS_INC(name)		CARPSTATS_ADD(name, 1)
252
253SYSCTL_VNET_PCPUSTAT(_net_inet_carp, OID_AUTO, stats, struct carpstats,
254    carpstats, "CARP statistics (struct carpstats, netinet/ip_carp.h)");
255
/* Wrappers around the per-softc mutex, sc_mtx. */
#define	CARP_LOCK_INIT(sc)						\
	mtx_init(&(sc)->sc_mtx, "carp_softc", NULL, MTX_DEF)
#define	CARP_LOCK_DESTROY(sc)	mtx_destroy(&(sc)->sc_mtx)
#define	CARP_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
#define	CARP_LOCK(sc)		mtx_lock(&(sc)->sc_mtx)
#define	CARP_UNLOCK(sc)		mtx_unlock(&(sc)->sc_mtx)

/* Wrappers around the per-interface mutex, cif_mtx. */
#define	CIF_LOCK_INIT(cif)						\
	mtx_init(&(cif)->cif_mtx, "carp_if", NULL, MTX_DEF)
#define	CIF_LOCK_DESTROY(cif)	mtx_destroy(&(cif)->cif_mtx)
#define	CIF_LOCK_ASSERT(cif)	mtx_assert(&(cif)->cif_mtx, MA_OWNED)
#define	CIF_LOCK(cif)		mtx_lock(&(cif)->cif_mtx)
#define	CIF_UNLOCK(cif)		mtx_unlock(&(cif)->cif_mtx)

/*
 * Release a carp_if once no softc is left on it.  The cif is locked, and
 * when its softc list is empty it is handed to carp_free_if() with the
 * lock still held (NOTE(review): carp_free_if() presumably disposes of
 * the lock -- confirm).  Otherwise the lock is simply dropped again.
 */
#define	CIF_FREE(cif)	do {						\
	CIF_LOCK(cif);							\
	if (!TAILQ_EMPTY(&(cif)->cif_vrs))				\
		CIF_UNLOCK(cif);					\
	else								\
		carp_free_if(cif);					\
} while (0)
275
/* Log at LOG_INFO with a "carp: " prefix when the log level is above 0. */
#define	CARP_LOG(...)							\
	do {								\
		if (V_carp_log > 0)					\
			log(LOG_INFO, "carp: " __VA_ARGS__);		\
	} while (0)

/* Log at LOG_DEBUG, without prefix, when the log level is above 1. */
#define	CARP_DEBUG(...)							\
	do {								\
		if (V_carp_log > 1)					\
			log(LOG_DEBUG, __VA_ARGS__);			\
	} while (0)
285
/*
 * Iteration helpers.
 *
 * NB: each of them expands to a lock assertion statement followed by a
 * loop header, so they must not be used as the sole body of an unbraced
 * if/else/for/while.
 */

/*
 * Walk the addresses of 'ifp' that take part in CARP, i.e. those with
 * ifa_carp set.  The interface address lock must be held.
 */
#define	IFNET_FOREACH_IFA(ifp, ifa)					\
	IF_ADDR_LOCK_ASSERT(ifp);					\
	TAILQ_FOREACH((ifa), &(ifp)->if_addrhead, ifa_link)		\
		if ((ifa)->ifa_carp != NULL)

/*
 * Walk the ifaddr array of softc 'sc'.  The walk covers at most
 * sc_naddrs + sc_naddrs6 entries and stops early at a NULL entry.
 * The softc lock must be held.
 */
#define	CARP_FOREACH_IFA(sc, ifa)					\
	CARP_LOCK_ASSERT(sc);						\
	for (int _i = 0;						\
		_i < (sc)->sc_naddrs + (sc)->sc_naddrs6 &&		\
		((ifa) = (sc)->sc_ifas[_i]) != NULL;			\
		++_i)

/*
 * Walk the softcs configured on 'ifp'.  Either the cif mutex or carp_sx
 * (exclusively) must be held.
 */
#define	IFNET_FOREACH_CARP(ifp, sc)					\
	KASSERT(mtx_owned(&(ifp)->if_carp->cif_mtx) ||			\
	    sx_xlocked(&carp_sx), ("cif_vrs not locked"));		\
	TAILQ_FOREACH((sc), &(ifp)->if_carp->cif_vrs, sc_list)

/*
 * Effective advertisement skew of 'sc': its own advskew plus the global
 * demotion, capped at CARP_MAXSKEW.
 */
#define	DEMOTE_ADVSKEW(sc)					\
    (((sc)->sc_advskew + V_carp_demotion > CARP_MAXSKEW) ?	\
    CARP_MAXSKEW : ((sc)->sc_advskew + V_carp_demotion))
306
307static void	carp_input_c(struct mbuf *, struct carp_header *, sa_family_t);
308static struct carp_softc
309		*carp_alloc(struct ifnet *);
310static void	carp_destroy(struct carp_softc *);
311static struct carp_if
312		*carp_alloc_if(struct ifnet *);
313static void	carp_free_if(struct carp_if *);
314static void	carp_set_state(struct carp_softc *, int, const char* reason);
315static void	carp_sc_state(struct carp_softc *);
316static void	carp_setrun(struct carp_softc *, sa_family_t);
317static void	carp_master_down(void *);
318static void	carp_master_down_locked(struct carp_softc *,
319    		    const char* reason);
320static void	carp_send_ad(void *);
321static void	carp_send_ad_locked(struct carp_softc *);
322static void	carp_addroute(struct carp_softc *);
323static void	carp_ifa_addroute(struct ifaddr *);
324static void	carp_delroute(struct carp_softc *);
325static void	carp_ifa_delroute(struct ifaddr *);
326static void	carp_send_ad_all(void *, int);
327static void	carp_demote_adj(int, char *);
328
329static LIST_HEAD(, carp_softc) carp_list;
330static struct mtx carp_mtx;
331static struct sx carp_sx;
332static struct task carp_sendall_task =
333    TASK_INITIALIZER(0, carp_send_ad_all, NULL);
334
335static void
336carp_hmac_prepare(struct carp_softc *sc)
337{
338	uint8_t version = CARP_VERSION, type = CARP_ADVERTISEMENT;
339	uint8_t vhid = sc->sc_vhid & 0xff;
340	struct ifaddr *ifa;
341	int i, found;
342#ifdef INET
343	struct in_addr last, cur, in;
344#endif
345#ifdef INET6
346	struct in6_addr last6, cur6, in6;
347#endif
348
349	CARP_LOCK_ASSERT(sc);
350
351	/* Compute ipad from key. */
352	bzero(sc->sc_pad, sizeof(sc->sc_pad));
353	bcopy(sc->sc_key, sc->sc_pad, sizeof(sc->sc_key));
354	for (i = 0; i < sizeof(sc->sc_pad); i++)
355		sc->sc_pad[i] ^= 0x36;
356
357	/* Precompute first part of inner hash. */
358	SHA1Init(&sc->sc_sha1);
359	SHA1Update(&sc->sc_sha1, sc->sc_pad, sizeof(sc->sc_pad));
360	SHA1Update(&sc->sc_sha1, (void *)&version, sizeof(version));
361	SHA1Update(&sc->sc_sha1, (void *)&type, sizeof(type));
362	SHA1Update(&sc->sc_sha1, (void *)&vhid, sizeof(vhid));
363#ifdef INET
364	cur.s_addr = 0;
365	do {
366		found = 0;
367		last = cur;
368		cur.s_addr = 0xffffffff;
369		CARP_FOREACH_IFA(sc, ifa) {
370			in.s_addr = ifatoia(ifa)->ia_addr.sin_addr.s_addr;
371			if (ifa->ifa_addr->sa_family == AF_INET &&
372			    ntohl(in.s_addr) > ntohl(last.s_addr) &&
373			    ntohl(in.s_addr) < ntohl(cur.s_addr)) {
374				cur.s_addr = in.s_addr;
375				found++;
376			}
377		}
378		if (found)
379			SHA1Update(&sc->sc_sha1, (void *)&cur, sizeof(cur));
380	} while (found);
381#endif /* INET */
382#ifdef INET6
383	memset(&cur6, 0, sizeof(cur6));
384	do {
385		found = 0;
386		last6 = cur6;
387		memset(&cur6, 0xff, sizeof(cur6));
388		CARP_FOREACH_IFA(sc, ifa) {
389			in6 = ifatoia6(ifa)->ia_addr.sin6_addr;
390			if (IN6_IS_SCOPE_EMBED(&in6))
391				in6.s6_addr16[1] = 0;
392			if (ifa->ifa_addr->sa_family == AF_INET6 &&
393			    memcmp(&in6, &last6, sizeof(in6)) > 0 &&
394			    memcmp(&in6, &cur6, sizeof(in6)) < 0) {
395				cur6 = in6;
396				found++;
397			}
398		}
399		if (found)
400			SHA1Update(&sc->sc_sha1, (void *)&cur6, sizeof(cur6));
401	} while (found);
402#endif /* INET6 */
403
404	/* convert ipad to opad */
405	for (i = 0; i < sizeof(sc->sc_pad); i++)
406		sc->sc_pad[i] ^= 0x36 ^ 0x5c;
407}
408
409static void
410carp_hmac_generate(struct carp_softc *sc, uint32_t counter[2],
411    unsigned char md[20])
412{
413	SHA1_CTX sha1ctx;
414
415	CARP_LOCK_ASSERT(sc);
416
417	/* fetch first half of inner hash */
418	bcopy(&sc->sc_sha1, &sha1ctx, sizeof(sha1ctx));
419
420	SHA1Update(&sha1ctx, (void *)counter, sizeof(sc->sc_counter));
421	SHA1Final(md, &sha1ctx);
422
423	/* outer hash */
424	SHA1Init(&sha1ctx);
425	SHA1Update(&sha1ctx, sc->sc_pad, sizeof(sc->sc_pad));
426	SHA1Update(&sha1ctx, md, 20);
427	SHA1Final(md, &sha1ctx);
428}
429
430static int
431carp_hmac_verify(struct carp_softc *sc, uint32_t counter[2],
432    unsigned char md[20])
433{
434	unsigned char md2[20];
435
436	CARP_LOCK_ASSERT(sc);
437
438	carp_hmac_generate(sc, counter, md2);
439
440	return (bcmp(md, md2, sizeof(md2)));
441}
442
/*
 * Process an input packet received over IPv4.
 *
 * We have rearranged the order of the checks compared to the RFC, but it
 * seems more efficient this way or not possible otherwise.
 *
 * The mbuf is always consumed: *mp is cleared, and the packet is either
 * freed after a failed check or handed to carp_input_c().  The function
 * always returns IPPROTO_DONE.
 */
#ifdef INET
int
carp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m = *mp;
	struct ip *ip = mtod(m, struct ip *);
	struct carp_header *ch;
	int iplen, len;

	*mp = NULL;

	CARPSTATS_INC(carps_ipackets);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* verify that the IP TTL is 255.  */
	if (ip->ip_ttl != CARP_DFLTTL) {
		CARPSTATS_INC(carps_badttl);
		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
		    ip->ip_ttl,
		    m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* The IP header plus a complete CARP header must be present. */
	iplen = ip->ip_hl << 2;
	len = iplen + sizeof(*ch);

	if (m->m_pkthdr.len < len) {
		CARPSTATS_INC(carps_badlen);
		CARP_DEBUG("%s: received len %zd < sizeof(struct carp_header) "
		    "on %s\n", __func__, m->m_len - sizeof(struct ip),
		    m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/*
	 * Make both headers contiguous, but only when the first mbuf
	 * does not hold them already.
	 */
	if (m->m_len < len) {
		if ((m = m_pullup(m, len)) == NULL) {
			CARPSTATS_INC(carps_hdrops);
			CARP_DEBUG("%s: pullup failed\n", __func__);
			return (IPPROTO_DONE);
		}
		ip = mtod(m, struct ip *);
	}
	ch = (struct carp_header *)((char *)ip + iplen);

	/* verify the CARP checksum, hiding the IP header meanwhile */
	m->m_data += iplen;
	if (in_cksum(m, len - iplen)) {
		CARPSTATS_INC(carps_badsum);
		CARP_DEBUG("%s: checksum failed on %s\n", __func__,
		    m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}
	m->m_data -= iplen;

	carp_input_c(m, ch, AF_INET);
	return (IPPROTO_DONE);
}
#endif
534
#ifdef INET6
/*
 * Process an input packet received over IPv6.  The packet is checked for
 * being received on a CARP interface, for a hop limit of 255, for
 * carrying a complete CARP header and for a valid checksum, then it is
 * handed to carp_input_c().  The mbuf is consumed in all cases and
 * IPPROTO_DONE is returned.
 */
int
carp6_input(struct mbuf **mp, int *offp, int proto)
{
	struct carp_header *ch;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	u_int len;
	int choff;

	m = *mp;
	ip6 = mtod(m, struct ip6_hdr *);
	choff = *offp;

	CARPSTATS_INC(carps_ipackets6);

	if (!V_carp_allow) {
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* The receiving interface must have CARP configured. */
	if (m->m_pkthdr.rcvif->if_carp == NULL) {
		CARPSTATS_INC(carps_badif);
		CARP_DEBUG("%s: packet received on non-carp interface: %s\n",
		    __func__, m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* The hop limit must be 255. */
	if (ip6->ip6_hlim != CARP_DFLTTL) {
		CARPSTATS_INC(carps_badttl);
		CARP_DEBUG("%s: received ttl %d != 255 on %s\n", __func__,
		    ip6->ip6_hlim, m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/*
	 * A complete CARP header must be present.  The length of the first
	 * mbuf is saved beforehand, for the debug message only.
	 */
	len = m->m_len;
	IP6_EXTHDR_GET(ch, struct carp_header *, m, choff, sizeof(*ch));
	if (ch == NULL) {
		CARPSTATS_INC(carps_badlen);
		CARP_DEBUG("%s: packet size %u too small\n", __func__, len);
		return (IPPROTO_DONE);
	}

	/* Checksum the CARP header, hiding what precedes it meanwhile. */
	m->m_data += choff;
	if (in_cksum(m, sizeof(*ch))) {
		CARPSTATS_INC(carps_badsum);
		CARP_DEBUG("%s: checksum failed, on %s\n", __func__,
		    m->m_pkthdr.rcvif->if_xname);
		m_freem(m);
		return (IPPROTO_DONE);
	}
	m->m_data -= choff;

	carp_input_c(m, ch, AF_INET6);
	return (IPPROTO_DONE);
}
#endif /* INET6 */
594
595static void
596carp_input_c(struct mbuf *m, struct carp_header *ch, sa_family_t af)
597{
598	struct ifnet *ifp = m->m_pkthdr.rcvif;
599	struct ifaddr *ifa;
600	struct carp_softc *sc;
601	uint64_t tmp_counter;
602	struct timeval sc_tv, ch_tv;
603
604	/* verify that the VHID is valid on the receiving interface */
605	IF_ADDR_RLOCK(ifp);
606	IFNET_FOREACH_IFA(ifp, ifa)
607		if (ifa->ifa_addr->sa_family == af &&
608		    ifa->ifa_carp->sc_vhid == ch->carp_vhid) {
609			ifa_ref(ifa);
610			break;
611		}
612	IF_ADDR_RUNLOCK(ifp);
613
614	if (ifa == NULL) {
615		CARPSTATS_INC(carps_badvhid);
616		m_freem(m);
617		return;
618	}
619
620	/* verify the CARP version. */
621	if (ch->carp_version != CARP_VERSION) {
622		CARPSTATS_INC(carps_badver);
623		CARP_DEBUG("%s: invalid version %d\n", ifp->if_xname,
624		    ch->carp_version);
625		ifa_free(ifa);
626		m_freem(m);
627		return;
628	}
629
630	sc = ifa->ifa_carp;
631	CARP_LOCK(sc);
632	ifa_free(ifa);
633
634	if (carp_hmac_verify(sc, ch->carp_counter, ch->carp_md)) {
635		CARPSTATS_INC(carps_badauth);
636		CARP_DEBUG("%s: incorrect hash for VHID %u@%s\n", __func__,
637		    sc->sc_vhid, ifp->if_xname);
638		goto out;
639	}
640
641	tmp_counter = ntohl(ch->carp_counter[0]);
642	tmp_counter = tmp_counter<<32;
643	tmp_counter += ntohl(ch->carp_counter[1]);
644
645	/* XXX Replay protection goes here */
646
647	sc->sc_init_counter = 0;
648	sc->sc_counter = tmp_counter;
649
650	sc_tv.tv_sec = sc->sc_advbase;
651	sc_tv.tv_usec = DEMOTE_ADVSKEW(sc) * 1000000 / 256;
652	ch_tv.tv_sec = ch->carp_advbase;
653	ch_tv.tv_usec = ch->carp_advskew * 1000000 / 256;
654
655	switch (sc->sc_state) {
656	case INIT:
657		break;
658	case MASTER:
659		/*
660		 * If we receive an advertisement from a master who's going to
661		 * be more frequent than us, go into BACKUP state.
662		 */
663		if (timevalcmp(&sc_tv, &ch_tv, >) ||
664		    timevalcmp(&sc_tv, &ch_tv, ==)) {
665			callout_stop(&sc->sc_ad_tmo);
666			carp_set_state(sc, BACKUP,
667			    "more frequent advertisement received");
668			carp_setrun(sc, 0);
669			carp_delroute(sc);
670		}
671		break;
672	case BACKUP:
673		/*
674		 * If we're pre-empting masters who advertise slower than us,
675		 * and this one claims to be slower, treat him as down.
676		 */
677		if (V_carp_preempt && timevalcmp(&sc_tv, &ch_tv, <)) {
678			carp_master_down_locked(sc,
679			    "preempting a slower master");
680			break;
681		}
682
683		/*
684		 *  If the master is going to advertise at such a low frequency
685		 *  that he's guaranteed to time out, we'd might as well just
686		 *  treat him as timed out now.
687		 */
688		sc_tv.tv_sec = sc->sc_advbase * 3;
689		if (timevalcmp(&sc_tv, &ch_tv, <)) {
690			carp_master_down_locked(sc, "master will time out");
691			break;
692		}
693
694		/*
695		 * Otherwise, we reset the counter and wait for the next
696		 * advertisement.
697		 */
698		carp_setrun(sc, af);
699		break;
700	}
701
702out:
703	CARP_UNLOCK(sc);
704	m_freem(m);
705}
706
707static int
708carp_prepare_ad(struct mbuf *m, struct carp_softc *sc, struct carp_header *ch)
709{
710	struct m_tag *mtag;
711
712	if (sc->sc_init_counter) {
713		/* this could also be seconds since unix epoch */
714		sc->sc_counter = arc4random();
715		sc->sc_counter = sc->sc_counter << 32;
716		sc->sc_counter += arc4random();
717	} else
718		sc->sc_counter++;
719
720	ch->carp_counter[0] = htonl((sc->sc_counter>>32)&0xffffffff);
721	ch->carp_counter[1] = htonl(sc->sc_counter&0xffffffff);
722
723	carp_hmac_generate(sc, ch->carp_counter, ch->carp_md);
724
725	/* Tag packet for carp_output */
726	if ((mtag = m_tag_get(PACKET_TAG_CARP, sizeof(struct carp_softc *),
727	    M_NOWAIT)) == NULL) {
728		m_freem(m);
729		CARPSTATS_INC(carps_onomem);
730		return (ENOMEM);
731	}
732	bcopy(&sc, mtag + 1, sizeof(sc));
733	m_tag_prepend(m, mtag);
734
735	return (0);
736}
737
738/*
739 * To avoid LORs and possible recursions this function shouldn't
740 * be called directly, but scheduled via taskqueue.
741 */
742static void
743carp_send_ad_all(void *ctx __unused, int pending __unused)
744{
745	struct carp_softc *sc;
746
747	mtx_lock(&carp_mtx);
748	LIST_FOREACH(sc, &carp_list, sc_next)
749		if (sc->sc_state == MASTER) {
750			CARP_LOCK(sc);
751			CURVNET_SET(sc->sc_carpdev->if_vnet);
752			carp_send_ad_locked(sc);
753			CURVNET_RESTORE();
754			CARP_UNLOCK(sc);
755		}
756	mtx_unlock(&carp_mtx);
757}
758
759/* Send a periodic advertisement, executed in callout context. */
760static void
761carp_send_ad(void *v)
762{
763	struct carp_softc *sc = v;
764
765	CARP_LOCK_ASSERT(sc);
766	CURVNET_SET(sc->sc_carpdev->if_vnet);
767	carp_send_ad_locked(sc);
768	CURVNET_RESTORE();
769	CARP_UNLOCK(sc);
770}
771
772static void
773carp_send_ad_error(struct carp_softc *sc, int error)
774{
775
776	if (error) {
777		if (sc->sc_sendad_errors < INT_MAX)
778			sc->sc_sendad_errors++;
779		if (sc->sc_sendad_errors == CARP_SENDAD_MAX_ERRORS) {
780			static const char fmt[] = "send error %d on %s";
781			char msg[sizeof(fmt) + IFNAMSIZ];
782
783			sprintf(msg, fmt, error, sc->sc_carpdev->if_xname);
784			carp_demote_adj(V_carp_senderr_adj, msg);
785		}
786		sc->sc_sendad_success = 0;
787	} else {
788		if (sc->sc_sendad_errors >= CARP_SENDAD_MAX_ERRORS &&
789		    ++sc->sc_sendad_success >= CARP_SENDAD_MIN_SUCCESS) {
790			static const char fmt[] = "send ok on %s";
791			char msg[sizeof(fmt) + IFNAMSIZ];
792
793			sprintf(msg, fmt, sc->sc_carpdev->if_xname);
794			carp_demote_adj(-V_carp_senderr_adj, msg);
795			sc->sc_sendad_errors = 0;
796		} else
797			sc->sc_sendad_errors = 0;
798	}
799}
800
801static void
802carp_send_ad_locked(struct carp_softc *sc)
803{
804	struct carp_header ch;
805	struct timeval tv;
806	struct sockaddr sa;
807	struct ifaddr *ifa;
808	struct carp_header *ch_ptr;
809	struct mbuf *m;
810	int len, advskew;
811
812	CARP_LOCK_ASSERT(sc);
813
814	advskew = DEMOTE_ADVSKEW(sc);
815	tv.tv_sec = sc->sc_advbase;
816	tv.tv_usec = advskew * 1000000 / 256;
817
818	ch.carp_version = CARP_VERSION;
819	ch.carp_type = CARP_ADVERTISEMENT;
820	ch.carp_vhid = sc->sc_vhid;
821	ch.carp_advbase = sc->sc_advbase;
822	ch.carp_advskew = advskew;
823	ch.carp_authlen = 7;	/* XXX DEFINE */
824	ch.carp_pad1 = 0;	/* must be zero */
825	ch.carp_cksum = 0;
826
827	/* XXXGL: OpenBSD picks first ifaddr with needed family. */
828
829#ifdef INET
830	if (sc->sc_naddrs) {
831		struct ip *ip;
832
833		m = m_gethdr(M_NOWAIT, MT_DATA);
834		if (m == NULL) {
835			CARPSTATS_INC(carps_onomem);
836			goto resched;
837		}
838		len = sizeof(*ip) + sizeof(ch);
839		m->m_pkthdr.len = len;
840		m->m_pkthdr.rcvif = NULL;
841		m->m_len = len;
842		M_ALIGN(m, m->m_len);
843		m->m_flags |= M_MCAST;
844		ip = mtod(m, struct ip *);
845		ip->ip_v = IPVERSION;
846		ip->ip_hl = sizeof(*ip) >> 2;
847		ip->ip_tos = V_carp_dscp << IPTOS_DSCP_OFFSET;
848		ip->ip_len = htons(len);
849		ip->ip_off = htons(IP_DF);
850		ip->ip_ttl = CARP_DFLTTL;
851		ip->ip_p = IPPROTO_CARP;
852		ip->ip_sum = 0;
853		ip_fillid(ip);
854
855		bzero(&sa, sizeof(sa));
856		sa.sa_family = AF_INET;
857		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
858		if (ifa != NULL) {
859			ip->ip_src.s_addr =
860			    ifatoia(ifa)->ia_addr.sin_addr.s_addr;
861			ifa_free(ifa);
862		} else
863			ip->ip_src.s_addr = 0;
864		ip->ip_dst.s_addr = htonl(INADDR_CARP_GROUP);
865
866		ch_ptr = (struct carp_header *)(&ip[1]);
867		bcopy(&ch, ch_ptr, sizeof(ch));
868		if (carp_prepare_ad(m, sc, ch_ptr))
869			goto resched;
870
871		m->m_data += sizeof(*ip);
872		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip));
873		m->m_data -= sizeof(*ip);
874
875		CARPSTATS_INC(carps_opackets);
876
877		carp_send_ad_error(sc, ip_output(m, NULL, NULL, IP_RAWOUTPUT,
878		    &sc->sc_carpdev->if_carp->cif_imo, NULL));
879	}
880#endif /* INET */
881#ifdef INET6
882	if (sc->sc_naddrs6) {
883		struct ip6_hdr *ip6;
884
885		m = m_gethdr(M_NOWAIT, MT_DATA);
886		if (m == NULL) {
887			CARPSTATS_INC(carps_onomem);
888			goto resched;
889		}
890		len = sizeof(*ip6) + sizeof(ch);
891		m->m_pkthdr.len = len;
892		m->m_pkthdr.rcvif = NULL;
893		m->m_len = len;
894		M_ALIGN(m, m->m_len);
895		m->m_flags |= M_MCAST;
896		ip6 = mtod(m, struct ip6_hdr *);
897		bzero(ip6, sizeof(*ip6));
898		ip6->ip6_vfc |= IPV6_VERSION;
899		/* Traffic class isn't defined in ip6 struct instead
900		 * it gets offset into flowid field */
901		ip6->ip6_flow |= htonl(V_carp_dscp << (IPV6_FLOWLABEL_LEN +
902		    IPTOS_DSCP_OFFSET));
903		ip6->ip6_hlim = CARP_DFLTTL;
904		ip6->ip6_nxt = IPPROTO_CARP;
905		bzero(&sa, sizeof(sa));
906
907		/* set the source address */
908		sa.sa_family = AF_INET6;
909		ifa = ifaof_ifpforaddr(&sa, sc->sc_carpdev);
910		if (ifa != NULL) {
911			bcopy(IFA_IN6(ifa), &ip6->ip6_src,
912			    sizeof(struct in6_addr));
913			ifa_free(ifa);
914		} else
915			/* This should never happen with IPv6. */
916			bzero(&ip6->ip6_src, sizeof(struct in6_addr));
917
918		/* Set the multicast destination. */
919		ip6->ip6_dst.s6_addr16[0] = htons(0xff02);
920		ip6->ip6_dst.s6_addr8[15] = 0x12;
921		if (in6_setscope(&ip6->ip6_dst, sc->sc_carpdev, NULL) != 0) {
922			m_freem(m);
923			CARP_DEBUG("%s: in6_setscope failed\n", __func__);
924			goto resched;
925		}
926
927		ch_ptr = (struct carp_header *)(&ip6[1]);
928		bcopy(&ch, ch_ptr, sizeof(ch));
929		if (carp_prepare_ad(m, sc, ch_ptr))
930			goto resched;
931
932		m->m_data += sizeof(*ip6);
933		ch_ptr->carp_cksum = in_cksum(m, len - sizeof(*ip6));
934		m->m_data -= sizeof(*ip6);
935
936		CARPSTATS_INC(carps_opackets6);
937
938		carp_send_ad_error(sc, ip6_output(m, NULL, NULL, 0,
939		    &sc->sc_carpdev->if_carp->cif_im6o, NULL, NULL));
940	}
941#endif /* INET6 */
942
943resched:
944	callout_reset(&sc->sc_ad_tmo, tvtohz(&tv), carp_send_ad, sc);
945}
946
947static void
948carp_addroute(struct carp_softc *sc)
949{
950	struct ifaddr *ifa;
951
952	CARP_FOREACH_IFA(sc, ifa)
953		carp_ifa_addroute(ifa);
954}
955
956static void
957carp_ifa_addroute(struct ifaddr *ifa)
958{
959
960	switch (ifa->ifa_addr->sa_family) {
961#ifdef INET
962	case AF_INET:
963		in_addprefix(ifatoia(ifa), RTF_UP);
964		ifa_add_loopback_route(ifa,
965		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
966		break;
967#endif
968#ifdef INET6
969	case AF_INET6:
970		ifa_add_loopback_route(ifa,
971		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
972		nd6_add_ifa_lle(ifatoia6(ifa));
973		break;
974#endif
975	}
976}
977
978static void
979carp_delroute(struct carp_softc *sc)
980{
981	struct ifaddr *ifa;
982
983	CARP_FOREACH_IFA(sc, ifa)
984		carp_ifa_delroute(ifa);
985}
986
987static void
988carp_ifa_delroute(struct ifaddr *ifa)
989{
990
991	switch (ifa->ifa_addr->sa_family) {
992#ifdef INET
993	case AF_INET:
994		ifa_del_loopback_route(ifa,
995		    (struct sockaddr *)&ifatoia(ifa)->ia_addr);
996		in_scrubprefix(ifatoia(ifa), LLE_STATIC);
997		break;
998#endif
999#ifdef INET6
1000	case AF_INET6:
1001		ifa_del_loopback_route(ifa,
1002		    (struct sockaddr *)&ifatoia6(ifa)->ia_addr);
1003		nd6_rem_ifa_lle(ifatoia6(ifa), 1);
1004		break;
1005#endif
1006	}
1007}
1008
1009int
1010carp_master(struct ifaddr *ifa)
1011{
1012	struct carp_softc *sc = ifa->ifa_carp;
1013
1014	return (sc->sc_state == MASTER);
1015}
1016
1017#ifdef INET
1018/*
1019 * Broadcast a gratuitous ARP request containing
1020 * the virtual router MAC address for each IP address
1021 * associated with the virtual router.
1022 */
1023static void
1024carp_send_arp(struct carp_softc *sc)
1025{
1026	struct ifaddr *ifa;
1027	struct in_addr addr;
1028
1029	CARP_FOREACH_IFA(sc, ifa) {
1030		if (ifa->ifa_addr->sa_family != AF_INET)
1031			continue;
1032		addr = ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
1033		arp_announce_ifaddr(sc->sc_carpdev, addr, LLADDR(&sc->sc_addr));
1034	}
1035}
1036
1037int
1038carp_iamatch(struct ifaddr *ifa, uint8_t **enaddr)
1039{
1040	struct carp_softc *sc = ifa->ifa_carp;
1041
1042	if (sc->sc_state == MASTER) {
1043		*enaddr = LLADDR(&sc->sc_addr);
1044		return (1);
1045	}
1046
1047	return (0);
1048}
1049#endif
1050
1051#ifdef INET6
1052static void
1053carp_send_na(struct carp_softc *sc)
1054{
1055	static struct in6_addr mcast = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
1056	struct ifaddr *ifa;
1057	struct in6_addr *in6;
1058
1059	CARP_FOREACH_IFA(sc, ifa) {
1060		if (ifa->ifa_addr->sa_family != AF_INET6)
1061			continue;
1062
1063		in6 = IFA_IN6(ifa);
1064		nd6_na_output(sc->sc_carpdev, &mcast, in6,
1065		    ND_NA_FLAG_OVERRIDE, 1, NULL);
1066		DELAY(1000);	/* XXX */
1067	}
1068}
1069
1070/*
1071 * Returns ifa in case it's a carp address and it is MASTER, or if the address
1072 * matches and is not a carp address.  Returns NULL otherwise.
1073 */
1074struct ifaddr *
1075carp_iamatch6(struct ifnet *ifp, struct in6_addr *taddr)
1076{
1077	struct ifaddr *ifa;
1078
1079	ifa = NULL;
1080	IF_ADDR_RLOCK(ifp);
1081	TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1082		if (ifa->ifa_addr->sa_family != AF_INET6)
1083			continue;
1084		if (!IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa)))
1085			continue;
1086		if (ifa->ifa_carp && ifa->ifa_carp->sc_state != MASTER)
1087			ifa = NULL;
1088		else
1089			ifa_ref(ifa);
1090		break;
1091	}
1092	IF_ADDR_RUNLOCK(ifp);
1093
1094	return (ifa);
1095}
1096
/*
 * Find the address on ifp that equals taddr and return the link-layer
 * address of the vhid it belongs to.  On a match the softc pointer is also
 * attached to mbuf m as a PACKET_TAG_CARP tag (this is the tag that
 * carp_output() looks for).  Returns NULL when no address matches.
 *
 * NOTE(review): sc is taken from ifa->ifa_carp and dereferenced without a
 * NULL check; presumably IFNET_FOREACH_IFA only yields carp-attached
 * addresses -- confirm against the macro definition.
 */
caddr_t
carp_macmatch6(struct ifnet *ifp, struct mbuf *m, const struct in6_addr *taddr)
{
	struct ifaddr *ifa;

	IF_ADDR_RLOCK(ifp);
	IFNET_FOREACH_IFA(ifp, ifa)
		if (ifa->ifa_addr->sa_family == AF_INET6 &&
		    IN6_ARE_ADDR_EQUAL(taddr, IFA_IN6(ifa))) {
			struct carp_softc *sc = ifa->ifa_carp;
			struct m_tag *mtag;

			/* The address list lock is dropped before tagging. */
			IF_ADDR_RUNLOCK(ifp);

			mtag = m_tag_get(PACKET_TAG_CARP,
			    sizeof(struct carp_softc *), M_NOWAIT);
			if (mtag == NULL)
				/* Better a bit than nothing. */
				return (LLADDR(&sc->sc_addr));

			/* The tag payload is the softc pointer itself. */
			bcopy(&sc, mtag + 1, sizeof(sc));
			m_tag_prepend(m, mtag);

			return (LLADDR(&sc->sc_addr));
		}
	IF_ADDR_RUNLOCK(ifp);

	return (NULL);
}
1126#endif /* INET6 */
1127
1128int
1129carp_forus(struct ifnet *ifp, u_char *dhost)
1130{
1131	struct carp_softc *sc;
1132	uint8_t *ena = dhost;
1133
1134	if (ena[0] || ena[1] || ena[2] != 0x5e || ena[3] || ena[4] != 1)
1135		return (0);
1136
1137	CIF_LOCK(ifp->if_carp);
1138	IFNET_FOREACH_CARP(ifp, sc) {
1139		/*
1140		 * CARP_LOCK() is not here, since would protect nothing, but
1141		 * cause deadlock with if_bridge, calling this under its lock.
1142		 */
1143		if (sc->sc_state == MASTER && !bcmp(dhost, LLADDR(&sc->sc_addr),
1144		    ETHER_ADDR_LEN)) {
1145			CIF_UNLOCK(ifp->if_carp);
1146			return (1);
1147		}
1148	}
1149	CIF_UNLOCK(ifp->if_carp);
1150
1151	return (0);
1152}
1153
1154/* Master down timeout event, executed in callout context. */
1155static void
1156carp_master_down(void *v)
1157{
1158	struct carp_softc *sc = v;
1159
1160	CARP_LOCK_ASSERT(sc);
1161
1162	CURVNET_SET(sc->sc_carpdev->if_vnet);
1163	if (sc->sc_state == BACKUP) {
1164		carp_master_down_locked(sc, "master timed out");
1165	}
1166	CURVNET_RESTORE();
1167
1168	CARP_UNLOCK(sc);
1169}
1170
1171static void
1172carp_master_down_locked(struct carp_softc *sc, const char *reason)
1173{
1174
1175	CARP_LOCK_ASSERT(sc);
1176
1177	switch (sc->sc_state) {
1178	case BACKUP:
1179		carp_set_state(sc, MASTER, reason);
1180		carp_send_ad_locked(sc);
1181#ifdef INET
1182		carp_send_arp(sc);
1183#endif
1184#ifdef INET6
1185		carp_send_na(sc);
1186#endif
1187		carp_setrun(sc, 0);
1188		carp_addroute(sc);
1189		break;
1190	case INIT:
1191	case MASTER:
1192#ifdef INVARIANTS
1193		panic("carp: VHID %u@%s: master_down event in %s state\n",
1194		    sc->sc_vhid,
1195		    sc->sc_carpdev->if_xname,
1196		    sc->sc_state ? "MASTER" : "INIT");
1197#endif
1198		break;
1199	}
1200}
1201
/*
 * (Re)arm the timer that corresponds to the current state of the vhid.
 *
 * When in backup state, af indicates whether to reset the master down timer
 * for v4 or v6.  For any other value (callers in this file pass 0) the
 * master down timer of every family that has addresses configured is reset.
 *
 * Nothing is done while the interface is not IFF_UP, its link is not up,
 * the vhid has no addresses at all, or carp is disallowed (V_carp_allow).
 * The softc lock must be held.
 */
static void
carp_setrun(struct carp_softc *sc, sa_family_t af)
{
	struct timeval tv;

	CARP_LOCK_ASSERT(sc);

	if ((sc->sc_carpdev->if_flags & IFF_UP) == 0 ||
	    sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
	    (sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0) ||
	    !V_carp_allow)
		return;

	switch (sc->sc_state) {
	case INIT:
		/* Leave INIT for BACKUP, then recurse to arm the timers. */
		carp_set_state(sc, BACKUP, "initialization complete");
		carp_setrun(sc, 0);
		break;
	case BACKUP:
		/* A backup does not advertise; stop the advertisement timer. */
		callout_stop(&sc->sc_ad_tmo);
		/* Timeout: 3 * advbase seconds plus advskew/256 of a second. */
		tv.tv_sec = 3 * sc->sc_advbase;
		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
		switch (af) {
#ifdef INET
		case AF_INET:
			callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
			    carp_master_down, sc);
			break;
#endif
#ifdef INET6
		case AF_INET6:
			callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
			    carp_master_down, sc);
			break;
#endif
		default:
#ifdef INET
			if (sc->sc_naddrs)
				callout_reset(&sc->sc_md_tmo, tvtohz(&tv),
				    carp_master_down, sc);
#endif
#ifdef INET6
			if (sc->sc_naddrs6)
				callout_reset(&sc->sc_md6_tmo, tvtohz(&tv),
				    carp_master_down, sc);
#endif
			break;
		}
		break;
	case MASTER:
		/* Next advertisement: advbase seconds plus advskew/256 s. */
		tv.tv_sec = sc->sc_advbase;
		tv.tv_usec = sc->sc_advskew * 1000000 / 256;
		callout_reset(&sc->sc_ad_tmo, tvtohz(&tv),
		    carp_send_ad, sc);
		break;
	}
}
1263
1264/*
1265 * Setup multicast structures.
1266 */
1267static int
1268carp_multicast_setup(struct carp_if *cif, sa_family_t sa)
1269{
1270	struct ifnet *ifp = cif->cif_ifp;
1271	int error = 0;
1272
1273	switch (sa) {
1274#ifdef INET
1275	case AF_INET:
1276	    {
1277		struct ip_moptions *imo = &cif->cif_imo;
1278		struct in_addr addr;
1279
1280		if (imo->imo_membership)
1281			return (0);
1282
1283		imo->imo_membership = (struct in_multi **)malloc(
1284		    (sizeof(struct in_multi *) * IP_MIN_MEMBERSHIPS), M_CARP,
1285		    M_WAITOK);
1286		imo->imo_mfilters = NULL;
1287		imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1288		imo->imo_multicast_vif = -1;
1289
1290		addr.s_addr = htonl(INADDR_CARP_GROUP);
1291		if ((error = in_joingroup(ifp, &addr, NULL,
1292		    &imo->imo_membership[0])) != 0) {
1293			free(imo->imo_membership, M_CARP);
1294			break;
1295		}
1296		imo->imo_num_memberships++;
1297		imo->imo_multicast_ifp = ifp;
1298		imo->imo_multicast_ttl = CARP_DFLTTL;
1299		imo->imo_multicast_loop = 0;
1300		break;
1301	   }
1302#endif
1303#ifdef INET6
1304	case AF_INET6:
1305	    {
1306		struct ip6_moptions *im6o = &cif->cif_im6o;
1307		struct in6_addr in6;
1308		struct in6_multi *in6m;
1309
1310		if (im6o->im6o_membership)
1311			return (0);
1312
1313		im6o->im6o_membership = (struct in6_multi **)malloc(
1314		    (sizeof(struct in6_multi *) * IPV6_MIN_MEMBERSHIPS), M_CARP,
1315		    M_ZERO | M_WAITOK);
1316		im6o->im6o_mfilters = NULL;
1317		im6o->im6o_max_memberships = IPV6_MIN_MEMBERSHIPS;
1318		im6o->im6o_multicast_hlim = CARP_DFLTTL;
1319		im6o->im6o_multicast_ifp = ifp;
1320
1321		/* Join IPv6 CARP multicast group. */
1322		bzero(&in6, sizeof(in6));
1323		in6.s6_addr16[0] = htons(0xff02);
1324		in6.s6_addr8[15] = 0x12;
1325		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1326			free(im6o->im6o_membership, M_CARP);
1327			break;
1328		}
1329		in6m = NULL;
1330		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1331			free(im6o->im6o_membership, M_CARP);
1332			break;
1333		}
1334		im6o->im6o_membership[0] = in6m;
1335		im6o->im6o_num_memberships++;
1336
1337		/* Join solicited multicast address. */
1338		bzero(&in6, sizeof(in6));
1339		in6.s6_addr16[0] = htons(0xff02);
1340		in6.s6_addr32[1] = 0;
1341		in6.s6_addr32[2] = htonl(1);
1342		in6.s6_addr32[3] = 0;
1343		in6.s6_addr8[12] = 0xff;
1344		if ((error = in6_setscope(&in6, ifp, NULL)) != 0) {
1345			in6_mc_leave(im6o->im6o_membership[0], NULL);
1346			free(im6o->im6o_membership, M_CARP);
1347			break;
1348		}
1349		in6m = NULL;
1350		if ((error = in6_mc_join(ifp, &in6, NULL, &in6m, 0)) != 0) {
1351			in6_mc_leave(im6o->im6o_membership[0], NULL);
1352			free(im6o->im6o_membership, M_CARP);
1353			break;
1354		}
1355		im6o->im6o_membership[1] = in6m;
1356		im6o->im6o_num_memberships++;
1357		break;
1358	    }
1359#endif
1360	}
1361
1362	return (error);
1363}
1364
1365/*
1366 * Free multicast structures.
1367 */
1368static void
1369carp_multicast_cleanup(struct carp_if *cif, sa_family_t sa)
1370{
1371
1372	sx_assert(&carp_sx, SA_XLOCKED);
1373
1374	switch (sa) {
1375#ifdef INET
1376	case AF_INET:
1377		if (cif->cif_naddrs == 0) {
1378			struct ip_moptions *imo = &cif->cif_imo;
1379
1380			in_leavegroup(imo->imo_membership[0], NULL);
1381			KASSERT(imo->imo_mfilters == NULL,
1382			    ("%s: imo_mfilters != NULL", __func__));
1383			free(imo->imo_membership, M_CARP);
1384			imo->imo_membership = NULL;
1385
1386		}
1387		break;
1388#endif
1389#ifdef INET6
1390	case AF_INET6:
1391		if (cif->cif_naddrs6 == 0) {
1392			struct ip6_moptions *im6o = &cif->cif_im6o;
1393
1394			in6_mc_leave(im6o->im6o_membership[0], NULL);
1395			in6_mc_leave(im6o->im6o_membership[1], NULL);
1396			KASSERT(im6o->im6o_mfilters == NULL,
1397			    ("%s: im6o_mfilters != NULL", __func__));
1398			free(im6o->im6o_membership, M_CARP);
1399			im6o->im6o_membership = NULL;
1400		}
1401		break;
1402#endif
1403	}
1404}
1405
1406int
1407carp_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *sa)
1408{
1409	struct m_tag *mtag;
1410	struct carp_softc *sc;
1411
1412	if (!sa)
1413		return (0);
1414
1415	switch (sa->sa_family) {
1416#ifdef INET
1417	case AF_INET:
1418		break;
1419#endif
1420#ifdef INET6
1421	case AF_INET6:
1422		break;
1423#endif
1424	default:
1425		return (0);
1426	}
1427
1428	mtag = m_tag_find(m, PACKET_TAG_CARP, NULL);
1429	if (mtag == NULL)
1430		return (0);
1431
1432	bcopy(mtag + 1, &sc, sizeof(sc));
1433
1434	/* Set the source MAC address to the Virtual Router MAC Address. */
1435	switch (ifp->if_type) {
1436	case IFT_ETHER:
1437	case IFT_BRIDGE:
1438	case IFT_L2VLAN: {
1439			struct ether_header *eh;
1440
1441			eh = mtod(m, struct ether_header *);
1442			eh->ether_shost[0] = 0;
1443			eh->ether_shost[1] = 0;
1444			eh->ether_shost[2] = 0x5e;
1445			eh->ether_shost[3] = 0;
1446			eh->ether_shost[4] = 1;
1447			eh->ether_shost[5] = sc->sc_vhid;
1448		}
1449		break;
1450	case IFT_FDDI: {
1451			struct fddi_header *fh;
1452
1453			fh = mtod(m, struct fddi_header *);
1454			fh->fddi_shost[0] = 0;
1455			fh->fddi_shost[1] = 0;
1456			fh->fddi_shost[2] = 0x5e;
1457			fh->fddi_shost[3] = 0;
1458			fh->fddi_shost[4] = 1;
1459			fh->fddi_shost[5] = sc->sc_vhid;
1460		}
1461		break;
1462	case IFT_ISO88025: {
1463 			struct iso88025_header *th;
1464 			th = mtod(m, struct iso88025_header *);
1465			th->iso88025_shost[0] = 3;
1466			th->iso88025_shost[1] = 0;
1467			th->iso88025_shost[2] = 0x40 >> (sc->sc_vhid - 1);
1468			th->iso88025_shost[3] = 0x40000 >> (sc->sc_vhid - 1);
1469			th->iso88025_shost[4] = 0;
1470			th->iso88025_shost[5] = 0;
1471		}
1472		break;
1473	default:
1474		printf("%s: carp is not supported for the %d interface type\n",
1475		    ifp->if_xname, ifp->if_type);
1476		return (EOPNOTSUPP);
1477	}
1478
1479	return (0);
1480}
1481
1482static struct carp_softc*
1483carp_alloc(struct ifnet *ifp)
1484{
1485	struct carp_softc *sc;
1486	struct carp_if *cif;
1487
1488	sx_assert(&carp_sx, SA_XLOCKED);
1489
1490	if ((cif = ifp->if_carp) == NULL)
1491		cif = carp_alloc_if(ifp);
1492
1493	sc = malloc(sizeof(*sc), M_CARP, M_WAITOK|M_ZERO);
1494
1495	sc->sc_advbase = CARP_DFLTINTV;
1496	sc->sc_vhid = -1;	/* required setting */
1497	sc->sc_init_counter = 1;
1498	sc->sc_state = INIT;
1499
1500	sc->sc_ifasiz = sizeof(struct ifaddr *);
1501	sc->sc_ifas = malloc(sc->sc_ifasiz, M_CARP, M_WAITOK|M_ZERO);
1502	sc->sc_carpdev = ifp;
1503
1504	CARP_LOCK_INIT(sc);
1505#ifdef INET
1506	callout_init_mtx(&sc->sc_md_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1507#endif
1508#ifdef INET6
1509	callout_init_mtx(&sc->sc_md6_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1510#endif
1511	callout_init_mtx(&sc->sc_ad_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
1512
1513	CIF_LOCK(cif);
1514	TAILQ_INSERT_TAIL(&cif->cif_vrs, sc, sc_list);
1515	CIF_UNLOCK(cif);
1516
1517	mtx_lock(&carp_mtx);
1518	LIST_INSERT_HEAD(&carp_list, sc, sc_next);
1519	mtx_unlock(&carp_mtx);
1520
1521	return (sc);
1522}
1523
1524static void
1525carp_grow_ifas(struct carp_softc *sc)
1526{
1527	struct ifaddr **new;
1528
1529	new = malloc(sc->sc_ifasiz * 2, M_CARP, M_WAITOK | M_ZERO);
1530	CARP_LOCK(sc);
1531	bcopy(sc->sc_ifas, new, sc->sc_ifasiz);
1532	free(sc->sc_ifas, M_CARP);
1533	sc->sc_ifas = new;
1534	sc->sc_ifasiz *= 2;
1535	CARP_UNLOCK(sc);
1536}
1537
/*
 * Tear down and free a softc (vhid).
 *
 * Called with carp_sx exclusively held and with the softc lock held; the
 * softc lock is released here.  If the vhid had contributed a demotion
 * (sc_suppress), that demotion is reverted first.  The softc is unlinked
 * from the per-interface and global lists, its callouts are drained, and
 * its lock, address array and memory are released.
 */
static void
carp_destroy(struct carp_softc *sc)
{
	struct ifnet *ifp = sc->sc_carpdev;
	struct carp_if *cif = ifp->if_carp;

	sx_assert(&carp_sx, SA_XLOCKED);

	if (sc->sc_suppress)
		carp_demote_adj(-V_carp_ifdown_adj, "vhid removed");
	CARP_UNLOCK(sc);

	CIF_LOCK(cif);
	TAILQ_REMOVE(&cif->cif_vrs, sc, sc_list);
	CIF_UNLOCK(cif);

	mtx_lock(&carp_mtx);
	LIST_REMOVE(sc, sc_next);
	mtx_unlock(&carp_mtx);

	/* Drain with the softc lock released, before destroying the lock. */
	callout_drain(&sc->sc_ad_tmo);
#ifdef INET
	callout_drain(&sc->sc_md_tmo);
#endif
#ifdef INET6
	callout_drain(&sc->sc_md6_tmo);
#endif
	CARP_LOCK_DESTROY(sc);

	free(sc->sc_ifas, M_CARP);
	free(sc, M_CARP);
}
1570
1571static struct carp_if*
1572carp_alloc_if(struct ifnet *ifp)
1573{
1574	struct carp_if *cif;
1575	int error;
1576
1577	cif = malloc(sizeof(*cif), M_CARP, M_WAITOK|M_ZERO);
1578
1579	if ((error = ifpromisc(ifp, 1)) != 0)
1580		printf("%s: ifpromisc(%s) failed: %d\n",
1581		    __func__, ifp->if_xname, error);
1582	else
1583		cif->cif_flags |= CIF_PROMISC;
1584
1585	CIF_LOCK_INIT(cif);
1586	cif->cif_ifp = ifp;
1587	TAILQ_INIT(&cif->cif_vrs);
1588
1589	IF_ADDR_WLOCK(ifp);
1590	ifp->if_carp = cif;
1591	if_ref(ifp);
1592	IF_ADDR_WUNLOCK(ifp);
1593
1594	return (cif);
1595}
1596
/*
 * Release the per-interface carp state.
 *
 * Must be called with the cif lock held (the lock is destroyed here, not
 * unlocked) and with no vhids left on the interface.  Clears
 * ifp->if_carp, leaves promiscuous mode if it was entered when the cif
 * was set up, and drops the interface reference.
 */
static void
carp_free_if(struct carp_if *cif)
{
	struct ifnet *ifp = cif->cif_ifp;

	CIF_LOCK_ASSERT(cif);
	KASSERT(TAILQ_EMPTY(&cif->cif_vrs), ("%s: softc list not empty",
	    __func__));

	/* Unpublish under the interface address write lock. */
	IF_ADDR_WLOCK(ifp);
	ifp->if_carp = NULL;
	IF_ADDR_WUNLOCK(ifp);

	CIF_LOCK_DESTROY(cif);

	/* Only undo promiscuous mode if it was actually enabled. */
	if (cif->cif_flags & CIF_PROMISC)
		ifpromisc(ifp, 0);
	if_rele(ifp);

	free(cif, M_CARP);
}
1618
1619static void
1620carp_carprcp(struct carpreq *carpr, struct carp_softc *sc, int priv)
1621{
1622
1623	CARP_LOCK(sc);
1624	carpr->carpr_state = sc->sc_state;
1625	carpr->carpr_vhid = sc->sc_vhid;
1626	carpr->carpr_advbase = sc->sc_advbase;
1627	carpr->carpr_advskew = sc->sc_advskew;
1628	if (priv)
1629		bcopy(sc->sc_key, carpr->carpr_key, sizeof(carpr->carpr_key));
1630	else
1631		bzero(carpr->carpr_key, sizeof(carpr->carpr_key));
1632	CARP_UNLOCK(sc);
1633}
1634
1635int
1636carp_ioctl(struct ifreq *ifr, u_long cmd, struct thread *td)
1637{
1638	struct carpreq carpr;
1639	struct ifnet *ifp;
1640	struct carp_softc *sc = NULL;
1641	int error = 0, locked = 0;
1642
1643	if ((error = copyin(ifr_data_get_ptr(ifr), &carpr, sizeof carpr)))
1644		return (error);
1645
1646	ifp = ifunit_ref(ifr->ifr_name);
1647	if (ifp == NULL)
1648		return (ENXIO);
1649
1650	switch (ifp->if_type) {
1651	case IFT_ETHER:
1652	case IFT_L2VLAN:
1653	case IFT_BRIDGE:
1654	case IFT_FDDI:
1655	case IFT_ISO88025:
1656		break;
1657	default:
1658		error = EOPNOTSUPP;
1659		goto out;
1660	}
1661
1662	if ((ifp->if_flags & IFF_MULTICAST) == 0) {
1663		error = EADDRNOTAVAIL;
1664		goto out;
1665	}
1666
1667	sx_xlock(&carp_sx);
1668	switch (cmd) {
1669	case SIOCSVH:
1670		if ((error = priv_check(td, PRIV_NETINET_CARP)))
1671			break;
1672		if (carpr.carpr_vhid <= 0 || carpr.carpr_vhid > CARP_MAXVHID ||
1673		    carpr.carpr_advbase < 0 || carpr.carpr_advskew < 0) {
1674			error = EINVAL;
1675			break;
1676		}
1677
1678		if (ifp->if_carp) {
1679			IFNET_FOREACH_CARP(ifp, sc)
1680				if (sc->sc_vhid == carpr.carpr_vhid)
1681					break;
1682		}
1683		if (sc == NULL) {
1684			sc = carp_alloc(ifp);
1685			CARP_LOCK(sc);
1686			sc->sc_vhid = carpr.carpr_vhid;
1687			LLADDR(&sc->sc_addr)[0] = 0;
1688			LLADDR(&sc->sc_addr)[1] = 0;
1689			LLADDR(&sc->sc_addr)[2] = 0x5e;
1690			LLADDR(&sc->sc_addr)[3] = 0;
1691			LLADDR(&sc->sc_addr)[4] = 1;
1692			LLADDR(&sc->sc_addr)[5] = sc->sc_vhid;
1693		} else
1694			CARP_LOCK(sc);
1695		locked = 1;
1696		if (carpr.carpr_advbase > 0) {
1697			if (carpr.carpr_advbase > 255 ||
1698			    carpr.carpr_advbase < CARP_DFLTINTV) {
1699				error = EINVAL;
1700				break;
1701			}
1702			sc->sc_advbase = carpr.carpr_advbase;
1703		}
1704		if (carpr.carpr_advskew >= 255) {
1705			error = EINVAL;
1706			break;
1707		}
1708		sc->sc_advskew = carpr.carpr_advskew;
1709		if (carpr.carpr_key[0] != '\0') {
1710			bcopy(carpr.carpr_key, sc->sc_key, sizeof(sc->sc_key));
1711			carp_hmac_prepare(sc);
1712		}
1713		if (sc->sc_state != INIT &&
1714		    carpr.carpr_state != sc->sc_state) {
1715			switch (carpr.carpr_state) {
1716			case BACKUP:
1717				callout_stop(&sc->sc_ad_tmo);
1718				carp_set_state(sc, BACKUP,
1719				    "user requested via ifconfig");
1720				carp_setrun(sc, 0);
1721				carp_delroute(sc);
1722				break;
1723			case MASTER:
1724				carp_master_down_locked(sc,
1725				    "user requested via ifconfig");
1726				break;
1727			default:
1728				break;
1729			}
1730		}
1731		break;
1732
1733	case SIOCGVH:
1734	    {
1735		int priveleged;
1736
1737		if (carpr.carpr_vhid < 0 || carpr.carpr_vhid > CARP_MAXVHID) {
1738			error = EINVAL;
1739			break;
1740		}
1741		if (carpr.carpr_count < 1) {
1742			error = EMSGSIZE;
1743			break;
1744		}
1745		if (ifp->if_carp == NULL) {
1746			error = ENOENT;
1747			break;
1748		}
1749
1750		priveleged = (priv_check(td, PRIV_NETINET_CARP) == 0);
1751		if (carpr.carpr_vhid != 0) {
1752			IFNET_FOREACH_CARP(ifp, sc)
1753				if (sc->sc_vhid == carpr.carpr_vhid)
1754					break;
1755			if (sc == NULL) {
1756				error = ENOENT;
1757				break;
1758			}
1759			carp_carprcp(&carpr, sc, priveleged);
1760			error = copyout(&carpr, ifr_data_get_ptr(ifr),
1761			    sizeof(carpr));
1762		} else  {
1763			int i, count;
1764
1765			count = 0;
1766			IFNET_FOREACH_CARP(ifp, sc)
1767				count++;
1768
1769			if (count > carpr.carpr_count) {
1770				CIF_UNLOCK(ifp->if_carp);
1771				error = EMSGSIZE;
1772				break;
1773			}
1774
1775			i = 0;
1776			IFNET_FOREACH_CARP(ifp, sc) {
1777				carp_carprcp(&carpr, sc, priveleged);
1778				carpr.carpr_count = count;
1779				error = copyout(&carpr,
1780				    (caddr_t)ifr_data_get_ptr(ifr) +
1781				    (i * sizeof(carpr)), sizeof(carpr));
1782				if (error) {
1783					CIF_UNLOCK(ifp->if_carp);
1784					break;
1785				}
1786				i++;
1787			}
1788		}
1789		break;
1790	    }
1791	default:
1792		error = EINVAL;
1793	}
1794	sx_xunlock(&carp_sx);
1795
1796out:
1797	if (locked)
1798		CARP_UNLOCK(sc);
1799	if_rele(ifp);
1800
1801	return (error);
1802}
1803
1804static int
1805carp_get_vhid(struct ifaddr *ifa)
1806{
1807
1808	if (ifa == NULL || ifa->ifa_carp == NULL)
1809		return (0);
1810
1811	return (ifa->ifa_carp->sc_vhid);
1812}
1813
1814int
1815carp_attach(struct ifaddr *ifa, int vhid)
1816{
1817	struct ifnet *ifp = ifa->ifa_ifp;
1818	struct carp_if *cif = ifp->if_carp;
1819	struct carp_softc *sc;
1820	int index, error;
1821
1822	KASSERT(ifa->ifa_carp == NULL, ("%s: ifa %p attached", __func__, ifa));
1823
1824	switch (ifa->ifa_addr->sa_family) {
1825#ifdef INET
1826	case AF_INET:
1827#endif
1828#ifdef INET6
1829	case AF_INET6:
1830#endif
1831		break;
1832	default:
1833		return (EPROTOTYPE);
1834	}
1835
1836	sx_xlock(&carp_sx);
1837	if (ifp->if_carp == NULL) {
1838		sx_xunlock(&carp_sx);
1839		return (ENOPROTOOPT);
1840	}
1841
1842	IFNET_FOREACH_CARP(ifp, sc)
1843		if (sc->sc_vhid == vhid)
1844			break;
1845	if (sc == NULL) {
1846		sx_xunlock(&carp_sx);
1847		return (ENOENT);
1848	}
1849
1850	error = carp_multicast_setup(cif, ifa->ifa_addr->sa_family);
1851	if (error) {
1852		CIF_FREE(cif);
1853		sx_xunlock(&carp_sx);
1854		return (error);
1855	}
1856
1857	index = sc->sc_naddrs + sc->sc_naddrs6 + 1;
1858	if (index > sc->sc_ifasiz / sizeof(struct ifaddr *))
1859		carp_grow_ifas(sc);
1860
1861	switch (ifa->ifa_addr->sa_family) {
1862#ifdef INET
1863	case AF_INET:
1864		cif->cif_naddrs++;
1865		sc->sc_naddrs++;
1866		break;
1867#endif
1868#ifdef INET6
1869	case AF_INET6:
1870		cif->cif_naddrs6++;
1871		sc->sc_naddrs6++;
1872		break;
1873#endif
1874	}
1875
1876	ifa_ref(ifa);
1877
1878	CARP_LOCK(sc);
1879	sc->sc_ifas[index - 1] = ifa;
1880	ifa->ifa_carp = sc;
1881	carp_hmac_prepare(sc);
1882	carp_sc_state(sc);
1883	CARP_UNLOCK(sc);
1884
1885	sx_xunlock(&carp_sx);
1886
1887	return (0);
1888}
1889
/*
 * Detach interface address ifa from the vhid it is attached to.
 *
 * The address is removed from the vhid's address array (later entries are
 * shifted down), the per-family counters are decremented, its carp route
 * is removed, multicast state is cleaned up if this was the last address
 * of its family on the interface, and the reference taken at attach time
 * is dropped.  The HMAC is then recomputed and the vhid state re-evaluated.
 *
 * Unless keep_cif is true, a vhid left without any address is destroyed
 * and CIF_FREE() is invoked on the per-interface state.
 */
void
carp_detach(struct ifaddr *ifa, bool keep_cif)
{
	struct ifnet *ifp = ifa->ifa_ifp;
	struct carp_if *cif = ifp->if_carp;
	struct carp_softc *sc = ifa->ifa_carp;
	int i, index;

	KASSERT(sc != NULL, ("%s: %p not attached", __func__, ifa));

	sx_xlock(&carp_sx);

	CARP_LOCK(sc);
	/* Shift array. */
	index = sc->sc_naddrs + sc->sc_naddrs6;
	for (i = 0; i < index; i++)
		if (sc->sc_ifas[i] == ifa)
			break;
	KASSERT(i < index, ("%s: %p no backref", __func__, ifa));
	for (; i < index - 1; i++)
		sc->sc_ifas[i] = sc->sc_ifas[i+1];
	sc->sc_ifas[index - 1] = NULL;

	switch (ifa->ifa_addr->sa_family) {
#ifdef INET
	case AF_INET:
		cif->cif_naddrs--;
		sc->sc_naddrs--;
		break;
#endif
#ifdef INET6
	case AF_INET6:
		cif->cif_naddrs6--;
		sc->sc_naddrs6--;
		break;
#endif
	}

	carp_ifa_delroute(ifa);
	/* Leaves the groups only once the family count has reached zero. */
	carp_multicast_cleanup(cif, ifa->ifa_addr->sa_family);

	ifa->ifa_carp = NULL;
	ifa_free(ifa);

	carp_hmac_prepare(sc);
	carp_sc_state(sc);

	/* carp_destroy() releases the softc lock itself. */
	if (!keep_cif && sc->sc_naddrs == 0 && sc->sc_naddrs6 == 0)
		carp_destroy(sc);
	else
		CARP_UNLOCK(sc);

	if (!keep_cif)
		CIF_FREE(cif);

	sx_xunlock(&carp_sx);
}
1947
1948static void
1949carp_set_state(struct carp_softc *sc, int state, const char *reason)
1950{
1951
1952	CARP_LOCK_ASSERT(sc);
1953
1954	if (sc->sc_state != state) {
1955		const char *carp_states[] = { CARP_STATES };
1956		char subsys[IFNAMSIZ+5];
1957
1958		snprintf(subsys, IFNAMSIZ+5, "%u@%s", sc->sc_vhid,
1959		    sc->sc_carpdev->if_xname);
1960
1961		CARP_LOG("%s: %s -> %s (%s)\n", subsys,
1962		    carp_states[sc->sc_state], carp_states[state], reason);
1963
1964		sc->sc_state = state;
1965
1966		devctl_notify("CARP", subsys, carp_states[state], NULL);
1967	}
1968}
1969
/*
 * Re-evaluate the state of every vhid configured on ifp.  Installed as
 * carp_linkstate_p at module load; presumably invoked on link state
 * changes -- confirm against the callers.  The per-interface lock is held
 * across the walk and each softc is locked around carp_sc_state().
 */
static void
carp_linkstate(struct ifnet *ifp)
{
	struct carp_softc *sc;

	CIF_LOCK(ifp->if_carp);
	IFNET_FOREACH_CARP(ifp, sc) {
		CARP_LOCK(sc);
		carp_sc_state(sc);
		CARP_UNLOCK(sc);
	}
	CIF_UNLOCK(ifp->if_carp);
}
1983
1984static void
1985carp_sc_state(struct carp_softc *sc)
1986{
1987
1988	CARP_LOCK_ASSERT(sc);
1989
1990	if (sc->sc_carpdev->if_link_state != LINK_STATE_UP ||
1991	    !(sc->sc_carpdev->if_flags & IFF_UP) ||
1992	    !V_carp_allow) {
1993		callout_stop(&sc->sc_ad_tmo);
1994#ifdef INET
1995		callout_stop(&sc->sc_md_tmo);
1996#endif
1997#ifdef INET6
1998		callout_stop(&sc->sc_md6_tmo);
1999#endif
2000		carp_set_state(sc, INIT, "hardware interface down");
2001		carp_setrun(sc, 0);
2002		if (!sc->sc_suppress)
2003			carp_demote_adj(V_carp_ifdown_adj, "interface down");
2004		sc->sc_suppress = 1;
2005	} else {
2006		carp_set_state(sc, INIT, "hardware interface up");
2007		carp_setrun(sc, 0);
2008		if (sc->sc_suppress)
2009			carp_demote_adj(-V_carp_ifdown_adj, "interface up");
2010		sc->sc_suppress = 0;
2011	}
2012}
2013
/*
 * Adjust the global demotion counter by adj (which may be negative), log
 * the change together with reason, and schedule carp_sendall_task on the
 * software interrupt taskqueue.
 */
static void
carp_demote_adj(int adj, char *reason)
{
	atomic_add_int(&V_carp_demotion, adj);
	CARP_LOG("demoted by %d to %d (%s)\n", adj, V_carp_demotion, reason);
	taskqueue_enqueue(taskqueue_swi, &carp_sendall_task);
}
2021
2022static int
2023carp_allow_sysctl(SYSCTL_HANDLER_ARGS)
2024{
2025	int new, error;
2026	struct carp_softc *sc;
2027
2028	new = V_carp_allow;
2029	error = sysctl_handle_int(oidp, &new, 0, req);
2030	if (error || !req->newptr)
2031		return (error);
2032
2033	if (V_carp_allow != new) {
2034		V_carp_allow = new;
2035
2036		mtx_lock(&carp_mtx);
2037		LIST_FOREACH(sc, &carp_list, sc_next) {
2038			CARP_LOCK(sc);
2039			if (curvnet == sc->sc_carpdev->if_vnet)
2040				carp_sc_state(sc);
2041			CARP_UNLOCK(sc);
2042		}
2043		mtx_unlock(&carp_mtx);
2044	}
2045
2046	return (0);
2047}
2048
2049static int
2050carp_dscp_sysctl(SYSCTL_HANDLER_ARGS)
2051{
2052	int new, error;
2053
2054	new = V_carp_dscp;
2055	error = sysctl_handle_int(oidp, &new, 0, req);
2056	if (error || !req->newptr)
2057		return (error);
2058
2059	if (new < 0 || new > 63)
2060		return (EINVAL);
2061
2062	V_carp_dscp = new;
2063
2064	return (0);
2065}
2066
2067static int
2068carp_demote_adj_sysctl(SYSCTL_HANDLER_ARGS)
2069{
2070	int new, error;
2071
2072	new = V_carp_demotion;
2073	error = sysctl_handle_int(oidp, &new, 0, req);
2074	if (error || !req->newptr)
2075		return (error);
2076
2077	carp_demote_adj(new, "sysctl");
2078
2079	return (0);
2080}
2081
2082#ifdef INET
/*
 * Protocol switch entry for CARP over IPv4, registered with the inet
 * domain by carp_mod_load().  Input is handled by carp_input(); output,
 * socket options and user requests use the raw IP (rip_*) handlers.
 */
extern  struct domain inetdomain;
static struct protosw in_carp_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inetdomain,
	.pr_protocol =		IPPROTO_CARP,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		carp_input,
	.pr_output =		rip_output,
	.pr_ctloutput =		rip_ctloutput,
	.pr_usrreqs =		&rip_usrreqs
};
2094#endif
2095
2096#ifdef INET6
/*
 * Protocol switch entry for CARP over IPv6, registered with the inet6
 * domain by carp_mod_load().  Input is handled by carp6_input(); output,
 * socket options and user requests use the raw IPv6 (rip6_*) handlers.
 */
extern	struct domain inet6domain;
static struct protosw in6_carp_protosw = {
	.pr_type =		SOCK_RAW,
	.pr_domain =		&inet6domain,
	.pr_protocol =		IPPROTO_CARP,
	.pr_flags =		PR_ATOMIC|PR_ADDR,
	.pr_input =		carp6_input,
	.pr_output =		rip6_output,
	.pr_ctloutput =		rip6_ctloutput,
	.pr_usrreqs =		&rip6_usrreqs
};
2108#endif
2109
/*
 * Undo everything carp_mod_load() set up: unregister the protocols that
 * were registered successfully, clear all carp_*_p hook pointers, drain
 * the send-all task and destroy the module locks.
 *
 * Must be entered with carp_mtx held: the mutex is released here (before
 * the task is drained) and then destroyed.
 */
static void
carp_mod_cleanup(void)
{

#ifdef INET
	/* proto_reg[] == 0 means pf_proto_register() had succeeded. */
	if (proto_reg[CARP_INET] == 0) {
		(void)ipproto_unregister(IPPROTO_CARP);
		pf_proto_unregister(PF_INET, IPPROTO_CARP, SOCK_RAW);
		proto_reg[CARP_INET] = -1;
	}
	carp_iamatch_p = NULL;
#endif
#ifdef INET6
	if (proto_reg[CARP_INET6] == 0) {
		(void)ip6proto_unregister(IPPROTO_CARP);
		pf_proto_unregister(PF_INET6, IPPROTO_CARP, SOCK_RAW);
		proto_reg[CARP_INET6] = -1;
	}
	carp_iamatch6_p = NULL;
	carp_macmatch6_p = NULL;
#endif
	carp_ioctl_p = NULL;
	carp_attach_p = NULL;
	carp_detach_p = NULL;
	carp_get_vhid_p = NULL;
	carp_linkstate_p = NULL;
	carp_forus_p = NULL;
	carp_output_p = NULL;
	carp_demote_adj_p = NULL;
	carp_master_p = NULL;
	mtx_unlock(&carp_mtx);
	taskqueue_drain(taskqueue_swi, &carp_sendall_task);
	mtx_destroy(&carp_mtx);
	sx_destroy(&carp_sx);
}
2145
2146static int
2147carp_mod_load(void)
2148{
2149	int err;
2150
2151	mtx_init(&carp_mtx, "carp_mtx", NULL, MTX_DEF);
2152	sx_init(&carp_sx, "carp_sx");
2153	LIST_INIT(&carp_list);
2154	carp_get_vhid_p = carp_get_vhid;
2155	carp_forus_p = carp_forus;
2156	carp_output_p = carp_output;
2157	carp_linkstate_p = carp_linkstate;
2158	carp_ioctl_p = carp_ioctl;
2159	carp_attach_p = carp_attach;
2160	carp_detach_p = carp_detach;
2161	carp_demote_adj_p = carp_demote_adj;
2162	carp_master_p = carp_master;
2163#ifdef INET6
2164	carp_iamatch6_p = carp_iamatch6;
2165	carp_macmatch6_p = carp_macmatch6;
2166	proto_reg[CARP_INET6] = pf_proto_register(PF_INET6,
2167	    (struct protosw *)&in6_carp_protosw);
2168	if (proto_reg[CARP_INET6]) {
2169		printf("carp: error %d attaching to PF_INET6\n",
2170		    proto_reg[CARP_INET6]);
2171		carp_mod_cleanup();
2172		return (proto_reg[CARP_INET6]);
2173	}
2174	err = ip6proto_register(IPPROTO_CARP);
2175	if (err) {
2176		printf("carp: error %d registering with INET6\n", err);
2177		carp_mod_cleanup();
2178		return (err);
2179	}
2180#endif
2181#ifdef INET
2182	carp_iamatch_p = carp_iamatch;
2183	proto_reg[CARP_INET] = pf_proto_register(PF_INET, &in_carp_protosw);
2184	if (proto_reg[CARP_INET]) {
2185		printf("carp: error %d attaching to PF_INET\n",
2186		    proto_reg[CARP_INET]);
2187		carp_mod_cleanup();
2188		return (proto_reg[CARP_INET]);
2189	}
2190	err = ipproto_register(IPPROTO_CARP);
2191	if (err) {
2192		printf("carp: error %d registering with INET\n", err);
2193		carp_mod_cleanup();
2194		return (err);
2195	}
2196#endif
2197	return (0);
2198}
2199
2200static int
2201carp_modevent(module_t mod, int type, void *data)
2202{
2203	switch (type) {
2204	case MOD_LOAD:
2205		return carp_mod_load();
2206		/* NOTREACHED */
2207	case MOD_UNLOAD:
2208		mtx_lock(&carp_mtx);
2209		if (LIST_EMPTY(&carp_list))
2210			carp_mod_cleanup();
2211		else {
2212			mtx_unlock(&carp_mtx);
2213			return (EBUSY);
2214		}
2215		break;
2216
2217	default:
2218		return (EINVAL);
2219	}
2220
2221	return (0);
2222}
2223
/*
 * Module descriptor: module name, event handler, and no extra argument
 * for the handler.
 */
static moduledata_t carp_mod = {
	"carp",
	carp_modevent,
	0
};

/* Register the module at the SI_SUB_PROTO_DOMAIN stage, in any order. */
DECLARE_MODULE(carp, carp_mod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
2231