/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/

/*
 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/if_lagg.c 227309 2011-11-07 15:43:11Z ed $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/taskqueue.h>
#include <sys/eventhandler.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_llc.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#endif
#ifdef INET
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#endif

#ifdef INET6
#include <netinet/ip6.h>
#endif

#include <net/if_vlan_var.h>
#include <net/if_lagg.h>
#include <net/ieee8023ad_lacp.h>

/* Special flags we should propagate to the lagg ports. */
static struct {
	int flag;
	int (*func)(struct ifnet *, int);
} lagg_pflags[] = {
	{IFF_PROMISC, ifpromisc},
	{IFF_ALLMULTI, if_allmulti},
	{0, NULL}
};
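
/*
 * lagg_setflags() walks the table above: each entry pairs an IFF_* flag
 * with the ifnet helper that takes or drops a reference to that flag on a
 * member port, keeping port PROMISC/ALLMULTI state in sync with the lagg.
 */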

SLIST_HEAD(__trhead, lagg_softc) lagg_list;	/* list of laggs */
static struct mtx	lagg_list_mtx;
eventhandler_tag	lagg_detach_cookie = NULL;

static int	lagg_clone_create(struct if_clone *, int, caddr_t);
static void	lagg_clone_destroy(struct ifnet *);
static void	lagg_lladdr(struct lagg_softc *, uint8_t *);
static void	lagg_capabilities(struct lagg_softc *);
static void	lagg_port_lladdr(struct lagg_port *, uint8_t *);
static void	lagg_port_setlladdr(void *, int);
static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
static int	lagg_port_destroy(struct lagg_port *, int);
static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
static void	lagg_linkstate(struct lagg_softc *);
static void	lagg_port_state(struct ifnet *, int);
static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_port_output(struct ifnet *, struct mbuf *,
		    struct sockaddr *, struct route *);
static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
#ifdef LAGG_PORT_STACKING
static int	lagg_port_checkstacking(struct lagg_softc *);
#endif
static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
static void	lagg_init(void *);
static void	lagg_stop(struct lagg_softc *);
static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_ether_setmulti(struct lagg_softc *);
static int	lagg_ether_cmdmulti(struct lagg_port *, int);
static int	lagg_setflag(struct lagg_port *, int, int,
		    int (*func)(struct ifnet *, int));
static int	lagg_setflags(struct lagg_port *, int status);
static void	lagg_start(struct ifnet *);
static int	lagg_media_change(struct ifnet *);
static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
static struct lagg_port *lagg_link_active(struct lagg_softc *,
	    struct lagg_port *);
static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *);

IFC_SIMPLE_DECLARE(lagg, 0);

/* Simple round robin */
static int	lagg_rr_attach(struct lagg_softc *);
static int	lagg_rr_detach(struct lagg_softc *);
static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Active failover */
static int	lagg_fail_attach(struct lagg_softc *);
static int	lagg_fail_detach(struct lagg_softc *);
static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Loadbalancing */
static int	lagg_lb_attach(struct lagg_softc *);
static int	lagg_lb_detach(struct lagg_softc *);
static int	lagg_lb_port_create(struct lagg_port *);
static void	lagg_lb_port_destroy(struct lagg_port *);
static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);

/* 802.3ad LACP */
static int	lagg_lacp_attach(struct lagg_softc *);
static int	lagg_lacp_detach(struct lagg_softc *);
static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static void	lagg_lacp_lladdr(struct lagg_softc *);

/* lagg protocol table */
static const struct {
	int			ti_proto;
	int			(*ti_attach)(struct lagg_softc *);
} lagg_protos[] = {
	{ LAGG_PROTO_ROUNDROBIN,	lagg_rr_attach },
	{ LAGG_PROTO_FAILOVER,		lagg_fail_attach },
	{ LAGG_PROTO_LOADBALANCE,	lagg_lb_attach },
	{ LAGG_PROTO_ETHERCHANNEL,	lagg_lb_attach },
	{ LAGG_PROTO_LACP,		lagg_lacp_attach },
	{ LAGG_PROTO_NONE,		NULL }
};
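
/*
 * The table above maps each LAGG_PROTO_* constant to the attach routine
 * that fills in the sc_* method pointers.  lagg_clone_create() walks it
 * once to attach LAGG_PROTO_DEFAULT, and the SIOCSLAGG ioctl walks it
 * again when the administrator switches protocols at run time.
 */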

SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
    "Link Aggregation");

static int lagg_failover_rx_all = 0; /* Allow input on any failover link */
SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW,
    &lagg_failover_rx_all, 0,
    "Accept input from any interface in a failover lagg");

static int
lagg_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF);
		SLIST_INIT(&lagg_list);
		if_clone_attach(&lagg_cloner);
		lagg_input_p = lagg_input;
		lagg_linkstate_p = lagg_port_state;
		lagg_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, lagg_port_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    lagg_detach_cookie);
		if_clone_detach(&lagg_cloner);
		lagg_input_p = NULL;
		lagg_linkstate_p = NULL;
		mtx_destroy(&lagg_list_mtx);
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t lagg_mod = {
	"if_lagg",
	lagg_modevent,
	0
};

DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_lagg, 1);
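
/*
 * Typical userland usage once the module is loaded (a sketch; the member
 * interface names below are examples, not requirements):
 *
 *	ifconfig lagg0 create
 *	ifconfig lagg0 laggproto lacp laggport em0 laggport em1 up
 */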

#if __FreeBSD_version >= 800000
/*
 * This routine is run via a vlan
 * config EVENT
 */
static void
lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
static void
lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}
#endif

static int
lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct lagg_softc *sc;
	struct ifnet *ifp;
	int i, error = 0;
	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}

	sc->sc_proto = LAGG_PROTO_NONE;
	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
			sc->sc_proto = lagg_protos[i].ti_proto;
			if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
				if_free_type(ifp, IFT_ETHER);
				free(sc, M_DEVBUF);
				return (error);
			}
			break;
		}
	}
	LAGG_LOCK_INIT(sc);
	SLIST_INIT(&sc->sc_ports);
	TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc);

	/* Initialise pseudo media types */
	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
	    lagg_media_status);
	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);

	if_initname(ifp, ifc->ifc_name, unit);
	ifp->if_type = IFT_ETHER;
	ifp->if_softc = sc;
	ifp->if_start = lagg_start;
	ifp->if_init = lagg_init;
	ifp->if_ioctl = lagg_ioctl;
	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;

	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
	ifp->if_snd.ifq_drv_maxlen = ifqmaxlen;
	IFQ_SET_READY(&ifp->if_snd);

	/*
	 * Attach as an ordinary ethernet device; children will be attached
	 * as special IFT_IEEE8023ADLAG devices.
	 */
	ether_ifattach(ifp, eaddr);

#if __FreeBSD_version >= 800000
	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
#endif

	/* Insert into the global list of laggs */
	mtx_lock(&lagg_list_mtx);
	SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	return (0);
}

static void
lagg_clone_destroy(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	LAGG_WLOCK(sc);

	lagg_stop(sc);
	ifp->if_flags &= ~IFF_UP;

#if __FreeBSD_version >= 800000
	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);
#endif

	/* Shutdown and remove lagg ports */
	while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL)
		lagg_port_destroy(lp, 1);
	/* Unhook the aggregation protocol */
	if (sc->sc_detach != NULL)
		(*sc->sc_detach)(sc);

	LAGG_WUNLOCK(sc);

	ifmedia_removeall(&sc->sc_media);
	ether_ifdetach(ifp);
	if_free_type(ifp, IFT_ETHER);

	mtx_lock(&lagg_list_mtx);
	SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task);
	LAGG_LOCK_DESTROY(sc);
	free(sc, M_DEVBUF);
}

static void
lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr)
{
	struct ifnet *ifp = sc->sc_ifp;

	if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN);
	/* Let the protocol know the MAC has changed */
	if (sc->sc_lladdr != NULL)
		(*sc->sc_lladdr)(sc);
	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
}

static void
lagg_capabilities(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int cap = ~0, ena = ~0;
	u_long hwa = ~0UL;

	LAGG_WLOCK_ASSERT(sc);

	/* Get capabilities from the lagg ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		cap &= lp->lp_ifp->if_capabilities;
		ena &= lp->lp_ifp->if_capenable;
		hwa &= lp->lp_ifp->if_hwassist;
	}
	cap = (cap == ~0 ? 0 : cap);
	ena = (ena == ~0 ? 0 : ena);
	hwa = (hwa == ~0 ? 0 : hwa);

	if (sc->sc_ifp->if_capabilities != cap ||
	    sc->sc_ifp->if_capenable != ena ||
	    sc->sc_ifp->if_hwassist != hwa) {
		sc->sc_ifp->if_capabilities = cap;
		sc->sc_ifp->if_capenable = ena;
		sc->sc_ifp->if_hwassist = hwa;
		getmicrotime(&sc->sc_ifp->if_lastchange);

		if (sc->sc_ifflags & IFF_DEBUG)
			if_printf(sc->sc_ifp,
			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
	}
}

static void
lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct lagg_llq *llq;
	int pending = 0;

	LAGG_WLOCK_ASSERT(sc);

	if (lp->lp_detaching ||
	    memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	/* Check to make sure it's not already queued to be changed */
	SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
		if (llq->llq_ifp == ifp) {
			pending = 1;
			break;
		}
	}

	if (!pending) {
		llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT);
		if (llq == NULL)	/* XXX what to do */
			return;
	}

	/* Update the lladdr even if pending; it may have changed */
	llq->llq_ifp = ifp;
	bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN);

	if (!pending)
		SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries);

	taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task);
}

/*
 * Set the interface MAC address from a taskqueue to avoid a LOR.
 */
static void
lagg_port_setlladdr(void *arg, int pending)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct lagg_llq *llq, *head;
	struct ifnet *ifp;
	int error;

	/* Grab a local reference to the queue and remove it from the softc */
	LAGG_WLOCK(sc);
	head = SLIST_FIRST(&sc->sc_llq_head);
	SLIST_FIRST(&sc->sc_llq_head) = NULL;
	LAGG_WUNLOCK(sc);

	/*
	 * Traverse the queue and set the lladdr on each ifp. It is safe to
	 * do this unlocked as we have the only reference to it.
	 */
	for (llq = head; llq != NULL; llq = head) {
		ifp = llq->llq_ifp;

		/* Set the link layer address */
		error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN);
		if (error)
			printf("%s: setlladdr failed on %s\n", __func__,
			    ifp->if_xname);

		head = SLIST_NEXT(llq, llq_entries);
		free(llq, M_DEVBUF);
	}
}
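
/*
 * Design note (a sketch of the LOR mentioned above): if_setlladdr() can
 * call back into the member driver, which may take driver locks that
 * conflict with the lagg lock; deferring the change to taskqueue_swi lets
 * it run with the lagg lock dropped, which is what avoids the lock-order
 * reversal.
 */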

static int
lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int error = 0;

	LAGG_WLOCK_ASSERT(sc);

	/* Limit the maximal number of lagg ports */
	if (sc->sc_count >= LAGG_MAX_PORTS)
		return (ENOSPC);

	/* Check if port has already been associated to a lagg */
	if (ifp->if_lagg != NULL)
		return (EBUSY);

	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
	if (ifp->if_type != IFT_ETHER)
		return (EPROTONOSUPPORT);

	/* Allow the first Ethernet member to define the MTU */
	if (SLIST_EMPTY(&sc->sc_ports))
		sc->sc_ifp->if_mtu = ifp->if_mtu;
	else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
		if_printf(sc->sc_ifp, "invalid MTU for %s\n",
		    ifp->if_xname);
		return (EINVAL);
	}

	if ((lp = malloc(sizeof(struct lagg_port),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	/* Check if port is a stacked lagg */
	mtx_lock(&lagg_list_mtx);
	SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) {
		if (ifp == sc_ptr->sc_ifp) {
			mtx_unlock(&lagg_list_mtx);
			free(lp, M_DEVBUF);
			return (EINVAL);
			/* XXX disable stacking for the moment, it's untested */
#ifdef LAGG_PORT_STACKING
			lp->lp_flags |= LAGG_PORT_STACK;
			if (lagg_port_checkstacking(sc_ptr) >=
			    LAGG_MAX_STACKING) {
				mtx_unlock(&lagg_list_mtx);
				free(lp, M_DEVBUF);
				return (E2BIG);
			}
#endif
		}
	}
	mtx_unlock(&lagg_list_mtx);

	/* Change the interface type */
	lp->lp_iftype = ifp->if_type;
	ifp->if_type = IFT_IEEE8023ADLAG;
	ifp->if_lagg = lp;
	lp->lp_ioctl = ifp->if_ioctl;
	ifp->if_ioctl = lagg_port_ioctl;
	lp->lp_output = ifp->if_output;
	ifp->if_output = lagg_port_output;

	lp->lp_ifp = ifp;
	lp->lp_softc = sc;

	/* Save port link layer address */
	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);

	if (SLIST_EMPTY(&sc->sc_ports)) {
		sc->sc_primary = lp;
		lagg_lladdr(sc, IF_LLADDR(ifp));
	} else {
		/* Update link layer address for this port */
		lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp));
	}

	/* Insert into the list of ports */
	SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
	sc->sc_count++;

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	/* Add multicast addresses and interface flags to this port */
	lagg_ether_cmdmulti(lp, 1);
	lagg_setflags(lp, 1);

	if (sc->sc_port_create != NULL)
		error = (*sc->sc_port_create)(lp);
	if (error) {
		/* remove the port again, without calling sc_port_destroy */
		lagg_port_destroy(lp, 0);
		return (error);
	}

	return (error);
}

#ifdef LAGG_PORT_STACKING
static int
lagg_port_checkstacking(struct lagg_softc *sc)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int m = 0;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_flags & LAGG_PORT_STACK) {
			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
			m = MAX(m, lagg_port_checkstacking(sc_ptr));
		}
	}

	return (m + 1);
}
#endif

static int
lagg_port_destroy(struct lagg_port *lp, int runpd)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct lagg_port *lp_ptr;
	struct lagg_llq *llq;
	struct ifnet *ifp = lp->lp_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if (runpd && sc->sc_port_destroy != NULL)
		(*sc->sc_port_destroy)(lp);

	/*
	 * Remove multicast addresses and interface flags from this port and
	 * reset the MAC address; skip this if the interface is being detached.
	 */
	if (!lp->lp_detaching) {
		lagg_ether_cmdmulti(lp, 0);
		lagg_setflags(lp, 0);
		lagg_port_lladdr(lp, lp->lp_lladdr);
	}

	/* Restore interface */
	ifp->if_type = lp->lp_iftype;
	ifp->if_ioctl = lp->lp_ioctl;
	ifp->if_output = lp->lp_output;
	ifp->if_lagg = NULL;

	/* Finally, remove the port from the lagg */
	SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
	sc->sc_count--;

	/* Update the primary interface */
	if (lp == sc->sc_primary) {
		uint8_t lladdr[ETHER_ADDR_LEN];

		if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) {
			bzero(&lladdr, ETHER_ADDR_LEN);
		} else {
			bcopy(lp_ptr->lp_lladdr,
			    lladdr, ETHER_ADDR_LEN);
		}
		lagg_lladdr(sc, lladdr);
		sc->sc_primary = lp_ptr;

		/* Update link layer address for each port */
		SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
			lagg_port_lladdr(lp_ptr, lladdr);
	}

	/* Remove any pending lladdr changes from the queue */
	if (lp->lp_detaching) {
		SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
			if (llq->llq_ifp == ifp) {
				SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq,
				    llq_entries);
				free(llq, M_DEVBUF);
				break;	/* Only appears once */
			}
		}
	}

	if (lp->lp_ifflags)
		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);

	free(lp, M_DEVBUF);

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

static int
lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_reqport *rp = (struct lagg_reqport *)data;
	struct lagg_softc *sc;
	struct lagg_port *lp = NULL;
	int error = 0;

	/* Should be checked by the caller */
	if (ifp->if_type != IFT_IEEE8023ADLAG ||
	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
		goto fallback;

	switch (cmd) {
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    ifunit(rp->rp_portname) != ifp) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;

	case SIOCSIFCAP:
		if (lp->lp_ioctl == NULL) {
			error = EINVAL;
			break;
		}
		error = (*lp->lp_ioctl)(ifp, cmd, data);
		if (error)
			break;

		/* Update lagg interface capabilities */
		LAGG_WLOCK(sc);
		lagg_capabilities(sc);
		LAGG_WUNLOCK(sc);
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be changed once joined */
		error = EINVAL;
		break;

	default:
		goto fallback;
	}

	return (error);

fallback:
	/* lp may be NULL here if the first check above failed */
	if (lp != NULL && lp->lp_ioctl != NULL)
		return ((*lp->lp_ioctl)(ifp, cmd, data));

	return (EINVAL);
}

static int
lagg_port_output(struct ifnet *ifp, struct mbuf *m,
	struct sockaddr *dst, struct route *ro)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct ether_header *eh;
	short type = 0;

	switch (dst->sa_family) {
	case pseudo_AF_HDRCMPLT:
	case AF_UNSPEC:
		eh = (struct ether_header *)dst->sa_data;
		type = eh->ether_type;
		break;
	}

	/*
	 * Only allow ethernet types required to initiate or maintain the
	 * link; aggregated frames take a different path.
	 */
	switch (ntohs(type)) {
	case ETHERTYPE_PAE:	/* EAPOL PAE/802.1x */
		return ((*lp->lp_output)(ifp, m, dst, ro));
	}

	/* drop any other frames */
	m_freem(m);
	return (EBUSY);
}

static void
lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct lagg_port *lp;
	struct lagg_softc *sc;

	if ((lp = ifp->if_lagg) == NULL)
		return;

	sc = lp->lp_softc;

	LAGG_WLOCK(sc);
	lp->lp_detaching = 1;
	lagg_port_destroy(lp, 1);
	LAGG_WUNLOCK(sc);
}

static void
lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
{
	struct lagg_softc *sc = lp->lp_softc;

	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
	rp->rp_prio = lp->lp_prio;
	rp->rp_flags = lp->lp_flags;
	if (sc->sc_portreq != NULL)
		(*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc);

	/* Add protocol specific flags */
	switch (sc->sc_proto) {
	case LAGG_PROTO_FAILOVER:
		if (lp == sc->sc_primary)
			rp->rp_flags |= LAGG_PORT_MASTER;
		if (lp == lagg_link_active(sc, sc->sc_primary))
			rp->rp_flags |= LAGG_PORT_ACTIVE;
		break;

	case LAGG_PROTO_ROUNDROBIN:
	case LAGG_PROTO_LOADBALANCE:
	case LAGG_PROTO_ETHERCHANNEL:
		if (LAGG_PORTACTIVE(lp))
			rp->rp_flags |= LAGG_PORT_ACTIVE;
		break;

	case LAGG_PROTO_LACP:
		/* LACP has a different definition of active */
		if (lacp_isactive(lp))
			rp->rp_flags |= LAGG_PORT_ACTIVE;
		if (lacp_iscollecting(lp))
			rp->rp_flags |= LAGG_PORT_COLLECTING;
		if (lacp_isdistributing(lp))
			rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
		break;
	}
}

static void
lagg_init(void *xsc)
{
	struct lagg_softc *sc = (struct lagg_softc *)xsc;
	struct lagg_port *lp;
	struct ifnet *ifp = sc->sc_ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		return;

	LAGG_WLOCK(sc);

	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	/* Update the port lladdrs */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_port_lladdr(lp, IF_LLADDR(ifp));

	if (sc->sc_init != NULL)
		(*sc->sc_init)(sc);

	LAGG_WUNLOCK(sc);
}

static void
lagg_stop(struct lagg_softc *sc)
{
	struct ifnet *ifp = sc->sc_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	if (sc->sc_stop != NULL)
		(*sc->sc_stop)(sc);
}

static int
lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_reqall *ra = (struct lagg_reqall *)data;
	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
	struct ifreq *ifr = (struct ifreq *)data;
	struct lagg_port *lp;
	struct ifnet *tpif;
	struct thread *td = curthread;
	char *buf, *outbuf;
	int count, buflen, len, error = 0;

	bzero(&rpbuf, sizeof(rpbuf));

	switch (cmd) {
	case SIOCGLAGG:
		LAGG_RLOCK(sc);
		count = 0;
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			count++;
		buflen = count * sizeof(struct lagg_reqport);
		LAGG_RUNLOCK(sc);

		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);

		LAGG_RLOCK(sc);
		ra->ra_proto = sc->sc_proto;
		if (sc->sc_req != NULL)
			(*sc->sc_req)(sc, (caddr_t)&ra->ra_psc);

		count = 0;
		buf = outbuf;
		len = min(ra->ra_size, buflen);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (len < sizeof(rpbuf))
				break;

			lagg_port2req(lp, &rpbuf);
			memcpy(buf, &rpbuf, sizeof(rpbuf));
			count++;
			buf += sizeof(rpbuf);
			len -= sizeof(rpbuf);
		}
		LAGG_RUNLOCK(sc);
		ra->ra_ports = count;
		ra->ra_size = count * sizeof(rpbuf);
		error = copyout(outbuf, ra->ra_port, ra->ra_size);
		free(outbuf, M_TEMP);
		break;
	case SIOCSLAGG:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (ra->ra_proto >= LAGG_PROTO_MAX) {
			error = EPROTONOSUPPORT;
			break;
		}
		if (sc->sc_proto != LAGG_PROTO_NONE) {
			LAGG_WLOCK(sc);
			error = sc->sc_detach(sc);
			/* Reset protocol and pointers */
			sc->sc_proto = LAGG_PROTO_NONE;
			sc->sc_detach = NULL;
			sc->sc_start = NULL;
			sc->sc_input = NULL;
			sc->sc_port_create = NULL;
			sc->sc_port_destroy = NULL;
			sc->sc_linkstate = NULL;
			sc->sc_init = NULL;
			sc->sc_stop = NULL;
			sc->sc_lladdr = NULL;
			sc->sc_req = NULL;
			sc->sc_portreq = NULL;
			LAGG_WUNLOCK(sc);
		}
		if (error != 0)
			break;
		for (int i = 0; i < (sizeof(lagg_protos) /
		    sizeof(lagg_protos[0])); i++) {
			if (lagg_protos[i].ti_proto == ra->ra_proto) {
				if (sc->sc_ifflags & IFF_DEBUG)
					printf("%s: using proto %u\n",
					    sc->sc_ifname,
					    lagg_protos[i].ti_proto);
				LAGG_WLOCK(sc);
				sc->sc_proto = lagg_protos[i].ti_proto;
				if (sc->sc_proto != LAGG_PROTO_NONE)
					error = lagg_protos[i].ti_attach(sc);
				LAGG_WUNLOCK(sc);
				return (error);
			}
		}
		error = EPROTONOSUPPORT;
		break;
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;
	case SIOCSLAGGPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		error = lagg_port_create(sc, tpif);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSLAGGDELPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_WLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_WUNLOCK(sc);
			break;
		}

		error = lagg_port_destroy(lp, 1);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFFLAGS:
		/* Set flags on ports too */
		LAGG_WLOCK(sc);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setflags(lp, 1);
		}
		LAGG_WUNLOCK(sc);

		if (!(ifp->if_flags & IFF_UP) &&
		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked down and it is running,
			 * then stop and disable it.
			 */
			LAGG_WLOCK(sc);
			lagg_stop(sc);
			LAGG_WUNLOCK(sc);
		} else if ((ifp->if_flags & IFF_UP) &&
		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked up and it is stopped, then
			 * start it.
			 */
			(*ifp->if_init)(sc);
		}
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		LAGG_WLOCK(sc);
		error = lagg_ether_setmulti(sc);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;

	case SIOCSIFCAP:
	case SIOCSIFMTU:
		/* Do not allow the MTU or caps to be directly changed */
		error = EINVAL;
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}

static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* First, remove any existing filter entries. */
		lagg_ether_cmdmulti(lp, 0);
		/* copy all addresses from the lagg interface to the port */
		lagg_ether_cmdmulti(lp, 1);
	}
	return (0);
}

static int
lagg_ether_cmdmulti(struct lagg_port *lp, int set)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma, *rifma = NULL;
	struct sockaddr_dl sdl;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	bzero((char *)&sdl, sizeof(sdl));
	sdl.sdl_len = sizeof(sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = ETHER_ADDR_LEN;
	sdl.sdl_index = ifp->if_index;

	if (set) {
		TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_LINK)
				continue;
			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
			    LLADDR(&sdl), ETHER_ADDR_LEN);

			error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
			if (error)
				return (error);
			mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
			if (mc == NULL)
				return (ENOMEM);
			mc->mc_ifma = rifma;
			SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
		}
	} else {
		while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
			SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
			if_delmulti_ifma(mc->mc_ifma);
			free(mc, M_DEVBUF);
		}
	}
	return (0);
}

/* Handle a ref counted flag that should be set on the lagg port as well */
static int
lagg_setflag(struct lagg_port *lp, int flag, int status,
	     int (*func)(struct ifnet *, int))
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;
	struct ifnet *ifp = lp->lp_ifp;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	status = status ? (scifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if the recorded port status differs from what we want it
	 * to be.  If it does, flip it.  We record the port's status in
	 * lp_ifflags so that we won't clear a flag on the port that we
	 * haven't set.  In fact, we don't clear or set the port's flags
	 * directly; we acquire or release references to them instead.
	 * That's why we can be sure that the recorded flags still agree
	 * with the actual port flags.
	 */
	if (status != (lp->lp_ifflags & flag)) {
		error = (*func)(ifp, status);
		if (error)
			return (error);
		lp->lp_ifflags &= ~flag;
		lp->lp_ifflags |= status;
	}
	return (0);
}
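
/*
 * A concrete example with IFF_PROMISC: enabling promiscuous mode on lagg0
 * calls lagg_setflag(lp, IFF_PROMISC, 1, ifpromisc) for every port, which
 * takes a promiscuous reference on that port; disabling it releases the
 * same reference, so a port that was made promiscuous independently (by
 * bpf, for instance) stays promiscuous.
 */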

/*
 * Handle IFF_* flags that require certain changes on the lagg port.
 * If "status" is true, update the port's flags to match the lagg;
 * if "status" is false, forcibly clear the flags set on the port.
 */
static int
lagg_setflags(struct lagg_port *lp, int status)
{
	int error, i;

	for (i = 0; lagg_pflags[i].flag; i++) {
		error = lagg_setflag(lp, lagg_pflags[i].flag,
		    status, lagg_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

static void
lagg_start(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct mbuf *m;
	int error = 0;

	LAGG_RLOCK(sc);
	/* We need a Tx algorithm and at least one port */
	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
		IF_DRAIN(&ifp->if_snd);
		LAGG_RUNLOCK(sc);
		return;
	}

	for (;; error = 0) {
		IFQ_DEQUEUE(&ifp->if_snd, m);
		if (m == NULL)
			break;

		ETHER_BPF_MTAP(ifp, m);

		error = (*sc->sc_start)(sc, m);
		if (error == 0)
			ifp->if_opackets++;
		else
			ifp->if_oerrors++;
	}
	LAGG_RUNLOCK(sc);
}

static struct mbuf *
lagg_input(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	LAGG_RLOCK(sc);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    (lp->lp_flags & LAGG_PORT_DISABLED) ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = (*sc->sc_input)(sc, lp, m);

	if (m != NULL) {
		scifp->if_ipackets++;
		scifp->if_ibytes += m->m_pkthdr.len;

		if (scifp->if_flags & IFF_MONITOR) {
			m_freem(m);
			m = NULL;
		}
	}

	LAGG_RUNLOCK(sc);
	return (m);
}

static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	LAGG_RLOCK(sc);
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	LAGG_RUNLOCK(sc);
}

static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	/* Our link is considered up if at least one of our ports is active */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
	case LAGG_PROTO_FAILOVER:
		sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
		    sc->sc_primary->lp_ifp->if_baudrate : 0;
		break;
	case LAGG_PROTO_ROUNDROBIN:
	case LAGG_PROTO_LOADBALANCE:
	case LAGG_PROTO_ETHERCHANNEL:
		speed = 0;
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			speed += lp->lp_ifp->if_baudrate;
		sc->sc_ifp->if_baudrate = speed;
		break;
	case LAGG_PROTO_LACP:
		/* LACP updates if_baudrate itself */
		break;
	}
}
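
/*
 * For example, three 1 Gb/s member ports report 3 Gb/s under roundrobin,
 * loadbalance, and etherchannel (the rates are summed above), but only
 * 1 Gb/s under failover, since only the primary carries traffic at a time.
 */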

static void
lagg_port_state(struct ifnet *ifp, int state)
{
	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
	struct lagg_softc *sc = NULL;

	if (lp != NULL)
		sc = lp->lp_softc;
	if (sc == NULL)
		return;

	LAGG_WLOCK(sc);
	lagg_linkstate(sc);
	if (sc->sc_linkstate != NULL)
		(*sc->sc_linkstate)(lp);
	LAGG_WUNLOCK(sc);
}

struct lagg_port *
lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_port *lp_next, *rval = NULL;
	// int new_link = LINK_STATE_DOWN;

	LAGG_RLOCK_ASSERT(sc);
	/*
	 * Search for a port that reports an active link state.
	 */

	if (lp == NULL)
		goto search;
	if (LAGG_PORTACTIVE(lp)) {
		rval = lp;
		goto found;
	}
	if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL &&
	    LAGG_PORTACTIVE(lp_next)) {
		rval = lp_next;
		goto found;
	}

search:
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp_next)) {
			rval = lp_next;
			goto found;
		}
	}

found:
	if (rval != NULL) {
		/*
		 * The IEEE 802.1D standard assumes that a lagg with
		 * multiple ports is always full duplex. This is valid
		 * for load sharing laggs and if at least two links
		 * are active. Unfortunately, checking the latter would
		 * be too expensive at this point.
		 XXX
		if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) &&
		    (sc->sc_count > 1))
			new_link = LINK_STATE_FULL_DUPLEX;
		else
			new_link = rval->lp_link_state;
		 */
	}

	return (rval);
}

static const void *
lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
{
	if (m->m_pkthdr.len < (off + len)) {
		return (NULL);
	} else if (m->m_len < (off + len)) {
		m_copydata(m, off, len, buf);
		return (buf);
	}
	return (mtod(m, char *) + off);
}

uint32_t
lagg_hashmbuf(struct mbuf *m, uint32_t key)
{
	uint16_t etype;
	uint32_t p = 0;
	int off;
	struct ether_header *eh;
	struct ether_vlan_header vlanbuf;
	const struct ether_vlan_header *vlan;
#ifdef INET
	const struct ip *ip;
	struct ip ipbuf;
#endif
#ifdef INET6
	const struct ip6_hdr *ip6;
	struct ip6_hdr ip6buf;
	uint32_t flow;
#endif

	off = sizeof(*eh);
	if (m->m_len < off)
		goto out;
	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, key);
	p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p);

	/* Special handling for encapsulating VLAN frames */
	if (m->m_flags & M_VLANTAG) {
		p = hash32_buf(&m->m_pkthdr.ether_vtag,
		    sizeof(m->m_pkthdr.ether_vtag), p);
	} else if (etype == ETHERTYPE_VLAN) {
		vlan = lagg_gethdr(m, off, sizeof(*vlan), &vlanbuf);
		if (vlan == NULL)
			goto out;

		p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p);
		etype = ntohs(vlan->evl_proto);
		off += sizeof(*vlan) - sizeof(*eh);
	}

	switch (etype) {
#ifdef INET
	case ETHERTYPE_IP:
		ip = lagg_gethdr(m, off, sizeof(*ip), &ipbuf);
		if (ip == NULL)
			goto out;

		p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p);
		p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p);
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		ip6 = lagg_gethdr(m, off, sizeof(*ip6), &ip6buf);
		if (ip6 == NULL)
			goto out;

		p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p);
		p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p);
		flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
		p = hash32_buf(&flow, sizeof(flow), p);	/* IPv6 flow label */
		break;
#endif
	}
out:
	return (p);
}
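
/*
 * The hash above folds in layer 2 (source and destination MAC), the VLAN
 * tag when present, and layer 3 (IPv4 or IPv6 addresses plus the IPv6
 * flow label), so every packet of a given flow hashes to the same value
 * and, in lagg_lb_start() below, to the same member port.
 */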

int
lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
{

	return (ifp->if_transmit)(ifp, m);
}

/*
 * Simple round robin aggregation
 */

static int
lagg_rr_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_rr_detach;
	sc->sc_start = lagg_rr_start;
	sc->sc_input = lagg_rr_input;
	sc->sc_port_create = NULL;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
	sc->sc_seq = 0;

	return (0);
}

static int
lagg_rr_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	uint32_t p;

	p = atomic_fetchadd_32(&sc->sc_seq, 1);
	p %= sc->sc_count;
	lp = SLIST_FIRST(&sc->sc_ports);
	while (p--)
		lp = SLIST_NEXT(lp, lp_entries);

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}
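
/*
 * A quick sketch of the selection math above: with sc_count == 3,
 * successive calls see p = 0, 1, 2, 0, 1, ... from the atomic counter,
 * so frames are dealt to the ports in strict rotation, and
 * lagg_link_active() then skips over any port whose link is down.
 */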

static struct mbuf *
lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * Active failover
 */

static int
lagg_fail_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_fail_detach;
	sc->sc_start = lagg_fail_start;
	sc->sc_input = lagg_fail_input;
	sc->sc_port_create = NULL;
	sc->sc_port_destroy = NULL;

	return (0);
}

static int
lagg_fail_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	/* Use the master port if active or the next available port */
	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *tmp_tp;

	if (lp == sc->sc_primary || lagg_failover_rx_all) {
		m->m_pkthdr.rcvif = ifp;
		return (m);
	}

	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is NULL, we've received a packet when all
		 * our links are down. Weird, but process it anyway.
		 */
		if (tmp_tp == NULL || tmp_tp == lp) {
			m->m_pkthdr.rcvif = ifp;
			return (m);
		}
	}

	m_freem(m);
	return (NULL);
}

/*
 * Loadbalancing
 */

static int
lagg_lb_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	struct lagg_lb *lb;

	if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	sc->sc_detach = lagg_lb_detach;
	sc->sc_start = lagg_lb_start;
	sc->sc_input = lagg_lb_input;
	sc->sc_port_create = lagg_lb_port_create;
	sc->sc_port_destroy = lagg_lb_port_destroy;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;

	lb->lb_key = arc4random();
	sc->sc_psc = (caddr_t)lb;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_lb_port_create(lp);

	return (0);
}

static int
lagg_lb_detach(struct lagg_softc *sc)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	if (lb != NULL)
		free(lb, M_DEVBUF);
	return (0);
}

static int
lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp_next;
	int i = 0;

	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (lp_next == lp)
			continue;
		if (i >= LAGG_MAX_PORTS)
			return (EINVAL);
		if (sc->sc_ifflags & IFF_DEBUG)
			printf("%s: port %s at index %d\n",
			    sc->sc_ifname, lp_next->lp_ifname, i);
		lb->lb_ports[i++] = lp_next;
	}

	return (0);
}

static int
lagg_lb_port_create(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	return (lagg_lb_porttable(sc, NULL));
}

static void
lagg_lb_port_destroy(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	lagg_lb_porttable(sc, lp);
}

static int
lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp = NULL;
	uint32_t p = 0;

	if (m->m_flags & M_FLOWID)
		p = m->m_pkthdr.flowid;
	else
		p = lagg_hashmbuf(m, lb->lb_key);
	p %= sc->sc_count;
	lp = lb->lb_ports[p];

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}
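
/*
 * If the NIC supplied a flow identifier (M_FLOWID), it is used directly
 * as the port selector; otherwise lagg_hashmbuf() computes one in
 * software.  Either way, the value is reduced modulo the port count, so
 * a given flow sticks to a single member port.
 */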

static struct mbuf *
lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * 802.3ad LACP
 */

static int
lagg_lacp_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	sc->sc_detach = lagg_lacp_detach;
	sc->sc_port_create = lacp_port_create;
	sc->sc_port_destroy = lacp_port_destroy;
	sc->sc_linkstate = lacp_linkstate;
	sc->sc_start = lagg_lacp_start;
	sc->sc_input = lagg_lacp_input;
	sc->sc_init = lacp_init;
	sc->sc_stop = lacp_stop;
	sc->sc_lladdr = lagg_lacp_lladdr;
	sc->sc_req = lacp_req;
	sc->sc_portreq = lacp_portreq;

	error = lacp_attach(sc);
	if (error)
		return (error);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);

	return (error);
}

static int
lagg_lacp_detach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* unlocking is safe here */
	LAGG_WUNLOCK(sc);
	error = lacp_detach(sc);
	LAGG_WLOCK(sc);

	return (error);
}

static void
lagg_lacp_lladdr(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	/* purge all the lacp ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* add them back in */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

static int
lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	lp = lacp_select_tx_port(sc, m);
	if (lp == NULL) {
		m_freem(m);
		return (EBUSY);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct ether_header *eh;
	u_short etype;

	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);

	/* Tap off LACP control messages */
	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
		m = lacp_input(lp, m);
		if (m == NULL)
			return (NULL);
	}

	/*
	 * If the port is not collecting or not in the active aggregator then
	 * free and return.
	 */
	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
		m_freem(m);
		return (NULL);
	}

	m->m_pkthdr.rcvif = ifp;
	return (m);
}