/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/

/*
 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/if_lagg.c 249506 2013-04-15 13:00:42Z glebius $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/taskqueue.h>
#include <sys/eventhandler.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_llc.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#endif
#ifdef INET
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#endif

#ifdef INET6
#include <netinet/ip6.h>
#endif

#include <net/if_vlan_var.h>
#include <net/if_lagg.h>
#include <net/ieee8023ad_lacp.h>

/* Special flags we should propagate to the lagg ports. */
static struct {
	int flag;
	int (*func)(struct ifnet *, int);
} lagg_pflags[] = {
	{IFF_PROMISC, ifpromisc},
	{IFF_ALLMULTI, if_allmulti},
	{0, NULL}
};

SLIST_HEAD(__trhead, lagg_softc) lagg_list;	/* list of laggs */
static struct mtx	lagg_list_mtx;
eventhandler_tag	lagg_detach_cookie = NULL;

static int	lagg_clone_create(struct if_clone *, int, caddr_t);
static void	lagg_clone_destroy(struct ifnet *);
static struct if_clone *lagg_cloner;
static const char laggname[] = "lagg";

static void	lagg_lladdr(struct lagg_softc *, uint8_t *);
static void	lagg_capabilities(struct lagg_softc *);
static void	lagg_port_lladdr(struct lagg_port *, uint8_t *);
static void	lagg_port_setlladdr(void *, int);
static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
static int	lagg_port_destroy(struct lagg_port *, int);
static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
static void	lagg_linkstate(struct lagg_softc *);
static void	lagg_port_state(struct ifnet *, int);
static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_port_output(struct ifnet *, struct mbuf *,
		    struct sockaddr *, struct route *);
static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
#ifdef LAGG_PORT_STACKING
static int	lagg_port_checkstacking(struct lagg_softc *);
#endif
static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
static void	lagg_init(void *);
static void	lagg_stop(struct lagg_softc *);
static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_ether_setmulti(struct lagg_softc *);
static int	lagg_ether_cmdmulti(struct lagg_port *, int);
static	int	lagg_setflag(struct lagg_port *, int, int,
		    int (*func)(struct ifnet *, int));
static	int	lagg_setflags(struct lagg_port *, int status);
static int	lagg_transmit(struct ifnet *, struct mbuf *);
static void	lagg_qflush(struct ifnet *);
static int	lagg_media_change(struct ifnet *);
static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
static struct lagg_port *lagg_link_active(struct lagg_softc *,
	    struct lagg_port *);
static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *);

/* Simple round robin */
static int	lagg_rr_attach(struct lagg_softc *);
static int	lagg_rr_detach(struct lagg_softc *);
static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Active failover */
static int	lagg_fail_attach(struct lagg_softc *);
static int	lagg_fail_detach(struct lagg_softc *);
static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Loadbalancing */
static int	lagg_lb_attach(struct lagg_softc *);
static int	lagg_lb_detach(struct lagg_softc *);
static int	lagg_lb_port_create(struct lagg_port *);
static void	lagg_lb_port_destroy(struct lagg_port *);
static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);

/* 802.3ad LACP */
static int	lagg_lacp_attach(struct lagg_softc *);
static int	lagg_lacp_detach(struct lagg_softc *);
static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static void	lagg_lacp_lladdr(struct lagg_softc *);

static void	lagg_callout(void *);

/* lagg protocol table */
static const struct {
	int			ti_proto;
	int			(*ti_attach)(struct lagg_softc *);
} lagg_protos[] = {
	{ LAGG_PROTO_ROUNDROBIN,	lagg_rr_attach },
	{ LAGG_PROTO_FAILOVER,		lagg_fail_attach },
	{ LAGG_PROTO_LOADBALANCE,	lagg_lb_attach },
	{ LAGG_PROTO_ETHERCHANNEL,	lagg_lb_attach },
	{ LAGG_PROTO_LACP,		lagg_lacp_attach },
	{ LAGG_PROTO_NONE,		NULL }
};

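/*
 * The active protocol is selected at run time through the SIOCSLAGG
 * ioctl (see lagg_ioctl() below); the matching ti_attach routine fills
 * in the protocol hooks (sc_start, sc_input, sc_detach, ...) in the
 * softc.  From userland this corresponds to, for example:
 *
 *	ifconfig lagg0 create
 *	ifconfig lagg0 laggproto lacp laggport em0 laggport em1 up
 */
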
SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
    "Link Aggregation");

static int lagg_failover_rx_all = 0; /* Allow input on any failover links */
SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW,
    &lagg_failover_rx_all, 0,
    "Accept input from any interface in a failover lagg");
static int def_use_flowid = 1; /* Default value for using M_FLOWID */
TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid);
SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW,
    &def_use_flowid, 0,
    "Default setting for using flow id for load sharing");

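/*
 * The default may also be set at boot time via loader.conf(5), e.g.
 * net.link.lagg.default_use_flowid="0"; each lagg then inherits it as a
 * per-interface net.link.lagg.N.use_flowid sysctl, created in
 * lagg_clone_create() below.
 */
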
static int
lagg_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF);
		SLIST_INIT(&lagg_list);
		lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
		    lagg_clone_destroy, 0);
		lagg_input_p = lagg_input;
		lagg_linkstate_p = lagg_port_state;
		lagg_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, lagg_port_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    lagg_detach_cookie);
		if_clone_detach(lagg_cloner);
		lagg_input_p = NULL;
		lagg_linkstate_p = NULL;
		mtx_destroy(&lagg_list_mtx);
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t lagg_mod = {
	"if_lagg",
	lagg_modevent,
	0
};

DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_lagg, 1);

/*
 * This routine is run via a vlan
 * config EVENT
 */
static void
lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
static void
lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

static int
lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct lagg_softc *sc;
	struct ifnet *ifp;
	int i, error = 0;
	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
	struct sysctl_oid *oid;
	char num[14];			/* sufficient for 32 bits */

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}

	sc->sc_ipackets = counter_u64_alloc(M_WAITOK);
	sc->sc_opackets = counter_u64_alloc(M_WAITOK);
	sc->sc_ibytes = counter_u64_alloc(M_WAITOK);
	sc->sc_obytes = counter_u64_alloc(M_WAITOK);

	sysctl_ctx_init(&sc->ctx);
	snprintf(num, sizeof(num), "%u", unit);
	sc->use_flowid = def_use_flowid;
	oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg),
		OID_AUTO, num, CTLFLAG_RD, NULL, "");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
		"Use flow id for load sharing");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"count", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_count, sc->sc_count,
		"Total number of ports");
	/* Hash all layers by default */
	sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4;

	sc->sc_proto = LAGG_PROTO_NONE;
	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
			sc->sc_proto = lagg_protos[i].ti_proto;
			if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
				/* Undo the allocations made above */
				sysctl_ctx_free(&sc->ctx);
				counter_u64_free(sc->sc_ipackets);
				counter_u64_free(sc->sc_opackets);
				counter_u64_free(sc->sc_ibytes);
				counter_u64_free(sc->sc_obytes);
				if_free(ifp);
				free(sc, M_DEVBUF);
				return (error);
			}
			break;
		}
	}
	LAGG_LOCK_INIT(sc);
	SLIST_INIT(&sc->sc_ports);
	TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc);
	callout_init_rw(&sc->sc_callout, &sc->sc_mtx, CALLOUT_SHAREDLOCK);

	/* Initialise pseudo media types */
	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
	    lagg_media_status);
	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);

	if_initname(ifp, laggname, unit);
	ifp->if_softc = sc;
	ifp->if_transmit = lagg_transmit;
	ifp->if_qflush = lagg_qflush;
	ifp->if_init = lagg_init;
	ifp->if_ioctl = lagg_ioctl;
	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;

	/*
	 * Attach as an ordinary ethernet device, children will be attached
	 * as special device IFT_IEEE8023ADLAG.
	 */
	ether_ifattach(ifp, eaddr);

	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);

	/* Insert into the global list of laggs */
	mtx_lock(&lagg_list_mtx);
	SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	callout_reset(&sc->sc_callout, hz, lagg_callout, sc);

	return (0);
}

static void
lagg_clone_destroy(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	LAGG_WLOCK(sc);

	lagg_stop(sc);
	ifp->if_flags &= ~IFF_UP;

	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);

	/* Shutdown and remove lagg ports */
	while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL)
		lagg_port_destroy(lp, 1);
	/* Unhook the aggregation protocol */
	if (sc->sc_detach != NULL)
		(*sc->sc_detach)(sc);

	LAGG_WUNLOCK(sc);

	sysctl_ctx_free(&sc->ctx);
	ifmedia_removeall(&sc->sc_media);
	ether_ifdetach(ifp);
	if_free(ifp);

	callout_drain(&sc->sc_callout);
	counter_u64_free(sc->sc_ipackets);
	counter_u64_free(sc->sc_opackets);
	counter_u64_free(sc->sc_ibytes);
	counter_u64_free(sc->sc_obytes);

	mtx_lock(&lagg_list_mtx);
	SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task);
	LAGG_LOCK_DESTROY(sc);
	free(sc, M_DEVBUF);
}

static void
lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr)
{
	struct ifnet *ifp = sc->sc_ifp;

	if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN);
	/* Let the protocol know the MAC has changed */
	if (sc->sc_lladdr != NULL)
		(*sc->sc_lladdr)(sc);
	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
}
static void
lagg_capabilities(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int cap = ~0, ena = ~0;
	u_long hwa = ~0UL;

	LAGG_WLOCK_ASSERT(sc);

	/* Get capabilities from the lagg ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		cap &= lp->lp_ifp->if_capabilities;
		ena &= lp->lp_ifp->if_capenable;
		hwa &= lp->lp_ifp->if_hwassist;
	}
	cap = (cap == ~0 ? 0 : cap);
	ena = (ena == ~0 ? 0 : ena);
	hwa = (hwa == ~0 ? 0 : hwa);

	if (sc->sc_ifp->if_capabilities != cap ||
	    sc->sc_ifp->if_capenable != ena ||
	    sc->sc_ifp->if_hwassist != hwa) {
		sc->sc_ifp->if_capabilities = cap;
		sc->sc_ifp->if_capenable = ena;
		sc->sc_ifp->if_hwassist = hwa;
		getmicrotime(&sc->sc_ifp->if_lastchange);

		if (sc->sc_ifflags & IFF_DEBUG)
			if_printf(sc->sc_ifp,
			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
	}
}

static void
lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct lagg_llq *llq;
	int pending = 0;

	LAGG_WLOCK_ASSERT(sc);

	if (lp->lp_detaching ||
	    memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	/* Check to make sure it's not already queued to be changed */
	SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
		if (llq->llq_ifp == ifp) {
			pending = 1;
			break;
		}
	}

	if (!pending) {
		llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT);
		if (llq == NULL)	/* XXX what to do */
			return;
	}

	/* Update the lladdr even if pending, it may have changed */
	llq->llq_ifp = ifp;
	bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN);

	if (!pending)
		SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries);

	taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task);
}

/*
 * Set the interface MAC address from a taskqueue to avoid a LOR.
 */
static void
lagg_port_setlladdr(void *arg, int pending)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct lagg_llq *llq, *head;
	struct ifnet *ifp;
	int error;

	/* Grab a local reference of the queue and remove it from the softc */
	LAGG_WLOCK(sc);
	head = SLIST_FIRST(&sc->sc_llq_head);
	SLIST_FIRST(&sc->sc_llq_head) = NULL;
	LAGG_WUNLOCK(sc);

	/*
	 * Traverse the queue and set the lladdr on each ifp. It is safe to do
	 * unlocked as we have the only reference to it.
	 */
	for (llq = head; llq != NULL; llq = head) {
		ifp = llq->llq_ifp;

		/* Set the link layer address */
		error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN);
		if (error)
			printf("%s: setlladdr failed on %s\n", __func__,
			    ifp->if_xname);

		head = SLIST_NEXT(llq, llq_entries);
		free(llq, M_DEVBUF);
	}
}

static int
lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int error = 0;

	LAGG_WLOCK_ASSERT(sc);

	/* Limit the maximal number of lagg ports */
	if (sc->sc_count >= LAGG_MAX_PORTS)
		return (ENOSPC);

	/* Check if port has already been associated to a lagg */
	if (ifp->if_lagg != NULL) {
		/* Port is already in the current lagg? */
		lp = (struct lagg_port *)ifp->if_lagg;
		if (lp->lp_softc == sc)
			return (EEXIST);
		return (EBUSY);
	}

	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
	if (ifp->if_type != IFT_ETHER)
		return (EPROTONOSUPPORT);

	/* Allow the first Ethernet member to define the MTU */
	if (SLIST_EMPTY(&sc->sc_ports))
		sc->sc_ifp->if_mtu = ifp->if_mtu;
	else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
		if_printf(sc->sc_ifp, "invalid MTU for %s\n",
		    ifp->if_xname);
		return (EINVAL);
	}

	if ((lp = malloc(sizeof(struct lagg_port),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	/* Check if port is a stacked lagg */
	mtx_lock(&lagg_list_mtx);
	SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) {
		if (ifp == sc_ptr->sc_ifp) {
			mtx_unlock(&lagg_list_mtx);
			free(lp, M_DEVBUF);
			return (EINVAL);
			/* XXX disable stacking for the moment, it's untested */
#ifdef LAGG_PORT_STACKING
			lp->lp_flags |= LAGG_PORT_STACK;
			if (lagg_port_checkstacking(sc_ptr) >=
			    LAGG_MAX_STACKING) {
				mtx_unlock(&lagg_list_mtx);
				free(lp, M_DEVBUF);
				return (E2BIG);
			}
#endif
		}
	}
	mtx_unlock(&lagg_list_mtx);

	/* Change the interface type */
	lp->lp_iftype = ifp->if_type;
	ifp->if_type = IFT_IEEE8023ADLAG;
	ifp->if_lagg = lp;
	lp->lp_ioctl = ifp->if_ioctl;
	ifp->if_ioctl = lagg_port_ioctl;
	lp->lp_output = ifp->if_output;
	ifp->if_output = lagg_port_output;

	lp->lp_ifp = ifp;
	lp->lp_softc = sc;

	/* Save port link layer address */
	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);

	if (SLIST_EMPTY(&sc->sc_ports)) {
		sc->sc_primary = lp;
		lagg_lladdr(sc, IF_LLADDR(ifp));
	} else {
		/* Update link layer address for this port */
		lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp));
	}

	/* Insert into the list of ports */
	SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
	sc->sc_count++;

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	/* Add multicast addresses and interface flags to this port */
	lagg_ether_cmdmulti(lp, 1);
	lagg_setflags(lp, 1);

	if (sc->sc_port_create != NULL)
		error = (*sc->sc_port_create)(lp);
	if (error) {
		/* remove the port again, without calling sc_port_destroy */
		lagg_port_destroy(lp, 0);
		return (error);
	}

	return (error);
}

#ifdef LAGG_PORT_STACKING
static int
lagg_port_checkstacking(struct lagg_softc *sc)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int m = 0;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_flags & LAGG_PORT_STACK) {
			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
			m = MAX(m, lagg_port_checkstacking(sc_ptr));
		}
	}

	return (m + 1);
}
#endif

static int
lagg_port_destroy(struct lagg_port *lp, int runpd)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct lagg_port *lp_ptr;
	struct lagg_llq *llq;
	struct ifnet *ifp = lp->lp_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if (runpd && sc->sc_port_destroy != NULL)
		(*sc->sc_port_destroy)(lp);

	/*
	 * Remove multicast addresses and interface flags from this port and
	 * reset the MAC address, skip if the interface is being detached.
	 */
	if (!lp->lp_detaching) {
		lagg_ether_cmdmulti(lp, 0);
		lagg_setflags(lp, 0);
		lagg_port_lladdr(lp, lp->lp_lladdr);
	}

	/* Restore interface */
	ifp->if_type = lp->lp_iftype;
	ifp->if_ioctl = lp->lp_ioctl;
	ifp->if_output = lp->lp_output;
	ifp->if_lagg = NULL;

	/* Finally, remove the port from the lagg */
	SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
	sc->sc_count--;

	/* Update the primary interface */
	if (lp == sc->sc_primary) {
		uint8_t lladdr[ETHER_ADDR_LEN];

		if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) {
			bzero(&lladdr, ETHER_ADDR_LEN);
		} else {
			bcopy(lp_ptr->lp_lladdr,
			    lladdr, ETHER_ADDR_LEN);
		}
		lagg_lladdr(sc, lladdr);
		sc->sc_primary = lp_ptr;

		/* Update link layer address for each port */
		SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
			lagg_port_lladdr(lp_ptr, lladdr);
	}

	/* Remove any pending lladdr changes from the queue */
	if (lp->lp_detaching) {
		SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
			if (llq->llq_ifp == ifp) {
				SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq,
				    llq_entries);
				free(llq, M_DEVBUF);
				break;	/* Only appears once */
			}
		}
	}

	if (lp->lp_ifflags)
		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);

	free(lp, M_DEVBUF);

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

static int
lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_reqport *rp = (struct lagg_reqport *)data;
	struct lagg_softc *sc;
	struct lagg_port *lp = NULL;
	int error = 0;

	/* Should be checked by the caller */
	if (ifp->if_type != IFT_IEEE8023ADLAG ||
	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
		goto fallback;

	switch (cmd) {
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    ifunit(rp->rp_portname) != ifp) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;

	case SIOCSIFCAP:
		if (lp->lp_ioctl == NULL) {
			error = EINVAL;
			break;
		}
		error = (*lp->lp_ioctl)(ifp, cmd, data);
		if (error)
			break;

		/* Update lagg interface capabilities */
		LAGG_WLOCK(sc);
		lagg_capabilities(sc);
		LAGG_WUNLOCK(sc);
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be changed once joined */
		error = EINVAL;
		break;

	default:
		goto fallback;
	}

	return (error);

fallback:
	/* lp may still be NULL here if the type check above failed */
	if (lp != NULL && lp->lp_ioctl != NULL)
		return ((*lp->lp_ioctl)(ifp, cmd, data));

	return (EINVAL);
}

/*
 * For direct output to child ports: only raw frames from BPF
 * (AF_UNSPEC/pseudo_AF_HDRCMPLT) are passed through, anything else
 * must go through the lagg interface itself.
 */
static int
lagg_port_output(struct ifnet *ifp, struct mbuf *m,
	struct sockaddr *dst, struct route *ro)
{
	struct lagg_port *lp = ifp->if_lagg;

	switch (dst->sa_family) {
		case pseudo_AF_HDRCMPLT:
		case AF_UNSPEC:
			return ((*lp->lp_output)(ifp, m, dst, ro));
	}

	/* drop any other frames */
	m_freem(m);
	return (ENETDOWN);
}

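/*
 * Departure handler: when a member interface is detached from the
 * system, tear down its lagg port so we never reference a dead ifnet.
 */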
static void
lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct lagg_port *lp;
	struct lagg_softc *sc;

	if ((lp = ifp->if_lagg) == NULL)
		return;
	/* If the ifnet is just being renamed, don't do anything. */
	if (ifp->if_flags & IFF_RENAMING)
		return;

	sc = lp->lp_softc;

	LAGG_WLOCK(sc);
	lp->lp_detaching = 1;
	lagg_port_destroy(lp, 1);
	LAGG_WUNLOCK(sc);
}

static void
lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
{
	struct lagg_softc *sc = lp->lp_softc;

	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
	rp->rp_prio = lp->lp_prio;
	rp->rp_flags = lp->lp_flags;
	if (sc->sc_portreq != NULL)
		(*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc);

	/* Add protocol specific flags */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			if (lp == sc->sc_primary)
				rp->rp_flags |= LAGG_PORT_MASTER;
			if (lp == lagg_link_active(sc, sc->sc_primary))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			if (LAGG_PORTACTIVE(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_LACP:
			/* LACP has a different definition of active */
			if (lacp_isactive(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			if (lacp_iscollecting(lp))
				rp->rp_flags |= LAGG_PORT_COLLECTING;
			if (lacp_isdistributing(lp))
				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
			break;
	}
}

static void
lagg_init(void *xsc)
{
	struct lagg_softc *sc = (struct lagg_softc *)xsc;
	struct lagg_port *lp;
	struct ifnet *ifp = sc->sc_ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		return;

	LAGG_WLOCK(sc);

	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	/* Update the port lladdrs */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_port_lladdr(lp, IF_LLADDR(ifp));

	if (sc->sc_init != NULL)
		(*sc->sc_init)(sc);

	LAGG_WUNLOCK(sc);
}

static void
lagg_stop(struct lagg_softc *sc)
{
	struct ifnet *ifp = sc->sc_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	if (sc->sc_stop != NULL)
		(*sc->sc_stop)(sc);
}

static int
lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_reqall *ra = (struct lagg_reqall *)data;
	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
	struct ifreq *ifr = (struct ifreq *)data;
	struct lagg_port *lp;
	struct ifnet *tpif;
	struct thread *td = curthread;
	char *buf, *outbuf;
	int count, buflen, len, error = 0;

	bzero(&rpbuf, sizeof(rpbuf));

	switch (cmd) {
	case SIOCGLAGG:
		LAGG_RLOCK(sc);
		count = 0;
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			count++;
		buflen = count * sizeof(struct lagg_reqport);
		LAGG_RUNLOCK(sc);

		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);

		LAGG_RLOCK(sc);
		ra->ra_proto = sc->sc_proto;
		if (sc->sc_req != NULL)
			(*sc->sc_req)(sc, (caddr_t)&ra->ra_psc);

		count = 0;
		buf = outbuf;
		len = min(ra->ra_size, buflen);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (len < sizeof(rpbuf))
				break;

			lagg_port2req(lp, &rpbuf);
			memcpy(buf, &rpbuf, sizeof(rpbuf));
			count++;
			buf += sizeof(rpbuf);
			len -= sizeof(rpbuf);
		}
		LAGG_RUNLOCK(sc);
		ra->ra_ports = count;
		ra->ra_size = count * sizeof(rpbuf);
		error = copyout(outbuf, ra->ra_port, ra->ra_size);
		free(outbuf, M_TEMP);
		break;
	case SIOCSLAGG:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (ra->ra_proto >= LAGG_PROTO_MAX) {
			error = EPROTONOSUPPORT;
			break;
		}
		LAGG_WLOCK(sc);
		if (sc->sc_proto != LAGG_PROTO_NONE) {
			/* Reset protocol first in case detach unlocks */
			sc->sc_proto = LAGG_PROTO_NONE;
			error = sc->sc_detach(sc);
			sc->sc_detach = NULL;
			sc->sc_start = NULL;
			sc->sc_input = NULL;
			sc->sc_port_create = NULL;
			sc->sc_port_destroy = NULL;
			sc->sc_linkstate = NULL;
			sc->sc_init = NULL;
			sc->sc_stop = NULL;
			sc->sc_lladdr = NULL;
			sc->sc_req = NULL;
			sc->sc_portreq = NULL;
		} else if (sc->sc_input != NULL) {
			/* Still detaching */
			error = EBUSY;
		}
		if (error != 0) {
			LAGG_WUNLOCK(sc);
			break;
		}
		for (int i = 0; i < (sizeof(lagg_protos) /
		    sizeof(lagg_protos[0])); i++) {
			if (lagg_protos[i].ti_proto == ra->ra_proto) {
				if (sc->sc_ifflags & IFF_DEBUG)
					printf("%s: using proto %u\n",
					    sc->sc_ifname,
					    lagg_protos[i].ti_proto);
				sc->sc_proto = lagg_protos[i].ti_proto;
				if (sc->sc_proto != LAGG_PROTO_NONE)
					error = lagg_protos[i].ti_attach(sc);
				LAGG_WUNLOCK(sc);
				return (error);
			}
		}
		LAGG_WUNLOCK(sc);
		error = EPROTONOSUPPORT;
		break;
	case SIOCGLAGGFLAGS:
		rf->rf_flags = sc->sc_flags;
		break;
	case SIOCSLAGGHASH:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		sc->sc_flags &= ~LAGG_F_HASHMASK;
		sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK;
		LAGG_WUNLOCK(sc);
		break;
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;
	case SIOCSLAGGPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		error = lagg_port_create(sc, tpif);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSLAGGDELPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_WLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_WUNLOCK(sc);
			break;
		}

		error = lagg_port_destroy(lp, 1);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFFLAGS:
		/* Set flags on ports too */
		LAGG_WLOCK(sc);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setflags(lp, 1);
		}
		LAGG_WUNLOCK(sc);

		if (!(ifp->if_flags & IFF_UP) &&
		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked down and it is running,
			 * then stop and disable it.
			 */
			LAGG_WLOCK(sc);
			lagg_stop(sc);
			LAGG_WUNLOCK(sc);
		} else if ((ifp->if_flags & IFF_UP) &&
		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked up and it is stopped, then
			 * start it.
			 */
			(*ifp->if_init)(sc);
		}
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		LAGG_WLOCK(sc);
		error = lagg_ether_setmulti(sc);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;

	case SIOCSIFCAP:
	case SIOCSIFMTU:
		/* Do not allow the MTU or caps to be directly changed */
		error = EINVAL;
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}

static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* First, remove any existing filter entries. */
		lagg_ether_cmdmulti(lp, 0);
		/* copy all addresses from the lagg interface to the port */
		lagg_ether_cmdmulti(lp, 1);
	}
	return (0);
}

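/*
 * Program (set != 0) or purge (set == 0) the lagg's multicast filter on
 * a single member port, keeping a reference for every address added so
 * it can be released again later.
 */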
static int
lagg_ether_cmdmulti(struct lagg_port *lp, int set)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma, *rifma = NULL;
	struct sockaddr_dl sdl;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	bzero((char *)&sdl, sizeof(sdl));
	sdl.sdl_len = sizeof(sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = ETHER_ADDR_LEN;
	sdl.sdl_index = ifp->if_index;

	if (set) {
		TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_LINK)
				continue;
			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
			    LLADDR(&sdl), ETHER_ADDR_LEN);

			error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
			if (error)
				return (error);
			mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
			if (mc == NULL)
				return (ENOMEM);
			mc->mc_ifma = rifma;
			SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
		}
	} else {
		while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
			SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
			if_delmulti_ifma(mc->mc_ifma);
			free(mc, M_DEVBUF);
		}
	}
	return (0);
}

/* Handle a ref counted flag that should be set on the lagg port as well */
static int
lagg_setflag(struct lagg_port *lp, int flag, int status,
	     int (*func)(struct ifnet *, int))
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;
	struct ifnet *ifp = lp->lp_ifp;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	status = status ? (scifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if the recorded port status differs from what we want it to
	 * be.  If it does, flip it.  We record the port's status in
	 * lp_ifflags so that we never clear a flag we did not set
	 * ourselves.  In fact, we do not set or clear the port's flags
	 * directly, but acquire or release references to them instead;
	 * this is why the recorded flags are guaranteed to stay in accord
	 * with the actual port flags.
	 */
	if (status != (lp->lp_ifflags & flag)) {
		error = (*func)(ifp, status);
		if (error)
			return (error);
		lp->lp_ifflags &= ~flag;
		lp->lp_ifflags |= status;
	}
	return (0);
}

/*
 * Handle IFF_* flags that require certain changes on the lagg port.
 * If "status" is true, update the port's flags to match the lagg's;
 * if "status" is false, forcibly clear the flags set on the port.
 */
static int
lagg_setflags(struct lagg_port *lp, int status)
{
	int error, i;

	for (i = 0; lagg_pflags[i].flag; i++) {
		error = lagg_setflag(lp, lagg_pflags[i].flag,
		    status, lagg_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

static int
lagg_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	int error, len, mcast;

	len = m->m_pkthdr.len;
	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;

	LAGG_RLOCK(sc);
	/* We need a Tx algorithm and at least one port */
	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		ifp->if_oerrors++;
		return (ENXIO);
	}

	ETHER_BPF_MTAP(ifp, m);

	error = (*sc->sc_start)(sc, m);
	LAGG_RUNLOCK(sc);

	if (error == 0) {
		counter_u64_add(sc->sc_opackets, 1);
		counter_u64_add(sc->sc_obytes, len);
		ifp->if_omcasts += mcast;
	} else
		ifp->if_oerrors++;

	return (error);
}

/*
 * The ifp->if_qflush entry point for lagg(4) is a no-op.
 */
static void
lagg_qflush(struct ifnet *ifp __unused)
{
}

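/*
 * Hooked into ether_input() via the lagg_input_p pointer set in
 * lagg_modevent(): frames received on a member port are diverted here
 * and handed to the active protocol's input routine.
 */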
static struct mbuf *
lagg_input(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	LAGG_RLOCK(sc);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    (lp->lp_flags & LAGG_PORT_DISABLED) ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = (*sc->sc_input)(sc, lp, m);

	if (m != NULL) {
		counter_u64_add(sc->sc_ipackets, 1);
		counter_u64_add(sc->sc_ibytes, m->m_pkthdr.len);

		if (scifp->if_flags & IFF_MONITOR) {
			m_freem(m);
			m = NULL;
		}
	}

	LAGG_RUNLOCK(sc);
	return (m);
}

static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	LAGG_RLOCK(sc);
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	LAGG_RUNLOCK(sc);
}

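/*
 * Recompute the lagg's link state and advertised baudrate from its
 * member ports; called whenever a port is added, removed or changes
 * link state.
 */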
static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	/* Our link is considered up if at least one of our ports is active */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
			    sc->sc_primary->lp_ifp->if_baudrate : 0;
			break;
		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			speed = 0;
			SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
				speed += lp->lp_ifp->if_baudrate;
			sc->sc_ifp->if_baudrate = speed;
			break;
		case LAGG_PROTO_LACP:
			/* LACP updates if_baudrate itself */
			break;
	}
}

static void
lagg_port_state(struct ifnet *ifp, int state)
{
	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
	struct lagg_softc *sc = NULL;

	if (lp != NULL)
		sc = lp->lp_softc;
	if (sc == NULL)
		return;

	LAGG_WLOCK(sc);
	lagg_linkstate(sc);
	if (sc->sc_linkstate != NULL)
		(*sc->sc_linkstate)(lp);
	LAGG_WUNLOCK(sc);
}

struct lagg_port *
lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_port *lp_next, *rval = NULL;
	// int new_link = LINK_STATE_DOWN;

	LAGG_RLOCK_ASSERT(sc);
	/*
	 * Search for a port that reports an active link state.
	 */

	if (lp == NULL)
		goto search;
	if (LAGG_PORTACTIVE(lp)) {
		rval = lp;
		goto found;
	}
	if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL &&
	    LAGG_PORTACTIVE(lp_next)) {
		rval = lp_next;
		goto found;
	}

search:
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp_next)) {
			rval = lp_next;
			goto found;
		}
	}

found:
	if (rval != NULL) {
		/*
		 * The IEEE 802.1D standard assumes that a lagg with
		 * multiple ports is always full duplex. This is valid
		 * for load sharing laggs and if at least two links
		 * are active. Unfortunately, checking the latter would
		 * be too expensive at this point.
		 XXX
		if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) &&
		    (sc->sc_count > 1))
			new_link = LINK_STATE_FULL_DUPLEX;
		else
			new_link = rval->lp_link_state;
		 */
	}

	return (rval);
}

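/*
 * Return a pointer to "len" contiguous bytes of header at offset "off";
 * if the data does not sit contiguously in the first mbuf it is copied
 * into the caller-supplied buffer instead.  Returns NULL if the packet
 * is too short.
 */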
static const void *
lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
{
	if (m->m_pkthdr.len < (off + len)) {
		return (NULL);
	} else if (m->m_len < (off + len)) {
		m_copydata(m, off, len, buf);
		return (buf);
	}
	return (mtod(m, char *) + off);
}

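/*
 * Compute a flow hash over the headers selected by sc_flags: source and
 * destination MAC plus any VLAN tag (LAGG_F_HASHL2), IPv4/IPv6 addresses
 * and the IPv6 flow label (LAGG_F_HASHL3), and the TCP/UDP/SCTP port
 * pair (LAGG_F_HASHL4).  Packets belonging to the same flow thus always
 * hash to the same port, preserving per-flow packet ordering.
 */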
uint32_t
lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key)
{
	uint16_t etype;
	uint32_t p = key;
	int off;
	struct ether_header *eh;
	const struct ether_vlan_header *vlan;
#ifdef INET
	const struct ip *ip;
	const uint32_t *ports;
	int iphlen;
#endif
#ifdef INET6
	const struct ip6_hdr *ip6;
	uint32_t flow;
#endif
	union {
#ifdef INET
		struct ip ip;
#endif
#ifdef INET6
		struct ip6_hdr ip6;
#endif
		struct ether_vlan_header vlan;
		uint32_t port;
	} buf;

	off = sizeof(*eh);
	if (m->m_len < off)
		goto out;
	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	if (sc->sc_flags & LAGG_F_HASHL2) {
		p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p);
		p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p);
	}

	/* Special handling for encapsulating VLAN frames */
	if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) {
		p = hash32_buf(&m->m_pkthdr.ether_vtag,
		    sizeof(m->m_pkthdr.ether_vtag), p);
	} else if (etype == ETHERTYPE_VLAN) {
		vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf);
		if (vlan == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL2)
			p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p);
		etype = ntohs(vlan->evl_proto);
		off += sizeof(*vlan) - sizeof(*eh);
	}

	switch (etype) {
#ifdef INET
	case ETHERTYPE_IP:
		ip = lagg_gethdr(m, off, sizeof(*ip), &buf);
		if (ip == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL3) {
			p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p);
			p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p);
		}
		if (!(sc->sc_flags & LAGG_F_HASHL4))
			break;
		switch (ip->ip_p) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
				iphlen = ip->ip_hl << 2;
				if (iphlen < sizeof(*ip))
					break;
				off += iphlen;
				ports = lagg_gethdr(m, off, sizeof(*ports), &buf);
				if (ports == NULL)
					break;
				p = hash32_buf(ports, sizeof(*ports), p);
				break;
		}
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		if (!(sc->sc_flags & LAGG_F_HASHL3))
			break;
		ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf);
		if (ip6 == NULL)
			goto out;

		p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p);
		p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p);
		flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
		p = hash32_buf(&flow, sizeof(flow), p);	/* IPv6 flow label */
		break;
#endif
	}
out:
	return (p);
}

int
lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
{

	return (ifp->if_transmit)(ifp, m);
}

/*
 * Simple round robin aggregation
 */

static int
lagg_rr_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_rr_detach;
	sc->sc_start = lagg_rr_start;
	sc->sc_input = lagg_rr_input;
	sc->sc_port_create = NULL;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
	sc->sc_seq = 0;

	return (0);
}

static int
lagg_rr_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	uint32_t p;

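	/*
	 * Pick the next port in strict rotation.  The sequence number is
	 * advanced atomically, so no extra transmit lock is needed; the
	 * list walk below is safe because the caller (lagg_transmit())
	 * holds the shared softc lock.
	 */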
	p = atomic_fetchadd_32(&sc->sc_seq, 1);
	p %= sc->sc_count;
	lp = SLIST_FIRST(&sc->sc_ports);
	while (p--)
		lp = SLIST_NEXT(lp, lp_entries);

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * Active failover
 */

static int
lagg_fail_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_fail_detach;
	sc->sc_start = lagg_fail_start;
	sc->sc_input = lagg_fail_input;
	sc->sc_port_create = NULL;
	sc->sc_port_destroy = NULL;

	return (0);
}

static int
lagg_fail_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	/* Use the master port if active or the next available port */
	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *tmp_tp;

	if (lp == sc->sc_primary || lagg_failover_rx_all) {
		m->m_pkthdr.rcvif = ifp;
		return (m);
	}

	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is null, we've received a packet when all
		 * our links are down. Weird, but process it anyway.
		 */
		if (tmp_tp == NULL || tmp_tp == lp) {
			m->m_pkthdr.rcvif = ifp;
			return (m);
		}
	}

	m_freem(m);
	return (NULL);
}

/*
 * Loadbalancing
 */

static int
lagg_lb_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	struct lagg_lb *lb;

	if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	sc->sc_detach = lagg_lb_detach;
	sc->sc_start = lagg_lb_start;
	sc->sc_input = lagg_lb_input;
	sc->sc_port_create = lagg_lb_port_create;
	sc->sc_port_destroy = lagg_lb_port_destroy;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;

	lb->lb_key = arc4random();
	sc->sc_psc = (caddr_t)lb;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_lb_port_create(lp);

	return (0);
}

static int
lagg_lb_detach(struct lagg_softc *sc)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	if (lb != NULL)
		free(lb, M_DEVBUF);
	return (0);
}
static int
lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp_next;
	int i = 0;

	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (lp_next == lp)
			continue;
		if (i >= LAGG_MAX_PORTS)
			return (EINVAL);
		if (sc->sc_ifflags & IFF_DEBUG)
			printf("%s: port %s at index %d\n",
			    sc->sc_ifname, lp_next->lp_ifname, i);
		lb->lb_ports[i++] = lp_next;
	}

	return (0);
}

static int
lagg_lb_port_create(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	return (lagg_lb_porttable(sc, NULL));
}

static void
lagg_lb_port_destroy(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	lagg_lb_porttable(sc, lp);
}

static int
lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp = NULL;
	uint32_t p = 0;

	if (sc->use_flowid && (m->m_flags & M_FLOWID))
		p = m->m_pkthdr.flowid;
	else
		p = lagg_hashmbuf(sc, m, lb->lb_key);
	p %= sc->sc_count;
	lp = lb->lb_ports[p];

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * 802.3ad LACP
 */

static int
lagg_lacp_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	sc->sc_detach = lagg_lacp_detach;
	sc->sc_port_create = lacp_port_create;
	sc->sc_port_destroy = lacp_port_destroy;
	sc->sc_linkstate = lacp_linkstate;
	sc->sc_start = lagg_lacp_start;
	sc->sc_input = lagg_lacp_input;
	sc->sc_init = lacp_init;
	sc->sc_stop = lacp_stop;
	sc->sc_lladdr = lagg_lacp_lladdr;
	sc->sc_req = lacp_req;
	sc->sc_portreq = lacp_portreq;

	error = lacp_attach(sc);
	if (error)
		return (error);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);

	return (error);
}

static int
lagg_lacp_detach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* unlocking is safe here */
	LAGG_WUNLOCK(sc);
	error = lacp_detach(sc);
	LAGG_WLOCK(sc);

	return (error);
}

static void
lagg_lacp_lladdr(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	/* purge all the lacp ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* add them back in */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

static int
lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	lp = lacp_select_tx_port(sc, m);
	if (lp == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct ether_header *eh;
	u_short etype;

	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);

	/* Tap off LACP control messages */
	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
		m = lacp_input(lp, m);
		if (m == NULL)
			return (NULL);
	}

	/*
	 * If the port is not collecting or not in the active aggregator then
	 * free and return.
	 */
	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
		m_freem(m);
		return (NULL);
	}

	m->m_pkthdr.rcvif = ifp;
	return (m);
}

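/*
 * Once a second, fold the per-CPU counter(9) statistics back into the
 * legacy ifnet counters so that tools reading if_data continue to see
 * up-to-date numbers.
 */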
static void
lagg_callout(void *arg)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct ifnet *ifp = sc->sc_ifp;

	ifp->if_ipackets = counter_u64_fetch(sc->sc_ipackets);
	ifp->if_opackets = counter_u64_fetch(sc->sc_opackets);
	ifp->if_ibytes = counter_u64_fetch(sc->sc_ibytes);
	ifp->if_obytes = counter_u64_fetch(sc->sc_obytes);

	callout_reset(&sc->sc_callout, hz, lagg_callout, sc);
}