/*	$OpenBSD: if_trunk.c,v 1.30 2007/01/31 06:20:19 reyk Exp $	*/

/*
 * Copyright (c) 2005, 2006 Reyk Floeter <reyk@openbsd.org>
 * Copyright (c) 2007 Andrew Thompson <thompsa@FreeBSD.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/net/if_lagg.c 251490 2013-06-07 10:27:50Z trociny $");

#include "opt_inet.h"
#include "opt_inet6.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/hash.h>
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <sys/taskqueue.h>
#include <sys/eventhandler.h>

#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_clone.h>
#include <net/if_arp.h>
#include <net/if_dl.h>
#include <net/if_llc.h>
#include <net/if_media.h>
#include <net/if_types.h>
#include <net/if_var.h>
#include <net/bpf.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#endif
#ifdef INET
#include <netinet/in_systm.h>
#include <netinet/if_ether.h>
#include <netinet/ip.h>
#endif

#ifdef INET6
#include <netinet/ip6.h>
#endif

#include <net/if_vlan_var.h>
#include <net/if_lagg.h>
#include <net/ieee8023ad_lacp.h>

/* Special flags we should propagate to the lagg ports. */
static struct {
	int flag;
	int (*func)(struct ifnet *, int);
} lagg_pflags[] = {
	{IFF_PROMISC, ifpromisc},
	{IFF_ALLMULTI, if_allmulti},
	{0, NULL}
};

SLIST_HEAD(__trhead, lagg_softc) lagg_list;	/* list of laggs */
static struct mtx	lagg_list_mtx;
eventhandler_tag	lagg_detach_cookie = NULL;

static int	lagg_clone_create(struct if_clone *, int, caddr_t);
static void	lagg_clone_destroy(struct ifnet *);
static struct if_clone *lagg_cloner;
static const char laggname[] = "lagg";

static void	lagg_lladdr(struct lagg_softc *, uint8_t *);
static void	lagg_capabilities(struct lagg_softc *);
static void	lagg_port_lladdr(struct lagg_port *, uint8_t *);
static void	lagg_port_setlladdr(void *, int);
static int	lagg_port_create(struct lagg_softc *, struct ifnet *);
static int	lagg_port_destroy(struct lagg_port *, int);
static struct mbuf *lagg_input(struct ifnet *, struct mbuf *);
static void	lagg_linkstate(struct lagg_softc *);
static void	lagg_port_state(struct ifnet *, int);
static int	lagg_port_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_port_output(struct ifnet *, struct mbuf *,
		    const struct sockaddr *, struct route *);
static void	lagg_port_ifdetach(void *arg __unused, struct ifnet *);
#ifdef LAGG_PORT_STACKING
static int	lagg_port_checkstacking(struct lagg_softc *);
#endif
static void	lagg_port2req(struct lagg_port *, struct lagg_reqport *);
static void	lagg_init(void *);
static void	lagg_stop(struct lagg_softc *);
static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
static int	lagg_ether_setmulti(struct lagg_softc *);
static int	lagg_ether_cmdmulti(struct lagg_port *, int);
static	int	lagg_setflag(struct lagg_port *, int, int,
		    int (*func)(struct ifnet *, int));
static	int	lagg_setflags(struct lagg_port *, int status);
static int	lagg_transmit(struct ifnet *, struct mbuf *);
static void	lagg_qflush(struct ifnet *);
static int	lagg_media_change(struct ifnet *);
static void	lagg_media_status(struct ifnet *, struct ifmediareq *);
static struct lagg_port *lagg_link_active(struct lagg_softc *,
	    struct lagg_port *);
static const void *lagg_gethdr(struct mbuf *, u_int, u_int, void *);

/* Simple round robin */
static int	lagg_rr_attach(struct lagg_softc *);
static int	lagg_rr_detach(struct lagg_softc *);
static int	lagg_rr_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_rr_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Active failover */
static int	lagg_fail_attach(struct lagg_softc *);
static int	lagg_fail_detach(struct lagg_softc *);
static int	lagg_fail_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_fail_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);

/* Loadbalancing */
static int	lagg_lb_attach(struct lagg_softc *);
static int	lagg_lb_detach(struct lagg_softc *);
static int	lagg_lb_port_create(struct lagg_port *);
static void	lagg_lb_port_destroy(struct lagg_port *);
static int	lagg_lb_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lb_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static int	lagg_lb_porttable(struct lagg_softc *, struct lagg_port *);

/* 802.3ad LACP */
static int	lagg_lacp_attach(struct lagg_softc *);
static int	lagg_lacp_detach(struct lagg_softc *);
static int	lagg_lacp_start(struct lagg_softc *, struct mbuf *);
static struct mbuf *lagg_lacp_input(struct lagg_softc *, struct lagg_port *,
		    struct mbuf *);
static void	lagg_lacp_lladdr(struct lagg_softc *);

static void	lagg_callout(void *);

/* lagg protocol table */
static const struct {
	int			ti_proto;
	int			(*ti_attach)(struct lagg_softc *);
} lagg_protos[] = {
	{ LAGG_PROTO_ROUNDROBIN,	lagg_rr_attach },
	{ LAGG_PROTO_FAILOVER,		lagg_fail_attach },
	{ LAGG_PROTO_LOADBALANCE,	lagg_lb_attach },
	{ LAGG_PROTO_ETHERCHANNEL,	lagg_lb_attach },
	{ LAGG_PROTO_LACP,		lagg_lacp_attach },
	{ LAGG_PROTO_NONE,		NULL }
};

SYSCTL_DECL(_net_link);
static SYSCTL_NODE(_net_link, OID_AUTO, lagg, CTLFLAG_RW, 0,
    "Link Aggregation");

static int lagg_failover_rx_all = 0; /* Allow input on any failover links */
SYSCTL_INT(_net_link_lagg, OID_AUTO, failover_rx_all, CTLFLAG_RW,
    &lagg_failover_rx_all, 0,
    "Accept input from any interface in a failover lagg");
static int def_use_flowid = 1; /* Default value for using M_FLOWID */
TUNABLE_INT("net.link.lagg.default_use_flowid", &def_use_flowid);
SYSCTL_INT(_net_link_lagg, OID_AUTO, default_use_flowid, CTLFLAG_RW,
    &def_use_flowid, 0,
    "Default setting for using flow id for load sharing");

static int
lagg_modevent(module_t mod, int type, void *data)
{

	switch (type) {
	case MOD_LOAD:
		mtx_init(&lagg_list_mtx, "if_lagg list", NULL, MTX_DEF);
		SLIST_INIT(&lagg_list);
		lagg_cloner = if_clone_simple(laggname, lagg_clone_create,
		    lagg_clone_destroy, 0);
		lagg_input_p = lagg_input;
		lagg_linkstate_p = lagg_port_state;
		lagg_detach_cookie = EVENTHANDLER_REGISTER(
		    ifnet_departure_event, lagg_port_ifdetach, NULL,
		    EVENTHANDLER_PRI_ANY);
		break;
	case MOD_UNLOAD:
		EVENTHANDLER_DEREGISTER(ifnet_departure_event,
		    lagg_detach_cookie);
		if_clone_detach(lagg_cloner);
		lagg_input_p = NULL;
		lagg_linkstate_p = NULL;
		mtx_destroy(&lagg_list_mtx);
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t lagg_mod = {
	"if_lagg",
	lagg_modevent,
	0
};

DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(if_lagg, 1);
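
/*
 * The driver is typically loaded on demand with "kldload if_lagg" or
 * compiled statically by adding "device lagg" to the kernel configuration
 * file; MOD_LOAD above installs the input and link-state hooks used by the
 * ethernet input path.
 */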

/*
 * This routine is run via a vlan
 * config EVENT
 */
static void
lagg_register_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_config, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

/*
 * This routine is run via a vlan
 * unconfig EVENT
 */
static void
lagg_unregister_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct lagg_softc	*sc = ifp->if_softc;
	struct lagg_port	*lp;

	if (ifp->if_softc != arg)	/* Not our event */
		return;

	LAGG_RLOCK(sc);
	if (!SLIST_EMPTY(&sc->sc_ports)) {
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			EVENTHANDLER_INVOKE(vlan_unconfig, lp->lp_ifp, vtag);
	}
	LAGG_RUNLOCK(sc);
}

static int
lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
	struct lagg_softc *sc;
	struct ifnet *ifp;
	int i, error = 0;
	static const u_char eaddr[6];	/* 00:00:00:00:00:00 */
	struct sysctl_oid *oid;
	char num[14];			/* sufficient for 32 bits */

	sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO);
	ifp = sc->sc_ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL) {
		free(sc, M_DEVBUF);
		return (ENOSPC);
	}

	sc->sc_ipackets = counter_u64_alloc(M_WAITOK);
	sc->sc_opackets = counter_u64_alloc(M_WAITOK);
	sc->sc_ibytes = counter_u64_alloc(M_WAITOK);
	sc->sc_obytes = counter_u64_alloc(M_WAITOK);

	sysctl_ctx_init(&sc->ctx);
	snprintf(num, sizeof(num), "%u", unit);
	sc->use_flowid = def_use_flowid;
	oid = SYSCTL_ADD_NODE(&sc->ctx, &SYSCTL_NODE_CHILDREN(_net_link, lagg),
		OID_AUTO, num, CTLFLAG_RD, NULL, "");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"use_flowid", CTLTYPE_INT|CTLFLAG_RW, &sc->use_flowid, sc->use_flowid,
		"Use flow id for load sharing");
	SYSCTL_ADD_INT(&sc->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
		"count", CTLTYPE_INT|CTLFLAG_RD, &sc->sc_count, sc->sc_count,
		"Total number of ports");
	/* Hash all layers by default */
	sc->sc_flags = LAGG_F_HASHL2|LAGG_F_HASHL3|LAGG_F_HASHL4;

	sc->sc_proto = LAGG_PROTO_NONE;
	for (i = 0; lagg_protos[i].ti_proto != LAGG_PROTO_NONE; i++) {
		if (lagg_protos[i].ti_proto == LAGG_PROTO_DEFAULT) {
			sc->sc_proto = lagg_protos[i].ti_proto;
			if ((error = lagg_protos[i].ti_attach(sc)) != 0) {
				if_free(ifp);
				free(sc, M_DEVBUF);
				return (error);
			}
			break;
		}
	}
	LAGG_LOCK_INIT(sc);
	SLIST_INIT(&sc->sc_ports);
	TASK_INIT(&sc->sc_lladdr_task, 0, lagg_port_setlladdr, sc);
	callout_init_rw(&sc->sc_callout, &sc->sc_mtx, CALLOUT_SHAREDLOCK);

	/* Initialise pseudo media types */
	ifmedia_init(&sc->sc_media, 0, lagg_media_change,
	    lagg_media_status);
	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);

	if_initname(ifp, laggname, unit);
	ifp->if_softc = sc;
	ifp->if_transmit = lagg_transmit;
	ifp->if_qflush = lagg_qflush;
	ifp->if_init = lagg_init;
	ifp->if_ioctl = lagg_ioctl;
	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;

	/*
	 * Attach as an ordinary ethernet device; children will be attached
	 * as the special device type IFT_IEEE8023ADLAG.
	 */
	ether_ifattach(ifp, eaddr);

	sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
		lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
	sc->vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
		lagg_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);

	/* Insert into the global list of laggs */
	mtx_lock(&lagg_list_mtx);
	SLIST_INSERT_HEAD(&lagg_list, sc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	callout_reset(&sc->sc_callout, hz, lagg_callout, sc);

	return (0);
}
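
/*
 * Userland example: a lagg is created and configured with ifconfig(8),
 * e.g. (the port names are illustrative):
 *
 *	ifconfig lagg0 create
 *	ifconfig lagg0 laggproto lacp laggport em0 laggport em1 up
 *
 * which reaches this driver through the cloner above and the SIOCSLAGG
 * and SIOCSLAGGPORT ioctls handled in lagg_ioctl() below.
 */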

static void
lagg_clone_destroy(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	LAGG_WLOCK(sc);

	lagg_stop(sc);
	ifp->if_flags &= ~IFF_UP;

	EVENTHANDLER_DEREGISTER(vlan_config, sc->vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vlan_detach);

	/* Shutdown and remove lagg ports */
	while ((lp = SLIST_FIRST(&sc->sc_ports)) != NULL)
		lagg_port_destroy(lp, 1);
	/* Unhook the aggregation protocol */
	if (sc->sc_detach != NULL)
		(*sc->sc_detach)(sc);

	LAGG_WUNLOCK(sc);

	sysctl_ctx_free(&sc->ctx);
	ifmedia_removeall(&sc->sc_media);
	ether_ifdetach(ifp);
	if_free(ifp);

	callout_drain(&sc->sc_callout);
	counter_u64_free(sc->sc_ipackets);
	counter_u64_free(sc->sc_opackets);
	counter_u64_free(sc->sc_ibytes);
	counter_u64_free(sc->sc_obytes);

	mtx_lock(&lagg_list_mtx);
	SLIST_REMOVE(&lagg_list, sc, lagg_softc, sc_entries);
	mtx_unlock(&lagg_list_mtx);

	taskqueue_drain(taskqueue_swi, &sc->sc_lladdr_task);
	LAGG_LOCK_DESTROY(sc);
	free(sc, M_DEVBUF);
}

static void
lagg_lladdr(struct lagg_softc *sc, uint8_t *lladdr)
{
	struct ifnet *ifp = sc->sc_ifp;

	if (memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	bcopy(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN);
	/* Let the protocol know the MAC has changed */
	if (sc->sc_lladdr != NULL)
		(*sc->sc_lladdr)(sc);
	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
}

static void
lagg_capabilities(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int cap = ~0, ena = ~0;
	u_long hwa = ~0UL;

	LAGG_WLOCK_ASSERT(sc);

	/* Get capabilities from the lagg ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		cap &= lp->lp_ifp->if_capabilities;
		ena &= lp->lp_ifp->if_capenable;
		hwa &= lp->lp_ifp->if_hwassist;
	}
	cap = (cap == ~0 ? 0 : cap);
	ena = (ena == ~0 ? 0 : ena);
	hwa = (hwa == ~0 ? 0 : hwa);

	if (sc->sc_ifp->if_capabilities != cap ||
	    sc->sc_ifp->if_capenable != ena ||
	    sc->sc_ifp->if_hwassist != hwa) {
		sc->sc_ifp->if_capabilities = cap;
		sc->sc_ifp->if_capenable = ena;
		sc->sc_ifp->if_hwassist = hwa;
		getmicrotime(&sc->sc_ifp->if_lastchange);

		if (sc->sc_ifflags & IFF_DEBUG)
			if_printf(sc->sc_ifp,
			    "capabilities 0x%08x enabled 0x%08x\n", cap, ena);
	}
}

static void
lagg_port_lladdr(struct lagg_port *lp, uint8_t *lladdr)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct lagg_llq *llq;
	int pending = 0;

	LAGG_WLOCK_ASSERT(sc);

	if (lp->lp_detaching ||
	    memcmp(lladdr, IF_LLADDR(ifp), ETHER_ADDR_LEN) == 0)
		return;

	/* Check to make sure it's not already queued to be changed */
	SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
		if (llq->llq_ifp == ifp) {
			pending = 1;
			break;
		}
	}

	if (!pending) {
		llq = malloc(sizeof(struct lagg_llq), M_DEVBUF, M_NOWAIT);
		if (llq == NULL)	/* XXX what to do */
			return;
	}

	/* Update the lladdr even if pending, it may have changed */
	llq->llq_ifp = ifp;
	bcopy(lladdr, llq->llq_lladdr, ETHER_ADDR_LEN);

	if (!pending)
		SLIST_INSERT_HEAD(&sc->sc_llq_head, llq, llq_entries);

	taskqueue_enqueue(taskqueue_swi, &sc->sc_lladdr_task);
}

/*
 * Set the interface MAC address from a taskqueue to avoid a LOR
 * (lock order reversal).
 */
static void
lagg_port_setlladdr(void *arg, int pending)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct lagg_llq *llq, *head;
	struct ifnet *ifp;
	int error;

	/* Grab a local reference of the queue and remove it from the softc */
	LAGG_WLOCK(sc);
	head = SLIST_FIRST(&sc->sc_llq_head);
	SLIST_FIRST(&sc->sc_llq_head) = NULL;
	LAGG_WUNLOCK(sc);

	/*
	 * Traverse the queue and set the lladdr on each ifp. It is safe to do
	 * unlocked as we have the only reference to it.
	 */
	for (llq = head; llq != NULL; llq = head) {
		ifp = llq->llq_ifp;

		/* Set the link layer address */
		CURVNET_SET(ifp->if_vnet);
		error = if_setlladdr(ifp, llq->llq_lladdr, ETHER_ADDR_LEN);
		CURVNET_RESTORE();
		if (error)
			printf("%s: setlladdr failed on %s\n", __func__,
			    ifp->if_xname);

		head = SLIST_NEXT(llq, llq_entries);
		free(llq, M_DEVBUF);
	}
}

static int
lagg_port_create(struct lagg_softc *sc, struct ifnet *ifp)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int error = 0;

	LAGG_WLOCK_ASSERT(sc);

	/* Limit the maximal number of lagg ports */
	if (sc->sc_count >= LAGG_MAX_PORTS)
		return (ENOSPC);

	/* Check if port has already been associated to a lagg */
	if (ifp->if_lagg != NULL) {
		/* Port is already in the current lagg? */
		lp = (struct lagg_port *)ifp->if_lagg;
		if (lp->lp_softc == sc)
			return (EEXIST);
		return (EBUSY);
	}

	/* XXX Disallow non-ethernet interfaces (this should be any of 802) */
	if (ifp->if_type != IFT_ETHER)
		return (EPROTONOSUPPORT);

	/* Allow the first Ethernet member to define the MTU */
	if (SLIST_EMPTY(&sc->sc_ports))
		sc->sc_ifp->if_mtu = ifp->if_mtu;
	else if (sc->sc_ifp->if_mtu != ifp->if_mtu) {
		if_printf(sc->sc_ifp, "invalid MTU for %s\n",
		    ifp->if_xname);
		return (EINVAL);
	}

	if ((lp = malloc(sizeof(struct lagg_port),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	/* Check if port is a stacked lagg */
	mtx_lock(&lagg_list_mtx);
	SLIST_FOREACH(sc_ptr, &lagg_list, sc_entries) {
		if (ifp == sc_ptr->sc_ifp) {
			mtx_unlock(&lagg_list_mtx);
			free(lp, M_DEVBUF);
			return (EINVAL);
			/* XXX disable stacking for the moment, it's untested */
#ifdef LAGG_PORT_STACKING
			lp->lp_flags |= LAGG_PORT_STACK;
			if (lagg_port_checkstacking(sc_ptr) >=
			    LAGG_MAX_STACKING) {
				mtx_unlock(&lagg_list_mtx);
				free(lp, M_DEVBUF);
				return (E2BIG);
			}
#endif
		}
	}
	mtx_unlock(&lagg_list_mtx);

	/* Change the interface type */
	lp->lp_iftype = ifp->if_type;
	ifp->if_type = IFT_IEEE8023ADLAG;
	ifp->if_lagg = lp;
	lp->lp_ioctl = ifp->if_ioctl;
	ifp->if_ioctl = lagg_port_ioctl;
	lp->lp_output = ifp->if_output;
	ifp->if_output = lagg_port_output;

	lp->lp_ifp = ifp;
	lp->lp_softc = sc;

	/* Save port link layer address */
	bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN);

	if (SLIST_EMPTY(&sc->sc_ports)) {
		sc->sc_primary = lp;
		lagg_lladdr(sc, IF_LLADDR(ifp));
	} else {
		/* Update link layer address for this port */
		lagg_port_lladdr(lp, IF_LLADDR(sc->sc_ifp));
	}

	/* Insert into the list of ports */
	SLIST_INSERT_HEAD(&sc->sc_ports, lp, lp_entries);
	sc->sc_count++;

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	/* Add multicast addresses and interface flags to this port */
	lagg_ether_cmdmulti(lp, 1);
	lagg_setflags(lp, 1);

	if (sc->sc_port_create != NULL)
		error = (*sc->sc_port_create)(lp);
	if (error) {
		/* remove the port again, without calling sc_port_destroy */
		lagg_port_destroy(lp, 0);
		return (error);
	}

	return (error);
}

#ifdef LAGG_PORT_STACKING
static int
lagg_port_checkstacking(struct lagg_softc *sc)
{
	struct lagg_softc *sc_ptr;
	struct lagg_port *lp;
	int m = 0;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_flags & LAGG_PORT_STACK) {
			sc_ptr = (struct lagg_softc *)lp->lp_ifp->if_softc;
			m = MAX(m, lagg_port_checkstacking(sc_ptr));
		}
	}

	return (m + 1);
}
#endif

static int
lagg_port_destroy(struct lagg_port *lp, int runpd)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct lagg_port *lp_ptr;
	struct lagg_llq *llq;
	struct ifnet *ifp = lp->lp_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if (runpd && sc->sc_port_destroy != NULL)
		(*sc->sc_port_destroy)(lp);

	/*
	 * Remove multicast addresses and interface flags from this port and
	 * reset the MAC address; skip if the interface is being detached.
	 */
	if (!lp->lp_detaching) {
		lagg_ether_cmdmulti(lp, 0);
		lagg_setflags(lp, 0);
		lagg_port_lladdr(lp, lp->lp_lladdr);
	}

	/* Restore interface */
	ifp->if_type = lp->lp_iftype;
	ifp->if_ioctl = lp->lp_ioctl;
	ifp->if_output = lp->lp_output;
	ifp->if_lagg = NULL;

	/* Finally, remove the port from the lagg */
	SLIST_REMOVE(&sc->sc_ports, lp, lagg_port, lp_entries);
	sc->sc_count--;

	/* Update the primary interface */
	if (lp == sc->sc_primary) {
		uint8_t lladdr[ETHER_ADDR_LEN];

		if ((lp_ptr = SLIST_FIRST(&sc->sc_ports)) == NULL) {
			bzero(&lladdr, ETHER_ADDR_LEN);
		} else {
			bcopy(lp_ptr->lp_lladdr,
			    lladdr, ETHER_ADDR_LEN);
		}
		lagg_lladdr(sc, lladdr);
		sc->sc_primary = lp_ptr;

		/* Update link layer address for each port */
		SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries)
			lagg_port_lladdr(lp_ptr, lladdr);
	}

	/* Remove any pending lladdr changes from the queue */
	if (lp->lp_detaching) {
		SLIST_FOREACH(llq, &sc->sc_llq_head, llq_entries) {
			if (llq->llq_ifp == ifp) {
				SLIST_REMOVE(&sc->sc_llq_head, llq, lagg_llq,
				    llq_entries);
				free(llq, M_DEVBUF);
				break;	/* Only appears once */
			}
		}
	}

	if (lp->lp_ifflags)
		if_printf(ifp, "%s: lp_ifflags unclean\n", __func__);

	free(lp, M_DEVBUF);

	/* Update lagg capabilities */
	lagg_capabilities(sc);
	lagg_linkstate(sc);

	return (0);
}

static int
lagg_port_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_reqport *rp = (struct lagg_reqport *)data;
	struct lagg_softc *sc;
	struct lagg_port *lp = NULL;
	int error = 0;

	/* Should be checked by the caller */
	if (ifp->if_type != IFT_IEEE8023ADLAG ||
	    (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL)
		goto fallback;

	switch (cmd) {
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    ifunit(rp->rp_portname) != ifp) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = ifp->if_lagg) == NULL || lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;

	case SIOCSIFCAP:
		if (lp->lp_ioctl == NULL) {
			error = EINVAL;
			break;
		}
		error = (*lp->lp_ioctl)(ifp, cmd, data);
		if (error)
			break;

		/* Update lagg interface capabilities */
		LAGG_WLOCK(sc);
		lagg_capabilities(sc);
		LAGG_WUNLOCK(sc);
		break;

	case SIOCSIFMTU:
		/* Do not allow the MTU to be changed once joined */
		error = EINVAL;
		break;

	default:
		goto fallback;
	}

	return (error);

fallback:
	/* lp may still be NULL here if the ifnet is not (yet) a lagg port */
	if (lp != NULL && lp->lp_ioctl != NULL)
		return ((*lp->lp_ioctl)(ifp, cmd, data));

	return (EINVAL);
}

/*
 * For direct output to child ports.
 */
static int
lagg_port_output(struct ifnet *ifp, struct mbuf *m,
	const struct sockaddr *dst, struct route *ro)
{
	struct lagg_port *lp = ifp->if_lagg;

	switch (dst->sa_family) {
		case pseudo_AF_HDRCMPLT:
		case AF_UNSPEC:
			return ((*lp->lp_output)(ifp, m, dst, ro));
	}

	/* drop any other frames */
	m_freem(m);
	return (ENETDOWN);
}

static void
lagg_port_ifdetach(void *arg __unused, struct ifnet *ifp)
{
	struct lagg_port *lp;
	struct lagg_softc *sc;

	if ((lp = ifp->if_lagg) == NULL)
		return;
	/* If the ifnet is just being renamed, don't do anything. */
	if (ifp->if_flags & IFF_RENAMING)
		return;

	sc = lp->lp_softc;

	LAGG_WLOCK(sc);
	lp->lp_detaching = 1;
	lagg_port_destroy(lp, 1);
	LAGG_WUNLOCK(sc);
}

static void
lagg_port2req(struct lagg_port *lp, struct lagg_reqport *rp)
{
	struct lagg_softc *sc = lp->lp_softc;

	strlcpy(rp->rp_ifname, sc->sc_ifname, sizeof(rp->rp_ifname));
	strlcpy(rp->rp_portname, lp->lp_ifp->if_xname, sizeof(rp->rp_portname));
	rp->rp_prio = lp->lp_prio;
	rp->rp_flags = lp->lp_flags;
	if (sc->sc_portreq != NULL)
		(*sc->sc_portreq)(lp, (caddr_t)&rp->rp_psc);

	/* Add protocol specific flags */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			if (lp == sc->sc_primary)
				rp->rp_flags |= LAGG_PORT_MASTER;
			if (lp == lagg_link_active(sc, sc->sc_primary))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			if (LAGG_PORTACTIVE(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			break;

		case LAGG_PROTO_LACP:
			/* LACP has a different definition of active */
			if (lacp_isactive(lp))
				rp->rp_flags |= LAGG_PORT_ACTIVE;
			if (lacp_iscollecting(lp))
				rp->rp_flags |= LAGG_PORT_COLLECTING;
			if (lacp_isdistributing(lp))
				rp->rp_flags |= LAGG_PORT_DISTRIBUTING;
			break;
	}
}

static void
lagg_init(void *xsc)
{
	struct lagg_softc *sc = (struct lagg_softc *)xsc;
	struct lagg_port *lp;
	struct ifnet *ifp = sc->sc_ifp;

	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		return;

	LAGG_WLOCK(sc);

	ifp->if_drv_flags |= IFF_DRV_RUNNING;
	/* Update the port lladdrs */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_port_lladdr(lp, IF_LLADDR(ifp));

	if (sc->sc_init != NULL)
		(*sc->sc_init)(sc);

	LAGG_WUNLOCK(sc);
}

static void
lagg_stop(struct lagg_softc *sc)
{
	struct ifnet *ifp = sc->sc_ifp;

	LAGG_WLOCK_ASSERT(sc);

	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
		return;

	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

	if (sc->sc_stop != NULL)
		(*sc->sc_stop)(sc);
}

static int
lagg_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_reqall *ra = (struct lagg_reqall *)data;
	struct lagg_reqport *rp = (struct lagg_reqport *)data, rpbuf;
	struct lagg_reqflags *rf = (struct lagg_reqflags *)data;
	struct ifreq *ifr = (struct ifreq *)data;
	struct lagg_port *lp;
	struct ifnet *tpif;
	struct thread *td = curthread;
	char *buf, *outbuf;
	int count, buflen, len, error = 0;

	bzero(&rpbuf, sizeof(rpbuf));

	switch (cmd) {
	case SIOCGLAGG:
		LAGG_RLOCK(sc);
		count = 0;
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
			count++;
		buflen = count * sizeof(struct lagg_reqport);
		LAGG_RUNLOCK(sc);

		outbuf = malloc(buflen, M_TEMP, M_WAITOK | M_ZERO);

		LAGG_RLOCK(sc);
		ra->ra_proto = sc->sc_proto;
		if (sc->sc_req != NULL)
			(*sc->sc_req)(sc, (caddr_t)&ra->ra_psc);

		count = 0;
		buf = outbuf;
		len = min(ra->ra_size, buflen);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			if (len < sizeof(rpbuf))
				break;

			lagg_port2req(lp, &rpbuf);
			memcpy(buf, &rpbuf, sizeof(rpbuf));
			count++;
			buf += sizeof(rpbuf);
			len -= sizeof(rpbuf);
		}
		LAGG_RUNLOCK(sc);
		ra->ra_ports = count;
		ra->ra_size = count * sizeof(rpbuf);
		error = copyout(outbuf, ra->ra_port, ra->ra_size);
		free(outbuf, M_TEMP);
		break;
	case SIOCSLAGG:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (ra->ra_proto >= LAGG_PROTO_MAX) {
			error = EPROTONOSUPPORT;
			break;
		}
		LAGG_WLOCK(sc);
		if (sc->sc_proto != LAGG_PROTO_NONE) {
			/* Reset protocol first in case detach unlocks */
			sc->sc_proto = LAGG_PROTO_NONE;
			error = sc->sc_detach(sc);
			sc->sc_detach = NULL;
			sc->sc_start = NULL;
			sc->sc_input = NULL;
			sc->sc_port_create = NULL;
			sc->sc_port_destroy = NULL;
			sc->sc_linkstate = NULL;
			sc->sc_init = NULL;
			sc->sc_stop = NULL;
			sc->sc_lladdr = NULL;
			sc->sc_req = NULL;
			sc->sc_portreq = NULL;
		} else if (sc->sc_input != NULL) {
			/* Still detaching */
			error = EBUSY;
		}
		if (error != 0) {
			LAGG_WUNLOCK(sc);
			break;
		}
		for (int i = 0; i < (sizeof(lagg_protos) /
		    sizeof(lagg_protos[0])); i++) {
			if (lagg_protos[i].ti_proto == ra->ra_proto) {
				if (sc->sc_ifflags & IFF_DEBUG)
					printf("%s: using proto %u\n",
					    sc->sc_ifname,
					    lagg_protos[i].ti_proto);
				sc->sc_proto = lagg_protos[i].ti_proto;
				if (sc->sc_proto != LAGG_PROTO_NONE)
					error = lagg_protos[i].ti_attach(sc);
				LAGG_WUNLOCK(sc);
				return (error);
			}
		}
		LAGG_WUNLOCK(sc);
		error = EPROTONOSUPPORT;
		break;
	case SIOCGLAGGFLAGS:
		rf->rf_flags = sc->sc_flags;
		break;
	case SIOCSLAGGHASH:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if ((rf->rf_flags & LAGG_F_HASHMASK) == 0) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		sc->sc_flags &= ~LAGG_F_HASHMASK;
		sc->sc_flags |= rf->rf_flags & LAGG_F_HASHMASK;
		LAGG_WUNLOCK(sc);
		break;
	case SIOCGLAGGPORT:
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_RLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_RUNLOCK(sc);
			break;
		}

		lagg_port2req(lp, rp);
		LAGG_RUNLOCK(sc);
		break;
	case SIOCSLAGGPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}
		LAGG_WLOCK(sc);
		error = lagg_port_create(sc, tpif);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSLAGGDELPORT:
		error = priv_check(td, PRIV_NET_LAGG);
		if (error)
			break;
		if (rp->rp_portname[0] == '\0' ||
		    (tpif = ifunit(rp->rp_portname)) == NULL) {
			error = EINVAL;
			break;
		}

		LAGG_WLOCK(sc);
		if ((lp = (struct lagg_port *)tpif->if_lagg) == NULL ||
		    lp->lp_softc != sc) {
			error = ENOENT;
			LAGG_WUNLOCK(sc);
			break;
		}

		error = lagg_port_destroy(lp, 1);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFFLAGS:
		/* Set flags on ports too */
		LAGG_WLOCK(sc);
		SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
			lagg_setflags(lp, 1);
		}
		LAGG_WUNLOCK(sc);

		if (!(ifp->if_flags & IFF_UP) &&
		    (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked down and it is running,
			 * then stop and disable it.
			 */
			LAGG_WLOCK(sc);
			lagg_stop(sc);
			LAGG_WUNLOCK(sc);
		} else if ((ifp->if_flags & IFF_UP) &&
		    !(ifp->if_drv_flags & IFF_DRV_RUNNING)) {
			/*
			 * If interface is marked up and it is stopped, then
			 * start it.
			 */
			(*ifp->if_init)(sc);
		}
		break;
	case SIOCADDMULTI:
	case SIOCDELMULTI:
		LAGG_WLOCK(sc);
		error = lagg_ether_setmulti(sc);
		LAGG_WUNLOCK(sc);
		break;
	case SIOCSIFMEDIA:
	case SIOCGIFMEDIA:
		error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd);
		break;

	case SIOCSIFCAP:
	case SIOCSIFMTU:
		/* Do not allow the MTU or caps to be directly changed */
		error = EINVAL;
		break;

	default:
		error = ether_ioctl(ifp, cmd, data);
		break;
	}
	return (error);
}
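
/*
 * Note on the protocol switch (SIOCSLAGG above): sc_proto is reset to
 * LAGG_PROTO_NONE before the old protocol's detach routine runs because
 * that routine may drop the lagg lock (see lagg_lacp_detach()), and the
 * NONE state keeps concurrent transmits from dispatching into stale
 * protocol hooks in the meantime.
 */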

static int
lagg_ether_setmulti(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	LAGG_WLOCK_ASSERT(sc);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		/* First, remove any existing filter entries. */
		lagg_ether_cmdmulti(lp, 0);
		/* copy all addresses from the lagg interface to the port */
		lagg_ether_cmdmulti(lp, 1);
	}
	return (0);
}

static int
lagg_ether_cmdmulti(struct lagg_port *lp, int set)
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *ifp = lp->lp_ifp;
	struct ifnet *scifp = sc->sc_ifp;
	struct lagg_mc *mc;
	struct ifmultiaddr *ifma, *rifma = NULL;
	struct sockaddr_dl sdl;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	bzero((char *)&sdl, sizeof(sdl));
	sdl.sdl_len = sizeof(sdl);
	sdl.sdl_family = AF_LINK;
	sdl.sdl_type = IFT_ETHER;
	sdl.sdl_alen = ETHER_ADDR_LEN;
	sdl.sdl_index = ifp->if_index;

	if (set) {
		TAILQ_FOREACH(ifma, &scifp->if_multiaddrs, ifma_link) {
			if (ifma->ifma_addr->sa_family != AF_LINK)
				continue;
			bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
			    LLADDR(&sdl), ETHER_ADDR_LEN);

			error = if_addmulti(ifp, (struct sockaddr *)&sdl, &rifma);
			if (error)
				return (error);
			mc = malloc(sizeof(struct lagg_mc), M_DEVBUF, M_NOWAIT);
			if (mc == NULL)
				return (ENOMEM);
			mc->mc_ifma = rifma;
			SLIST_INSERT_HEAD(&lp->lp_mc_head, mc, mc_entries);
		}
	} else {
		while ((mc = SLIST_FIRST(&lp->lp_mc_head)) != NULL) {
			SLIST_REMOVE(&lp->lp_mc_head, mc, lagg_mc, mc_entries);
			if_delmulti_ifma(mc->mc_ifma);
			free(mc, M_DEVBUF);
		}
	}
	return (0);
}

/* Handle a ref counted flag that should be set on the lagg port as well */
static int
lagg_setflag(struct lagg_port *lp, int flag, int status,
	     int (*func)(struct ifnet *, int))
{
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;
	struct ifnet *ifp = lp->lp_ifp;
	int error;

	LAGG_WLOCK_ASSERT(sc);

	status = status ? (scifp->if_flags & flag) : 0;
	/* Now "status" contains the flag value or 0 */

	/*
	 * See if the recorded port status differs from what we want it to
	 * be.  If it does, flip it.  We record the port's status in
	 * lp_ifflags so that we won't clear a flag we haven't set.  In
	 * fact, we don't clear or set the port's flags directly; we get or
	 * release references to them instead.  That is why we can be sure
	 * that the recorded flags still agree with the actual port flags.
	 */
	if (status != (lp->lp_ifflags & flag)) {
		error = (*func)(ifp, status);
		if (error)
			return (error);
		lp->lp_ifflags &= ~flag;
		lp->lp_ifflags |= status;
	}
	return (0);
}

/*
 * Handle IFF_* flags that require certain changes on the lagg port:
 * if "status" is true, update the port's flags to match the lagg;
 * if "status" is false, forcibly clear the flags set on the port.
 */
static int
lagg_setflags(struct lagg_port *lp, int status)
{
	int error, i;

	for (i = 0; lagg_pflags[i].flag; i++) {
		error = lagg_setflag(lp, lagg_pflags[i].flag,
		    status, lagg_pflags[i].func);
		if (error)
			return (error);
	}
	return (0);
}

static int
lagg_transmit(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	int error, len, mcast;

	len = m->m_pkthdr.len;
	mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;

	LAGG_RLOCK(sc);
	/* We need a Tx algorithm and at least one port */
	if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		ifp->if_oerrors++;
		return (ENXIO);
	}

	ETHER_BPF_MTAP(ifp, m);

	error = (*sc->sc_start)(sc, m);
	LAGG_RUNLOCK(sc);

	if (error == 0) {
		counter_u64_add(sc->sc_opackets, 1);
		counter_u64_add(sc->sc_obytes, len);
		ifp->if_omcasts += mcast;
	} else
		ifp->if_oerrors++;

	return (error);
}

/*
 * The ifp->if_qflush entry point for lagg(4) is a no-op.
 */
static void
lagg_qflush(struct ifnet *ifp __unused)
{
}

static struct mbuf *
lagg_input(struct ifnet *ifp, struct mbuf *m)
{
	struct lagg_port *lp = ifp->if_lagg;
	struct lagg_softc *sc = lp->lp_softc;
	struct ifnet *scifp = sc->sc_ifp;

	LAGG_RLOCK(sc);
	if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 ||
	    (lp->lp_flags & LAGG_PORT_DISABLED) ||
	    sc->sc_proto == LAGG_PROTO_NONE) {
		LAGG_RUNLOCK(sc);
		m_freem(m);
		return (NULL);
	}

	ETHER_BPF_MTAP(scifp, m);

	m = (*sc->sc_input)(sc, lp, m);

	if (m != NULL) {
		counter_u64_add(sc->sc_ipackets, 1);
		counter_u64_add(sc->sc_ibytes, m->m_pkthdr.len);

		if (scifp->if_flags & IFF_MONITOR) {
			m_freem(m);
			m = NULL;
		}
	}

	LAGG_RUNLOCK(sc);
	return (m);
}

static int
lagg_media_change(struct ifnet *ifp)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;

	if (sc->sc_ifflags & IFF_DEBUG)
		printf("%s\n", __func__);

	/* Ignore */
	return (0);
}

static void
lagg_media_status(struct ifnet *ifp, struct ifmediareq *imr)
{
	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
	struct lagg_port *lp;

	imr->ifm_status = IFM_AVALID;
	imr->ifm_active = IFM_ETHER | IFM_AUTO;

	LAGG_RLOCK(sc);
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp))
			imr->ifm_status |= IFM_ACTIVE;
	}
	LAGG_RUNLOCK(sc);
}

static void
lagg_linkstate(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int new_link = LINK_STATE_DOWN;
	uint64_t speed;

	/* Our link is considered up if at least one of our ports is active */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) {
		if (lp->lp_link_state == LINK_STATE_UP) {
			new_link = LINK_STATE_UP;
			break;
		}
	}
	if_link_state_change(sc->sc_ifp, new_link);

	/* Update if_baudrate to reflect the max possible speed */
	switch (sc->sc_proto) {
		case LAGG_PROTO_FAILOVER:
			sc->sc_ifp->if_baudrate = sc->sc_primary != NULL ?
			    sc->sc_primary->lp_ifp->if_baudrate : 0;
			break;
		case LAGG_PROTO_ROUNDROBIN:
		case LAGG_PROTO_LOADBALANCE:
		case LAGG_PROTO_ETHERCHANNEL:
			speed = 0;
			SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
				speed += lp->lp_ifp->if_baudrate;
			sc->sc_ifp->if_baudrate = speed;
			break;
		case LAGG_PROTO_LACP:
			/* LACP updates if_baudrate itself */
			break;
	}
}

static void
lagg_port_state(struct ifnet *ifp, int state)
{
	struct lagg_port *lp = (struct lagg_port *)ifp->if_lagg;
	struct lagg_softc *sc = NULL;

	if (lp != NULL)
		sc = lp->lp_softc;
	if (sc == NULL)
		return;

	LAGG_WLOCK(sc);
	lagg_linkstate(sc);
	if (sc->sc_linkstate != NULL)
		(*sc->sc_linkstate)(lp);
	LAGG_WUNLOCK(sc);
}

struct lagg_port *
lagg_link_active(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_port *lp_next, *rval = NULL;
	// int new_link = LINK_STATE_DOWN;

	LAGG_RLOCK_ASSERT(sc);
	/*
	 * Search for a port which reports an active link state.
	 */

	if (lp == NULL)
		goto search;
	if (LAGG_PORTACTIVE(lp)) {
		rval = lp;
		goto found;
	}
	if ((lp_next = SLIST_NEXT(lp, lp_entries)) != NULL &&
	    LAGG_PORTACTIVE(lp_next)) {
		rval = lp_next;
		goto found;
	}

search:
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (LAGG_PORTACTIVE(lp_next)) {
			rval = lp_next;
			goto found;
		}
	}

found:
	if (rval != NULL) {
		/*
		 * The IEEE 802.1D standard assumes that a lagg with
		 * multiple ports is always full duplex. This is valid
		 * for load sharing laggs and if at least two links
		 * are active. Unfortunately, checking the latter would
		 * be too expensive at this point.
		 XXX
		if ((sc->sc_capabilities & IFCAP_LAGG_FULLDUPLEX) &&
		    (sc->sc_count > 1))
			new_link = LINK_STATE_FULL_DUPLEX;
		else
			new_link = rval->lp_link_state;
		 */
	}

	return (rval);
}

static const void *
lagg_gethdr(struct mbuf *m, u_int off, u_int len, void *buf)
{
	if (m->m_pkthdr.len < (off + len)) {
		return (NULL);
	} else if (m->m_len < (off + len)) {
		m_copydata(m, off, len, buf);
		return (buf);
	}
	return (mtod(m, char *) + off);
}

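/*
 * Compute a 32-bit flow hash over the frame, folding in only the layers
 * enabled in sc->sc_flags: L2 (MAC addresses and VLAN tag), L3 (IPv4/IPv6
 * source and destination addresses, plus the IPv6 flow label) and L4 (the
 * first 32 bits of the TCP/UDP/SCTP header, i.e. both port numbers).  The
 * caller-supplied "key" seeds the hash.
 */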
uint32_t
lagg_hashmbuf(struct lagg_softc *sc, struct mbuf *m, uint32_t key)
{
	uint16_t etype;
	uint32_t p = key;
	int off;
	struct ether_header *eh;
	const struct ether_vlan_header *vlan;
#ifdef INET
	const struct ip *ip;
	const uint32_t *ports;
	int iphlen;
#endif
#ifdef INET6
	const struct ip6_hdr *ip6;
	uint32_t flow;
#endif
	union {
#ifdef INET
		struct ip ip;
#endif
#ifdef INET6
		struct ip6_hdr ip6;
#endif
		struct ether_vlan_header vlan;
		uint32_t port;
	} buf;

	off = sizeof(*eh);
	if (m->m_len < off)
		goto out;
	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);
	if (sc->sc_flags & LAGG_F_HASHL2) {
		p = hash32_buf(&eh->ether_shost, ETHER_ADDR_LEN, p);
		p = hash32_buf(&eh->ether_dhost, ETHER_ADDR_LEN, p);
	}

	/* Special handling for encapsulating VLAN frames */
	if ((m->m_flags & M_VLANTAG) && (sc->sc_flags & LAGG_F_HASHL2)) {
		p = hash32_buf(&m->m_pkthdr.ether_vtag,
		    sizeof(m->m_pkthdr.ether_vtag), p);
	} else if (etype == ETHERTYPE_VLAN) {
		vlan = lagg_gethdr(m, off, sizeof(*vlan), &buf);
		if (vlan == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL2)
			p = hash32_buf(&vlan->evl_tag, sizeof(vlan->evl_tag), p);
		etype = ntohs(vlan->evl_proto);
		off += sizeof(*vlan) - sizeof(*eh);
	}

	switch (etype) {
#ifdef INET
	case ETHERTYPE_IP:
		ip = lagg_gethdr(m, off, sizeof(*ip), &buf);
		if (ip == NULL)
			goto out;

		if (sc->sc_flags & LAGG_F_HASHL3) {
			p = hash32_buf(&ip->ip_src, sizeof(struct in_addr), p);
			p = hash32_buf(&ip->ip_dst, sizeof(struct in_addr), p);
		}
		if (!(sc->sc_flags & LAGG_F_HASHL4))
			break;
		switch (ip->ip_p) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
				iphlen = ip->ip_hl << 2;
				if (iphlen < sizeof(*ip))
					break;
				off += iphlen;
				ports = lagg_gethdr(m, off, sizeof(*ports), &buf);
				if (ports == NULL)
					break;
				p = hash32_buf(ports, sizeof(*ports), p);
				break;
		}
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		if (!(sc->sc_flags & LAGG_F_HASHL3))
			break;
		ip6 = lagg_gethdr(m, off, sizeof(*ip6), &buf);
		if (ip6 == NULL)
			goto out;

		p = hash32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p);
		p = hash32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p);
		flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK;
		p = hash32_buf(&flow, sizeof(flow), p);	/* IPv6 flow label */
		break;
#endif
	}
out:
	return (p);
}

int
lagg_enqueue(struct ifnet *ifp, struct mbuf *m)
{

	return (ifp->if_transmit)(ifp, m);
}

/*
 * Simple round robin aggregation
 */

static int
lagg_rr_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_rr_detach;
	sc->sc_start = lagg_rr_start;
	sc->sc_input = lagg_rr_input;
	sc->sc_port_create = NULL;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;
	sc->sc_seq = 0;

	return (0);
}

static int
lagg_rr_detach(struct lagg_softc *sc)
{
	return (0);
}

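/*
 * Transmit: an atomically incremented sequence number taken modulo the
 * port count selects the egress port, so consecutive packets rotate
 * through the ports in list order.
 */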
static int
lagg_rr_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;
	uint32_t p;

	p = atomic_fetchadd_32(&sc->sc_seq, 1);
	p %= sc->sc_count;
	lp = SLIST_FIRST(&sc->sc_ports);
	while (p--)
		lp = SLIST_NEXT(lp, lp_entries);

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_rr_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * Active failover
 */

static int
lagg_fail_attach(struct lagg_softc *sc)
{
	sc->sc_detach = lagg_fail_detach;
	sc->sc_start = lagg_fail_start;
	sc->sc_input = lagg_fail_input;
	sc->sc_port_create = NULL;
	sc->sc_port_destroy = NULL;

	return (0);
}

static int
lagg_fail_detach(struct lagg_softc *sc)
{
	return (0);
}

static int
lagg_fail_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	/* Use the master port if active or the next available port */
	if ((lp = lagg_link_active(sc, sc->sc_primary)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

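/*
 * Receive policy: only traffic arriving on the primary port (or, when the
 * primary link is down, on the port currently standing in for it) is
 * passed up the stack; frames from other ports are dropped unless the
 * net.link.lagg.failover_rx_all sysctl is set.
 */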
static struct mbuf *
lagg_fail_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct lagg_port *tmp_tp;

	if (lp == sc->sc_primary || lagg_failover_rx_all) {
		m->m_pkthdr.rcvif = ifp;
		return (m);
	}

	if (!LAGG_PORTACTIVE(sc->sc_primary)) {
		tmp_tp = lagg_link_active(sc, sc->sc_primary);
		/*
		 * If tmp_tp is NULL, we've received a packet when all
		 * our links are down. Weird, but process it anyway.
		 */
		if (tmp_tp == NULL || tmp_tp == lp) {
			m->m_pkthdr.rcvif = ifp;
			return (m);
		}
	}

	m_freem(m);
	return (NULL);
}

/*
 * Loadbalancing
 */

static int
lagg_lb_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	struct lagg_lb *lb;

	if ((lb = (struct lagg_lb *)malloc(sizeof(struct lagg_lb),
	    M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (ENOMEM);

	sc->sc_detach = lagg_lb_detach;
	sc->sc_start = lagg_lb_start;
	sc->sc_input = lagg_lb_input;
	sc->sc_port_create = lagg_lb_port_create;
	sc->sc_port_destroy = lagg_lb_port_destroy;
	sc->sc_capabilities = IFCAP_LAGG_FULLDUPLEX;

	lb->lb_key = arc4random();
	sc->sc_psc = (caddr_t)lb;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lagg_lb_port_create(lp);

	return (0);
}

static int
lagg_lb_detach(struct lagg_softc *sc)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	if (lb != NULL)
		free(lb, M_DEVBUF);
	return (0);
}

static int
lagg_lb_porttable(struct lagg_softc *sc, struct lagg_port *lp)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp_next;
	int i = 0;

	bzero(&lb->lb_ports, sizeof(lb->lb_ports));
	SLIST_FOREACH(lp_next, &sc->sc_ports, lp_entries) {
		if (lp_next == lp)
			continue;
		if (i >= LAGG_MAX_PORTS)
			return (EINVAL);
		if (sc->sc_ifflags & IFF_DEBUG)
			printf("%s: port %s at index %d\n",
			    sc->sc_ifname, lp_next->lp_ifname, i);
		lb->lb_ports[i++] = lp_next;
	}

	return (0);
}

static int
lagg_lb_port_create(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	return (lagg_lb_porttable(sc, NULL));
}

static void
lagg_lb_port_destroy(struct lagg_port *lp)
{
	struct lagg_softc *sc = lp->lp_softc;
	lagg_lb_porttable(sc, lp);
}

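/*
 * Transmit: hash the packet (or use its hardware-supplied flowid when the
 * per-interface use_flowid sysctl is enabled) modulo the port count to
 * pick the egress port, so all packets of a flow map to the same port.
 */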
static int
lagg_lb_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_lb *lb = (struct lagg_lb *)sc->sc_psc;
	struct lagg_port *lp = NULL;
	uint32_t p = 0;

	if (sc->use_flowid && (m->m_flags & M_FLOWID))
		p = m->m_pkthdr.flowid;
	else
		p = lagg_hashmbuf(sc, m, lb->lb_key);
	p %= sc->sc_count;
	lp = lb->lb_ports[p];

	/*
	 * Check the port's link state. This will return the next active
	 * port if the link is down or the port is NULL.
	 */
	if ((lp = lagg_link_active(sc, lp)) == NULL) {
		m_freem(m);
		return (ENOENT);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lb_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;

	/* Just pass in the packet to our lagg device */
	m->m_pkthdr.rcvif = ifp;

	return (m);
}

/*
 * 802.3ad LACP
 */

static int
lagg_lacp_attach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	sc->sc_detach = lagg_lacp_detach;
	sc->sc_port_create = lacp_port_create;
	sc->sc_port_destroy = lacp_port_destroy;
	sc->sc_linkstate = lacp_linkstate;
	sc->sc_start = lagg_lacp_start;
	sc->sc_input = lagg_lacp_input;
	sc->sc_init = lacp_init;
	sc->sc_stop = lacp_stop;
	sc->sc_lladdr = lagg_lacp_lladdr;
	sc->sc_req = lacp_req;
	sc->sc_portreq = lacp_portreq;

	error = lacp_attach(sc);
	if (error)
		return (error);

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);

	return (error);
}

static int
lagg_lacp_detach(struct lagg_softc *sc)
{
	struct lagg_port *lp;
	int error;

	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* unlocking is safe here */
	LAGG_WUNLOCK(sc);
	error = lacp_detach(sc);
	LAGG_WLOCK(sc);

	return (error);
}

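/*
 * The LACP actor system ID is derived from the lagg MAC address, so when
 * that address changes the ports are destroyed and recreated to make the
 * protocol renegotiate with the new identity.
 */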
static void
lagg_lacp_lladdr(struct lagg_softc *sc)
{
	struct lagg_port *lp;

	/* purge all the lacp ports */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_destroy(lp);

	/* add them back in */
	SLIST_FOREACH(lp, &sc->sc_ports, lp_entries)
		lacp_port_create(lp);
}

static int
lagg_lacp_start(struct lagg_softc *sc, struct mbuf *m)
{
	struct lagg_port *lp;

	lp = lacp_select_tx_port(sc, m);
	if (lp == NULL) {
		m_freem(m);
		return (ENETDOWN);
	}

	/* Send mbuf */
	return (lagg_enqueue(lp->lp_ifp, m));
}

static struct mbuf *
lagg_lacp_input(struct lagg_softc *sc, struct lagg_port *lp, struct mbuf *m)
{
	struct ifnet *ifp = sc->sc_ifp;
	struct ether_header *eh;
	u_short etype;

	eh = mtod(m, struct ether_header *);
	etype = ntohs(eh->ether_type);

	/* Tap off LACP control messages */
	if ((m->m_flags & M_VLANTAG) == 0 && etype == ETHERTYPE_SLOW) {
		m = lacp_input(lp, m);
		if (m == NULL)
			return (NULL);
	}

	/*
	 * If the port is not collecting or not in the active aggregator then
	 * free and return.
	 */
	if (lacp_iscollecting(lp) == 0 || lacp_isactive(lp) == 0) {
		m_freem(m);
		return (NULL);
	}

	m->m_pkthdr.rcvif = ifp;
	return (m);
}

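/*
 * Runs once per second to fold the 64-bit counter(9) statistics back into
 * the legacy ifnet counters, so tools that read if_data see current
 * packet and byte counts.
 */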
static void
lagg_callout(void *arg)
{
	struct lagg_softc *sc = (struct lagg_softc *)arg;
	struct ifnet *ifp = sc->sc_ifp;

	ifp->if_ipackets = counter_u64_fetch(sc->sc_ipackets);
	ifp->if_opackets = counter_u64_fetch(sc->sc_opackets);
	ifp->if_ibytes = counter_u64_fetch(sc->sc_ibytes);
	ifp->if_obytes = counter_u64_fetch(sc->sc_obytes);

	callout_reset(&sc->sc_callout, hz, lagg_callout, sc);
}