1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27/*
28 *  This module supports AF_TRILL sockets and TRILL layer-2 forwarding.
29 */
30
31#include <sys/strsubr.h>
32#include <sys/socket.h>
33#include <sys/socketvar.h>
34#include <sys/modctl.h>
35#include <sys/cmn_err.h>
36#include <sys/tihdr.h>
37#include <sys/strsun.h>
38#include <sys/policy.h>
39#include <sys/ethernet.h>
40#include <sys/vlan.h>
41#include <net/trill.h>
42#include <net/if_dl.h>
43#include <sys/mac.h>
44#include <sys/mac_client.h>
45#include <sys/mac_provider.h>
46#include <sys/mac_client_priv.h>
47#include <sys/sdt.h>
48#include <sys/dls.h>
49#include <sys/sunddi.h>
50
51#include "trill_impl.h"
52
/* Forward declarations for helpers that are referenced before definition. */
static void trill_del_all(trill_inst_t *, boolean_t);
static int trill_del_nick(trill_inst_t *, uint16_t, boolean_t);
static void trill_stop_recv(trill_sock_t *);
static void trill_ctrl_input(trill_sock_t *, mblk_t *, const uint8_t *,
    uint16_t);
static trill_node_t *trill_node_lookup(trill_inst_t *, uint16_t);
static void trill_node_unref(trill_inst_t *, trill_node_t *);
static void trill_sock_unref(trill_sock_t *);
static void trill_kstats_init(trill_sock_t *, const char *);

/*
 * List of all TRILL instances (trill_inst_t).  trill_inst_rwlock is taken
 * by the code in this file that walks or modifies the list.
 */
static list_t trill_inst_list;
static krwlock_t trill_inst_rwlock;

static sock_lower_handle_t trill_create(int, int, int, sock_downcalls_t **,
    uint_t *, int *, int, cred_t *);

/*
 * Socket module registration data.  trill_create is the only function
 * pointer supplied; the final slot is left NULL.
 */
static smod_reg_t sinfo = {
	SOCKMOD_VERSION,
	"trill",
	SOCK_UC_VERSION,
	SOCK_DC_VERSION,
	trill_create,
	NULL,
};

/* modlsockmod structure */
static struct modlsockmod sockmod = {
	&mod_sockmodops, "AF_TRILL socket module", &sinfo
};

/* modlinkage structure */
static struct modlinkage ml = {
	MODREV_1,
	&sockmod,
	NULL
};

/*
 * A nickname is usable when it is neither RBRIDGE_NICKNAME_NONE nor
 * RBRIDGE_NICKNAME_UNUSED.  Note that the argument is evaluated twice.
 */
#define	VALID_NICK(n)	((n) != RBRIDGE_NICKNAME_NONE && \
			(n) != RBRIDGE_NICKNAME_UNUSED)
92
93static mblk_t *
94create_trill_header(trill_sock_t *tsock, mblk_t *mp, const uint8_t *daddr,
95    boolean_t trill_hdr_ok, boolean_t multidest, uint16_t tci,
96    size_t msglen)
97{
98	int extra_hdr_len;
99	struct ether_vlan_header *ethvlanhdr;
100	mblk_t *hdr_mp;
101	uint16_t etype;
102
103	etype = msglen > 0 ? (uint16_t)msglen : ETHERTYPE_TRILL;
104
105	/* When sending on the PVID, we must not give a VLAN ID */
106	if (tci == tsock->ts_link->bl_pvid)
107		tci = TRILL_NO_TCI;
108
109	/*
110	 * Create new Ethernet header and include additional space
111	 * for writing TRILL header and/or VLAN tag.
112	 */
113	extra_hdr_len = (trill_hdr_ok ? 0 : sizeof (trill_header_t)) +
114	    (tci != TRILL_NO_TCI ? sizeof (struct ether_vlan_extinfo) : 0);
115	hdr_mp = mac_header(tsock->ts_link->bl_mh, daddr,
116	    tci != TRILL_NO_TCI ? ETHERTYPE_VLAN : etype, mp, extra_hdr_len);
117	if (hdr_mp == NULL) {
118		freemsg(mp);
119		return (NULL);
120	}
121
122	if (tci != TRILL_NO_TCI) {
123		/* LINTED: alignment */
124		ethvlanhdr = (struct ether_vlan_header *)hdr_mp->b_rptr;
125		ethvlanhdr->ether_tci = htons(tci);
126		ethvlanhdr->ether_type = htons(etype);
127		hdr_mp->b_wptr += sizeof (struct ether_vlan_extinfo);
128	}
129
130	if (!trill_hdr_ok) {
131		trill_header_t *thp;
132		/* LINTED: alignment */
133		thp = (trill_header_t *)hdr_mp->b_wptr;
134		(void) memset(thp, 0, sizeof (trill_header_t));
135		thp->th_hopcount = TRILL_DEFAULT_HOPS;
136		thp->th_multidest = (multidest ? 1:0);
137		hdr_mp->b_wptr += sizeof (trill_header_t);
138	}
139
140	hdr_mp->b_cont = mp;
141	return (hdr_mp);
142}
143
144/*
145 * TRILL local recv function. TRILL data frames that should be received
146 * by the local system are decapsulated here and passed to bridging for
147 * learning and local system receive. Only called when we are the forwarder
148 * on the link (multi-dest frames) or the frame was destined for us.
149 */
150static void
151trill_recv_local(trill_sock_t *tsock, mblk_t *mp, uint16_t ingressnick)
152{
153	struct ether_header *inner_ethhdr;
154
155	/* LINTED: alignment */
156	inner_ethhdr = (struct ether_header *)mp->b_rptr;
157	DTRACE_PROBE1(trill__recv__local, struct ether_header *, inner_ethhdr);
158
159	DB_CKSUMFLAGS(mp) = 0;
160	/*
161	 * Transmit the decapsulated frame on the link via Bridging.
162	 * Bridging does source address learning and appropriate forwarding.
163	 */
164	bridge_trill_decaps(tsock->ts_link, mp, ingressnick);
165	KSPINCR(tks_decap);
166}
167
168/*
169 * Determines the outgoing link to reach a RBridge having the given nick
170 * Assumes caller has acquired the trill instance rwlock.
171 */
172static trill_sock_t *
173find_trill_link(trill_inst_t *tip, datalink_id_t linkid)
174{
175	trill_sock_t *tsp = NULL;
176
177	ASSERT(RW_LOCK_HELD(&tip->ti_rwlock));
178	for (tsp = list_head(&tip->ti_socklist); tsp != NULL;
179	    tsp = list_next(&tip->ti_socklist, tsp)) {
180		if (tsp->ts_link != NULL && tsp->ts_link->bl_linkid == linkid) {
181			ASSERT(tsp->ts_link->bl_mh != NULL);
182			ASSERT(!(tsp->ts_flags & TSF_SHUTDOWN));
183			atomic_inc_uint(&tsp->ts_refs);
184			break;
185		}
186	}
187	return (tsp);
188}
189
190/*
191 * TRILL destination forwarding function. Transmits the TRILL data packet
192 * to the next-hop, adjacent RBridge.  Consumes passed mblk_t.
193 */
194static void
195trill_dest_fwd(trill_inst_t *tip, mblk_t *fwd_mp, uint16_t adj_nick,
196    boolean_t has_trill_hdr, boolean_t multidest, uint16_t dtnick)
197{
198	trill_node_t *adj;
199	trill_sock_t *tsock = NULL;
200	trill_header_t *trillhdr;
201	struct ether_header *ethhdr;
202	int ethtype;
203	int ethhdrlen;
204
205	adj = trill_node_lookup(tip, adj_nick);
206	if (adj == NULL || ((tsock = adj->tn_tsp) == NULL))
207		goto dest_fwd_fail;
208
209	ASSERT(tsock->ts_link != NULL);
210	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
211	ASSERT(adj->tn_ni != NULL);
212
213	DTRACE_PROBE3(trill__dest__fwd, uint16_t, adj_nick, trill_node_t,
214	    adj, trill_sock_t, tsock);
215
216	/*
217	 * For broadcast links by using the dest address of
218	 * the RBridge to forward the frame should result in
219	 * savings. When the link is a bridged LAN or there are
220	 * many end stations the frame will not always be flooded.
221	 */
222	fwd_mp = create_trill_header(tsock, fwd_mp, adj->tn_ni->tni_adjsnpa,
223	    has_trill_hdr, multidest, tsock->ts_desigvlan, 0);
224	if (fwd_mp == NULL)
225		goto dest_fwd_fail;
226
227	/* LINTED: alignment */
228	ethhdr = (struct ether_header *)fwd_mp->b_rptr;
229	ethtype = ntohs(ethhdr->ether_type);
230	ASSERT(ethtype == ETHERTYPE_VLAN || ethtype == ETHERTYPE_TRILL);
231
232	/* Pullup Ethernet and TRILL header (w/o TRILL options) */
233	ethhdrlen = sizeof (struct ether_header) +
234	    (ethtype == ETHERTYPE_VLAN ? sizeof (struct ether_vlan_extinfo):0);
235	if (!pullupmsg(fwd_mp, ethhdrlen + sizeof (trill_header_t)))
236		goto dest_fwd_fail;
237	/* LINTED: alignment */
238	trillhdr = (struct trill_header *)(fwd_mp->b_rptr + ethhdrlen);
239
240	/* Update TRILL header with ingress and egress nicks for new frames */
241	if (!has_trill_hdr) {
242		/* We are creating a new TRILL frame */
243		trillhdr->th_egressnick = (multidest ? dtnick:adj_nick);
244		rw_enter(&tip->ti_rwlock, RW_READER);
245		trillhdr->th_ingressnick = tip->ti_nick;
246		rw_exit(&tip->ti_rwlock);
247		if (!VALID_NICK(trillhdr->th_ingressnick))
248			goto dest_fwd_fail;
249	}
250
251	/* Set hop count and update header in packet */
252	ASSERT(trillhdr->th_hopcount != 0);
253	trillhdr->th_hopcount--;
254
255	/* Clear checksum flag and transmit frame on the link */
256	DB_CKSUMFLAGS(fwd_mp) = 0;
257	DTRACE_PROBE1(trill__dest__fwd__tx, trill_header_t *, &trillhdr);
258	fwd_mp = bridge_trill_output(tsock->ts_link, fwd_mp);
259	if (fwd_mp == NULL) {
260		KSPINCR(tks_sent);
261		KSPINCR(tks_forward);
262	} else {
263		freemsg(fwd_mp);
264		KSPINCR(tks_drops);
265	}
266	trill_node_unref(tip, adj);
267	return;
268
269dest_fwd_fail:
270	if (adj != NULL)
271		trill_node_unref(tip, adj);
272	if (tsock != NULL)
273		KSPINCR(tks_drops);
274	freemsg(fwd_mp);
275}
276
277/*
278 * TRILL multi-destination forwarding. Transmits the packet to the adjacencies
279 * on the distribution tree determined by the egress nick. Source addr (saddr)
280 * is NULL for new TRILL packets originating from us.
281 */
282static void
283trill_multidest_fwd(trill_inst_t *tip, mblk_t *mp, uint16_t egressnick,
284    uint16_t ingressnick, boolean_t is_trill_pkt, const uint8_t *saddr,
285    int inner_vlan, boolean_t free_mblk)
286{
287	int idx;
288	uint16_t adjnick;
289	trill_node_t *dest;
290	trill_node_t *adj;
291	mblk_t *fwd_mp;
292	boolean_t nicksaved = B_FALSE;
293	uint16_t adjnicksaved;
294
295	/* Lookup the egress nick info, this is the DT root */
296	if ((dest = trill_node_lookup(tip, egressnick)) == NULL)
297		goto fail_multidest_fwd;
298
299	/* Send a copy to all our adjacencies on the DT root  */
300	ASSERT(dest->tn_ni);
301	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
302
303		/* Check for a valid adjacency node */
304		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
305		if (!VALID_NICK(adjnick) || ingressnick == adjnick ||
306		    ((adj = trill_node_lookup(tip, adjnick)) == NULL))
307			continue;
308
309		/* Do not forward back to adjacency that sent the pkt to us */
310		ASSERT(adj->tn_ni != NULL);
311		if ((saddr != NULL) &&
312		    (memcmp(adj->tn_ni->tni_adjsnpa, saddr,
313		    ETHERADDRL) == 0)) {
314			trill_node_unref(tip, adj);
315			continue;
316		}
317
318		/* Check if adj is marked as reaching inner VLAN downstream */
319		if ((inner_vlan != VLAN_ID_NONE) &&
320		    !TRILL_VLANISSET(TNI_VLANFILTERMAP(dest->tn_ni, idx),
321		    inner_vlan)) {
322			trill_node_unref(tip, adj);
323			DTRACE_PROBE4(trill__multi__dest__fwd__vlanfiltered,
324			    uint16_t, adjnick, uint16_t, ingressnick,
325			    uint16_t, egressnick, int, inner_vlan);
326			continue;
327		}
328
329		trill_node_unref(tip, adj);
330
331		/*
332		 * Save the nick and look ahead to see if we should forward the
333		 * frame to more adjacencies. We avoid doing a copy for this
334		 * nick and use the passed mblk when we can consume the passed
335		 * mblk.
336		 */
337		if (free_mblk && !nicksaved) {
338			adjnicksaved = adjnick;
339			nicksaved = B_TRUE;
340			continue;
341		}
342
343		fwd_mp = copymsg(mp);
344		if (fwd_mp == NULL)
345			break;
346		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
347		    adjnick, uint16_t, ingressnick);
348		trill_dest_fwd(tip, fwd_mp, adjnick, is_trill_pkt,
349		    B_TRUE, egressnick);
350	}
351	trill_node_unref(tip, dest);
352
353	if (nicksaved) {
354		ASSERT(free_mblk);
355		DTRACE_PROBE2(trill__multi__dest__fwd, uint16_t,
356		    adjnicksaved, uint16_t, ingressnick);
357		trill_dest_fwd(tip, mp, adjnicksaved, is_trill_pkt,
358		    B_TRUE, egressnick);
359		return;
360	}
361
362fail_multidest_fwd:
363	DTRACE_PROBE2(trill__multi__dest__fwd__fail, uint16_t,
364	    egressnick, uint16_t, ingressnick);
365	if (free_mblk) {
366		freemsg(mp);
367	}
368}
369
370/*
371 * TRILL data receive function. Forwards the received frame if necessary
372 * and also determines if the received frame should be consumed locally.
373 * Consumes passed mblk.
374 */
375static void
376trill_recv(trill_sock_t *tsock, mblk_t *mp, const uint8_t *mpsaddr)
377{
378	trill_header_t *trillhdr;
379	trill_node_t *dest = NULL;
380	trill_node_t *source = NULL;
381	trill_node_t *adj;
382	uint16_t ournick, adjnick, treeroot;
383	struct ether_header *ethhdr;
384	trill_inst_t *tip = tsock->ts_tip;
385	uint8_t srcaddr[ETHERADDRL];
386	size_t trillhdrlen;
387	int inner_vlan = VLAN_ID_NONE;
388	int tci;
389	int idx;
390	size_t min_size;
391
392	/* Copy Ethernet source address before modifying packet */
393	(void) memcpy(srcaddr, mpsaddr, ETHERADDRL);
394
395	/* Pull up TRILL header if necessary. */
396	min_size = sizeof (trill_header_t);
397	if ((MBLKL(mp) < min_size ||
398	    !IS_P2ALIGNED(mp->b_rptr, TRILL_HDR_ALIGN)) &&
399	    !pullupmsg(mp, min_size))
400		goto fail;
401
402	/* LINTED: alignment */
403	trillhdr = (trill_header_t *)mp->b_rptr;
404	if (trillhdr->th_version != TRILL_PROTOCOL_VERS) {
405		DTRACE_PROBE1(trill__recv__wrongversion,
406		    trill_header_t *, trillhdr);
407		goto fail;
408	}
409
410	/* Drop if unknown or invalid nickname */
411	if (!VALID_NICK(trillhdr->th_egressnick) ||
412	    !VALID_NICK(trillhdr->th_ingressnick)) {
413		DTRACE_PROBE1(trill__recv__invalidnick,
414		    trill_header_t *, trillhdr);
415		goto fail;
416	}
417
418	rw_enter(&tip->ti_rwlock, RW_READER);
419	ournick = tip->ti_nick;
420	treeroot = tip->ti_treeroot;
421	rw_exit(&tip->ti_rwlock);
422	/* Drop if we received a packet with our nick as ingress */
423	if (trillhdr->th_ingressnick == ournick)
424		goto fail;
425
426	/* Re-pull any TRILL options and inner Ethernet header */
427	min_size += GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t) +
428	    sizeof (struct ether_header);
429	if (MBLKL(mp) < min_size) {
430		if (!pullupmsg(mp, min_size))
431			goto fail;
432		/* LINTED: alignment */
433		trillhdr = (trill_header_t *)mp->b_rptr;
434	}
435	trillhdrlen = sizeof (trill_header_t) +
436	    (GET_TRILL_OPTS_LEN(trillhdr) * sizeof (uint32_t));
437
438	/*
439	 * Get the inner Ethernet header, plus the inner VLAN header if there
440	 * is one.
441	 */
442	/* LINTED: alignment */
443	ethhdr = (struct ether_header *)(mp->b_rptr + trillhdrlen);
444	if (ethhdr->ether_type == htons(ETHERTYPE_VLAN)) {
445		min_size += sizeof (struct ether_vlan_extinfo);
446		if (MBLKL(mp) < min_size) {
447			if (!pullupmsg(mp, min_size))
448				goto fail;
449			/* LINTED: alignment */
450			trillhdr = (trill_header_t *)mp->b_rptr;
451			/* LINTED: alignment */
452			ethhdr = (struct ether_header *)(mp->b_rptr +
453			    trillhdrlen);
454		}
455
456		tci = ntohs(((struct ether_vlan_header *)ethhdr)->ether_tci);
457		inner_vlan = VLAN_ID(tci);
458	}
459
460	/* Known/single destination forwarding. */
461	if (!trillhdr->th_multidest) {
462
463		/* Inner MacDA must be unicast */
464		if (ethhdr->ether_dhost.ether_addr_octet[0] & 1)
465			goto fail;
466
467		/* Ingress and Egress nicks must be different */
468		if (trillhdr->th_egressnick == trillhdr->th_ingressnick)
469			goto fail;
470
471		DTRACE_PROBE1(trill__recv__singledest,
472		    trill_header_t *, trillhdr);
473		if (trillhdr->th_egressnick == ournick) {
474			mp->b_rptr += trillhdrlen;
475			trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
476		} else if (trillhdr->th_hopcount > 0) {
477			trill_dest_fwd(tip, mp, trillhdr->th_egressnick,
478			    B_TRUE, B_FALSE, RBRIDGE_NICKNAME_NONE);
479		} else {
480			goto fail;
481		}
482		return;
483	}
484
485	/*
486	 * Multi-destination frame: perform checks verifying we have
487	 * received a valid multi-destination frame before receiving the
488	 * frame locally and forwarding the frame to other RBridges.
489	 *
490	 * Check if we received this multi-destination frame on a
491	 * adjacency in the distribution tree indicated by the frame's
492	 * egress nickname.
493	 */
494	if ((dest = trill_node_lookup(tip, trillhdr->th_egressnick)) == NULL)
495		goto fail;
496	for (idx = 0; idx < dest->tn_ni->tni_adjcount; idx++) {
497		adjnick = TNI_ADJNICK(dest->tn_ni, idx);
498		if ((adj = trill_node_lookup(tip, adjnick)) == NULL)
499			continue;
500		if (memcmp(adj->tn_ni->tni_adjsnpa, srcaddr, ETHERADDRL) == 0) {
501			trill_node_unref(tip, adj);
502			break;
503		}
504		trill_node_unref(tip, adj);
505	}
506
507	if (idx >= dest->tn_ni->tni_adjcount) {
508		DTRACE_PROBE2(trill__recv__multidest__adjcheckfail,
509		    trill_header_t *, trillhdr, trill_node_t *, dest);
510		goto fail;
511	}
512
513	/*
514	 * Reverse path forwarding check. Check if the ingress RBridge
515	 * that has forwarded the frame advertised the use of the
516	 * distribution tree specified in the egress nick.
517	 */
518	if ((source = trill_node_lookup(tip, trillhdr->th_ingressnick)) == NULL)
519		goto fail;
520	for (idx = 0; idx < source->tn_ni->tni_dtrootcount; idx++) {
521		if (TNI_DTROOTNICK(source->tn_ni, idx) ==
522		    trillhdr->th_egressnick)
523			break;
524	}
525
526	if (idx >= source->tn_ni->tni_dtrootcount) {
527		/*
528		 * Allow receipt of forwarded frame with the highest
529		 * tree root RBridge as the egress RBridge when the
530		 * ingress RBridge has not advertised the use of any
531		 * distribution trees.
532		 */
533		if (source->tn_ni->tni_dtrootcount != 0 ||
534		    trillhdr->th_egressnick != treeroot) {
535			DTRACE_PROBE3(
536			    trill__recv__multidest__rpfcheckfail,
537			    trill_header_t *, trillhdr, trill_node_t *,
538			    source, trill_inst_t *, tip);
539			goto fail;
540		}
541	}
542
543	/* Check hop count before doing any forwarding */
544	if (trillhdr->th_hopcount == 0)
545		goto fail;
546
547	/* Forward frame using the distribution tree specified by egress nick */
548	DTRACE_PROBE2(trill__recv__multidest, trill_header_t *,
549	    trillhdr, trill_node_t *, source);
550	trill_node_unref(tip, source);
551	trill_node_unref(tip, dest);
552
553	/* Tell forwarding not to free if we're the link forwarder. */
554	trill_multidest_fwd(tip, mp, trillhdr->th_egressnick,
555	    trillhdr->th_ingressnick, B_TRUE, srcaddr, inner_vlan,
556	    B_FALSE);
557
558	/*
559	 * Send de-capsulated frame locally if we are the link forwarder (also
560	 * does bridge learning).
561	 */
562	mp->b_rptr += trillhdrlen;
563	trill_recv_local(tsock, mp, trillhdr->th_ingressnick);
564	KSPINCR(tks_recv);
565	return;
566
567fail:
568	DTRACE_PROBE2(trill__recv__multidest__fail, mblk_t *, mp,
569	    trill_sock_t *, tsock);
570	if (dest != NULL)
571		trill_node_unref(tip, dest);
572	if (source != NULL)
573		trill_node_unref(tip, source);
574	freemsg(mp);
575	KSPINCR(tks_drops);
576}
577
578static void
579trill_stop_recv(trill_sock_t *tsock)
580{
581	mutex_enter(&tsock->ts_socklock);
582stop_retry:
583	if (tsock->ts_state == TS_UNBND || tsock->ts_link == NULL) {
584		mutex_exit(&tsock->ts_socklock);
585		return;
586	}
587
588	/*
589	 * If another thread is closing the socket then wait. Our callers
590	 * expect us to return only after the socket is closed.
591	 */
592	if (tsock->ts_flags & TSF_CLOSEWAIT) {
593		cv_wait(&tsock->ts_sockclosewait, &tsock->ts_socklock);
594		goto stop_retry;
595	}
596
597	/*
598	 * Set state and flags to block new bind or close calls
599	 * while we close the socket.
600	 */
601	tsock->ts_flags |= TSF_CLOSEWAIT;
602
603	/* Wait until all AF_TRILL socket transmit operations are done */
604	while (tsock->ts_sockthreadcount > 0)
605		cv_wait(&tsock->ts_sockthreadwait, &tsock->ts_socklock);
606
607	/*
608	 * We are guaranteed to be the only thread closing on the
609	 * socket while the TSF_CLOSEWAIT flag is set, all others cv_wait
610	 * for us to finish.
611	 */
612	ASSERT(tsock->ts_link != NULL);
613	if (tsock->ts_ksp != NULL)
614		kstat_delete(tsock->ts_ksp);
615
616	/*
617	 * Release lock before bridge_trill_lnunref to prevent deadlock
618	 * between trill_ctrl_input thread waiting to acquire ts_socklock
619	 * and bridge_trill_lnunref waiting for the trill thread to finish.
620	 */
621	mutex_exit(&tsock->ts_socklock);
622
623	/*
624	 * Release TRILL link reference from Bridging. On return from
625	 * bridge_trill_lnunref we can be sure there are no active TRILL data
626	 * threads for this link.
627	 */
628	bridge_trill_lnunref(tsock->ts_link);
629
630	/* Set socket as unbound & wakeup threads waiting for socket to close */
631	mutex_enter(&tsock->ts_socklock);
632	ASSERT(tsock->ts_link != NULL);
633	tsock->ts_link = NULL;
634	tsock->ts_state = TS_UNBND;
635	tsock->ts_flags &= ~TSF_CLOSEWAIT;
636	cv_broadcast(&tsock->ts_sockclosewait);
637	mutex_exit(&tsock->ts_socklock);
638}
639
640static int
641trill_start_recv(trill_sock_t *tsock, const struct sockaddr *sa, socklen_t len)
642{
643	struct sockaddr_dl *lladdr = (struct sockaddr_dl *)sa;
644	datalink_id_t linkid;
645	int err = 0;
646
647	if (len != sizeof (*lladdr))
648		return (EINVAL);
649
650	mutex_enter(&tsock->ts_socklock);
651	if (tsock->ts_tip == NULL || tsock->ts_state != TS_UNBND) {
652		err = EINVAL;
653		goto bind_error;
654	}
655
656	if (tsock->ts_flags & TSF_CLOSEWAIT || tsock->ts_link != NULL) {
657		err = EBUSY;
658		goto bind_error;
659	}
660
661	(void) memcpy(&(tsock->ts_lladdr), lladdr,
662	    sizeof (struct sockaddr_dl));
663	(void) memcpy(&linkid, tsock->ts_lladdr.sdl_data,
664	    sizeof (datalink_id_t));
665
666	tsock->ts_link = bridge_trill_lnref(tsock->ts_tip->ti_binst,
667	    linkid, tsock);
668	if (tsock->ts_link == NULL) {
669		err = EINVAL;
670		goto bind_error;
671	}
672
673	trill_kstats_init(tsock, tsock->ts_tip->ti_bridgename);
674	tsock->ts_state = TS_IDLE;
675
676bind_error:
677	mutex_exit(&tsock->ts_socklock);
678	return (err);
679}
680
681static int
682trill_do_unbind(trill_sock_t *tsock)
683{
684	/* If a bind has not been done, we can't unbind. */
685	if (tsock->ts_state != TS_IDLE)
686		return (EINVAL);
687
688	trill_stop_recv(tsock);
689	return (0);
690}
691
692static void
693trill_instance_unref(trill_inst_t *tip)
694{
695	rw_enter(&trill_inst_rwlock, RW_WRITER);
696	rw_enter(&tip->ti_rwlock, RW_WRITER);
697	if (atomic_dec_uint_nv(&tip->ti_refs) == 0) {
698		list_remove(&trill_inst_list, tip);
699		rw_exit(&tip->ti_rwlock);
700		rw_exit(&trill_inst_rwlock);
701		if (tip->ti_binst != NULL)
702			bridge_trill_brunref(tip->ti_binst);
703		list_destroy(&tip->ti_socklist);
704		rw_destroy(&tip->ti_rwlock);
705		kmem_free(tip, sizeof (*tip));
706	} else {
707		rw_exit(&tip->ti_rwlock);
708		rw_exit(&trill_inst_rwlock);
709	}
710}
711
712/*
713 * This is called when the bridge module receives a TRILL-encapsulated packet
714 * on a given link or a packet identified as "TRILL control."  We must verify
715 * that it's for us (it almost certainly will be), and then either decapsulate
716 * (if it's to our nickname), forward (if it's to someone else), or send up one
717 * of the sockets (if it's control traffic).
718 *
719 * Sadly, on Ethernet, the control traffic is identified by Outer.MacDA, and
720 * not by TRILL header information.
721 */
722static void
723trill_recv_pkt_cb(void *lptr, bridge_link_t *blp, mac_resource_handle_t rsrc,
724    mblk_t *mp, mac_header_info_t *hdr_info)
725{
726	trill_sock_t *tsock = lptr;
727
728	_NOTE(ARGUNUSED(rsrc));
729
730	ASSERT(tsock->ts_tip != NULL);
731	ASSERT(tsock->ts_link != NULL);
732	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
733
734	/*
735	 * Only receive packet if the source address is not multicast (which is
736	 * bogus).
737	 */
738	if (hdr_info->mhi_saddr[0] & 1)
739		goto discard;
740
741	/*
742	 * Check if this is our own packet reflected back.  It should not be.
743	 */
744	if (bcmp(hdr_info->mhi_saddr, blp->bl_local_mac, ETHERADDRL) == 0)
745		goto discard;
746
747	/* Only receive unicast packet if addressed to us */
748	if (hdr_info->mhi_dsttype == MAC_ADDRTYPE_UNICAST &&
749	    bcmp(hdr_info->mhi_daddr, blp->bl_local_mac, ETHERADDRL) != 0)
750		goto discard;
751
752	if (hdr_info->mhi_bindsap == ETHERTYPE_TRILL) {
753		/* TRILL data packets */
754		trill_recv(tsock, mp, hdr_info->mhi_saddr);
755	} else {
756		/* Design constraint for cheap IS-IS/BPDU comparison */
757		ASSERT(all_isis_rbridges[4] != bridge_group_address[4]);
758		/* Send received control packet upstream */
759		trill_ctrl_input(tsock, mp, hdr_info->mhi_saddr,
760		    hdr_info->mhi_daddr[4] == all_isis_rbridges[4] ?
761		    hdr_info->mhi_tci : TRILL_TCI_BPDU);
762	}
763
764	return;
765
766discard:
767	freemsg(mp);
768	KSPINCR(tks_drops);
769}
770
771/*
772 * This is called when the bridge module discovers that the destination address
773 * for a packet is not local -- it's through some remote node.  We must verify
774 * that the remote node isn't our nickname (it shouldn't be), add a TRILL
775 * header, and then use the IS-IS data to determine which link and which
776 * next-hop RBridge should be used for output.  We then transmit on that link.
777 *
778 * The egress_nick is RBRIDGE_NICKNAME_NONE for the "unknown destination" case.
779 */
780static void
781trill_encap_pkt_cb(void *lptr, bridge_link_t *blp, mac_header_info_t *hdr_info,
782    mblk_t *mp, uint16_t egress_nick)
783{
784	uint16_t ournick;
785	uint16_t dtnick;
786	trill_node_t *self = NULL;
787	trill_sock_t *tsock = lptr;
788	trill_inst_t *tip = tsock->ts_tip;
789	int vlan = VLAN_ID_NONE;
790
791	_NOTE(ARGUNUSED(blp));
792	ASSERT(hdr_info->mhi_bindsap != ETHERTYPE_TRILL);
793
794	/* egress_nick = RBRIDGE_NICKNAME_NONE is valid */
795	if (egress_nick != RBRIDGE_NICKNAME_NONE && !VALID_NICK(egress_nick))
796		goto discard;
797
798	/* Check if our own nick is valid before we do any forwarding */
799	rw_enter(&tip->ti_rwlock, RW_READER);
800	ournick = tip->ti_nick;
801	dtnick = tip->ti_treeroot;
802	rw_exit(&tip->ti_rwlock);
803	if (!VALID_NICK(ournick))
804		goto discard;
805
806	/*
807	 * For Multi-Destination forwarding determine our choice of
808	 * root distribution tree. If we didn't choose a distribution
809	 * tree (dtroots_count=0) then we use the highest priority tree
810	 * root (t_treeroot) else we drop the packet without forwarding.
811	 */
812	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
813		if ((self = trill_node_lookup(tip, ournick)) == NULL)
814			goto discard;
815
816		/*
817		 * Use the first DT configured for now. In future we
818		 * should have DT selection code here.
819		 */
820		if (self->tn_ni->tni_dtrootcount > 0) {
821			dtnick = TNI_DTROOTNICK(self->tn_ni, 0);
822		}
823
824		trill_node_unref(tip, self);
825		if (!VALID_NICK(dtnick)) {
826			DTRACE_PROBE(trill__fwd__packet__nodtroot);
827			goto discard;
828		}
829	}
830
831	/*
832	 * Retrieve VLAN ID of the native frame used for VLAN
833	 * pruning of multi-destination frames.
834	 */
835	if (hdr_info->mhi_istagged) {
836		vlan = VLAN_ID(hdr_info->mhi_tci);
837	}
838
839	DTRACE_PROBE2(trill__fwd__packet, mac_header_info_t *, hdr_info,
840	    uint16_t, egress_nick);
841	if (egress_nick == RBRIDGE_NICKNAME_NONE) {
842		trill_multidest_fwd(tip, mp, dtnick,
843		    ournick, B_FALSE, NULL, vlan, B_TRUE);
844	} else {
845		trill_dest_fwd(tip, mp, egress_nick, B_FALSE, B_FALSE,
846		    RBRIDGE_NICKNAME_NONE);
847	}
848	KSPINCR(tks_encap);
849	return;
850
851discard:
852	freemsg(mp);
853}
854
855/*
856 * This is called when the bridge module has completely torn down a bridge
857 * instance and all of the attached links.  We need to make the TRILL instance
858 * go away at this point.
859 */
860static void
861trill_br_dstr_cb(void *bptr, bridge_inst_t *bip)
862{
863	trill_inst_t *tip = bptr;
864
865	_NOTE(ARGUNUSED(bip));
866	rw_enter(&tip->ti_rwlock, RW_WRITER);
867	if (tip->ti_binst != NULL)
868		bridge_trill_brunref(tip->ti_binst);
869	tip->ti_binst = NULL;
870	rw_exit(&tip->ti_rwlock);
871}
872
873/*
874 * This is called when the bridge module is tearing down a link, but before the
875 * actual tear-down starts.  When this function returns, we must make sure that
876 * we will not initiate any new transmits on this link.
877 */
878static void
879trill_ln_dstr_cb(void *lptr, bridge_link_t *blp)
880{
881	trill_sock_t *tsock = lptr;
882
883	_NOTE(ARGUNUSED(blp));
884	trill_stop_recv(tsock);
885}
886
887static void
888trill_init(void)
889{
890	list_create(&trill_inst_list, sizeof (trill_inst_t),
891	    offsetof(trill_inst_t, ti_instnode));
892	rw_init(&trill_inst_rwlock, NULL, RW_DRIVER, NULL);
893	bridge_trill_register_cb(trill_recv_pkt_cb, trill_encap_pkt_cb,
894	    trill_br_dstr_cb, trill_ln_dstr_cb);
895}
896
897static void
898trill_fini(void)
899{
900	bridge_trill_register_cb(NULL, NULL, NULL, NULL);
901	rw_destroy(&trill_inst_rwlock);
902	list_destroy(&trill_inst_list);
903}
904
905/* Loadable module configuration entry points */
906int
907_init(void)
908{
909	int rc;
910
911	trill_init();
912	if ((rc = mod_install(&ml)) != 0)
913		trill_fini();
914	return (rc);
915}
916
917int
918_info(struct modinfo *modinfop)
919{
920	return (mod_info(&ml, modinfop));
921}
922
923int
924_fini(void)
925{
926	int rc;
927
928	rw_enter(&trill_inst_rwlock, RW_READER);
929	rc = list_is_empty(&trill_inst_list) ? 0 : EBUSY;
930	rw_exit(&trill_inst_rwlock);
931	if (rc == 0 && ((rc = mod_remove(&ml)) == 0))
932		trill_fini();
933	return (rc);
934}
935
936static void
937trill_kstats_init(trill_sock_t *tsock, const char *bname)
938{
939	int i;
940	char kstatname[KSTAT_STRLEN];
941	kstat_named_t  *knt;
942	static const char *sock_kstats_list[] = { TRILL_KSSOCK_NAMES };
943	char link_name[MAXNAMELEN];
944	int num;
945	int err;
946
947	bzero(link_name, sizeof (link_name));
948	if ((err = dls_mgmt_get_linkinfo(tsock->ts_link->bl_linkid, link_name,
949	    NULL, NULL, NULL)) != 0) {
950		cmn_err(CE_WARN, "%s: trill_kstats_init: error %d retrieving"
951		    " linkinfo for linkid:%d", "trill", err,
952		    tsock->ts_link->bl_linkid);
953		return;
954	}
955
956	bzero(kstatname, sizeof (kstatname));
957	(void) snprintf(kstatname, sizeof (kstatname), "%s-%s",
958	    bname, link_name);
959
960	num = sizeof (sock_kstats_list) / sizeof (*sock_kstats_list);
961	for (i = 0; i < num; i++) {
962		knt = (kstat_named_t *)&(tsock->ts_kstats);
963		kstat_named_init(&knt[i], sock_kstats_list[i],
964		    KSTAT_DATA_UINT64);
965	}
966
967	tsock->ts_ksp = kstat_create_zone("trill", 0, kstatname, "sock",
968	    KSTAT_TYPE_NAMED, num, KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID);
969	if (tsock->ts_ksp != NULL) {
970		tsock->ts_ksp->ks_data = &tsock->ts_kstats;
971		kstat_install(tsock->ts_ksp);
972	}
973}
974
975static trill_sock_t *
976trill_do_open(int flags)
977{
978	trill_sock_t *tsock;
979	int kmflag = ((flags & SOCKET_NOSLEEP)) ? KM_NOSLEEP:KM_SLEEP;
980
981	tsock = kmem_zalloc(sizeof (trill_sock_t), kmflag);
982	if (tsock != NULL) {
983		tsock->ts_state = TS_UNBND;
984		tsock->ts_refs++;
985		mutex_init(&tsock->ts_socklock, NULL, MUTEX_DRIVER, NULL);
986		cv_init(&tsock->ts_sockthreadwait, NULL, CV_DRIVER, NULL);
987		cv_init(&tsock->ts_sockclosewait, NULL, CV_DRIVER, NULL);
988	}
989	return (tsock);
990}
991
992static int
993trill_find_bridge(trill_sock_t *tsock, const char *bname, boolean_t can_create)
994{
995	trill_inst_t *tip, *newtip = NULL;
996
997	/* Allocate some memory (speculatively) before taking locks */
998	if (can_create)
999		newtip = kmem_zalloc(sizeof (*tip), KM_NOSLEEP);
1000
1001	rw_enter(&trill_inst_rwlock, RW_WRITER);
1002	for (tip = list_head(&trill_inst_list); tip != NULL;
1003	    tip = list_next(&trill_inst_list, tip)) {
1004		if (strcmp(tip->ti_bridgename, bname) == 0)
1005			break;
1006	}
1007	if (tip == NULL) {
1008		if (!can_create || newtip == NULL) {
1009			rw_exit(&trill_inst_rwlock);
1010			return (can_create ? ENOMEM : ENOENT);
1011		}
1012
1013		tip = newtip;
1014		newtip = NULL;
1015		(void) strcpy(tip->ti_bridgename, bname);
1016
1017		/* Register TRILL instance with bridging */
1018		tip->ti_binst = bridge_trill_brref(bname, tip);
1019		if (tip->ti_binst == NULL) {
1020			rw_exit(&trill_inst_rwlock);
1021			kmem_free(tip, sizeof (*tip));
1022			return (ENOENT);
1023		}
1024
1025		rw_init(&tip->ti_rwlock, NULL, RW_DRIVER, NULL);
1026		list_create(&tip->ti_socklist, sizeof (trill_sock_t),
1027		    offsetof(trill_sock_t, ts_socklistnode));
1028		list_insert_tail(&trill_inst_list, tip);
1029	}
1030	atomic_inc_uint(&tip->ti_refs);
1031	rw_exit(&trill_inst_rwlock);
1032
1033	/* If we didn't need the preallocated memory, then discard now. */
1034	if (newtip != NULL)
1035		kmem_free(newtip, sizeof (*newtip));
1036
1037	rw_enter(&tip->ti_rwlock, RW_WRITER);
1038	list_insert_tail(&(tip->ti_socklist), tsock);
1039	tsock->ts_tip = tip;
1040	rw_exit(&tip->ti_rwlock);
1041	return (0);
1042}
1043
1044static void
1045trill_clear_bridge(trill_sock_t *tsock)
1046{
1047	trill_inst_t *tip;
1048
1049	if ((tip = tsock->ts_tip) == NULL)
1050		return;
1051	rw_enter(&tip->ti_rwlock, RW_WRITER);
1052	list_remove(&tip->ti_socklist, tsock);
1053	if (list_is_empty(&tip->ti_socklist))
1054		trill_del_all(tip, B_TRUE);
1055	rw_exit(&tip->ti_rwlock);
1056}
1057
1058static void
1059trill_sock_unref(trill_sock_t *tsock)
1060{
1061	if (atomic_dec_uint_nv(&tsock->ts_refs) == 0) {
1062		mutex_destroy(&tsock->ts_socklock);
1063		cv_destroy(&tsock->ts_sockthreadwait);
1064		cv_destroy(&tsock->ts_sockclosewait);
1065		kmem_free(tsock, sizeof (trill_sock_t));
1066	}
1067}
1068
1069static void
1070trill_do_close(trill_sock_t *tsock)
1071{
1072	trill_inst_t *tip;
1073
1074	tip = tsock->ts_tip;
1075	trill_stop_recv(tsock);
1076	/* Remove socket from TRILL instance socket list */
1077	trill_clear_bridge(tsock);
1078	tsock->ts_flags |= TSF_SHUTDOWN;
1079	trill_sock_unref(tsock);
1080	if (tip != NULL)
1081		trill_instance_unref(tip);
1082}
1083
1084static void
1085trill_del_all(trill_inst_t *tip, boolean_t lockheld)
1086{
1087	int i;
1088
1089	if (!lockheld)
1090		rw_enter(&tip->ti_rwlock, RW_WRITER);
1091	for (i = RBRIDGE_NICKNAME_MIN; i < RBRIDGE_NICKNAME_MAX; i++) {
1092		if (tip->ti_nodes[i] != NULL)
1093			(void) trill_del_nick(tip, i, B_TRUE);
1094	}
1095	if (!lockheld)
1096		rw_exit(&tip->ti_rwlock);
1097}
1098
1099static void
1100trill_node_free(trill_node_t *nick_entry)
1101{
1102	trill_nickinfo_t *tni;
1103
1104	tni = nick_entry->tn_ni;
1105	kmem_free(tni, TNI_TOTALSIZE(tni));
1106	kmem_free(nick_entry, sizeof (trill_node_t));
1107}
1108
1109static void
1110trill_node_unref(trill_inst_t *tip, trill_node_t *tnp)
1111{
1112	if (atomic_dec_uint_nv(&tnp->tn_refs) == 0) {
1113		if (tnp->tn_tsp != NULL)
1114			trill_sock_unref(tnp->tn_tsp);
1115		trill_node_free(tnp);
1116		(void) atomic_dec_uint_nv(&tip->ti_nodecount);
1117	}
1118}
1119
1120static trill_node_t *
1121trill_node_lookup(trill_inst_t *tip, uint16_t nick)
1122{
1123	trill_node_t *nick_entry;
1124
1125	if (!VALID_NICK(nick))
1126		return (NULL);
1127	rw_enter(&tip->ti_rwlock, RW_READER);
1128	nick_entry = tip->ti_nodes[nick];
1129	if (nick_entry != NULL) {
1130		atomic_inc_uint(&nick_entry->tn_refs);
1131	}
1132	rw_exit(&tip->ti_rwlock);
1133	return (nick_entry);
1134}
1135
1136static int
1137trill_del_nick(trill_inst_t *tip, uint16_t nick, boolean_t lockheld)
1138{
1139	trill_node_t *nick_entry;
1140	int rc = ENOENT;
1141
1142	if (!lockheld)
1143		rw_enter(&tip->ti_rwlock, RW_WRITER);
1144	if (VALID_NICK(nick)) {
1145		nick_entry = tip->ti_nodes[nick];
1146		if (nick_entry != NULL) {
1147			trill_node_unref(tip, nick_entry);
1148			tip->ti_nodes[nick] = NULL;
1149			rc = 0;
1150		}
1151	}
1152	if (!lockheld)
1153		rw_exit(&tip->ti_rwlock);
1154	return (rc);
1155}
1156
1157static int
1158trill_add_nick(trill_inst_t *tip, void *arg, boolean_t self, int mode)
1159{
1160	uint16_t nick;
1161	int size;
1162	trill_node_t *tnode;
1163	trill_nickinfo_t tnihdr;
1164
1165	/* First make sure we have at least the header available */
1166	if (ddi_copyin(arg, &tnihdr, sizeof (trill_nickinfo_t), mode) != 0)
1167		return (EFAULT);
1168
1169	nick = tnihdr.tni_nick;
1170	if (!VALID_NICK(nick)) {
1171		DTRACE_PROBE1(trill__add__nick__bad, trill_nickinfo_t *,
1172		    &tnihdr);
1173		return (EINVAL);
1174	}
1175
1176	size = TNI_TOTALSIZE(&tnihdr);
1177	if (size > TNI_MAXSIZE)
1178		return (EINVAL);
1179	tnode = kmem_zalloc(sizeof (trill_node_t), KM_SLEEP);
1180	tnode->tn_ni = kmem_zalloc(size, KM_SLEEP);
1181	if (ddi_copyin(arg, tnode->tn_ni, size, mode) != 0) {
1182		kmem_free(tnode->tn_ni, size);
1183		kmem_free(tnode, sizeof (trill_node_t));
1184		return (EFAULT);
1185	}
1186
1187	tnode->tn_refs++;
1188	rw_enter(&tip->ti_rwlock, RW_WRITER);
1189	if (tip->ti_nodes[nick] != NULL)
1190		(void) trill_del_nick(tip, nick, B_TRUE);
1191
1192	if (self) {
1193		tip->ti_nick = nick;
1194	} else {
1195		tnode->tn_tsp = find_trill_link(tip,
1196		    tnode->tn_ni->tni_linkid);
1197	}
1198	DTRACE_PROBE2(trill__add__nick, trill_node_t *, tnode,
1199	    uint16_t, nick);
1200	tip->ti_nodes[nick] = tnode;
1201	tip->ti_nodecount++;
1202	rw_exit(&tip->ti_rwlock);
1203	return (0);
1204}
1205
1206static int
1207trill_do_ioctl(trill_sock_t *tsock, int cmd, void *arg, int mode)
1208{
1209	int error = 0;
1210	trill_inst_t *tip = tsock->ts_tip;
1211
1212	switch (cmd) {
1213	case TRILL_DESIGVLAN: {
1214		uint16_t desigvlan;
1215
1216		if (ddi_copyin(arg, &desigvlan, sizeof (desigvlan), mode) != 0)
1217			return (EFAULT);
1218		tsock->ts_desigvlan = desigvlan;
1219		break;
1220	}
1221	case TRILL_VLANFWDER: {
1222		uint8_t vlans[TRILL_VLANS_ARRSIZE];
1223
1224		if (tsock->ts_link == NULL)
1225			return (EINVAL);
1226		if ((ddi_copyin(arg, vlans, sizeof (vlans), mode)) != 0)
1227			return (EFAULT);
1228		bridge_trill_setvlans(tsock->ts_link, vlans);
1229		break;
1230	}
1231	case TRILL_SETNICK:
1232		if (tip == NULL)
1233			return (EINVAL);
1234		error = trill_add_nick(tip, arg, B_TRUE, mode);
1235		break;
1236
1237	case TRILL_GETNICK:
1238		if (tip == NULL)
1239			return (EINVAL);
1240		rw_enter(&tip->ti_rwlock, RW_READER);
1241		if (ddi_copyout(&tip->ti_nick, arg, sizeof (tip->ti_nick),
1242		    mode) != 0)
1243			error = EFAULT;
1244		rw_exit(&tip->ti_rwlock);
1245		break;
1246
1247	case TRILL_ADDNICK:
1248		if (tip == NULL)
1249			break;
1250		error = trill_add_nick(tip, arg, B_FALSE, mode);
1251		break;
1252
1253	case TRILL_DELNICK: {
1254		uint16_t delnick;
1255
1256		if (tip == NULL)
1257			break;
1258		if (ddi_copyin(arg, &delnick, sizeof (delnick), mode) != 0)
1259			return (EFAULT);
1260		error = trill_del_nick(tip, delnick, B_FALSE);
1261		break;
1262	}
1263	case TRILL_DELALL:
1264		if (tip == NULL)
1265			break;
1266		trill_del_all(tip, B_FALSE);
1267		break;
1268
1269	case TRILL_TREEROOT: {
1270		uint16_t treeroot;
1271
1272		if (tip == NULL)
1273			break;
1274		if (ddi_copyin(arg, &treeroot, sizeof (treeroot), mode) != 0)
1275			return (EFAULT);
1276		if (!VALID_NICK(treeroot))
1277			return (EINVAL);
1278		rw_enter(&tip->ti_rwlock, RW_WRITER);
1279		tip->ti_treeroot = treeroot;
1280		rw_exit(&tip->ti_rwlock);
1281		break;
1282	}
1283	case TRILL_HWADDR:
1284		if (tsock->ts_link == NULL)
1285			break;
1286		if (ddi_copyout(tsock->ts_link->bl_local_mac, arg, ETHERADDRL,
1287		    mode) != 0)
1288			return (EFAULT);
1289		break;
1290
1291	case TRILL_NEWBRIDGE: {
1292		char bname[MAXLINKNAMELEN];
1293
1294		if (tsock->ts_state != TS_UNBND)
1295			return (ENOTSUP);
1296		/* ts_tip can only be set once */
1297		if (tip != NULL)
1298			return (EEXIST);
1299		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1300			return (EFAULT);
1301		bname[MAXLINKNAMELEN-1] = '\0';
1302		error = trill_find_bridge(tsock, bname, B_TRUE);
1303		break;
1304	}
1305
1306	case TRILL_GETBRIDGE: {
1307		char bname[MAXLINKNAMELEN];
1308
1309		/* ts_tip can only be set once */
1310		if (tip != NULL)
1311			return (EEXIST);
1312		if (ddi_copyin(arg, bname, sizeof (bname), mode) != 0)
1313			return (EFAULT);
1314		bname[MAXLINKNAMELEN - 1] = '\0';
1315		error = trill_find_bridge(tsock, bname, B_FALSE);
1316		break;
1317	}
1318
1319	case TRILL_LISTNICK: {
1320		trill_listnick_t tln;
1321		trill_node_t *tnp;
1322		trill_nickinfo_t *tnip;
1323		uint16_t nick;
1324
1325		if (tip == NULL)
1326			return (EINVAL);
1327		if (ddi_copyin(arg, &tln, sizeof (tln), mode) != 0)
1328			return (EFAULT);
1329		nick = tln.tln_nick;
1330		if (nick >= RBRIDGE_NICKNAME_MAX) {
1331			error = EINVAL;
1332			break;
1333		}
1334		rw_enter(&tip->ti_rwlock, RW_READER);
1335		while (++nick < RBRIDGE_NICKNAME_MAX) {
1336			if ((tnp = tip->ti_nodes[nick]) != NULL) {
1337				tnip = tnp->tn_ni;
1338				ASSERT(nick == tnip->tni_nick);
1339				tln.tln_nick = nick;
1340				bcopy(tnip->tni_adjsnpa, tln.tln_nexthop,
1341				    ETHERADDRL);
1342				tln.tln_ours = nick == tip->ti_nick;
1343				if (tln.tln_ours || tnp->tn_tsp == NULL) {
1344					tln.tln_linkid =
1345					    DATALINK_INVALID_LINKID;
1346				} else {
1347					tln.tln_linkid =
1348					    tnp->tn_tsp->ts_link->bl_linkid;
1349				}
1350				break;
1351			}
1352		}
1353		rw_exit(&tip->ti_rwlock);
1354		if (nick >= RBRIDGE_NICKNAME_MAX)
1355			bzero(&tln, sizeof (tln));
1356		if (ddi_copyout(&tln, arg, sizeof (tln), mode) != 0)
1357			return (EFAULT);
1358		break;
1359	}
1360
1361	/*
1362	 * Port flush: this is used when we lose AF on a port.  We must discard
1363	 * all regular bridge forwarding entries on this port with the
1364	 * indicated VLAN.
1365	 */
1366	case TRILL_PORTFLUSH: {
1367		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1368
1369		if (tsock->ts_link == NULL)
1370			return (EINVAL);
1371		bridge_trill_flush(tsock->ts_link, vlan, B_FALSE);
1372		break;
1373	}
1374
1375	/*
1376	 * Nick flush: this is used when we lose AF on a port.  We must discard
1377	 * all bridge TRILL forwarding entries on this port with the indicated
1378	 * VLAN.
1379	 */
1380	case TRILL_NICKFLUSH: {
1381		uint16_t vlan = (uint16_t)(uintptr_t)arg;
1382
1383		if (tsock->ts_link == NULL)
1384			return (EINVAL);
1385		bridge_trill_flush(tsock->ts_link, vlan, B_TRUE);
1386		break;
1387	}
1388
1389	case TRILL_GETMTU:
1390		if (tsock->ts_link == NULL)
1391			break;
1392		if (ddi_copyout(&tsock->ts_link->bl_maxsdu, arg,
1393		    sizeof (uint_t), mode) != 0)
1394			return (EFAULT);
1395		break;
1396
1397	default:
1398		error = ENOTSUP;
1399		break;
1400	}
1401
1402	return (error);
1403}
1404
1405/*
1406 * Sends received packet back upstream on the TRILL socket.
1407 * Consumes passed mblk_t.
1408 */
1409static void
1410trill_ctrl_input(trill_sock_t *tsock, mblk_t *mp, const uint8_t *saddr,
1411    uint16_t tci)
1412{
1413	int udi_size;
1414	mblk_t *mp1;
1415	struct T_unitdata_ind *tudi;
1416	struct sockaddr_dl *sdl;
1417	char *lladdr;
1418	int error;
1419
1420	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1421	if (tsock->ts_flow_ctrld) {
1422		freemsg(mp);
1423		KSPINCR(tks_drops);
1424		return;
1425	}
1426
1427	udi_size =  sizeof (struct T_unitdata_ind) +
1428	    sizeof (struct sockaddr_dl);
1429	mp1 = allocb(udi_size, BPRI_MED);
1430	if (mp1 == NULL) {
1431		freemsg(mp);
1432		KSPINCR(tks_drops);
1433		return;
1434	}
1435
1436	mp1->b_cont = mp;
1437	mp = mp1;
1438	mp->b_datap->db_type = M_PROTO;
1439	/* LINTED: alignment */
1440	tudi = (struct T_unitdata_ind *)mp->b_rptr;
1441	mp->b_wptr = (uchar_t *)tudi + udi_size;
1442
1443	tudi->PRIM_type = T_UNITDATA_IND;
1444	tudi->SRC_length = sizeof (struct sockaddr_dl);
1445	tudi->SRC_offset = sizeof (struct T_unitdata_ind);
1446	tudi->OPT_length = 0;
1447	tudi->OPT_offset = sizeof (struct T_unitdata_ind) +
1448	    sizeof (struct sockaddr_dl);
1449
1450	/* Information of the link on which packet was received. */
1451	sdl = (struct sockaddr_dl *)&tudi[1];
1452	(void) memset(sdl, 0, sizeof (struct sockaddr_dl));
1453	sdl->sdl_family = AF_TRILL;
1454
1455	/* LINTED: alignment */
1456	*(datalink_id_t *)sdl->sdl_data = tsock->ts_link->bl_linkid;
1457	sdl->sdl_nlen = sizeof (tsock->ts_link->bl_linkid);
1458
1459	lladdr = LLADDR(sdl);
1460	(void) memcpy(lladdr, saddr, ETHERADDRL);
1461	lladdr += ETHERADDRL;
1462	sdl->sdl_alen = ETHERADDRL;
1463
1464	/* LINTED: alignment */
1465	*(uint16_t *)lladdr = tci;
1466	sdl->sdl_slen = sizeof (uint16_t);
1467
1468	DTRACE_PROBE2(trill__ctrl__input, trill_sock_t *, tsock, mblk_t *, mp);
1469	(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1470	    mp, msgdsize(mp), 0, &error, NULL);
1471
1472	if (error == ENOSPC) {
1473		mutex_enter(&tsock->ts_socklock);
1474		(*tsock->ts_conn_upcalls->su_recv)(tsock->ts_conn_upper_handle,
1475		    NULL, 0, 0, &error, NULL);
1476		if (error == ENOSPC)
1477			tsock->ts_flow_ctrld = B_TRUE;
1478		mutex_exit(&tsock->ts_socklock);
1479		KSPINCR(tks_drops);
1480	} else if (error != 0) {
1481		KSPINCR(tks_drops);
1482	} else {
1483		KSPINCR(tks_recv);
1484	}
1485
1486	DTRACE_PROBE2(trill__ctrl__input__done, trill_sock_t *,
1487	    tsock, int, error);
1488}
1489
1490/* ARGSUSED */
1491static void
1492trill_activate(sock_lower_handle_t proto_handle,
1493    sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls,
1494    int flags, cred_t *cr)
1495{
1496	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1497	struct sock_proto_props sopp;
1498
1499	tsock->ts_conn_upcalls = sock_upcalls;
1500	tsock->ts_conn_upper_handle = sock_handle;
1501
1502	sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT |
1503	    SOCKOPT_RCVLOWAT | SOCKOPT_MAXADDRLEN | SOCKOPT_MAXPSZ |
1504	    SOCKOPT_MAXBLK | SOCKOPT_MINPSZ;
1505	sopp.sopp_wroff = 0;
1506	sopp.sopp_rxhiwat = SOCKET_RECVHIWATER;
1507	sopp.sopp_rxlowat = SOCKET_RECVLOWATER;
1508	sopp.sopp_maxaddrlen = sizeof (struct sockaddr_dl);
1509	sopp.sopp_maxpsz = INFPSZ;
1510	sopp.sopp_maxblk = INFPSZ;
1511	sopp.sopp_minpsz = 0;
1512	(*tsock->ts_conn_upcalls->su_set_proto_props)(
1513	    tsock->ts_conn_upper_handle, &sopp);
1514}
1515
1516/* ARGSUSED */
1517static int
1518trill_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr)
1519{
1520	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1521
1522	trill_do_close(tsock);
1523	return (0);
1524}
1525
1526/* ARGSUSED */
1527static int
1528trill_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa,
1529    socklen_t len, cred_t *cr)
1530{
1531	int error;
1532	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1533
1534	if (sa == NULL)
1535		error = trill_do_unbind(tsock);
1536	else
1537		error = trill_start_recv(tsock, sa, len);
1538
1539	return (error);
1540}
1541
1542/* ARGSUSED */
1543static int
1544trill_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
1545    cred_t *cr)
1546{
1547	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1548	struct sockaddr_dl *laddr;
1549	uint16_t tci;
1550
1551	ASSERT(DB_TYPE(mp) == M_DATA);
1552	ASSERT(!(tsock->ts_flags & TSF_SHUTDOWN));
1553
1554	if (msg->msg_name == NULL || msg->msg_namelen != sizeof (*laddr))
1555		goto eproto;
1556
1557	/*
1558	 * The name is a datalink_id_t, the address is an Ethernet address, and
1559	 * the selector value is the VLAN ID.
1560	 */
1561	laddr = (struct sockaddr_dl *)msg->msg_name;
1562	if (laddr->sdl_nlen != sizeof (datalink_id_t) ||
1563	    laddr->sdl_alen != ETHERADDRL ||
1564	    (laddr->sdl_slen != sizeof (tci) && laddr->sdl_slen != 0))
1565		goto eproto;
1566
1567	mutex_enter(&tsock->ts_socklock);
1568	if (tsock->ts_state != TS_IDLE || tsock->ts_link == NULL) {
1569		mutex_exit(&tsock->ts_socklock);
1570		goto eproto;
1571	}
1572	atomic_inc_uint(&tsock->ts_sockthreadcount);
1573	mutex_exit(&tsock->ts_socklock);
1574
1575	/*
1576	 * Safe to dereference VLAN now, as we've checked the user's specified
1577	 * values, and alignment is now guaranteed.
1578	 */
1579	if (laddr->sdl_slen == 0) {
1580		tci = TRILL_NO_TCI;
1581	} else {
1582		/* LINTED: alignment */
1583		tci = *(uint16_t *)(LLADDR(laddr) + ETHERADDRL);
1584	}
1585
1586	mp = create_trill_header(tsock, mp, (const uchar_t *)LLADDR(laddr),
1587	    B_TRUE, B_FALSE, tci, msgdsize(mp));
1588	if (mp != NULL) {
1589		mp = bridge_trill_output(tsock->ts_link, mp);
1590		if (mp == NULL) {
1591			KSPINCR(tks_sent);
1592		} else {
1593			freemsg(mp);
1594			KSPINCR(tks_drops);
1595		}
1596	}
1597
1598	/* Wake up any threads blocking on us */
1599	if (atomic_dec_uint_nv(&tsock->ts_sockthreadcount) == 0)
1600		cv_broadcast(&tsock->ts_sockthreadwait);
1601	return (0);
1602
1603eproto:
1604	freemsg(mp);
1605	KSPINCR(tks_drops);
1606	return (EPROTO);
1607}
1608
1609/* ARGSUSED */
1610static int
1611trill_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg,
1612    int mode, int32_t *rvalp, cred_t *cr)
1613{
1614	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1615	int rc;
1616
1617	switch (cmd) {
1618	/* List of unprivileged TRILL ioctls */
1619	case TRILL_GETNICK:
1620	case TRILL_GETBRIDGE:
1621	case TRILL_LISTNICK:
1622		break;
1623	default:
1624		if (secpolicy_dl_config(cr) != 0)
1625			return (EPERM);
1626		break;
1627	}
1628
1629	/* Lock ensures socket state is unchanged during ioctl handling */
1630	mutex_enter(&tsock->ts_socklock);
1631	rc = trill_do_ioctl(tsock, cmd, (void *)arg, mode);
1632	mutex_exit(&tsock->ts_socklock);
1633	return (rc);
1634}
1635
1636static void
1637trill_clr_flowctrl(sock_lower_handle_t proto_handle)
1638{
1639	trill_sock_t *tsock = (trill_sock_t *)proto_handle;
1640
1641	mutex_enter(&tsock->ts_socklock);
1642	tsock->ts_flow_ctrld = B_FALSE;
1643	mutex_exit(&tsock->ts_socklock);
1644}
1645
/*
 * Downcall vector handed to the socket framework by trill_create().  Only
 * activate, bind, send, flow-control clearing, ioctl and close are
 * implemented by this module; the remaining slots are filled with the
 * generic "not supported" handlers or left NULL.
 */
static sock_downcalls_t sock_trill_downcalls = {
	trill_activate,			/* sd_activate */
	sock_accept_notsupp,		/* sd_accept */
	trill_bind,			/* sd_bind */
	sock_listen_notsupp,		/* sd_listen */
	sock_connect_notsupp,		/* sd_connect */
	sock_getpeername_notsupp,	/* sd_getpeername */
	sock_getsockname_notsupp,	/* sd_getsockname */
	sock_getsockopt_notsupp,	/* sd_getsockopt */
	sock_setsockopt_notsupp,	/* sd_setsockopt */
	trill_send,			/* sd_send */
	NULL,				/* sd_send_uio */
	NULL,				/* sd_recv_uio */
	NULL,				/* sd_poll */
	sock_shutdown_notsupp,		/* sd_shutdown */
	trill_clr_flowctrl,		/* sd_setflowctrl */
	trill_ioctl,			/* sd_ioctl */
	trill_close			/* sd_close */
};
1665
1666/* ARGSUSED */
1667static sock_lower_handle_t
1668trill_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls,
1669    uint_t *smodep, int *errorp, int flags, cred_t *credp)
1670{
1671	trill_sock_t *tsock;
1672
1673	if (family != AF_TRILL || type != SOCK_DGRAM || proto != 0) {
1674		*errorp = EPROTONOSUPPORT;
1675		return (NULL);
1676	}
1677
1678	*sock_downcalls = &sock_trill_downcalls;
1679	*smodep = SM_ATOMIC;
1680	tsock = trill_do_open(flags);
1681	*errorp = (tsock != NULL) ? 0:ENOMEM;
1682	return ((sock_lower_handle_t)tsock);
1683}
1684