/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/ksynch.h>
#include <sys/dlpi.h>			/* HCKSUM_INET_FULL_V4 */
#include <sys/pattr.h>			/* HCK_FULLCKSUM */
#include <sys/ib/mgt/sm_attr.h>		/* SM_INIT_TYPE_REPLY_... */

#include <sys/ib/clients/eoib/eib_impl.h>

/*
 * Declarations private to this file
 */
static void eib_ibt_reset_partitions(eib_t *);
static void eib_ibt_wakeup_sqd_waiters(eib_t *, ibt_channel_hdl_t);
static int eib_ibt_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t, boolean_t,
    boolean_t *);
static boolean_t eib_ibt_has_chan_pkey_changed(eib_t *, eib_chan_t *);
static boolean_t eib_ibt_has_any_pkey_changed(eib_t *);
static int eib_ibt_fill_avect(eib_t *, eib_avect_t *, ib_lid_t);
static void eib_ibt_record_srate(eib_t *);

/*
 * Definitions private to this file
 */

/*
 * SM's init type reply flags
 */
#define	EIB_PORT_ATTR_LOADED(itr)				\
	(((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0)
#define	EIB_PORT_ATTR_NOT_PRESERVED(itr)			\
	(((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)
#define	EIB_PORT_PRES_NOT_PRESERVED(itr)			\
	(((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 0)
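
/*
 * The InitTypeReply field in PortInfo describes how the SM brought the
 * port back up: whether it loaded the port attributes at all, whether
 * the attribute contents from before the reinit were preserved, and
 * whether our multicast presence was preserved.  Each macro above tests
 * one of these bits; eib_ibt_link_mod() uses them to decide how much
 * recovery work is needed.
 */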

/*
 * eib_ibt_hca_init() initialization progress flags
 */
#define	EIB_HCAINIT_HCA_OPENED		0x01
#define	EIB_HCAINIT_ATTRS_ALLOCD	0x02
#define	EIB_HCAINIT_HCA_PORTS_QUERIED	0x04
#define	EIB_HCAINIT_PD_ALLOCD		0x08
#define	EIB_HCAINIT_CAPAB_RECORDED	0x10

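/*
 * eib_ibt_hca_init() records each completed setup step in a 'progress'
 * bitmap using the flags above; on failure, the bitmap is handed to
 * eib_rb_ibt_hca_init(), which rolls back exactly the steps that had
 * completed.  Passing ~0 (as the HCA detach handling does) tears down
 * everything unconditionally.
 */
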
int
eib_ibt_hca_init(eib_t *ss)
{
	ibt_status_t ret;
	ibt_hca_portinfo_t *pi;
	uint_t num_pi;
	uint_t sz_pi;
	uint_t progress = 0;

	if (ss->ei_hca_hdl)
		return (EIB_E_SUCCESS);

	/*
	 * Open the HCA
	 */
	ret = ibt_open_hca(ss->ei_ibt_hdl, ss->ei_props->ep_hca_guid,
	    &ss->ei_hca_hdl);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "ibt_open_hca(hca_guid=0x%llx) "
		    "failed, ret=%d", ss->ei_props->ep_hca_guid, ret);
		goto ibt_hca_init_fail;
	}
	progress |= EIB_HCAINIT_HCA_OPENED;

	/*
	 * Query and store HCA attributes
	 */
	ss->ei_hca_attrs = kmem_zalloc(sizeof (ibt_hca_attr_t), KM_SLEEP);
	progress |= EIB_HCAINIT_ATTRS_ALLOCD;

	ret = ibt_query_hca(ss->ei_hca_hdl, ss->ei_hca_attrs);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "ibt_query_hca(hca_hdl=0x%llx, "
		    "hca_guid=0x%llx) failed, ret=%d",
		    ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
		goto ibt_hca_init_fail;
	}

	/*
	 * At this point we don't care about the link state; we only want
	 * to record our invariant base port guid and mtu.
	 */
	ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
	    &pi, &num_pi, &sz_pi);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "ibt_query_hca_ports(hca_hdl=0x%llx, "
		    "port=0x%x) failed, ret=%d", ss->ei_hca_hdl,
		    ss->ei_props->ep_port_num, ret);
		goto ibt_hca_init_fail;
	}
	if (num_pi != 1) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "ibt_query_hca_ports(hca_hdl=0x%llx, "
		    "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl,
		    ss->ei_props->ep_port_num, num_pi);
		ibt_free_portinfo(pi, sz_pi);
		goto ibt_hca_init_fail;
	}

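	/*
	 * Note that p_mtu below is the IB MTU enum (1 => 256 bytes, up
	 * through 5 => 4096 bytes), so shifting 128 left by it yields
	 * the port MTU in bytes.
	 */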
	ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
	ss->ei_props->ep_mtu = (128 << pi->p_mtu);
	ibt_free_portinfo(pi, sz_pi);

	progress |= EIB_HCAINIT_HCA_PORTS_QUERIED;

	/*
	 * Allocate a protection domain for all our transactions
	 */
	ret = ibt_alloc_pd(ss->ei_hca_hdl, IBT_PD_NO_FLAGS, &ss->ei_pd_hdl);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_ERR(ss->ei_instance,
		    "ibt_alloc_pd(hca_hdl=0x%llx, "
		    "hca_guid=0x%llx) failed, ret=%d",
		    ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
		goto ibt_hca_init_fail;
	}
	progress |= EIB_HCAINIT_PD_ALLOCD;

	/*
	 * Finally, record the capabilities
	 */
	ss->ei_caps = kmem_zalloc(sizeof (eib_caps_t), KM_SLEEP);
	eib_ibt_record_capab(ss, ss->ei_hca_attrs, ss->ei_caps);
	eib_ibt_record_srate(ss);

	progress |= EIB_HCAINIT_CAPAB_RECORDED;

	return (EIB_E_SUCCESS);

ibt_hca_init_fail:
	eib_rb_ibt_hca_init(ss, progress);
	return (EIB_E_FAILURE);
}

void
eib_ibt_link_mod(eib_t *ss)
{
	eib_node_state_t *ns = ss->ei_node_state;
	ibt_hca_portinfo_t *pi;
	ibt_status_t ret;
	uint8_t vn0_mac[ETHERADDRL];
	boolean_t all_zombies = B_FALSE;
	boolean_t all_need_rejoin = B_FALSE;
	uint_t num_pi;
	uint_t sz_pi;
	uint8_t itr;

	if (ns->ns_link_state == LINK_STATE_UNKNOWN)
		return;

	/*
	 * See if we can get the port attributes; if not, we're as good as
	 * down.  Note that if ibt_query_hca_ports() itself fails, the
	 * portinfo pointer is not valid, so we must not touch or free it.
	 */
	ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
	    &pi, &num_pi, &sz_pi);
	if (ret != IBT_SUCCESS) {
		eib_mac_link_down(ss, B_FALSE);
		return;
	}
	if (pi->p_linkstate != IBT_PORT_ACTIVE) {
		ibt_free_portinfo(pi, sz_pi);
		eib_mac_link_down(ss, B_FALSE);
		return;
	}

	/*
	 * If the SM re-initialized the port attributes, but did not preserve
	 * the old attributes, we need to check more.
	 */
	itr = pi->p_init_type_reply;
	if (EIB_PORT_ATTR_LOADED(itr) && EIB_PORT_ATTR_NOT_PRESERVED(itr)) {
		/*
		 * We're just coming back up; if we see that our base lid
		 * or sgid table has changed, we'll update these and try to
		 * restart all active vnics. If any of the vnic pkeys have
		 * changed, we'll reset the affected channels to the new pkey.
		 */
		if (bcmp(pi->p_sgid_tbl, &ss->ei_props->ep_sgid,
		    sizeof (ib_gid_t)) != 0) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_link_mod: port sgid table changed "
			    "(old %llx.%llx != new %llx.%llx), "
			    "all vnics are zombies now.",
			    ss->ei_props->ep_sgid.gid_prefix,
			    ss->ei_props->ep_sgid.gid_guid,
			    pi->p_sgid_tbl[0].gid_prefix,
			    pi->p_sgid_tbl[0].gid_guid);

			ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
			all_zombies = B_TRUE;

		} else if (ss->ei_props->ep_blid != pi->p_base_lid) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_link_mod: port base lid changed "
			    "(old 0x%x != new 0x%x), "
			    "all vnics are zombies now.",
			    ss->ei_props->ep_blid, pi->p_base_lid);

			ss->ei_props->ep_blid = pi->p_base_lid;
			all_zombies = B_TRUE;

		} else if (eib_ibt_has_any_pkey_changed(ss)) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_link_mod: pkey has changed for vnic(s), "
			    "resetting all partitions");

			eib_ibt_reset_partitions(ss);
		}
	}

	if (pi) {
		ibt_free_portinfo(pi, sz_pi);
	}

	/*
	 * If the SM hasn't preserved our presence in MCGs, we need to
	 * rejoin all of them.
	 */
	if (EIB_PORT_PRES_NOT_PRESERVED(itr)) {
		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
		    "hca_guid=0x%llx, port=0x%x presence not preserved in SM, "
		    "rejoining all mcgs", ss->ei_props->ep_hca_guid,
		    ss->ei_props->ep_port_num);

		all_need_rejoin = B_TRUE;
	}

	/*
	 * Before we do the actual work of restarting/rejoining, we need to
	 * see if the GW is reachable right now.  If not, we continue to
	 * keep our link "down."  Whenever the GW becomes reachable again,
	 * we'll restart/rejoin all the vnics we've just marked.
	 */
	mutex_enter(&ss->ei_vnic_lock);
	if (all_zombies) {
		ss->ei_zombie_vnics = ss->ei_active_vnics;
	}
	if (all_need_rejoin) {
		ss->ei_rejoin_vnics = ss->ei_active_vnics;
	}
	if (ss->ei_gw_unreachable) {
		mutex_exit(&ss->ei_vnic_lock);

		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_link_mod: "
		    "gateway (gw_port=0x%x) unreachable for "
		    "hca_guid=0x%llx, port=0x%x, link state down",
		    ss->ei_gw_props->pp_gw_portid, ss->ei_props->ep_hca_guid,
		    ss->ei_props->ep_port_num);

		eib_mac_link_down(ss, B_FALSE);
		return;
	}
	mutex_exit(&ss->ei_vnic_lock);

	/*
	 * Try to awaken the dead if possible
	 */
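	/*
	 * vn0_mac starts out as the zero mac.  If the resurrection below
	 * succeeds, eib_vnic_resurrect_zombies() fills it in with the
	 * restarted primary vnic's current unicast mac, which we compare
	 * against eib_zero_mac further down to decide whether the mac
	 * layer needs an address update.
	 */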
	bcopy(eib_zero_mac, vn0_mac, ETHERADDRL);
	if (all_zombies) {
		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
		    "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
		    "attempting to resurrect zombies",
		    ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
		    ss->ei_gw_props->pp_gw_portid);

		eib_vnic_resurrect_zombies(ss, vn0_mac);
	}

	/*
	 * Re-join the mcgs if we need to
	 */
	if (all_need_rejoin) {
		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
		    "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
		    "attempting to rejoin mcgs",
		    ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
		    ss->ei_gw_props->pp_gw_portid);

		eib_vnic_rejoin_mcgs(ss);
	}

	/*
	 * If we've restarted the zombies because the gateway went down and
	 * came back, it is possible our unicast mac address changed from
	 * what it was earlier. If so, we need to update our unicast address
	 * with the mac layer before marking the link up.
	 */
	if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0)
		mac_unicst_update(ss->ei_mac_hdl, vn0_mac);

	/*
	 * Notify the link state up if required
	 */
	eib_mac_link_up(ss, B_FALSE);
}

int
eib_ibt_modify_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t pkey)
{
	/*
	 * Make sure the channel pkey and index are set to what we need
	 */
	return (eib_ibt_chan_pkey(ss, chan, pkey, B_TRUE, NULL));
}

eib_avect_t *
eib_ibt_hold_avect(eib_t *ss, ib_lid_t dlid, uint8_t sl)
{
	uint_t ndx = dlid % EIB_AV_NBUCKETS;	/* simple hashing */
	eib_avect_t *av;
	eib_avect_t *prev;
	int ret;

	mutex_enter(&ss->ei_av_lock);

	/*
	 * See if we have the address vector
	 */
	prev = NULL;
	for (av = ss->ei_av[ndx]; av; av = av->av_next) {
		prev = av;
		if ((av->av_vect).av_dlid == dlid)
			break;
	}

	/*
	 * If we don't have it, create a new one and chain it to
	 * the same bucket
	 */
	if (av == NULL) {
		av = kmem_zalloc(sizeof (eib_avect_t), KM_NOSLEEP);
		if (av == NULL) {
			mutex_exit(&ss->ei_av_lock);
			EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_hold_avect: "
			    "no memory, could not allocate address vector");
			return (NULL);
		}

		ret = EIB_E_FAILURE;
		if (!eib_wa_no_av_discover)
			ret = eib_ibt_fill_avect(ss, av, dlid);

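		/*
		 * If path discovery is disabled or fails, fall back to
		 * fixed defaults; IBT_SRATE_10 matches the rate EoIB
		 * currently assumes for all destinations (see the note
		 * above eib_ibt_record_srate()).
		 */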
		if (ret != EIB_E_SUCCESS) {
			(av->av_vect).av_srate = IBT_SRATE_10;
			(av->av_vect).av_srvl = sl;
			(av->av_vect).av_port_num = ss->ei_props->ep_port_num;
			(av->av_vect).av_send_grh = B_FALSE;
			(av->av_vect).av_dlid = dlid;
			(av->av_vect).av_src_path = 0;	/* we use base lid */
		}

		if (prev)
			prev->av_next = av;
		else
			ss->ei_av[ndx] = av;
	}

	/*
	 * Increment the address vector reference count before returning
	 */
	(av->av_ref)++;

	mutex_exit(&ss->ei_av_lock);

	return (av);
}

static int
eib_ibt_fill_avect(eib_t *ss, eib_avect_t *av, ib_lid_t dlid)
{
	ibt_node_info_t ni;
	ibt_path_attr_t attr;
	ibt_path_info_t path;
	ibt_status_t ret;
	ib_gid_t dgid;

	if ((ret = ibt_lid_to_node_info(dlid, &ni)) != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
		    "ibt_lid_to_node_info(dlid=0x%x) failed, ret=%d",
		    dlid, ret);
		return (EIB_E_FAILURE);
	}
	dgid.gid_prefix = ss->ei_gw_props->pp_gw_sn_prefix;
	dgid.gid_guid = ni.n_port_guid;

	/*
	 * Get the reversible path information for this destination
	 */
	bzero(&attr, sizeof (ibt_path_attr_t));
	attr.pa_sgid = ss->ei_props->ep_sgid;
	attr.pa_dgids = &dgid;
	attr.pa_num_dgids = 1;

	bzero(&path, sizeof (ibt_path_info_t));
	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
	    &attr, 1, &path, NULL);
	if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
		    "ibt_get_paths(dgid=%llx.%llx) failed, ret=%d",
		    dgid.gid_prefix, dgid.gid_guid, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * Fill in the address vector
	 */
	bcopy(&path.pi_prim_cep_path.cep_adds_vect, &av->av_vect,
	    sizeof (ibt_adds_vect_t));

	return (EIB_E_SUCCESS);
}

void
eib_ibt_release_avect(eib_t *ss, eib_avect_t *av)
{
	mutex_enter(&ss->ei_av_lock);

	ASSERT(av->av_ref > 0);
	(av->av_ref)--;

	mutex_exit(&ss->ei_av_lock);
}

void
eib_ibt_free_avects(eib_t *ss)
{
	eib_avect_t *av;
	eib_avect_t *av_next;
	int ndx;

	mutex_enter(&ss->ei_av_lock);
	for (ndx = 0; ndx < EIB_AV_NBUCKETS; ndx++) {
		for (av = ss->ei_av[ndx]; av; av = av_next) {
			av_next = av->av_next;

			ASSERT(av->av_ref == 0);
			kmem_free(av, sizeof (eib_avect_t));
		}
		ss->ei_av[ndx] = NULL;
	}
	mutex_exit(&ss->ei_av_lock);
}

/*ARGSUSED*/
void
eib_ibt_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *event)
{
	eib_t *ss = (eib_t *)clnt_private;
	eib_event_t *evi;
	uint_t ev_code;

	ev_code = EIB_EV_NONE;

	switch (code) {
	case IBT_EVENT_SQD:
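		/*
		 * A send queue we paused with IBT_CEP_SET_SQD_EVENT in
		 * eib_ibt_chan_pkey() has finished draining; wake up the
		 * thread cv_wait()ing for the channel to reach SQD.
		 */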
		EIB_DPRINTF_VERBOSE(ss->ei_instance,
		    "eib_ibt_async_handler: got IBT_EVENT_SQD");
		eib_ibt_wakeup_sqd_waiters(ss, event->ev_chan_hdl);
		break;

	case IBT_EVENT_PORT_UP:
		if (event->ev_port == ss->ei_props->ep_port_num) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_async_handler: got IBT_EVENT_PORT_UP");
			ev_code = EIB_EV_PORT_UP;
		}
		break;

	case IBT_ERROR_PORT_DOWN:
		if (event->ev_port == ss->ei_props->ep_port_num) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_async_handler: got IBT_ERROR_PORT_DOWN");
			ev_code = EIB_EV_PORT_DOWN;
		}
		break;

	case IBT_CLNT_REREG_EVENT:
		if (event->ev_port == ss->ei_props->ep_port_num) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_async_handler: got IBT_CLNT_REREG_EVENT");
			ev_code = EIB_EV_CLNT_REREG;
		}
		break;

	case IBT_PORT_CHANGE_EVENT:
		if ((event->ev_port == ss->ei_props->ep_port_num) &&
		    (event->ev_port_flags & IBT_PORT_CHANGE_PKEY)) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_async_handler: "
			    "got IBT_PORT_CHANGE_EVENT(PKEY_CHANGE)");
			ev_code = EIB_EV_PKEY_CHANGE;
		} else if ((event->ev_port == ss->ei_props->ep_port_num) &&
		    (event->ev_port_flags & IBT_PORT_CHANGE_SGID)) {
			EIB_DPRINTF_VERBOSE(ss->ei_instance,
			    "eib_ibt_async_handler: "
			    "got IBT_PORT_CHANGE_EVENT(SGID_CHANGE)");
			ev_code = EIB_EV_SGID_CHANGE;
		}
		break;

	case IBT_HCA_ATTACH_EVENT:
		/*
		 * For HCA attach, after a new HCA is plugged in and
		 * configured using cfgadm, an explicit plumb will need
		 * to be run, so we don't need to do anything here.
		 */
		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
		    "got IBT_HCA_ATTACH_EVENT");
		break;

	case IBT_HCA_DETACH_EVENT:
		/*
		 * Before an HCA unplug, cfgadm is expected to trigger
		 * any rcm scripts to unplumb the EoIB instances on the
		 * card. If so, we should not be holding any hca resource,
		 * since we don't do ibt_open_hca() until plumb time. However,
		 * if an earlier unplumb hadn't cleaned up the hca resources
		 * properly because the network layer hadn't returned the
		 * buffers at that time, we could be holding hca resources.
		 * We'll try to release them here, and protect the code from
		 * racing with some other plumb/unplumb operation.
		 */
		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
		    "got IBT_HCA_DETACH_EVENT");

		eib_mac_set_nic_state(ss, EIB_NIC_STOPPING);
		eib_rb_rsrc_setup_bufs(ss, B_FALSE);
		if (ss->ei_tx || ss->ei_rx || ss->ei_lso) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_ibt_async_handler: nw layer still holding "
			    "hca resources, could not detach HCA");
		} else if (ss->ei_hca_hdl) {
			eib_rb_ibt_hca_init(ss, ~0);
		}
		eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING);

		break;
	}

	if (ev_code != EIB_EV_NONE) {
		evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
		if (evi == NULL) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "eib_ibt_async_handler: "
			    "no memory, could not handle event 0x%x", ev_code);
		} else {
			evi->ev_code = ev_code;
			evi->ev_arg = NULL;
			eib_svc_enqueue_event(ss, evi);
		}
	}
}

/*ARGSUSED*/
void
eib_ibt_record_capab(eib_t *ss, ibt_hca_attr_t *hca_attrs, eib_caps_t *caps)
{
	uint_t max_swqe = EIB_DATA_MAX_SWQE;
	uint_t max_rwqe = EIB_DATA_MAX_RWQE;

	/*
	 * Checksum
	 */
	caps->cp_cksum_flags = 0;
	if ((!eib_wa_no_cksum_offload) &&
	    (hca_attrs->hca_flags & IBT_HCA_CKSUM_FULL)) {
		caps->cp_cksum_flags =
		    HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
		    /* HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; */
	}
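	/*
	 * Note that cp_cksum_flags mixes two namespaces:
	 * HCKSUM_INET_FULL_V4 (from dlpi.h) is the capability bit we
	 * report to the mac layer, while HCK_FULLCKSUM (from pattr.h)
	 * is the per-packet attribute; keeping both in one word
	 * presumably lets it serve both purposes.
	 */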

	/*
	 * Reserved L-Key
	 */
	if (hca_attrs->hca_flags2 & IBT_HCA2_RES_LKEY) {
		caps->cp_resv_lkey_capab = 1;
		caps->cp_resv_lkey = hca_attrs->hca_reserved_lkey;
	}

	/*
	 * LSO
	 */
	caps->cp_lso_maxlen = 0;
	if (!eib_wa_no_lso) {
		if (hca_attrs->hca_max_lso_size > EIB_LSO_MAXLEN) {
			caps->cp_lso_maxlen = EIB_LSO_MAXLEN;
		} else {
			caps->cp_lso_maxlen = hca_attrs->hca_max_lso_size;
		}
	}

	/*
	 * SGL
	 *
	 * Translating virtual address regions into physical regions
	 * for the Reserved LKey feature results in a wr sgl that is a
	 * little longer. Since failing ibt_map_mem_iov() is costly,
	 * we'll record a high-water mark (65% of the max sgl size)
	 * beyond which we should stop trying to use the Reserved LKey.
	 */
	if (hca_attrs->hca_flags & IBT_HCA_WQE_SIZE_INFO) {
		caps->cp_max_sgl = hca_attrs->hca_ud_send_sgl_sz;
	} else {
		caps->cp_max_sgl = hca_attrs->hca_max_sgl;
	}
	if (caps->cp_max_sgl > EIB_MAX_SGL) {
		caps->cp_max_sgl = EIB_MAX_SGL;
	}
	caps->cp_hiwm_sgl = (caps->cp_max_sgl * 65) / 100;

	/*
	 * SWQE/RWQE: meet max chan size and max cq size limits (leave room
	 * to avoid cq overflow event)
	 */
	if (max_swqe > hca_attrs->hca_max_chan_sz)
		max_swqe = hca_attrs->hca_max_chan_sz;
	if (max_swqe > (hca_attrs->hca_max_cq_sz - 1))
		max_swqe = hca_attrs->hca_max_cq_sz - 1;
	caps->cp_max_swqe = max_swqe;

	if (max_rwqe > hca_attrs->hca_max_chan_sz)
		max_rwqe = hca_attrs->hca_max_chan_sz;
	if (max_rwqe > (hca_attrs->hca_max_cq_sz - 1))
		max_rwqe = hca_attrs->hca_max_cq_sz - 1;
	caps->cp_max_rwqe = max_rwqe;
}

void
eib_rb_ibt_hca_init(eib_t *ss, uint_t progress)
{
	ibt_status_t ret;

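	/*
	 * Tear down the eib_ibt_hca_init() steps in the reverse order
	 * of their setup, guided by the progress bitmap.
	 */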
	if (progress & EIB_HCAINIT_CAPAB_RECORDED) {
		if (ss->ei_caps) {
			kmem_free(ss->ei_caps, sizeof (eib_caps_t));
			ss->ei_caps = NULL;
		}
	}

	if (progress & EIB_HCAINIT_PD_ALLOCD) {
		if (ss->ei_pd_hdl) {
			ret = ibt_free_pd(ss->ei_hca_hdl, ss->ei_pd_hdl);
			if (ret != IBT_SUCCESS) {
				EIB_DPRINTF_WARN(ss->ei_instance,
				    "eib_rb_ibt_hca_init: "
				    "ibt_free_pd(hca_hdl=0x%lx, pd_hdl=0x%lx) "
				    "failed, ret=%d", ss->ei_hca_hdl,
				    ss->ei_pd_hdl, ret);
			}
			ss->ei_pd_hdl = NULL;
		}
	}

	if (progress & EIB_HCAINIT_HCA_PORTS_QUERIED) {
		ss->ei_props->ep_mtu = 0;
		bzero(&ss->ei_props->ep_sgid, sizeof (ib_gid_t));
	}

	if (progress & EIB_HCAINIT_ATTRS_ALLOCD) {
		kmem_free(ss->ei_hca_attrs, sizeof (ibt_hca_attr_t));
		ss->ei_hca_attrs = NULL;
	}

	if (progress & EIB_HCAINIT_HCA_OPENED) {
		ret = ibt_close_hca(ss->ei_hca_hdl);
		if (ret != IBT_SUCCESS) {
			EIB_DPRINTF_WARN(ss->ei_instance,
			    "ibt_close_hca(hca_hdl=0x%lx) failed, "
			    "ret=%d", ss->ei_hca_hdl, ret);
		}
		ss->ei_hca_hdl = NULL;
	}
}

static void
eib_ibt_reset_partitions(eib_t *ss)
{
	eib_vnic_t *vnic;
	eib_chan_t *chan = NULL;
	uint64_t av;
	int inst = 0;

	/*
	 * We already have the vhub pkey recorded in our eib_chan_t.
	 * We only need to make sure our pkey index still matches it.
	 * If not, modify the channel appropriately and update our
	 * records.
	 */
	if ((chan = ss->ei_admin_chan) != NULL)
		(void) eib_ibt_modify_chan_pkey(ss, chan, chan->ch_pkey);

	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
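	/*
	 * Walk the bitmap of active vnic instances, lowest set bit first,
	 * and re-verify the pkey index on each vnic's control and data
	 * channels.
	 */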
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			if ((chan = vnic->vn_ctl_chan) != NULL) {
				(void) eib_ibt_modify_chan_pkey(ss, chan,
				    chan->ch_pkey);
			}
			if ((chan = vnic->vn_data_chan) != NULL) {
				(void) eib_ibt_modify_chan_pkey(ss, chan,
				    chan->ch_pkey);
			}
		}
		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);
}

static void
eib_ibt_wakeup_sqd_waiters(eib_t *ss, ibt_channel_hdl_t ev_chan_hdl)
{
	eib_vnic_t *vnic;
	eib_chan_t *chan = NULL;
	uint64_t av;
	int inst = 0;

	/*
	 * See if this channel has been waiting for its queue to drain.
	 *
	 * Note that since this is especially likely to be called while
	 * we're logging in to the gateway, we also need to check the vnic
	 * currently being created.
	 */
	mutex_enter(&ss->ei_vnic_lock);

	if ((vnic = ss->ei_vnic_pending) != NULL) {
		chan = vnic->vn_ctl_chan;
		if ((chan) && (chan->ch_chan == ev_chan_hdl))
			goto wakeup_sqd_waiters;

		chan = vnic->vn_data_chan;
		if ((chan) && (chan->ch_chan == ev_chan_hdl))
			goto wakeup_sqd_waiters;
	}

	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			chan = vnic->vn_ctl_chan;
			if (chan->ch_chan == ev_chan_hdl)
				break;

			chan = vnic->vn_data_chan;
			if (chan->ch_chan == ev_chan_hdl)
				break;
		}
		av &= (~((uint64_t)1 << inst));
	}
	if (inst == -1)
		chan = NULL;	/* the event is not for any channel we know */

wakeup_sqd_waiters:
	if (chan) {
		mutex_enter(&chan->ch_cep_lock);
		chan->ch_cep_state = IBT_STATE_SQD;
		cv_broadcast(&chan->ch_cep_cv);
		mutex_exit(&chan->ch_cep_lock);
	}

	mutex_exit(&ss->ei_vnic_lock);
}

static int
eib_ibt_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t new_pkey,
    boolean_t set, boolean_t *pkey_changed)
{
	ibt_qp_info_t qp_attr;
	ibt_status_t ret;
	uint16_t new_pkey_ix;

	ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
	    new_pkey, &new_pkey_ix);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
		    "ibt_pkey2index(hca_hdl=0x%llx, port_num=0x%x, "
		    "pkey=0x%x) failed, ret=%d",
		    ss->ei_hca_hdl, ss->ei_props->ep_port_num, new_pkey, ret);
		return (EIB_E_FAILURE);
	}

	/*
	 * If the pkey and pkey index we already have match the new ones,
	 * there's nothing to do.
	 */
	mutex_enter(&chan->ch_pkey_lock);
	if ((chan->ch_pkey == new_pkey) && (chan->ch_pkey_ix == new_pkey_ix)) {
		if (pkey_changed) {
			*pkey_changed = B_FALSE;
		}
		mutex_exit(&chan->ch_pkey_lock);
		return (EIB_E_SUCCESS);
	}
	if (pkey_changed) {
		*pkey_changed = B_TRUE;
	}
	mutex_exit(&chan->ch_pkey_lock);

	/*
	 * Otherwise, if we're asked only to test if the pkey index
	 * supplied matches the one recorded in the channel, return
	 * success, but don't set the pkey.
	 */
	if (!set) {
		return (EIB_E_SUCCESS);
	}

	/*
	 * Otherwise, we need to change the channel's pkey.  Pause the
	 * channel sendq first.
	 */
	ret = ibt_pause_sendq(chan->ch_chan, IBT_CEP_SET_SQD_EVENT);
	if (ret != IBT_SUCCESS) {
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
		    "ibt_pause_sendq(chan_hdl=0x%llx) failed, ret=%d",
		    chan->ch_chan, ret);
		return (EIB_E_FAILURE);
	}

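	/*
	 * ibt_pause_sendq() only initiates the send queue drain; the
	 * transition to SQD is reported asynchronously via IBT_EVENT_SQD,
	 * which eib_ibt_async_handler() turns into a cv_broadcast() on
	 * ch_cep_cv (see eib_ibt_wakeup_sqd_waiters()).
	 */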
	/*
	 * Wait for the channel to enter the IBT_STATE_SQD state
	 */
	mutex_enter(&chan->ch_cep_lock);
	while (chan->ch_cep_state != IBT_STATE_SQD)
		cv_wait(&chan->ch_cep_cv, &chan->ch_cep_lock);
	mutex_exit(&chan->ch_cep_lock);

	/*
	 * Modify the qp with the supplied pkey index and unpause the
	 * channel.  If either of these operations fails, we'll leave
	 * the channel in the paused state and fail.
	 */
	bzero(&qp_attr, sizeof (ibt_qp_info_t));

	qp_attr.qp_trans = IBT_UD_SRV;
	qp_attr.qp_current_state = IBT_STATE_SQD;
	qp_attr.qp_state = IBT_STATE_SQD;
	qp_attr.qp_transport.ud.ud_pkey_ix = new_pkey_ix;

	/*
	 * Modify the qp to set the new pkey index, then unpause the
	 * channel to put it back in the RTS state, and update the new
	 * values in our records.
	 */
	mutex_enter(&chan->ch_pkey_lock);

	ret = ibt_modify_qp(chan->ch_chan,
	    IBT_CEP_SET_STATE | IBT_CEP_SET_PKEY_IX, &qp_attr, NULL);
	if (ret != IBT_SUCCESS) {
		mutex_exit(&chan->ch_pkey_lock);
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
		    "ibt_modify_qp(chan_hdl=0x%llx, IBT_CEP_SET_PKEY_IX) "
		    "failed for new_pkey_ix=0x%x, ret=%d",
		    chan->ch_chan, new_pkey_ix, ret);
		return (EIB_E_FAILURE);
	}

	if ((ret = ibt_unpause_sendq(chan->ch_chan)) != IBT_SUCCESS) {
		mutex_exit(&chan->ch_pkey_lock);
		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
		    "ibt_unpause_sendq(chan_hdl=0x%llx) failed, ret=%d",
		    chan->ch_chan, ret);
		return (EIB_E_FAILURE);
	}

	chan->ch_pkey = new_pkey;
	chan->ch_pkey_ix = new_pkey_ix;
	mutex_exit(&chan->ch_pkey_lock);

	return (EIB_E_SUCCESS);
}

static boolean_t
eib_ibt_has_chan_pkey_changed(eib_t *ss, eib_chan_t *chan)
{
	boolean_t changed;
	int ret;

	/*
	 * Don't modify the pkey, just ask if the pkey index for the channel's
	 * pkey has changed for any reason.  If we fail, assume that the pkey
	 * has changed.
	 */
	ret = eib_ibt_chan_pkey(ss, chan, chan->ch_pkey, B_FALSE, &changed);
	if (ret != EIB_E_SUCCESS)
		changed = B_TRUE;

	return (changed);
}

static boolean_t
eib_ibt_has_any_pkey_changed(eib_t *ss)
{
	eib_vnic_t *vnic;
	eib_chan_t *chan = NULL;
	uint64_t av;
	int inst = 0;

	/*
	 * Return true if the pkey index of any of our pkeys (on the
	 * channels of all active vnics) has changed.
	 */

	chan = ss->ei_admin_chan;
	if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan)))
		return (B_TRUE);

	mutex_enter(&ss->ei_vnic_lock);
	av = ss->ei_active_vnics;
	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
		if ((vnic = ss->ei_vnic[inst]) != NULL) {
			chan = vnic->vn_ctl_chan;
			if ((chan) &&
			    (eib_ibt_has_chan_pkey_changed(ss, chan))) {
				mutex_exit(&ss->ei_vnic_lock);
				return (B_TRUE);
			}

			chan = vnic->vn_data_chan;
			if ((chan) &&
			    (eib_ibt_has_chan_pkey_changed(ss, chan))) {
				mutex_exit(&ss->ei_vnic_lock);
				return (B_TRUE);
			}
		}
		av &= (~((uint64_t)1 << inst));
	}
	mutex_exit(&ss->ei_vnic_lock);

	return (B_FALSE);
}

/*
 * This routine is currently used simply to derive and record the port
 * speed from the loopback path information (for debug purposes).  For
 * EoIB, the srate used in address vectors to IB neighbors and the
 * gateway is currently fixed at IBT_SRATE_10.  Eventually, though, this
 * information (and the sl) will have to come from the gateway for all
 * destinations in the vhub table.
 */
static void
eib_ibt_record_srate(eib_t *ss)
{
	ib_gid_t sgid = ss->ei_props->ep_sgid;
	ibt_srate_t srate = IBT_SRATE_10;
	ibt_path_info_t path;
	ibt_path_attr_t path_attr;
	ibt_status_t ret;
	uint8_t num_paths;

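	/*
	 * Ask for a path from our own port back to itself (dgid == sgid);
	 * the srate on this loopback path reflects the local port's link
	 * speed.
	 */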
	bzero(&path_attr, sizeof (path_attr));
	path_attr.pa_dgids = &sgid;
	path_attr.pa_num_dgids = 1;
	path_attr.pa_sgid = sgid;

	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
	    &path_attr, 1, &path, &num_paths);
	if (ret == IBT_SUCCESS && num_paths >= 1) {
		switch (srate = path.pi_prim_cep_path.cep_adds_vect.av_srate) {
		case IBT_SRATE_2:
		case IBT_SRATE_10:
		case IBT_SRATE_30:
		case IBT_SRATE_5:
		case IBT_SRATE_20:
		case IBT_SRATE_40:
		case IBT_SRATE_60:
		case IBT_SRATE_80:
		case IBT_SRATE_120:
			break;
		default:
			srate = IBT_SRATE_10;
		}
	}

	ss->ei_props->ep_srate = srate;

	EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ibt_record_srate: "
	    "srate = %d", srate);
}
