1/*
2 * Copyright (c) 2004-2009 Voltaire, Inc. All rights reserved.
3 * Copyright (c) 2002-2015 Mellanox Technologies LTD. All rights reserved.
4 * Copyright (c) 1996-2003 Intel Corporation. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 *
34 */
35
36/*
37 * Abstract:
38 *    Implementation of osm_ucast_mgr_t.
39 * This file implements the Unicast Manager object.
40 */
41
42#if HAVE_CONFIG_H
43#  include <config.h>
44#endif				/* HAVE_CONFIG_H */
45
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <ctype.h>
50#include <iba/ib_types.h>
51#include <complib/cl_qmap.h>
52#include <complib/cl_debug.h>
53#include <complib/cl_qlist.h>
54#include <opensm/osm_file_ids.h>
55#define FILE_ID OSM_FILE_UCAST_MGR_C
56#include <opensm/osm_ucast_mgr.h>
57#include <opensm/osm_sm.h>
58#include <opensm/osm_log.h>
59#include <opensm/osm_node.h>
60#include <opensm/osm_switch.h>
61#include <opensm/osm_helper.h>
62#include <opensm/osm_msgdef.h>
63#include <opensm/osm_opensm.h>
64
65void osm_ucast_mgr_construct(IN osm_ucast_mgr_t * p_mgr)
66{
67	memset(p_mgr, 0, sizeof(*p_mgr));
68}
69
70void osm_ucast_mgr_destroy(IN osm_ucast_mgr_t * p_mgr)
71{
72	CL_ASSERT(p_mgr);
73
74	OSM_LOG_ENTER(p_mgr->p_log);
75
76	if (p_mgr->cache_valid)
77		osm_ucast_cache_invalidate(p_mgr);
78
79	OSM_LOG_EXIT(p_mgr->p_log);
80}
81
82ib_api_status_t osm_ucast_mgr_init(IN osm_ucast_mgr_t * p_mgr, IN osm_sm_t * sm)
83{
84	ib_api_status_t status = IB_SUCCESS;
85
86	OSM_LOG_ENTER(sm->p_log);
87
88	osm_ucast_mgr_construct(p_mgr);
89
90	p_mgr->sm = sm;
91	p_mgr->p_log = sm->p_log;
92	p_mgr->p_subn = sm->p_subn;
93	p_mgr->p_lock = sm->p_lock;
94
95	if (sm->p_subn->opt.use_ucast_cache)
96		cl_qmap_init(&p_mgr->cache_sw_tbl);
97
98	OSM_LOG_EXIT(p_mgr->p_log);
99	return status;
100}
101
102/**********************************************************************
103 Add each switch's own and neighbor LIDs to its LID matrix
104**********************************************************************/
105static void ucast_mgr_process_hop_0_1(IN cl_map_item_t * p_map_item,
106				      IN void *context)
107{
108	osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
109	osm_node_t *p_remote_node;
110	uint16_t lid, remote_lid;
111	uint8_t i;
112
113	lid = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0));
114	osm_switch_set_hops(p_sw, lid, 0, 0);
115
116	for (i = 1; i < p_sw->num_ports; i++) {
117		osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, i);
118		p_remote_node = (p && p->p_remote_physp) ?
119		    p->p_remote_physp->p_node : NULL;
120
121		if (p_remote_node && p_remote_node->sw &&
122		    p_remote_node != p_sw->p_node) {
123			remote_lid = osm_node_get_base_lid(p_remote_node, 0);
124			remote_lid = cl_ntoh16(remote_lid);
125			osm_switch_set_hops(p_sw, remote_lid, i, p->hop_wf);
126		}
127	}
128}
129
130static void ucast_mgr_process_neighbor(IN osm_ucast_mgr_t * p_mgr,
131				       IN osm_switch_t * p_this_sw,
132				       IN osm_switch_t * p_remote_sw,
133				       IN uint8_t port_num,
134				       IN uint8_t remote_port_num)
135{
136	osm_switch_t *p_sw;
137	cl_map_item_t *item;
138	uint16_t lid_ho;
139	uint16_t hops;
140	osm_physp_t *p;
141
142	OSM_LOG_ENTER(p_mgr->p_log);
143
144	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
145		"Node 0x%" PRIx64 ", remote node 0x%" PRIx64
146		", port %u, remote port %u\n",
147		cl_ntoh64(osm_node_get_node_guid(p_this_sw->p_node)),
148		cl_ntoh64(osm_node_get_node_guid(p_remote_sw->p_node)),
149		port_num, remote_port_num);
150
151	p = osm_node_get_physp_ptr(p_this_sw->p_node, port_num);
152
153	for (item = cl_qmap_head(&p_mgr->p_subn->sw_guid_tbl);
154	     item != cl_qmap_end(&p_mgr->p_subn->sw_guid_tbl);
155	     item = cl_qmap_next(item)) {
156		p_sw = (osm_switch_t *) item;
157		lid_ho = cl_ntoh16(osm_node_get_base_lid(p_sw->p_node, 0));
158		hops = osm_switch_get_least_hops(p_remote_sw, lid_ho);
159		if (hops == OSM_NO_PATH)
160			continue;
161		hops += p->hop_wf;
162		if (hops <
163		    osm_switch_get_hop_count(p_this_sw, lid_ho, port_num)) {
164			if (osm_switch_set_hops
165			    (p_this_sw, lid_ho, port_num, (uint8_t) hops) != 0)
166				OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A03: "
167					"cannot set hops for lid %u at switch 0x%"
168					PRIx64 "\n", lid_ho,
169					cl_ntoh64(osm_node_get_node_guid
170						  (p_this_sw->p_node)));
171			p_mgr->some_hop_count_set = TRUE;
172		}
173	}
174
175	OSM_LOG_EXIT(p_mgr->p_log);
176}
177
178static struct osm_remote_node *find_and_add_remote_sys(osm_switch_t * sw,
179						       uint8_t port,
180						       boolean_t dor, struct
181						       osm_remote_guids_count
182						       *r)
183{
184	unsigned i;
185	osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, port);
186	osm_node_t *node = p->p_remote_physp->p_node;
187	uint8_t rem_port = osm_physp_get_port_num(p->p_remote_physp);
188
189	for (i = 0; i < r->count; i++)
190		if (r->guids[i].node == node)
191			if (!dor || (r->guids[i].port == rem_port))
192				return &r->guids[i];
193
194	r->guids[i].node = node;
195	r->guids[i].forwarded_to = 0;
196	r->guids[i].port = rem_port;
197	r->count++;
198	return &r->guids[i];
199}
200
201static void ucast_mgr_process_port(IN osm_ucast_mgr_t * p_mgr,
202				   IN osm_switch_t * p_sw,
203				   IN osm_port_t * p_port,
204				   IN unsigned lid_offset)
205{
206	uint16_t min_lid_ho;
207	uint16_t max_lid_ho;
208	uint16_t lid_ho;
209	uint8_t port;
210	boolean_t is_ignored_by_port_prof;
211	ib_net64_t node_guid;
212	unsigned start_from = 1;
213
214	OSM_LOG_ENTER(p_mgr->p_log);
215
216	osm_port_get_lid_range_ho(p_port, &min_lid_ho, &max_lid_ho);
217
218	/* If the lids are zero - then there was some problem with
219	 * the initialization. Don't handle this port. */
220	if (min_lid_ho == 0 || max_lid_ho == 0) {
221		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A04: "
222			"Port 0x%" PRIx64 " (%s port %d) has LID 0. An "
223			"initialization error occurred. Ignoring port\n",
224			cl_ntoh64(osm_port_get_guid(p_port)),
225			p_port->p_node->print_desc,
226			p_port->p_physp->port_num);
227		goto Exit;
228	}
229
230	lid_ho = min_lid_ho + lid_offset;
231
232	if (lid_ho > max_lid_ho)
233		goto Exit;
234
235	if (lid_offset && !p_mgr->is_dor)
236		/* ignore potential overflow - it is handled in osm_switch.c */
237		start_from =
238		    osm_switch_get_port_by_lid(p_sw, lid_ho - 1, OSM_NEW_LFT) + 1;
239
240	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
241		"Processing port 0x%" PRIx64
242		" (\'%s\' port %u), LID %u [%u,%u]\n",
243		cl_ntoh64(osm_port_get_guid(p_port)),
244		p_port->p_node->print_desc, p_port->p_physp->port_num, lid_ho,
245		min_lid_ho, max_lid_ho);
246
247	/* TODO - This should be runtime error, not a CL_ASSERT() */
248	CL_ASSERT(max_lid_ho <= IB_LID_UCAST_END_HO);
249
250	node_guid = osm_node_get_node_guid(p_sw->p_node);
251
252	/*
253	   The lid matrix contains the number of hops to each
254	   lid from each port.  From this information we determine
255	   how best to distribute the LID range across the ports
256	   that can reach those LIDs.
257	 */
258	port = osm_switch_recommend_path(p_sw, p_port, lid_ho, start_from,
259					 p_mgr->p_subn->ignore_existing_lfts,
260					 p_mgr->p_subn->opt.lmc,
261					 p_mgr->is_dor,
262					 p_mgr->p_subn->opt.port_shifting,
263					 !lid_offset && p_port->use_scatter,
264					 OSM_LFT);
265
266	if (port == OSM_NO_PATH) {
267		/* do not try to overwrite the ppro of non existing port ... */
268		is_ignored_by_port_prof = TRUE;
269
270		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
271			"No path to get to LID %u from switch 0x%" PRIx64 "\n",
272			lid_ho, cl_ntoh64(node_guid));
273	} else {
274		osm_physp_t *p = osm_node_get_physp_ptr(p_sw->p_node, port);
275		if (!p)
276			goto Exit;
277
278		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
279			"Routing LID %u to port %u for switch 0x%" PRIx64 "\n",
280			lid_ho, port, cl_ntoh64(node_guid));
281
282		/*
283		   we would like to optionally ignore this port in equalization
284		   as in the case of the Mellanox Anafa Internal PCI TCA port
285		 */
286		is_ignored_by_port_prof = p->is_prof_ignored;
287
288		/*
289		   We also would ignore this route if the target lid is of
290		   a switch and the port_profile_switch_node is not TRUE
291		 */
292		if (!p_mgr->p_subn->opt.port_profile_switch_nodes)
293			is_ignored_by_port_prof |=
294			    (osm_node_get_type(p_port->p_node) ==
295			     IB_NODE_TYPE_SWITCH);
296	}
297
298	/*
299	   We have selected the port for this LID.
300	   Write it to the forwarding tables.
301	 */
302	p_sw->new_lft[lid_ho] = port;
303	if (!is_ignored_by_port_prof) {
304		struct osm_remote_node *rem_node_used;
305		osm_switch_count_path(p_sw, port);
306		if (port > 0 && p_port->priv &&
307		    (rem_node_used = find_and_add_remote_sys(p_sw, port,
308							     p_mgr->is_dor,
309							     p_port->priv)))
310			rem_node_used->forwarded_to++;
311	}
312
313Exit:
314	OSM_LOG_EXIT(p_mgr->p_log);
315}
316
317static void alloc_ports_priv(osm_ucast_mgr_t * mgr)
318{
319	cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl;
320	struct osm_remote_guids_count *r;
321	osm_port_t *port;
322	cl_map_item_t *item;
323	unsigned lmc;
324
325	for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
326	     item = cl_qmap_next(item)) {
327		port = (osm_port_t *) item;
328		lmc = ib_port_info_get_lmc(&port->p_physp->port_info);
329		r = malloc(sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc));
330		if (!r) {
331			OSM_LOG(mgr->p_log, OSM_LOG_ERROR, "ERR 3A09: "
332				"cannot allocate memory to track remote"
333				" systems for lmc > 0\n");
334			port->priv = NULL;
335			continue;
336		}
337		memset(r, 0, sizeof(*r) + sizeof(r->guids[0]) * (1 << lmc));
338		port->priv = r;
339	}
340}
341
342static void free_ports_priv(osm_ucast_mgr_t * mgr)
343{
344	cl_qmap_t *port_tbl = &mgr->p_subn->port_guid_tbl;
345	osm_port_t *port;
346	cl_map_item_t *item;
347	for (item = cl_qmap_head(port_tbl); item != cl_qmap_end(port_tbl);
348	     item = cl_qmap_next(item)) {
349		port = (osm_port_t *) item;
350		if (port->priv) {
351			free(port->priv);
352			port->priv = NULL;
353		}
354	}
355}
356
357static void ucast_mgr_process_tbl(IN cl_map_item_t * p_map_item,
358				  IN void *context)
359{
360	osm_ucast_mgr_t *p_mgr = context;
361	osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
362	unsigned i, lids_per_port;
363
364	OSM_LOG_ENTER(p_mgr->p_log);
365
366	CL_ASSERT(p_sw && p_sw->p_node);
367
368	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
369		"Processing switch 0x%" PRIx64 "\n",
370		cl_ntoh64(osm_node_get_node_guid(p_sw->p_node)));
371
372	/* Initialize LIDs in buffer to invalid port number. */
373	memset(p_sw->new_lft, OSM_NO_PATH, p_sw->max_lid_ho + 1);
374
375	alloc_ports_priv(p_mgr);
376
377	/*
378	   Iterate through every port setting LID routes for each
379	   port based on base LID and LMC value.
380	 */
381	lids_per_port = 1 << p_mgr->p_subn->opt.lmc;
382	for (i = 0; i < lids_per_port; i++) {
383		cl_qlist_t *list = &p_mgr->port_order_list;
384		cl_list_item_t *item;
385		for (item = cl_qlist_head(list); item != cl_qlist_end(list);
386		     item = cl_qlist_next(item)) {
387			osm_port_t *port = cl_item_obj(item, port, list_item);
388			ucast_mgr_process_port(p_mgr, p_sw, port, i);
389		}
390	}
391
392	free_ports_priv(p_mgr);
393
394	OSM_LOG_EXIT(p_mgr->p_log);
395}
396
397static void ucast_mgr_process_neighbors(IN cl_map_item_t * p_map_item,
398					IN void *context)
399{
400	osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
401	osm_ucast_mgr_t * p_mgr = context;
402	osm_node_t *p_node;
403	osm_node_t *p_remote_node;
404	uint32_t port_num;
405	uint8_t remote_port_num;
406	uint32_t num_ports;
407	osm_physp_t *p_physp;
408
409	OSM_LOG_ENTER(p_mgr->p_log);
410
411	p_node = p_sw->p_node;
412
413	CL_ASSERT(p_node);
414	CL_ASSERT(osm_node_get_type(p_node) == IB_NODE_TYPE_SWITCH);
415
416	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
417		"Processing switch with GUID 0x%" PRIx64 "\n",
418		cl_ntoh64(osm_node_get_node_guid(p_node)));
419
420	num_ports = osm_node_get_num_physp(p_node);
421
422	/*
423	   Start with port 1 to skip the switch's management port.
424	 */
425	for (port_num = 1; port_num < num_ports; port_num++) {
426		p_remote_node = osm_node_get_remote_node(p_node,
427							 (uint8_t) port_num,
428							 &remote_port_num);
429		if (p_remote_node && p_remote_node->sw
430		    && (p_remote_node != p_node)) {
431			/* make sure the link is healthy. If it is not - don't
432			   propagate through it. */
433			p_physp = osm_node_get_physp_ptr(p_node, port_num);
434			if (!p_physp || !osm_link_is_healthy(p_physp))
435				continue;
436
437			ucast_mgr_process_neighbor(p_mgr, p_sw,
438						   p_remote_node->sw,
439						   (uint8_t) port_num,
440						   remote_port_num);
441		}
442	}
443
444	OSM_LOG_EXIT(p_mgr->p_log);
445}
446
447static int set_hop_wf(void *ctx, uint64_t guid, char *p)
448{
449	osm_ucast_mgr_t *m = ctx;
450	osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid));
451	osm_physp_t *physp;
452	unsigned port, hop_wf;
453	char *e;
454
455	if (!node || !node->sw) {
456		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
457			"switch with guid 0x%016" PRIx64 " is not found\n",
458			guid);
459		return 0;
460	}
461
462	if (!p || !*p || !(port = strtoul(p, &e, 0)) || (p == e) ||
463	    port >= node->sw->num_ports) {
464		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
465			"bad port specified for guid 0x%016" PRIx64 "\n", guid);
466		return 0;
467	}
468
469	p = e + 1;
470
471	if (!*p || !(hop_wf = strtoul(p, &e, 0)) || p == e || hop_wf >= 0x100) {
472		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
473			"bad hop weight factor specified for guid 0x%016" PRIx64
474			"port %u\n", guid, port);
475		return 0;
476	}
477
478	physp = osm_node_get_physp_ptr(node, port);
479	if (!physp)
480		return 0;
481
482	physp->hop_wf = hop_wf;
483
484	return 0;
485}
486
487static void set_default_hop_wf(cl_map_item_t * p_map_item, void *ctx)
488{
489	osm_switch_t *sw = (osm_switch_t *) p_map_item;
490	int i;
491
492	for (i = 1; i < sw->num_ports; i++) {
493		osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i);
494		if (p)
495			p->hop_wf = 1;
496	}
497}
498
499static int set_search_ordering_ports(void *ctx, uint64_t guid, char *p)
500{
501	osm_subn_t *p_subn = ctx;
502	osm_node_t *node = osm_get_node_by_guid(p_subn, cl_hton64(guid));
503	osm_switch_t *sw;
504	uint8_t *search_ordering_ports = NULL;
505	uint8_t port;
506	unsigned int *ports = NULL;
507	const int bpw = sizeof(*ports)*8;
508	int words;
509	int i = 1; /* port 0 maps to port 0 */
510
511	if (!node || !(sw = node->sw)) {
512		OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
513			"switch with guid 0x%016" PRIx64 " is not found\n",
514			guid);
515		return 0;
516	}
517
518	if (sw->search_ordering_ports) {
519		OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
520			"switch with guid 0x%016" PRIx64 " already listed\n",
521			guid);
522		return 0;
523	}
524
525	search_ordering_ports = malloc(sizeof(*search_ordering_ports)*sw->num_ports);
526	if (!search_ordering_ports) {
527		OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR,
528			"ERR 3A07: cannot allocate memory for search_ordering_ports\n");
529		return -1;
530	}
531	memset(search_ordering_ports, 0, sizeof(*search_ordering_ports)*sw->num_ports);
532
533	/* the ports array is for record keeping of which ports have
534	 * been seen */
535	words = (sw->num_ports + bpw - 1)/bpw;
536	ports = malloc(words*sizeof(*ports));
537	if (!ports) {
538		OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR,
539			"ERR 3A08: cannot allocate memory for ports\n");
540		free(search_ordering_ports);
541		return -1;
542	}
543	memset(ports, 0, words*sizeof(*ports));
544
545	while ((*p != '\0') && (*p != '#')) {
546		char *e;
547
548		port = strtoul(p, &e, 0);
549		if ((p == e) || (port == 0) || (port >= sw->num_ports) ||
550		    !osm_node_get_physp_ptr(node, port)) {
551			OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
552				"bad port %d specified for guid 0x%016" PRIx64 "\n",
553				port, guid);
554			free(search_ordering_ports);
555			free(ports);
556			return 0;
557		}
558
559		if (ports[port/bpw] & (1u << (port%bpw))) {
560			OSM_LOG(&p_subn->p_osm->log, OSM_LOG_VERBOSE,
561				"port %d already specified for guid 0x%016" PRIx64 "\n",
562				port, guid);
563			free(search_ordering_ports);
564			free(ports);
565			return 0;
566		}
567
568		ports[port/bpw] |= (1u << (port%bpw));
569		search_ordering_ports[i++] = port;
570
571		p = e;
572		while (isspace(*p)) {
573			p++;
574		}
575	}
576
577	if (i > 1) {
578		for (port = 1; port < sw->num_ports; port++) {
579			/* fill out the rest of the search_ordering_ports array
580			 * in sequence using the remaining unspecified
581			 * ports.
582			 */
583			if (!(ports[port/bpw] & (1u << (port%bpw)))) {
584				search_ordering_ports[i++] = port;
585			}
586		}
587		sw->search_ordering_ports = search_ordering_ports;
588	} else {
589		free(search_ordering_ports);
590	}
591
592	free(ports);
593	return 0;
594}
595
596int osm_ucast_mgr_build_lid_matrices(IN osm_ucast_mgr_t * p_mgr)
597{
598	uint32_t i;
599	uint32_t iteration_max;
600	cl_qmap_t *p_sw_guid_tbl;
601
602	p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl;
603
604	OSM_LOG(p_mgr->p_log, OSM_LOG_VERBOSE,
605		"Starting switches' Min Hop Table Assignment\n");
606
607	/*
608	   Set up the weighting factors for the routing.
609	 */
610	cl_qmap_apply_func(p_sw_guid_tbl, set_default_hop_wf, NULL);
611	if (p_mgr->p_subn->opt.hop_weights_file) {
612		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
613			"Fetching hop weight factor file \'%s\'\n",
614			p_mgr->p_subn->opt.hop_weights_file);
615		if (parse_node_map(p_mgr->p_subn->opt.hop_weights_file,
616				   set_hop_wf, p_mgr)) {
617			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A05: "
618				"cannot parse hop_weights_file \'%s\'\n",
619				p_mgr->p_subn->opt.hop_weights_file);
620		}
621	}
622
623	/*
624	   Set the switch matrices for each switch's own port 0 LID(s)
625	   then set the lid matrices for the each switch's leaf nodes.
626	 */
627	cl_qmap_apply_func(p_sw_guid_tbl, ucast_mgr_process_hop_0_1, p_mgr);
628
629	/*
630	   Get the switch matrices for each switch's neighbors.
631	   This process requires a number of iterations equal to
632	   the number of switches in the subnet minus 1.
633
634	   In each iteration, a switch learns the lid/port/hop
635	   information (as contained by a switch's lid matrix) from
636	   its immediate neighbors.  After each iteration, a switch
637	   (and it's neighbors) know more routing information than
638	   it did on the previous iteration.
639	   Thus, by repeatedly absorbing the routing information of
640	   neighbor switches, every switch eventually learns how to
641	   route all LIDs on the subnet.
642
643	   Note that there may not be any switches in the subnet if
644	   we are in simple p2p configuration.
645	 */
646	iteration_max = cl_qmap_count(p_sw_guid_tbl);
647
648	/*
649	   If there are switches in the subnet, iterate until the lid
650	   matrix has been constructed.  Otherwise, just immediately
651	   indicate we're done if no switches exist.
652	 */
653	if (iteration_max) {
654		iteration_max--;
655
656		/*
657		   we need to find out when the propagation of
658		   hop counts has relaxed. So this global variable
659		   is preset to 0 on each iteration and if
660		   if non of the switches was set will exit the
661		   while loop
662		 */
663		p_mgr->some_hop_count_set = TRUE;
664		for (i = 0; (i < iteration_max) && p_mgr->some_hop_count_set;
665		     i++) {
666			p_mgr->some_hop_count_set = FALSE;
667			cl_qmap_apply_func(p_sw_guid_tbl,
668					   ucast_mgr_process_neighbors, p_mgr);
669		}
670		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
671			"Min-hop propagated in %d steps\n", i);
672	}
673
674	return 0;
675}
676
677static int ucast_mgr_setup_all_switches(osm_subn_t * p_subn)
678{
679	osm_switch_t *p_sw;
680	uint16_t lids;
681
682	lids = (uint16_t) cl_ptr_vector_get_size(&p_subn->port_lid_tbl);
683	lids = lids ? lids - 1 : 0;
684
685	for (p_sw = (osm_switch_t *) cl_qmap_head(&p_subn->sw_guid_tbl);
686	     p_sw != (osm_switch_t *) cl_qmap_end(&p_subn->sw_guid_tbl);
687	     p_sw = (osm_switch_t *) cl_qmap_next(&p_sw->map_item)) {
688		if (osm_switch_prepare_path_rebuild(p_sw, lids)) {
689			OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0B: "
690				"cannot setup switch 0x%016" PRIx64 "\n",
691				cl_ntoh64(osm_node_get_node_guid
692					  (p_sw->p_node)));
693			return -1;
694		}
695		if (p_sw->search_ordering_ports) {
696			free(p_sw->search_ordering_ports);
697			p_sw->search_ordering_ports = NULL;
698		}
699	}
700
701	if (p_subn->opt.port_search_ordering_file) {
702		OSM_LOG(&p_subn->p_osm->log, OSM_LOG_DEBUG,
703			"Fetching dimension ports file \'%s\'\n",
704			p_subn->opt.port_search_ordering_file);
705		if (parse_node_map(p_subn->opt.port_search_ordering_file,
706				   set_search_ordering_ports, p_subn)) {
707			OSM_LOG(&p_subn->p_osm->log, OSM_LOG_ERROR, "ERR 3A0F: "
708				"cannot parse port_search_ordering_file \'%s\'\n",
709				p_subn->opt.port_search_ordering_file);
710		}
711	}
712
713	return 0;
714}
715
716static int add_guid_to_order_list(void *ctx, uint64_t guid, char *p)
717{
718	osm_ucast_mgr_t *m = ctx;
719	osm_port_t *port = osm_get_port_by_guid(m->p_subn, cl_hton64(guid));
720
721	if (!port) {
722		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
723			"port guid not found: 0x%016" PRIx64 "\n", guid);
724		return 0;
725	}
726
727	if (port->flag) {
728		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
729			"port guid specified multiple times 0x%016" PRIx64 "\n",
730			guid);
731		return 0;
732	}
733
734	cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
735	port->flag = 1;
736	port->use_scatter =  (m->p_subn->opt.guid_routing_order_no_scatter == TRUE) ? 0 : m->p_subn->opt.scatter_ports;
737
738	return 0;
739}
740
741static void add_port_to_order_list(cl_map_item_t * p_map_item, void *ctx)
742{
743	osm_port_t *port = (osm_port_t *) p_map_item;
744	osm_ucast_mgr_t *m = ctx;
745
746	if (!port->flag) {
747		port->use_scatter = m->p_subn->opt.scatter_ports;
748		cl_qlist_insert_tail(&m->port_order_list, &port->list_item);
749	} else
750		port->flag = 0;
751}
752
753static int mark_ignored_port(void *ctx, uint64_t guid, char *p)
754{
755	osm_ucast_mgr_t *m = ctx;
756	osm_node_t *node = osm_get_node_by_guid(m->p_subn, cl_hton64(guid));
757	osm_physp_t *physp;
758	unsigned port;
759
760	if (!node || !node->sw) {
761		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
762			"switch with guid 0x%016" PRIx64 " is not found\n",
763			guid);
764		return 0;
765	}
766
767	if (!p || !*p || !(port = strtoul(p, NULL, 0)) ||
768	    port >= node->sw->num_ports) {
769		OSM_LOG(m->p_log, OSM_LOG_DEBUG,
770			"bad port specified for guid 0x%016" PRIx64 "\n", guid);
771		return 0;
772	}
773
774	physp = osm_node_get_physp_ptr(node, port);
775	if (!physp)
776		return 0;
777
778	physp->is_prof_ignored = 1;
779
780	return 0;
781}
782
783static void clear_prof_ignore_flag(cl_map_item_t * p_map_item, void *ctx)
784{
785	osm_switch_t *sw = (osm_switch_t *) p_map_item;
786	int i;
787
788	for (i = 1; i < sw->num_ports; i++) {
789		osm_physp_t *p = osm_node_get_physp_ptr(sw->p_node, i);
790		if (p)
791			p->is_prof_ignored = 0;
792	}
793}
794
795static void add_sw_endports_to_order_list(osm_switch_t * sw,
796					  osm_ucast_mgr_t * m)
797{
798	osm_port_t *port;
799	osm_physp_t *p;
800	int i;
801
802	for (i = 1; i < sw->num_ports; i++) {
803		p = osm_node_get_physp_ptr(sw->p_node, i);
804		if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw) {
805			port = osm_get_port_by_guid(m->p_subn,
806						    p->p_remote_physp->
807						    port_guid);
808			if (!port || port->flag)
809				continue;
810			cl_qlist_insert_tail(&m->port_order_list,
811					     &port->list_item);
812			port->flag = 1;
813			port->use_scatter = m->p_subn->opt.scatter_ports;
814		}
815	}
816}
817
818static void sw_count_endport_links(osm_switch_t * sw)
819{
820	osm_physp_t *p;
821	int i;
822
823	sw->endport_links = 0;
824	for (i = 1; i < sw->num_ports; i++) {
825		p = osm_node_get_physp_ptr(sw->p_node, i);
826		if (p && p->p_remote_physp && !p->p_remote_physp->p_node->sw)
827			sw->endport_links++;
828	}
829}
830
831static int compar_sw_load(const void *s1, const void *s2)
832{
833#define get_sw_endport_links(s) (*(osm_switch_t **)s)->endport_links
834	return get_sw_endport_links(s2) - get_sw_endport_links(s1);
835}
836
837static void sort_ports_by_switch_load(osm_ucast_mgr_t * m)
838{
839	int i, num = cl_qmap_count(&m->p_subn->sw_guid_tbl);
840	void **s = malloc(num * sizeof(*s));
841	if (!s) {
842		OSM_LOG(m->p_log, OSM_LOG_ERROR, "ERR 3A0C: "
843			"No memory, skip by switch load sorting.\n");
844		return;
845	}
846	s[0] = cl_qmap_head(&m->p_subn->sw_guid_tbl);
847	for (i = 1; i < num; i++)
848		s[i] = cl_qmap_next(s[i - 1]);
849
850	for (i = 0; i < num; i++)
851		sw_count_endport_links(s[i]);
852
853	qsort(s, num, sizeof(*s), compar_sw_load);
854
855	for (i = 0; i < num; i++)
856		add_sw_endports_to_order_list(s[i], m);
857	free(s);
858}
859
860static int ucast_mgr_build_lfts(osm_ucast_mgr_t * p_mgr)
861{
862	cl_qlist_init(&p_mgr->port_order_list);
863
864	if (p_mgr->p_subn->opt.guid_routing_order_file) {
865		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
866			"Fetching guid routing order file \'%s\'\n",
867			p_mgr->p_subn->opt.guid_routing_order_file);
868
869		if (parse_node_map(p_mgr->p_subn->opt.guid_routing_order_file,
870				   add_guid_to_order_list, p_mgr))
871			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0D: "
872				"cannot parse guid routing order file \'%s\'\n",
873				p_mgr->p_subn->opt.guid_routing_order_file);
874	}
875	sort_ports_by_switch_load(p_mgr);
876
877	if (p_mgr->p_subn->opt.port_prof_ignore_file) {
878		cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl,
879				   clear_prof_ignore_flag, NULL);
880		if (parse_node_map(p_mgr->p_subn->opt.port_prof_ignore_file,
881				   mark_ignored_port, p_mgr)) {
882			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A0E: "
883				"cannot parse port prof ignore file \'%s\'\n",
884				p_mgr->p_subn->opt.port_prof_ignore_file);
885		}
886	}
887
888	cl_qmap_apply_func(&p_mgr->p_subn->port_guid_tbl,
889			   add_port_to_order_list, p_mgr);
890
891	cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_process_tbl,
892			   p_mgr);
893
894	cl_qlist_remove_all(&p_mgr->port_order_list);
895
896	return 0;
897}
898
899static void ucast_mgr_set_fwd_top(IN cl_map_item_t * p_map_item,
900				  IN void *cxt)
901{
902	osm_ucast_mgr_t *p_mgr = cxt;
903	osm_switch_t * p_sw = (osm_switch_t *) p_map_item;
904	osm_node_t *p_node;
905	osm_physp_t *p_physp;
906	osm_dr_path_t *p_path;
907	osm_madw_context_t context;
908	ib_api_status_t status;
909	ib_switch_info_t si;
910	boolean_t set_swinfo_require = FALSE;
911	uint16_t lin_top;
912	uint8_t life_state;
913
914	CL_ASSERT(p_mgr);
915
916	OSM_LOG_ENTER(p_mgr->p_log);
917
918	CL_ASSERT(p_sw && p_sw->max_lid_ho);
919
920	p_node = p_sw->p_node;
921
922	CL_ASSERT(p_node);
923
924	if (p_mgr->max_lid < p_sw->max_lid_ho)
925		p_mgr->max_lid = p_sw->max_lid_ho;
926
927	p_physp = osm_node_get_physp_ptr(p_node, 0);
928
929	CL_ASSERT(p_physp);
930
931	p_path = osm_physp_get_dr_path_ptr(p_physp);
932
933	/*
934	   Set the top of the unicast forwarding table.
935	 */
936	si = p_sw->switch_info;
937	lin_top = cl_hton16(p_sw->max_lid_ho);
938	if (lin_top != si.lin_top) {
939		set_swinfo_require = TRUE;
940		si.lin_top = lin_top;
941		context.si_context.lft_top_change = TRUE;
942	} else
943		context.si_context.lft_top_change = FALSE;
944
945	life_state = si.life_state;
946	ib_switch_info_set_life_time(&si, p_mgr->p_subn->opt.packet_life_time);
947
948	if (life_state != si.life_state)
949		set_swinfo_require = TRUE;
950
951	if (set_swinfo_require) {
952		OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
953			"Setting switch FT top to LID %u\n", p_sw->max_lid_ho);
954
955		context.si_context.light_sweep = FALSE;
956		context.si_context.node_guid = osm_node_get_node_guid(p_node);
957		context.si_context.set_method = TRUE;
958
959		status = osm_req_set(p_mgr->sm, p_path, (uint8_t *) & si,
960				     sizeof(si), IB_MAD_ATTR_SWITCH_INFO,
961				     0, FALSE,
962				     ib_port_info_get_m_key(&p_physp->port_info),
963				     CL_DISP_MSGID_NONE, &context);
964
965		if (status != IB_SUCCESS)
966			OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A06: "
967				"Sending SwitchInfo attribute failed (%s)\n",
968				ib_get_err_str(status));
969	}
970
971	OSM_LOG_EXIT(p_mgr->p_log);
972}
973
974static int set_lft_block(IN osm_switch_t *p_sw, IN osm_ucast_mgr_t *p_mgr,
975			 IN uint16_t block_id_ho)
976{
977	osm_madw_context_t context;
978	osm_dr_path_t *p_path;
979	osm_physp_t *p_physp;
980	ib_api_status_t status;
981
982	/*
983	   Send linear forwarding table blocks to the switch
984	   as long as the switch indicates it has blocks needing
985	   configuration.
986	 */
987	if (!p_sw->new_lft) {
988		/* any routing should provide the new_lft */
989		CL_ASSERT(p_mgr->p_subn->opt.use_ucast_cache &&
990			  p_mgr->cache_valid && !p_sw->need_update);
991		return -1;
992	}
993
994	p_physp = osm_node_get_physp_ptr(p_sw->p_node, 0);
995	if (!p_physp)
996		return -1;
997
998	p_path = osm_physp_get_dr_path_ptr(p_physp);
999
1000	context.lft_context.node_guid = osm_node_get_node_guid(p_sw->p_node);
1001	context.lft_context.set_method = TRUE;
1002
1003	if (!p_sw->need_update && !p_mgr->p_subn->need_update &&
1004	    !memcmp(p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
1005		    p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE,
1006		    IB_SMP_DATA_SIZE))
1007		return 0;
1008
1009	/*
1010	 * Zero the stored LFT block, so in case the MAD will end up
1011	 * with error, we will resend it in the next sweep.
1012	 */
1013	memset(p_sw->lft + block_id_ho * IB_SMP_DATA_SIZE, 0,
1014	       IB_SMP_DATA_SIZE);
1015
1016	OSM_LOG(p_mgr->p_log, OSM_LOG_DEBUG,
1017		"Writing FT block %u to switch 0x%" PRIx64 "\n", block_id_ho,
1018		cl_ntoh64(context.lft_context.node_guid));
1019
1020	status = osm_req_set(p_mgr->sm, p_path,
1021			     p_sw->new_lft + block_id_ho * IB_SMP_DATA_SIZE,
1022			     IB_SMP_DATA_SIZE, IB_MAD_ATTR_LIN_FWD_TBL,
1023			     cl_hton32(block_id_ho), FALSE,
1024			     ib_port_info_get_m_key(&p_physp->port_info),
1025			     CL_DISP_MSGID_NONE, &context);
1026
1027	if (status != IB_SUCCESS) {
1028		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR, "ERR 3A10: "
1029			"Sending linear fwd. tbl. block failed (%s)\n",
1030			ib_get_err_str(status));
1031		return -1;
1032	}
1033
1034	return 0;
1035}
1036
1037static void ucast_mgr_pipeline_fwd_tbl(osm_ucast_mgr_t * p_mgr)
1038{
1039	cl_qmap_t *tbl;
1040	cl_map_item_t *item;
1041	unsigned i, max_block = p_mgr->max_lid / IB_SMP_DATA_SIZE + 1;
1042
1043	tbl = &p_mgr->p_subn->sw_guid_tbl;
1044	for (i = 0; i < max_block; i++)
1045		for (item = cl_qmap_head(tbl); item != cl_qmap_end(tbl);
1046		     item = cl_qmap_next(item))
1047			set_lft_block((osm_switch_t *)item, p_mgr, i);
1048}
1049
1050void osm_ucast_mgr_set_fwd_tables(osm_ucast_mgr_t * p_mgr)
1051{
1052	p_mgr->max_lid = 0;
1053
1054	cl_qmap_apply_func(&p_mgr->p_subn->sw_guid_tbl, ucast_mgr_set_fwd_top,
1055			   p_mgr);
1056
1057	ucast_mgr_pipeline_fwd_tbl(p_mgr);
1058}
1059
1060static int ucast_mgr_route(struct osm_routing_engine *r, osm_opensm_t * osm)
1061{
1062	int ret;
1063
1064	OSM_LOG(&osm->log, OSM_LOG_VERBOSE,
1065		"building routing with \'%s\' routing algorithm...\n", r->name);
1066
1067	/* Set the before each lft build to keep the routes in place between sweeps */
1068	if (osm->subn.opt.scatter_ports)
1069		srandom(osm->subn.opt.scatter_ports);
1070
1071	if (!r->build_lid_matrices ||
1072	    (ret = r->build_lid_matrices(r->context)) > 0)
1073		ret = osm_ucast_mgr_build_lid_matrices(&osm->sm.ucast_mgr);
1074
1075	if (ret < 0) {
1076		OSM_LOG(&osm->log, OSM_LOG_ERROR,
1077			"%s: cannot build lid matrices\n", r->name);
1078		return ret;
1079	}
1080
1081	if (!r->ucast_build_fwd_tables ||
1082	    (ret = r->ucast_build_fwd_tables(r->context)) > 0)
1083		ret = ucast_mgr_build_lfts(&osm->sm.ucast_mgr);
1084
1085	if (ret < 0) {
1086		OSM_LOG(&osm->log, OSM_LOG_ERROR,
1087			"%s: cannot build fwd tables\n", r->name);
1088		return ret;
1089	}
1090
1091	osm->routing_engine_used = r;
1092
1093	osm_ucast_mgr_set_fwd_tables(&osm->sm.ucast_mgr);
1094
1095	return 0;
1096}
1097
1098int osm_ucast_mgr_process(IN osm_ucast_mgr_t * p_mgr)
1099{
1100	osm_opensm_t *p_osm;
1101	struct osm_routing_engine *p_routing_eng;
1102	cl_qmap_t *p_sw_guid_tbl;
1103	int failed = 0;
1104
1105	OSM_LOG_ENTER(p_mgr->p_log);
1106
1107	p_sw_guid_tbl = &p_mgr->p_subn->sw_guid_tbl;
1108	p_osm = p_mgr->p_subn->p_osm;
1109	p_routing_eng = p_osm->routing_engine_list;
1110
1111	CL_PLOCK_EXCL_ACQUIRE(p_mgr->p_lock);
1112
1113	/*
1114	   If there are no switches in the subnet, we are done.
1115	 */
1116	if (cl_qmap_count(p_sw_guid_tbl) == 0 ||
1117	    ucast_mgr_setup_all_switches(p_mgr->p_subn) < 0)
1118		goto Exit;
1119
1120	failed = -1;
1121	p_osm->routing_engine_used = NULL;
1122	while (p_routing_eng) {
1123		failed = ucast_mgr_route(p_routing_eng, p_osm);
1124		if (!failed)
1125			break;
1126		p_routing_eng = p_routing_eng->next;
1127	}
1128
1129	if (!p_osm->routing_engine_used &&
1130	    p_osm->no_fallback_routing_engine != TRUE) {
1131		/* If configured routing algorithm failed, use default MinHop */
1132		failed = ucast_mgr_route(p_osm->default_routing_engine, p_osm);
1133	}
1134
1135	if (p_osm->routing_engine_used) {
1136		OSM_LOG(p_mgr->p_log, OSM_LOG_INFO,
1137			"%s tables configured on all switches\n",
1138			osm_routing_engine_type_str(p_osm->
1139						    routing_engine_used->type));
1140
1141		if (p_mgr->p_subn->opt.use_ucast_cache)
1142			p_mgr->cache_valid = TRUE;
1143	} else {
1144		p_mgr->p_subn->subnet_initialization_error = TRUE;
1145		OSM_LOG(p_mgr->p_log, OSM_LOG_ERROR,
1146			"No routing engine able to successfully configure "
1147			" switch tables on current fabric\n");
1148	}
1149Exit:
1150	CL_PLOCK_RELEASE(p_mgr->p_lock);
1151	OSM_LOG_EXIT(p_mgr->p_log);
1152	return failed;
1153}
1154
1155static int ucast_build_lid_matrices(void *context)
1156{
1157	return osm_ucast_mgr_build_lid_matrices(context);
1158}
1159
1160static int ucast_build_lfts(void *context)
1161{
1162	return ucast_mgr_build_lfts(context);
1163}
1164
1165int osm_ucast_minhop_setup(struct osm_routing_engine *r, osm_opensm_t * osm)
1166{
1167	r->context = &osm->sm.ucast_mgr;
1168	r->build_lid_matrices = ucast_build_lid_matrices;
1169	r->ucast_build_fwd_tables = ucast_build_lfts;
1170	return 0;
1171}
1172
1173static int ucast_dor_build_lfts(void *context)
1174{
1175	osm_ucast_mgr_t *mgr = context;
1176	int ret;
1177
1178	mgr->is_dor = 1;
1179	ret = ucast_mgr_build_lfts(mgr);
1180	mgr->is_dor = 0;
1181
1182	return ret;
1183}
1184
1185int osm_ucast_dor_setup(struct osm_routing_engine *r, osm_opensm_t * osm)
1186{
1187	r->context = &osm->sm.ucast_mgr;
1188	r->build_lid_matrices = ucast_build_lid_matrices;
1189	r->ucast_build_fwd_tables = ucast_dor_build_lfts;
1190	return 0;
1191}
1192
/*
 * No-op lid matrix builder: ignores its context and always returns 0.
 */
int ucast_dummy_build_lid_matrices(void *context)
{
	(void)context;	/* intentionally unused */

	return 0;
}
1197