1/*
2 * Copyright (c) 2007 The Regents of the University of California.
3 * Copyright (c) 2007-2008 Voltaire, Inc. All rights reserved.
4 *
5 * This software is available to you under a choice of one of two
6 * licenses.  You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
10 *
11 *     Redistribution and use in source and binary forms, with or
12 *     without modification, are permitted provided that the following
13 *     conditions are met:
14 *
15 *      - Redistributions of source code must retain the above
16 *        copyright notice, this list of conditions and the following
17 *        disclaimer.
18 *
19 *      - Redistributions in binary form must reproduce the above
20 *        copyright notice, this list of conditions and the following
21 *        disclaimer in the documentation and/or other materials
22 *        provided with the distribution.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31 * SOFTWARE.
32 *
33 */
34
35/*
36 * Abstract:
37 *    Implementation of osm_perfmgr_t.
38 * This object implements an IBA performance manager.
39 *
40 * Author:
41 *    Ira Weiny, LLNL
42 */
43
44#if HAVE_CONFIG_H
45#  include <config.h>
46#endif				/* HAVE_CONFIG_H */
47
48#ifdef ENABLE_OSM_PERF_MGR
49
50#include <stdlib.h>
51#include <stdint.h>
52#include <string.h>
53#include <poll.h>
54#include <errno.h>
55#include <sys/time.h>
56#include <netinet/in.h>
57#include <float.h>
58#include <arpa/inet.h>
59#include <iba/ib_types.h>
60#include <complib/cl_debug.h>
61#include <complib/cl_thread.h>
62#include <vendor/osm_vendor_api.h>
63#include <opensm/osm_perfmgr.h>
64#include <opensm/osm_log.h>
65#include <opensm/osm_node.h>
66#include <opensm/osm_opensm.h>
67
68#define OSM_PERFMGR_INITIAL_TID_VALUE 0xcafe
69
70#if ENABLE_OSM_PERF_MGR_PROFILE
/*
 * Accumulated timing statistics for PerfMgr MADs, in microseconds.
 * Only compiled when ENABLE_OSM_PERF_MGR_PROFILE is set.
 *
 * The initial values are chosen so that the first sample recorded by
 * update_mad_stats() replaces both extremes.
 */
struct {
	double fastest_us;	/* smallest sample seen so far */
	double slowest_us;	/* largest sample seen so far */
	double avg_us;		/* running mean of all samples */
	uint64_t num;		/* number of samples folded into avg_us */
} perfmgr_mad_stats = {
	/* C99 designated initializers; the "field:" form is an obsolete
	 * GNU extension. */
	.fastest_us = DBL_MAX,
	.slowest_us = DBL_MIN,
	.avg_us = 0,
	.num = 0
};
82
83/* diff must be something which can fit in a susecond_t */
84static inline void update_mad_stats(struct timeval *diff)
85{
86	double new = (diff->tv_sec * 1000000) + diff->tv_usec;
87	if (new < perfmgr_mad_stats.fastest_us)
88		perfmgr_mad_stats.fastest_us = new;
89	if (new > perfmgr_mad_stats.slowest_us)
90		perfmgr_mad_stats.slowest_us = new;
91
92	perfmgr_mad_stats.avg_us =
93	    ((perfmgr_mad_stats.avg_us * perfmgr_mad_stats.num) + new)
94	    / (perfmgr_mad_stats.num + 1);
95	perfmgr_mad_stats.num++;
96}
97
98static inline void perfmgr_clear_mad_stats(void)
99{
100	perfmgr_mad_stats.fastest_us = DBL_MAX;
101	perfmgr_mad_stats.slowest_us = DBL_MIN;
102	perfmgr_mad_stats.avg_us = 0;
103	perfmgr_mad_stats.num = 0;
104}
105
/*
 * Store (*after - *before) in *diff.
 *
 * The result is computed in a local before anything is written, so diff
 * may point at the same struct as after.
 */
static inline void diff_time(struct timeval *before,
			     struct timeval *after, struct timeval *diff)
{
	struct timeval delta;

	delta.tv_sec = after->tv_sec - before->tv_sec;
	delta.tv_usec = after->tv_usec - before->tv_usec;

	/* borrow one second when the microsecond part went negative */
	if (delta.tv_usec < 0) {
		delta.tv_sec -= 1;
		delta.tv_usec += 1000000;
	}

	diff->tv_sec = delta.tv_sec;
	diff->tv_usec = delta.tv_usec;
}
118
119#endif
120
121extern int wait_for_pending_transactions(osm_stats_t * stats);
122
123/**********************************************************************
124 * Internal helper functions.
125 **********************************************************************/
126static inline void __init_monitored_nodes(osm_perfmgr_t * pm)
127{
128	cl_qmap_init(&pm->monitored_map);
129	pm->remove_list = NULL;
130	cl_event_construct(&pm->sig_query);
131	cl_event_init(&pm->sig_query, FALSE);
132}
133
134static inline void
135__mark_for_removal(osm_perfmgr_t * pm, __monitored_node_t * node)
136{
137	if (pm->remove_list) {
138		node->next = pm->remove_list;
139		pm->remove_list = node;
140	} else {
141		node->next = NULL;
142		pm->remove_list = node;
143	}
144}
145
146static inline void __remove_marked_nodes(osm_perfmgr_t * pm)
147{
148	while (pm->remove_list) {
149		__monitored_node_t *next = pm->remove_list->next;
150
151		cl_qmap_remove_item(&(pm->monitored_map),
152				    (cl_map_item_t *) (pm->remove_list));
153
154		if (pm->remove_list->name)
155			free(pm->remove_list->name);
156		free(pm->remove_list);
157		pm->remove_list = next;
158	}
159}
160
161static inline void __decrement_outstanding_queries(osm_perfmgr_t * pm)
162{
163	cl_atomic_dec(&(pm->outstanding_queries));
164	cl_event_signal(&(pm->sig_query));
165}
166
167/**********************************************************************
168 * Receive the MAD from the vendor layer and post it for processing by
169 * the dispatcher.
170 **********************************************************************/
171static void
172osm_perfmgr_mad_recv_callback(osm_madw_t * p_madw, void *bind_context,
173			      osm_madw_t * p_req_madw)
174{
175	osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
176
177	OSM_LOG_ENTER(pm->log);
178
179	osm_madw_copy_context(p_madw, p_req_madw);
180	osm_mad_pool_put(pm->mad_pool, p_req_madw);
181
182	__decrement_outstanding_queries(pm);
183
184	/* post this message for later processing. */
185	if (cl_disp_post(pm->pc_disp_h, OSM_MSG_MAD_PORT_COUNTERS,
186			 (void *)p_madw, NULL, NULL) != CL_SUCCESS) {
187		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C01: "
188			"PerfMgr Dispatcher post failed\n");
189		osm_mad_pool_put(pm->mad_pool, p_madw);
190	}
191	OSM_LOG_EXIT(pm->log);
192}
193
194/**********************************************************************
195 * Process MAD send errors.
196 **********************************************************************/
197static void
198osm_perfmgr_mad_send_err_callback(void *bind_context, osm_madw_t * p_madw)
199{
200	osm_perfmgr_t *pm = (osm_perfmgr_t *) bind_context;
201	osm_madw_context_t *context = &(p_madw->context);
202	uint64_t node_guid = context->perfmgr_context.node_guid;
203	uint8_t port = context->perfmgr_context.port;
204	cl_map_item_t *p_node;
205	__monitored_node_t *p_mon_node;
206
207	OSM_LOG_ENTER(pm->log);
208
209	/* go ahead and get the monitored node struct to have the printable
210	 * name if needed in messages
211	 */
212	if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
213	    cl_qmap_end(&(pm->monitored_map))) {
214		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C15: GUID 0x%016"
215			PRIx64 " not found in monitored map\n",
216			node_guid);
217		goto Exit;
218	}
219	p_mon_node = (__monitored_node_t *) p_node;
220
221	OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C02: %s (0x%" PRIx64
222		") port %u\n", p_mon_node->name, p_mon_node->guid, port);
223
224	if (pm->subn->opt.perfmgr_redir && p_madw->status == IB_TIMEOUT) {
225		/* First, find the node in the monitored map */
226		cl_plock_acquire(pm->lock);
227		/* Now, validate port number */
228		if (port > p_mon_node->redir_tbl_size) {
229			cl_plock_release(pm->lock);
230			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
231				"Invalid port num %u for %s (GUID 0x%016"
232				PRIx64 ") num ports %u\n", port, p_mon_node->name,
233				p_mon_node->guid, p_mon_node->redir_tbl_size);
234			goto Exit;
235		}
236		/* Clear redirection info */
237		p_mon_node->redir_port[port].redir_lid = 0;
238		p_mon_node->redir_port[port].redir_qp = 0;
239		cl_plock_release(pm->lock);
240	}
241
242Exit:
243	osm_mad_pool_put(pm->mad_pool, p_madw);
244
245	__decrement_outstanding_queries(pm);
246
247	OSM_LOG_EXIT(pm->log);
248}
249
250/**********************************************************************
251 * Bind the PerfMgr to the vendor layer for MAD sends/receives
252 **********************************************************************/
253ib_api_status_t
254osm_perfmgr_bind(osm_perfmgr_t * const pm, const ib_net64_t port_guid)
255{
256	osm_bind_info_t bind_info;
257	ib_api_status_t status = IB_SUCCESS;
258
259	OSM_LOG_ENTER(pm->log);
260
261	if (pm->bind_handle != OSM_BIND_INVALID_HANDLE) {
262		OSM_LOG(pm->log, OSM_LOG_ERROR,
263			"ERR 4C03: Multiple binds not allowed\n");
264		status = IB_ERROR;
265		goto Exit;
266	}
267
268	bind_info.port_guid = port_guid;
269	bind_info.mad_class = IB_MCLASS_PERF;
270	bind_info.class_version = 1;
271	bind_info.is_responder = FALSE;
272	bind_info.is_report_processor = FALSE;
273	bind_info.is_trap_processor = FALSE;
274	bind_info.recv_q_size = OSM_PM_DEFAULT_QP1_RCV_SIZE;
275	bind_info.send_q_size = OSM_PM_DEFAULT_QP1_SEND_SIZE;
276
277	OSM_LOG(pm->log, OSM_LOG_VERBOSE,
278		"Binding to port GUID 0x%" PRIx64 "\n", cl_ntoh64(port_guid));
279
280	pm->bind_handle = osm_vendor_bind(pm->vendor,
281					  &bind_info,
282					  pm->mad_pool,
283					  osm_perfmgr_mad_recv_callback,
284					  osm_perfmgr_mad_send_err_callback,
285					  pm);
286
287	if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
288		status = IB_ERROR;
289		OSM_LOG(pm->log, OSM_LOG_ERROR,
290			"ERR 4C04: Vendor specific bind failed (%s)\n",
291			ib_get_err_str(status));
292		goto Exit;
293	}
294
295Exit:
296	OSM_LOG_EXIT(pm->log);
297	return (status);
298}
299
300/**********************************************************************
301 * Unbind the PerfMgr from the vendor layer for MAD sends/receives
302 **********************************************************************/
303static void osm_perfmgr_mad_unbind(osm_perfmgr_t * const pm)
304{
305	OSM_LOG_ENTER(pm->log);
306	if (pm->bind_handle == OSM_BIND_INVALID_HANDLE) {
307		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C05: No previous bind\n");
308		goto Exit;
309	}
310	osm_vendor_unbind(pm->bind_handle);
311Exit:
312	OSM_LOG_EXIT(pm->log);
313}
314
315/**********************************************************************
316 * Given a monitored node and a port, return the qp
317 **********************************************************************/
318static ib_net32_t get_qp(__monitored_node_t * mon_node, uint8_t port)
319{
320	ib_net32_t qp = cl_ntoh32(1);
321
322	if (mon_node && mon_node->redir_tbl_size &&
323	    port < mon_node->redir_tbl_size &&
324	    mon_node->redir_port[port].redir_lid &&
325	    mon_node->redir_port[port].redir_qp)
326		qp = mon_node->redir_port[port].redir_qp;
327
328	return qp;
329}
330
331/**********************************************************************
332 * Given a node, a port, and an optional monitored node,
333 * return the appropriate lid to query that port
334 **********************************************************************/
335static ib_net16_t
336get_lid(osm_node_t * p_node, uint8_t port, __monitored_node_t * mon_node)
337{
338	if (mon_node && mon_node->redir_tbl_size &&
339	    port < mon_node->redir_tbl_size &&
340	    mon_node->redir_port[port].redir_lid)
341		return mon_node->redir_port[port].redir_lid;
342
343	switch (p_node->node_info.node_type) {
344	case IB_NODE_TYPE_CA:
345	case IB_NODE_TYPE_ROUTER:
346		return osm_node_get_base_lid(p_node, port);
347	case IB_NODE_TYPE_SWITCH:
348		return osm_node_get_base_lid(p_node, 0);
349	default:
350		return 0;
351	}
352}
353
354/**********************************************************************
355 * Form and send the Port Counters MAD for a single port.
356 **********************************************************************/
357static ib_api_status_t
358osm_perfmgr_send_pc_mad(osm_perfmgr_t * perfmgr, ib_net16_t dest_lid,
359			ib_net32_t dest_qp, uint8_t port, uint8_t mad_method,
360			osm_madw_context_t * const p_context)
361{
362	ib_api_status_t status = IB_SUCCESS;
363	ib_port_counters_t *port_counter = NULL;
364	ib_perfmgt_mad_t *pm_mad = NULL;
365	osm_madw_t *p_madw = NULL;
366
367	OSM_LOG_ENTER(perfmgr->log);
368
369	p_madw =
370	    osm_mad_pool_get(perfmgr->mad_pool, perfmgr->bind_handle,
371			     MAD_BLOCK_SIZE, NULL);
372	if (p_madw == NULL)
373		return (IB_INSUFFICIENT_MEMORY);
374
375	pm_mad = osm_madw_get_perfmgt_mad_ptr(p_madw);
376
377	/* build the mad */
378	pm_mad->header.base_ver = 1;
379	pm_mad->header.mgmt_class = IB_MCLASS_PERF;
380	pm_mad->header.class_ver = 1;
381	pm_mad->header.method = mad_method;
382	pm_mad->header.status = 0;
383	pm_mad->header.class_spec = 0;
384	pm_mad->header.trans_id =
385	    cl_hton64((uint64_t) cl_atomic_inc(&(perfmgr->trans_id)));
386	pm_mad->header.attr_id = IB_MAD_ATTR_PORT_CNTRS;
387	pm_mad->header.resv = 0;
388	pm_mad->header.attr_mod = 0;
389
390	port_counter = (ib_port_counters_t *) & (pm_mad->data);
391	memset(port_counter, 0, sizeof(*port_counter));
392	port_counter->port_select = port;
393	port_counter->counter_select = 0xFFFF;
394
395	p_madw->mad_addr.dest_lid = dest_lid;
396	p_madw->mad_addr.addr_type.gsi.remote_qp = dest_qp;
397	p_madw->mad_addr.addr_type.gsi.remote_qkey =
398	    cl_hton32(IB_QP1_WELL_KNOWN_Q_KEY);
399	/* FIXME what about other partitions */
400	p_madw->mad_addr.addr_type.gsi.pkey_ix = 0;
401	p_madw->mad_addr.addr_type.gsi.service_level = 0;
402	p_madw->mad_addr.addr_type.gsi.global_route = FALSE;
403	p_madw->resp_expected = TRUE;
404
405	if (p_context)
406		p_madw->context = *p_context;
407
408	status = osm_vendor_send(perfmgr->bind_handle, p_madw, TRUE);
409
410	if (status == IB_SUCCESS) {
411		/* pause this thread if we have too many outstanding requests */
412		cl_atomic_inc(&(perfmgr->outstanding_queries));
413		if (perfmgr->outstanding_queries >
414		    perfmgr->max_outstanding_queries) {
415			perfmgr->sweep_state = PERFMGR_SWEEP_SUSPENDED;
416			cl_event_wait_on(&perfmgr->sig_query, EVENT_NO_TIMEOUT,
417					 TRUE);
418			perfmgr->sweep_state = PERFMGR_SWEEP_ACTIVE;
419		}
420	}
421
422	OSM_LOG_EXIT(perfmgr->log);
423	return (status);
424}
425
426/**********************************************************************
427 * sweep the node_guid_tbl and collect the node guids to be tracked
428 **********************************************************************/
429static void __collect_guids(cl_map_item_t * const p_map_item, void *context)
430{
431	osm_node_t *node = (osm_node_t *) p_map_item;
432	uint64_t node_guid = cl_ntoh64(node->node_info.node_guid);
433	osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
434	__monitored_node_t *mon_node = NULL;
435	uint32_t size;
436
437	OSM_LOG_ENTER(pm->log);
438
439	if (cl_qmap_get(&(pm->monitored_map), node_guid)
440	    == cl_qmap_end(&(pm->monitored_map))) {
441		/* if not already in our map add it */
442		size = node->node_info.num_ports;
443		mon_node = malloc(sizeof(*mon_node) + sizeof(redir_t) * size);
444		if (!mon_node) {
445			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C06: "
446				"malloc failed: not handling node %s"
447				"(GUID 0x%" PRIx64 ")\n", node->print_desc, node_guid);
448			goto Exit;
449		}
450		memset(mon_node, 0, sizeof(*mon_node) + sizeof(redir_t) * size);
451		mon_node->guid = node_guid;
452		mon_node->name = strdup(node->print_desc);
453		mon_node->redir_tbl_size = size + 1;
454		cl_qmap_insert(&(pm->monitored_map), node_guid,
455			       (cl_map_item_t *) mon_node);
456	}
457
458Exit:
459	OSM_LOG_EXIT(pm->log);
460}
461
462/**********************************************************************
463 * query the Port Counters of all the nodes in the subnet.
464 **********************************************************************/
465static void
466__osm_perfmgr_query_counters(cl_map_item_t * const p_map_item, void *context)
467{
468	ib_api_status_t status = IB_SUCCESS;
469	uint8_t port = 0, startport = 1;
470	osm_perfmgr_t *pm = (osm_perfmgr_t *) context;
471	osm_node_t *node = NULL;
472	__monitored_node_t *mon_node = (__monitored_node_t *) p_map_item;
473	osm_madw_context_t mad_context;
474	uint8_t num_ports = 0;
475	uint64_t node_guid = 0;
476	ib_net32_t remote_qp;
477
478	OSM_LOG_ENTER(pm->log);
479
480	cl_plock_acquire(pm->lock);
481	node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
482	if (!node) {
483		OSM_LOG(pm->log, OSM_LOG_ERROR,
484			"ERR 4C07: Node \"%s\" (guid 0x%" PRIx64
485			") no longer exists so removing from PerfMgr monitoring\n",
486			mon_node->name, mon_node->guid);
487		__mark_for_removal(pm, mon_node);
488		goto Exit;
489	}
490
491	num_ports = osm_node_get_num_physp(node);
492	node_guid = cl_ntoh64(node->node_info.node_guid);
493
494	/* make sure we have a database object ready to store this information */
495	if (perfmgr_db_create_entry(pm->db, node_guid, num_ports,
496				    node->print_desc) !=
497	    PERFMGR_EVENT_DB_SUCCESS) {
498		OSM_LOG(pm->log, OSM_LOG_ERROR,
499			"ERR 4C08: DB create entry failed for 0x%"
500			PRIx64 " (%s) : %s\n", node_guid, node->print_desc,
501			strerror(errno));
502		goto Exit;
503	}
504
505	/* if switch, check for enhanced port 0 */
506	if (osm_node_get_type(node) == IB_NODE_TYPE_SWITCH &&
507	    node->sw &&
508	    ib_switch_info_is_enhanced_port0(&node->sw->switch_info))
509		startport = 0;
510
511	/* issue the query for each port */
512	for (port = startport; port < num_ports; port++) {
513		ib_net16_t lid;
514
515		if (!osm_node_get_physp_ptr(node, port))
516			continue;
517
518		lid = get_lid(node, port, mon_node);
519		if (lid == 0) {
520			OSM_LOG(pm->log, OSM_LOG_DEBUG, "WARN: node 0x%" PRIx64
521				" port %d (%s): port out of range, skipping\n",
522				cl_ntoh64(node->node_info.node_guid), port,
523				node->print_desc);
524			continue;
525		}
526
527		remote_qp = get_qp(mon_node, port);
528
529		mad_context.perfmgr_context.node_guid = node_guid;
530		mad_context.perfmgr_context.port = port;
531		mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_GET;
532#if ENABLE_OSM_PERF_MGR_PROFILE
533		gettimeofday(&(mad_context.perfmgr_context.query_start), NULL);
534#endif
535		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Getting stats for node 0x%"
536			PRIx64 " port %d (lid %u) (%s)\n", node_guid, port,
537			cl_ntoh16(lid), node->print_desc);
538		status =
539		    osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
540					    IB_MAD_METHOD_GET, &mad_context);
541		if (status != IB_SUCCESS)
542			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C09: "
543				"Failed to issue port counter query for node 0x%"
544				PRIx64 " port %d (%s)\n",
545				node->node_info.node_guid, port,
546				node->print_desc);
547	}
548Exit:
549	cl_plock_release(pm->lock);
550	OSM_LOG_EXIT(pm->log);
551}
552
553/**********************************************************************
554 * Discovery stuff.
555 * Basically this code should not be here, but merged with main OpenSM
556 **********************************************************************/
557extern void osm_drop_mgr_process(IN osm_sm_t *sm);
558
559static int sweep_hop_1(osm_sm_t * sm)
560{
561	ib_api_status_t status = IB_SUCCESS;
562	osm_bind_handle_t h_bind;
563	osm_madw_context_t context;
564	osm_node_t *p_node;
565	osm_port_t *p_port;
566	osm_physp_t *p_physp;
567	osm_dr_path_t *p_dr_path;
568	osm_dr_path_t hop_1_path;
569	ib_net64_t port_guid;
570	uint8_t port_num;
571	uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
572	uint8_t num_ports;
573	osm_physp_t *p_ext_physp;
574
575	port_guid = sm->p_subn->sm_port_guid;
576
577	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
578	if (!p_port) {
579		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
580			"ERR 4C81: No SM port object\n");
581		return -1;
582	}
583
584	p_node = p_port->p_node;
585	port_num = ib_node_info_get_local_port_num(&p_node->node_info);
586
587	OSM_LOG(sm->p_log, OSM_LOG_DEBUG,
588		"Probing hop 1 on local port %u\n", port_num);
589
590	p_physp = osm_node_get_physp_ptr(p_node, port_num);
591
592	CL_ASSERT(p_physp);
593
594	p_dr_path = osm_physp_get_dr_path_ptr(p_physp);
595	h_bind = osm_dr_path_get_bind_handle(p_dr_path);
596
597	CL_ASSERT(h_bind != OSM_BIND_INVALID_HANDLE);
598
599	memset(path_array, 0, sizeof(path_array));
600	/* the hop_1 operations depend on the type of our node.
601	 * Currently - legal nodes that can host SM are SW and CA */
602	switch (osm_node_get_type(p_node)) {
603	case IB_NODE_TYPE_CA:
604	case IB_NODE_TYPE_ROUTER:
605		memset(&context, 0, sizeof(context));
606		context.ni_context.node_guid = osm_node_get_node_guid(p_node);
607		context.ni_context.port_num = port_num;
608
609		path_array[1] = port_num;
610
611		osm_dr_path_init(&hop_1_path, h_bind, 1, path_array);
612		status = osm_req_get(sm, &hop_1_path,
613				     IB_MAD_ATTR_NODE_INFO, 0,
614				     CL_DISP_MSGID_NONE, &context);
615
616		if (status != IB_SUCCESS)
617			OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C82: "
618				"Request for NodeInfo failed\n");
619		break;
620
621	case IB_NODE_TYPE_SWITCH:
622		/* Need to go over all the ports of the switch, and send a node_info
623		 * from them. This doesn't include the port 0 of the switch, which
624		 * hosts the SM.
625		 * Note: We'll send another switchInfo on port 0, since if no ports
626		 * are connected, we still want to get some response, and have the
627		 * subnet come up.
628		 */
629		num_ports = osm_node_get_num_physp(p_node);
630		for (port_num = 0; port_num < num_ports; port_num++) {
631			/* go through the port only if the port is not DOWN */
632			p_ext_physp = osm_node_get_physp_ptr(p_node, port_num);
633			if (!p_ext_physp || ib_port_info_get_port_state
634			    (&p_ext_physp->port_info) <= IB_LINK_DOWN)
635				continue;
636
637			memset(&context, 0, sizeof(context));
638			context.ni_context.node_guid =
639			    osm_node_get_node_guid(p_node);
640			context.ni_context.port_num = port_num;
641
642			path_array[1] = port_num;
643
644			osm_dr_path_init(&hop_1_path, h_bind, 1, path_array);
645			status = osm_req_get(sm, &hop_1_path,
646					     IB_MAD_ATTR_NODE_INFO, 0,
647					     CL_DISP_MSGID_NONE, &context);
648
649			if (status != IB_SUCCESS)
650				OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C82: "
651					"Request for NodeInfo failed\n");
652		}
653		break;
654
655	default:
656		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
657			"ERR 4C83: Unknown node type %d\n",
658			osm_node_get_type(p_node));
659	}
660
661	return (status);
662}
663
664static unsigned is_sm_port_down(osm_sm_t * const sm)
665{
666	ib_net64_t port_guid;
667	osm_port_t *p_port;
668
669	port_guid = sm->p_subn->sm_port_guid;
670	if (port_guid == 0)
671		return 1;
672
673	CL_PLOCK_ACQUIRE(sm->p_lock);
674	p_port = osm_get_port_by_guid(sm->p_subn, port_guid);
675	if (!p_port) {
676		CL_PLOCK_RELEASE(sm->p_lock);
677		OSM_LOG(sm->p_log, OSM_LOG_ERROR, "ERR 4C85: "
678			"SM port with GUID:%016" PRIx64 " is unknown\n",
679			cl_ntoh64(port_guid));
680		return 1;
681	}
682	CL_PLOCK_RELEASE(sm->p_lock);
683
684	return osm_physp_get_port_state(p_port->p_physp) == IB_LINK_DOWN;
685}
686
687static int sweep_hop_0(osm_sm_t * const sm)
688{
689	ib_api_status_t status;
690	osm_dr_path_t dr_path;
691	osm_bind_handle_t h_bind;
692	uint8_t path_array[IB_SUBNET_PATH_HOPS_MAX];
693
694	memset(path_array, 0, sizeof(path_array));
695
696	h_bind = osm_sm_mad_ctrl_get_bind_handle(&sm->mad_ctrl);
697	if (h_bind == OSM_BIND_INVALID_HANDLE) {
698		OSM_LOG(sm->p_log, OSM_LOG_DEBUG, "No bound ports.\n");
699		return -1;
700	}
701
702	osm_dr_path_init(&dr_path, h_bind, 0, path_array);
703	status = osm_req_get(sm, &dr_path, IB_MAD_ATTR_NODE_INFO, 0,
704			     CL_DISP_MSGID_NONE, NULL);
705
706	if (status != IB_SUCCESS)
707		OSM_LOG(sm->p_log, OSM_LOG_ERROR,
708			"ERR 4C86: Request for NodeInfo failed\n");
709
710	return (status);
711}
712
713static void reset_node_count(cl_map_item_t * const p_map_item, void *cxt)
714{
715	osm_node_t *p_node = (osm_node_t *) p_map_item;
716	p_node->discovery_count = 0;
717}
718
719static void reset_port_count(cl_map_item_t * const p_map_item, void *cxt)
720{
721	osm_port_t *p_port = (osm_port_t *) p_map_item;
722	p_port->discovery_count = 0;
723}
724
725static void reset_switch_count(cl_map_item_t * const p_map_item, void *cxt)
726{
727	osm_switch_t *p_sw = (osm_switch_t *) p_map_item;
728	p_sw->discovery_count = 0;
729	p_sw->need_update = 0;
730}
731
732static int perfmgr_discovery(osm_opensm_t * osm)
733{
734	int ret;
735
736	CL_PLOCK_ACQUIRE(&osm->lock);
737	cl_qmap_apply_func(&osm->subn.node_guid_tbl, reset_node_count, NULL);
738	cl_qmap_apply_func(&osm->subn.port_guid_tbl, reset_port_count, NULL);
739	cl_qmap_apply_func(&osm->subn.sw_guid_tbl, reset_switch_count, NULL);
740	CL_PLOCK_RELEASE(&osm->lock);
741
742	osm->subn.in_sweep_hop_0 = TRUE;
743
744	ret = sweep_hop_0(&osm->sm);
745	if (ret)
746		goto _exit;
747
748	if (wait_for_pending_transactions(&osm->stats))
749		goto _exit;
750
751	if (is_sm_port_down(&osm->sm)) {
752		OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "SM port is down\n");
753		goto _drop;
754	}
755
756	osm->subn.in_sweep_hop_0 = FALSE;
757
758	ret = sweep_hop_1(&osm->sm);
759	if (ret)
760		goto _exit;
761
762	if (wait_for_pending_transactions(&osm->stats))
763		goto _exit;
764
765_drop:
766	osm_drop_mgr_process(&osm->sm);
767
768_exit:
769	return ret;
770}
771
772/**********************************************************************
773 * Main PerfMgr processor - query the performance counters.
774 **********************************************************************/
775void osm_perfmgr_process(osm_perfmgr_t * pm)
776{
777#if ENABLE_OSM_PERF_MGR_PROFILE
778	struct timeval before, after;
779#endif
780
781	if (pm->state != PERFMGR_STATE_ENABLED)
782		return;
783
784	if (pm->subn->sm_state == IB_SMINFO_STATE_STANDBY ||
785	    pm->subn->sm_state == IB_SMINFO_STATE_NOTACTIVE)
786		perfmgr_discovery(pm->subn->p_osm);
787
788#if ENABLE_OSM_PERF_MGR_PROFILE
789	gettimeofday(&before, NULL);
790#endif
791	pm->sweep_state = PERFMGR_SWEEP_ACTIVE;
792	/* With the global lock held collect the node guids */
793	/* FIXME we should be able to track SA notices
794	 * and not have to sweep the node_guid_tbl each pass
795	 */
796	OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Gathering PerfMgr stats\n");
797	cl_plock_acquire(pm->lock);
798	cl_qmap_apply_func(&(pm->subn->node_guid_tbl),
799			   __collect_guids, (void *)pm);
800	cl_plock_release(pm->lock);
801
802	/* then for each node query their counters */
803	cl_qmap_apply_func(&(pm->monitored_map),
804			   __osm_perfmgr_query_counters, (void *)pm);
805
806	/* Clean out any nodes found to be removed during the
807	 * sweep
808	 */
809	__remove_marked_nodes(pm);
810
811#if ENABLE_OSM_PERF_MGR_PROFILE
812	/* spin on outstanding queries */
813	while (pm->outstanding_queries > 0)
814		cl_event_wait_on(&pm->sig_sweep, 1000, TRUE);
815
816	gettimeofday(&after, NULL);
817	diff_time(&before, &after, &after);
818	osm_log(pm->log, OSM_LOG_INFO,
819		"PerfMgr total sweep time : %ld.%06ld s\n"
820		"        fastest mad      : %g us\n"
821		"        slowest mad      : %g us\n"
822		"        average mad      : %g us\n",
823		after.tv_sec, after.tv_usec,
824		perfmgr_mad_stats.fastest_us,
825		perfmgr_mad_stats.slowest_us, perfmgr_mad_stats.avg_us);
826	perfmgr_clear_mad_stats();
827#endif
828
829	pm->sweep_state = PERFMGR_SWEEP_SLEEP;
830}
831
832/**********************************************************************
833 * PerfMgr timer - loop continuously and signal SM to run PerfMgr
834 * processor.
835 **********************************************************************/
836static void perfmgr_sweep(void *arg)
837{
838	osm_perfmgr_t *pm = arg;
839
840	if (pm->state == PERFMGR_STATE_ENABLED)
841		osm_sm_signal(pm->sm, OSM_SIGNAL_PERFMGR_SWEEP);
842	cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
843}
844
845/**********************************************************************
846 **********************************************************************/
847void osm_perfmgr_shutdown(osm_perfmgr_t * const pm)
848{
849	OSM_LOG_ENTER(pm->log);
850	cl_timer_stop(&pm->sweep_timer);
851	osm_perfmgr_mad_unbind(pm);
852	OSM_LOG_EXIT(pm->log);
853}
854
855/**********************************************************************
856 **********************************************************************/
857void osm_perfmgr_destroy(osm_perfmgr_t * const pm)
858{
859	OSM_LOG_ENTER(pm->log);
860	perfmgr_db_destroy(pm->db);
861	cl_timer_destroy(&pm->sweep_timer);
862	OSM_LOG_EXIT(pm->log);
863}
864
865/**********************************************************************
866 * Detect if someone else on the network could have cleared the counters
867 * without us knowing.  This is easy to detect because the counters never wrap
868 * but are "sticky"
869 *
870 * The one time this will not work is if the port is getting errors fast enough
871 * to have the reading overtake the previous reading.  In this case counters
872 * will be missed.
873 **********************************************************************/
874static void
875osm_perfmgr_check_oob_clear(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
876			    uint8_t port, perfmgr_db_err_reading_t * cr,
877			    perfmgr_db_data_cnt_reading_t * dc)
878{
879	perfmgr_db_err_reading_t prev_err;
880	perfmgr_db_data_cnt_reading_t prev_dc;
881
882	if (perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_err)
883	    != PERFMGR_EVENT_DB_SUCCESS) {
884		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
885			"error reading for %s (guid 0x%" PRIx64 ") port %u\n",
886			mon_node->name, mon_node->guid, port);
887		return;
888	}
889
890	if (cr->symbol_err_cnt < prev_err.symbol_err_cnt ||
891	    cr->link_err_recover < prev_err.link_err_recover ||
892	    cr->link_downed < prev_err.link_downed ||
893	    cr->rcv_err < prev_err.rcv_err ||
894	    cr->rcv_rem_phys_err < prev_err.rcv_rem_phys_err ||
895	    cr->rcv_switch_relay_err < prev_err.rcv_switch_relay_err ||
896	    cr->xmit_discards < prev_err.xmit_discards ||
897	    cr->xmit_constraint_err < prev_err.xmit_constraint_err ||
898	    cr->rcv_constraint_err < prev_err.rcv_constraint_err ||
899	    cr->link_integrity < prev_err.link_integrity ||
900	    cr->buffer_overrun < prev_err.buffer_overrun ||
901	    cr->vl15_dropped < prev_err.vl15_dropped) {
902		OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C0A: "
903			"Detected an out of band error clear "
904			"on %s (0x%" PRIx64 ") port %u\n",
905			mon_node->name, mon_node->guid, port);
906		perfmgr_db_clear_prev_err(pm->db, mon_node->guid, port);
907	}
908
909	/* FIXME handle extended counters */
910	if (perfmgr_db_get_prev_dc(pm->db, mon_node->guid, port, &prev_dc)
911	    != PERFMGR_EVENT_DB_SUCCESS) {
912		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
913			"Failed to find previous data count "
914			"reading for %s (0x%" PRIx64 ") port %u\n",
915			mon_node->name, mon_node->guid, port);
916		return;
917	}
918
919	if (dc->xmit_data < prev_dc.xmit_data ||
920	    dc->rcv_data < prev_dc.rcv_data ||
921	    dc->xmit_pkts < prev_dc.xmit_pkts ||
922	    dc->rcv_pkts < prev_dc.rcv_pkts) {
923		OSM_LOG(pm->log, OSM_LOG_ERROR,
924			"PerfMgr: ERR 4C0B: Detected an out of band data counter "
925			"clear on node %s (0x%" PRIx64 ") port %u\n",
926			mon_node->name, mon_node->guid, port);
927		perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
928	}
929}
930
931/**********************************************************************
932 * Return 1 if the value is "close" to overflowing
933 **********************************************************************/
/* Near-overflow test for a counter passed in a uint8_t: 1 once val
 * reaches 10, else 0. */
static int counter_overflow_4(uint8_t val)
{
	return (val < 10) ? 0 : 1;
}
938
/* Near-overflow test for an 8-bit counter: 1 once val is within the top
 * quarter of its range (UINT8_MAX - UINT8_MAX / 4 and above), else 0. */
static int counter_overflow_8(uint8_t val)
{
	const int threshold = UINT8_MAX - (UINT8_MAX / 4);

	return (val >= threshold);
}
943
944static int counter_overflow_16(ib_net16_t val)
945{
946	return (cl_ntoh16(val) >= (UINT16_MAX - (UINT16_MAX / 4)));
947}
948
949static int counter_overflow_32(ib_net32_t val)
950{
951	return (cl_ntoh32(val) >= (UINT32_MAX - (UINT32_MAX / 4)));
952}
953
954/**********************************************************************
955 * Check if the port counters have overflowed and if so issue a clear
956 * MAD to the port.
957 **********************************************************************/
958static void
959osm_perfmgr_check_overflow(osm_perfmgr_t * pm, __monitored_node_t *mon_node,
960			   uint8_t port, ib_port_counters_t * pc)
961{
962	osm_madw_context_t mad_context;
963	ib_api_status_t status;
964	ib_net32_t remote_qp;
965
966	OSM_LOG_ENTER(pm->log);
967
968	if (counter_overflow_16(pc->symbol_err_cnt) ||
969	    counter_overflow_8(pc->link_err_recover) ||
970	    counter_overflow_8(pc->link_downed) ||
971	    counter_overflow_16(pc->rcv_err) ||
972	    counter_overflow_16(pc->rcv_rem_phys_err) ||
973	    counter_overflow_16(pc->rcv_switch_relay_err) ||
974	    counter_overflow_16(pc->xmit_discards) ||
975	    counter_overflow_8(pc->xmit_constraint_err) ||
976	    counter_overflow_8(pc->rcv_constraint_err) ||
977	    counter_overflow_4(PC_LINK_INT(pc->link_int_buffer_overrun)) ||
978	    counter_overflow_4(PC_BUF_OVERRUN(pc->link_int_buffer_overrun)) ||
979	    counter_overflow_16(pc->vl15_dropped) ||
980	    counter_overflow_32(pc->xmit_data) ||
981	    counter_overflow_32(pc->rcv_data) ||
982	    counter_overflow_32(pc->xmit_pkts) ||
983	    counter_overflow_32(pc->rcv_pkts)) {
984		osm_node_t *p_node = NULL;
985		ib_net16_t lid = 0;
986
987		osm_log(pm->log, OSM_LOG_VERBOSE,
988			"PerfMgr: Counter overflow: %s (0x%" PRIx64
989			") port %d; clearing counters\n",
990			mon_node->name, mon_node->guid, port);
991
992		cl_plock_acquire(pm->lock);
993		p_node = osm_get_node_by_guid(pm->subn, cl_hton64(mon_node->guid));
994		lid = get_lid(p_node, port, mon_node);
995		cl_plock_release(pm->lock);
996		if (lid == 0) {
997			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C0C: "
998				"Failed to clear counters for %s (0x%"
999				PRIx64 ") port %d; failed to get lid\n",
1000				mon_node->name, mon_node->guid, port);
1001			goto Exit;
1002		}
1003
1004		remote_qp = get_qp(NULL, port);
1005
1006		mad_context.perfmgr_context.node_guid = mon_node->guid;
1007		mad_context.perfmgr_context.port = port;
1008		mad_context.perfmgr_context.mad_method = IB_MAD_METHOD_SET;
1009		/* clear port counters */
1010		status =
1011		    osm_perfmgr_send_pc_mad(pm, lid, remote_qp, port,
1012					    IB_MAD_METHOD_SET, &mad_context);
1013		if (status != IB_SUCCESS)
1014			OSM_LOG(pm->log, OSM_LOG_ERROR, "PerfMgr: ERR 4C11: "
1015				"Failed to send clear counters MAD for %s (0x%"
1016				PRIx64 ") port %d\n",
1017				mon_node->name, mon_node->guid, port);
1018
1019		perfmgr_db_clear_prev_dc(pm->db, mon_node->guid, port);
1020	}
1021
1022Exit:
1023	OSM_LOG_EXIT(pm->log);
1024}
1025
1026/**********************************************************************
1027 * Check values for logging of errors
1028 **********************************************************************/
1029static void
1030osm_perfmgr_log_events(osm_perfmgr_t * pm, __monitored_node_t *mon_node, uint8_t port,
1031		       perfmgr_db_err_reading_t * reading)
1032{
1033	perfmgr_db_err_reading_t prev_read;
1034	time_t time_diff = 0;
1035	perfmgr_db_err_t err =
1036	    perfmgr_db_get_prev_err(pm->db, mon_node->guid, port, &prev_read);
1037
1038	if (err != PERFMGR_EVENT_DB_SUCCESS) {
1039		OSM_LOG(pm->log, OSM_LOG_VERBOSE, "Failed to find previous "
1040			"reading for %s (0x%" PRIx64 ") port %u\n",
1041			mon_node->name, mon_node->guid, port);
1042		return;
1043	}
1044	time_diff = (reading->time - prev_read.time);
1045
1046	/* FIXME these events should be defineable by the user in a config
1047	 * file somewhere. */
1048	if (reading->symbol_err_cnt > prev_read.symbol_err_cnt)
1049		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0D: "
1050			"Found %" PRIu64 " Symbol errors in %lu sec on %s (0x%"
1051			PRIx64 ") port %u\n",
1052			(reading->symbol_err_cnt - prev_read.symbol_err_cnt),
1053			time_diff, mon_node->name, mon_node->guid, port);
1054
1055	if (reading->rcv_err > prev_read.rcv_err)
1056		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0E: "
1057			"Found %" PRIu64
1058			" Receive errors in %lu sec on %s (0x%" PRIx64
1059			") port %u\n", (reading->rcv_err - prev_read.rcv_err),
1060			time_diff, mon_node->name, mon_node->guid, port);
1061
1062	if (reading->xmit_discards > prev_read.xmit_discards)
1063		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C0F: "
1064			"Found %" PRIu64 " Xmit Discards in %lu sec on %s (0x%"
1065			PRIx64 ") port %u\n",
1066			(reading->xmit_discards - prev_read.xmit_discards),
1067			time_diff, mon_node->name, mon_node->guid, port);
1068}
1069
1070/**********************************************************************
1071 * The dispatcher uses a thread pool which will call this function when
1072 * we have a thread available to process our mad received from the wire.
1073 **********************************************************************/
1074static void osm_pc_rcv_process(void *context, void *data)
1075{
1076	osm_perfmgr_t *const pm = (osm_perfmgr_t *) context;
1077	osm_madw_t *p_madw = (osm_madw_t *) data;
1078	osm_madw_context_t *mad_context = &(p_madw->context);
1079	ib_port_counters_t *wire_read =
1080	    (ib_port_counters_t *) & (osm_madw_get_perfmgt_mad_ptr(p_madw)->
1081				      data);
1082	ib_mad_t *p_mad = osm_madw_get_mad_ptr(p_madw);
1083	uint64_t node_guid = mad_context->perfmgr_context.node_guid;
1084	uint8_t port = mad_context->perfmgr_context.port;
1085	perfmgr_db_err_reading_t err_reading;
1086	perfmgr_db_data_cnt_reading_t data_reading;
1087	cl_map_item_t *p_node;
1088	__monitored_node_t *p_mon_node;
1089
1090	OSM_LOG_ENTER(pm->log);
1091
1092	/* go ahead and get the monitored node struct to have the printable
1093	 * name if needed in messages
1094	 */
1095	if ((p_node = cl_qmap_get(&(pm->monitored_map), node_guid)) ==
1096	    cl_qmap_end(&(pm->monitored_map))) {
1097		OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C12: GUID 0x%016"
1098			PRIx64 " not found in monitored map\n",
1099			node_guid);
1100		goto Exit;
1101	}
1102	p_mon_node = (__monitored_node_t *) p_node;
1103
1104	OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1105		"Processing received MAD status 0x%x context 0x%"
1106		PRIx64 " port %u\n", p_mad->status, node_guid, port);
1107
1108	/* Response could also be redirection (IBM eHCA PMA does this) */
1109	if (p_mad->attr_id == IB_MAD_ATTR_CLASS_PORT_INFO) {
1110		char gid_str[INET6_ADDRSTRLEN];
1111		ib_class_port_info_t *cpi =
1112		    (ib_class_port_info_t *) &
1113		    (osm_madw_get_perfmgt_mad_ptr(p_madw)->data);
1114		ib_api_status_t status;
1115
1116		OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1117			"Redirection to LID %u GID %s QP 0x%x received\n",
1118			cl_ntoh16(cpi->redir_lid),
1119			inet_ntop(AF_INET6, cpi->redir_gid.raw, gid_str,
1120				  sizeof gid_str),
1121			cl_ntoh32(cpi->redir_qp));
1122
1123		/* LID or GID redirection ? */
1124		/* For GID redirection, need to get PathRecord from SA */
1125		if (cpi->redir_lid == 0) {
1126			OSM_LOG(pm->log, OSM_LOG_VERBOSE,
1127				"GID redirection not currently implemented!\n");
1128			goto Exit;
1129		}
1130
1131		if (!pm->subn->opt.perfmgr_redir) {
1132				OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C16: "
1133				       "redirection requested but disabled\n");
1134			goto Exit;
1135		}
1136
1137		/* LID redirection support (easier than GID redirection) */
1138		cl_plock_acquire(pm->lock);
1139		/* Now, validate port number */
1140		if (port > p_mon_node->redir_tbl_size) {
1141			cl_plock_release(pm->lock);
1142			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C13: "
1143				"Invalid port num %d for GUID 0x%016"
1144				PRIx64 " num ports %d\n", port, node_guid,
1145				p_mon_node->redir_tbl_size);
1146			goto Exit;
1147		}
1148		p_mon_node->redir_port[port].redir_lid = cpi->redir_lid;
1149		p_mon_node->redir_port[port].redir_qp = cpi->redir_qp;
1150		cl_plock_release(pm->lock);
1151
1152		/* Finally, reissue the query to the redirected location */
1153		status =
1154		    osm_perfmgr_send_pc_mad(pm, cpi->redir_lid, cpi->redir_qp,
1155					    port,
1156					    mad_context->perfmgr_context.
1157					    mad_method, mad_context);
1158		if (status != IB_SUCCESS)
1159			OSM_LOG(pm->log, OSM_LOG_ERROR, "ERR 4C14: "
1160				"Failed to send redirected MAD with method 0x%x for node 0x%"
1161				PRIx64 " port %d\n",
1162				mad_context->perfmgr_context.mad_method,
1163				node_guid, port);
1164		goto Exit;
1165	}
1166
1167	CL_ASSERT(p_mad->attr_id == IB_MAD_ATTR_PORT_CNTRS);
1168
1169	perfmgr_db_fill_err_read(wire_read, &err_reading);
1170	/* FIXME separate query for extended counters if they are supported
1171	 * on the port.
1172	 */
1173	perfmgr_db_fill_data_cnt_read_pc(wire_read, &data_reading);
1174
1175	/* detect an out of band clear on the port */
1176	if (mad_context->perfmgr_context.mad_method != IB_MAD_METHOD_SET)
1177		osm_perfmgr_check_oob_clear(pm, p_mon_node, port,
1178					    &err_reading, &data_reading);
1179
1180	/* log any critical events from this reading */
1181	osm_perfmgr_log_events(pm, p_mon_node, port, &err_reading);
1182
1183	if (mad_context->perfmgr_context.mad_method == IB_MAD_METHOD_GET) {
1184		perfmgr_db_add_err_reading(pm->db, node_guid, port,
1185					   &err_reading);
1186		perfmgr_db_add_dc_reading(pm->db, node_guid, port,
1187					  &data_reading);
1188	} else {
1189		perfmgr_db_clear_prev_err(pm->db, node_guid, port);
1190		perfmgr_db_clear_prev_dc(pm->db, node_guid, port);
1191	}
1192
1193	osm_perfmgr_check_overflow(pm, p_mon_node, port, wire_read);
1194
1195#if ENABLE_OSM_PERF_MGR_PROFILE
1196	do {
1197		struct timeval proc_time;
1198		gettimeofday(&proc_time, NULL);
1199		diff_time(&(p_madw->context.perfmgr_context.query_start),
1200			  &proc_time, &proc_time);
1201		update_mad_stats(&proc_time);
1202	} while (0);
1203#endif
1204
1205Exit:
1206	osm_mad_pool_put(pm->mad_pool, p_madw);
1207
1208	OSM_LOG_EXIT(pm->log);
1209}
1210
1211/**********************************************************************
1212 * Initialize the PerfMgr object
1213 **********************************************************************/
1214ib_api_status_t
1215osm_perfmgr_init(osm_perfmgr_t * const pm, osm_opensm_t *osm,
1216		 const osm_subn_opt_t * const p_opt)
1217{
1218	ib_api_status_t status = IB_SUCCESS;
1219
1220	OSM_LOG_ENTER(&osm->log);
1221
1222	OSM_LOG(&osm->log, OSM_LOG_VERBOSE, "Initializing PerfMgr\n");
1223
1224	memset(pm, 0, sizeof(*pm));
1225
1226	cl_event_construct(&pm->sig_sweep);
1227	cl_event_init(&pm->sig_sweep, FALSE);
1228	pm->subn = &osm->subn;
1229	pm->sm = &osm->sm;
1230	pm->log = &osm->log;
1231	pm->mad_pool = &osm->mad_pool;
1232	pm->vendor = osm->p_vendor;
1233	pm->trans_id = OSM_PERFMGR_INITIAL_TID_VALUE;
1234	pm->lock = &osm->lock;
1235	pm->state =
1236	    p_opt->perfmgr ? PERFMGR_STATE_ENABLED : PERFMGR_STATE_DISABLE;
1237	pm->sweep_time_s = p_opt->perfmgr_sweep_time_s;
1238	pm->max_outstanding_queries = p_opt->perfmgr_max_outstanding_queries;
1239	pm->osm = osm;
1240
1241	status = cl_timer_init(&pm->sweep_timer, perfmgr_sweep, pm);
1242	if (status != IB_SUCCESS)
1243		goto Exit;
1244
1245	pm->db = perfmgr_db_construct(pm);
1246	if (!pm->db) {
1247		pm->state = PERFMGR_STATE_NO_DB;
1248		goto Exit;
1249	}
1250
1251	pm->pc_disp_h = cl_disp_register(&osm->disp, OSM_MSG_MAD_PORT_COUNTERS,
1252					 osm_pc_rcv_process, pm);
1253	if (pm->pc_disp_h == CL_DISP_INVALID_HANDLE)
1254		goto Exit;
1255
1256	__init_monitored_nodes(pm);
1257
1258	cl_timer_start(&pm->sweep_timer, pm->sweep_time_s * 1000);
1259
1260Exit:
1261	OSM_LOG_EXIT(pm->log);
1262	return (status);
1263}
1264
1265/**********************************************************************
1266 * Clear the counters from the db
1267 **********************************************************************/
1268void osm_perfmgr_clear_counters(osm_perfmgr_t * pm)
1269{
1270	/**
1271	 * FIXME todo issue clear on the fabric?
1272	 */
1273	perfmgr_db_clear_counters(pm->db);
1274	osm_log(pm->log, OSM_LOG_INFO, "PerfMgr counters cleared\n");
1275}
1276
1277/*******************************************************************
1278 * Have the DB dump its information to the file specified
1279 *******************************************************************/
1280void osm_perfmgr_dump_counters(osm_perfmgr_t * pm, perfmgr_db_dump_t dump_type)
1281{
1282	char path[256];
1283	char *file_name;
1284	if (pm->subn->opt.event_db_dump_file)
1285		file_name = pm->subn->opt.event_db_dump_file;
1286	else {
1287		snprintf(path, sizeof(path), "%s/%s",
1288			 pm->subn->opt.dump_files_dir,
1289			 OSM_PERFMGR_DEFAULT_DUMP_FILE);
1290		file_name = path;
1291	}
1292	if (perfmgr_db_dump(pm->db, file_name, dump_type) != 0)
1293		OSM_LOG(pm->log, OSM_LOG_ERROR, "Failed to dump file %s : %s",
1294			file_name, strerror(errno));
1295}
1296
1297/*******************************************************************
1298 * Have the DB print its information to the fp specified
1299 *******************************************************************/
1300void
1301osm_perfmgr_print_counters(osm_perfmgr_t *pm, char *nodename, FILE *fp)
1302{
1303	uint64_t guid = strtoull(nodename, NULL, 0);
1304	if (guid == 0 && errno == EINVAL) {
1305		perfmgr_db_print_by_name(pm->db, nodename, fp);
1306	} else {
1307		perfmgr_db_print_by_guid(pm->db, guid, fp);
1308	}
1309}
1310
1311#endif				/* ENABLE_OSM_PERF_MGR */
1312