1#!/usr/bin/perl
2#
3# Copyright (c) 2006 The Regents of the University of California.
4# Copyright (c) 2006-2008 Voltaire, Inc. All rights reserved.
5#
6# Produced at Lawrence Livermore National Laboratory.
7# Written by Ira Weiny <weiny2@llnl.gov>.
8#            Erez Strauss from Voltaire for help in the get_link_ends code.
9#
10# This software is available to you under a choice of one of two
11# licenses.  You may choose to be licensed under the terms of the GNU
12# General Public License (GPL) Version 2, available from the file
13# COPYING in the main directory of this source tree, or the
14# OpenIB.org BSD license below:
15#
16#     Redistribution and use in source and binary forms, with or
17#     without modification, are permitted provided that the following
18#     conditions are met:
19#
20#      - Redistributions of source code must retain the above
21#        copyright notice, this list of conditions and the following
22#        disclaimer.
23#
24#      - Redistributions in binary form must reproduce the above
25#        copyright notice, this list of conditions and the following
26#        disclaimer in the documentation and/or other materials
27#        provided with the distribution.
28#
29# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
33# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
34# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
35# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
36# SOFTWARE.
37#
38
39use strict;
40
41%IBswcountlimits::cur_counts      = ();
42%IBswcountlimits::new_counts      = ();
43@IBswcountlimits::suppress_errors = ();
44$IBswcountlimits::link_ends       = undef;
45$IBswcountlimits::pause_time      = 10;
46$IBswcountlimits::cache_dir       = "/var/cache/infiniband-diags";
47
48# all the PerfMgt counters
49@IBswcountlimits::counters = (
50	"SymbolErrors",        "LinkRecovers",
51	"LinkDowned",          "RcvErrors",
52	"RcvRemotePhysErrors", "RcvSwRelayErrors",
53	"XmtDiscards",         "XmtConstraintErrors",
54	"RcvConstraintErrors", "LinkIntegrityErrors",
55	"ExcBufOverrunErrors", "VL15Dropped",
56	"XmtData",             "RcvData",
57	"XmtPkts",             "RcvPkts"
58);
59
60# non-critical counters
61%IBswcountlimits::error_counters = (
62	"SymbolErrors",
63"No action is required except if counter is increasing along with LinkRecovers",
64	"LinkRecovers",
65"If this is increasing along with SymbolErrors this may indicate a bad link, run ibswportwatch.pl on this port",
66	"LinkDowned",
67	"Number of times the port has gone down (Usually for valid reasons)",
68	"RcvErrors",
69"This is a bad link, if the link is internal to a 288 try setting SDR, otherwise check the cable",
70	"RcvRemotePhysErrors",
71	"This indicates a problem ELSEWHERE in the fabric.",
72	"XmtDiscards",
73"This is a symptom of congestion and may require tweaking either HOQ or switch lifetime values",
74	"XmtConstraintErrors",
75	"This is a result of bad partitioning, check partition configuration.",
76	"RcvConstraintErrors",
77	"This is a result of bad partitioning, check partition configuration.",
78	"LinkIntegrityErrors",
79	"May indicate a bad link, run ibswportwatch.pl on this port",
80	"ExcBufOverrunErrors",
81"This is a flow control state machine error and can be caused by packets with physical errors",
82	"VL15Dropped",
83	"check with ibswportwatch.pl, if increasing in SMALL increments, OK",
84	"RcvSwRelayErrors",
85	"This counter can increase due to a valid network event"
86);
87
88sub check_counters
89{
90	my $print_action = $_[0];
91	my $actions      = undef;
92
93	COUNTER: foreach my $cnt (keys %IBswcountlimits::error_counters) {
94		if ($IBswcountlimits::cur_counts{$cnt} > 0) {
95			foreach my $sup_cnt (@IBswcountlimits::suppress_errors) {
96				if ("$cnt" eq $sup_cnt) { next COUNTER; }
97			}
98			print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]";
99			if ("$print_action" eq "yes") {
100				$actions = join " ",
101				  (
102					$actions,
103					"         $cnt: $IBswcountlimits::error_counters{$cnt}\n"
104				  );
105			}
106		}
107	}
108
109	if ($actions) {
110		print "\n         Actions:\n$actions";
111	}
112}
113
114# Data counters
115%IBswcountlimits::data_counters = (
116	"XmtData",
117"Total number of data octets, divided by 4, transmitted on all VLs from the port",
118	"RcvData",
119"Total number of data octets, divided by 4, received on all VLs to the port",
120	"XmtPkts",
121"Total number of packets, excluding link packets, transmitted on all VLs from the port",
122	"RcvPkts",
123"Total number of packets, excluding link packets, received on all VLs to the port"
124);
125
126sub check_data_counters
127{
128	my $print_action = $_[0];
129	my $actions      = undef;
130
131	COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) {
132		print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]";
133		if ("$print_action" eq "yes") {
134			$actions = join " ",
135			  (
136				$actions,
137				"         $cnt: $IBswcountlimits::data_counters{$cnt}\n"
138			  );
139		}
140	}
141	if ($actions) {
142		print "\n         Descriptions:\n$actions";
143	}
144}
145
146sub print_data_rates
147{
148	COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) {
149		my $cnt_per_second = calculate_rate(
150			$IBswcountlimits::cur_counts{$cnt},
151			$IBswcountlimits::new_counts{$cnt}
152		);
153		print "   $cnt_per_second $cnt/second\n";
154	}
155}
156
157# =========================================================================
158# Rate dependent counters
159# calculate the count/sec
160# calculate_rate old_count new_count
161sub calculate_rate
162{
163	my $rate    = 0;
164	my $old_val = $_[0];
165	my $new_val = $_[1];
166	my $rate    = ($new_val - $old_val) / $IBswcountlimits::pause_time;
167	return ($rate);
168}
169%IBswcountlimits::rate_dep_thresholds = (
170	"SymbolErrors", 10, "LinkRecovers",        10,
171	"RcvErrors",    10, "LinkIntegrityErrors", 10,
172	"XmtDiscards",  10
173);
174
175sub check_counter_rates
176{
177	foreach my $rate_count (keys %IBswcountlimits::rate_dep_thresholds) {
178		my $rate = calculate_rate(
179			$IBswcountlimits::cur_counts{$rate_count},
180			$IBswcountlimits::new_counts{$rate_count}
181		);
182		if ($rate > $IBswcountlimits::rate_dep_thresholds{$rate_count}) {
183			print "Detected excessive rate for $rate_count ($rate cnts/sec)\n";
184		} elsif ($rate > 0) {
185			print "Detected rate for $rate_count ($rate cnts/sec)\n";
186		}
187	}
188}
189
190# =========================================================================
191#
192sub clear_counters
193{
194	# clear the counters
195	foreach my $count (@IBswcountlimits::counters) {
196		$IBswcountlimits::cur_counts{$count} = 0;
197	}
198}
199
200# =========================================================================
201#
202sub any_counts
203{
204	my $total = 0;
205	my $count = 0;
206	foreach $count (keys %IBswcountlimits::critical) {
207		$total = $total + $IBswcountlimits::cur_counts{$count};
208	}
209	COUNTER: foreach $count (keys %IBswcountlimits::error_counters) {
210		foreach my $sup_cnt (@IBswcountlimits::suppress_errors) {
211			if ("$count" eq $sup_cnt) { next COUNTER; }
212		}
213		$total = $total + $IBswcountlimits::cur_counts{$count};
214	}
215	return ($total);
216}
217
218# =========================================================================
219#
220sub ensure_cache_dir
221{
222	if (!(-d "$IBswcountlimits::cache_dir") &&
223	    !mkdir($IBswcountlimits::cache_dir, 0700)) {
224		die "cannot create $IBswcountlimits::cache_dir: $!\n";
225	}
226}
227
228# =========================================================================
229# get_cache_file(ca_name, ca_port)
230#
231sub get_cache_file
232{
233	my $ca_name = $_[0];
234	my $ca_port = $_[1];
235	ensure_cache_dir;
236	return (
237		"$IBswcountlimits::cache_dir/ibnetdiscover-$ca_name-$ca_port.topology");
238}
239
240# =========================================================================
241# get_ca_name_port_param_string(ca_name, ca_port)
242#
243sub get_ca_name_port_param_string
244{
245	my $ca_name = $_[0];
246	my $ca_port = $_[1];
247
248	if ("$ca_name" ne "") { $ca_name = "-C $ca_name"; }
249	if ("$ca_port" ne "") { $ca_port = "-P $ca_port"; }
250
251	return ("$ca_name $ca_port");
252}
253
254# =========================================================================
255# generate_ibnetdiscover_topology(ca_name, ca_port)
256#
257sub generate_ibnetdiscover_topology
258{
259	my $ca_name      = $_[0];
260	my $ca_port      = $_[1];
261	my $cache_file   = get_cache_file($ca_name, $ca_port);
262	my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port);
263
264	if (`ibnetdiscover -g $extra_params > $cache_file`) {
265		die "Execution of ibnetdiscover failed: $!\n";
266	}
267}
268
269# =========================================================================
270# get_link_ends(regenerate_map, ca_name, ca_port)
271#
272sub get_link_ends
273{
274	my $regenerate_map = $_[0];
275	my $ca_name        = $_[1];
276	my $ca_port        = $_[2];
277
278	my $cache_file = get_cache_file($ca_name, $ca_port);
279
280	if ($regenerate_map || !(-f "$cache_file")) {
281		generate_ibnetdiscover_topology($ca_name, $ca_port);
282	}
283	open IBNET_TOPO, "<$cache_file"
284	  or die "Failed to open ibnet topology: $!\n";
285	my $in_switch  = "no";
286	my $desc       = "";
287	my $guid       = "";
288	my $loc_sw_lid = "";
289
290	my $loc_port = "";
291	my $line     = "";
292
293	while ($line = <IBNET_TOPO>) {
294		if ($line =~ /^Switch.*\"S-(.*)\"\s+#.*\"(.*)\".* lid (\d+).*/) {
295			$guid       = $1;
296			$desc       = $2;
297			$loc_sw_lid = $3;
298			$in_switch  = "yes";
299		}
300		if ($in_switch eq "yes") {
301			my $rec = undef;
302			if ($line =~
303/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
304			  )
305			{
306				$loc_port = $1;
307				my $rem_guid      = $2;
308				my $rem_port      = $3;
309				my $rem_port_guid = $4;
310				my $rem_desc      = $5;
311				my $rem_lid       = $6;
312				$rec = {
313					loc_guid      => "0x$guid",
314					loc_port      => $loc_port,
315					loc_ext_port  => "",
316					loc_desc      => $desc,
317					loc_sw_lid    => $loc_sw_lid,
318					rem_guid      => "0x$rem_guid",
319					rem_lid       => $rem_lid,
320					rem_port      => $rem_port,
321					rem_ext_port  => "",
322					rem_desc      => $rem_desc,
323					rem_port_guid => $rem_port_guid
324				};
325			}
326			if ($line =~
327/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
328			  )
329			{
330				$loc_port = $1;
331				my $loc_ext_port  = $2;
332				my $rem_guid      = $3;
333				my $rem_port      = $4;
334				my $rem_port_guid = $5;
335				my $rem_desc      = $6;
336				my $rem_lid       = $7;
337				$rec = {
338					loc_guid      => "0x$guid",
339					loc_port      => $loc_port,
340					loc_ext_port  => $loc_ext_port,
341					loc_desc      => $desc,
342					loc_sw_lid    => $loc_sw_lid,
343					rem_guid      => "0x$rem_guid",
344					rem_lid       => $rem_lid,
345					rem_port      => $rem_port,
346					rem_ext_port  => "",
347					rem_desc      => $rem_desc,
348					rem_port_guid => $rem_port_guid
349				};
350			}
351			if ($line =~
352/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
353			  )
354			{
355				$loc_port = $1;
356				my $rem_guid      = $2;
357				my $rem_port      = $3;
358				my $rem_ext_port  = $4;
359				my $rem_port_guid = $5;
360				my $rem_desc      = $6;
361				my $rem_lid       = $7;
362				$rec = {
363					loc_guid      => "0x$guid",
364					loc_port      => $loc_port,
365					loc_ext_port  => "",
366					loc_desc      => $desc,
367					loc_sw_lid    => $loc_sw_lid,
368					rem_guid      => "0x$rem_guid",
369					rem_lid       => $rem_lid,
370					rem_port      => $rem_port,
371					rem_ext_port  => $rem_ext_port,
372					rem_desc      => $rem_desc,
373					rem_port_guid => $rem_port_guid
374				};
375			}
376			if ($line =~
377/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/
378			  )
379			{
380				$loc_port = $1;
381				my $loc_ext_port  = $2;
382				my $rem_guid      = $3;
383				my $rem_port      = $4;
384				my $rem_ext_port  = $5;
385				my $rem_port_guid = $6;
386				my $rem_desc      = $7;
387				my $rem_lid       = $8;
388				$rec = {
389					loc_guid      => "0x$guid",
390					loc_port      => $loc_port,
391					loc_ext_port  => $loc_ext_port,
392					loc_desc      => $desc,
393					loc_sw_lid    => $loc_sw_lid,
394					rem_guid      => "0x$rem_guid",
395					rem_lid       => $rem_lid,
396					rem_port      => $rem_port,
397					rem_ext_port  => $rem_ext_port,
398					rem_desc      => $rem_desc,
399					rem_port_guid => $rem_port_guid
400				};
401			}
402			if ($rec) {
403				$rec->{rem_port_guid} =~ s/\((.*)\)/$1/;
404				$IBswcountlimits::link_ends{"0x$guid"}{$loc_port} = $rec;
405			}
406		}
407
408		if ($line =~ /^Ca.*/ || $line =~ /^Rt.*/) { $in_switch = "no"; }
409	}
410	close IBNET_TOPO;
411}
412
413# =========================================================================
414# get_num_ports(switch_guid, ca_name, ca_port)
415#
416sub get_num_ports
417{
418	my $guid         = $_[0];
419	my $ca_name      = $_[1];
420	my $ca_port      = $_[2];
421	my $num_ports    = 0;
422	my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port);
423
424	my $data         = `smpquery $extra_params -G nodeinfo $guid` ||
425		die "'smpquery $extra_params -G nodeinfo $guid' faild\n";
426	my @lines        = split("\n", $data);
427	my $pkt_lifetime = "";
428	foreach my $line (@lines) {
429		if ($line =~ /^NumPorts:\.+(.*)/) { $num_ports = $1; }
430	}
431	return ($num_ports);
432}
433
434# =========================================================================
435# format_guid(guid)
436# The diags store the guids as strings.  This converts the guid supplied
437# to the correct string format.
438# eg: 0x0008f10400411f56 == 0x8f10400411f56
439#
440sub format_guid
441{
442	my $guid     = $_[0];
443	my $guid_str = "";
444
445	$guid =~ tr/[A-F]/[a-f]/;
446	if ($guid =~ /0x(.*)/) {
447		$guid_str = sprintf("0x%016s", $1);
448	} else {
449		$guid_str = sprintf("0x%016s", $guid);
450	}
451	return ($guid_str);
452}
453
454# =========================================================================
455# convert_dr_to_guid(direct_route)
456#
457sub convert_dr_to_guid
458{
459	my $guid = undef;
460
461	my $data = `smpquery nodeinfo -D $_[0]` ||
462		die "'mpquery nodeinfo -D $_[0]' failed\n";
463	my @lines = split("\n", $data);
464	foreach my $line (@lines) {
465		if ($line =~ /^PortGuid:\.+(.*)/) { $guid = $1; }
466	}
467	return format_guid($guid);
468}
469
470# =========================================================================
471# get_node_type(guid_or_direct_route)
472#
473sub get_node_type
474{
475	my $type      = undef;
476	my $query_arg = "smpquery nodeinfo ";
477	if ($_[0] =~ /x/) {
478		# assume arg is a guid if contains an x
479		$query_arg .= "-G " . $_[0];
480	} else {
481		# assume arg is a direct path
482		$query_arg .= "-D " . $_[0];
483	}
484
485	my $data = `$query_arg` ||
486		die "'$query_arg' failed\n";
487	my @lines = split("\n", $data);
488	foreach my $line (@lines) {
489		if ($line =~ /^NodeType:\.+(.*)/) { $type = $1; }
490	}
491	return $type;
492}
493
494# =========================================================================
495# is_switch(guid_or_direct_route)
496#
497sub is_switch
498{
499	my $node_type = &get_node_type($_[0]);
500	return ($node_type =~ /Switch/);
501}
502