1#!/usr/bin/perl 2# 3# Copyright (c) 2006 The Regents of the University of California. 4# Copyright (c) 2006-2008 Voltaire, Inc. All rights reserved. 5# 6# Produced at Lawrence Livermore National Laboratory. 7# Written by Ira Weiny <weiny2@llnl.gov>. 8# Erez Strauss from Voltaire for help in the get_link_ends code. 9# 10# This software is available to you under a choice of one of two 11# licenses. You may choose to be licensed under the terms of the GNU 12# General Public License (GPL) Version 2, available from the file 13# COPYING in the main directory of this source tree, or the 14# OpenIB.org BSD license below: 15# 16# Redistribution and use in source and binary forms, with or 17# without modification, are permitted provided that the following 18# conditions are met: 19# 20# - Redistributions of source code must retain the above 21# copyright notice, this list of conditions and the following 22# disclaimer. 23# 24# - Redistributions in binary form must reproduce the above 25# copyright notice, this list of conditions and the following 26# disclaimer in the documentation and/or other materials 27# provided with the distribution. 28# 29# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 30# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 31# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 32# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 33# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 34# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 35# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 36# SOFTWARE. 37# 38 39use strict; 40 41%IBswcountlimits::cur_counts = (); 42%IBswcountlimits::new_counts = (); 43@IBswcountlimits::suppress_errors = (); 44$IBswcountlimits::link_ends = undef; 45$IBswcountlimits::pause_time = 10; 46$IBswcountlimits::cache_dir = "/var/cache/infiniband-diags"; 47 48# all the PerfMgt counters 49@IBswcountlimits::counters = ( 50 "SymbolErrors", "LinkRecovers", 51 "LinkDowned", "RcvErrors", 52 "RcvRemotePhysErrors", "RcvSwRelayErrors", 53 "XmtDiscards", "XmtConstraintErrors", 54 "RcvConstraintErrors", "LinkIntegrityErrors", 55 "ExcBufOverrunErrors", "VL15Dropped", 56 "XmtData", "RcvData", 57 "XmtPkts", "RcvPkts" 58); 59 60# non-critical counters 61%IBswcountlimits::error_counters = ( 62 "SymbolErrors", 63"No action is required except if counter is increasing along with LinkRecovers", 64 "LinkRecovers", 65"If this is increasing along with SymbolErrors this may indicate a bad link, run ibswportwatch.pl on this port", 66 "LinkDowned", 67 "Number of times the port has gone down (Usually for valid reasons)", 68 "RcvErrors", 69"This is a bad link, if the link is internal to a 288 try setting SDR, otherwise check the cable", 70 "RcvRemotePhysErrors", 71 "This indicates a problem ELSEWHERE in the fabric.", 72 "XmtDiscards", 73"This is a symptom of congestion and may require tweaking either HOQ or switch lifetime values", 74 "XmtConstraintErrors", 75 "This is a result of bad partitioning, check partition configuration.", 76 "RcvConstraintErrors", 77 "This is a result of bad partitioning, check partition configuration.", 78 "LinkIntegrityErrors", 79 "May indicate a bad link, run ibswportwatch.pl on this port", 80 "ExcBufOverrunErrors", 81"This is a flow control state machine error and can be caused by packets with physical errors", 82 "VL15Dropped", 83 "check with ibswportwatch.pl, if increasing in SMALL increments, OK", 84 "RcvSwRelayErrors", 85 "This counter can increase due to a valid network event" 86); 87 88sub check_counters 89{ 90 my $print_action = $_[0]; 91 my $actions = undef; 92 93 COUNTER: foreach my $cnt (keys %IBswcountlimits::error_counters) { 94 if ($IBswcountlimits::cur_counts{$cnt} > 0) { 95 foreach my $sup_cnt (@IBswcountlimits::suppress_errors) { 96 if ("$cnt" eq $sup_cnt) { next COUNTER; } 97 } 98 print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]"; 99 if ("$print_action" eq "yes") { 100 $actions = join " ", 101 ( 102 $actions, 103 " $cnt: $IBswcountlimits::error_counters{$cnt}\n" 104 ); 105 } 106 } 107 } 108 109 if ($actions) { 110 print "\n Actions:\n$actions"; 111 } 112} 113 114# Data counters 115%IBswcountlimits::data_counters = ( 116 "XmtData", 117"Total number of data octets, divided by 4, transmitted on all VLs from the port", 118 "RcvData", 119"Total number of data octets, divided by 4, received on all VLs to the port", 120 "XmtPkts", 121"Total number of packets, excluding link packets, transmitted on all VLs from the port", 122 "RcvPkts", 123"Total number of packets, excluding link packets, received on all VLs to the port" 124); 125 126sub check_data_counters 127{ 128 my $print_action = $_[0]; 129 my $actions = undef; 130 131 COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) { 132 print " [$cnt == $IBswcountlimits::cur_counts{$cnt}]"; 133 if ("$print_action" eq "yes") { 134 $actions = join " ", 135 ( 136 $actions, 137 " $cnt: $IBswcountlimits::data_counters{$cnt}\n" 138 ); 139 } 140 } 141 if ($actions) { 142 print "\n Descriptions:\n$actions"; 143 } 144} 145 146sub print_data_rates 147{ 148 COUNTER: foreach my $cnt (keys %IBswcountlimits::data_counters) { 149 my $cnt_per_second = calculate_rate( 150 $IBswcountlimits::cur_counts{$cnt}, 151 $IBswcountlimits::new_counts{$cnt} 152 ); 153 print " $cnt_per_second $cnt/second\n"; 154 } 155} 156 157# ========================================================================= 158# Rate dependent counters 159# calculate the count/sec 160# calculate_rate old_count new_count 161sub calculate_rate 162{ 163 my $rate = 0; 164 my $old_val = $_[0]; 165 my $new_val = $_[1]; 166 my $rate = ($new_val - $old_val) / $IBswcountlimits::pause_time; 167 return ($rate); 168} 169%IBswcountlimits::rate_dep_thresholds = ( 170 "SymbolErrors", 10, "LinkRecovers", 10, 171 "RcvErrors", 10, "LinkIntegrityErrors", 10, 172 "XmtDiscards", 10 173); 174 175sub check_counter_rates 176{ 177 foreach my $rate_count (keys %IBswcountlimits::rate_dep_thresholds) { 178 my $rate = calculate_rate( 179 $IBswcountlimits::cur_counts{$rate_count}, 180 $IBswcountlimits::new_counts{$rate_count} 181 ); 182 if ($rate > $IBswcountlimits::rate_dep_thresholds{$rate_count}) { 183 print "Detected excessive rate for $rate_count ($rate cnts/sec)\n"; 184 } elsif ($rate > 0) { 185 print "Detected rate for $rate_count ($rate cnts/sec)\n"; 186 } 187 } 188} 189 190# ========================================================================= 191# 192sub clear_counters 193{ 194 # clear the counters 195 foreach my $count (@IBswcountlimits::counters) { 196 $IBswcountlimits::cur_counts{$count} = 0; 197 } 198} 199 200# ========================================================================= 201# 202sub any_counts 203{ 204 my $total = 0; 205 my $count = 0; 206 foreach $count (keys %IBswcountlimits::critical) { 207 $total = $total + $IBswcountlimits::cur_counts{$count}; 208 } 209 COUNTER: foreach $count (keys %IBswcountlimits::error_counters) { 210 foreach my $sup_cnt (@IBswcountlimits::suppress_errors) { 211 if ("$count" eq $sup_cnt) { next COUNTER; } 212 } 213 $total = $total + $IBswcountlimits::cur_counts{$count}; 214 } 215 return ($total); 216} 217 218# ========================================================================= 219# 220sub ensure_cache_dir 221{ 222 if (!(-d "$IBswcountlimits::cache_dir") && 223 !mkdir($IBswcountlimits::cache_dir, 0700)) { 224 die "cannot create $IBswcountlimits::cache_dir: $!\n"; 225 } 226} 227 228# ========================================================================= 229# get_cache_file(ca_name, ca_port) 230# 231sub get_cache_file 232{ 233 my $ca_name = $_[0]; 234 my $ca_port = $_[1]; 235 ensure_cache_dir; 236 return ( 237 "$IBswcountlimits::cache_dir/ibnetdiscover-$ca_name-$ca_port.topology"); 238} 239 240# ========================================================================= 241# get_ca_name_port_param_string(ca_name, ca_port) 242# 243sub get_ca_name_port_param_string 244{ 245 my $ca_name = $_[0]; 246 my $ca_port = $_[1]; 247 248 if ("$ca_name" ne "") { $ca_name = "-C $ca_name"; } 249 if ("$ca_port" ne "") { $ca_port = "-P $ca_port"; } 250 251 return ("$ca_name $ca_port"); 252} 253 254# ========================================================================= 255# generate_ibnetdiscover_topology(ca_name, ca_port) 256# 257sub generate_ibnetdiscover_topology 258{ 259 my $ca_name = $_[0]; 260 my $ca_port = $_[1]; 261 my $cache_file = get_cache_file($ca_name, $ca_port); 262 my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port); 263 264 if (`ibnetdiscover -g $extra_params > $cache_file`) { 265 die "Execution of ibnetdiscover failed: $!\n"; 266 } 267} 268 269# ========================================================================= 270# get_link_ends(regenerate_map, ca_name, ca_port) 271# 272sub get_link_ends 273{ 274 my $regenerate_map = $_[0]; 275 my $ca_name = $_[1]; 276 my $ca_port = $_[2]; 277 278 my $cache_file = get_cache_file($ca_name, $ca_port); 279 280 if ($regenerate_map || !(-f "$cache_file")) { 281 generate_ibnetdiscover_topology($ca_name, $ca_port); 282 } 283 open IBNET_TOPO, "<$cache_file" 284 or die "Failed to open ibnet topology: $!\n"; 285 my $in_switch = "no"; 286 my $desc = ""; 287 my $guid = ""; 288 my $loc_sw_lid = ""; 289 290 my $loc_port = ""; 291 my $line = ""; 292 293 while ($line = <IBNET_TOPO>) { 294 if ($line =~ /^Switch.*\"S-(.*)\"\s+#.*\"(.*)\".* lid (\d+).*/) { 295 $guid = $1; 296 $desc = $2; 297 $loc_sw_lid = $3; 298 $in_switch = "yes"; 299 } 300 if ($in_switch eq "yes") { 301 my $rec = undef; 302 if ($line =~ 303/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ 304 ) 305 { 306 $loc_port = $1; 307 my $rem_guid = $2; 308 my $rem_port = $3; 309 my $rem_port_guid = $4; 310 my $rem_desc = $5; 311 my $rem_lid = $6; 312 $rec = { 313 loc_guid => "0x$guid", 314 loc_port => $loc_port, 315 loc_ext_port => "", 316 loc_desc => $desc, 317 loc_sw_lid => $loc_sw_lid, 318 rem_guid => "0x$rem_guid", 319 rem_lid => $rem_lid, 320 rem_port => $rem_port, 321 rem_ext_port => "", 322 rem_desc => $rem_desc, 323 rem_port_guid => $rem_port_guid 324 }; 325 } 326 if ($line =~ 327/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ 328 ) 329 { 330 $loc_port = $1; 331 my $loc_ext_port = $2; 332 my $rem_guid = $3; 333 my $rem_port = $4; 334 my $rem_port_guid = $5; 335 my $rem_desc = $6; 336 my $rem_lid = $7; 337 $rec = { 338 loc_guid => "0x$guid", 339 loc_port => $loc_port, 340 loc_ext_port => $loc_ext_port, 341 loc_desc => $desc, 342 loc_sw_lid => $loc_sw_lid, 343 rem_guid => "0x$rem_guid", 344 rem_lid => $rem_lid, 345 rem_port => $rem_port, 346 rem_ext_port => "", 347 rem_desc => $rem_desc, 348 rem_port_guid => $rem_port_guid 349 }; 350 } 351 if ($line =~ 352/^\[(\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ 353 ) 354 { 355 $loc_port = $1; 356 my $rem_guid = $2; 357 my $rem_port = $3; 358 my $rem_ext_port = $4; 359 my $rem_port_guid = $5; 360 my $rem_desc = $6; 361 my $rem_lid = $7; 362 $rec = { 363 loc_guid => "0x$guid", 364 loc_port => $loc_port, 365 loc_ext_port => "", 366 loc_desc => $desc, 367 loc_sw_lid => $loc_sw_lid, 368 rem_guid => "0x$rem_guid", 369 rem_lid => $rem_lid, 370 rem_port => $rem_port, 371 rem_ext_port => $rem_ext_port, 372 rem_desc => $rem_desc, 373 rem_port_guid => $rem_port_guid 374 }; 375 } 376 if ($line =~ 377/^\[(\d+)\]\[ext (\d+)\]\s+\"[HSR]-(.+)\"\[(\d+)\]\[ext (\d+)\](\(.+\))?\s+#.*\"(.*)\"\.* lid (\d+).*/ 378 ) 379 { 380 $loc_port = $1; 381 my $loc_ext_port = $2; 382 my $rem_guid = $3; 383 my $rem_port = $4; 384 my $rem_ext_port = $5; 385 my $rem_port_guid = $6; 386 my $rem_desc = $7; 387 my $rem_lid = $8; 388 $rec = { 389 loc_guid => "0x$guid", 390 loc_port => $loc_port, 391 loc_ext_port => $loc_ext_port, 392 loc_desc => $desc, 393 loc_sw_lid => $loc_sw_lid, 394 rem_guid => "0x$rem_guid", 395 rem_lid => $rem_lid, 396 rem_port => $rem_port, 397 rem_ext_port => $rem_ext_port, 398 rem_desc => $rem_desc, 399 rem_port_guid => $rem_port_guid 400 }; 401 } 402 if ($rec) { 403 $rec->{rem_port_guid} =~ s/\((.*)\)/$1/; 404 $IBswcountlimits::link_ends{"0x$guid"}{$loc_port} = $rec; 405 } 406 } 407 408 if ($line =~ /^Ca.*/ || $line =~ /^Rt.*/) { $in_switch = "no"; } 409 } 410 close IBNET_TOPO; 411} 412 413# ========================================================================= 414# get_num_ports(switch_guid, ca_name, ca_port) 415# 416sub get_num_ports 417{ 418 my $guid = $_[0]; 419 my $ca_name = $_[1]; 420 my $ca_port = $_[2]; 421 my $num_ports = 0; 422 my $extra_params = get_ca_name_port_param_string($ca_name, $ca_port); 423 424 my $data = `smpquery $extra_params -G nodeinfo $guid` || 425 die "'smpquery $extra_params -G nodeinfo $guid' faild\n"; 426 my @lines = split("\n", $data); 427 my $pkt_lifetime = ""; 428 foreach my $line (@lines) { 429 if ($line =~ /^NumPorts:\.+(.*)/) { $num_ports = $1; } 430 } 431 return ($num_ports); 432} 433 434# ========================================================================= 435# format_guid(guid) 436# The diags store the guids as strings. This converts the guid supplied 437# to the correct string format. 438# eg: 0x0008f10400411f56 == 0x8f10400411f56 439# 440sub format_guid 441{ 442 my $guid = $_[0]; 443 my $guid_str = ""; 444 445 $guid =~ tr/[A-F]/[a-f]/; 446 if ($guid =~ /0x(.*)/) { 447 $guid_str = sprintf("0x%016s", $1); 448 } else { 449 $guid_str = sprintf("0x%016s", $guid); 450 } 451 return ($guid_str); 452} 453 454# ========================================================================= 455# convert_dr_to_guid(direct_route) 456# 457sub convert_dr_to_guid 458{ 459 my $guid = undef; 460 461 my $data = `smpquery nodeinfo -D $_[0]` || 462 die "'mpquery nodeinfo -D $_[0]' failed\n"; 463 my @lines = split("\n", $data); 464 foreach my $line (@lines) { 465 if ($line =~ /^PortGuid:\.+(.*)/) { $guid = $1; } 466 } 467 return format_guid($guid); 468} 469 470# ========================================================================= 471# get_node_type(guid_or_direct_route) 472# 473sub get_node_type 474{ 475 my $type = undef; 476 my $query_arg = "smpquery nodeinfo "; 477 if ($_[0] =~ /x/) { 478 # assume arg is a guid if contains an x 479 $query_arg .= "-G " . $_[0]; 480 } else { 481 # assume arg is a direct path 482 $query_arg .= "-D " . $_[0]; 483 } 484 485 my $data = `$query_arg` || 486 die "'$query_arg' failed\n"; 487 my @lines = split("\n", $data); 488 foreach my $line (@lines) { 489 if ($line =~ /^NodeType:\.+(.*)/) { $type = $1; } 490 } 491 return $type; 492} 493 494# ========================================================================= 495# is_switch(guid_or_direct_route) 496# 497sub is_switch 498{ 499 my $node_type = &get_node_type($_[0]); 500 return ($node_type =~ /Switch/); 501} 502