/* mlx5_health.c revision 347861 */
1/*- 2 * Copyright (c) 2013-2017, Mellanox Technologies, Ltd. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 
24 * 25 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c 347861 2019-05-16 18:12:14Z hselasky $ 26 */ 27 28#include <linux/kernel.h> 29#include <linux/module.h> 30#include <linux/random.h> 31#include <linux/vmalloc.h> 32#include <linux/hardirq.h> 33#include <linux/delay.h> 34#include <dev/mlx5/driver.h> 35#include <dev/mlx5/mlx5_ifc.h> 36#include "mlx5_core.h" 37 38#define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 39#define MAX_MISSES 3 40 41enum { 42 MLX5_DROP_NEW_HEALTH_WORK, 43 MLX5_DROP_NEW_RECOVERY_WORK, 44}; 45 46enum { 47 MLX5_SENSOR_NO_ERR = 0, 48 MLX5_SENSOR_PCI_COMM_ERR = 1, 49 MLX5_SENSOR_PCI_ERR = 2, 50 MLX5_SENSOR_NIC_DISABLED = 3, 51 MLX5_SENSOR_NIC_SW_RESET = 4, 52 MLX5_SENSOR_FW_SYND_RFR = 5, 53}; 54 55static int mlx5_fw_reset_enable = 1; 56SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLAG_RWTUN, 57 &mlx5_fw_reset_enable, 0, 58 "Enable firmware reset"); 59 60static unsigned int sw_reset_to = 1200; 61SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN, 62 &sw_reset_to, 0, 63 "Minimum timeout in seconds between two firmware resets"); 64 65 66static int lock_sem_sw_reset(struct mlx5_core_dev *dev) 67{ 68 int ret; 69 70 /* Lock GW access */ 71 ret = -mlx5_vsc_lock(dev); 72 if (ret) { 73 mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 74 return ret; 75 } 76 77 ret = -mlx5_vsc_lock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 78 if (ret) { 79 if (ret == -EBUSY) 80 mlx5_core_dbg(dev, "SW reset FW semaphore already locked, another function will handle the reset\n"); 81 else 82 mlx5_core_warn(dev, "SW reset semaphore lock return %d\n", ret); 83 } 84 85 /* Unlock GW access */ 86 mlx5_vsc_unlock(dev); 87 88 return ret; 89} 90 91static int unlock_sem_sw_reset(struct mlx5_core_dev *dev) 92{ 93 int ret; 94 95 /* Lock GW access */ 96 ret = -mlx5_vsc_lock(dev); 97 if (ret) { 98 mlx5_core_warn(dev, "Timed out locking gateway %d\n", ret); 99 return ret; 100 } 101 102 ret = -mlx5_vsc_unlock_addr_space(dev, MLX5_SEMAPHORE_SW_RESET); 103 
104 /* Unlock GW access */ 105 mlx5_vsc_unlock(dev); 106 107 return ret; 108} 109 110u8 mlx5_get_nic_state(struct mlx5_core_dev *dev) 111{ 112 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; 113} 114 115void mlx5_set_nic_state(struct mlx5_core_dev *dev, u8 state) 116{ 117 u32 cur_cmdq_addr_l_sz; 118 119 cur_cmdq_addr_l_sz = ioread32be(&dev->iseg->cmdq_addr_l_sz); 120 iowrite32be((cur_cmdq_addr_l_sz & 0xFFFFF000) | 121 state << MLX5_NIC_IFC_OFFSET, 122 &dev->iseg->cmdq_addr_l_sz); 123} 124 125static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) 126{ 127 struct mlx5_core_health *health = &dev->priv.health; 128 struct mlx5_health_buffer __iomem *h = health->health; 129 u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; 130 u8 synd = ioread8(&h->synd); 131 132 if (rfr && synd) 133 mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); 134 return rfr && synd; 135} 136 137static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev) 138{ 139 unsigned long flags; 140 u64 vector; 141 142 /* wait for pending handlers to complete */ 143 synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 144 spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 145 vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 146 if (!vector) 147 goto no_trig; 148 149 vector |= MLX5_TRIGGERED_CMD_COMP; 150 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 151 152 mlx5_core_dbg(dev, "vector 0x%jx\n", (uintmax_t)vector); 153 mlx5_cmd_comp_handler(dev, vector, MLX5_CMD_MODE_EVENTS); 154 return; 155 156no_trig: 157 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 158} 159 160static bool sensor_pci_no_comm(struct mlx5_core_dev *dev) 161{ 162 struct mlx5_core_health *health = &dev->priv.health; 163 struct mlx5_health_buffer __iomem *h = health->health; 164 bool err = ioread32be(&h->fw_ver) == 0xffffffff; 165 166 return err; 167} 168 169static bool sensor_nic_disabled(struct mlx5_core_dev *dev) 170{ 171 return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_DISABLED; 
172} 173 174static bool sensor_nic_sw_reset(struct mlx5_core_dev *dev) 175{ 176 return mlx5_get_nic_state(dev) == MLX5_NIC_IFC_SW_RESET; 177} 178 179static u32 check_fatal_sensors(struct mlx5_core_dev *dev) 180{ 181 if (sensor_pci_no_comm(dev)) 182 return MLX5_SENSOR_PCI_COMM_ERR; 183 if (pci_channel_offline(dev->pdev)) 184 return MLX5_SENSOR_PCI_ERR; 185 if (sensor_nic_disabled(dev)) 186 return MLX5_SENSOR_NIC_DISABLED; 187 if (sensor_nic_sw_reset(dev)) 188 return MLX5_SENSOR_NIC_SW_RESET; 189 if (sensor_fw_synd_rfr(dev)) 190 return MLX5_SENSOR_FW_SYND_RFR; 191 192 return MLX5_SENSOR_NO_ERR; 193} 194 195static void reset_fw_if_needed(struct mlx5_core_dev *dev) 196{ 197 bool supported; 198 u32 cmdq_addr, fatal_error; 199 200 if (!mlx5_fw_reset_enable) 201 return; 202 supported = (ioread32be(&dev->iseg->initializing) >> 203 MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; 204 if (!supported) 205 return; 206 207 /* The reset only needs to be issued by one PF. The health buffer is 208 * shared between all functions, and will be cleared during a reset. 209 * Check again to avoid a redundant 2nd reset. If the fatal erros was 210 * PCI related a reset won't help. 211 */ 212 fatal_error = check_fatal_sensors(dev); 213 if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || 214 fatal_error == MLX5_SENSOR_NIC_DISABLED || 215 fatal_error == MLX5_SENSOR_NIC_SW_RESET) { 216 mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.\n"); 217 return; 218 } 219 220 mlx5_core_warn(dev, "Issuing FW Reset\n"); 221 /* Write the NIC interface field to initiate the reset, the command 222 * interface address also resides here, don't overwrite it. 
223 */ 224 cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz); 225 iowrite32be((cmdq_addr & 0xFFFFF000) | 226 MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET, 227 &dev->iseg->cmdq_addr_l_sz); 228} 229 230static bool 231mlx5_health_allow_reset(struct mlx5_core_dev *dev) 232{ 233 struct mlx5_core_health *health = &dev->priv.health; 234 unsigned int delta; 235 bool ret; 236 237 if (health->last_reset_req != 0) { 238 delta = ticks - health->last_reset_req; 239 delta /= hz; 240 ret = delta >= sw_reset_to; 241 } else { 242 ret = true; 243 } 244 245 /* 246 * In principle, ticks may be 0. Setting it to off by one (-1) 247 * to prevent certain reset in next request. 248 */ 249 health->last_reset_req = ticks ? : -1; 250 if (!ret) 251 mlx5_core_warn(dev, "Firmware reset elided due to " 252 "auto-reset frequency threshold.\n"); 253 return (ret); 254} 255 256#define MLX5_CRDUMP_WAIT_MS 60000 257#define MLX5_FW_RESET_WAIT_MS 1000 258#define MLX5_NIC_STATE_POLL_MS 5 259void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) 260{ 261 int end, delay_ms = MLX5_CRDUMP_WAIT_MS; 262 u32 fatal_error; 263 int lock = -EBUSY; 264 265 fatal_error = check_fatal_sensors(dev); 266 267 if (fatal_error || force) { 268 if (xchg(&dev->state, MLX5_DEVICE_STATE_INTERNAL_ERROR) == 269 MLX5_DEVICE_STATE_INTERNAL_ERROR) 270 return; 271 if (!force) 272 mlx5_core_err(dev, "internal state error detected\n"); 273 mlx5_trigger_cmd_completions(dev); 274 } 275 276 mutex_lock(&dev->intf_state_mutex); 277 278 if (force) 279 goto err_state_done; 280 281 if (fatal_error == MLX5_SENSOR_FW_SYND_RFR && 282 mlx5_health_allow_reset(dev)) { 283 /* Get cr-dump and reset FW semaphore */ 284 if (mlx5_core_is_pf(dev)) 285 lock = lock_sem_sw_reset(dev); 286 287 /* Execute cr-dump and SW reset */ 288 if (lock != -EBUSY) { 289 mlx5_fwdump(dev); 290 reset_fw_if_needed(dev); 291 delay_ms = MLX5_FW_RESET_WAIT_MS; 292 } 293 } 294 295 /* Recover from SW reset */ 296 end = jiffies + msecs_to_jiffies(delay_ms); 297 do { 298 
if (sensor_nic_disabled(dev)) 299 break; 300 301 msleep(MLX5_NIC_STATE_POLL_MS); 302 } while (!time_after(jiffies, end)); 303 304 if (!sensor_nic_disabled(dev)) { 305 dev_err(&dev->pdev->dev, "NIC IFC still %d after %ums.\n", 306 mlx5_get_nic_state(dev), delay_ms); 307 } 308 309 /* Release FW semaphore if you are the lock owner */ 310 if (!lock) 311 unlock_sem_sw_reset(dev); 312 313 mlx5_core_err(dev, "system error event triggered\n"); 314 315err_state_done: 316 mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 1); 317 mutex_unlock(&dev->intf_state_mutex); 318} 319 320static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 321{ 322 u8 nic_mode = mlx5_get_nic_state(dev); 323 324 if (nic_mode == MLX5_NIC_IFC_SW_RESET) { 325 /* The IFC mode field is 3 bits, so it will read 0x7 in two cases: 326 * 1. PCI has been disabled (ie. PCI-AER, PF driver unloaded 327 * and this is a VF), this is not recoverable by SW reset. 328 * Logging of this is handled elsewhere. 329 * 2. FW reset has been issued by another function, driver can 330 * be reloaded to recover after the mode switches to 331 * MLX5_NIC_IFC_DISABLED. 
332 */ 333 if (dev->priv.health.fatal_error != MLX5_SENSOR_PCI_COMM_ERR) 334 mlx5_core_warn(dev, "NIC SW reset is already progress\n"); 335 else 336 mlx5_core_warn(dev, "Communication with FW over the PCI link is down\n"); 337 } else { 338 mlx5_core_warn(dev, "NIC mode %d\n", nic_mode); 339 } 340 341 mlx5_disable_device(dev); 342} 343 344#define MLX5_FW_RESET_WAIT_MS 1000 345#define MLX5_NIC_STATE_POLL_MS 5 346static void health_recover(struct work_struct *work) 347{ 348 unsigned long end = jiffies + msecs_to_jiffies(MLX5_FW_RESET_WAIT_MS); 349 struct mlx5_core_health *health; 350 struct delayed_work *dwork; 351 struct mlx5_core_dev *dev; 352 struct mlx5_priv *priv; 353 bool recover = true; 354 u8 nic_mode; 355 356 dwork = container_of(work, struct delayed_work, work); 357 health = container_of(dwork, struct mlx5_core_health, recover_work); 358 priv = container_of(health, struct mlx5_priv, health); 359 dev = container_of(priv, struct mlx5_core_dev, priv); 360 361 mtx_lock(&Giant); /* XXX newbus needs this */ 362 363 if (sensor_pci_no_comm(dev)) { 364 dev_err(&dev->pdev->dev, "health recovery flow aborted, PCI reads still not working\n"); 365 recover = false; 366 } 367 368 nic_mode = mlx5_get_nic_state(dev); 369 while (nic_mode != MLX5_NIC_IFC_DISABLED && 370 !time_after(jiffies, end)) { 371 msleep(MLX5_NIC_STATE_POLL_MS); 372 nic_mode = mlx5_get_nic_state(dev); 373 } 374 375 if (nic_mode != MLX5_NIC_IFC_DISABLED) { 376 dev_err(&dev->pdev->dev, "health recovery flow aborted, unexpected NIC IFC mode %d.\n", 377 nic_mode); 378 recover = false; 379 } 380 381 if (recover) { 382 dev_err(&dev->pdev->dev, "starting health recovery flow\n"); 383 mlx5_recover_device(dev); 384 } 385 386 mtx_unlock(&Giant); 387} 388 389/* How much time to wait until health resetting the driver (in msecs) */ 390#define MLX5_RECOVERY_DELAY_MSECS 60000 391#define MLX5_RECOVERY_NO_DELAY 0 392static unsigned long get_recovery_delay(struct mlx5_core_dev *dev) 393{ 394 return 
dev->priv.health.fatal_error == MLX5_SENSOR_PCI_ERR || 395 dev->priv.health.fatal_error == MLX5_SENSOR_PCI_COMM_ERR ? 396 MLX5_RECOVERY_DELAY_MSECS : MLX5_RECOVERY_NO_DELAY; 397} 398 399static void health_care(struct work_struct *work) 400{ 401 struct mlx5_core_health *health; 402 unsigned long recover_delay; 403 struct mlx5_core_dev *dev; 404 struct mlx5_priv *priv; 405 unsigned long flags; 406 407 health = container_of(work, struct mlx5_core_health, work); 408 priv = container_of(health, struct mlx5_priv, health); 409 dev = container_of(priv, struct mlx5_core_dev, priv); 410 411 mlx5_core_warn(dev, "handling bad device here\n"); 412 mlx5_handle_bad_state(dev); 413 recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); 414 415 spin_lock_irqsave(&health->wq_lock, flags); 416 if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) { 417 mlx5_core_warn(dev, "Scheduling recovery work with %lums delay\n", 418 recover_delay); 419 schedule_delayed_work(&health->recover_work, recover_delay); 420 } else { 421 dev_err(&dev->pdev->dev, 422 "new health works are not permitted at this stage\n"); 423 } 424 spin_unlock_irqrestore(&health->wq_lock, flags); 425} 426 427static int get_next_poll_jiffies(void) 428{ 429 unsigned long next; 430 431 get_random_bytes(&next, sizeof(next)); 432 next %= HZ; 433 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 434 435 return next; 436} 437 438void mlx5_trigger_health_work(struct mlx5_core_dev *dev) 439{ 440 struct mlx5_core_health *health = &dev->priv.health; 441 unsigned long flags; 442 443 spin_lock_irqsave(&health->wq_lock, flags); 444 if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) 445 queue_work(health->wq, &health->work); 446 else 447 dev_err(&dev->pdev->dev, 448 "new health works are not permitted at this stage\n"); 449 spin_unlock_irqrestore(&health->wq_lock, flags); 450} 451 452static const char *hsynd_str(u8 synd) 453{ 454 switch (synd) { 455 case MLX5_HEALTH_SYNDR_FW_ERR: 456 return "firmware internal error"; 457 
case MLX5_HEALTH_SYNDR_IRISC_ERR: 458 return "irisc not responding"; 459 case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 460 return "unrecoverable hardware error"; 461 case MLX5_HEALTH_SYNDR_CRC_ERR: 462 return "firmware CRC error"; 463 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 464 return "ICM fetch PCI error"; 465 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 466 return "HW fatal error\n"; 467 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 468 return "async EQ buffer overrun"; 469 case MLX5_HEALTH_SYNDR_EQ_ERR: 470 return "EQ error"; 471 case MLX5_HEALTH_SYNDR_EQ_INV: 472 return "Invalid EQ referenced"; 473 case MLX5_HEALTH_SYNDR_FFSER_ERR: 474 return "FFSER error"; 475 case MLX5_HEALTH_SYNDR_HIGH_TEMP: 476 return "High temprature"; 477 default: 478 return "unrecognized error"; 479 } 480} 481 482static void print_health_info(struct mlx5_core_dev *dev) 483{ 484 struct mlx5_core_health *health = &dev->priv.health; 485 struct mlx5_health_buffer __iomem *h = health->health; 486 char fw_str[18]; 487 u32 fw; 488 int i; 489 490 /* If the syndrom is 0, the device is OK and no need to print buffer */ 491 if (!ioread8(&h->synd)) 492 return; 493 494 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) 495 printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); 496 497 printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); 498 printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); 499 snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 500 printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str); 501 printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id)); 502 printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index)); 503 printf("mlx5_core: INFO: ""synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); 504 printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 505 fw = ioread32be(&h->fw_ver); 506 printf("mlx5_core: 
INFO: ""raw fw_ver 0x%08x\n", fw); 507} 508 509static void poll_health(unsigned long data) 510{ 511 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 512 struct mlx5_core_health *health = &dev->priv.health; 513 u32 fatal_error; 514 u32 count; 515 516 if (dev->state != MLX5_DEVICE_STATE_UP) 517 return; 518 519 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 520 goto out; 521 522 count = ioread32be(health->health_counter); 523 if (count == health->prev) 524 ++health->miss_counter; 525 else 526 health->miss_counter = 0; 527 528 health->prev = count; 529 if (health->miss_counter == MAX_MISSES) { 530 mlx5_core_err(dev, "device's health compromised - reached miss count\n"); 531 print_health_info(dev); 532 } 533 534 fatal_error = check_fatal_sensors(dev); 535 536 if (fatal_error && !health->fatal_error) { 537 mlx5_core_err(dev, "Fatal error %u detected\n", fatal_error); 538 dev->priv.health.fatal_error = fatal_error; 539 print_health_info(dev); 540 mlx5_trigger_health_work(dev); 541 } 542 543out: 544 mod_timer(&health->timer, get_next_poll_jiffies()); 545} 546 547void mlx5_start_health_poll(struct mlx5_core_dev *dev) 548{ 549 struct mlx5_core_health *health = &dev->priv.health; 550 551 init_timer(&health->timer); 552 health->fatal_error = MLX5_SENSOR_NO_ERR; 553 clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 554 clear_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 555 health->health = &dev->iseg->health; 556 health->health_counter = &dev->iseg->health_counter; 557 558 setup_timer(&health->timer, poll_health, (unsigned long)dev); 559 mod_timer(&health->timer, 560 round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 561} 562 563void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health) 564{ 565 struct mlx5_core_health *health = &dev->priv.health; 566 unsigned long flags; 567 568 if (disable_health) { 569 spin_lock_irqsave(&health->wq_lock, flags); 570 set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 571 
set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 572 spin_unlock_irqrestore(&health->wq_lock, flags); 573 } 574 575 del_timer_sync(&health->timer); 576} 577 578void mlx5_drain_health_wq(struct mlx5_core_dev *dev) 579{ 580 struct mlx5_core_health *health = &dev->priv.health; 581 unsigned long flags; 582 583 spin_lock_irqsave(&health->wq_lock, flags); 584 set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); 585 set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 586 spin_unlock_irqrestore(&health->wq_lock, flags); 587 cancel_delayed_work_sync(&health->recover_work); 588 cancel_work_sync(&health->work); 589} 590 591void mlx5_drain_health_recovery(struct mlx5_core_dev *dev) 592{ 593 struct mlx5_core_health *health = &dev->priv.health; 594 unsigned long flags; 595 596 spin_lock_irqsave(&health->wq_lock, flags); 597 set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags); 598 spin_unlock_irqrestore(&health->wq_lock, flags); 599 cancel_delayed_work_sync(&dev->priv.health.recover_work); 600} 601 602void mlx5_health_cleanup(struct mlx5_core_dev *dev) 603{ 604 struct mlx5_core_health *health = &dev->priv.health; 605 606 destroy_workqueue(health->wq); 607 destroy_workqueue(health->wq_watchdog); 608} 609 610int mlx5_health_init(struct mlx5_core_dev *dev) 611{ 612 struct mlx5_core_health *health; 613 char name[64]; 614 615 health = &dev->priv.health; 616 617 snprintf(name, sizeof(name), "%s-rec", dev_name(&dev->pdev->dev)); 618 health->wq = create_singlethread_workqueue(name); 619 if (!health->wq) 620 return -ENOMEM; 621 622 snprintf(name, sizeof(name), "%s-wdg", dev_name(&dev->pdev->dev)); 623 health->wq_watchdog = create_singlethread_workqueue(name); 624 if (!health->wq_watchdog) { 625 destroy_workqueue(health->wq); 626 return -ENOMEM; 627 } 628 629 spin_lock_init(&health->wq_lock); 630 INIT_WORK(&health->work, health_care); 631 INIT_DELAYED_WORK(&health->recover_work, health_recover); 632 633 return 0; 634} 635