mlx5_health.c (322148) | mlx5_health.c (331580) |
---|---|
1/*- 2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. --- 8 unchanged lines hidden (view full) --- 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * | 1/*- 2 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. --- 8 unchanged lines hidden (view full) --- 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * |
25 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c 322148 2017-08-07 12:44:18Z hselasky $ | 25 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c 331580 2018-03-26 20:33:31Z hselasky $ |
26 */ 27 28#include <linux/kernel.h> 29#include <linux/module.h> 30#include <linux/random.h> 31#include <linux/vmalloc.h> | 26 */ 27 28#include <linux/kernel.h> 29#include <linux/module.h> 30#include <linux/random.h> 31#include <linux/vmalloc.h> |
32#include <linux/hardirq.h> |
|
32#include <dev/mlx5/driver.h> 33#include <dev/mlx5/mlx5_ifc.h> 34#include "mlx5_core.h" 35 36#define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 37#define MAX_MISSES 3 38 | 33#include <dev/mlx5/driver.h> 34#include <dev/mlx5/mlx5_ifc.h> 35#include "mlx5_core.h" 36 37#define MLX5_HEALTH_POLL_INTERVAL (2 * HZ) 38#define MAX_MISSES 3 39 |
39static DEFINE_SPINLOCK(health_lock); 40static LIST_HEAD(health_list); 41static struct work_struct health_work; | 40enum { 41 MLX5_NIC_IFC_FULL = 0, 42 MLX5_NIC_IFC_DISABLED = 1, 43 MLX5_NIC_IFC_NO_DRAM_NIC = 2 44}; |
42 | 45 |
46static u8 get_nic_interface(struct mlx5_core_dev *dev) 47{ 48 return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; 49} 50 51static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev) 52{ 53 unsigned long flags; 54 u64 vector; 55 56 /* wait for pending handlers to complete */ 57 synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); 58 spin_lock_irqsave(&dev->cmd.alloc_lock, flags); 59 vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); 60 if (!vector) 61 goto no_trig; 62 63 vector |= MLX5_TRIGGERED_CMD_COMP; 64 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 65 66 mlx5_core_dbg(dev, "vector 0x%lx\n", vector); 67 mlx5_cmd_comp_handler(dev, vector); 68 return; 69 70no_trig: 71 spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags); 72} 73 74static int in_fatal(struct mlx5_core_dev *dev) 75{ 76 struct mlx5_core_health *health = &dev->priv.health; 77 struct mlx5_health_buffer __iomem *h = health->health; 78 79 if (get_nic_interface(dev) == MLX5_NIC_IFC_DISABLED) 80 return 1; 81 82 if (ioread32be(&h->fw_ver) == 0xffffffff) 83 return 1; 84 85 return 0; 86} 87 88void mlx5_enter_error_state(struct mlx5_core_dev *dev) 89{ 90 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) 91 return; 92 93 mlx5_core_err(dev, "start\n"); 94 if (pci_channel_offline(dev->pdev) || in_fatal(dev)) 95 dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; 96 97 mlx5_core_event(dev, MLX5_DEV_EVENT_SYS_ERROR, 0); 98 mlx5_core_err(dev, "end\n"); 99} 100 101static void mlx5_handle_bad_state(struct mlx5_core_dev *dev) 102{ 103 u8 nic_interface = get_nic_interface(dev); 104 105 switch (nic_interface) { 106 case MLX5_NIC_IFC_FULL: 107 mlx5_core_warn(dev, "Expected to see disabled NIC but it is full driver\n"); 108 break; 109 110 case MLX5_NIC_IFC_DISABLED: 111 mlx5_core_warn(dev, "starting teardown\n"); 112 break; 113 114 case MLX5_NIC_IFC_NO_DRAM_NIC: 115 mlx5_core_warn(dev, "Expected to see disabled NIC but it is no dram nic\n"); 116 break; 117 default: 118 mlx5_core_warn(dev, "Expected to see disabled NIC but it is has invalid value %d\n", 119 nic_interface); 120 } 121 122 mlx5_disable_device(dev); 123} 124 |
|
43static void health_care(struct work_struct *work) 44{ | 125static void health_care(struct work_struct *work) 126{ |
45 struct mlx5_core_health *health, *n; | 127 struct mlx5_core_health *health; |
46 struct mlx5_core_dev *dev; 47 struct mlx5_priv *priv; | 128 struct mlx5_core_dev *dev; 129 struct mlx5_priv *priv; |
48 LIST_HEAD(tlist); | |
49 | 130 |
50 spin_lock_irq(&health_lock); 51 list_splice_init(&health_list, &tlist); | 131 health = container_of(work, struct mlx5_core_health, work); 132 priv = container_of(health, struct mlx5_priv, health); 133 dev = container_of(priv, struct mlx5_core_dev, priv); 134 mlx5_core_warn(dev, "handling bad device here\n"); 135 mlx5_handle_bad_state(dev); 136} |
52 | 137 |
53 spin_unlock_irq(&health_lock); | 138static int get_next_poll_jiffies(void) 139{ 140 unsigned long next; |
54 | 141 |
55 list_for_each_entry_safe(health, n, &tlist, list) { 56 priv = container_of(health, struct mlx5_priv, health); 57 dev = container_of(priv, struct mlx5_core_dev, priv); 58 mlx5_core_warn(dev, "handling bad device here\n"); 59 /* nothing yet */ 60 spin_lock_irq(&health_lock); 61 list_del_init(&health->list); 62 spin_unlock_irq(&health_lock); 63 } | 142 get_random_bytes(&next, sizeof(next)); 143 next %= HZ; 144 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 145 146 return next; |
64} 65 66static const char *hsynd_str(u8 synd) 67{ 68 switch (synd) { 69 case MLX5_HEALTH_SYNDR_FW_ERR: 70 return "firmware internal error"; 71 case MLX5_HEALTH_SYNDR_IRISC_ERR: 72 return "irisc not responding"; | 147} 148 149static const char *hsynd_str(u8 synd) 150{ 151 switch (synd) { 152 case MLX5_HEALTH_SYNDR_FW_ERR: 153 return "firmware internal error"; 154 case MLX5_HEALTH_SYNDR_IRISC_ERR: 155 return "irisc not responding"; |
156 case MLX5_HEALTH_SYNDR_HW_UNRECOVERABLE_ERR: 157 return "unrecoverable hardware error"; |
|
73 case MLX5_HEALTH_SYNDR_CRC_ERR: 74 return "firmware CRC error"; 75 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 76 return "ICM fetch PCI error"; 77 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 78 return "HW fatal error\n"; 79 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 80 return "async EQ buffer overrun"; 81 case MLX5_HEALTH_SYNDR_EQ_ERR: 82 return "EQ error"; | 158 case MLX5_HEALTH_SYNDR_CRC_ERR: 159 return "firmware CRC error"; 160 case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR: 161 return "ICM fetch PCI error"; 162 case MLX5_HEALTH_SYNDR_HW_FTL_ERR: 163 return "HW fatal error\n"; 164 case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR: 165 return "async EQ buffer overrun"; 166 case MLX5_HEALTH_SYNDR_EQ_ERR: 167 return "EQ error"; |
168 case MLX5_HEALTH_SYNDR_EQ_INV: 169 return "Invalid EQ referenced"; |
|
83 case MLX5_HEALTH_SYNDR_FFSER_ERR: 84 return "FFSER error"; | 170 case MLX5_HEALTH_SYNDR_FFSER_ERR: 171 return "FFSER error"; |
172 case MLX5_HEALTH_SYNDR_HIGH_TEMP: 173 return "High temprature"; |
|
85 default: 86 return "unrecognized error"; 87 } 88} 89 | 174 default: 175 return "unrecognized error"; 176 } 177} 178 |
90static u16 read_be16(__be16 __iomem *p) 91{ 92 return swab16(readl((__force u16 __iomem *) p)); 93} 94 95static u32 read_be32(__be32 __iomem *p) 96{ 97 return swab32(readl((__force u32 __iomem *) p)); 98} 99 | |
100static void print_health_info(struct mlx5_core_dev *dev) 101{ 102 struct mlx5_core_health *health = &dev->priv.health; 103 struct mlx5_health_buffer __iomem *h = health->health; | 179static void print_health_info(struct mlx5_core_dev *dev) 180{ 181 struct mlx5_core_health *health = &dev->priv.health; 182 struct mlx5_health_buffer __iomem *h = health->health; |
183 char fw_str[18]; 184 u32 fw; |
|
104 int i; 105 | 185 int i; 186 |
187 /* If the syndrom is 0, the device is OK and no need to print buffer */ 188 if (!ioread8(&h->synd)) 189 return; 190 |
|
106 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) | 191 for (i = 0; i < ARRAY_SIZE(h->assert_var); i++) |
107 printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, read_be32(h->assert_var + i)); | 192 printf("mlx5_core: INFO: ""assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i)); |
108 | 193 |
109 printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", read_be32(&h->assert_exit_ptr)); 110 printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", read_be32(&h->assert_callra)); 111 printf("mlx5_core: INFO: ""fw_ver 0x%08x\n", read_be32(&h->fw_ver)); 112 printf("mlx5_core: INFO: ""hw_id 0x%08x\n", read_be32(&h->hw_id)); 113 printf("mlx5_core: INFO: ""irisc_index %d\n", readb(&h->irisc_index)); 114 printf("mlx5_core: INFO: ""synd 0x%x: %s\n", readb(&h->synd), hsynd_str(readb(&h->synd))); 115 printf("mlx5_core: INFO: ""ext_sync 0x%04x\n", read_be16(&h->ext_sync)); | 194 printf("mlx5_core: INFO: ""assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr)); 195 printf("mlx5_core: INFO: ""assert_callra 0x%08x\n", ioread32be(&h->assert_callra)); 196 snprintf(fw_str, sizeof(fw_str), "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev)); 197 printf("mlx5_core: INFO: ""fw_ver %s\n", fw_str); 198 printf("mlx5_core: INFO: ""hw_id 0x%08x\n", ioread32be(&h->hw_id)); 199 printf("mlx5_core: INFO: ""irisc_index %d\n", ioread8(&h->irisc_index)); 200 printf("mlx5_core: INFO: ""synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd))); 201 printf("mlx5_core: INFO: ""ext_synd 0x%04x\n", ioread16be(&h->ext_synd)); 202 fw = ioread32be(&h->fw_ver); 203 printf("mlx5_core: INFO: ""raw fw_ver 0x%08x\n", fw); |
116} 117 118static void poll_health(unsigned long data) 119{ 120 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 121 struct mlx5_core_health *health = &dev->priv.health; | 204} 205 206static void poll_health(unsigned long data) 207{ 208 struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data; 209 struct mlx5_core_health *health = &dev->priv.health; |
122 int next; | |
123 u32 count; 124 125 if (dev->state != MLX5_DEVICE_STATE_UP) 126 return; 127 | 210 u32 count; 211 212 if (dev->state != MLX5_DEVICE_STATE_UP) 213 return; 214 |
215 if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { 216 mlx5_trigger_cmd_completions(dev); 217 mod_timer(&health->timer, get_next_poll_jiffies()); 218 return; 219 } 220 |
|
128 count = ioread32be(health->health_counter); 129 if (count == health->prev) 130 ++health->miss_counter; 131 else 132 health->miss_counter = 0; 133 134 health->prev = count; 135 if (health->miss_counter == MAX_MISSES) { | 221 count = ioread32be(health->health_counter); 222 if (count == health->prev) 223 ++health->miss_counter; 224 else 225 health->miss_counter = 0; 226 227 health->prev = count; 228 if (health->miss_counter == MAX_MISSES) { |
136 mlx5_core_err(dev, "device's health compromised\n"); | 229 mlx5_core_err(dev, "device's health compromised - reached miss count\n"); |
137 print_health_info(dev); | 230 print_health_info(dev); |
138 spin_lock_irq(&health_lock); 139 list_add_tail(&health->list, &health_list); 140 spin_unlock_irq(&health_lock); 141 142 if (!queue_work(mlx5_core_wq, &health_work)) 143 mlx5_core_warn(dev, "failed to queue health work\n"); | |
144 } else { | 231 } else { |
145 get_random_bytes(&next, sizeof(next)); 146 next %= HZ; 147 next += jiffies + MLX5_HEALTH_POLL_INTERVAL; 148 mod_timer(&health->timer, next); | 232 mod_timer(&health->timer, get_next_poll_jiffies()); |
149 } | 233 } |
234 235 if (in_fatal(dev) && !health->sick) { 236 health->sick = true; 237 print_health_info(dev); 238 queue_work(health->wq, &health->work); 239 } |
|
150} 151 152void mlx5_start_health_poll(struct mlx5_core_dev *dev) 153{ 154 struct mlx5_core_health *health = &dev->priv.health; 155 | 240} 241 242void mlx5_start_health_poll(struct mlx5_core_dev *dev) 243{ 244 struct mlx5_core_health *health = &dev->priv.health; 245 |
156 INIT_LIST_HEAD(&health->list); | |
157 init_timer(&health->timer); | 246 init_timer(&health->timer); |
247 health->sick = 0; |
|
158 health->health = &dev->iseg->health; 159 health->health_counter = &dev->iseg->health_counter; 160 161 setup_timer(&health->timer, poll_health, (unsigned long)dev); 162 mod_timer(&health->timer, 163 round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 164} 165 166void mlx5_stop_health_poll(struct mlx5_core_dev *dev) 167{ 168 struct mlx5_core_health *health = &dev->priv.health; 169 170 del_timer_sync(&health->timer); | 248 health->health = &dev->iseg->health; 249 health->health_counter = &dev->iseg->health_counter; 250 251 setup_timer(&health->timer, poll_health, (unsigned long)dev); 252 mod_timer(&health->timer, 253 round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL)); 254} 255 256void mlx5_stop_health_poll(struct mlx5_core_dev *dev) 257{ 258 struct mlx5_core_health *health = &dev->priv.health; 259 260 del_timer_sync(&health->timer); |
171 172 spin_lock_irq(&health_lock); 173 if (!list_empty(&health->list)) 174 list_del_init(&health->list); 175 spin_unlock_irq(&health_lock); | |
176} 177 | 261} 262 |
178void mlx5_health_cleanup(void) | 263void mlx5_health_cleanup(struct mlx5_core_dev *dev) |
179{ | 264{ |
265 struct mlx5_core_health *health = &dev->priv.health; 266 267 destroy_workqueue(health->wq); |
|
180} 181 | 268} 269 |
182void __init mlx5_health_init(void) | 270#define HEALTH_NAME "mlx5_health" 271int mlx5_health_init(struct mlx5_core_dev *dev) |
183{ | 272{ |
273 struct mlx5_core_health *health; 274 char *name; 275 int len; |
|
184 | 276 |
185 INIT_WORK(&health_work, health_care); | 277 health = &dev->priv.health; 278 len = strlen(HEALTH_NAME) + strlen(dev_name(&dev->pdev->dev)); 279 name = kmalloc(len + 1, GFP_KERNEL); 280 if (!name) 281 return -ENOMEM; 282 283 snprintf(name, len, "%s:%s", HEALTH_NAME, dev_name(&dev->pdev->dev)); 284 health->wq = create_singlethread_workqueue(name); 285 kfree(name); 286 if (!health->wq) 287 return -ENOMEM; 288 289 INIT_WORK(&health->work, health_care); 290 291 return 0; |
186} | 292} |