/* mthca_catas.c revision 256281 */
165557Sjasone/* 265557Sjasone * Copyright (c) 2005 Cisco Systems. All rights reserved. 365557Sjasone * 465557Sjasone * This software is available to you under a choice of one of two 565557Sjasone * licenses. You may choose to be licensed under the terms of the GNU 665557Sjasone * General Public License (GPL) Version 2, available from the file 765557Sjasone * COPYING in the main directory of this source tree, or the 865557Sjasone * OpenIB.org BSD license below: 965557Sjasone * 1065557Sjasone * Redistribution and use in source and binary forms, with or 1165557Sjasone * without modification, are permitted provided that the following 1265557Sjasone * conditions are met: 1365557Sjasone * 1465557Sjasone * - Redistributions of source code must retain the above 1565557Sjasone * copyright notice, this list of conditions and the following 1665557Sjasone * disclaimer. 1765557Sjasone * 1865557Sjasone * - Redistributions in binary form must reproduce the above 1965557Sjasone * copyright notice, this list of conditions and the following 2065557Sjasone * disclaimer in the documentation and/or other materials 2165557Sjasone * provided with the distribution. 2265557Sjasone * 2365557Sjasone * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 2465557Sjasone * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 2565557Sjasone * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 2665557Sjasone * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 2765557Sjasone * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 2865557Sjasone * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 2967352Sjhb * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 3065557Sjasone * SOFTWARE. 
3165557Sjasone */ 3265557Sjasone 3374912Sjhb#include <linux/jiffies.h> 3474912Sjhb#include <linux/timer.h> 3574912Sjhb#include <linux/workqueue.h> 3672200Sbmilekic 3772200Sbmilekic#include "mthca_dev.h" 3872200Sbmilekic 3965557Sjasoneenum { 4065557Sjasone MTHCA_CATAS_TYPE_INTERNAL = 0, 4165557Sjasone MTHCA_CATAS_TYPE_UPLINK = 3, 4265557Sjasone MTHCA_CATAS_TYPE_DDR = 4, 4365557Sjasone MTHCA_CATAS_TYPE_PARITY = 5, 4465557Sjasone}; 4565557Sjasone 4665557Sjasone#define MTHCA_CATAS_POLL_INTERVAL (5 * HZ) 4765557Sjasone 4865557Sjasonestatic DEFINE_SPINLOCK(catas_lock); 4965557Sjasone 5065557Sjasonestatic LIST_HEAD(catas_list); 5165557Sjasonestatic struct workqueue_struct *catas_wq; 5265557Sjasonestatic struct work_struct catas_work; 5365557Sjasone 5465557Sjasonestatic int catas_reset_disable; 5565557Sjasonemodule_param_named(catas_reset_disable, catas_reset_disable, int, 0644); 5665557SjasoneMODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero"); 5765557Sjasone 58111881Sjhbstatic void catas_reset(struct work_struct *work) 59111881Sjhb{ 60111881Sjhb struct mthca_dev *dev, *tmpdev; 61111881Sjhb LIST_HEAD(tlist); 62111881Sjhb int ret; 63111881Sjhb 64111881Sjhb mutex_lock(&mthca_device_mutex); 65111881Sjhb 66111881Sjhb spin_lock_irq(&catas_lock); 67111881Sjhb list_splice_init(&catas_list, &tlist); 68111881Sjhb spin_unlock_irq(&catas_lock); 69111881Sjhb 70111881Sjhb list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) { 71111881Sjhb struct pci_dev *pdev = dev->pdev; 72111881Sjhb ret = __mthca_restart_one(dev->pdev); 73111881Sjhb /* 'dev' now is not valid */ 74111881Sjhb if (ret) 75111881Sjhb printk(KERN_ERR "mthca %s: Reset failed (%d)\n", 76111881Sjhb pci_name(pdev), ret); 77111881Sjhb else { 78111881Sjhb struct mthca_dev *d = pci_get_drvdata(pdev); 79111881Sjhb mthca_dbg(d, "Reset succeeded\n"); 80111881Sjhb } 81111881Sjhb } 82111881Sjhb 83111881Sjhb mutex_unlock(&mthca_device_mutex); 84116182Sobrien} 85116182Sobrien 
86116182Sobrienstatic void handle_catas(struct mthca_dev *dev) 8768790Sjhb{ 8867676Sjhb struct ib_event event; 8967676Sjhb unsigned long flags; 9065557Sjasone const char *type; 9167352Sjhb int i; 92131930Smarcel 9367352Sjhb event.device = &dev->ib_dev; 9474912Sjhb event.event = IB_EVENT_DEVICE_FATAL; 9574912Sjhb event.element.port_num = 0; 9667352Sjhb dev->active = 0; 9774912Sjhb 9865557Sjasone ib_dispatch_event(&event); 9967676Sjhb 10065557Sjasone switch (swab32(readl(dev->catas_err.map)) >> 24) { 10165557Sjasone case MTHCA_CATAS_TYPE_INTERNAL: 10268790Sjhb type = "internal error"; 10368790Sjhb break; 104111881Sjhb case MTHCA_CATAS_TYPE_UPLINK: 105111881Sjhb type = "uplink bus error"; 106105508Sphk break; 107105508Sphk case MTHCA_CATAS_TYPE_DDR: 108105508Sphk type = "DDR data error"; 109139378Sjhb break; 11074912Sjhb case MTHCA_CATAS_TYPE_PARITY: 11165557Sjasone type = "internal parity error"; 11283798Sjhb break; 11374912Sjhb default: 11474912Sjhb type = "unknown error"; 11567352Sjhb break; 11674912Sjhb } 11771352Sjasone 11874912Sjhb mthca_err(dev, "Catastrophic error detected: %s\n", type); 11971352Sjasone for (i = 0; i < dev->catas_err.size; ++i) 12074912Sjhb mthca_err(dev, " buf[%02x]: %08x\n", 12171352Sjasone i, swab32(readl(dev->catas_err.map + i))); 12274912Sjhb 12374912Sjhb if (catas_reset_disable) 12474912Sjhb return; 12574912Sjhb 12674912Sjhb spin_lock_irqsave(&catas_lock, flags); 12774912Sjhb list_add(&dev->catas_err.list, &catas_list); 12874912Sjhb queue_work(catas_wq, &catas_work); 12974912Sjhb spin_unlock_irqrestore(&catas_lock, flags); 13074912Sjhb} 13174912Sjhb 13274912Sjhbstatic void poll_catas(unsigned long dev_ptr) 13374912Sjhb{ 13474912Sjhb struct mthca_dev *dev = (struct mthca_dev *) dev_ptr; 135112118Sjhb int i; 13674912Sjhb 13771352Sjasone for (i = 0; i < dev->catas_err.size; ++i) 13874912Sjhb if (readl(dev->catas_err.map + i)) { 13974912Sjhb handle_catas(dev); 14074912Sjhb return; 14174912Sjhb } 14274912Sjhb 14371352Sjasone 
mod_timer(&dev->catas_err.timer, 14474912Sjhb round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL)); 14571352Sjasone} 146105508Sphk 14774912Sjhbvoid mthca_start_catas_poll(struct mthca_dev *dev) 14874912Sjhb{ 14974912Sjhb unsigned long addr; 15074912Sjhb 151105508Sphk init_timer(&dev->catas_err.timer); 15271352Sjasone dev->catas_err.map = NULL; 15374912Sjhb 15474912Sjhb addr = pci_resource_start(dev->pdev, 0) + 15574912Sjhb ((pci_resource_len(dev->pdev, 0) - 1) & 15674912Sjhb dev->catas_err.addr); 15771352Sjasone 158112117Sjhb dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4); 159112117Sjhb if (!dev->catas_err.map) { 160112117Sjhb mthca_warn(dev, "couldn't map catastrophic error region " 161112117Sjhb "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4); 16274912Sjhb return; 16374912Sjhb } 164112117Sjhb 165112117Sjhb dev->catas_err.timer.data = (unsigned long) dev; 166112117Sjhb dev->catas_err.timer.function = poll_catas; 16774912Sjhb dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL; 168112117Sjhb INIT_LIST_HEAD(&dev->catas_err.list); 16974912Sjhb add_timer(&dev->catas_err.timer); 170112117Sjhb} 171112117Sjhb 172112562Sjhbvoid mthca_stop_catas_poll(struct mthca_dev *dev) 17374912Sjhb{ 174112118Sjhb del_timer_sync(&dev->catas_err.timer); 175112116Sjhb 17674912Sjhb if (dev->catas_err.map) 17774912Sjhb iounmap(dev->catas_err.map); 17874912Sjhb 17974912Sjhb spin_lock_irq(&catas_lock); 18074912Sjhb list_del(&dev->catas_err.list); 18174912Sjhb spin_unlock_irq(&catas_lock); 18274912Sjhb} 18374912Sjhb 18476272Sjhbint __init mthca_catas_init(void) 18576272Sjhb{ 186111881Sjhb INIT_WORK(&catas_work, catas_reset); 187112115Sjhb 188112061Sjhb catas_wq = create_singlethread_workqueue("mthcacatas"); 189100011Smp if (!catas_wq) 190100011Smp return -ENOMEM; 191100011Smp 192100011Smp return 0; 19372200Sbmilekic} 194134873Sjmg 19572200Sbmilekicvoid mthca_catas_cleanup(void) 196112562Sjhb{ 197112562Sjhb destroy_workqueue(catas_wq); 198112562Sjhb} 199112562Sjhb