mthca_catas.c revision 256281
/*
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
3265557Sjasone
3374912Sjhb#include <linux/jiffies.h>
3474912Sjhb#include <linux/timer.h>
3574912Sjhb#include <linux/workqueue.h>
3672200Sbmilekic
3772200Sbmilekic#include "mthca_dev.h"
3872200Sbmilekic
/*
 * Catastrophic error type codes.  handle_catas() compares these against
 * the top byte of the first (byte-swapped) word of the error buffer and
 * reports any other value as "unknown error".
 */
enum {
	MTHCA_CATAS_TYPE_INTERNAL	= 0,	/* logged as "internal error" */
	MTHCA_CATAS_TYPE_UPLINK		= 3,	/* logged as "uplink bus error" */
	MTHCA_CATAS_TYPE_DDR		= 4,	/* logged as "DDR data error" */
	MTHCA_CATAS_TYPE_PARITY		= 5,	/* logged as "internal parity error" */
};
4565557Sjasone
/* Delay, in jiffies, between two polls of the catastrophic error buffer. */
#define	MTHCA_CATAS_POLL_INTERVAL	(5 * HZ)

/* Taken around every access to catas_list in this file. */
static DEFINE_SPINLOCK(catas_lock);

/* Devices queued by handle_catas() and waiting to be restarted by catas_reset(). */
static LIST_HEAD(catas_list);
/* Workqueue created by mthca_catas_init(); catas_work is queued on it. */
static struct workqueue_struct *catas_wq;
/* Work item bound to catas_reset() by mthca_catas_init(). */
static struct work_struct catas_work;

/*
 * Module parameter: when nonzero, handle_catas() still reports and logs the
 * error but returns without queueing the device for a reset.
 */
static int catas_reset_disable;
module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
5765557Sjasone
58111881Sjhbstatic void catas_reset(struct work_struct *work)
59111881Sjhb{
60111881Sjhb	struct mthca_dev *dev, *tmpdev;
61111881Sjhb	LIST_HEAD(tlist);
62111881Sjhb	int ret;
63111881Sjhb
64111881Sjhb	mutex_lock(&mthca_device_mutex);
65111881Sjhb
66111881Sjhb	spin_lock_irq(&catas_lock);
67111881Sjhb	list_splice_init(&catas_list, &tlist);
68111881Sjhb	spin_unlock_irq(&catas_lock);
69111881Sjhb
70111881Sjhb	list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
71111881Sjhb		struct pci_dev *pdev = dev->pdev;
72111881Sjhb		ret = __mthca_restart_one(dev->pdev);
73111881Sjhb		/* 'dev' now is not valid */
74111881Sjhb		if (ret)
75111881Sjhb			printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
76111881Sjhb			       pci_name(pdev), ret);
77111881Sjhb		else {
78111881Sjhb			struct mthca_dev *d = pci_get_drvdata(pdev);
79111881Sjhb			mthca_dbg(d, "Reset succeeded\n");
80111881Sjhb		}
81111881Sjhb	}
82111881Sjhb
83111881Sjhb	mutex_unlock(&mthca_device_mutex);
84116182Sobrien}
85116182Sobrien
86116182Sobrienstatic void handle_catas(struct mthca_dev *dev)
8768790Sjhb{
8867676Sjhb	struct ib_event event;
8967676Sjhb	unsigned long flags;
9065557Sjasone	const char *type;
9167352Sjhb	int i;
92131930Smarcel
9367352Sjhb	event.device = &dev->ib_dev;
9474912Sjhb	event.event  = IB_EVENT_DEVICE_FATAL;
9574912Sjhb	event.element.port_num = 0;
9667352Sjhb	dev->active = 0;
9774912Sjhb
9865557Sjasone	ib_dispatch_event(&event);
9967676Sjhb
10065557Sjasone	switch (swab32(readl(dev->catas_err.map)) >> 24) {
10165557Sjasone	case MTHCA_CATAS_TYPE_INTERNAL:
10268790Sjhb		type = "internal error";
10368790Sjhb		break;
104111881Sjhb	case MTHCA_CATAS_TYPE_UPLINK:
105111881Sjhb		type = "uplink bus error";
106105508Sphk		break;
107105508Sphk	case MTHCA_CATAS_TYPE_DDR:
108105508Sphk		type = "DDR data error";
109139378Sjhb		break;
11074912Sjhb	case MTHCA_CATAS_TYPE_PARITY:
11165557Sjasone		type = "internal parity error";
11283798Sjhb		break;
11374912Sjhb	default:
11474912Sjhb		type = "unknown error";
11567352Sjhb		break;
11674912Sjhb	}
11771352Sjasone
11874912Sjhb	mthca_err(dev, "Catastrophic error detected: %s\n", type);
11971352Sjasone	for (i = 0; i < dev->catas_err.size; ++i)
12074912Sjhb		mthca_err(dev, "  buf[%02x]: %08x\n",
12171352Sjasone			  i, swab32(readl(dev->catas_err.map + i)));
12274912Sjhb
12374912Sjhb	if (catas_reset_disable)
12474912Sjhb		return;
12574912Sjhb
12674912Sjhb	spin_lock_irqsave(&catas_lock, flags);
12774912Sjhb	list_add(&dev->catas_err.list, &catas_list);
12874912Sjhb	queue_work(catas_wq, &catas_work);
12974912Sjhb	spin_unlock_irqrestore(&catas_lock, flags);
13074912Sjhb}
13174912Sjhb
13274912Sjhbstatic void poll_catas(unsigned long dev_ptr)
13374912Sjhb{
13474912Sjhb	struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
135112118Sjhb	int i;
13674912Sjhb
13771352Sjasone	for (i = 0; i < dev->catas_err.size; ++i)
13874912Sjhb		if (readl(dev->catas_err.map + i)) {
13974912Sjhb			handle_catas(dev);
14074912Sjhb			return;
14174912Sjhb		}
14274912Sjhb
14371352Sjasone	mod_timer(&dev->catas_err.timer,
14474912Sjhb		  round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
14571352Sjasone}
146105508Sphk
14774912Sjhbvoid mthca_start_catas_poll(struct mthca_dev *dev)
14874912Sjhb{
14974912Sjhb	unsigned long addr;
15074912Sjhb
151105508Sphk	init_timer(&dev->catas_err.timer);
15271352Sjasone	dev->catas_err.map  = NULL;
15374912Sjhb
15474912Sjhb	addr = pci_resource_start(dev->pdev, 0) +
15574912Sjhb		((pci_resource_len(dev->pdev, 0) - 1) &
15674912Sjhb		 dev->catas_err.addr);
15771352Sjasone
158112117Sjhb	dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
159112117Sjhb	if (!dev->catas_err.map) {
160112117Sjhb		mthca_warn(dev, "couldn't map catastrophic error region "
161112117Sjhb			   "at 0x%lx/0x%x\n", addr, dev->catas_err.size * 4);
16274912Sjhb		return;
16374912Sjhb	}
164112117Sjhb
165112117Sjhb	dev->catas_err.timer.data     = (unsigned long) dev;
166112117Sjhb	dev->catas_err.timer.function = poll_catas;
16774912Sjhb	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
168112117Sjhb	INIT_LIST_HEAD(&dev->catas_err.list);
16974912Sjhb	add_timer(&dev->catas_err.timer);
170112117Sjhb}
171112117Sjhb
/*
 * Stop polling the catastrophic error buffer of @dev and release it.
 *
 * The timer is stopped first so poll_catas() can no longer read the buffer,
 * then the mapping (if one was established) is removed, and finally the
 * device's list node is unlinked under catas_lock.
 */
void mthca_stop_catas_poll(struct mthca_dev *dev)
{
	del_timer_sync(&dev->catas_err.timer);

	/* map is NULL when mthca_start_catas_poll() could not map the buffer. */
	if (dev->catas_err.map)
		iounmap(dev->catas_err.map);

	spin_lock_irq(&catas_lock);
	list_del(&dev->catas_err.list);
	spin_unlock_irq(&catas_lock);
}
18374912Sjhb
18476272Sjhbint __init mthca_catas_init(void)
18576272Sjhb{
186111881Sjhb	INIT_WORK(&catas_work, catas_reset);
187112115Sjhb
188112061Sjhb	catas_wq = create_singlethread_workqueue("mthcacatas");
189100011Smp	if (!catas_wq)
190100011Smp		return -ENOMEM;
191100011Smp
192100011Smp	return 0;
19372200Sbmilekic}
194134873Sjmg
/* Destroy the workqueue created by mthca_catas_init(). */
void mthca_catas_cleanup(void)
{
	destroy_workqueue(catas_wq);
}
199112562Sjhb