// SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
/* Copyright(c) 2014 - 2020 Intel Corporation */
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/delay.h>
#include "adf_accel_devices.h"
#include "adf_common_drv.h"
#include "adf_pfvf_pf_msg.h"

struct adf_fatal_error_data {
	struct adf_accel_dev *accel_dev;
	struct work_struct work;
};

static struct workqueue_struct *device_reset_wq;
static struct workqueue_struct *device_sriov_wq;

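/*
 * PCI AER error_detected() callback. Unless the error is a permanent failure,
 * mark the device as restarting, disable arbitration if supported, notify
 * registered services and any VFs, wait for the VFs to quiesce, disable bus
 * mastering and bring the device down before requesting a reset from the
 * AER core.
 */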
static pci_ers_result_t adf_error_detected(struct pci_dev *pdev,
					   pci_channel_state_t state)
{
	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);

	dev_info(&pdev->dev, "Acceleration driver hardware error detected.\n");
	if (!accel_dev) {
		dev_err(&pdev->dev, "Can't find acceleration device\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	if (state == pci_channel_io_perm_failure) {
		dev_err(&pdev->dev, "Can't recover from device error\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
	if (accel_dev->hw_device->exit_arb) {
		dev_dbg(&pdev->dev, "Disabling arbitration\n");
		accel_dev->hw_device->exit_arb(accel_dev);
	}
	adf_error_notifier(accel_dev);
	adf_pf2vf_notify_fatal_error(accel_dev);
	adf_dev_restarting_notify(accel_dev);
	adf_pf2vf_notify_restarting(accel_dev);
	adf_pf2vf_wait_for_restarting_complete(accel_dev);
	pci_clear_master(pdev);
	adf_dev_down(accel_dev, false);

	return PCI_ERS_RESULT_NEED_RESET;
}

/* reset dev data */
struct adf_reset_dev_data {
	int mode;
	struct adf_accel_dev *accel_dev;
	struct completion compl;
	struct work_struct reset_work;
};

/* sriov dev data */
struct adf_sriov_dev_data {
	struct adf_accel_dev *accel_dev;
	struct completion compl;
	struct work_struct sriov_work;
};

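/*
 * Reset the device by asserting and de-asserting the secondary bus reset bit
 * in the parent bridge's control register. If there is no parent bridge, the
 * reset is attempted on the device itself.
 */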
void adf_reset_sbr(struct adf_accel_dev *accel_dev)
{
	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);
	struct pci_dev *parent = pdev->bus->self;
	u16 bridge_ctl = 0;

	if (!parent)
		parent = pdev;

	if (!pci_wait_for_pending_transaction(pdev))
		dev_info(&GET_DEV(accel_dev),
			 "Transaction still in progress. Proceeding\n");

	dev_info(&GET_DEV(accel_dev), "Secondary bus reset\n");

	pci_read_config_word(parent, PCI_BRIDGE_CONTROL, &bridge_ctl);
	bridge_ctl |= PCI_BRIDGE_CTL_BUS_RESET;
	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
	msleep(100);
	bridge_ctl &= ~PCI_BRIDGE_CTL_BUS_RESET;
	pci_write_config_word(parent, PCI_BRIDGE_CONTROL, bridge_ctl);
	msleep(100);
}
EXPORT_SYMBOL_GPL(adf_reset_sbr);

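/* Reset the device by issuing a PCIe Function Level Reset. */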
void adf_reset_flr(struct adf_accel_dev *accel_dev)
{
	pcie_flr(accel_to_pci_dev(accel_dev));
}
EXPORT_SYMBOL_GPL(adf_reset_flr);

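/*
 * Run the hardware-specific reset callback, then restore the previously saved
 * PCI configuration space and save it again for the next recovery cycle.
 */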
void adf_dev_restore(struct adf_accel_dev *accel_dev)
{
	struct adf_hw_device_data *hw_device = accel_dev->hw_device;
	struct pci_dev *pdev = accel_to_pci_dev(accel_dev);

	if (hw_device->reset_device) {
		dev_info(&GET_DEV(accel_dev), "Resetting device qat_dev%d\n",
			 accel_dev->accel_id);
		hw_device->reset_device(accel_dev);
		pci_restore_state(pdev);
		pci_save_state(pdev);
	}
}

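/* Work item that re-enables SR-IOV after a reset and signals completion. */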
static void adf_device_sriov_worker(struct work_struct *work)
{
	struct adf_sriov_dev_data *sriov_data =
		container_of(work, struct adf_sriov_dev_data, sriov_work);

	adf_reenable_sriov(sriov_data->accel_dev);
	complete(&sriov_data->compl);
}

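/*
 * Reset work item: notify that the device is restarting, restart it, schedule
 * SR-IOV re-enablement on its own workqueue and, once that completes, inform
 * the VFs and registered services that the device is back up.
 */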
static void adf_device_reset_worker(struct work_struct *work)
{
	struct adf_reset_dev_data *reset_data =
		  container_of(work, struct adf_reset_dev_data, reset_work);
	struct adf_accel_dev *accel_dev = reset_data->accel_dev;
	unsigned long wait_jiffies = msecs_to_jiffies(10000);
	struct adf_sriov_dev_data sriov_data;

	adf_dev_restarting_notify(accel_dev);
	if (adf_dev_restart(accel_dev)) {
		/* The device hung and we can't restart it, so stop here */
		dev_err(&GET_DEV(accel_dev), "Restart device failed\n");
		if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
		    completion_done(&reset_data->compl))
			kfree(reset_data);
		WARN(1, "QAT: device restart failed. Device is unusable\n");
		return;
	}

	sriov_data.accel_dev = accel_dev;
	init_completion(&sriov_data.compl);
	INIT_WORK(&sriov_data.sriov_work, adf_device_sriov_worker);
	queue_work(device_sriov_wq, &sriov_data.sriov_work);
	if (wait_for_completion_timeout(&sriov_data.compl, wait_jiffies))
		adf_pf2vf_notify_restarted(accel_dev);

	adf_dev_restarted_notify(accel_dev);
	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);

	/*
	 * The device is back alive. Notify the caller if in sync mode.
	 *
	 * If the device restart takes more time than expected,
	 * adf_dev_aer_schedule_reset() can time out and return. This is
	 * detected by calling completion_done(), in which case the
	 * reset_data structure must be freed here.
	 */
	if (reset_data->mode == ADF_DEV_RESET_ASYNC ||
	    completion_done(&reset_data->compl))
		kfree(reset_data);
	else
		complete(&reset_data->compl);
}

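/*
 * Queue a device reset on the reset workqueue. In synchronous mode wait up to
 * 10 seconds for the reset worker to complete; in asynchronous mode return
 * immediately and let the worker free reset_data.
 */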
static int adf_dev_aer_schedule_reset(struct adf_accel_dev *accel_dev,
				      enum adf_dev_reset_mode mode)
{
	struct adf_reset_dev_data *reset_data;

	if (!adf_dev_started(accel_dev) ||
	    test_bit(ADF_STATUS_RESTARTING, &accel_dev->status))
		return 0;

	set_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
	reset_data = kzalloc(sizeof(*reset_data), GFP_KERNEL);
	if (!reset_data)
		return -ENOMEM;
	reset_data->accel_dev = accel_dev;
	init_completion(&reset_data->compl);
	reset_data->mode = mode;
	INIT_WORK(&reset_data->reset_work, adf_device_reset_worker);
	queue_work(device_reset_wq, &reset_data->reset_work);

	/* If in sync mode wait for the result */
	if (mode == ADF_DEV_RESET_SYNC) {
		int ret = 0;
		/* Maximum device reset time is 10 seconds */
		unsigned long wait_jiffies = msecs_to_jiffies(10000);
		unsigned long timeout = wait_for_completion_timeout(
				   &reset_data->compl, wait_jiffies);
		if (!timeout) {
			dev_err(&GET_DEV(accel_dev),
				"Reset device timeout expired\n");
			ret = -EFAULT;
		} else {
			kfree(reset_data);
		}
		return ret;
	}
	return 0;
}

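/*
 * PCI AER slot_reset() callback. Re-enable bus mastering, restore the saved
 * PCI configuration space, bring the device back up, re-enable SR-IOV and
 * notify the VFs and registered services that the device has restarted.
 */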
static pci_ers_result_t adf_slot_reset(struct pci_dev *pdev)
{
	struct adf_accel_dev *accel_dev = adf_devmgr_pci_to_accel_dev(pdev);
	int res = 0;

	if (!accel_dev) {
		pr_err("QAT: Can't find acceleration device\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}

	if (!pdev->is_busmaster)
		pci_set_master(pdev);
	pci_restore_state(pdev);
	pci_save_state(pdev);
	res = adf_dev_up(accel_dev, false);
	if (res && res != -EALREADY)
		return PCI_ERS_RESULT_DISCONNECT;

	adf_reenable_sriov(accel_dev);
	adf_pf2vf_notify_restarted(accel_dev);
	adf_dev_restarted_notify(accel_dev);
	clear_bit(ADF_STATUS_RESTARTING, &accel_dev->status);
	return PCI_ERS_RESULT_RECOVERED;
}

static void adf_resume(struct pci_dev *pdev)
{
	dev_info(&pdev->dev, "Acceleration driver reset completed\n");
	dev_info(&pdev->dev, "Device is up and running\n");
}

const struct pci_error_handlers adf_err_handler = {
	.error_detected = adf_error_detected,
	.slot_reset = adf_slot_reset,
	.resume = adf_resume,
};
EXPORT_SYMBOL_GPL(adf_err_handler);

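/* Schedule an asynchronous reset if automatic reset on error is enabled. */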
int adf_dev_autoreset(struct adf_accel_dev *accel_dev)
{
	if (accel_dev->autoreset_on_error)
		return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC);

	return 0;
}

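/*
 * Fatal error work item: notify registered services and, on the PF, disable
 * arbitration (when an automatic reset will follow), inform the VFs and
 * trigger the automatic reset.
 */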
static void adf_notify_fatal_error_worker(struct work_struct *work)
{
	struct adf_fatal_error_data *wq_data =
			container_of(work, struct adf_fatal_error_data, work);
	struct adf_accel_dev *accel_dev = wq_data->accel_dev;
	struct adf_hw_device_data *hw_device = accel_dev->hw_device;

	adf_error_notifier(accel_dev);

	if (!accel_dev->is_vf) {
		/* Disable arbitration to stop processing of new requests */
		if (accel_dev->autoreset_on_error && hw_device->exit_arb)
			hw_device->exit_arb(accel_dev);
		if (accel_dev->pf.vf_info)
			adf_pf2vf_notify_fatal_error(accel_dev);
		adf_dev_autoreset(accel_dev);
	}

	kfree(wq_data);
}

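/*
 * Queue fatal error handling on the common misc workqueue. The work data is
 * allocated with GFP_ATOMIC so this can be called from atomic context.
 */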
int adf_notify_fatal_error(struct adf_accel_dev *accel_dev)
{
	struct adf_fatal_error_data *wq_data;

	wq_data = kzalloc(sizeof(*wq_data), GFP_ATOMIC);
	if (!wq_data)
		return -ENOMEM;

	wq_data->accel_dev = accel_dev;
	INIT_WORK(&wq_data->work, adf_notify_fatal_error_worker);
	adf_misc_wq_queue_work(&wq_data->work);

	return 0;
}

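/* Allocate the workqueues used for device reset and SR-IOV re-enablement. */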
int adf_init_aer(void)
{
	device_reset_wq = alloc_workqueue("qat_device_reset_wq",
					  WQ_MEM_RECLAIM, 0);
	if (!device_reset_wq)
		return -EFAULT;

	device_sriov_wq = alloc_workqueue("qat_device_sriov_wq", 0, 0);
	if (!device_sriov_wq) {
		destroy_workqueue(device_reset_wq);
		device_reset_wq = NULL;
		return -EFAULT;
	}

	return 0;
}

void adf_exit_aer(void)
{
	if (device_reset_wq)
		destroy_workqueue(device_reset_wq);
	device_reset_wq = NULL;

	if (device_sriov_wq)
		destroy_workqueue(device_sriov_wq);
	device_sriov_wq = NULL;
}
