1// SPDX-License-Identifier: GPL-2.0-or-later
2/*
3 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
4 *
5 * Copyright 2014 IBM Corporation
6 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
7 */
8
9#undef DEBUG
10
11#include <linux/kernel.h>
12#include <linux/init.h>
13#include <linux/of.h>
14#include <linux/mm.h>
15#include <linux/slab.h>
16
17#include <asm/opal.h>
18#include <asm/cputable.h>
19#include <asm/machdep.h>
20
21#include "powernv.h"
22
/* Non-zero once opal_hmi_handler_init() has registered the HMI notifier. */
static int opal_hmi_handler_nb_init;
/*
 * List node holding a private copy of one HMI event while it waits on
 * opal_hmi_evt_list to be processed by the work handler.
 */
struct OpalHmiEvtNode {
	struct list_head list;		/* link in opal_hmi_evt_list */
	struct OpalHMIEvent hmi_evt;	/* copy of the event taken from the OPAL message */
};
28
/*
 * One row of a checkstop decode table: associates a reason bit mask with
 * the name of the failing unit and a human-readable description.
 */
struct xstop_reason {
	uint32_t xstop_reason;		/* mask tested against the event's reason word */
	const char *unit_failed;	/* unit name printed in the "[Unit: ...]" prefix */
	const char *description;	/* text printed after the unit name */
};
34
/* Pending HMI events: filled by the notifier, drained by the work handler. */
static LIST_HEAD(opal_hmi_evt_list);
/* Taken around every access to opal_hmi_evt_list. */
static DEFINE_SPINLOCK(opal_hmi_evt_lock);
37
38static void print_core_checkstop_reason(const char *level,
39					struct OpalHMIEvent *hmi_evt)
40{
41	int i;
42	static const struct xstop_reason xstop_reason[] = {
43		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
44				"RegFile core check stop" },
45		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
46		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
47				"Core checkstop during recovery" },
48		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
49				"RegFile core check stop (mapper error)" },
50		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
51		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
52		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
53		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
54				"Recovery in maintenance mode" },
55		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
56				"RegFile core check stop" },
57		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
58				"Forward Progress Error" },
59		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
60		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
61		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
62				"Hypervisor Resource error - core check stop" },
63		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
64				"Hang Recovery Failed (core check stop)" },
65		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
66				"Ambiguous Hang Detected (unknown source)" },
67		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
68				"Debug Trigger Error inject" },
69		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
70				"Hypervisor check stop via SPRC/SPRD" },
71	};
72
73	/* Validity check */
74	if (!hmi_evt->u.xstop_error.xstop_reason) {
75		printk("%s	Unknown Core check stop.\n", level);
76		return;
77	}
78
79	printk("%s	CPU PIR: %08x\n", level,
80			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
81	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
82		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
83					xstop_reason[i].xstop_reason)
84			printk("%s	[Unit: %-3s] %s\n", level,
85					xstop_reason[i].unit_failed,
86					xstop_reason[i].description);
87}
88
89static void print_nx_checkstop_reason(const char *level,
90					struct OpalHMIEvent *hmi_evt)
91{
92	int i;
93	static const struct xstop_reason xstop_reason[] = {
94		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
95					"SHM invalid state error" },
96		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
97					"DMA invalid state error bit 15" },
98		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
99					"DMA invalid state error bit 16" },
100		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
101					"Channel 0 invalid state error" },
102		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
103					"Channel 1 invalid state error" },
104		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
105					"Channel 2 invalid state error" },
106		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
107					"Channel 3 invalid state error" },
108		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
109					"Channel 4 invalid state error" },
110		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
111					"Channel 5 invalid state error" },
112		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
113					"Channel 6 invalid state error" },
114		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
115					"Channel 7 invalid state error" },
116		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
117					"UE error on CRB(CSB address, CCB)" },
118		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
119					"SUE error on CRB(CSB address, CCB)" },
120		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
121		"CRB Kill ISN received while holding ISN with UE error" },
122	};
123
124	/* Validity check */
125	if (!hmi_evt->u.xstop_error.xstop_reason) {
126		printk("%s	Unknown NX check stop.\n", level);
127		return;
128	}
129
130	printk("%s	NX checkstop on CHIP ID: %x\n", level,
131			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
132	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
133		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
134					xstop_reason[i].xstop_reason)
135			printk("%s	[Unit: %-3s] %s\n", level,
136					xstop_reason[i].unit_failed,
137					xstop_reason[i].description);
138}
139
140static void print_npu_checkstop_reason(const char *level,
141					struct OpalHMIEvent *hmi_evt)
142{
143	uint8_t reason, reason_count, i;
144
145	/*
146	 * We may not have a checkstop reason on some combination of
147	 * hardware and/or skiboot version
148	 */
149	if (!hmi_evt->u.xstop_error.xstop_reason) {
150		printk("%s	NPU checkstop on chip %x\n", level,
151			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
152		return;
153	}
154
155	/*
156	 * NPU2 has 3 FIRs. Reason encoded on a byte as:
157	 *   2 bits for the FIR number
158	 *   6 bits for the bit number
159	 * It may be possible to find several reasons.
160	 *
161	 * We don't display a specific message per FIR bit as there
162	 * are too many and most are meaningless without the workbook
163	 * and/or hw team help anyway.
164	 */
165	reason_count = sizeof(hmi_evt->u.xstop_error.xstop_reason) /
166		sizeof(reason);
167	for (i = 0; i < reason_count; i++) {
168		reason = (hmi_evt->u.xstop_error.xstop_reason >> (8 * i)) & 0xFF;
169		if (reason)
170			printk("%s	NPU checkstop on chip %x: FIR%d bit %d is set\n",
171				level,
172				be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id),
173				reason >> 6, reason & 0x3F);
174	}
175}
176
177static void print_checkstop_reason(const char *level,
178					struct OpalHMIEvent *hmi_evt)
179{
180	uint8_t type = hmi_evt->u.xstop_error.xstop_type;
181	switch (type) {
182	case CHECKSTOP_TYPE_CORE:
183		print_core_checkstop_reason(level, hmi_evt);
184		break;
185	case CHECKSTOP_TYPE_NX:
186		print_nx_checkstop_reason(level, hmi_evt);
187		break;
188	case CHECKSTOP_TYPE_NPU:
189		print_npu_checkstop_reason(level, hmi_evt);
190		break;
191	default:
192		printk("%s	Unknown Malfunction Alert of type %d\n",
193		       level, type);
194		break;
195	}
196}
197
198static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
199{
200	const char *level, *sevstr, *error_info;
201	static const char *hmi_error_types[] = {
202		"Malfunction Alert",
203		"Processor Recovery done",
204		"Processor recovery occurred again",
205		"Processor recovery occurred for masked error",
206		"Timer facility experienced an error",
207		"TFMR SPR is corrupted",
208		"UPS (Uninterrupted Power System) Overflow indication",
209		"An XSCOM operation failure",
210		"An XSCOM operation completed",
211		"SCOM has set a reserved FIR bit to cause recovery",
212		"Debug trigger has set a reserved FIR bit to cause recovery",
213		"A hypervisor resource error occurred",
214		"CAPP recovery process is in progress",
215	};
216	static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
217				      DEFAULT_RATELIMIT_BURST);
218
219	/* Print things out */
220	if (hmi_evt->version < OpalHMIEvt_V1) {
221		pr_err("HMI Interrupt, Unknown event version %d !\n",
222			hmi_evt->version);
223		return;
224	}
225	switch (hmi_evt->severity) {
226	case OpalHMI_SEV_NO_ERROR:
227		level = KERN_INFO;
228		sevstr = "Harmless";
229		break;
230	case OpalHMI_SEV_WARNING:
231		level = KERN_WARNING;
232		sevstr = "";
233		break;
234	case OpalHMI_SEV_ERROR_SYNC:
235		level = KERN_ERR;
236		sevstr = "Severe";
237		break;
238	case OpalHMI_SEV_FATAL:
239	default:
240		level = KERN_ERR;
241		sevstr = "Fatal";
242		break;
243	}
244
245	if (hmi_evt->severity != OpalHMI_SEV_NO_ERROR || __ratelimit(&rs)) {
246		printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
247			level, sevstr,
248			hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
249			"Recovered" : "Not recovered");
250		error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
251				hmi_error_types[hmi_evt->type]
252				: "Unknown";
253		printk("%s Error detail: %s\n", level, error_info);
254		printk("%s	HMER: %016llx\n", level,
255					be64_to_cpu(hmi_evt->hmer));
256		if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
257			(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
258			printk("%s	TFMR: %016llx\n", level,
259						be64_to_cpu(hmi_evt->tfmr));
260	}
261
262	if (hmi_evt->version < OpalHMIEvt_V2)
263		return;
264
265	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
266	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
267		print_checkstop_reason(level, hmi_evt);
268}
269
270static void hmi_event_handler(struct work_struct *work)
271{
272	unsigned long flags;
273	struct OpalHMIEvent *hmi_evt;
274	struct OpalHmiEvtNode *msg_node;
275	uint8_t disposition;
276	struct opal_msg msg;
277	int unrecoverable = 0;
278
279	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
280	while (!list_empty(&opal_hmi_evt_list)) {
281		msg_node = list_entry(opal_hmi_evt_list.next,
282					   struct OpalHmiEvtNode, list);
283		list_del(&msg_node->list);
284		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
285
286		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
287		print_hmi_event_info(hmi_evt);
288		disposition = hmi_evt->disposition;
289		kfree(msg_node);
290
291		/*
292		 * Check if HMI event has been recovered or not. If not
293		 * then kernel can't continue, we need to panic.
294		 * But before we do that, display all the HMI event
295		 * available on the list and set unrecoverable flag to 1.
296		 */
297		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
298			unrecoverable = 1;
299
300		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
301	}
302	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
303
304	if (unrecoverable) {
305		/* Pull all HMI events from OPAL before we panic. */
306		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
307			u32 type;
308
309			type = be32_to_cpu(msg.msg_type);
310
311			/* skip if not HMI event */
312			if (type != OPAL_MSG_HMI_EVT)
313				continue;
314
315			/* HMI event info starts from param[0] */
316			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
317			print_hmi_event_info(hmi_evt);
318		}
319
320		pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception");
321	}
322}
323
/* Work item that runs hmi_event_handler(); scheduled by the HMI notifier. */
static DECLARE_WORK(hmi_event_work, hmi_event_handler);
325/*
326 * opal_handle_hmi_event - notifier handler that queues up HMI events
327 * to be preocessed later.
328 */
329static int opal_handle_hmi_event(struct notifier_block *nb,
330			  unsigned long msg_type, void *msg)
331{
332	unsigned long flags;
333	struct OpalHMIEvent *hmi_evt;
334	struct opal_msg *hmi_msg = msg;
335	struct OpalHmiEvtNode *msg_node;
336
337	/* Sanity Checks */
338	if (msg_type != OPAL_MSG_HMI_EVT)
339		return 0;
340
341	/* HMI event info starts from param[0] */
342	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
343
344	/* Delay the logging of HMI events to workqueue. */
345	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
346	if (!msg_node) {
347		pr_err("HMI: out of memory, Opal message event not handled\n");
348		return -ENOMEM;
349	}
350	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(*hmi_evt));
351
352	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
353	list_add(&msg_node->list, &opal_hmi_evt_list);
354	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
355
356	schedule_work(&hmi_event_work);
357	return 0;
358}
359
/* Notifier block registered for OPAL_MSG_HMI_EVT by opal_hmi_handler_init(). */
static struct notifier_block opal_hmi_handler_nb = {
	.notifier_call	= opal_handle_hmi_event,
	.next		= NULL,
	.priority	= 0,
};
365
366int __init opal_hmi_handler_init(void)
367{
368	int ret;
369
370	if (!opal_hmi_handler_nb_init) {
371		ret = opal_message_notifier_register(
372				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
373		if (ret) {
374			pr_err("%s: Can't register OPAL event notifier (%d)\n",
375			       __func__, ret);
376			return ret;
377		}
378		opal_hmi_handler_nb_init = 1;
379	}
380	return 0;
381}
382