cmd_dp_page.c revision 1186:7791ded250f8
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
/*
 * Support routines for managing potential page and bank faults that have
 * been deferred due to a datapath error.  Currently deferment only occurs
 * if a memory UE occurs while a datapath error is active.  When this happens
 * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER.  An
 * entry (a cmd_dp_defer_t) is added to a list of deferred pages.  The entry
 * links to the cmd_page_t in the cmd_pages list and also keeps track of what
 * memory controller ids are associated with the first AFAR and any more that
 * are seen while the page is deferred.  This information is used to determine
 * if the page should be faulted or if the fault should be skipped because an
 * intervening datapath fault has occurred.  If a page is faulted when it is
 * replayed, the corresponding bank is faulted, too, since the original error
 * was a UE.  Note that no action is taken to undo any action taken by the
 * kernel when the UE was detected.  Currently the kernel will attempt to
 * immediately retire the page where a UE is detected and the retire may or
 * may not have completed by the time FMA receives an ereport.  The possibility
 * of a datapath fault resulting in memory UEs is very small, so the likelihood
 * of encountering this scenario is also very small.
 */
48
49#include <cmd.h>
50#include <cmd_dp.h>
51#include <cmd_dp_page.h>
52#include <cmd_bank.h>
53#include <cmd_page.h>
54
55#include <fm/fmd_api.h>
56#include <sys/nvpair.h>
57
58extern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *);
59
60static void
61dp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
62{
63	fmd_buf_write(hdl, dpage->dp_defer_page->page_case, "mcids",
64	    &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
65}
66
67static void
68dp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
69{
70	fmd_buf_read(hdl, dpage->dp_defer_page->page_case, "mcids",
71	    &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids));
72}
73
74static void
75dp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar)
76{
77	int mcid;
78	int i;
79
80	if (cmd_dp_get_mcid(afar, &mcid) < 0)
81		fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed");
82
83	for (i = 0; i < DP_MAX_MCS; i++) {
84		if (dpage->dp_defer_mcids[i] == -1) {
85			dpage->dp_defer_mcids[i] = mcid;
86			break;
87		}
88		if (dpage->dp_defer_mcids[i] == mcid)
89			break;
90	}
91
92	if (i == DP_MAX_MCS)
93		fmd_hdl_abort(hdl, "too many mcids for deferred page");
94
95	dp_page_defer_data_write(hdl, dpage);
96}
97
98static cmd_dp_defer_t *
99dp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar)
100{
101	cmd_dp_defer_t *dpage;
102	int i;
103
104	dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
105
106	dpage->dp_defer_page = page;
107
108	for (i = 0; i < DP_MAX_MCS; i++)
109		dpage->dp_defer_mcids[i] = -1;
110
111	dp_page_defer_add_data(hdl, dpage, afar);
112
113	cmd_list_append(&cmd.cmd_deferred_pages, dpage);
114
115	return (dpage);
116}
117
118static cmd_dp_defer_t *
119dp_page_defer_lookup(cmd_page_t *page)
120{
121	cmd_dp_defer_t *dpage;
122
123	for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
124	    dpage = cmd_list_next(dpage)) {
125		if (dpage->dp_defer_page == page)
126			return (dpage);
127	}
128
129	return (NULL);
130}
131
132void
133cmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep,
134    uint64_t afar)
135{
136	cmd_dp_defer_t *dpage;
137	cmd_page_t *page = cmd_page_lookup(afar);
138	const char *uuid;
139
140	if (page == NULL) {
141		page = cmd_page_create(hdl, modasru, afar);
142		dpage = dp_page_defer_create(hdl, page, afar);
143		page->page_case = cmd_case_create(hdl, &page->page_header,
144		    CMD_PTR_DP_PAGE_DEFER, &uuid);
145		fmd_case_setprincipal(hdl, page->page_case, ep);
146	} else {
147		dpage = dp_page_defer_lookup(page);
148		if (dpage == NULL)
149			fmd_hdl_abort(hdl, "deferred page with no defer data");
150		fmd_case_add_ereport(hdl, page->page_case, ep);
151	}
152
153	dp_page_defer_add_data(hdl, dpage, afar);
154}
155
156int
157cmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage)
158{
159	int i;
160
161	for (i = 0; i < DP_MAX_MCS; i++) {
162		if (dpage->dp_defer_mcids[i] == -1)
163			break;
164		/*
165		 * If there's no datapath fault corresponding to
166		 * an mcid, that means the page incurred an error
167		 * not attributable to a datapath fault.
168		 */
169		if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0)
170			return (0);
171	}
172
173	return (1);
174}
175
176void
177cmd_dp_page_replay(fmd_hdl_t *hdl)
178{
179	fmd_event_t *ep;
180	cmd_page_t *page;
181	cmd_bank_t *bank;
182	cmd_dp_defer_t *dpage;
183	nvlist_t *nvl;
184
185	while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) {
186		fmd_hdl_debug(hdl, "replaying deferred page, "
187		    "pa=%llx\n", dpage->dp_defer_page->page_physbase);
188
189		page = dpage->dp_defer_page;
190
191		if (cmd_dp_page_check(hdl, dpage)) {
192			fmd_hdl_debug(hdl, "deferred memory UE  overtaken by "
193			    "dp fault");
194			CMD_STAT_BUMP(dp_ignored_ue);
195			fmd_case_close(hdl, page->page_case);
196			cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
197			fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
198			cmd_page_destroy(hdl, page);
199			continue;
200		}
201
202		nvl = page->page_asru_nvl;
203
204		bank = cmd_bank_lookup(hdl, nvl);
205
206		ep = fmd_case_getprincipal(hdl, page->page_case);
207		fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep);
208
209		bank->bank_nretired++;
210		bank->bank_retstat.fmds_value.ui64++;
211		cmd_bank_dirty(hdl, bank);
212
213		fmd_case_reset(hdl, page->page_case);
214		cmd_case_fini(hdl, page->page_case, FMD_B_TRUE);
215
216		page->page_case = NULL;
217		cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase);
218		cmd_bank_fault(hdl, bank);
219
220		cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
221		fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
222	}
223
224	fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n");
225}
226
227void
228cmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page)
229{
230	cmd_dp_defer_t *dpage;
231
232	dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP);
233
234	dpage->dp_defer_page = page;
235
236	dp_page_defer_data_restore(hdl, dpage);
237
238	cmd_list_append(&cmd.cmd_deferred_pages, dpage);
239}
240
241void
242cmd_dp_page_validate(fmd_hdl_t *hdl)
243{
244	cmd_dp_defer_t *dpage, *next;
245	cmd_page_t *page;
246
247	for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL;
248	    dpage = next) {
249		next = cmd_list_next(dpage);
250
251		page = dpage->dp_defer_page;
252
253		if (fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl)) {
254			cmd_page_destroy(hdl, page);
255			cmd_list_delete(&cmd.cmd_deferred_pages, dpage);
256			fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t));
257		}
258	}
259}
260