cmd_dp_page.c revision 1186:7791ded250f8
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22/* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29/* 30 * Support routines for managing potential page and bank faults that have 31 * been deferred due to a datapath error. Currently deferment only occurs 32 * if a memory UE occurs while a datapath error is active. When this happens 33 * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER. An 34 * entry (a cmd_dp_defer_t) is added to a list of deferred pages. The entry 35 * links to the cmd_page_t in the cmd_pages list and also keeps track of what 36 * memory controller ids are associated with the first AFAR and any more that 37 * are seen while the page is deferred. This information is used to determine 38 * if the page should be faulted if the fault should be skipped because an 39 * intervening datapath fault has occurred. If a page is faulted when it is 40 * replayed, the corresponding bank is faulted, too, since the original error 41 * was a UE. Note that no action is taken to undo any action taken by the 42 * kernel when the UE was detected. Currently the kernel will attempt to 43 * immediately retire the page where a UE is detected and the retire may or 44 * may not have completed by the time FMA receives an ereport. The possibility 45 * of a datapath fault resulting in memory UEs is very small, so the likelihood 46 * of encountering this scenario is also very small. 47 */ 48 49#include <cmd.h> 50#include <cmd_dp.h> 51#include <cmd_dp_page.h> 52#include <cmd_bank.h> 53#include <cmd_page.h> 54 55#include <fm/fmd_api.h> 56#include <sys/nvpair.h> 57 58extern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *); 59 60static void 61dp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 62{ 63 fmd_buf_write(hdl, dpage->dp_defer_page->page_case, "mcids", 64 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 65} 66 67static void 68dp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 69{ 70 fmd_buf_read(hdl, dpage->dp_defer_page->page_case, "mcids", 71 &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 72} 73 74static void 75dp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar) 76{ 77 int mcid; 78 int i; 79 80 if (cmd_dp_get_mcid(afar, &mcid) < 0) 81 fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed"); 82 83 for (i = 0; i < DP_MAX_MCS; i++) { 84 if (dpage->dp_defer_mcids[i] == -1) { 85 dpage->dp_defer_mcids[i] = mcid; 86 break; 87 } 88 if (dpage->dp_defer_mcids[i] == mcid) 89 break; 90 } 91 92 if (i == DP_MAX_MCS) 93 fmd_hdl_abort(hdl, "too many mcids for deferred page"); 94 95 dp_page_defer_data_write(hdl, dpage); 96} 97 98static cmd_dp_defer_t * 99dp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar) 100{ 101 cmd_dp_defer_t *dpage; 102 int i; 103 104 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 105 106 dpage->dp_defer_page = page; 107 108 for (i = 0; i < DP_MAX_MCS; i++) 109 dpage->dp_defer_mcids[i] = -1; 110 111 dp_page_defer_add_data(hdl, dpage, afar); 112 113 cmd_list_append(&cmd.cmd_deferred_pages, dpage); 114 115 return (dpage); 116} 117 118static cmd_dp_defer_t * 119dp_page_defer_lookup(cmd_page_t *page) 120{ 121 cmd_dp_defer_t *dpage; 122 123 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 124 dpage = cmd_list_next(dpage)) { 125 if (dpage->dp_defer_page == page) 126 return (dpage); 127 } 128 129 return (NULL); 130} 131 132void 133cmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep, 134 uint64_t afar) 135{ 136 cmd_dp_defer_t *dpage; 137 cmd_page_t *page = cmd_page_lookup(afar); 138 const char *uuid; 139 140 if (page == NULL) { 141 page = cmd_page_create(hdl, modasru, afar); 142 dpage = dp_page_defer_create(hdl, page, afar); 143 page->page_case = cmd_case_create(hdl, &page->page_header, 144 CMD_PTR_DP_PAGE_DEFER, &uuid); 145 fmd_case_setprincipal(hdl, page->page_case, ep); 146 } else { 147 dpage = dp_page_defer_lookup(page); 148 if (dpage == NULL) 149 fmd_hdl_abort(hdl, "deferred page with no defer data"); 150 fmd_case_add_ereport(hdl, page->page_case, ep); 151 } 152 153 dp_page_defer_add_data(hdl, dpage, afar); 154} 155 156int 157cmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 158{ 159 int i; 160 161 for (i = 0; i < DP_MAX_MCS; i++) { 162 if (dpage->dp_defer_mcids[i] == -1) 163 break; 164 /* 165 * If there's no datapath fault corresponding to 166 * an mcid, that means the page incurred an error 167 * not attributable to a datapath fault. 168 */ 169 if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0) 170 return (0); 171 } 172 173 return (1); 174} 175 176void 177cmd_dp_page_replay(fmd_hdl_t *hdl) 178{ 179 fmd_event_t *ep; 180 cmd_page_t *page; 181 cmd_bank_t *bank; 182 cmd_dp_defer_t *dpage; 183 nvlist_t *nvl; 184 185 while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) { 186 fmd_hdl_debug(hdl, "replaying deferred page, " 187 "pa=%llx\n", dpage->dp_defer_page->page_physbase); 188 189 page = dpage->dp_defer_page; 190 191 if (cmd_dp_page_check(hdl, dpage)) { 192 fmd_hdl_debug(hdl, "deferred memory UE overtaken by " 193 "dp fault"); 194 CMD_STAT_BUMP(dp_ignored_ue); 195 fmd_case_close(hdl, page->page_case); 196 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 197 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 198 cmd_page_destroy(hdl, page); 199 continue; 200 } 201 202 nvl = page->page_asru_nvl; 203 204 bank = cmd_bank_lookup(hdl, nvl); 205 206 ep = fmd_case_getprincipal(hdl, page->page_case); 207 fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep); 208 209 bank->bank_nretired++; 210 bank->bank_retstat.fmds_value.ui64++; 211 cmd_bank_dirty(hdl, bank); 212 213 fmd_case_reset(hdl, page->page_case); 214 cmd_case_fini(hdl, page->page_case, FMD_B_TRUE); 215 216 page->page_case = NULL; 217 cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase); 218 cmd_bank_fault(hdl, bank); 219 220 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 221 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 222 } 223 224 fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n"); 225} 226 227void 228cmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page) 229{ 230 cmd_dp_defer_t *dpage; 231 232 dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 233 234 dpage->dp_defer_page = page; 235 236 dp_page_defer_data_restore(hdl, dpage); 237 238 cmd_list_append(&cmd.cmd_deferred_pages, dpage); 239} 240 241void 242cmd_dp_page_validate(fmd_hdl_t *hdl) 243{ 244 cmd_dp_defer_t *dpage, *next; 245 cmd_page_t *page; 246 247 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 248 dpage = next) { 249 next = cmd_list_next(dpage); 250 251 page = dpage->dp_defer_page; 252 253 if (fmd_nvl_fmri_unusable(hdl, page->page_asru_nvl)) { 254 cmd_page_destroy(hdl, page); 255 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 256 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 257 } 258 } 259} 260