cmd_dp_page.c revision 1283:d3e5610e2d1c
113240Sprr/* 213240Sprr * CDDL HEADER START 313240Sprr * 413240Sprr * The contents of this file are subject to the terms of the 513240Sprr * Common Development and Distribution License, Version 1.0 only 613240Sprr * (the "License"). You may not use this file except in compliance 713240Sprr * with the License. 813240Sprr * 913240Sprr * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 1013240Sprr * or http://www.opensolaris.org/os/licensing. 1113240Sprr * See the License for the specific language governing permissions 1213240Sprr * and limitations under the License. 1313240Sprr * 1413240Sprr * When distributing Covered Code, include this CDDL HEADER in each 1513240Sprr * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1613240Sprr * If applicable, add the following below this CDDL HEADER, with the 1713240Sprr * fields enclosed by brackets "[]" replaced with your own identifying 1813240Sprr * information: Portions Copyright [yyyy] [name of copyright owner] 1913240Sprr * 2013240Sprr * CDDL HEADER END 2113240Sprr */ 2213240Sprr/* 2313240Sprr * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 2413240Sprr * Use is subject to license terms. 2513240Sprr */ 2613240Sprr 2713240Sprr#pragma ident "%Z%%M% %I% %E% SMI" 2813240Sprr 2913240Sprr/* 3013240Sprr * Support routines for managing potential page and bank faults that have 3113240Sprr * been deferred due to a datapath error. Currently deferment only occurs 3213240Sprr * if a memory UE occurs while a datapath error is active. When this happens 3313240Sprr * a page case is created with a special subtype of CMD_PTR_DP_PAGE_DEFER. An 3413240Sprr * entry (a cmd_dp_defer_t) is added to a list of deferred pages. The entry 3513240Sprr * links to the cmd_page_t in the cmd_pages list and also keeps track of what 3613240Sprr * memory controller ids are associated with the first AFAR and any more that 3713240Sprr * are seen while the page is deferred. This information is used to determine 3813240Sprr * if the page should be faulted if the fault should be skipped because an 3913240Sprr * intervening datapath fault has occurred. If a page is faulted when it is 4013240Sprr * replayed, the corresponding bank is faulted, too, since the original error 4113240Sprr * was a UE. Note that no action is taken to undo any action taken by the 4213240Sprr * kernel when the UE was detected. Currently the kernel will attempt to 4313240Sprr * immediately retire the page where a UE is detected and the retire may or 4413240Sprr * may not have completed by the time FMA receives an ereport. The possibility 4513240Sprr * of a datapath fault resulting in memory UEs is very small, so the likelihood 4613240Sprr * of encountering this scenario is also very small. 4713240Sprr */ 4813240Sprr 4913240Sprr#include <cmd.h> 5013240Sprr#include <cmd_dp.h> 5113240Sprr#include <cmd_dp_page.h> 5213240Sprr#include <cmd_bank.h> 5313240Sprr#include <cmd_page.h> 5413240Sprr 5513240Sprr#include <fm/fmd_api.h> 5613240Sprr#include <sys/nvpair.h> 5713240Sprr 5813240Sprrextern void cmd_bank_fault(fmd_hdl_t *, cmd_bank_t *); 5913240Sprr 6013240Sprrstatic void 6113240Sprrdp_page_defer_data_write(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 6213240Sprr{ 6313240Sprr fmd_buf_write(hdl, dpage->dp_defer_page->page_case, "mcids", 6413240Sprr &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 6513240Sprr} 6613240Sprr 6713240Sprrstatic void 6813240Sprrdp_page_defer_data_restore(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 6913240Sprr{ 7013240Sprr fmd_buf_read(hdl, dpage->dp_defer_page->page_case, "mcids", 7113240Sprr &dpage->dp_defer_mcids, sizeof (dpage->dp_defer_mcids)); 7213240Sprr} 7313240Sprr 7413240Sprrstatic void 7513240Sprrdp_page_defer_add_data(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage, uint64_t afar) 7613240Sprr{ 7713240Sprr int mcid; 7813240Sprr int i; 7913240Sprr 8013240Sprr if (cmd_dp_get_mcid(afar, &mcid) < 0) 8113240Sprr fmd_hdl_abort(hdl, "cmd_dp_get_mcid failed"); 8213240Sprr 8313240Sprr for (i = 0; i < DP_MAX_MCS; i++) { 8413240Sprr if (dpage->dp_defer_mcids[i] == -1) { 8513240Sprr dpage->dp_defer_mcids[i] = mcid; 8613240Sprr break; 8713240Sprr } 8813240Sprr if (dpage->dp_defer_mcids[i] == mcid) 8913240Sprr break; 9013240Sprr } 9113240Sprr 9213240Sprr if (i == DP_MAX_MCS) 9313240Sprr fmd_hdl_abort(hdl, "too many mcids for deferred page"); 9413240Sprr 9513240Sprr dp_page_defer_data_write(hdl, dpage); 9613240Sprr} 9713240Sprr 9813240Sprrstatic cmd_dp_defer_t * 9913240Sprrdp_page_defer_create(fmd_hdl_t *hdl, cmd_page_t *page, uint64_t afar) 10013240Sprr{ 10113240Sprr cmd_dp_defer_t *dpage; 10213240Sprr int i; 10313240Sprr 10413240Sprr dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 10513240Sprr 10613240Sprr dpage->dp_defer_page = page; 10713240Sprr 10813240Sprr for (i = 0; i < DP_MAX_MCS; i++) 10913240Sprr dpage->dp_defer_mcids[i] = -1; 11013240Sprr 11113240Sprr dp_page_defer_add_data(hdl, dpage, afar); 11213240Sprr 11313240Sprr cmd_list_append(&cmd.cmd_deferred_pages, dpage); 11413240Sprr 11513240Sprr return (dpage); 11613240Sprr} 11713240Sprr 11813240Sprrstatic cmd_dp_defer_t * 11913240Sprrdp_page_defer_lookup(cmd_page_t *page) 12013240Sprr{ 12113240Sprr cmd_dp_defer_t *dpage; 12213240Sprr 12313240Sprr for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 12413240Sprr dpage = cmd_list_next(dpage)) { 12513240Sprr if (dpage->dp_defer_page == page) 12613240Sprr return (dpage); 12713240Sprr } 12813240Sprr 12913240Sprr return (NULL); 13013240Sprr} 13113240Sprr 13213240Sprrvoid 13313240Sprrcmd_dp_page_defer(fmd_hdl_t *hdl, nvlist_t *modasru, fmd_event_t *ep, 13413240Sprr uint64_t afar) 13513240Sprr{ 13613240Sprr cmd_dp_defer_t *dpage; 13713240Sprr cmd_page_t *page = cmd_page_lookup(afar); 13813240Sprr const char *uuid; 13913240Sprr 14013240Sprr if (page == NULL) { 14113240Sprr page = cmd_page_create(hdl, modasru, afar); 14213240Sprr dpage = dp_page_defer_create(hdl, page, afar); 14313240Sprr page->page_case = cmd_case_create(hdl, &page->page_header, 14413240Sprr CMD_PTR_DP_PAGE_DEFER, &uuid); 14513240Sprr fmd_case_setprincipal(hdl, page->page_case, ep); 14613240Sprr } else { 14713240Sprr dpage = dp_page_defer_lookup(page); 14813240Sprr if (dpage == NULL) 14913240Sprr fmd_hdl_abort(hdl, "deferred page with no defer data"); 15013240Sprr fmd_case_add_ereport(hdl, page->page_case, ep); 15113240Sprr } 15213240Sprr 15313240Sprr dp_page_defer_add_data(hdl, dpage, afar); 15413240Sprr} 15513240Sprr 15613240Sprrint 15713240Sprrcmd_dp_page_check(fmd_hdl_t *hdl, cmd_dp_defer_t *dpage) 15813240Sprr{ 15913240Sprr int i; 16013240Sprr 16113240Sprr for (i = 0; i < DP_MAX_MCS; i++) { 16213240Sprr if (dpage->dp_defer_mcids[i] == -1) 16313240Sprr break; 16413240Sprr /* 16513240Sprr * If there's no datapath fault corresponding to 16613240Sprr * an mcid, that means the page incurred an error 16713240Sprr * not attributable to a datapath fault. 16813240Sprr */ 16913240Sprr if (cmd_dp_lookup_fault(hdl, dpage->dp_defer_mcids[i]) == 0) 17013240Sprr return (0); 17113240Sprr } 17213240Sprr 17313240Sprr return (1); 17413240Sprr} 17513240Sprr 17613240Sprrvoid 17713240Sprrcmd_dp_page_replay(fmd_hdl_t *hdl) 17813240Sprr{ 17913240Sprr fmd_event_t *ep; 18013240Sprr cmd_page_t *page; 18113240Sprr cmd_bank_t *bank; 18213240Sprr cmd_dp_defer_t *dpage; 18313240Sprr nvlist_t *nvl; 18413240Sprr 18513240Sprr while ((dpage = cmd_list_next(&cmd.cmd_deferred_pages)) != NULL) { 18613240Sprr fmd_hdl_debug(hdl, "replaying deferred page, " 18713240Sprr "pa=%llx\n", dpage->dp_defer_page->page_physbase); 18813240Sprr 18913240Sprr page = dpage->dp_defer_page; 19013240Sprr 19113240Sprr if (cmd_dp_page_check(hdl, dpage)) { 19213240Sprr fmd_hdl_debug(hdl, "deferred memory UE overtaken by " 19313240Sprr "dp fault"); 19413240Sprr CMD_STAT_BUMP(dp_ignored_ue); 19513240Sprr fmd_case_close(hdl, page->page_case); 19613240Sprr cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 19713240Sprr fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 19813240Sprr cmd_page_destroy(hdl, page); 19913240Sprr continue; 20013240Sprr } 20113240Sprr 20213240Sprr nvl = page->page_asru_nvl; 20313240Sprr 20413240Sprr bank = cmd_bank_lookup(hdl, nvl); 20513240Sprr 20613240Sprr ep = fmd_case_getprincipal(hdl, page->page_case); 20713240Sprr fmd_case_add_ereport(hdl, bank->bank_case.cc_cp, ep); 20813240Sprr 20913240Sprr bank->bank_nretired++; 21013240Sprr bank->bank_retstat.fmds_value.ui64++; 21113240Sprr cmd_bank_dirty(hdl, bank); 21213240Sprr 21313240Sprr fmd_case_reset(hdl, page->page_case); 21413240Sprr cmd_case_fini(hdl, page->page_case, FMD_B_TRUE); 21513240Sprr 21613240Sprr page->page_case = NULL; 21713240Sprr cmd_page_fault(hdl, nvl, nvl, ep, page->page_physbase); 21813240Sprr cmd_bank_fault(hdl, bank); 21913240Sprr 22013240Sprr cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 22115400Sprr fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 22213240Sprr } 22313240Sprr 22413240Sprr fmd_hdl_debug(hdl, "cmd_page_defer_replay() complete\n"); 22513240Sprr} 22613240Sprr 22713240Sprrvoid 22813240Sprrcmd_dp_page_restore(fmd_hdl_t *hdl, cmd_page_t *page) 22913240Sprr{ 23013240Sprr cmd_dp_defer_t *dpage; 23113240Sprr 23213240Sprr dpage = fmd_hdl_zalloc(hdl, sizeof (cmd_dp_defer_t), FMD_SLEEP); 23313240Sprr 23413240Sprr dpage->dp_defer_page = page; 235 236 dp_page_defer_data_restore(hdl, dpage); 237 238 cmd_list_append(&cmd.cmd_deferred_pages, dpage); 239} 240 241void 242cmd_dp_page_validate(fmd_hdl_t *hdl) 243{ 244 cmd_dp_defer_t *dpage, *next; 245 cmd_page_t *page; 246 247 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 248 dpage = next) { 249 next = cmd_list_next(dpage); 250 251 page = dpage->dp_defer_page; 252 253 if (!fmd_nvl_fmri_present(hdl, page->page_asru_nvl)) { 254 cmd_page_destroy(hdl, page); 255 cmd_list_delete(&cmd.cmd_deferred_pages, dpage); 256 fmd_hdl_free(hdl, dpage, sizeof (cmd_dp_defer_t)); 257 } 258 } 259} 260 261/*ARGSUSED*/ 262int 263cmd_dp_page_isdeferred(fmd_hdl_t *hdl, cmd_page_t *page) 264{ 265 cmd_dp_defer_t *dpage, *next; 266 267 for (dpage = cmd_list_next(&cmd.cmd_deferred_pages); dpage != NULL; 268 dpage = next) { 269 next = cmd_list_next(dpage); 270 271 if (dpage->dp_defer_page == page) { 272 return (1); 273 } 274 } 275 276 return (0); 277} 278