1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25#ifndef _CMD_MEM_H 26#define _CMD_MEM_H 27 28/* 29 * Support routines for managing state related to memory modules. 30 * 31 * Correctable errors generally cause changes to the DIMM-related state (see 32 * cmd_dimm.c), whereas uncorrectable errors tend to use the bank-related 33 * routines (see cmd_bank.c). The primary exception to this division (though 34 * it eventually devolves to one of the two) is the RxE/FRx pair emitted by 35 * UltraSPARC-IIIi processors. With these errors, a complete pair must be 36 * received and matched before we know whether we're dealing with a CE or a UE. 37 */ 38 39#include <cmd.h> 40#include <cmd_state.h> 41#include <cmd_fmri.h> 42#include <sys/errclassify.h> 43#include <cmd_cpu.h> 44 45#ifdef __cplusplus 46extern "C" { 47#endif 48 49#define CMD_MEM_F_FAULTING 0x1 50 51/* 52 * Used to store as-yet unmatched IOxEs, RxEs, and FRxs. When a new IOxE, 53 * RxE or FRx arrives, we traverse the cmd.cmd_iorxefrx list, looking for 54 * matching entries. Matching has a cpuid-based component, as well as a 55 * temporal one. We can compare the cpuids directly, using the cmd_iorxefrx_t 56 * and the newly-received event. Temporal comparison isn't performed directly. 57 * Instead, we ensure that entries in the iorxefrx list are removed when they 58 * expire by means of timers. This frees the matching code from the need to 59 * worry about time. 60 */ 61typedef struct cmd_iorxefrx { 62 cmd_list_t rf_list; /* List of cmd_iorxefrx_t's */ 63 cmd_errcl_t rf_errcl; /* Error type (CMD_ERRCL_*) */ 64 uint_t rf_afsr_agentid; /* Remote Agent ID (from AFSR) */ 65 uint_t rf_det_agentid; /* Locat Agent ID (from detector) */ 66 id_t rf_expid; /* Timer ID for entry expiration */ 67 uint64_t rf_afar; /* Valid for RxE only */ 68 uint8_t rf_afar_status; /* Valid for RxE only */ 69 ce_dispact_t rf_type; /* Valid for RxE only */ 70 uint16_t rf_synd; /* Valid for FRx only */ 71 uint8_t rf_synd_status; /* Valid for FRx only */ 72 uint64_t rf_afsr; /* Valid for FRx only */ 73 uint64_t rf_disp; /* Valid for RCE only */ 74} cmd_iorxefrx_t; 75 76typedef struct cmd_dimm cmd_dimm_t; 77typedef struct cmd_bank cmd_bank_t; 78#ifdef sun4v 79typedef struct cmd_branch cmd_branch_t; 80#endif 81 82/* 83 * Correctable and Uncorrectable memory errors 84 * 85 * CEs of "Unknown" or "Intermittent" classification are not used in diagnosis. 86 * 87 * "Persistent" CEs are added to per-DIMM SERD engines. When the 88 * engine for a given DIMM fires, the page corresponding to the CE that 89 * caused the engine to fire is retired, and the SERD engine for that 90 * DIMM is reset. 91 * 92 * "Possibly Persistent" CEs are at least Persistent and so are treated 93 * as "Persistent" errors above, being added to the same SERD engines. 94 * 95 * "Leaky" CEs and "Sticky" CEs trigger immediate page retirement. 96 * 97 * "Possibly Sticky" CEs to which no valid partner test has been applied 98 * are not used in diagnosis. Where a valid partner test has been applied 99 * but did not confirm "Sticky" status there is a _suggestion_ that the 100 * original cpu may be a bad reader or writer or suffering from other 101 * datapath issues. To avoid retiring pages for such non-DIMM problems 102 * these classifications are also not used in diagnosis. 103 * 104 * UEs immediately trigger page retirements, but do not affect the CE SERD 105 * engines. In addition, UEs are recorded in the UE caches of the detecting 106 * CPUs. When a page is to be retired, a fault.memory.page fault is 107 * generated. 108 * 109 */ 110 111typedef cmd_evdisp_t cmd_xe_handler_f(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 112 const char *, uint64_t, uint8_t, uint16_t, uint8_t, ce_dispact_t, uint64_t, 113 nvlist_t *); 114 115extern ce_dispact_t cmd_mem_name2type(const char *, int); 116extern int cmd_synd2upos(uint16_t); 117extern cmd_evdisp_t cmd_ce(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 118 const char *, cmd_errcl_t); 119extern cmd_evdisp_t cmd_ue(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 120 const char *, cmd_errcl_t); 121extern cmd_evdisp_t cmd_ce_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 122 const char *, uint64_t, uint8_t, uint16_t, uint8_t, 123 ce_dispact_t, uint64_t, nvlist_t *); 124extern cmd_evdisp_t cmd_ue_common(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 125 const char *, uint64_t, uint8_t, uint16_t, uint8_t, 126 ce_dispact_t, uint64_t, nvlist_t *); 127extern cmd_evdisp_t cmd_mem_synd_check(fmd_hdl_t *, uint64_t, uint8_t, 128 uint16_t, uint8_t, cmd_cpu_t *); 129extern void cmd_dimm_close(fmd_hdl_t *, void *); 130extern void cmd_bank_close(fmd_hdl_t *, void *); 131extern int cmd_same_datapath_dimms(cmd_dimm_t *, cmd_dimm_t *); 132extern void cmd_gen_datapath_fault(fmd_hdl_t *, cmd_dimm_t *, cmd_dimm_t *, 133 uint16_t, nvlist_t *); 134extern void cmd_to_hashed_addr(uint64_t *, uint64_t, const char *); 135 136#ifdef sun4u 137extern char *cmd_cpu_getfrustr_by_id(fmd_hdl_t *, uint32_t); 138#endif 139 140#ifdef sun4v 141extern void cmd_branch_close(fmd_hdl_t *, void *); 142extern cmd_evdisp_t cmd_fb(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 143 const char *, cmd_errcl_t); 144extern cmd_evdisp_t cmd_fw_defect(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 145 const char *, cmd_errcl_t); 146extern cmd_evdisp_t cmd_fb_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 147 const char *, cmd_errcl_t); 148extern cmd_evdisp_t cmd_ue_train(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 149 const char *, cmd_errcl_t); 150#endif 151 152/* 153 * US-IIIi I/O, Remote and Foreign Read memory errors 154 * 155 * When one processor or I/O bridge attempts to read memory local to 156 * another processor, one each of IOCE/IOUE/RCE/RUE and FRC/FRU will be 157 * generated, depending on the type of error. Both the IOxE/RxE and the FRx 158 * are needed, as each contains data necessary to the diagnosis of the error. 159 * Upon receipt of one of the errors, we wait until we receive the other. 160 * When the pair has been successfully received and matched, a CE or UE, 161 * as appropriate, is synthesized from the data in the matched ereports. 162 * The synthesized ereports are handled by the normal CE and UE mechanisms. 163 */ 164extern cmd_evdisp_t cmd_frx(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 165 const char *, cmd_errcl_t); 166extern cmd_evdisp_t cmd_rxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 167 const char *, cmd_errcl_t); 168extern cmd_evdisp_t cmd_ioxe(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 169 const char *, cmd_errcl_t); 170extern cmd_evdisp_t cmd_ioxe_sec(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 171 const char *, cmd_errcl_t); 172extern cmd_evdisp_t cmd_rxefrx_common(fmd_hdl_t *hdl, fmd_event_t *ep, 173 nvlist_t *nvl, const char *class, cmd_errcl_t clcode, 174 cmd_errcl_t matchmask); 175 176/* 177 * A list of received IOxE/RxE/FRx ereports is maintained for correlation 178 * purposes (see above). These two routines manage the addition of new 179 * ereports, and the retrieval of existing ones. Pruning of the list is 180 * handled automatically. 181 */ 182extern void cmd_iorxefrx_queue(fmd_hdl_t *, cmd_iorxefrx_t *); 183extern void cmd_iorxefrx_free(fmd_hdl_t *, cmd_iorxefrx_t *); 184 185extern const char *cmd_fmri_get_unum(nvlist_t *); 186extern nvlist_t *cmd_mem_fmri_create(const char *, char **, size_t); 187extern nvlist_t *cmd_mem_fmri_derive(fmd_hdl_t *, uint64_t, uint64_t, uint16_t); 188 189extern void cmd_mem_case_restore(fmd_hdl_t *, cmd_case_t *, fmd_case_t *, 190 const char *, const char *); 191extern char *cmd_mem_serdnm_create(fmd_hdl_t *, const char *, const char *); 192extern char *cmd_page_serdnm_create(fmd_hdl_t *, const char *, uint64_t); 193extern char *cmd_mq_serdnm_create(fmd_hdl_t *, const char *, uint64_t, 194 uint16_t, uint16_t); 195extern void cmd_mem_retirestat_create(fmd_hdl_t *, fmd_stat_t *, const char *, 196 uint64_t, const char *); 197extern int cmd_mem_thresh_check(fmd_hdl_t *, uint_t); 198extern ulong_t cmd_mem_get_phys_pages(fmd_hdl_t *); 199 200extern void cmd_mem_timeout(fmd_hdl_t *, id_t); 201extern void cmd_mem_gc(fmd_hdl_t *); 202extern void cmd_mem_fini(fmd_hdl_t *); 203 204#ifdef __cplusplus 205} 206#endif 207 208#endif /* _CMD_MEM_H */ 209