1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <cma.h>
27
28#include <sys/fm/ldom.h>
29#include <sys/fm/protocol.h>
30#include <fm/fmd_fmri.h>
31#include <fm/libtopo.h>
32
33#include <assert.h>
34#include <fcntl.h>
35#include <unistd.h>
36#include <errno.h>
37#include <strings.h>
38
39#include <sys/types.h>
40#include <sys/processor.h>
41
42extern ldom_hdl_t *cma_lhp;
43
44/*ARGSUSED*/
45int
46cpu_blacklist_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, boolean_t repair)
47{
48	if (repair)
49		return (ldom_fmri_unblacklist(cma_lhp, fmri));
50	else
51		return (ldom_fmri_blacklist(cma_lhp, fmri));
52}
53
54int
55cma_cpu_blacklist(fmd_hdl_t *hdl, nvlist_t *nvl, nvlist_t *asru,
56    boolean_t repair)
57{
58	nvlist_t *fmri;
59	int rc, err;
60
61	/*
62	 * Some platforms have special unums for the E$ DIMMs.	If we're dealing
63	 * with a platform that has these unums, one will have been added to the
64	 * fault as the resource.  We'll use that for the blacklisting.  If we
65	 * can't find a resource, we'll fall back to the ASRU.
66	 */
67	if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &fmri) != 0)
68		fmri = asru;
69
70	rc = cpu_blacklist_cmd(hdl, fmri, repair);
71	err = errno;
72
73	if (rc < 0 && err != ENOTSUP) {
74		errno = err;
75		return (-1);
76	}
77
78	return (0);
79}
80
81/*ARGSUSED*/
82static int
83cpu_cmd(fmd_hdl_t *hdl, nvlist_t *fmri, int cmd)
84{
85	int rc = 0;
86	char *scheme;
87
88	/*
89	 * We're using topo retire if the fmri is in "hc" scheme.
90	 */
91	if (nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) == 0 &&
92	    strcmp(scheme, FM_FMRI_SCHEME_HC) == 0) {
93		if (cmd != P_STATUS) {
94			errno = EINVAL;
95			return (-1);
96		}
97		rc = fmd_nvl_fmri_service_state(hdl, fmri);
98		switch (rc) {
99		case FMD_SERVICE_STATE_UNUSABLE:
100			return (P_FAULTED);
101		case -1:
102			return (-1);
103		default:
104			return (P_ONLINE);
105		}
106	}
107
108	switch (cmd & ~P_FORCED) {
109	case P_STATUS:
110		rc = ldom_fmri_status(cma_lhp, fmri);
111		break;
112	case P_FAULTED:
113		rc = ldom_fmri_retire(cma_lhp, fmri);
114		break;
115	case P_ONLINE:
116		rc = ldom_fmri_unretire(cma_lhp, fmri);
117		break;
118	default:
119		errno = EINVAL;
120		return (-1);
121	}
122
123	if (rc != P_OFFLINE && rc != P_ONLINE && rc != P_FAULTED) {
124		errno = rc;
125		return (-1);
126	}
127
128	return (rc);
129}
130
131void
132cma_cpu_start_retry(fmd_hdl_t *hdl, nvlist_t *fmri, const char *uuid,
133    boolean_t repair)
134{
135	cma_cpu_t *cpu;
136	char *scheme;
137	uint_t cpuid;
138	nvlist_t *asru = NULL;
139	topo_hdl_t *thp;
140	int err;
141
142	if (repair || nvlist_lookup_string(fmri, FM_FMRI_SCHEME, &scheme) != 0)
143		return;
144	if (strcmp(scheme, FM_FMRI_SCHEME_CPU) == 0) {
145		if (nvlist_lookup_uint32(fmri, FM_FMRI_CPU_ID, &cpuid) != 0)
146			return;
147	} else if (strcmp(scheme, FM_FMRI_SCHEME_HC) != 0) {
148		return;
149	} else {
150		/* lookup cpuid from ASRU */
151		thp = fmd_fmri_topo_hold(TOPO_VERSION);
152		if (thp != NULL) {
153			(void) topo_fmri_asru(thp, fmri, &asru, &err);
154			fmd_fmri_topo_rele(thp);
155		}
156		if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
157			nvlist_free(asru);
158			return;
159		}
160	}
161
162	/*
163	 * check to see if the cpu has been offline.
164	 */
165	fmd_hdl_debug(hdl, "cpu %u is not offline yet - sleeping\n", cpuid);
166
167	/*
168	 * Create a cpu node and add to the head of the cpu list
169	 */
170	cpu = fmd_hdl_zalloc(hdl, sizeof (cma_cpu_t), FMD_SLEEP);
171	(void) nvlist_dup(fmri, &cpu->cpu_fmri, 0);
172	if (uuid != NULL)
173		cpu->cpu_uuid = fmd_hdl_strdup(hdl, uuid, FMD_SLEEP);
174
175	cpu->cpuid = cpuid;
176	cpu->cpu_next = cma.cma_cpus;
177	cma.cma_cpus = cpu;
178
179	if (cma.cma_cpu_timerid != 0)
180		fmd_timer_remove(hdl, cma.cma_cpu_timerid);
181
182	cma.cma_cpu_curdelay = cma.cma_cpu_mindelay;
183
184	cma.cma_cpu_timerid =
185	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
186}
187
188
189int
190cma_cpu_statechange(fmd_hdl_t *hdl, nvlist_t *asru, const char *uuid,
191    int cpustate, boolean_t repair)
192{
193	int i;
194	uint_t cpuid;
195
196	if (nvlist_lookup_uint32(asru, FM_FMRI_CPU_ID, &cpuid) != 0) {
197		fmd_hdl_debug(hdl, "missing '%s'\n", FM_FMRI_CPU_ID);
198		cma_stats.bad_flts.fmds_value.ui64++;
199		return (CMA_RA_FAILURE);
200	}
201
202	/*
203	 * cpu offlining using ldom_fmri_retire() may be asynchronous, so we
204	 * have to set the timer and check the cpu status later.
205	 */
206	for (i = 0; i < cma.cma_cpu_tries;
207	    i++, (void) nanosleep(&cma.cma_cpu_delay, NULL)) {
208		if (cpu_cmd(hdl, asru, cpustate) != -1) {
209			if (repair)
210				cma_stats.cpu_repairs.fmds_value.ui64++;
211			else
212				cma_stats.cpu_flts.fmds_value.ui64++;
213			break;
214		}
215	}
216
217	if (i >= cma.cma_cpu_tries) {
218		cma_stats.cpu_fails.fmds_value.ui64++;
219	}
220
221	cma_cpu_start_retry(hdl, asru, uuid, repair);
222
223	return (CMA_RA_FAILURE);
224}
225
226static int
227cpu_retry(fmd_hdl_t *hdl, cma_cpu_t *cpu)
228{
229	int rc = 0;
230
231	fmd_hdl_debug(hdl, "cpu_retry()\n");
232
233	if (cpu->cpu_fmri == NULL) {
234		return (1);
235	}
236
237	if (!fmd_nvl_fmri_present(hdl, cpu->cpu_fmri)) {
238		fmd_hdl_debug(hdl, "cpu %u is not present", cpu->cpuid);
239		return (1);
240	}
241
242	rc = cpu_cmd(hdl, cpu->cpu_fmri, P_STATUS);
243	if (rc == P_FAULTED || rc == P_OFFLINE) {
244		fmd_hdl_debug(hdl, "cpu %u is offlined on retry %u\n",
245		    cpu->cpuid, cpu->cpu_nretries);
246		cma_stats.cpu_flts.fmds_value.ui64++;
247
248		if (cpu->cpu_uuid != NULL)
249			fmd_case_uuclose(hdl, cpu->cpu_uuid);
250		return (1); /* success */
251	}
252
253	if (rc == -1) {
254		fmd_hdl_debug(hdl, "failed to retry cpu %u\n", cpu->cpuid);
255		cma_stats.page_fails.fmds_value.ui64++;
256		return (1); /* give up */
257	}
258
259	return (0);
260}
261
262static void
263cma_cpu_free(fmd_hdl_t *hdl, cma_cpu_t *cpu)
264{
265	if (cpu->cpu_fmri != NULL)
266		nvlist_free(cpu->cpu_fmri);
267	if (cpu->cpu_uuid != NULL)
268		fmd_hdl_strfree(hdl, cpu->cpu_uuid);
269	fmd_hdl_free(hdl, cpu, sizeof (cma_cpu_t));
270}
271
272void
273cma_cpu_retry(fmd_hdl_t *hdl)
274{
275	cma_cpu_t **cpup;
276
277	fmd_hdl_debug(hdl, "cma_cpu_retry: timer fired\n");
278
279	cma.cma_cpu_timerid = 0;
280
281	cpup = &cma.cma_cpus;
282	while (*cpup != NULL) {
283		cma_cpu_t *cpu = *cpup;
284
285		if (cpu_retry(hdl, cpu)) {
286			/*
287			 * Successful retry or we're giving up - remove from
288			 * the list
289			 */
290			*cpup = cpu->cpu_next;
291
292			cma_cpu_free(hdl, cpu);
293		} else {
294			cpu->cpu_nretries++;
295			cpup = &cpu->cpu_next;
296		}
297	}
298
299	if (cma.cma_cpus == NULL)
300		return; /* no more cpus */
301
302	/*
303	 * We still have cpus to check.  Back the delay
304	 * off, and schedule a retry.
305	 */
306	cma.cma_cpu_curdelay = MIN(cma.cma_cpu_curdelay * 2,
307	    cma.cma_cpu_maxdelay);
308
309	fmd_hdl_debug(hdl, "scheduled cpu offline retry for %llu secs\n",
310	    (u_longlong_t)(cma.cma_cpu_curdelay / NANOSEC));
311
312	cma.cma_cpu_timerid =
313	    fmd_timer_install(hdl, NULL, NULL, cma.cma_cpu_curdelay);
314}
315
316void
317cma_cpu_fini(fmd_hdl_t *hdl)
318{
319	cma_cpu_t *cpu;
320
321	while ((cpu = cma.cma_cpus) != NULL) {
322		cma.cma_cpus = cpu->cpu_next;
323		cma_cpu_free(hdl, cpu);
324	}
325}
326